{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9988810145468108, "eval_steps": 500, "global_step": 4020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.353515625, "learning_rate": 9.950248756218907e-07, "loss": 2.6395, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.328125, "learning_rate": 4.975124378109453e-06, "loss": 2.6482, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.337890625, "learning_rate": 9.950248756218906e-06, "loss": 2.6389, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.3203125, "learning_rate": 1.4925373134328357e-05, "loss": 2.629, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.328125, "learning_rate": 1.990049751243781e-05, "loss": 2.6301, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.36328125, "learning_rate": 2.4875621890547266e-05, "loss": 2.6048, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.298828125, "learning_rate": 2.9850746268656714e-05, "loss": 2.5818, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.26171875, "learning_rate": 3.4825870646766175e-05, "loss": 2.5372, "step": 35 }, { "epoch": 0.03, "grad_norm": 0.271484375, "learning_rate": 3.980099502487562e-05, "loss": 2.507, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.1748046875, "learning_rate": 4.477611940298508e-05, "loss": 2.4718, "step": 45 }, { "epoch": 0.04, "grad_norm": 0.1640625, "learning_rate": 4.975124378109453e-05, "loss": 2.4398, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.16015625, "learning_rate": 5.472636815920398e-05, "loss": 2.4118, "step": 55 }, { "epoch": 0.04, "grad_norm": 0.1728515625, "learning_rate": 5.970149253731343e-05, "loss": 2.3949, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.13671875, "learning_rate": 6.46766169154229e-05, "loss": 2.3556, "step": 65 }, { "epoch": 0.05, "grad_norm": 0.1357421875, "learning_rate": 6.965174129353235e-05, "loss": 2.3644, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.1474609375, "learning_rate": 7.46268656716418e-05, "loss": 2.3262, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.11865234375, "learning_rate": 7.960199004975125e-05, "loss": 2.3181, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.11376953125, "learning_rate": 8.45771144278607e-05, "loss": 2.3195, "step": 85 }, { "epoch": 0.07, "grad_norm": 0.10693359375, "learning_rate": 8.955223880597016e-05, "loss": 2.2922, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.10009765625, "learning_rate": 9.452736318407961e-05, "loss": 2.2996, "step": 95 }, { "epoch": 0.07, "grad_norm": 0.095703125, "learning_rate": 9.950248756218906e-05, "loss": 2.2923, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.09912109375, "learning_rate": 0.0001044776119402985, "loss": 2.2908, "step": 105 }, { "epoch": 0.08, "grad_norm": 0.11328125, "learning_rate": 0.00010945273631840796, "loss": 2.2892, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.10107421875, "learning_rate": 0.00011442786069651741, "loss": 2.2772, "step": 115 }, { "epoch": 0.09, "grad_norm": 0.09814453125, "learning_rate": 0.00011940298507462686, "loss": 2.2492, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.10693359375, "learning_rate": 0.0001243781094527363, "loss": 2.2669, "step": 125 }, { "epoch": 0.1, "grad_norm": 0.1025390625, "learning_rate": 0.0001293532338308458, "loss": 2.2486, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.10205078125, "learning_rate": 0.00013432835820895525, "loss": 2.2655, "step": 135 }, { "epoch": 0.1, "grad_norm": 0.1005859375, "learning_rate": 0.0001393034825870647, "loss": 2.2695, "step": 140 }, { "epoch": 0.11, "grad_norm": 0.10546875, "learning_rate": 0.00014427860696517416, "loss": 2.2627, "step": 145 }, { "epoch": 0.11, "grad_norm": 0.1064453125, "learning_rate": 0.0001492537313432836, "loss": 2.2453, "step": 150 }, { "epoch": 0.12, "grad_norm": 0.1162109375, "learning_rate": 0.00015422885572139304, "loss": 2.2462, "step": 155 }, { "epoch": 0.12, "grad_norm": 0.119140625, "learning_rate": 0.0001592039800995025, "loss": 2.2574, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.11474609375, "learning_rate": 0.00016417910447761195, "loss": 2.2522, "step": 165 }, { "epoch": 0.13, "grad_norm": 0.12255859375, "learning_rate": 0.0001691542288557214, "loss": 2.2428, "step": 170 }, { "epoch": 0.13, "grad_norm": 0.11669921875, "learning_rate": 0.00017412935323383086, "loss": 2.2617, "step": 175 }, { "epoch": 0.13, "grad_norm": 0.11328125, "learning_rate": 0.0001791044776119403, "loss": 2.2604, "step": 180 }, { "epoch": 0.14, "grad_norm": 0.1142578125, "learning_rate": 0.00018407960199004977, "loss": 2.2437, "step": 185 }, { "epoch": 0.14, "grad_norm": 0.1171875, "learning_rate": 0.00018905472636815922, "loss": 2.2489, "step": 190 }, { "epoch": 0.15, "grad_norm": 0.11181640625, "learning_rate": 0.00019402985074626867, "loss": 2.2788, "step": 195 }, { "epoch": 0.15, "grad_norm": 0.10986328125, "learning_rate": 0.00019900497512437813, "loss": 2.2438, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.11181640625, "learning_rate": 0.00020398009950248756, "loss": 2.2603, "step": 205 }, { "epoch": 0.16, "grad_norm": 0.11181640625, "learning_rate": 0.000208955223880597, "loss": 2.242, "step": 210 }, { "epoch": 0.16, "grad_norm": 0.10986328125, "learning_rate": 0.00021393034825870647, "loss": 2.2205, "step": 215 }, { "epoch": 0.16, "grad_norm": 0.11669921875, "learning_rate": 0.00021890547263681592, "loss": 2.2329, "step": 220 }, { "epoch": 0.17, "grad_norm": 0.115234375, "learning_rate": 0.00022388059701492538, "loss": 2.2511, "step": 225 }, { "epoch": 0.17, "grad_norm": 0.1103515625, "learning_rate": 0.00022885572139303483, "loss": 2.2369, "step": 230 }, { "epoch": 0.18, "grad_norm": 0.1123046875, "learning_rate": 0.00023383084577114426, "loss": 2.2116, "step": 235 }, { "epoch": 0.18, "grad_norm": 0.11572265625, "learning_rate": 0.0002388059701492537, "loss": 2.2502, "step": 240 }, { "epoch": 0.18, "grad_norm": 0.10986328125, "learning_rate": 0.00024378109452736317, "loss": 2.2344, "step": 245 }, { "epoch": 0.19, "grad_norm": 0.1142578125, "learning_rate": 0.0002487562189054726, "loss": 2.2336, "step": 250 }, { "epoch": 0.19, "grad_norm": 0.11669921875, "learning_rate": 0.0002537313432835821, "loss": 2.2274, "step": 255 }, { "epoch": 0.19, "grad_norm": 0.109375, "learning_rate": 0.0002587064676616916, "loss": 2.2365, "step": 260 }, { "epoch": 0.2, "grad_norm": 0.1044921875, "learning_rate": 0.000263681592039801, "loss": 2.2357, "step": 265 }, { "epoch": 0.2, "grad_norm": 0.10498046875, "learning_rate": 0.0002686567164179105, "loss": 2.2314, "step": 270 }, { "epoch": 0.21, "grad_norm": 0.10791015625, "learning_rate": 0.0002736318407960199, "loss": 2.2381, "step": 275 }, { "epoch": 0.21, "grad_norm": 0.10888671875, "learning_rate": 0.0002786069651741294, "loss": 2.2481, "step": 280 }, { "epoch": 0.21, "grad_norm": 0.1142578125, "learning_rate": 0.00028358208955223883, "loss": 2.2104, "step": 285 }, { "epoch": 0.22, "grad_norm": 0.10302734375, "learning_rate": 0.0002885572139303483, "loss": 2.2203, "step": 290 }, { "epoch": 0.22, "grad_norm": 0.10986328125, "learning_rate": 0.00029353233830845774, "loss": 2.2426, "step": 295 }, { "epoch": 0.22, "grad_norm": 0.11083984375, "learning_rate": 0.0002985074626865672, "loss": 2.2137, "step": 300 }, { "epoch": 0.23, "grad_norm": 0.1064453125, "learning_rate": 0.00030348258706467665, "loss": 2.2393, "step": 305 }, { "epoch": 0.23, "grad_norm": 0.10205078125, "learning_rate": 0.0003084577114427861, "loss": 2.2369, "step": 310 }, { "epoch": 0.23, "grad_norm": 0.10595703125, "learning_rate": 0.00031343283582089556, "loss": 2.2205, "step": 315 }, { "epoch": 0.24, "grad_norm": 0.10400390625, "learning_rate": 0.000318407960199005, "loss": 2.2607, "step": 320 }, { "epoch": 0.24, "grad_norm": 0.10400390625, "learning_rate": 0.00032338308457711447, "loss": 2.201, "step": 325 }, { "epoch": 0.25, "grad_norm": 0.10595703125, "learning_rate": 0.0003283582089552239, "loss": 2.2304, "step": 330 }, { "epoch": 0.25, "grad_norm": 0.10595703125, "learning_rate": 0.0003333333333333334, "loss": 2.214, "step": 335 }, { "epoch": 0.25, "grad_norm": 0.1015625, "learning_rate": 0.0003383084577114428, "loss": 2.2251, "step": 340 }, { "epoch": 0.26, "grad_norm": 0.0986328125, "learning_rate": 0.0003432835820895523, "loss": 2.2159, "step": 345 }, { "epoch": 0.26, "grad_norm": 0.1025390625, "learning_rate": 0.0003482587064676617, "loss": 2.2085, "step": 350 }, { "epoch": 0.26, "grad_norm": 0.10302734375, "learning_rate": 0.0003532338308457712, "loss": 2.2165, "step": 355 }, { "epoch": 0.27, "grad_norm": 0.109375, "learning_rate": 0.0003582089552238806, "loss": 2.2196, "step": 360 }, { "epoch": 0.27, "grad_norm": 0.103515625, "learning_rate": 0.0003631840796019901, "loss": 2.212, "step": 365 }, { "epoch": 0.28, "grad_norm": 0.1044921875, "learning_rate": 0.00036815920398009953, "loss": 2.2246, "step": 370 }, { "epoch": 0.28, "grad_norm": 0.10009765625, "learning_rate": 0.00037313432835820896, "loss": 2.2166, "step": 375 }, { "epoch": 0.28, "grad_norm": 0.10498046875, "learning_rate": 0.00037810945273631844, "loss": 2.2207, "step": 380 }, { "epoch": 0.29, "grad_norm": 0.10693359375, "learning_rate": 0.00038308457711442787, "loss": 2.2283, "step": 385 }, { "epoch": 0.29, "grad_norm": 0.09765625, "learning_rate": 0.00038805970149253735, "loss": 2.2168, "step": 390 }, { "epoch": 0.29, "grad_norm": 0.0986328125, "learning_rate": 0.0003930348258706468, "loss": 2.2358, "step": 395 }, { "epoch": 0.3, "grad_norm": 0.1015625, "learning_rate": 0.00039800995024875626, "loss": 2.2289, "step": 400 }, { "epoch": 0.3, "grad_norm": 0.099609375, "learning_rate": 0.00039999932141401753, "loss": 2.2386, "step": 405 }, { "epoch": 0.31, "grad_norm": 0.1015625, "learning_rate": 0.000399995174516356, "loss": 2.2128, "step": 410 }, { "epoch": 0.31, "grad_norm": 0.09765625, "learning_rate": 0.00039998725779131805, "loss": 2.2145, "step": 415 }, { "epoch": 0.31, "grad_norm": 0.10107421875, "learning_rate": 0.0003999755713881306, "loss": 2.2056, "step": 420 }, { "epoch": 0.32, "grad_norm": 0.099609375, "learning_rate": 0.0003999601155270777, "loss": 2.2149, "step": 425 }, { "epoch": 0.32, "grad_norm": 0.099609375, "learning_rate": 0.00039994089049949597, "loss": 2.2165, "step": 430 }, { "epoch": 0.32, "grad_norm": 0.10107421875, "learning_rate": 0.0003999178966677693, "loss": 2.2031, "step": 435 }, { "epoch": 0.33, "grad_norm": 0.1025390625, "learning_rate": 0.00039989113446532205, "loss": 2.2134, "step": 440 }, { "epoch": 0.33, "grad_norm": 0.1025390625, "learning_rate": 0.0003998606043966108, "loss": 2.1896, "step": 445 }, { "epoch": 0.34, "grad_norm": 0.10400390625, "learning_rate": 0.00039982630703711496, "loss": 2.2205, "step": 450 }, { "epoch": 0.34, "grad_norm": 0.1005859375, "learning_rate": 0.0003997882430333256, "loss": 2.2203, "step": 455 }, { "epoch": 0.34, "grad_norm": 0.09912109375, "learning_rate": 0.00039974641310273386, "loss": 2.2134, "step": 460 }, { "epoch": 0.35, "grad_norm": 0.099609375, "learning_rate": 0.0003997008180338166, "loss": 2.2344, "step": 465 }, { "epoch": 0.35, "grad_norm": 0.10302734375, "learning_rate": 0.00039965145868602243, "loss": 2.1976, "step": 470 }, { "epoch": 0.35, "grad_norm": 0.09716796875, "learning_rate": 0.0003995983359897548, "loss": 2.2175, "step": 475 }, { "epoch": 0.36, "grad_norm": 0.0947265625, "learning_rate": 0.000399541450946355, "loss": 2.2145, "step": 480 }, { "epoch": 0.36, "grad_norm": 0.09619140625, "learning_rate": 0.00039948080462808266, "loss": 2.2186, "step": 485 }, { "epoch": 0.37, "grad_norm": 0.10009765625, "learning_rate": 0.0003994163981780963, "loss": 2.1911, "step": 490 }, { "epoch": 0.37, "grad_norm": 0.09716796875, "learning_rate": 0.00039934823281043103, "loss": 2.2123, "step": 495 }, { "epoch": 0.37, "grad_norm": 0.09716796875, "learning_rate": 0.00039927630980997634, "loss": 2.2019, "step": 500 }, { "epoch": 0.38, "grad_norm": 0.09814453125, "learning_rate": 0.00039920063053245145, "loss": 2.2251, "step": 505 }, { "epoch": 0.38, "grad_norm": 0.10009765625, "learning_rate": 0.00039912119640437963, "loss": 2.2116, "step": 510 }, { "epoch": 0.38, "grad_norm": 0.1005859375, "learning_rate": 0.00039903800892306194, "loss": 2.2142, "step": 515 }, { "epoch": 0.39, "grad_norm": 0.0986328125, "learning_rate": 0.00039895106965654836, "loss": 2.2006, "step": 520 }, { "epoch": 0.39, "grad_norm": 0.09765625, "learning_rate": 0.0003988603802436086, "loss": 2.2168, "step": 525 }, { "epoch": 0.4, "grad_norm": 0.10205078125, "learning_rate": 0.0003987659423937011, "loss": 2.2054, "step": 530 }, { "epoch": 0.4, "grad_norm": 0.1044921875, "learning_rate": 0.0003986677578869407, "loss": 2.2115, "step": 535 }, { "epoch": 0.4, "grad_norm": 0.0986328125, "learning_rate": 0.00039856582857406524, "loss": 2.1906, "step": 540 }, { "epoch": 0.41, "grad_norm": 0.0986328125, "learning_rate": 0.0003984601563764007, "loss": 2.2139, "step": 545 }, { "epoch": 0.41, "grad_norm": 0.10302734375, "learning_rate": 0.0003983507432858249, "loss": 2.1912, "step": 550 }, { "epoch": 0.41, "grad_norm": 0.09765625, "learning_rate": 0.0003982375913647298, "loss": 2.2201, "step": 555 }, { "epoch": 0.42, "grad_norm": 0.0966796875, "learning_rate": 0.000398120702745983, "loss": 2.1941, "step": 560 }, { "epoch": 0.42, "grad_norm": 0.09716796875, "learning_rate": 0.0003980000796328872, "loss": 2.2206, "step": 565 }, { "epoch": 0.43, "grad_norm": 0.1005859375, "learning_rate": 0.0003978757242991389, "loss": 2.2062, "step": 570 }, { "epoch": 0.43, "grad_norm": 0.0986328125, "learning_rate": 0.00039774763908878525, "loss": 2.2098, "step": 575 }, { "epoch": 0.43, "grad_norm": 0.09912109375, "learning_rate": 0.0003976158264161802, "loss": 2.2109, "step": 580 }, { "epoch": 0.44, "grad_norm": 0.0986328125, "learning_rate": 0.0003974802887659389, "loss": 2.2148, "step": 585 }, { "epoch": 0.44, "grad_norm": 0.09765625, "learning_rate": 0.0003973410286928906, "loss": 2.199, "step": 590 }, { "epoch": 0.44, "grad_norm": 0.09765625, "learning_rate": 0.0003971980488220308, "loss": 2.2271, "step": 595 }, { "epoch": 0.45, "grad_norm": 0.103515625, "learning_rate": 0.0003970513518484718, "loss": 2.2221, "step": 600 }, { "epoch": 0.45, "grad_norm": 0.09912109375, "learning_rate": 0.00039690094053739157, "loss": 2.1961, "step": 605 }, { "epoch": 0.46, "grad_norm": 0.1015625, "learning_rate": 0.0003967468177239819, "loss": 2.2078, "step": 610 }, { "epoch": 0.46, "grad_norm": 0.10205078125, "learning_rate": 0.00039658898631339496, "loss": 2.2026, "step": 615 }, { "epoch": 0.46, "grad_norm": 0.099609375, "learning_rate": 0.0003964274492806883, "loss": 2.2094, "step": 620 }, { "epoch": 0.47, "grad_norm": 0.09814453125, "learning_rate": 0.00039626220967076917, "loss": 2.2022, "step": 625 }, { "epoch": 0.47, "grad_norm": 0.0986328125, "learning_rate": 0.0003960932705983365, "loss": 2.1984, "step": 630 }, { "epoch": 0.47, "grad_norm": 0.10009765625, "learning_rate": 0.00039592063524782306, "loss": 2.1981, "step": 635 }, { "epoch": 0.48, "grad_norm": 0.1044921875, "learning_rate": 0.00039574430687333464, "loss": 2.2084, "step": 640 }, { "epoch": 0.48, "grad_norm": 0.10546875, "learning_rate": 0.00039556428879858904, "loss": 2.1912, "step": 645 }, { "epoch": 0.48, "grad_norm": 0.1005859375, "learning_rate": 0.00039538058441685353, "loss": 2.1871, "step": 650 }, { "epoch": 0.49, "grad_norm": 0.09912109375, "learning_rate": 0.0003951931971908807, "loss": 2.187, "step": 655 }, { "epoch": 0.49, "grad_norm": 0.0966796875, "learning_rate": 0.0003950021306528432, "loss": 2.1915, "step": 660 }, { "epoch": 0.5, "grad_norm": 0.09814453125, "learning_rate": 0.0003948073884042673, "loss": 2.1892, "step": 665 }, { "epoch": 0.5, "grad_norm": 0.09912109375, "learning_rate": 0.00039460897411596477, "loss": 2.2194, "step": 670 }, { "epoch": 0.5, "grad_norm": 0.10107421875, "learning_rate": 0.00039440689152796406, "loss": 2.2103, "step": 675 }, { "epoch": 0.51, "grad_norm": 0.10107421875, "learning_rate": 0.00039420114444943934, "loss": 2.2032, "step": 680 }, { "epoch": 0.51, "grad_norm": 0.1005859375, "learning_rate": 0.0003939917367586391, "loss": 2.1989, "step": 685 }, { "epoch": 0.51, "grad_norm": 0.1005859375, "learning_rate": 0.00039377867240281275, "loss": 2.1929, "step": 690 }, { "epoch": 0.52, "grad_norm": 0.1015625, "learning_rate": 0.0003935619553981364, "loss": 2.1961, "step": 695 }, { "epoch": 0.52, "grad_norm": 0.09912109375, "learning_rate": 0.0003933415898296372, "loss": 2.1941, "step": 700 }, { "epoch": 0.53, "grad_norm": 0.10009765625, "learning_rate": 0.000393117579851116, "loss": 2.1983, "step": 705 }, { "epoch": 0.53, "grad_norm": 0.10107421875, "learning_rate": 0.0003928899296850695, "loss": 2.1912, "step": 710 }, { "epoch": 0.53, "grad_norm": 0.10107421875, "learning_rate": 0.0003926586436226103, "loss": 2.2096, "step": 715 }, { "epoch": 0.54, "grad_norm": 0.10400390625, "learning_rate": 0.0003924237260233863, "loss": 2.2007, "step": 720 }, { "epoch": 0.54, "grad_norm": 0.1025390625, "learning_rate": 0.0003921851813154983, "loss": 2.2171, "step": 725 }, { "epoch": 0.54, "grad_norm": 0.10205078125, "learning_rate": 0.0003919430139954167, "loss": 2.2002, "step": 730 }, { "epoch": 0.55, "grad_norm": 0.10009765625, "learning_rate": 0.00039169722862789644, "loss": 2.1913, "step": 735 }, { "epoch": 0.55, "grad_norm": 0.1005859375, "learning_rate": 0.0003914478298458916, "loss": 2.1765, "step": 740 }, { "epoch": 0.56, "grad_norm": 0.099609375, "learning_rate": 0.00039119482235046716, "loss": 2.1971, "step": 745 }, { "epoch": 0.56, "grad_norm": 0.10205078125, "learning_rate": 0.00039093821091071117, "loss": 2.1978, "step": 750 }, { "epoch": 0.56, "grad_norm": 0.1005859375, "learning_rate": 0.00039067800036364443, "loss": 2.1954, "step": 755 }, { "epoch": 0.57, "grad_norm": 0.10205078125, "learning_rate": 0.0003904141956141297, "loss": 2.2025, "step": 760 }, { "epoch": 0.57, "grad_norm": 0.1015625, "learning_rate": 0.0003901468016347786, "loss": 2.2045, "step": 765 }, { "epoch": 0.57, "grad_norm": 0.103515625, "learning_rate": 0.00038987582346585847, "loss": 2.1939, "step": 770 }, { "epoch": 0.58, "grad_norm": 0.1025390625, "learning_rate": 0.0003896012662151972, "loss": 2.175, "step": 775 }, { "epoch": 0.58, "grad_norm": 0.099609375, "learning_rate": 0.00038932313505808685, "loss": 2.185, "step": 780 }, { "epoch": 0.59, "grad_norm": 0.10107421875, "learning_rate": 0.00038904143523718615, "loss": 2.1939, "step": 785 }, { "epoch": 0.59, "grad_norm": 0.099609375, "learning_rate": 0.00038875617206242174, "loss": 2.1634, "step": 790 }, { "epoch": 0.59, "grad_norm": 0.10205078125, "learning_rate": 0.0003884673509108879, "loss": 2.1947, "step": 795 }, { "epoch": 0.6, "grad_norm": 0.10107421875, "learning_rate": 0.00038817497722674546, "loss": 2.1925, "step": 800 }, { "epoch": 0.6, "grad_norm": 0.1015625, "learning_rate": 0.000387879056521119, "loss": 2.1809, "step": 805 }, { "epoch": 0.6, "grad_norm": 0.099609375, "learning_rate": 0.0003875795943719929, "loss": 2.1763, "step": 810 }, { "epoch": 0.61, "grad_norm": 0.1025390625, "learning_rate": 0.00038727659642410654, "loss": 2.2132, "step": 815 }, { "epoch": 0.61, "grad_norm": 0.10205078125, "learning_rate": 0.0003869700683888474, "loss": 2.194, "step": 820 }, { "epoch": 0.62, "grad_norm": 0.10107421875, "learning_rate": 0.0003866600160441438, "loss": 2.1861, "step": 825 }, { "epoch": 0.62, "grad_norm": 0.1005859375, "learning_rate": 0.00038634644523435587, "loss": 2.2093, "step": 830 }, { "epoch": 0.62, "grad_norm": 0.1005859375, "learning_rate": 0.0003860293618701653, "loss": 2.1894, "step": 835 }, { "epoch": 0.63, "grad_norm": 0.09912109375, "learning_rate": 0.0003857087719284641, "loss": 2.2219, "step": 840 }, { "epoch": 0.63, "grad_norm": 0.10009765625, "learning_rate": 0.00038538468145224165, "loss": 2.1982, "step": 845 }, { "epoch": 0.63, "grad_norm": 0.10205078125, "learning_rate": 0.00038505709655047113, "loss": 2.1954, "step": 850 }, { "epoch": 0.64, "grad_norm": 0.1005859375, "learning_rate": 0.00038472602339799427, "loss": 2.2064, "step": 855 }, { "epoch": 0.64, "grad_norm": 0.10107421875, "learning_rate": 0.0003843914682354047, "loss": 2.2166, "step": 860 }, { "epoch": 0.65, "grad_norm": 0.09912109375, "learning_rate": 0.00038405343736893065, "loss": 2.2001, "step": 865 }, { "epoch": 0.65, "grad_norm": 0.1015625, "learning_rate": 0.000383711937170316, "loss": 2.1765, "step": 870 }, { "epoch": 0.65, "grad_norm": 0.10302734375, "learning_rate": 0.00038336697407669994, "loss": 2.1826, "step": 875 }, { "epoch": 0.66, "grad_norm": 0.10302734375, "learning_rate": 0.000383018554590496, "loss": 2.1964, "step": 880 }, { "epoch": 0.66, "grad_norm": 0.09912109375, "learning_rate": 0.0003826666852792692, "loss": 2.1954, "step": 885 }, { "epoch": 0.66, "grad_norm": 0.1044921875, "learning_rate": 0.00038231137277561244, "loss": 2.2015, "step": 890 }, { "epoch": 0.67, "grad_norm": 0.1025390625, "learning_rate": 0.0003819526237770212, "loss": 2.1932, "step": 895 }, { "epoch": 0.67, "grad_norm": 0.1025390625, "learning_rate": 0.0003815904450457677, "loss": 2.1906, "step": 900 }, { "epoch": 0.68, "grad_norm": 0.10205078125, "learning_rate": 0.0003812248434087732, "loss": 2.1776, "step": 905 }, { "epoch": 0.68, "grad_norm": 0.10107421875, "learning_rate": 0.00038085582575747914, "loss": 2.1936, "step": 910 }, { "epoch": 0.68, "grad_norm": 0.1015625, "learning_rate": 0.0003804833990477177, "loss": 2.1819, "step": 915 }, { "epoch": 0.69, "grad_norm": 0.10107421875, "learning_rate": 0.00038010757029958016, "loss": 2.1918, "step": 920 }, { "epoch": 0.69, "grad_norm": 0.10498046875, "learning_rate": 0.0003797283465972851, "loss": 2.1776, "step": 925 }, { "epoch": 0.69, "grad_norm": 0.10302734375, "learning_rate": 0.0003793457350890443, "loss": 2.1786, "step": 930 }, { "epoch": 0.7, "grad_norm": 0.1015625, "learning_rate": 0.0003789597429869286, "loss": 2.2086, "step": 935 }, { "epoch": 0.7, "grad_norm": 0.10400390625, "learning_rate": 0.0003785703775667314, "loss": 2.2138, "step": 940 }, { "epoch": 0.7, "grad_norm": 0.099609375, "learning_rate": 0.00037817764616783196, "loss": 2.2123, "step": 945 }, { "epoch": 0.71, "grad_norm": 0.1025390625, "learning_rate": 0.0003777815561930568, "loss": 2.1889, "step": 950 }, { "epoch": 0.71, "grad_norm": 0.10205078125, "learning_rate": 0.0003773821151085401, "loss": 2.1946, "step": 955 }, { "epoch": 0.72, "grad_norm": 0.1025390625, "learning_rate": 0.00037697933044358335, "loss": 2.1688, "step": 960 }, { "epoch": 0.72, "grad_norm": 0.10302734375, "learning_rate": 0.0003765732097905129, "loss": 2.1795, "step": 965 }, { "epoch": 0.72, "grad_norm": 0.1005859375, "learning_rate": 0.00037616376080453737, "loss": 2.1966, "step": 970 }, { "epoch": 0.73, "grad_norm": 0.10107421875, "learning_rate": 0.0003757509912036028, "loss": 2.1901, "step": 975 }, { "epoch": 0.73, "grad_norm": 0.103515625, "learning_rate": 0.0003753349087682477, "loss": 2.1685, "step": 980 }, { "epoch": 0.73, "grad_norm": 0.1015625, "learning_rate": 0.000374915521341456, "loss": 2.1923, "step": 985 }, { "epoch": 0.74, "grad_norm": 0.10205078125, "learning_rate": 0.00037449283682850957, "loss": 2.1995, "step": 990 }, { "epoch": 0.74, "grad_norm": 0.10400390625, "learning_rate": 0.00037406686319683887, "loss": 2.1921, "step": 995 }, { "epoch": 0.75, "grad_norm": 0.10546875, "learning_rate": 0.00037363760847587284, "loss": 2.178, "step": 1000 }, { "epoch": 0.75, "grad_norm": 0.10400390625, "learning_rate": 0.00037320508075688776, "loss": 2.1711, "step": 1005 }, { "epoch": 0.75, "grad_norm": 0.10205078125, "learning_rate": 0.00037276928819285446, "loss": 2.1825, "step": 1010 }, { "epoch": 0.76, "grad_norm": 0.1015625, "learning_rate": 0.0003723302389982849, "loss": 2.1925, "step": 1015 }, { "epoch": 0.76, "grad_norm": 0.1044921875, "learning_rate": 0.0003718879414490771, "loss": 2.1758, "step": 1020 }, { "epoch": 0.76, "grad_norm": 0.10205078125, "learning_rate": 0.0003714424038823592, "loss": 2.1837, "step": 1025 }, { "epoch": 0.77, "grad_norm": 0.10302734375, "learning_rate": 0.00037099363469633245, "loss": 2.1914, "step": 1030 }, { "epoch": 0.77, "grad_norm": 0.1044921875, "learning_rate": 0.0003705416423501128, "loss": 2.1667, "step": 1035 }, { "epoch": 0.78, "grad_norm": 0.1015625, "learning_rate": 0.0003700864353635714, "loss": 2.1911, "step": 1040 }, { "epoch": 0.78, "grad_norm": 0.10205078125, "learning_rate": 0.00036962802231717403, "loss": 2.1867, "step": 1045 }, { "epoch": 0.78, "grad_norm": 0.10498046875, "learning_rate": 0.0003691664118518195, "loss": 2.1717, "step": 1050 }, { "epoch": 0.79, "grad_norm": 0.10302734375, "learning_rate": 0.0003687016126686765, "loss": 2.1794, "step": 1055 }, { "epoch": 0.79, "grad_norm": 0.1044921875, "learning_rate": 0.00036823363352901997, "loss": 2.2012, "step": 1060 }, { "epoch": 0.79, "grad_norm": 0.1025390625, "learning_rate": 0.0003677624832540655, "loss": 2.1957, "step": 1065 }, { "epoch": 0.8, "grad_norm": 0.10888671875, "learning_rate": 0.0003672881707248034, "loss": 2.1762, "step": 1070 }, { "epoch": 0.8, "grad_norm": 0.1025390625, "learning_rate": 0.0003668107048818312, "loss": 2.2005, "step": 1075 }, { "epoch": 0.81, "grad_norm": 0.10693359375, "learning_rate": 0.0003663300947251851, "loss": 2.2066, "step": 1080 }, { "epoch": 0.81, "grad_norm": 0.10302734375, "learning_rate": 0.0003658463493141703, "loss": 2.1813, "step": 1085 }, { "epoch": 0.81, "grad_norm": 0.10498046875, "learning_rate": 0.00036535947776719017, "loss": 2.1659, "step": 1090 }, { "epoch": 0.82, "grad_norm": 0.103515625, "learning_rate": 0.0003648694892615747, "loss": 2.191, "step": 1095 }, { "epoch": 0.82, "grad_norm": 0.1025390625, "learning_rate": 0.0003643763930334071, "loss": 2.1706, "step": 1100 }, { "epoch": 0.82, "grad_norm": 0.10546875, "learning_rate": 0.00036388019837734994, "loss": 2.1885, "step": 1105 }, { "epoch": 0.83, "grad_norm": 0.1015625, "learning_rate": 0.00036338091464646984, "loss": 2.1711, "step": 1110 }, { "epoch": 0.83, "grad_norm": 0.10400390625, "learning_rate": 0.0003628785512520613, "loss": 2.1687, "step": 1115 }, { "epoch": 0.84, "grad_norm": 0.1015625, "learning_rate": 0.0003623731176634691, "loss": 2.2004, "step": 1120 }, { "epoch": 0.84, "grad_norm": 0.1025390625, "learning_rate": 0.00036186462340791014, "loss": 2.188, "step": 1125 }, { "epoch": 0.84, "grad_norm": 0.1025390625, "learning_rate": 0.0003613530780702934, "loss": 2.1822, "step": 1130 }, { "epoch": 0.85, "grad_norm": 0.10400390625, "learning_rate": 0.00036083849129303966, "loss": 2.1848, "step": 1135 }, { "epoch": 0.85, "grad_norm": 0.10205078125, "learning_rate": 0.0003603208727758995, "loss": 2.1607, "step": 1140 }, { "epoch": 0.85, "grad_norm": 0.1025390625, "learning_rate": 0.00035980023227577063, "loss": 2.1863, "step": 1145 }, { "epoch": 0.86, "grad_norm": 0.10302734375, "learning_rate": 0.00035927657960651394, "loss": 2.1711, "step": 1150 }, { "epoch": 0.86, "grad_norm": 0.10205078125, "learning_rate": 0.0003587499246387684, "loss": 2.1806, "step": 1155 }, { "epoch": 0.87, "grad_norm": 0.10693359375, "learning_rate": 0.00035822027729976504, "loss": 2.1735, "step": 1160 }, { "epoch": 0.87, "grad_norm": 0.1044921875, "learning_rate": 0.00035768764757314, "loss": 2.1989, "step": 1165 }, { "epoch": 0.87, "grad_norm": 0.103515625, "learning_rate": 0.00035715204549874617, "loss": 2.1728, "step": 1170 }, { "epoch": 0.88, "grad_norm": 0.10302734375, "learning_rate": 0.0003566134811724639, "loss": 2.1933, "step": 1175 }, { "epoch": 0.88, "grad_norm": 0.10302734375, "learning_rate": 0.00035607196474601074, "loss": 2.1886, "step": 1180 }, { "epoch": 0.88, "grad_norm": 0.10205078125, "learning_rate": 0.00035552750642675043, "loss": 2.1829, "step": 1185 }, { "epoch": 0.89, "grad_norm": 0.1044921875, "learning_rate": 0.00035498011647749976, "loss": 2.1755, "step": 1190 }, { "epoch": 0.89, "grad_norm": 0.10302734375, "learning_rate": 0.00035442980521633595, "loss": 2.2011, "step": 1195 }, { "epoch": 0.9, "grad_norm": 0.1015625, "learning_rate": 0.00035387658301640136, "loss": 2.2043, "step": 1200 }, { "epoch": 0.9, "grad_norm": 0.10302734375, "learning_rate": 0.0003533204603057088, "loss": 2.1782, "step": 1205 }, { "epoch": 0.9, "grad_norm": 0.10498046875, "learning_rate": 0.00035276144756694406, "loss": 2.179, "step": 1210 }, { "epoch": 0.91, "grad_norm": 0.10205078125, "learning_rate": 0.00035219955533726915, "loss": 2.1841, "step": 1215 }, { "epoch": 0.91, "grad_norm": 0.10498046875, "learning_rate": 0.0003516347942081232, "loss": 2.1646, "step": 1220 }, { "epoch": 0.91, "grad_norm": 0.10205078125, "learning_rate": 0.00035106717482502267, "loss": 2.1878, "step": 1225 }, { "epoch": 0.92, "grad_norm": 0.10546875, "learning_rate": 0.0003504967078873613, "loss": 2.1753, "step": 1230 }, { "epoch": 0.92, "grad_norm": 0.103515625, "learning_rate": 0.000349923404148208, "loss": 2.1854, "step": 1235 }, { "epoch": 0.93, "grad_norm": 0.1044921875, "learning_rate": 0.0003493472744141041, "loss": 2.1878, "step": 1240 }, { "epoch": 0.93, "grad_norm": 0.10498046875, "learning_rate": 0.0003487683295448598, "loss": 2.1675, "step": 1245 }, { "epoch": 0.93, "grad_norm": 0.103515625, "learning_rate": 0.0003481865804533494, "loss": 2.1902, "step": 1250 }, { "epoch": 0.94, "grad_norm": 0.10546875, "learning_rate": 0.00034760203810530594, "loss": 2.1848, "step": 1255 }, { "epoch": 0.94, "grad_norm": 0.103515625, "learning_rate": 0.00034701471351911395, "loss": 2.1638, "step": 1260 }, { "epoch": 0.94, "grad_norm": 0.10546875, "learning_rate": 0.000346424617765602, "loss": 2.1763, "step": 1265 }, { "epoch": 0.95, "grad_norm": 0.1025390625, "learning_rate": 0.000345831761967834, "loss": 2.165, "step": 1270 }, { "epoch": 0.95, "grad_norm": 0.1044921875, "learning_rate": 0.00034523615730089986, "loss": 2.1875, "step": 1275 }, { "epoch": 0.95, "grad_norm": 0.10205078125, "learning_rate": 0.0003446378149917042, "loss": 2.1595, "step": 1280 }, { "epoch": 0.96, "grad_norm": 0.107421875, "learning_rate": 0.0003440367463187553, "loss": 2.1841, "step": 1285 }, { "epoch": 0.96, "grad_norm": 0.10546875, "learning_rate": 0.00034343296261195224, "loss": 2.1882, "step": 1290 }, { "epoch": 0.97, "grad_norm": 0.10595703125, "learning_rate": 0.0003428264752523712, "loss": 2.1671, "step": 1295 }, { "epoch": 0.97, "grad_norm": 0.10595703125, "learning_rate": 0.0003422172956720514, "loss": 2.1671, "step": 1300 }, { "epoch": 0.97, "grad_norm": 0.103515625, "learning_rate": 0.00034160543535377926, "loss": 2.1607, "step": 1305 }, { "epoch": 0.98, "grad_norm": 0.1083984375, "learning_rate": 0.0003409909058308718, "loss": 2.1827, "step": 1310 }, { "epoch": 0.98, "grad_norm": 0.10595703125, "learning_rate": 0.0003403737186869596, "loss": 2.183, "step": 1315 }, { "epoch": 0.98, "grad_norm": 0.10205078125, "learning_rate": 0.00033975388555576835, "loss": 2.1692, "step": 1320 }, { "epoch": 0.99, "grad_norm": 0.1064453125, "learning_rate": 0.0003391314181208995, "loss": 2.1786, "step": 1325 }, { "epoch": 0.99, "grad_norm": 0.10546875, "learning_rate": 0.00033850632811561, "loss": 2.166, "step": 1330 }, { "epoch": 1.0, "grad_norm": 0.1044921875, "learning_rate": 0.00033787862732259123, "loss": 2.1767, "step": 1335 }, { "epoch": 1.0, "grad_norm": 0.10986328125, "learning_rate": 0.0003372483275737468, "loss": 2.1729, "step": 1340 }, { "epoch": 1.0, "eval_loss": 2.1778452396392822, "eval_runtime": 189.5201, "eval_samples_per_second": 25.443, "eval_steps_per_second": 3.182, "step": 1340 }, { "epoch": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0003366154407499695, "loss": 2.1415, "step": 1345 }, { "epoch": 1.01, "grad_norm": 0.10595703125, "learning_rate": 0.0003359799787809179, "loss": 2.1377, "step": 1350 }, { "epoch": 1.01, "grad_norm": 0.1103515625, "learning_rate": 0.0003353419536447902, "loss": 2.1349, "step": 1355 }, { "epoch": 1.01, "grad_norm": 0.1123046875, "learning_rate": 0.00033470137736809995, "loss": 2.1339, "step": 1360 }, { "epoch": 1.02, "grad_norm": 0.11328125, "learning_rate": 0.0003340582620254484, "loss": 2.1364, "step": 1365 }, { "epoch": 1.02, "grad_norm": 0.107421875, "learning_rate": 0.000333412619739297, "loss": 2.1484, "step": 1370 }, { "epoch": 1.03, "grad_norm": 0.10888671875, "learning_rate": 0.0003327644626797394, "loss": 2.1753, "step": 1375 }, { "epoch": 1.03, "grad_norm": 0.107421875, "learning_rate": 0.0003321138030642714, "loss": 2.165, "step": 1380 }, { "epoch": 1.03, "grad_norm": 0.1103515625, "learning_rate": 0.00033146065315756113, "loss": 2.1622, "step": 1385 }, { "epoch": 1.04, "grad_norm": 0.1123046875, "learning_rate": 0.00033080502527121756, "loss": 2.1704, "step": 1390 }, { "epoch": 1.04, "grad_norm": 0.111328125, "learning_rate": 0.0003301469317635587, "loss": 2.162, "step": 1395 }, { "epoch": 1.04, "grad_norm": 0.11181640625, "learning_rate": 0.00032948638503937846, "loss": 2.1461, "step": 1400 }, { "epoch": 1.05, "grad_norm": 0.10693359375, "learning_rate": 0.0003288233975497128, "loss": 2.1515, "step": 1405 }, { "epoch": 1.05, "grad_norm": 0.10986328125, "learning_rate": 0.00032815798179160524, "loss": 2.1306, "step": 1410 }, { "epoch": 1.06, "grad_norm": 0.11083984375, "learning_rate": 0.0003274901503078711, "loss": 2.1343, "step": 1415 }, { "epoch": 1.06, "grad_norm": 0.11328125, "learning_rate": 0.0003268199156868612, "loss": 2.1235, "step": 1420 }, { "epoch": 1.06, "grad_norm": 0.11083984375, "learning_rate": 0.0003261472905622244, "loss": 2.1496, "step": 1425 }, { "epoch": 1.07, "grad_norm": 0.1103515625, "learning_rate": 0.0003254722876126697, "loss": 2.1503, "step": 1430 }, { "epoch": 1.07, "grad_norm": 0.11572265625, "learning_rate": 0.00032479491956172705, "loss": 2.1566, "step": 1435 }, { "epoch": 1.07, "grad_norm": 0.10986328125, "learning_rate": 0.0003241151991775076, "loss": 2.1638, "step": 1440 }, { "epoch": 1.08, "grad_norm": 0.11279296875, "learning_rate": 0.0003234331392724631, "loss": 2.1497, "step": 1445 }, { "epoch": 1.08, "grad_norm": 0.11376953125, "learning_rate": 0.00032274875270314426, "loss": 2.1478, "step": 1450 }, { "epoch": 1.09, "grad_norm": 0.1103515625, "learning_rate": 0.00032206205236995843, "loss": 2.149, "step": 1455 }, { "epoch": 1.09, "grad_norm": 0.11181640625, "learning_rate": 0.00032137305121692655, "loss": 2.1514, "step": 1460 }, { "epoch": 1.09, "grad_norm": 0.1123046875, "learning_rate": 0.00032068176223143884, "loss": 2.1817, "step": 1465 }, { "epoch": 1.1, "grad_norm": 0.11083984375, "learning_rate": 0.0003199881984440106, "loss": 2.1721, "step": 1470 }, { "epoch": 1.1, "grad_norm": 0.111328125, "learning_rate": 0.000319292372928036, "loss": 2.1319, "step": 1475 }, { "epoch": 1.1, "grad_norm": 0.11181640625, "learning_rate": 0.0003185942987995418, "loss": 2.15, "step": 1480 }, { "epoch": 1.11, "grad_norm": 0.11279296875, "learning_rate": 0.0003178939892169403, "loss": 2.1696, "step": 1485 }, { "epoch": 1.11, "grad_norm": 0.11376953125, "learning_rate": 0.0003171914573807813, "loss": 2.1567, "step": 1490 }, { "epoch": 1.12, "grad_norm": 0.115234375, "learning_rate": 0.0003164867165335029, "loss": 2.1454, "step": 1495 }, { "epoch": 1.12, "grad_norm": 0.11474609375, "learning_rate": 0.0003157797799591823, "loss": 2.1482, "step": 1500 }, { "epoch": 1.12, "grad_norm": 0.11328125, "learning_rate": 0.0003150706609832854, "loss": 2.1297, "step": 1505 }, { "epoch": 1.13, "grad_norm": 0.11474609375, "learning_rate": 0.00031435937297241527, "loss": 2.1142, "step": 1510 }, { "epoch": 1.13, "grad_norm": 0.1142578125, "learning_rate": 0.0003136459293340605, "loss": 2.1509, "step": 1515 }, { "epoch": 1.13, "grad_norm": 0.11376953125, "learning_rate": 0.00031293034351634227, "loss": 2.1382, "step": 1520 }, { "epoch": 1.14, "grad_norm": 0.115234375, "learning_rate": 0.00031221262900776116, "loss": 2.1552, "step": 1525 }, { "epoch": 1.14, "grad_norm": 0.11328125, "learning_rate": 0.0003114927993369425, "loss": 2.1434, "step": 1530 }, { "epoch": 1.15, "grad_norm": 0.1123046875, "learning_rate": 0.00031077086807238175, "loss": 2.1364, "step": 1535 }, { "epoch": 1.15, "grad_norm": 0.11279296875, "learning_rate": 0.00031004684882218845, "loss": 2.1569, "step": 1540 }, { "epoch": 1.15, "grad_norm": 0.11376953125, "learning_rate": 0.0003093207552338298, "loss": 2.1756, "step": 1545 }, { "epoch": 1.16, "grad_norm": 0.11474609375, "learning_rate": 0.0003085926009938735, "loss": 2.1448, "step": 1550 }, { "epoch": 1.16, "grad_norm": 0.115234375, "learning_rate": 0.0003078623998277296, "loss": 2.1403, "step": 1555 }, { "epoch": 1.16, "grad_norm": 0.11376953125, "learning_rate": 0.0003071301654993919, "loss": 2.1391, "step": 1560 }, { "epoch": 1.17, "grad_norm": 0.1162109375, "learning_rate": 0.0003063959118111785, "loss": 2.154, "step": 1565 }, { "epoch": 1.17, "grad_norm": 0.11572265625, "learning_rate": 0.0003056596526034717, "loss": 2.1494, "step": 1570 }, { "epoch": 1.17, "grad_norm": 0.1142578125, "learning_rate": 0.0003049214017544569, "loss": 2.1319, "step": 1575 }, { "epoch": 1.18, "grad_norm": 0.12060546875, "learning_rate": 0.0003041811731798611, "loss": 2.1384, "step": 1580 }, { "epoch": 1.18, "grad_norm": 0.115234375, "learning_rate": 0.0003034389808326907, "loss": 2.1642, "step": 1585 }, { "epoch": 1.19, "grad_norm": 0.1142578125, "learning_rate": 0.0003026948387029684, "loss": 2.1557, "step": 1590 }, { "epoch": 1.19, "grad_norm": 0.11376953125, "learning_rate": 0.0003019487608174695, "loss": 2.1392, "step": 1595 }, { "epoch": 1.19, "grad_norm": 0.11474609375, "learning_rate": 0.0003012007612394575, "loss": 2.1431, "step": 1600 }, { "epoch": 1.2, "grad_norm": 0.11474609375, "learning_rate": 0.000300450854068419, "loss": 2.1507, "step": 1605 }, { "epoch": 1.2, "grad_norm": 0.11474609375, "learning_rate": 0.000299699053439798, "loss": 2.147, "step": 1610 }, { "epoch": 1.2, "grad_norm": 0.11474609375, "learning_rate": 0.00029894537352472927, "loss": 2.1361, "step": 1615 }, { "epoch": 1.21, "grad_norm": 0.11572265625, "learning_rate": 0.00029818982852977157, "loss": 2.1514, "step": 1620 }, { "epoch": 1.21, "grad_norm": 0.12451171875, "learning_rate": 0.00029743243269663957, "loss": 2.1597, "step": 1625 }, { "epoch": 1.22, "grad_norm": 0.11474609375, "learning_rate": 0.0002966732003019353, "loss": 2.1449, "step": 1630 }, { "epoch": 1.22, "grad_norm": 0.1142578125, "learning_rate": 0.0002959121456568796, "loss": 2.1392, "step": 1635 }, { "epoch": 1.22, "grad_norm": 0.11474609375, "learning_rate": 0.00029514928310704164, "loss": 2.1396, "step": 1640 }, { "epoch": 1.23, "grad_norm": 0.11328125, "learning_rate": 0.000294384627032069, "loss": 2.1509, "step": 1645 }, { "epoch": 1.23, "grad_norm": 0.11376953125, "learning_rate": 0.0002936181918454164, "loss": 2.1538, "step": 1650 }, { "epoch": 1.23, "grad_norm": 0.11376953125, "learning_rate": 0.0002928499919940743, "loss": 2.1337, "step": 1655 }, { "epoch": 1.24, "grad_norm": 0.11669921875, "learning_rate": 0.0002920800419582961, "loss": 2.1475, "step": 1660 }, { "epoch": 1.24, "grad_norm": 0.1201171875, "learning_rate": 0.0002913083562513257, "loss": 2.1293, "step": 1665 }, { "epoch": 1.25, "grad_norm": 0.11474609375, "learning_rate": 0.0002905349494191235, "loss": 2.1539, "step": 1670 }, { "epoch": 1.25, "grad_norm": 0.11669921875, "learning_rate": 0.00028975983604009246, "loss": 2.1215, "step": 1675 }, { "epoch": 1.25, "grad_norm": 0.11669921875, "learning_rate": 0.0002889830307248033, "loss": 2.1578, "step": 1680 }, { "epoch": 1.26, "grad_norm": 0.11474609375, "learning_rate": 0.00028820454811571907, "loss": 2.1492, "step": 1685 }, { "epoch": 1.26, "grad_norm": 0.11474609375, "learning_rate": 0.0002874244028869191, "loss": 2.1536, "step": 1690 }, { "epoch": 1.26, "grad_norm": 0.11328125, "learning_rate": 0.0002866426097438222, "loss": 2.1584, "step": 1695 }, { "epoch": 1.27, "grad_norm": 0.1142578125, "learning_rate": 0.0002858591834229102, "loss": 2.163, "step": 1700 }, { "epoch": 1.27, "grad_norm": 0.1123046875, "learning_rate": 0.000285074138691449, "loss": 2.1423, "step": 1705 }, { "epoch": 1.28, "grad_norm": 0.1162109375, "learning_rate": 0.0002842874903472115, "loss": 2.1499, "step": 1710 }, { "epoch": 1.28, "grad_norm": 0.1162109375, "learning_rate": 0.00028349925321819776, "loss": 2.1552, "step": 1715 }, { "epoch": 1.28, "grad_norm": 0.11572265625, "learning_rate": 0.00028270944216235574, "loss": 2.1471, "step": 1720 }, { "epoch": 1.29, "grad_norm": 0.11767578125, "learning_rate": 0.0002819180720673013, "loss": 2.1362, "step": 1725 }, { "epoch": 1.29, "grad_norm": 0.1162109375, "learning_rate": 0.0002811251578500377, "loss": 2.173, "step": 1730 }, { "epoch": 1.29, "grad_norm": 0.11572265625, "learning_rate": 0.0002803307144566741, "loss": 2.1381, "step": 1735 }, { "epoch": 1.3, "grad_norm": 0.11376953125, "learning_rate": 0.00027953475686214404, "loss": 2.1409, "step": 1740 }, { "epoch": 1.3, "grad_norm": 0.115234375, "learning_rate": 0.0002787373000699232, "loss": 2.1486, "step": 1745 }, { "epoch": 1.31, "grad_norm": 0.11767578125, "learning_rate": 0.00027793835911174656, "loss": 2.1659, "step": 1750 }, { "epoch": 1.31, "grad_norm": 0.11474609375, "learning_rate": 0.00027713794904732483, "loss": 2.1639, "step": 1755 }, { "epoch": 1.31, "grad_norm": 0.1171875, "learning_rate": 0.00027633608496406103, "loss": 2.1667, "step": 1760 }, { "epoch": 1.32, "grad_norm": 0.1142578125, "learning_rate": 0.00027553278197676567, "loss": 2.1442, "step": 1765 }, { "epoch": 1.32, "grad_norm": 0.115234375, "learning_rate": 0.00027472805522737195, "loss": 2.117, "step": 1770 }, { "epoch": 1.32, "grad_norm": 0.11669921875, "learning_rate": 0.00027392191988465065, "loss": 2.1499, "step": 1775 }, { "epoch": 1.33, "grad_norm": 0.11474609375, "learning_rate": 0.0002731143911439237, "loss": 2.15, "step": 1780 }, { "epoch": 1.33, "grad_norm": 0.11328125, "learning_rate": 0.00027230548422677817, "loss": 2.1542, "step": 1785 }, { "epoch": 1.34, "grad_norm": 0.11767578125, "learning_rate": 0.0002714952143807792, "loss": 2.1437, "step": 1790 }, { "epoch": 1.34, "grad_norm": 0.1162109375, "learning_rate": 0.0002706835968791824, "loss": 2.1627, "step": 1795 }, { "epoch": 1.34, "grad_norm": 0.11669921875, "learning_rate": 0.0002698706470206464, "loss": 2.1453, "step": 1800 }, { "epoch": 1.35, "grad_norm": 0.11474609375, "learning_rate": 0.00026905638012894405, "loss": 2.1482, "step": 1805 }, { "epoch": 1.35, "grad_norm": 0.115234375, "learning_rate": 0.00026824081155267374, "loss": 2.1707, "step": 1810 }, { "epoch": 1.35, "grad_norm": 0.11669921875, "learning_rate": 0.00026742395666497015, "loss": 2.1583, "step": 1815 }, { "epoch": 1.36, "grad_norm": 0.11767578125, "learning_rate": 0.0002666058308632144, "loss": 2.1528, "step": 1820 }, { "epoch": 1.36, "grad_norm": 0.1171875, "learning_rate": 0.0002657864495687437, "loss": 2.1546, "step": 1825 }, { "epoch": 1.37, "grad_norm": 0.1162109375, "learning_rate": 0.00026496582822656094, "loss": 2.145, "step": 1830 }, { "epoch": 1.37, "grad_norm": 0.1162109375, "learning_rate": 0.00026414398230504335, "loss": 2.1581, "step": 1835 }, { "epoch": 1.37, "grad_norm": 0.11669921875, "learning_rate": 0.0002633209272956509, "loss": 2.1352, "step": 1840 }, { "epoch": 1.38, "grad_norm": 0.11572265625, "learning_rate": 0.0002624966787126345, "loss": 2.1661, "step": 1845 }, { "epoch": 1.38, "grad_norm": 0.11962890625, "learning_rate": 0.0002616712520927434, "loss": 2.15, "step": 1850 }, { "epoch": 1.38, "grad_norm": 0.11865234375, "learning_rate": 0.00026084466299493227, "loss": 2.1389, "step": 1855 }, { "epoch": 1.39, "grad_norm": 0.12109375, "learning_rate": 0.0002600169270000682, "loss": 2.1566, "step": 1860 }, { "epoch": 1.39, "grad_norm": 0.11865234375, "learning_rate": 0.0002591880597106365, "loss": 2.155, "step": 1865 }, { "epoch": 1.4, "grad_norm": 0.1171875, "learning_rate": 0.0002583580767504474, "loss": 2.1491, "step": 1870 }, { "epoch": 1.4, "grad_norm": 0.1171875, "learning_rate": 0.0002575269937643406, "loss": 2.1399, "step": 1875 }, { "epoch": 1.4, "grad_norm": 0.1162109375, "learning_rate": 0.00025669482641789106, "loss": 2.1316, "step": 1880 }, { "epoch": 1.41, "grad_norm": 0.1181640625, "learning_rate": 0.0002558615903971135, "loss": 2.1265, "step": 1885 }, { "epoch": 1.41, "grad_norm": 0.1181640625, "learning_rate": 0.00025502730140816666, "loss": 2.1384, "step": 1890 }, { "epoch": 1.41, "grad_norm": 0.1220703125, "learning_rate": 0.0002541919751770574, "loss": 2.1535, "step": 1895 }, { "epoch": 1.42, "grad_norm": 0.1201171875, "learning_rate": 0.00025335562744934403, "loss": 2.1292, "step": 1900 }, { "epoch": 1.42, "grad_norm": 0.1181640625, "learning_rate": 0.0002525182739898397, "loss": 2.1489, "step": 1905 }, { "epoch": 1.42, "grad_norm": 0.1171875, "learning_rate": 0.00025167993058231524, "loss": 2.1454, "step": 1910 }, { "epoch": 1.43, "grad_norm": 0.1181640625, "learning_rate": 0.00025084061302920146, "loss": 2.1436, "step": 1915 }, { "epoch": 1.43, "grad_norm": 0.11865234375, "learning_rate": 0.0002500003371512917, "loss": 2.1461, "step": 1920 }, { "epoch": 1.44, "grad_norm": 0.11572265625, "learning_rate": 0.000249159118787443, "loss": 2.1515, "step": 1925 }, { "epoch": 1.44, "grad_norm": 0.1142578125, "learning_rate": 0.00024831697379427807, "loss": 2.1204, "step": 1930 }, { "epoch": 1.44, "grad_norm": 0.11767578125, "learning_rate": 0.0002474739180458863, "loss": 2.1579, "step": 1935 }, { "epoch": 1.45, "grad_norm": 0.11865234375, "learning_rate": 0.0002466299674335241, "loss": 2.1382, "step": 1940 }, { "epoch": 1.45, "grad_norm": 0.12060546875, "learning_rate": 0.00024578513786531605, "loss": 2.1551, "step": 1945 }, { "epoch": 1.45, "grad_norm": 0.1171875, "learning_rate": 0.0002449394452659544, "loss": 2.1509, "step": 1950 }, { "epoch": 1.46, "grad_norm": 0.1171875, "learning_rate": 0.00024409290557639947, "loss": 2.1462, "step": 1955 }, { "epoch": 1.46, "grad_norm": 0.1162109375, "learning_rate": 0.00024324553475357866, "loss": 2.1404, "step": 1960 }, { "epoch": 1.47, "grad_norm": 0.11669921875, "learning_rate": 0.00024239734877008604, "loss": 2.1677, "step": 1965 }, { "epoch": 1.47, "grad_norm": 0.11962890625, "learning_rate": 0.000241548363613881, "loss": 2.1602, "step": 1970 }, { "epoch": 1.47, "grad_norm": 0.11767578125, "learning_rate": 0.00024069859528798714, "loss": 2.1534, "step": 1975 }, { "epoch": 1.48, "grad_norm": 0.115234375, "learning_rate": 0.0002398480598101903, "loss": 2.1448, "step": 1980 }, { "epoch": 1.48, "grad_norm": 0.11474609375, "learning_rate": 0.00023899677321273714, "loss": 2.1356, "step": 1985 }, { "epoch": 1.48, "grad_norm": 0.115234375, "learning_rate": 0.00023814475154203222, "loss": 2.154, "step": 1990 }, { "epoch": 1.49, "grad_norm": 0.11865234375, "learning_rate": 0.00023729201085833626, "loss": 2.1383, "step": 1995 }, { "epoch": 1.49, "grad_norm": 0.1181640625, "learning_rate": 0.00023643856723546295, "loss": 2.1611, "step": 2000 }, { "epoch": 1.5, "grad_norm": 0.1201171875, "learning_rate": 0.00023558443676047596, "loss": 2.1302, "step": 2005 }, { "epoch": 1.5, "grad_norm": 0.1171875, "learning_rate": 0.00023472963553338613, "loss": 2.1535, "step": 2010 }, { "epoch": 1.5, "grad_norm": 0.11962890625, "learning_rate": 0.00023387417966684742, "loss": 2.1414, "step": 2015 }, { "epoch": 1.51, "grad_norm": 0.11767578125, "learning_rate": 0.00023301808528585375, "loss": 2.1352, "step": 2020 }, { "epoch": 1.51, "grad_norm": 0.11962890625, "learning_rate": 0.0002321613685274346, "loss": 2.152, "step": 2025 }, { "epoch": 1.51, "grad_norm": 0.11865234375, "learning_rate": 0.00023130404554035102, "loss": 2.142, "step": 2030 }, { "epoch": 1.52, "grad_norm": 0.12158203125, "learning_rate": 0.00023044613248479116, "loss": 2.1588, "step": 2035 }, { "epoch": 1.52, "grad_norm": 0.1171875, "learning_rate": 0.000229587645532066, "loss": 2.1475, "step": 2040 }, { "epoch": 1.53, "grad_norm": 0.119140625, "learning_rate": 0.00022872860086430393, "loss": 2.1593, "step": 2045 }, { "epoch": 1.53, "grad_norm": 0.1181640625, "learning_rate": 0.00022786901467414619, "loss": 2.1467, "step": 2050 }, { "epoch": 1.53, "grad_norm": 0.11669921875, "learning_rate": 0.0002270089031644415, "loss": 2.1364, "step": 2055 }, { "epoch": 1.54, "grad_norm": 0.119140625, "learning_rate": 0.00022614828254794055, "loss": 2.1384, "step": 2060 }, { "epoch": 1.54, "grad_norm": 0.11669921875, "learning_rate": 0.00022528716904699056, "loss": 2.1428, "step": 2065 }, { "epoch": 1.54, "grad_norm": 0.1181640625, "learning_rate": 0.00022442557889322946, "loss": 2.1517, "step": 2070 }, { "epoch": 1.55, "grad_norm": 0.12060546875, "learning_rate": 0.00022356352832727985, "loss": 2.1474, "step": 2075 }, { "epoch": 1.55, "grad_norm": 0.1201171875, "learning_rate": 0.00022270103359844283, "loss": 2.1684, "step": 2080 }, { "epoch": 1.56, "grad_norm": 0.11669921875, "learning_rate": 0.00022183811096439194, "loss": 2.1616, "step": 2085 }, { "epoch": 1.56, "grad_norm": 0.12109375, "learning_rate": 0.00022097477669086638, "loss": 2.1468, "step": 2090 }, { "epoch": 1.56, "grad_norm": 0.1162109375, "learning_rate": 0.00022011104705136475, "loss": 2.1374, "step": 2095 }, { "epoch": 1.57, "grad_norm": 0.11962890625, "learning_rate": 0.00021924693832683806, "loss": 2.1539, "step": 2100 }, { "epoch": 1.57, "grad_norm": 0.1171875, "learning_rate": 0.00021838246680538293, "loss": 2.1514, "step": 2105 }, { "epoch": 1.57, "grad_norm": 0.12158203125, "learning_rate": 0.00021751764878193459, "loss": 2.1407, "step": 2110 }, { "epoch": 1.58, "grad_norm": 0.1181640625, "learning_rate": 0.00021665250055795957, "loss": 2.1485, "step": 2115 }, { "epoch": 1.58, "grad_norm": 0.11865234375, "learning_rate": 0.0002157870384411487, "loss": 2.1496, "step": 2120 }, { "epoch": 1.59, "grad_norm": 0.1181640625, "learning_rate": 0.00021492127874510946, "loss": 2.143, "step": 2125 }, { "epoch": 1.59, "grad_norm": 0.119140625, "learning_rate": 0.0002140552377890586, "loss": 2.1498, "step": 2130 }, { "epoch": 1.59, "grad_norm": 0.1201171875, "learning_rate": 0.00021318893189751457, "loss": 2.1586, "step": 2135 }, { "epoch": 1.6, "grad_norm": 0.119140625, "learning_rate": 0.00021232237739998965, "loss": 2.139, "step": 2140 }, { "epoch": 1.6, "grad_norm": 0.11865234375, "learning_rate": 0.00021145559063068223, "loss": 2.1481, "step": 2145 }, { "epoch": 1.6, "grad_norm": 0.12060546875, "learning_rate": 0.00021058858792816904, "loss": 2.1449, "step": 2150 }, { "epoch": 1.61, "grad_norm": 0.11865234375, "learning_rate": 0.00020972138563509708, "loss": 2.1629, "step": 2155 }, { "epoch": 1.61, "grad_norm": 0.1181640625, "learning_rate": 0.00020885400009787528, "loss": 2.1458, "step": 2160 }, { "epoch": 1.62, "grad_norm": 0.119140625, "learning_rate": 0.000207986447666367, "loss": 2.1531, "step": 2165 }, { "epoch": 1.62, "grad_norm": 0.11767578125, "learning_rate": 0.0002071187446935813, "loss": 2.1339, "step": 2170 }, { "epoch": 1.62, "grad_norm": 0.11767578125, "learning_rate": 0.00020625090753536492, "loss": 2.1525, "step": 2175 }, { "epoch": 1.63, "grad_norm": 0.11962890625, "learning_rate": 0.00020538295255009384, "loss": 2.1522, "step": 2180 }, { "epoch": 1.63, "grad_norm": 0.1171875, "learning_rate": 0.0002045148960983652, "loss": 2.1358, "step": 2185 }, { "epoch": 1.63, "grad_norm": 0.1181640625, "learning_rate": 0.0002036467545426886, "loss": 2.149, "step": 2190 }, { "epoch": 1.64, "grad_norm": 0.12109375, "learning_rate": 0.00020277854424717803, "loss": 2.1394, "step": 2195 }, { "epoch": 1.64, "grad_norm": 0.1171875, "learning_rate": 0.00020191028157724294, "loss": 2.1424, "step": 2200 }, { "epoch": 1.64, "grad_norm": 0.125, "learning_rate": 0.0002010419828992801, "loss": 2.1615, "step": 2205 }, { "epoch": 1.65, "grad_norm": 0.11572265625, "learning_rate": 0.00020017366458036513, "loss": 2.1549, "step": 2210 }, { "epoch": 1.65, "grad_norm": 0.11767578125, "learning_rate": 0.00019930534298794365, "loss": 2.1115, "step": 2215 }, { "epoch": 1.66, "grad_norm": 0.12109375, "learning_rate": 0.0001984370344895232, "loss": 2.1267, "step": 2220 }, { "epoch": 1.66, "grad_norm": 0.11962890625, "learning_rate": 0.00019756875545236453, "loss": 2.1387, "step": 2225 }, { "epoch": 1.66, "grad_norm": 0.11962890625, "learning_rate": 0.00019670052224317274, "loss": 2.1365, "step": 2230 }, { "epoch": 1.67, "grad_norm": 0.12109375, "learning_rate": 0.0001958323512277895, "loss": 2.1511, "step": 2235 }, { "epoch": 1.67, "grad_norm": 0.11865234375, "learning_rate": 0.0001949642587708838, "loss": 2.1503, "step": 2240 }, { "epoch": 1.67, "grad_norm": 0.119140625, "learning_rate": 0.00019409626123564403, "loss": 2.1469, "step": 2245 }, { "epoch": 1.68, "grad_norm": 0.119140625, "learning_rate": 0.00019322837498346934, "loss": 2.125, "step": 2250 }, { "epoch": 1.68, "grad_norm": 0.11865234375, "learning_rate": 0.00019236061637366124, "loss": 2.1331, "step": 2255 }, { "epoch": 1.69, "grad_norm": 0.11865234375, "learning_rate": 0.00019149300176311504, "loss": 2.1354, "step": 2260 }, { "epoch": 1.69, "grad_norm": 0.119140625, "learning_rate": 0.00019062554750601198, "loss": 2.1512, "step": 2265 }, { "epoch": 1.69, "grad_norm": 0.11767578125, "learning_rate": 0.0001897582699535107, "loss": 2.1151, "step": 2270 }, { "epoch": 1.7, "grad_norm": 0.12255859375, "learning_rate": 0.00018889118545343877, "loss": 2.1239, "step": 2275 }, { "epoch": 1.7, "grad_norm": 0.1181640625, "learning_rate": 0.000188024310349985, "loss": 2.1381, "step": 2280 }, { "epoch": 1.7, "grad_norm": 0.11669921875, "learning_rate": 0.00018715766098339117, "loss": 2.1306, "step": 2285 }, { "epoch": 1.71, "grad_norm": 0.11669921875, "learning_rate": 0.00018629125368964405, "loss": 2.1489, "step": 2290 }, { "epoch": 1.71, "grad_norm": 0.1220703125, "learning_rate": 0.00018542510480016713, "loss": 2.1547, "step": 2295 }, { "epoch": 1.72, "grad_norm": 0.119140625, "learning_rate": 0.00018455923064151342, "loss": 2.1343, "step": 2300 }, { "epoch": 1.72, "grad_norm": 0.12353515625, "learning_rate": 0.00018369364753505728, "loss": 2.144, "step": 2305 }, { "epoch": 1.72, "grad_norm": 0.11962890625, "learning_rate": 0.00018282837179668679, "loss": 2.1494, "step": 2310 }, { "epoch": 1.73, "grad_norm": 0.1220703125, "learning_rate": 0.00018196341973649637, "loss": 2.1511, "step": 2315 }, { "epoch": 1.73, "grad_norm": 0.11962890625, "learning_rate": 0.00018109880765847906, "loss": 2.148, "step": 2320 }, { "epoch": 1.73, "grad_norm": 0.12109375, "learning_rate": 0.00018023455186021961, "loss": 2.142, "step": 2325 }, { "epoch": 1.74, "grad_norm": 0.1201171875, "learning_rate": 0.0001793706686325868, "loss": 2.1445, "step": 2330 }, { "epoch": 1.74, "grad_norm": 0.12158203125, "learning_rate": 0.0001785071742594268, "loss": 2.1344, "step": 2335 }, { "epoch": 1.75, "grad_norm": 0.11669921875, "learning_rate": 0.00017764408501725593, "loss": 2.1214, "step": 2340 }, { "epoch": 1.75, "grad_norm": 0.11865234375, "learning_rate": 0.00017678141717495394, "loss": 2.1232, "step": 2345 }, { "epoch": 1.75, "grad_norm": 0.11962890625, "learning_rate": 0.00017591918699345755, "loss": 2.129, "step": 2350 }, { "epoch": 1.76, "grad_norm": 0.119140625, "learning_rate": 0.00017505741072545346, "loss": 2.1462, "step": 2355 }, { "epoch": 1.76, "grad_norm": 0.119140625, "learning_rate": 0.00017419610461507254, "loss": 2.1401, "step": 2360 }, { "epoch": 1.76, "grad_norm": 0.119140625, "learning_rate": 0.0001733352848975832, "loss": 2.1497, "step": 2365 }, { "epoch": 1.77, "grad_norm": 0.11767578125, "learning_rate": 0.00017247496779908565, "loss": 2.1356, "step": 2370 }, { "epoch": 1.77, "grad_norm": 0.1201171875, "learning_rate": 0.0001716151695362059, "loss": 2.1436, "step": 2375 }, { "epoch": 1.78, "grad_norm": 0.12060546875, "learning_rate": 0.00017075590631579019, "loss": 2.1538, "step": 2380 }, { "epoch": 1.78, "grad_norm": 0.11865234375, "learning_rate": 0.00016989719433459924, "loss": 2.1497, "step": 2385 }, { "epoch": 1.78, "grad_norm": 0.1171875, "learning_rate": 0.00016903904977900333, "loss": 2.1333, "step": 2390 }, { "epoch": 1.79, "grad_norm": 0.1220703125, "learning_rate": 0.000168181488824677, "loss": 2.1542, "step": 2395 }, { "epoch": 1.79, "grad_norm": 0.1171875, "learning_rate": 0.00016732452763629395, "loss": 2.1197, "step": 2400 }, { "epoch": 1.79, "grad_norm": 0.123046875, "learning_rate": 0.00016646818236722282, "loss": 2.1151, "step": 2405 }, { "epoch": 1.8, "grad_norm": 0.12060546875, "learning_rate": 0.00016561246915922204, "loss": 2.1505, "step": 2410 }, { "epoch": 1.8, "grad_norm": 0.11865234375, "learning_rate": 0.00016475740414213642, "loss": 2.1501, "step": 2415 }, { "epoch": 1.81, "grad_norm": 0.1201171875, "learning_rate": 0.00016390300343359216, "loss": 2.1556, "step": 2420 }, { "epoch": 1.81, "grad_norm": 0.119140625, "learning_rate": 0.0001630492831386939, "loss": 2.133, "step": 2425 }, { "epoch": 1.81, "grad_norm": 0.123046875, "learning_rate": 0.0001621962593497205, "loss": 2.162, "step": 2430 }, { "epoch": 1.82, "grad_norm": 0.12060546875, "learning_rate": 0.0001613439481458221, "loss": 2.1333, "step": 2435 }, { "epoch": 1.82, "grad_norm": 0.12060546875, "learning_rate": 0.000160492365592717, "loss": 2.1566, "step": 2440 }, { "epoch": 1.82, "grad_norm": 0.11865234375, "learning_rate": 0.00015964152774238842, "loss": 2.1692, "step": 2445 }, { "epoch": 1.83, "grad_norm": 0.119140625, "learning_rate": 0.00015879145063278256, "loss": 2.1413, "step": 2450 }, { "epoch": 1.83, "grad_norm": 0.12255859375, "learning_rate": 0.00015794215028750567, "loss": 2.1564, "step": 2455 }, { "epoch": 1.84, "grad_norm": 0.11962890625, "learning_rate": 0.00015709364271552262, "loss": 2.1305, "step": 2460 }, { "epoch": 1.84, "grad_norm": 0.12060546875, "learning_rate": 0.00015624594391085457, "loss": 2.1526, "step": 2465 }, { "epoch": 1.84, "grad_norm": 0.12353515625, "learning_rate": 0.00015539906985227798, "loss": 2.138, "step": 2470 }, { "epoch": 1.85, "grad_norm": 0.12353515625, "learning_rate": 0.0001545530365030229, "loss": 2.1432, "step": 2475 }, { "epoch": 1.85, "grad_norm": 0.1201171875, "learning_rate": 0.00015370785981047252, "loss": 2.1508, "step": 2480 }, { "epoch": 1.85, "grad_norm": 0.1201171875, "learning_rate": 0.00015286355570586255, "loss": 2.1347, "step": 2485 }, { "epoch": 1.86, "grad_norm": 0.12060546875, "learning_rate": 0.00015202014010398042, "loss": 2.1497, "step": 2490 }, { "epoch": 1.86, "grad_norm": 0.12060546875, "learning_rate": 0.00015117762890286602, "loss": 2.1377, "step": 2495 }, { "epoch": 1.86, "grad_norm": 0.1171875, "learning_rate": 0.0001503360379835113, "loss": 2.1337, "step": 2500 }, { "epoch": 1.87, "grad_norm": 0.11865234375, "learning_rate": 0.00014949538320956158, "loss": 2.156, "step": 2505 }, { "epoch": 1.87, "grad_norm": 0.11962890625, "learning_rate": 0.00014865568042701592, "loss": 2.1386, "step": 2510 }, { "epoch": 1.88, "grad_norm": 0.119140625, "learning_rate": 0.0001478169454639291, "loss": 2.1468, "step": 2515 }, { "epoch": 1.88, "grad_norm": 0.119140625, "learning_rate": 0.00014697919413011253, "loss": 2.1566, "step": 2520 }, { "epoch": 1.88, "grad_norm": 0.123046875, "learning_rate": 0.00014614244221683686, "loss": 2.1441, "step": 2525 }, { "epoch": 1.89, "grad_norm": 0.12060546875, "learning_rate": 0.00014530670549653407, "loss": 2.1473, "step": 2530 }, { "epoch": 1.89, "grad_norm": 0.12060546875, "learning_rate": 0.00014447199972249987, "loss": 2.1481, "step": 2535 }, { "epoch": 1.89, "grad_norm": 0.1201171875, "learning_rate": 0.00014363834062859748, "loss": 2.1546, "step": 2540 }, { "epoch": 1.9, "grad_norm": 0.12060546875, "learning_rate": 0.00014280574392896032, "loss": 2.1314, "step": 2545 }, { "epoch": 1.9, "grad_norm": 0.11767578125, "learning_rate": 0.0001419742253176962, "loss": 2.1129, "step": 2550 }, { "epoch": 1.91, "grad_norm": 0.119140625, "learning_rate": 0.00014114380046859138, "loss": 2.1353, "step": 2555 }, { "epoch": 1.91, "grad_norm": 0.1201171875, "learning_rate": 0.00014031448503481532, "loss": 2.1423, "step": 2560 }, { "epoch": 1.91, "grad_norm": 0.1201171875, "learning_rate": 0.00013948629464862516, "loss": 2.1173, "step": 2565 }, { "epoch": 1.92, "grad_norm": 0.1201171875, "learning_rate": 0.00013865924492107153, "loss": 2.1468, "step": 2570 }, { "epoch": 1.92, "grad_norm": 0.12158203125, "learning_rate": 0.00013783335144170418, "loss": 2.1517, "step": 2575 }, { "epoch": 1.92, "grad_norm": 0.119140625, "learning_rate": 0.0001370086297782779, "loss": 2.153, "step": 2580 }, { "epoch": 1.93, "grad_norm": 0.12109375, "learning_rate": 0.0001361850954764594, "loss": 2.1427, "step": 2585 }, { "epoch": 1.93, "grad_norm": 0.1181640625, "learning_rate": 0.0001353627640595338, "loss": 2.1477, "step": 2590 }, { "epoch": 1.94, "grad_norm": 0.12353515625, "learning_rate": 0.00013454165102811272, "loss": 2.1414, "step": 2595 }, { "epoch": 1.94, "grad_norm": 0.1220703125, "learning_rate": 0.00013372177185984134, "loss": 2.1579, "step": 2600 }, { "epoch": 1.94, "grad_norm": 0.11865234375, "learning_rate": 0.00013290314200910735, "loss": 2.123, "step": 2605 }, { "epoch": 1.95, "grad_norm": 0.119140625, "learning_rate": 0.00013208577690674905, "loss": 2.1311, "step": 2610 }, { "epoch": 1.95, "grad_norm": 0.119140625, "learning_rate": 0.00013126969195976495, "loss": 2.1314, "step": 2615 }, { "epoch": 1.95, "grad_norm": 0.11865234375, "learning_rate": 0.00013045490255102316, "loss": 2.1374, "step": 2620 }, { "epoch": 1.96, "grad_norm": 0.12060546875, "learning_rate": 0.00012964142403897112, "loss": 2.1489, "step": 2625 }, { "epoch": 1.96, "grad_norm": 0.1181640625, "learning_rate": 0.0001288292717573468, "loss": 2.145, "step": 2630 }, { "epoch": 1.97, "grad_norm": 0.1181640625, "learning_rate": 0.00012801846101488898, "loss": 2.1288, "step": 2635 }, { "epoch": 1.97, "grad_norm": 0.119140625, "learning_rate": 0.00012720900709504917, "loss": 2.1468, "step": 2640 }, { "epoch": 1.97, "grad_norm": 0.12109375, "learning_rate": 0.00012640092525570312, "loss": 2.1201, "step": 2645 }, { "epoch": 1.98, "grad_norm": 0.1220703125, "learning_rate": 0.0001255942307288637, "loss": 2.1523, "step": 2650 }, { "epoch": 1.98, "grad_norm": 0.1201171875, "learning_rate": 0.00012478893872039314, "loss": 2.146, "step": 2655 }, { "epoch": 1.98, "grad_norm": 0.11962890625, "learning_rate": 0.00012398506440971713, "loss": 2.1387, "step": 2660 }, { "epoch": 1.99, "grad_norm": 0.11767578125, "learning_rate": 0.00012318262294953815, "loss": 2.1272, "step": 2665 }, { "epoch": 1.99, "grad_norm": 0.119140625, "learning_rate": 0.00012238162946555002, "loss": 2.1488, "step": 2670 }, { "epoch": 2.0, "grad_norm": 0.11962890625, "learning_rate": 0.00012158209905615301, "loss": 2.1445, "step": 2675 }, { "epoch": 2.0, "grad_norm": 0.1201171875, "learning_rate": 0.00012078404679216864, "loss": 2.1327, "step": 2680 }, { "epoch": 2.0, "eval_loss": 2.1632144451141357, "eval_runtime": 186.9566, "eval_samples_per_second": 25.792, "eval_steps_per_second": 3.225, "step": 2681 }, { "epoch": 2.0, "grad_norm": 0.12158203125, "learning_rate": 0.0001199874877165564, "loss": 2.1156, "step": 2685 }, { "epoch": 2.01, "grad_norm": 0.1181640625, "learning_rate": 0.00011919243684412948, "loss": 2.115, "step": 2690 }, { "epoch": 2.01, "grad_norm": 0.11962890625, "learning_rate": 0.00011839890916127228, "loss": 2.0971, "step": 2695 }, { "epoch": 2.01, "grad_norm": 0.12109375, "learning_rate": 0.00011760691962565752, "loss": 2.119, "step": 2700 }, { "epoch": 2.02, "grad_norm": 0.1220703125, "learning_rate": 0.00011681648316596461, "loss": 2.1282, "step": 2705 }, { "epoch": 2.02, "grad_norm": 0.12255859375, "learning_rate": 0.00011602761468159813, "loss": 2.1151, "step": 2710 }, { "epoch": 2.03, "grad_norm": 0.1201171875, "learning_rate": 0.00011524032904240671, "loss": 2.101, "step": 2715 }, { "epoch": 2.03, "grad_norm": 0.12255859375, "learning_rate": 0.00011445464108840345, "loss": 2.113, "step": 2720 }, { "epoch": 2.03, "grad_norm": 0.123046875, "learning_rate": 0.0001136705656294851, "loss": 2.118, "step": 2725 }, { "epoch": 2.04, "grad_norm": 0.126953125, "learning_rate": 0.00011288811744515433, "loss": 2.1045, "step": 2730 }, { "epoch": 2.04, "grad_norm": 0.123046875, "learning_rate": 0.0001121073112842395, "loss": 2.108, "step": 2735 }, { "epoch": 2.04, "grad_norm": 0.12109375, "learning_rate": 0.00011132816186461821, "loss": 2.0919, "step": 2740 }, { "epoch": 2.05, "grad_norm": 0.1240234375, "learning_rate": 0.00011055068387293879, "loss": 2.1063, "step": 2745 }, { "epoch": 2.05, "grad_norm": 0.123046875, "learning_rate": 0.00010977489196434381, "loss": 2.1175, "step": 2750 }, { "epoch": 2.06, "grad_norm": 0.123046875, "learning_rate": 0.00010900080076219426, "loss": 2.1103, "step": 2755 }, { "epoch": 2.06, "grad_norm": 0.1220703125, "learning_rate": 0.00010822842485779285, "loss": 2.1111, "step": 2760 }, { "epoch": 2.06, "grad_norm": 0.12158203125, "learning_rate": 0.00010745777881011027, "loss": 2.0899, "step": 2765 }, { "epoch": 2.07, "grad_norm": 0.123046875, "learning_rate": 0.00010668887714550974, "loss": 2.0935, "step": 2770 }, { "epoch": 2.07, "grad_norm": 0.1201171875, "learning_rate": 0.0001059217343574737, "loss": 2.0919, "step": 2775 }, { "epoch": 2.07, "grad_norm": 0.123046875, "learning_rate": 0.00010515636490633043, "loss": 2.1157, "step": 2780 }, { "epoch": 2.08, "grad_norm": 0.12255859375, "learning_rate": 0.00010439278321898153, "loss": 2.1024, "step": 2785 }, { "epoch": 2.08, "grad_norm": 0.1240234375, "learning_rate": 0.00010363100368863021, "loss": 2.1038, "step": 2790 }, { "epoch": 2.09, "grad_norm": 0.12255859375, "learning_rate": 0.00010287104067450928, "loss": 2.1088, "step": 2795 }, { "epoch": 2.09, "grad_norm": 0.12353515625, "learning_rate": 0.00010211290850161144, "loss": 2.0861, "step": 2800 }, { "epoch": 2.09, "grad_norm": 0.12451171875, "learning_rate": 0.00010135662146041855, "loss": 2.1215, "step": 2805 }, { "epoch": 2.1, "grad_norm": 0.123046875, "learning_rate": 0.0001006021938066325, "loss": 2.1062, "step": 2810 }, { "epoch": 2.1, "grad_norm": 0.12451171875, "learning_rate": 9.984963976090651e-05, "loss": 2.1014, "step": 2815 }, { "epoch": 2.1, "grad_norm": 0.123046875, "learning_rate": 9.909897350857706e-05, "loss": 2.1023, "step": 2820 }, { "epoch": 2.11, "grad_norm": 0.12158203125, "learning_rate": 9.83502091993965e-05, "loss": 2.1025, "step": 2825 }, { "epoch": 2.11, "grad_norm": 0.1279296875, "learning_rate": 9.760336094726624e-05, "loss": 2.1241, "step": 2830 }, { "epoch": 2.11, "grad_norm": 0.12451171875, "learning_rate": 9.6858442829971e-05, "loss": 2.1155, "step": 2835 }, { "epoch": 2.12, "grad_norm": 0.123046875, "learning_rate": 9.611546888891307e-05, "loss": 2.1012, "step": 2840 }, { "epoch": 2.12, "grad_norm": 0.12451171875, "learning_rate": 9.537445312884788e-05, "loss": 2.1058, "step": 2845 }, { "epoch": 2.13, "grad_norm": 0.12353515625, "learning_rate": 9.463540951761989e-05, "loss": 2.0876, "step": 2850 }, { "epoch": 2.13, "grad_norm": 0.12451171875, "learning_rate": 9.389835198589944e-05, "loss": 2.1222, "step": 2855 }, { "epoch": 2.13, "grad_norm": 0.126953125, "learning_rate": 9.316329442691995e-05, "loss": 2.1107, "step": 2860 }, { "epoch": 2.14, "grad_norm": 0.1240234375, "learning_rate": 9.243025069621649e-05, "loss": 2.1065, "step": 2865 }, { "epoch": 2.14, "grad_norm": 0.12451171875, "learning_rate": 9.169923461136376e-05, "loss": 2.1193, "step": 2870 }, { "epoch": 2.14, "grad_norm": 0.125, "learning_rate": 9.097025995171669e-05, "loss": 2.1154, "step": 2875 }, { "epoch": 2.15, "grad_norm": 0.12255859375, "learning_rate": 9.024334045814988e-05, "loss": 2.1055, "step": 2880 }, { "epoch": 2.15, "grad_norm": 0.12255859375, "learning_rate": 8.951848983279898e-05, "loss": 2.1039, "step": 2885 }, { "epoch": 2.16, "grad_norm": 0.123046875, "learning_rate": 8.87957217388023e-05, "loss": 2.1249, "step": 2890 }, { "epoch": 2.16, "grad_norm": 0.125, "learning_rate": 8.80750498000432e-05, "loss": 2.1231, "step": 2895 }, { "epoch": 2.16, "grad_norm": 0.1240234375, "learning_rate": 8.735648760089367e-05, "loss": 2.1346, "step": 2900 }, { "epoch": 2.17, "grad_norm": 0.1259765625, "learning_rate": 8.66400486859575e-05, "loss": 2.107, "step": 2905 }, { "epoch": 2.17, "grad_norm": 0.12255859375, "learning_rate": 8.592574655981594e-05, "loss": 2.1189, "step": 2910 }, { "epoch": 2.17, "grad_norm": 0.1259765625, "learning_rate": 8.521359468677214e-05, "loss": 2.1061, "step": 2915 }, { "epoch": 2.18, "grad_norm": 0.1259765625, "learning_rate": 8.450360649059834e-05, "loss": 2.1297, "step": 2920 }, { "epoch": 2.18, "grad_norm": 0.1279296875, "learning_rate": 8.379579535428203e-05, "loss": 2.1119, "step": 2925 }, { "epoch": 2.19, "grad_norm": 0.12353515625, "learning_rate": 8.309017461977409e-05, "loss": 2.0947, "step": 2930 }, { "epoch": 2.19, "grad_norm": 0.1259765625, "learning_rate": 8.23867575877374e-05, "loss": 2.1072, "step": 2935 }, { "epoch": 2.19, "grad_norm": 0.123046875, "learning_rate": 8.168555751729551e-05, "loss": 2.106, "step": 2940 }, { "epoch": 2.2, "grad_norm": 0.1259765625, "learning_rate": 8.098658762578369e-05, "loss": 2.1183, "step": 2945 }, { "epoch": 2.2, "grad_norm": 0.125, "learning_rate": 8.028986108849887e-05, "loss": 2.1103, "step": 2950 }, { "epoch": 2.2, "grad_norm": 0.1279296875, "learning_rate": 7.959539103845184e-05, "loss": 2.1414, "step": 2955 }, { "epoch": 2.21, "grad_norm": 0.125, "learning_rate": 7.890319056611942e-05, "loss": 2.11, "step": 2960 }, { "epoch": 2.21, "grad_norm": 0.12255859375, "learning_rate": 7.82132727191978e-05, "loss": 2.1178, "step": 2965 }, { "epoch": 2.22, "grad_norm": 0.1240234375, "learning_rate": 7.752565050235694e-05, "loss": 2.1018, "step": 2970 }, { "epoch": 2.22, "grad_norm": 0.1259765625, "learning_rate": 7.684033687699455e-05, "loss": 2.1184, "step": 2975 }, { "epoch": 2.22, "grad_norm": 0.1259765625, "learning_rate": 7.615734476099284e-05, "loss": 2.1208, "step": 2980 }, { "epoch": 2.23, "grad_norm": 0.1279296875, "learning_rate": 7.547668702847421e-05, "loss": 2.1201, "step": 2985 }, { "epoch": 2.23, "grad_norm": 0.12451171875, "learning_rate": 7.479837650955906e-05, "loss": 2.123, "step": 2990 }, { "epoch": 2.23, "grad_norm": 0.1220703125, "learning_rate": 7.412242599012366e-05, "loss": 2.1324, "step": 2995 }, { "epoch": 2.24, "grad_norm": 0.123046875, "learning_rate": 7.34488482115593e-05, "loss": 2.1275, "step": 3000 }, { "epoch": 2.24, "grad_norm": 0.12353515625, "learning_rate": 7.277765587053206e-05, "loss": 2.1193, "step": 3005 }, { "epoch": 2.25, "grad_norm": 0.12451171875, "learning_rate": 7.210886161874344e-05, "loss": 2.1165, "step": 3010 }, { "epoch": 2.25, "grad_norm": 0.12353515625, "learning_rate": 7.144247806269213e-05, "loss": 2.1136, "step": 3015 }, { "epoch": 2.25, "grad_norm": 0.12353515625, "learning_rate": 7.0778517763436e-05, "loss": 2.0813, "step": 3020 }, { "epoch": 2.26, "grad_norm": 0.12451171875, "learning_rate": 7.011699323635559e-05, "loss": 2.0982, "step": 3025 }, { "epoch": 2.26, "grad_norm": 0.12451171875, "learning_rate": 6.94579169509181e-05, "loss": 2.1135, "step": 3030 }, { "epoch": 2.26, "grad_norm": 0.12255859375, "learning_rate": 6.88013013304424e-05, "loss": 2.1057, "step": 3035 }, { "epoch": 2.27, "grad_norm": 0.12353515625, "learning_rate": 6.814715875186475e-05, "loss": 2.1319, "step": 3040 }, { "epoch": 2.27, "grad_norm": 0.12353515625, "learning_rate": 6.749550154550585e-05, "loss": 2.1206, "step": 3045 }, { "epoch": 2.28, "grad_norm": 0.126953125, "learning_rate": 6.684634199483773e-05, "loss": 2.123, "step": 3050 }, { "epoch": 2.28, "grad_norm": 0.126953125, "learning_rate": 6.619969233625298e-05, "loss": 2.1197, "step": 3055 }, { "epoch": 2.28, "grad_norm": 0.1259765625, "learning_rate": 6.55555647588336e-05, "loss": 2.1075, "step": 3060 }, { "epoch": 2.29, "grad_norm": 0.125, "learning_rate": 6.491397140412139e-05, "loss": 2.1185, "step": 3065 }, { "epoch": 2.29, "grad_norm": 0.1259765625, "learning_rate": 6.42749243658891e-05, "loss": 2.1114, "step": 3070 }, { "epoch": 2.29, "grad_norm": 0.1259765625, "learning_rate": 6.363843568991243e-05, "loss": 2.0937, "step": 3075 }, { "epoch": 2.3, "grad_norm": 0.123046875, "learning_rate": 6.300451737374322e-05, "loss": 2.0954, "step": 3080 }, { "epoch": 2.3, "grad_norm": 0.125, "learning_rate": 6.237318136648258e-05, "loss": 2.1127, "step": 3085 }, { "epoch": 2.31, "grad_norm": 0.12451171875, "learning_rate": 6.174443956855671e-05, "loss": 2.1174, "step": 3090 }, { "epoch": 2.31, "grad_norm": 0.1328125, "learning_rate": 6.111830383149164e-05, "loss": 2.1148, "step": 3095 }, { "epoch": 2.31, "grad_norm": 0.1259765625, "learning_rate": 6.04947859576904e-05, "loss": 2.1382, "step": 3100 }, { "epoch": 2.32, "grad_norm": 0.126953125, "learning_rate": 5.9873897700210304e-05, "loss": 2.1125, "step": 3105 }, { "epoch": 2.32, "grad_norm": 0.125, "learning_rate": 5.92556507625414e-05, "loss": 2.1068, "step": 3110 }, { "epoch": 2.32, "grad_norm": 0.12451171875, "learning_rate": 5.86400567983862e-05, "loss": 2.1204, "step": 3115 }, { "epoch": 2.33, "grad_norm": 0.1240234375, "learning_rate": 5.802712741143934e-05, "loss": 2.1046, "step": 3120 }, { "epoch": 2.33, "grad_norm": 0.126953125, "learning_rate": 5.741687415516968e-05, "loss": 2.1179, "step": 3125 }, { "epoch": 2.33, "grad_norm": 0.125, "learning_rate": 5.680930853260182e-05, "loss": 2.1184, "step": 3130 }, { "epoch": 2.34, "grad_norm": 0.12451171875, "learning_rate": 5.6204441996099686e-05, "loss": 2.1204, "step": 3135 }, { "epoch": 2.34, "grad_norm": 0.125, "learning_rate": 5.560228594715049e-05, "loss": 2.1097, "step": 3140 }, { "epoch": 2.35, "grad_norm": 0.1328125, "learning_rate": 5.500285173614985e-05, "loss": 2.1141, "step": 3145 }, { "epoch": 2.35, "grad_norm": 0.125, "learning_rate": 5.4406150662188035e-05, "loss": 2.1096, "step": 3150 }, { "epoch": 2.35, "grad_norm": 0.1259765625, "learning_rate": 5.3812193972836436e-05, "loss": 2.1134, "step": 3155 }, { "epoch": 2.36, "grad_norm": 0.126953125, "learning_rate": 5.322099286393625e-05, "loss": 2.1132, "step": 3160 }, { "epoch": 2.36, "grad_norm": 0.126953125, "learning_rate": 5.263255847938693e-05, "loss": 2.1083, "step": 3165 }, { "epoch": 2.36, "grad_norm": 0.126953125, "learning_rate": 5.204690191093635e-05, "loss": 2.1093, "step": 3170 }, { "epoch": 2.37, "grad_norm": 0.1318359375, "learning_rate": 5.1464034197971726e-05, "loss": 2.1123, "step": 3175 }, { "epoch": 2.37, "grad_norm": 0.1240234375, "learning_rate": 5.08839663273114e-05, "loss": 2.1115, "step": 3180 }, { "epoch": 2.38, "grad_norm": 0.12451171875, "learning_rate": 5.030670923299785e-05, "loss": 2.1129, "step": 3185 }, { "epoch": 2.38, "grad_norm": 0.125, "learning_rate": 4.9732273796091685e-05, "loss": 2.1259, "step": 3190 }, { "epoch": 2.38, "grad_norm": 0.12451171875, "learning_rate": 4.916067084446632e-05, "loss": 2.1305, "step": 3195 }, { "epoch": 2.39, "grad_norm": 0.1259765625, "learning_rate": 4.859191115260393e-05, "loss": 2.098, "step": 3200 }, { "epoch": 2.39, "grad_norm": 0.126953125, "learning_rate": 4.8026005441392505e-05, "loss": 2.1109, "step": 3205 }, { "epoch": 2.39, "grad_norm": 0.1259765625, "learning_rate": 4.7462964377923635e-05, "loss": 2.1043, "step": 3210 }, { "epoch": 2.4, "grad_norm": 0.1240234375, "learning_rate": 4.690279857529145e-05, "loss": 2.0896, "step": 3215 }, { "epoch": 2.4, "grad_norm": 0.125, "learning_rate": 4.634551859239254e-05, "loss": 2.1266, "step": 3220 }, { "epoch": 2.41, "grad_norm": 0.125, "learning_rate": 4.57911349337272e-05, "loss": 2.0982, "step": 3225 }, { "epoch": 2.41, "grad_norm": 0.1279296875, "learning_rate": 4.523965804920078e-05, "loss": 2.1275, "step": 3230 }, { "epoch": 2.41, "grad_norm": 0.1279296875, "learning_rate": 4.469109833392759e-05, "loss": 2.1323, "step": 3235 }, { "epoch": 2.42, "grad_norm": 0.12451171875, "learning_rate": 4.414546612803421e-05, "loss": 2.1026, "step": 3240 }, { "epoch": 2.42, "grad_norm": 0.12353515625, "learning_rate": 4.3602771716464874e-05, "loss": 2.085, "step": 3245 }, { "epoch": 2.42, "grad_norm": 0.126953125, "learning_rate": 4.3063025328787676e-05, "loss": 2.1026, "step": 3250 }, { "epoch": 2.43, "grad_norm": 0.12451171875, "learning_rate": 4.252623713900159e-05, "loss": 2.1093, "step": 3255 }, { "epoch": 2.43, "grad_norm": 0.125, "learning_rate": 4.199241726534495e-05, "loss": 2.1026, "step": 3260 }, { "epoch": 2.44, "grad_norm": 0.1279296875, "learning_rate": 4.146157577010421e-05, "loss": 2.1192, "step": 3265 }, { "epoch": 2.44, "grad_norm": 0.12451171875, "learning_rate": 4.0933722659424945e-05, "loss": 2.1114, "step": 3270 }, { "epoch": 2.44, "grad_norm": 0.1259765625, "learning_rate": 4.040886788312268e-05, "loss": 2.1239, "step": 3275 }, { "epoch": 2.45, "grad_norm": 0.1259765625, "learning_rate": 3.9887021334495625e-05, "loss": 2.0952, "step": 3280 }, { "epoch": 2.45, "grad_norm": 0.1259765625, "learning_rate": 3.936819285013826e-05, "loss": 2.114, "step": 3285 }, { "epoch": 2.45, "grad_norm": 0.12890625, "learning_rate": 3.885239220975547e-05, "loss": 2.1189, "step": 3290 }, { "epoch": 2.46, "grad_norm": 0.125, "learning_rate": 3.833962913597893e-05, "loss": 2.0974, "step": 3295 }, { "epoch": 2.46, "grad_norm": 0.1259765625, "learning_rate": 3.7829913294183e-05, "loss": 2.1085, "step": 3300 }, { "epoch": 2.47, "grad_norm": 0.1240234375, "learning_rate": 3.73232542923033e-05, "loss": 2.1023, "step": 3305 }, { "epoch": 2.47, "grad_norm": 0.125, "learning_rate": 3.681966168065509e-05, "loss": 2.1213, "step": 3310 }, { "epoch": 2.47, "grad_norm": 0.12451171875, "learning_rate": 3.6319144951753436e-05, "loss": 2.1312, "step": 3315 }, { "epoch": 2.48, "grad_norm": 0.1259765625, "learning_rate": 3.582171354013444e-05, "loss": 2.1268, "step": 3320 }, { "epoch": 2.48, "grad_norm": 0.126953125, "learning_rate": 3.5327376822176885e-05, "loss": 2.1068, "step": 3325 }, { "epoch": 2.48, "grad_norm": 0.126953125, "learning_rate": 3.483614411592628e-05, "loss": 2.1057, "step": 3330 }, { "epoch": 2.49, "grad_norm": 0.12890625, "learning_rate": 3.434802468091836e-05, "loss": 2.103, "step": 3335 }, { "epoch": 2.49, "grad_norm": 0.1259765625, "learning_rate": 3.386302771800527e-05, "loss": 2.1166, "step": 3340 }, { "epoch": 2.5, "grad_norm": 0.1240234375, "learning_rate": 3.3381162369181717e-05, "loss": 2.1176, "step": 3345 }, { "epoch": 2.5, "grad_norm": 0.125, "learning_rate": 3.290243771741275e-05, "loss": 2.1137, "step": 3350 }, { "epoch": 2.5, "grad_norm": 0.1259765625, "learning_rate": 3.2426862786462565e-05, "loss": 2.1017, "step": 3355 }, { "epoch": 2.51, "grad_norm": 0.1259765625, "learning_rate": 3.195444654072439e-05, "loss": 2.1049, "step": 3360 }, { "epoch": 2.51, "grad_norm": 0.125, "learning_rate": 3.148519788505166e-05, "loss": 2.1144, "step": 3365 }, { "epoch": 2.51, "grad_norm": 0.125, "learning_rate": 3.101912566458989e-05, "loss": 2.0956, "step": 3370 }, { "epoch": 2.52, "grad_norm": 0.1259765625, "learning_rate": 3.0556238664610105e-05, "loss": 2.1077, "step": 3375 }, { "epoch": 2.52, "grad_norm": 0.126953125, "learning_rate": 3.009654561034323e-05, "loss": 2.1178, "step": 3380 }, { "epoch": 2.53, "grad_norm": 0.12451171875, "learning_rate": 2.9640055166815673e-05, "loss": 2.105, "step": 3385 }, { "epoch": 2.53, "grad_norm": 0.12451171875, "learning_rate": 2.918677593868586e-05, "loss": 2.1051, "step": 3390 }, { "epoch": 2.53, "grad_norm": 0.12451171875, "learning_rate": 2.8736716470082204e-05, "loss": 2.0967, "step": 3395 }, { "epoch": 2.54, "grad_norm": 0.12451171875, "learning_rate": 2.8289885244441803e-05, "loss": 2.1174, "step": 3400 }, { "epoch": 2.54, "grad_norm": 0.12890625, "learning_rate": 2.7846290684350963e-05, "loss": 2.1216, "step": 3405 }, { "epoch": 2.54, "grad_norm": 0.1259765625, "learning_rate": 2.740594115138595e-05, "loss": 2.1199, "step": 3410 }, { "epoch": 2.55, "grad_norm": 0.1240234375, "learning_rate": 2.6968844945955617e-05, "loss": 2.1112, "step": 3415 }, { "epoch": 2.55, "grad_norm": 0.1259765625, "learning_rate": 2.6535010307145002e-05, "loss": 2.1374, "step": 3420 }, { "epoch": 2.56, "grad_norm": 0.1259765625, "learning_rate": 2.6104445412559876e-05, "loss": 2.1233, "step": 3425 }, { "epoch": 2.56, "grad_norm": 0.1259765625, "learning_rate": 2.5677158378172707e-05, "loss": 2.1049, "step": 3430 }, { "epoch": 2.56, "grad_norm": 0.1279296875, "learning_rate": 2.5253157258169567e-05, "loss": 2.1058, "step": 3435 }, { "epoch": 2.57, "grad_norm": 0.125, "learning_rate": 2.4832450044798573e-05, "loss": 2.1154, "step": 3440 }, { "epoch": 2.57, "grad_norm": 0.126953125, "learning_rate": 2.4415044668218735e-05, "loss": 2.1126, "step": 3445 }, { "epoch": 2.57, "grad_norm": 0.1259765625, "learning_rate": 2.4000948996351104e-05, "loss": 2.132, "step": 3450 }, { "epoch": 2.58, "grad_norm": 0.1279296875, "learning_rate": 2.359017083472994e-05, "loss": 2.1093, "step": 3455 }, { "epoch": 2.58, "grad_norm": 0.12353515625, "learning_rate": 2.3182717926355845e-05, "loss": 2.0929, "step": 3460 }, { "epoch": 2.58, "grad_norm": 0.1259765625, "learning_rate": 2.277859795154986e-05, "loss": 2.1068, "step": 3465 }, { "epoch": 2.59, "grad_norm": 0.1259765625, "learning_rate": 2.237781852780838e-05, "loss": 2.1095, "step": 3470 }, { "epoch": 2.59, "grad_norm": 0.125, "learning_rate": 2.1980387209660026e-05, "loss": 2.1148, "step": 3475 }, { "epoch": 2.6, "grad_norm": 0.126953125, "learning_rate": 2.1586311488522702e-05, "loss": 2.1104, "step": 3480 }, { "epoch": 2.6, "grad_norm": 0.1259765625, "learning_rate": 2.1195598792562964e-05, "loss": 2.1175, "step": 3485 }, { "epoch": 2.6, "grad_norm": 0.1259765625, "learning_rate": 2.0808256486555554e-05, "loss": 2.1094, "step": 3490 }, { "epoch": 2.61, "grad_norm": 0.126953125, "learning_rate": 2.042429187174475e-05, "loss": 2.121, "step": 3495 }, { "epoch": 2.61, "grad_norm": 0.1240234375, "learning_rate": 2.0043712185706863e-05, "loss": 2.1047, "step": 3500 }, { "epoch": 2.61, "grad_norm": 0.126953125, "learning_rate": 1.966652460221341e-05, "loss": 2.1098, "step": 3505 }, { "epoch": 2.62, "grad_norm": 0.1279296875, "learning_rate": 1.9292736231096464e-05, "loss": 2.1114, "step": 3510 }, { "epoch": 2.62, "grad_norm": 0.12890625, "learning_rate": 1.8922354118114138e-05, "loss": 2.1267, "step": 3515 }, { "epoch": 2.63, "grad_norm": 0.125, "learning_rate": 1.8555385244818035e-05, "loss": 2.0916, "step": 3520 }, { "epoch": 2.63, "grad_norm": 0.126953125, "learning_rate": 1.8191836528421558e-05, "loss": 2.0985, "step": 3525 }, { "epoch": 2.63, "grad_norm": 0.123046875, "learning_rate": 1.7831714821669588e-05, "loss": 2.1129, "step": 3530 }, { "epoch": 2.64, "grad_norm": 0.1279296875, "learning_rate": 1.7475026912709235e-05, "loss": 2.0889, "step": 3535 }, { "epoch": 2.64, "grad_norm": 0.12890625, "learning_rate": 1.71217795249619e-05, "loss": 2.1067, "step": 3540 }, { "epoch": 2.64, "grad_norm": 0.1240234375, "learning_rate": 1.6771979316996677e-05, "loss": 2.0987, "step": 3545 }, { "epoch": 2.65, "grad_norm": 0.1259765625, "learning_rate": 1.6425632882404618e-05, "loss": 2.099, "step": 3550 }, { "epoch": 2.65, "grad_norm": 0.1279296875, "learning_rate": 1.6082746749674604e-05, "loss": 2.1211, "step": 3555 }, { "epoch": 2.66, "grad_norm": 0.12451171875, "learning_rate": 1.5743327382070206e-05, "loss": 2.1099, "step": 3560 }, { "epoch": 2.66, "grad_norm": 0.125, "learning_rate": 1.540738117750793e-05, "loss": 2.1109, "step": 3565 }, { "epoch": 2.66, "grad_norm": 0.126953125, "learning_rate": 1.507491446843654e-05, "loss": 2.1064, "step": 3570 }, { "epoch": 2.67, "grad_norm": 0.1259765625, "learning_rate": 1.4745933521717781e-05, "loss": 2.0908, "step": 3575 }, { "epoch": 2.67, "grad_norm": 0.1259765625, "learning_rate": 1.4420444538508083e-05, "loss": 2.119, "step": 3580 }, { "epoch": 2.67, "grad_norm": 0.125, "learning_rate": 1.4098453654141975e-05, "loss": 2.1008, "step": 3585 }, { "epoch": 2.68, "grad_norm": 0.130859375, "learning_rate": 1.377996693801611e-05, "loss": 2.1134, "step": 3590 }, { "epoch": 2.68, "grad_norm": 0.126953125, "learning_rate": 1.346499039347504e-05, "loss": 2.1147, "step": 3595 }, { "epoch": 2.69, "grad_norm": 0.1259765625, "learning_rate": 1.3153529957698008e-05, "loss": 2.1141, "step": 3600 }, { "epoch": 2.69, "grad_norm": 0.1259765625, "learning_rate": 1.2845591501587017e-05, "loss": 2.0835, "step": 3605 }, { "epoch": 2.69, "grad_norm": 0.126953125, "learning_rate": 1.254118082965634e-05, "loss": 2.1109, "step": 3610 }, { "epoch": 2.7, "grad_norm": 0.1259765625, "learning_rate": 1.2240303679922727e-05, "loss": 2.1165, "step": 3615 }, { "epoch": 2.7, "grad_norm": 0.1240234375, "learning_rate": 1.1942965723797671e-05, "loss": 2.1035, "step": 3620 }, { "epoch": 2.7, "grad_norm": 0.1259765625, "learning_rate": 1.164917256598017e-05, "loss": 2.1112, "step": 3625 }, { "epoch": 2.71, "grad_norm": 0.12451171875, "learning_rate": 1.1358929744351332e-05, "loss": 2.1051, "step": 3630 }, { "epoch": 2.71, "grad_norm": 0.1279296875, "learning_rate": 1.1072242729869819e-05, "loss": 2.1133, "step": 3635 }, { "epoch": 2.72, "grad_norm": 0.123046875, "learning_rate": 1.0789116926468756e-05, "loss": 2.1097, "step": 3640 }, { "epoch": 2.72, "grad_norm": 0.125, "learning_rate": 1.050955767095403e-05, "loss": 2.1082, "step": 3645 }, { "epoch": 2.72, "grad_norm": 0.1259765625, "learning_rate": 1.0233570232903323e-05, "loss": 2.1208, "step": 3650 }, { "epoch": 2.73, "grad_norm": 0.1240234375, "learning_rate": 9.961159814567267e-06, "loss": 2.097, "step": 3655 }, { "epoch": 2.73, "grad_norm": 0.1259765625, "learning_rate": 9.692331550770918e-06, "loss": 2.1102, "step": 3660 }, { "epoch": 2.73, "grad_norm": 0.126953125, "learning_rate": 9.42709050881736e-06, "loss": 2.0946, "step": 3665 }, { "epoch": 2.74, "grad_norm": 0.125, "learning_rate": 9.165441688391885e-06, "loss": 2.1129, "step": 3670 }, { "epoch": 2.74, "grad_norm": 0.1259765625, "learning_rate": 8.907390021467921e-06, "loss": 2.1016, "step": 3675 }, { "epoch": 2.75, "grad_norm": 0.12451171875, "learning_rate": 8.652940372214069e-06, "loss": 2.0817, "step": 3680 }, { "epoch": 2.75, "grad_norm": 0.125, "learning_rate": 8.40209753690222e-06, "loss": 2.1327, "step": 3685 }, { "epoch": 2.75, "grad_norm": 0.125, "learning_rate": 8.154866243817494e-06, "loss": 2.1231, "step": 3690 }, { "epoch": 2.76, "grad_norm": 0.125, "learning_rate": 7.911251153168752e-06, "loss": 2.1175, "step": 3695 }, { "epoch": 2.76, "grad_norm": 0.1259765625, "learning_rate": 7.67125685700103e-06, "loss": 2.113, "step": 3700 }, { "epoch": 2.76, "grad_norm": 0.1240234375, "learning_rate": 7.434887879108776e-06, "loss": 2.0957, "step": 3705 }, { "epoch": 2.77, "grad_norm": 0.126953125, "learning_rate": 7.202148674950704e-06, "loss": 2.1117, "step": 3710 }, { "epoch": 2.77, "grad_norm": 0.12451171875, "learning_rate": 6.97304363156579e-06, "loss": 2.1021, "step": 3715 }, { "epoch": 2.78, "grad_norm": 0.12451171875, "learning_rate": 6.747577067490563e-06, "loss": 2.1233, "step": 3720 }, { "epoch": 2.78, "grad_norm": 0.126953125, "learning_rate": 6.525753232677678e-06, "loss": 2.1289, "step": 3725 }, { "epoch": 2.78, "grad_norm": 0.125, "learning_rate": 6.307576308415852e-06, "loss": 2.1155, "step": 3730 }, { "epoch": 2.79, "grad_norm": 0.12451171875, "learning_rate": 6.093050407251033e-06, "loss": 2.108, "step": 3735 }, { "epoch": 2.79, "grad_norm": 0.130859375, "learning_rate": 5.882179572908841e-06, "loss": 2.1112, "step": 3740 }, { "epoch": 2.79, "grad_norm": 0.125, "learning_rate": 5.6749677802184095e-06, "loss": 2.118, "step": 3745 }, { "epoch": 2.8, "grad_norm": 0.12353515625, "learning_rate": 5.471418935037398e-06, "loss": 2.096, "step": 3750 }, { "epoch": 2.8, "grad_norm": 0.1279296875, "learning_rate": 5.271536874178451e-06, "loss": 2.1087, "step": 3755 }, { "epoch": 2.8, "grad_norm": 0.1259765625, "learning_rate": 5.075325365336791e-06, "loss": 2.1044, "step": 3760 }, { "epoch": 2.81, "grad_norm": 0.1279296875, "learning_rate": 4.882788107019231e-06, "loss": 2.127, "step": 3765 }, { "epoch": 2.81, "grad_norm": 0.12451171875, "learning_rate": 4.693928728474517e-06, "loss": 2.0874, "step": 3770 }, { "epoch": 2.82, "grad_norm": 0.1259765625, "learning_rate": 4.5087507896247605e-06, "loss": 2.1114, "step": 3775 }, { "epoch": 2.82, "grad_norm": 0.12890625, "learning_rate": 4.327257780998517e-06, "loss": 2.1198, "step": 3780 }, { "epoch": 2.82, "grad_norm": 0.1259765625, "learning_rate": 4.149453123664881e-06, "loss": 2.1174, "step": 3785 }, { "epoch": 2.83, "grad_norm": 0.1259765625, "learning_rate": 3.975340169169095e-06, "loss": 2.0976, "step": 3790 }, { "epoch": 2.83, "grad_norm": 0.125, "learning_rate": 3.804922199469174e-06, "loss": 2.0964, "step": 3795 }, { "epoch": 2.83, "grad_norm": 0.1259765625, "learning_rate": 3.6382024268743153e-06, "loss": 2.1217, "step": 3800 }, { "epoch": 2.84, "grad_norm": 0.126953125, "learning_rate": 3.4751839939841435e-06, "loss": 2.1284, "step": 3805 }, { "epoch": 2.84, "grad_norm": 0.12451171875, "learning_rate": 3.3158699736295375e-06, "loss": 2.1031, "step": 3810 }, { "epoch": 2.85, "grad_norm": 0.1240234375, "learning_rate": 3.160263368814764e-06, "loss": 2.1215, "step": 3815 }, { "epoch": 2.85, "grad_norm": 0.1298828125, "learning_rate": 3.0083671126607484e-06, "loss": 2.1315, "step": 3820 }, { "epoch": 2.85, "grad_norm": 0.126953125, "learning_rate": 2.860184068349958e-06, "loss": 2.1137, "step": 3825 }, { "epoch": 2.86, "grad_norm": 0.1259765625, "learning_rate": 2.7157170290721625e-06, "loss": 2.1069, "step": 3830 }, { "epoch": 2.86, "grad_norm": 0.12451171875, "learning_rate": 2.5749687179721815e-06, "loss": 2.1111, "step": 3835 }, { "epoch": 2.86, "grad_norm": 0.12451171875, "learning_rate": 2.4379417880981304e-06, "loss": 2.0955, "step": 3840 }, { "epoch": 2.87, "grad_norm": 0.12451171875, "learning_rate": 2.304638822351701e-06, "loss": 2.1026, "step": 3845 }, { "epoch": 2.87, "grad_norm": 0.125, "learning_rate": 2.1750623334393816e-06, "loss": 2.0882, "step": 3850 }, { "epoch": 2.88, "grad_norm": 0.125, "learning_rate": 2.049214763825069e-06, "loss": 2.1075, "step": 3855 }, { "epoch": 2.88, "grad_norm": 0.126953125, "learning_rate": 1.9270984856840867e-06, "loss": 2.1132, "step": 3860 }, { "epoch": 2.88, "grad_norm": 0.125, "learning_rate": 1.8087158008583515e-06, "loss": 2.1085, "step": 3865 }, { "epoch": 2.89, "grad_norm": 0.125, "learning_rate": 1.6940689408132092e-06, "loss": 2.0978, "step": 3870 }, { "epoch": 2.89, "grad_norm": 0.126953125, "learning_rate": 1.583160066595113e-06, "loss": 2.1095, "step": 3875 }, { "epoch": 2.89, "grad_norm": 0.1240234375, "learning_rate": 1.4759912687910771e-06, "loss": 2.1079, "step": 3880 }, { "epoch": 2.9, "grad_norm": 0.1279296875, "learning_rate": 1.3725645674891762e-06, "loss": 2.1186, "step": 3885 }, { "epoch": 2.9, "grad_norm": 0.12890625, "learning_rate": 1.2728819122404646e-06, "loss": 2.1165, "step": 3890 }, { "epoch": 2.91, "grad_norm": 0.12353515625, "learning_rate": 1.1769451820223376e-06, "loss": 2.1125, "step": 3895 }, { "epoch": 2.91, "grad_norm": 0.126953125, "learning_rate": 1.084756185202962e-06, "loss": 2.1132, "step": 3900 }, { "epoch": 2.91, "grad_norm": 0.126953125, "learning_rate": 9.963166595073014e-07, "loss": 2.1062, "step": 3905 }, { "epoch": 2.92, "grad_norm": 0.1240234375, "learning_rate": 9.116282719842772e-07, "loss": 2.1068, "step": 3910 }, { "epoch": 2.92, "grad_norm": 0.12451171875, "learning_rate": 8.306926189754372e-07, "loss": 2.1134, "step": 3915 }, { "epoch": 2.92, "grad_norm": 0.1279296875, "learning_rate": 7.535112260847799e-07, "loss": 2.1227, "step": 3920 }, { "epoch": 2.93, "grad_norm": 0.1240234375, "learning_rate": 6.800855481500445e-07, "loss": 2.1204, "step": 3925 }, { "epoch": 2.93, "grad_norm": 0.126953125, "learning_rate": 6.104169692153105e-07, "loss": 2.1101, "step": 3930 }, { "epoch": 2.94, "grad_norm": 0.1279296875, "learning_rate": 5.44506802504774e-07, "loss": 2.1111, "step": 3935 }, { "epoch": 2.94, "grad_norm": 0.125, "learning_rate": 4.823562903982337e-07, "loss": 2.1068, "step": 3940 }, { "epoch": 2.94, "grad_norm": 0.1298828125, "learning_rate": 4.239666044074442e-07, "loss": 2.1416, "step": 3945 }, { "epoch": 2.95, "grad_norm": 0.12451171875, "learning_rate": 3.693388451541102e-07, "loss": 2.1217, "step": 3950 }, { "epoch": 2.95, "grad_norm": 0.1240234375, "learning_rate": 3.1847404234923715e-07, "loss": 2.1201, "step": 3955 }, { "epoch": 2.95, "grad_norm": 0.1259765625, "learning_rate": 2.713731547735687e-07, "loss": 2.1135, "step": 3960 }, { "epoch": 2.96, "grad_norm": 0.125, "learning_rate": 2.280370702596013e-07, "loss": 2.0992, "step": 3965 }, { "epoch": 2.96, "grad_norm": 0.12353515625, "learning_rate": 1.8846660567484186e-07, "loss": 2.1167, "step": 3970 }, { "epoch": 2.97, "grad_norm": 0.12451171875, "learning_rate": 1.5266250690635363e-07, "loss": 2.0971, "step": 3975 }, { "epoch": 2.97, "grad_norm": 0.1240234375, "learning_rate": 1.2062544884683391e-07, "loss": 2.1277, "step": 3980 }, { "epoch": 2.97, "grad_norm": 0.125, "learning_rate": 9.235603538171322e-08, "loss": 2.1004, "step": 3985 }, { "epoch": 2.98, "grad_norm": 0.12451171875, "learning_rate": 6.785479937789773e-08, "loss": 2.0967, "step": 3990 }, { "epoch": 2.98, "grad_norm": 0.1259765625, "learning_rate": 4.712220267366618e-08, "loss": 2.1211, "step": 3995 }, { "epoch": 2.98, "grad_norm": 0.126953125, "learning_rate": 3.015863607003233e-08, "loss": 2.1347, "step": 4000 }, { "epoch": 2.99, "grad_norm": 0.125, "learning_rate": 1.69644193232843e-08, "loss": 2.1306, "step": 4005 }, { "epoch": 2.99, "grad_norm": 0.1240234375, "learning_rate": 7.539801139011538e-09, "loss": 2.1013, "step": 4010 }, { "epoch": 3.0, "grad_norm": 0.1240234375, "learning_rate": 1.884959167419709e-09, "loss": 2.099, "step": 4015 }, { "epoch": 3.0, "grad_norm": 0.1259765625, "learning_rate": 0.0, "loss": 2.0989, "step": 4020 }, { "epoch": 3.0, "eval_loss": 2.1637070178985596, "eval_runtime": 187.0083, "eval_samples_per_second": 25.785, "eval_steps_per_second": 3.224, "step": 4020 }, { "epoch": 3.0, "step": 4020, "total_flos": 9.114826330351862e+17, "train_loss": 2.1598364234563725, "train_runtime": 32657.7671, "train_samples_per_second": 7.879, "train_steps_per_second": 0.123 } ], "logging_steps": 5, "max_steps": 4020, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 9.114826330351862e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }