{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7135896458142392, "eval_steps": 250, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002854358583256957, "grad_norm": 2.75, "learning_rate": 8.571428571428571e-06, "loss": 3.7737, "step": 1 }, { "epoch": 0.0005708717166513914, "grad_norm": 3.03125, "learning_rate": 1.7142857142857142e-05, "loss": 3.8253, "step": 2 }, { "epoch": 0.000856307574977087, "grad_norm": 2.078125, "learning_rate": 2.571428571428571e-05, "loss": 3.8136, "step": 3 }, { "epoch": 0.0011417434333027827, "grad_norm": 2.53125, "learning_rate": 3.4285714285714284e-05, "loss": 3.7592, "step": 4 }, { "epoch": 0.0014271792916284785, "grad_norm": 2.515625, "learning_rate": 4.285714285714285e-05, "loss": 3.7806, "step": 5 }, { "epoch": 0.001712615149954174, "grad_norm": 2.375, "learning_rate": 5.142857142857142e-05, "loss": 3.7962, "step": 6 }, { "epoch": 0.0019980510082798697, "grad_norm": 2.53125, "learning_rate": 5.9999999999999995e-05, "loss": 3.7494, "step": 7 }, { "epoch": 0.0022834868666055655, "grad_norm": 2.859375, "learning_rate": 6.857142857142857e-05, "loss": 3.7721, "step": 8 }, { "epoch": 0.0025689227249312612, "grad_norm": 2.0625, "learning_rate": 7.714285714285713e-05, "loss": 3.744, "step": 9 }, { "epoch": 0.002854358583256957, "grad_norm": 1.828125, "learning_rate": 8.57142857142857e-05, "loss": 3.7373, "step": 10 }, { "epoch": 0.003139794441582653, "grad_norm": 2.65625, "learning_rate": 9.428571428571427e-05, "loss": 3.7021, "step": 11 }, { "epoch": 0.003425230299908348, "grad_norm": 4.375, "learning_rate": 0.00010285714285714284, "loss": 3.6838, "step": 12 }, { "epoch": 0.003710666158234044, "grad_norm": 2.5, "learning_rate": 0.00011142857142857142, "loss": 3.7084, "step": 13 }, { "epoch": 0.003996102016559739, "grad_norm": 4.25, "learning_rate": 0.00011999999999999999, "loss": 3.6489, "step": 14 }, { "epoch": 0.004281537874885435, "grad_norm": 2.84375, "learning_rate": 0.00012857142857142855, "loss": 3.6588, "step": 15 }, { "epoch": 0.004566973733211131, "grad_norm": 4.4375, "learning_rate": 0.00013714285714285713, "loss": 3.6394, "step": 16 }, { "epoch": 0.004852409591536827, "grad_norm": 3.203125, "learning_rate": 0.0001457142857142857, "loss": 3.5906, "step": 17 }, { "epoch": 0.0051378454498625225, "grad_norm": 2.640625, "learning_rate": 0.00015428571428571425, "loss": 3.5944, "step": 18 }, { "epoch": 0.005423281308188218, "grad_norm": 4.21875, "learning_rate": 0.00016285714285714284, "loss": 3.5843, "step": 19 }, { "epoch": 0.005708717166513914, "grad_norm": 2.875, "learning_rate": 0.0001714285714285714, "loss": 3.5661, "step": 20 }, { "epoch": 0.00599415302483961, "grad_norm": 4.375, "learning_rate": 0.00017999999999999998, "loss": 3.5976, "step": 21 }, { "epoch": 0.006279588883165306, "grad_norm": 3.0, "learning_rate": 0.00018857142857142854, "loss": 3.5226, "step": 22 }, { "epoch": 0.006565024741491001, "grad_norm": 2.828125, "learning_rate": 0.00019714285714285713, "loss": 3.5581, "step": 23 }, { "epoch": 0.006850460599816696, "grad_norm": 4.1875, "learning_rate": 0.0002057142857142857, "loss": 3.5337, "step": 24 }, { "epoch": 0.007135896458142392, "grad_norm": 5.375, "learning_rate": 0.00021428571428571427, "loss": 3.502, "step": 25 }, { "epoch": 0.007421332316468088, "grad_norm": 2.359375, "learning_rate": 0.00022285714285714283, "loss": 3.4848, "step": 26 }, { "epoch": 0.007706768174793784, "grad_norm": 7.65625, "learning_rate": 0.00023142857142857142, "loss": 3.5451, "step": 27 }, { "epoch": 0.007992204033119479, "grad_norm": 4.96875, "learning_rate": 0.00023999999999999998, "loss": 3.5235, "step": 28 }, { "epoch": 0.008277639891445174, "grad_norm": 6.65625, "learning_rate": 0.00024857142857142857, "loss": 3.5061, "step": 29 }, { "epoch": 0.00856307574977087, "grad_norm": 4.9375, "learning_rate": 0.0002571428571428571, "loss": 3.5228, "step": 30 }, { "epoch": 0.008848511608096566, "grad_norm": 7.75, "learning_rate": 0.0002657142857142857, "loss": 3.4963, "step": 31 }, { "epoch": 0.009133947466422262, "grad_norm": 4.75, "learning_rate": 0.00027428571428571427, "loss": 3.5074, "step": 32 }, { "epoch": 0.009419383324747958, "grad_norm": 5.3125, "learning_rate": 0.0002828571428571428, "loss": 3.4555, "step": 33 }, { "epoch": 0.009704819183073653, "grad_norm": 4.40625, "learning_rate": 0.0002914285714285714, "loss": 3.4634, "step": 34 }, { "epoch": 0.00999025504139935, "grad_norm": 4.8125, "learning_rate": 0.0003, "loss": 3.4516, "step": 35 }, { "epoch": 0.010275690899725045, "grad_norm": 3.921875, "learning_rate": 0.00029999993845357924, "loss": 3.4341, "step": 36 }, { "epoch": 0.01056112675805074, "grad_norm": 5.40625, "learning_rate": 0.0002999997538143675, "loss": 3.4625, "step": 37 }, { "epoch": 0.010846562616376437, "grad_norm": 4.59375, "learning_rate": 0.0002999994460825163, "loss": 3.4492, "step": 38 }, { "epoch": 0.011131998474702132, "grad_norm": 3.5625, "learning_rate": 0.0002999990152582781, "loss": 3.4078, "step": 39 }, { "epoch": 0.011417434333027828, "grad_norm": 4.75, "learning_rate": 0.00029999846134200653, "loss": 3.4077, "step": 40 }, { "epoch": 0.011702870191353524, "grad_norm": 4.03125, "learning_rate": 0.0002999977843341562, "loss": 3.4062, "step": 41 }, { "epoch": 0.01198830604967922, "grad_norm": 3.859375, "learning_rate": 0.0002999969842352825, "loss": 3.3895, "step": 42 }, { "epoch": 0.012273741908004916, "grad_norm": 3.25, "learning_rate": 0.0002999960610460421, "loss": 3.3762, "step": 43 }, { "epoch": 0.012559177766330611, "grad_norm": 4.0625, "learning_rate": 0.00029999501476719257, "loss": 3.3807, "step": 44 }, { "epoch": 0.012844613624656307, "grad_norm": 3.71875, "learning_rate": 0.00029999384539959253, "loss": 3.3432, "step": 45 }, { "epoch": 0.013130049482982001, "grad_norm": 3.328125, "learning_rate": 0.0002999925529442016, "loss": 3.3543, "step": 46 }, { "epoch": 0.013415485341307697, "grad_norm": 5.5625, "learning_rate": 0.0002999911374020804, "loss": 3.3339, "step": 47 }, { "epoch": 0.013700921199633393, "grad_norm": 2.25, "learning_rate": 0.00029998959877439044, "loss": 3.3377, "step": 48 }, { "epoch": 0.013986357057959089, "grad_norm": 4.84375, "learning_rate": 0.0002999879370623944, "loss": 3.4033, "step": 49 }, { "epoch": 0.014271792916284784, "grad_norm": 4.375, "learning_rate": 0.00029998615226745605, "loss": 3.3567, "step": 50 }, { "epoch": 0.01455722877461048, "grad_norm": 3.15625, "learning_rate": 0.0002999842443910399, "loss": 3.3819, "step": 51 }, { "epoch": 0.014842664632936176, "grad_norm": 4.5, "learning_rate": 0.0002999822134347115, "loss": 3.3586, "step": 52 }, { "epoch": 0.015128100491261872, "grad_norm": 3.671875, "learning_rate": 0.0002999800594001376, "loss": 3.3414, "step": 53 }, { "epoch": 0.015413536349587567, "grad_norm": 2.765625, "learning_rate": 0.000299977782289086, "loss": 3.3165, "step": 54 }, { "epoch": 0.01569897220791326, "grad_norm": 4.6875, "learning_rate": 0.00029997538210342503, "loss": 3.3446, "step": 55 }, { "epoch": 0.015984408066238957, "grad_norm": 4.0625, "learning_rate": 0.0002999728588451245, "loss": 3.3649, "step": 56 }, { "epoch": 0.016269843924564653, "grad_norm": 2.828125, "learning_rate": 0.000299970212516255, "loss": 3.3258, "step": 57 }, { "epoch": 0.01655527978289035, "grad_norm": 3.84375, "learning_rate": 0.0002999674431189883, "loss": 3.3137, "step": 58 }, { "epoch": 0.016840715641216045, "grad_norm": 2.53125, "learning_rate": 0.0002999645506555967, "loss": 3.31, "step": 59 }, { "epoch": 0.01712615149954174, "grad_norm": 3.796875, "learning_rate": 0.00029996153512845415, "loss": 3.3022, "step": 60 }, { "epoch": 0.017411587357867436, "grad_norm": 3.671875, "learning_rate": 0.00029995839654003504, "loss": 3.3119, "step": 61 }, { "epoch": 0.017697023216193132, "grad_norm": 2.828125, "learning_rate": 0.00029995513489291506, "loss": 3.306, "step": 62 }, { "epoch": 0.017982459074518828, "grad_norm": 2.96875, "learning_rate": 0.0002999517501897707, "loss": 3.2965, "step": 63 }, { "epoch": 0.018267894932844524, "grad_norm": 3.765625, "learning_rate": 0.0002999482424333796, "loss": 3.3035, "step": 64 }, { "epoch": 0.01855333079117022, "grad_norm": 2.96875, "learning_rate": 0.00029994461162662024, "loss": 3.2734, "step": 65 }, { "epoch": 0.018838766649495915, "grad_norm": 2.28125, "learning_rate": 0.0002999408577724721, "loss": 3.2772, "step": 66 }, { "epoch": 0.01912420250782161, "grad_norm": 3.46875, "learning_rate": 0.0002999369808740157, "loss": 3.2491, "step": 67 }, { "epoch": 0.019409638366147307, "grad_norm": 3.78125, "learning_rate": 0.00029993298093443246, "loss": 3.2943, "step": 68 }, { "epoch": 0.019695074224473003, "grad_norm": 1.9296875, "learning_rate": 0.0002999288579570049, "loss": 3.2525, "step": 69 }, { "epoch": 0.0199805100827987, "grad_norm": 4.0, "learning_rate": 0.00029992461194511624, "loss": 3.2765, "step": 70 }, { "epoch": 0.020265945941124394, "grad_norm": 2.578125, "learning_rate": 0.000299920242902251, "loss": 3.2538, "step": 71 }, { "epoch": 0.02055138179945009, "grad_norm": 2.84375, "learning_rate": 0.00029991575083199455, "loss": 3.2407, "step": 72 }, { "epoch": 0.020836817657775786, "grad_norm": 3.203125, "learning_rate": 0.00029991113573803294, "loss": 3.2537, "step": 73 }, { "epoch": 0.02112225351610148, "grad_norm": 4.34375, "learning_rate": 0.0002999063976241536, "loss": 3.2618, "step": 74 }, { "epoch": 0.021407689374427177, "grad_norm": 1.5390625, "learning_rate": 0.00029990153649424463, "loss": 3.2486, "step": 75 }, { "epoch": 0.021693125232752873, "grad_norm": 6.15625, "learning_rate": 0.0002998965523522951, "loss": 3.2839, "step": 76 }, { "epoch": 0.02197856109107857, "grad_norm": 3.953125, "learning_rate": 0.0002998914452023953, "loss": 3.2866, "step": 77 }, { "epoch": 0.022263996949404265, "grad_norm": 4.9375, "learning_rate": 0.00029988621504873606, "loss": 3.3082, "step": 78 }, { "epoch": 0.02254943280772996, "grad_norm": 3.578125, "learning_rate": 0.0002998808618956094, "loss": 3.2833, "step": 79 }, { "epoch": 0.022834868666055656, "grad_norm": 4.375, "learning_rate": 0.00029987538574740826, "loss": 3.2748, "step": 80 }, { "epoch": 0.023120304524381352, "grad_norm": 2.921875, "learning_rate": 0.0002998697866086264, "loss": 3.2491, "step": 81 }, { "epoch": 0.023405740382707048, "grad_norm": 3.5, "learning_rate": 0.0002998640644838587, "loss": 3.2526, "step": 82 }, { "epoch": 0.023691176241032744, "grad_norm": 3.09375, "learning_rate": 0.0002998582193778006, "loss": 3.2262, "step": 83 }, { "epoch": 0.02397661209935844, "grad_norm": 2.96875, "learning_rate": 0.000299852251295249, "loss": 3.2321, "step": 84 }, { "epoch": 0.024262047957684135, "grad_norm": 2.796875, "learning_rate": 0.0002998461602411013, "loss": 3.2485, "step": 85 }, { "epoch": 0.02454748381600983, "grad_norm": 2.46875, "learning_rate": 0.00029983994622035585, "loss": 3.2223, "step": 86 }, { "epoch": 0.024832919674335527, "grad_norm": 3.484375, "learning_rate": 0.0002998336092381121, "loss": 3.2184, "step": 87 }, { "epoch": 0.025118355532661223, "grad_norm": 2.734375, "learning_rate": 0.0002998271492995702, "loss": 3.2204, "step": 88 }, { "epoch": 0.02540379139098692, "grad_norm": 3.34375, "learning_rate": 0.00029982056641003147, "loss": 3.2185, "step": 89 }, { "epoch": 0.025689227249312614, "grad_norm": 2.03125, "learning_rate": 0.00029981386057489776, "loss": 3.1942, "step": 90 }, { "epoch": 0.025974663107638307, "grad_norm": 2.953125, "learning_rate": 0.00029980703179967213, "loss": 3.1724, "step": 91 }, { "epoch": 0.026260098965964002, "grad_norm": 3.015625, "learning_rate": 0.00029980008008995834, "loss": 3.2225, "step": 92 }, { "epoch": 0.026545534824289698, "grad_norm": 3.125, "learning_rate": 0.0002997930054514612, "loss": 3.2103, "step": 93 }, { "epoch": 0.026830970682615394, "grad_norm": 2.3125, "learning_rate": 0.0002997858078899861, "loss": 3.1942, "step": 94 }, { "epoch": 0.02711640654094109, "grad_norm": 2.234375, "learning_rate": 0.00029977848741143966, "loss": 3.1652, "step": 95 }, { "epoch": 0.027401842399266785, "grad_norm": 3.234375, "learning_rate": 0.0002997710440218291, "loss": 3.186, "step": 96 }, { "epoch": 0.02768727825759248, "grad_norm": 2.40625, "learning_rate": 0.0002997634777272627, "loss": 3.1928, "step": 97 }, { "epoch": 0.027972714115918177, "grad_norm": 2.625, "learning_rate": 0.0002997557885339494, "loss": 3.169, "step": 98 }, { "epoch": 0.028258149974243873, "grad_norm": 2.015625, "learning_rate": 0.00029974797644819926, "loss": 3.174, "step": 99 }, { "epoch": 0.02854358583256957, "grad_norm": 3.984375, "learning_rate": 0.0002997400414764229, "loss": 3.1859, "step": 100 }, { "epoch": 0.028829021690895264, "grad_norm": 2.234375, "learning_rate": 0.0002997319836251319, "loss": 3.1975, "step": 101 }, { "epoch": 0.02911445754922096, "grad_norm": 2.65625, "learning_rate": 0.0002997238029009387, "loss": 3.163, "step": 102 }, { "epoch": 0.029399893407546656, "grad_norm": 3.359375, "learning_rate": 0.0002997154993105566, "loss": 3.1766, "step": 103 }, { "epoch": 0.029685329265872352, "grad_norm": 3.078125, "learning_rate": 0.00029970707286079966, "loss": 3.1692, "step": 104 }, { "epoch": 0.029970765124198048, "grad_norm": 3.171875, "learning_rate": 0.00029969852355858276, "loss": 3.1785, "step": 105 }, { "epoch": 0.030256200982523743, "grad_norm": 2.09375, "learning_rate": 0.00029968985141092165, "loss": 3.1622, "step": 106 }, { "epoch": 0.03054163684084944, "grad_norm": 2.625, "learning_rate": 0.00029968105642493286, "loss": 3.1934, "step": 107 }, { "epoch": 0.030827072699175135, "grad_norm": 3.25, "learning_rate": 0.0002996721386078337, "loss": 3.1503, "step": 108 }, { "epoch": 0.03111250855750083, "grad_norm": 2.34375, "learning_rate": 0.00029966309796694226, "loss": 3.1415, "step": 109 }, { "epoch": 0.03139794441582652, "grad_norm": 2.6875, "learning_rate": 0.0002996539345096776, "loss": 3.169, "step": 110 }, { "epoch": 0.03168338027415222, "grad_norm": 1.828125, "learning_rate": 0.0002996446482435593, "loss": 3.1381, "step": 111 }, { "epoch": 0.031968816132477915, "grad_norm": 2.8125, "learning_rate": 0.0002996352391762079, "loss": 3.1506, "step": 112 }, { "epoch": 0.03225425199080361, "grad_norm": 2.796875, "learning_rate": 0.0002996257073153446, "loss": 3.1666, "step": 113 }, { "epoch": 0.032539687849129306, "grad_norm": 2.546875, "learning_rate": 0.00029961605266879153, "loss": 3.1883, "step": 114 }, { "epoch": 0.032825123707455, "grad_norm": 2.703125, "learning_rate": 0.0002996062752444714, "loss": 3.1594, "step": 115 }, { "epoch": 0.0331105595657807, "grad_norm": 2.15625, "learning_rate": 0.00029959637505040773, "loss": 3.1553, "step": 116 }, { "epoch": 0.033395995424106394, "grad_norm": 2.8125, "learning_rate": 0.00029958635209472486, "loss": 3.125, "step": 117 }, { "epoch": 0.03368143128243209, "grad_norm": 2.4375, "learning_rate": 0.00029957620638564785, "loss": 3.1074, "step": 118 }, { "epoch": 0.033966867140757785, "grad_norm": 2.03125, "learning_rate": 0.00029956593793150233, "loss": 3.1193, "step": 119 }, { "epoch": 0.03425230299908348, "grad_norm": 2.484375, "learning_rate": 0.0002995555467407149, "loss": 3.107, "step": 120 }, { "epoch": 0.03453773885740918, "grad_norm": 2.84375, "learning_rate": 0.0002995450328218127, "loss": 3.1292, "step": 121 }, { "epoch": 0.03482317471573487, "grad_norm": 2.0, "learning_rate": 0.0002995343961834238, "loss": 3.1159, "step": 122 }, { "epoch": 0.03510861057406057, "grad_norm": 2.390625, "learning_rate": 0.0002995236368342766, "loss": 3.1207, "step": 123 }, { "epoch": 0.035394046432386264, "grad_norm": 2.109375, "learning_rate": 0.00029951275478320056, "loss": 3.1056, "step": 124 }, { "epoch": 0.03567948229071196, "grad_norm": 2.984375, "learning_rate": 0.00029950175003912573, "loss": 3.1206, "step": 125 }, { "epoch": 0.035964918149037656, "grad_norm": 1.484375, "learning_rate": 0.0002994906226110827, "loss": 3.1213, "step": 126 }, { "epoch": 0.03625035400736335, "grad_norm": 2.96875, "learning_rate": 0.00029947937250820295, "loss": 3.1091, "step": 127 }, { "epoch": 0.03653578986568905, "grad_norm": 1.8125, "learning_rate": 0.0002994679997397185, "loss": 3.1071, "step": 128 }, { "epoch": 0.03682122572401474, "grad_norm": 3.15625, "learning_rate": 0.000299456504314962, "loss": 3.143, "step": 129 }, { "epoch": 0.03710666158234044, "grad_norm": 1.9765625, "learning_rate": 0.00029944488624336683, "loss": 3.1106, "step": 130 }, { "epoch": 0.037392097440666135, "grad_norm": 3.3125, "learning_rate": 0.00029943314553446706, "loss": 3.1163, "step": 131 }, { "epoch": 0.03767753329899183, "grad_norm": 2.578125, "learning_rate": 0.00029942128219789734, "loss": 3.1173, "step": 132 }, { "epoch": 0.037962969157317526, "grad_norm": 2.734375, "learning_rate": 0.0002994092962433929, "loss": 3.1289, "step": 133 }, { "epoch": 0.03824840501564322, "grad_norm": 2.484375, "learning_rate": 0.0002993971876807896, "loss": 3.1056, "step": 134 }, { "epoch": 0.03853384087396892, "grad_norm": 2.40625, "learning_rate": 0.0002993849565200241, "loss": 3.0896, "step": 135 }, { "epoch": 0.038819276732294614, "grad_norm": 2.359375, "learning_rate": 0.0002993726027711333, "loss": 3.1087, "step": 136 }, { "epoch": 0.03910471259062031, "grad_norm": 2.328125, "learning_rate": 0.00029936012644425517, "loss": 3.1059, "step": 137 }, { "epoch": 0.039390148448946005, "grad_norm": 2.984375, "learning_rate": 0.00029934752754962783, "loss": 3.1265, "step": 138 }, { "epoch": 0.0396755843072717, "grad_norm": 2.15625, "learning_rate": 0.00029933480609759027, "loss": 3.0987, "step": 139 }, { "epoch": 0.0399610201655974, "grad_norm": 2.59375, "learning_rate": 0.00029932196209858197, "loss": 3.1122, "step": 140 }, { "epoch": 0.04024645602392309, "grad_norm": 2.375, "learning_rate": 0.0002993089955631429, "loss": 3.0887, "step": 141 }, { "epoch": 0.04053189188224879, "grad_norm": 2.25, "learning_rate": 0.0002992959065019136, "loss": 3.0815, "step": 142 }, { "epoch": 0.040817327740574484, "grad_norm": 3.0, "learning_rate": 0.00029928269492563537, "loss": 3.0889, "step": 143 }, { "epoch": 0.04110276359890018, "grad_norm": 1.53125, "learning_rate": 0.00029926936084514967, "loss": 3.0793, "step": 144 }, { "epoch": 0.041388199457225876, "grad_norm": 2.59375, "learning_rate": 0.00029925590427139887, "loss": 3.0804, "step": 145 }, { "epoch": 0.04167363531555157, "grad_norm": 1.8984375, "learning_rate": 0.00029924232521542557, "loss": 3.0612, "step": 146 }, { "epoch": 0.04195907117387727, "grad_norm": 2.71875, "learning_rate": 0.00029922862368837315, "loss": 3.0698, "step": 147 }, { "epoch": 0.04224450703220296, "grad_norm": 2.859375, "learning_rate": 0.00029921479970148517, "loss": 3.088, "step": 148 }, { "epoch": 0.04252994289052866, "grad_norm": 1.9609375, "learning_rate": 0.00029920085326610595, "loss": 3.0765, "step": 149 }, { "epoch": 0.042815378748854355, "grad_norm": 3.515625, "learning_rate": 0.00029918678439368017, "loss": 3.0926, "step": 150 }, { "epoch": 0.04310081460718005, "grad_norm": 2.453125, "learning_rate": 0.000299172593095753, "loss": 3.0821, "step": 151 }, { "epoch": 0.043386250465505746, "grad_norm": 5.25, "learning_rate": 0.00029915827938397017, "loss": 3.0682, "step": 152 }, { "epoch": 0.04367168632383144, "grad_norm": 3.078125, "learning_rate": 0.0002991438432700777, "loss": 3.0657, "step": 153 }, { "epoch": 0.04395712218215714, "grad_norm": 4.03125, "learning_rate": 0.0002991292847659222, "loss": 3.0883, "step": 154 }, { "epoch": 0.044242558040482834, "grad_norm": 3.828125, "learning_rate": 0.0002991146038834505, "loss": 3.0962, "step": 155 }, { "epoch": 0.04452799389880853, "grad_norm": 2.578125, "learning_rate": 0.0002990998006347102, "loss": 3.0695, "step": 156 }, { "epoch": 0.044813429757134225, "grad_norm": 4.0625, "learning_rate": 0.0002990848750318491, "loss": 3.1003, "step": 157 }, { "epoch": 0.04509886561545992, "grad_norm": 2.90625, "learning_rate": 0.00029906982708711533, "loss": 3.0733, "step": 158 }, { "epoch": 0.04538430147378562, "grad_norm": 5.53125, "learning_rate": 0.0002990546568128576, "loss": 3.1179, "step": 159 }, { "epoch": 0.04566973733211131, "grad_norm": 4.625, "learning_rate": 0.00029903936422152487, "loss": 3.1125, "step": 160 }, { "epoch": 0.04595517319043701, "grad_norm": 4.90625, "learning_rate": 0.00029902394932566657, "loss": 3.0922, "step": 161 }, { "epoch": 0.046240609048762704, "grad_norm": 3.34375, "learning_rate": 0.00029900841213793247, "loss": 3.048, "step": 162 }, { "epoch": 0.0465260449070884, "grad_norm": 9.5, "learning_rate": 0.00029899275267107264, "loss": 3.1456, "step": 163 }, { "epoch": 0.046811480765414096, "grad_norm": 8.3125, "learning_rate": 0.00029897697093793753, "loss": 3.1066, "step": 164 }, { "epoch": 0.04709691662373979, "grad_norm": 3.0, "learning_rate": 0.000298961066951478, "loss": 3.0876, "step": 165 }, { "epoch": 0.04738235248206549, "grad_norm": 6.0625, "learning_rate": 0.0002989450407247451, "loss": 3.1259, "step": 166 }, { "epoch": 0.04766778834039118, "grad_norm": 5.96875, "learning_rate": 0.0002989288922708902, "loss": 3.1248, "step": 167 }, { "epoch": 0.04795322419871688, "grad_norm": 3.4375, "learning_rate": 0.0002989126216031652, "loss": 3.0802, "step": 168 }, { "epoch": 0.048238660057042575, "grad_norm": 3.890625, "learning_rate": 0.00029889622873492195, "loss": 3.0777, "step": 169 }, { "epoch": 0.04852409591536827, "grad_norm": 2.78125, "learning_rate": 0.0002988797136796128, "loss": 3.0904, "step": 170 }, { "epoch": 0.048809531773693966, "grad_norm": 3.453125, "learning_rate": 0.0002988630764507904, "loss": 3.081, "step": 171 }, { "epoch": 0.04909496763201966, "grad_norm": 2.859375, "learning_rate": 0.0002988463170621074, "loss": 3.0743, "step": 172 }, { "epoch": 0.04938040349034536, "grad_norm": 2.515625, "learning_rate": 0.00029882943552731703, "loss": 3.0189, "step": 173 }, { "epoch": 0.049665839348671054, "grad_norm": 2.6875, "learning_rate": 0.0002988124318602725, "loss": 3.0684, "step": 174 }, { "epoch": 0.04995127520699675, "grad_norm": 2.21875, "learning_rate": 0.0002987953060749274, "loss": 3.0479, "step": 175 }, { "epoch": 0.050236711065322445, "grad_norm": 2.90625, "learning_rate": 0.0002987780581853355, "loss": 3.0374, "step": 176 }, { "epoch": 0.05052214692364814, "grad_norm": 2.078125, "learning_rate": 0.0002987606882056507, "loss": 3.0589, "step": 177 }, { "epoch": 0.05080758278197384, "grad_norm": 3.59375, "learning_rate": 0.00029874319615012714, "loss": 3.0731, "step": 178 }, { "epoch": 0.05109301864029953, "grad_norm": 3.109375, "learning_rate": 0.00029872558203311914, "loss": 3.0793, "step": 179 }, { "epoch": 0.05137845449862523, "grad_norm": 2.546875, "learning_rate": 0.0002987078458690811, "loss": 3.0748, "step": 180 }, { "epoch": 0.05166389035695092, "grad_norm": 3.109375, "learning_rate": 0.0002986899876725678, "loss": 3.0308, "step": 181 }, { "epoch": 0.05194932621527661, "grad_norm": 2.0625, "learning_rate": 0.00029867200745823384, "loss": 3.0496, "step": 182 }, { "epoch": 0.05223476207360231, "grad_norm": 2.40625, "learning_rate": 0.0002986539052408343, "loss": 3.0577, "step": 183 }, { "epoch": 0.052520197931928005, "grad_norm": 2.75, "learning_rate": 0.0002986356810352241, "loss": 3.0357, "step": 184 }, { "epoch": 0.0528056337902537, "grad_norm": 1.546875, "learning_rate": 0.00029861733485635834, "loss": 3.023, "step": 185 }, { "epoch": 0.053091069648579396, "grad_norm": 2.6875, "learning_rate": 0.00029859886671929233, "loss": 3.0768, "step": 186 }, { "epoch": 0.05337650550690509, "grad_norm": 1.90625, "learning_rate": 0.00029858027663918135, "loss": 3.0272, "step": 187 }, { "epoch": 0.05366194136523079, "grad_norm": 2.328125, "learning_rate": 0.0002985615646312807, "loss": 3.0348, "step": 188 }, { "epoch": 0.053947377223556484, "grad_norm": 2.140625, "learning_rate": 0.00029854273071094596, "loss": 3.0245, "step": 189 }, { "epoch": 0.05423281308188218, "grad_norm": 1.9375, "learning_rate": 0.00029852377489363247, "loss": 3.0558, "step": 190 }, { "epoch": 0.054518248940207875, "grad_norm": 2.578125, "learning_rate": 0.00029850469719489573, "loss": 3.0611, "step": 191 }, { "epoch": 0.05480368479853357, "grad_norm": 1.9453125, "learning_rate": 0.00029848549763039135, "loss": 3.0442, "step": 192 }, { "epoch": 0.05508912065685927, "grad_norm": 2.796875, "learning_rate": 0.00029846617621587474, "loss": 3.06, "step": 193 }, { "epoch": 0.05537455651518496, "grad_norm": 1.765625, "learning_rate": 0.00029844673296720154, "loss": 3.0144, "step": 194 }, { "epoch": 0.05565999237351066, "grad_norm": 2.46875, "learning_rate": 0.0002984271679003272, "loss": 3.0423, "step": 195 }, { "epoch": 0.055945428231836354, "grad_norm": 1.8671875, "learning_rate": 0.0002984074810313071, "loss": 3.0504, "step": 196 }, { "epoch": 0.05623086409016205, "grad_norm": 2.203125, "learning_rate": 0.00029838767237629684, "loss": 3.0031, "step": 197 }, { "epoch": 0.056516299948487746, "grad_norm": 1.8046875, "learning_rate": 0.0002983677419515516, "loss": 3.0401, "step": 198 }, { "epoch": 0.05680173580681344, "grad_norm": 2.015625, "learning_rate": 0.00029834768977342677, "loss": 3.0359, "step": 199 }, { "epoch": 0.05708717166513914, "grad_norm": 2.5625, "learning_rate": 0.0002983275158583775, "loss": 3.028, "step": 200 }, { "epoch": 0.05737260752346483, "grad_norm": 1.9140625, "learning_rate": 0.0002983072202229589, "loss": 3.0115, "step": 201 }, { "epoch": 0.05765804338179053, "grad_norm": 1.953125, "learning_rate": 0.000298286802883826, "loss": 3.0221, "step": 202 }, { "epoch": 0.057943479240116225, "grad_norm": 2.109375, "learning_rate": 0.0002982662638577335, "loss": 3.0104, "step": 203 }, { "epoch": 0.05822891509844192, "grad_norm": 2.21875, "learning_rate": 0.00029824560316153633, "loss": 2.9983, "step": 204 }, { "epoch": 0.058514350956767616, "grad_norm": 1.8984375, "learning_rate": 0.00029822482081218887, "loss": 3.0208, "step": 205 }, { "epoch": 0.05879978681509331, "grad_norm": 2.5, "learning_rate": 0.00029820391682674563, "loss": 3.0206, "step": 206 }, { "epoch": 0.05908522267341901, "grad_norm": 2.109375, "learning_rate": 0.00029818289122236075, "loss": 3.0552, "step": 207 }, { "epoch": 0.059370658531744704, "grad_norm": 1.7421875, "learning_rate": 0.00029816174401628827, "loss": 3.0075, "step": 208 }, { "epoch": 0.0596560943900704, "grad_norm": 2.03125, "learning_rate": 0.00029814047522588194, "loss": 3.0068, "step": 209 }, { "epoch": 0.059941530248396095, "grad_norm": 1.5546875, "learning_rate": 0.0002981190848685954, "loss": 2.9909, "step": 210 }, { "epoch": 0.06022696610672179, "grad_norm": 2.453125, "learning_rate": 0.00029809757296198194, "loss": 2.9962, "step": 211 }, { "epoch": 0.06051240196504749, "grad_norm": 1.515625, "learning_rate": 0.00029807593952369465, "loss": 3.0294, "step": 212 }, { "epoch": 0.06079783782337318, "grad_norm": 2.484375, "learning_rate": 0.00029805418457148637, "loss": 2.9857, "step": 213 }, { "epoch": 0.06108327368169888, "grad_norm": 2.15625, "learning_rate": 0.00029803230812320956, "loss": 3.0202, "step": 214 }, { "epoch": 0.061368709540024574, "grad_norm": 2.0625, "learning_rate": 0.00029801031019681645, "loss": 2.9734, "step": 215 }, { "epoch": 0.06165414539835027, "grad_norm": 1.7109375, "learning_rate": 0.000297988190810359, "loss": 2.9859, "step": 216 }, { "epoch": 0.061939581256675966, "grad_norm": 3.203125, "learning_rate": 0.0002979659499819888, "loss": 3.0128, "step": 217 }, { "epoch": 0.06222501711500166, "grad_norm": 1.7734375, "learning_rate": 0.0002979435877299571, "loss": 3.0178, "step": 218 }, { "epoch": 0.06251045297332736, "grad_norm": 2.75, "learning_rate": 0.0002979211040726147, "loss": 2.9779, "step": 219 }, { "epoch": 0.06279588883165305, "grad_norm": 2.375, "learning_rate": 0.00029789849902841223, "loss": 2.9843, "step": 220 }, { "epoch": 0.06308132468997875, "grad_norm": 2.3125, "learning_rate": 0.0002978757726158998, "loss": 2.9943, "step": 221 }, { "epoch": 0.06336676054830444, "grad_norm": 2.421875, "learning_rate": 0.0002978529248537271, "loss": 3.0043, "step": 222 }, { "epoch": 0.06365219640663014, "grad_norm": 2.109375, "learning_rate": 0.00029782995576064337, "loss": 2.9729, "step": 223 }, { "epoch": 0.06393763226495583, "grad_norm": 1.484375, "learning_rate": 0.00029780686535549756, "loss": 2.9874, "step": 224 }, { "epoch": 0.06422306812328153, "grad_norm": 2.5, "learning_rate": 0.0002977836536572382, "loss": 3.0055, "step": 225 }, { "epoch": 0.06450850398160722, "grad_norm": 1.8046875, "learning_rate": 0.00029776032068491303, "loss": 3.0, "step": 226 }, { "epoch": 0.06479393983993292, "grad_norm": 3.03125, "learning_rate": 0.0002977368664576696, "loss": 3.0042, "step": 227 }, { "epoch": 0.06507937569825861, "grad_norm": 2.453125, "learning_rate": 0.000297713290994755, "loss": 2.9981, "step": 228 }, { "epoch": 0.06536481155658432, "grad_norm": 2.65625, "learning_rate": 0.0002976895943155156, "loss": 2.9803, "step": 229 }, { "epoch": 0.06565024741491, "grad_norm": 3.015625, "learning_rate": 0.00029766577643939744, "loss": 2.9994, "step": 230 }, { "epoch": 0.0659356832732357, "grad_norm": 1.8203125, "learning_rate": 0.0002976418373859458, "loss": 2.9842, "step": 231 }, { "epoch": 0.0662211191315614, "grad_norm": 5.125, "learning_rate": 0.00029761777717480554, "loss": 3.0053, "step": 232 }, { "epoch": 0.0665065549898871, "grad_norm": 3.8125, "learning_rate": 0.00029759359582572103, "loss": 2.9906, "step": 233 }, { "epoch": 0.06679199084821279, "grad_norm": 4.03125, "learning_rate": 0.00029756929335853584, "loss": 3.0234, "step": 234 }, { "epoch": 0.06707742670653849, "grad_norm": 3.125, "learning_rate": 0.0002975448697931931, "loss": 2.9871, "step": 235 }, { "epoch": 0.06736286256486418, "grad_norm": 4.03125, "learning_rate": 0.00029752032514973516, "loss": 3.0048, "step": 236 }, { "epoch": 0.06764829842318988, "grad_norm": 3.453125, "learning_rate": 0.0002974956594483039, "loss": 3.0141, "step": 237 }, { "epoch": 0.06793373428151557, "grad_norm": 2.90625, "learning_rate": 0.0002974708727091404, "loss": 2.9658, "step": 238 }, { "epoch": 0.06821917013984127, "grad_norm": 2.6875, "learning_rate": 0.00029744596495258525, "loss": 3.002, "step": 239 }, { "epoch": 0.06850460599816696, "grad_norm": 2.625, "learning_rate": 0.0002974209361990781, "loss": 2.9831, "step": 240 }, { "epoch": 0.06879004185649266, "grad_norm": 2.28125, "learning_rate": 0.0002973957864691581, "loss": 2.9823, "step": 241 }, { "epoch": 0.06907547771481835, "grad_norm": 2.609375, "learning_rate": 0.00029737051578346345, "loss": 2.9626, "step": 242 }, { "epoch": 0.06936091357314406, "grad_norm": 1.8203125, "learning_rate": 0.000297345124162732, "loss": 2.9729, "step": 243 }, { "epoch": 0.06964634943146975, "grad_norm": 3.3125, "learning_rate": 0.00029731961162780037, "loss": 3.0036, "step": 244 }, { "epoch": 0.06993178528979545, "grad_norm": 2.59375, "learning_rate": 0.0002972939781996047, "loss": 2.9818, "step": 245 }, { "epoch": 0.07021722114812114, "grad_norm": 4.53125, "learning_rate": 0.00029726822389918034, "loss": 2.9709, "step": 246 }, { "epoch": 0.07050265700644684, "grad_norm": 4.09375, "learning_rate": 0.0002972423487476617, "loss": 2.9748, "step": 247 }, { "epoch": 0.07078809286477253, "grad_norm": 2.8125, "learning_rate": 0.0002972163527662824, "loss": 2.96, "step": 248 }, { "epoch": 0.07107352872309823, "grad_norm": 3.75, "learning_rate": 0.00029719023597637523, "loss": 2.9929, "step": 249 }, { "epoch": 0.07135896458142392, "grad_norm": 2.640625, "learning_rate": 0.00029716399839937216, "loss": 2.9467, "step": 250 }, { "epoch": 0.07135896458142392, "eval_loss": 2.805173873901367, "eval_runtime": 5998.7495, "eval_samples_per_second": 10.717, "eval_steps_per_second": 10.717, "step": 250 }, { "epoch": 0.07164440043974962, "grad_norm": 3.8125, "learning_rate": 0.00029713764005680427, "loss": 2.9764, "step": 251 }, { "epoch": 0.07192983629807531, "grad_norm": 3.625, "learning_rate": 0.00029711116097030167, "loss": 2.9982, "step": 252 }, { "epoch": 0.07221527215640101, "grad_norm": 2.359375, "learning_rate": 0.0002970845611615935, "loss": 2.9649, "step": 253 }, { "epoch": 0.0725007080147267, "grad_norm": 3.265625, "learning_rate": 0.00029705784065250826, "loss": 2.9516, "step": 254 }, { "epoch": 0.0727861438730524, "grad_norm": 2.875, "learning_rate": 0.00029703099946497323, "loss": 2.9788, "step": 255 }, { "epoch": 0.0730715797313781, "grad_norm": 2.734375, "learning_rate": 0.0002970040376210148, "loss": 2.9737, "step": 256 }, { "epoch": 0.0733570155897038, "grad_norm": 2.5625, "learning_rate": 0.00029697695514275824, "loss": 2.9806, "step": 257 }, { "epoch": 0.07364245144802949, "grad_norm": 2.375, "learning_rate": 0.00029694975205242816, "loss": 2.9629, "step": 258 }, { "epoch": 0.07392788730635519, "grad_norm": 2.578125, "learning_rate": 0.00029692242837234777, "loss": 2.9698, "step": 259 }, { "epoch": 0.07421332316468088, "grad_norm": 1.7890625, "learning_rate": 0.0002968949841249395, "loss": 2.9449, "step": 260 }, { "epoch": 0.07449875902300658, "grad_norm": 3.234375, "learning_rate": 0.00029686741933272455, "loss": 2.9724, "step": 261 }, { "epoch": 0.07478419488133227, "grad_norm": 2.421875, "learning_rate": 0.0002968397340183232, "loss": 2.9606, "step": 262 }, { "epoch": 0.07506963073965797, "grad_norm": 4.5, "learning_rate": 0.00029681192820445445, "loss": 3.0101, "step": 263 }, { "epoch": 0.07535506659798366, "grad_norm": 3.3125, "learning_rate": 0.00029678400191393626, "loss": 2.9797, "step": 264 }, { "epoch": 0.07564050245630936, "grad_norm": 4.4375, "learning_rate": 0.0002967559551696856, "loss": 2.9859, "step": 265 }, { "epoch": 0.07592593831463505, "grad_norm": 3.953125, "learning_rate": 0.00029672778799471797, "loss": 2.9839, "step": 266 }, { "epoch": 0.07621137417296076, "grad_norm": 3.640625, "learning_rate": 0.0002966995004121481, "loss": 2.9812, "step": 267 }, { "epoch": 0.07649681003128644, "grad_norm": 3.546875, "learning_rate": 0.00029667109244518923, "loss": 2.9904, "step": 268 }, { "epoch": 0.07678224588961215, "grad_norm": 2.671875, "learning_rate": 0.0002966425641171534, "loss": 2.9614, "step": 269 }, { "epoch": 0.07706768174793784, "grad_norm": 2.6875, "learning_rate": 0.00029661391545145156, "loss": 2.9671, "step": 270 }, { "epoch": 0.07735311760626354, "grad_norm": 2.125, "learning_rate": 0.00029658514647159335, "loss": 2.9646, "step": 271 }, { "epoch": 0.07763855346458923, "grad_norm": 2.640625, "learning_rate": 0.0002965562572011872, "loss": 2.9729, "step": 272 }, { "epoch": 0.07792398932291493, "grad_norm": 1.78125, "learning_rate": 0.00029652724766394007, "loss": 2.9315, "step": 273 }, { "epoch": 0.07820942518124062, "grad_norm": 3.015625, "learning_rate": 0.0002964981178836578, "loss": 2.9511, "step": 274 }, { "epoch": 0.07849486103956632, "grad_norm": 2.171875, "learning_rate": 0.00029646886788424487, "loss": 2.9338, "step": 275 }, { "epoch": 0.07878029689789201, "grad_norm": 3.296875, "learning_rate": 0.0002964394976897043, "loss": 2.936, "step": 276 }, { "epoch": 0.07906573275621771, "grad_norm": 2.734375, "learning_rate": 0.0002964100073241379, "loss": 2.9335, "step": 277 }, { "epoch": 0.0793511686145434, "grad_norm": 2.65625, "learning_rate": 0.000296380396811746, "loss": 2.9638, "step": 278 }, { "epoch": 0.0796366044728691, "grad_norm": 2.203125, "learning_rate": 0.00029635066617682754, "loss": 2.9612, "step": 279 }, { "epoch": 0.0799220403311948, "grad_norm": 2.09375, "learning_rate": 0.00029632081544378003, "loss": 2.9579, "step": 280 }, { "epoch": 0.0802074761895205, "grad_norm": 1.734375, "learning_rate": 0.00029629084463709957, "loss": 2.9506, "step": 281 }, { "epoch": 0.08049291204784619, "grad_norm": 1.8671875, "learning_rate": 0.0002962607537813808, "loss": 2.9479, "step": 282 }, { "epoch": 0.08077834790617189, "grad_norm": 1.4609375, "learning_rate": 0.0002962305429013168, "loss": 2.9124, "step": 283 }, { "epoch": 0.08106378376449758, "grad_norm": 2.03125, "learning_rate": 0.0002962002120216992, "loss": 2.9741, "step": 284 }, { "epoch": 0.08134921962282328, "grad_norm": 1.625, "learning_rate": 0.0002961697611674181, "loss": 2.9481, "step": 285 }, { "epoch": 0.08163465548114897, "grad_norm": 1.875, "learning_rate": 0.00029613919036346203, "loss": 2.9457, "step": 286 }, { "epoch": 0.08192009133947467, "grad_norm": 1.78125, "learning_rate": 0.00029610849963491797, "loss": 2.9509, "step": 287 }, { "epoch": 0.08220552719780036, "grad_norm": 2.40625, "learning_rate": 0.0002960776890069714, "loss": 2.9441, "step": 288 }, { "epoch": 0.08249096305612605, "grad_norm": 2.03125, "learning_rate": 0.0002960467585049059, "loss": 2.9625, "step": 289 }, { "epoch": 0.08277639891445175, "grad_norm": 1.453125, "learning_rate": 0.0002960157081541039, "loss": 2.9183, "step": 290 }, { "epoch": 0.08306183477277744, "grad_norm": 2.0, "learning_rate": 0.0002959845379800457, "loss": 2.9312, "step": 291 }, { "epoch": 0.08334727063110314, "grad_norm": 1.734375, "learning_rate": 0.00029595324800831024, "loss": 2.9224, "step": 292 }, { "epoch": 0.08363270648942883, "grad_norm": 2.1875, "learning_rate": 0.0002959218382645746, "loss": 2.9394, "step": 293 }, { "epoch": 0.08391814234775453, "grad_norm": 1.96875, "learning_rate": 0.00029589030877461426, "loss": 2.9493, "step": 294 }, { "epoch": 0.08420357820608022, "grad_norm": 1.5078125, "learning_rate": 0.00029585865956430283, "loss": 2.9385, "step": 295 }, { "epoch": 0.08448901406440593, "grad_norm": 2.4375, "learning_rate": 0.00029582689065961237, "loss": 2.9265, "step": 296 }, { "epoch": 0.08477444992273162, "grad_norm": 1.609375, "learning_rate": 0.00029579500208661296, "loss": 2.9448, "step": 297 }, { "epoch": 0.08505988578105732, "grad_norm": 1.8984375, "learning_rate": 0.00029576299387147305, "loss": 2.9555, "step": 298 }, { "epoch": 0.085345321639383, "grad_norm": 1.6953125, "learning_rate": 0.00029573086604045904, "loss": 2.904, "step": 299 }, { "epoch": 0.08563075749770871, "grad_norm": 2.046875, "learning_rate": 0.0002956986186199358, "loss": 2.959, "step": 300 }, { "epoch": 0.0859161933560344, "grad_norm": 1.171875, "learning_rate": 0.0002956662516363661, "loss": 2.9075, "step": 301 }, { "epoch": 0.0862016292143601, "grad_norm": 2.40625, "learning_rate": 0.0002956337651163109, "loss": 2.9521, "step": 302 }, { "epoch": 0.08648706507268579, "grad_norm": 1.703125, "learning_rate": 0.00029560115908642924, "loss": 2.9425, "step": 303 }, { "epoch": 0.08677250093101149, "grad_norm": 2.59375, "learning_rate": 0.0002955684335734783, "loss": 2.9626, "step": 304 }, { "epoch": 0.08705793678933718, "grad_norm": 1.8515625, "learning_rate": 0.00029553558860431317, "loss": 2.9293, "step": 305 }, { "epoch": 0.08734337264766288, "grad_norm": 2.515625, "learning_rate": 0.0002955026242058872, "loss": 2.9332, "step": 306 }, { "epoch": 0.08762880850598857, "grad_norm": 2.03125, "learning_rate": 0.0002954695404052514, "loss": 2.9323, "step": 307 }, { "epoch": 0.08791424436431428, "grad_norm": 2.265625, "learning_rate": 0.0002954363372295551, "loss": 2.9408, "step": 308 }, { "epoch": 0.08819968022263996, "grad_norm": 1.859375, "learning_rate": 0.0002954030147060454, "loss": 2.9305, "step": 309 }, { "epoch": 0.08848511608096567, "grad_norm": 2.1875, "learning_rate": 0.0002953695728620675, "loss": 2.9323, "step": 310 }, { "epoch": 0.08877055193929136, "grad_norm": 1.7890625, "learning_rate": 0.00029533601172506427, "loss": 2.9138, "step": 311 }, { "epoch": 0.08905598779761706, "grad_norm": 2.40625, "learning_rate": 0.00029530233132257663, "loss": 2.9394, "step": 312 }, { "epoch": 0.08934142365594275, "grad_norm": 1.7578125, "learning_rate": 0.00029526853168224343, "loss": 2.8984, "step": 313 }, { "epoch": 0.08962685951426845, "grad_norm": 2.359375, "learning_rate": 0.0002952346128318013, "loss": 2.9322, "step": 314 }, { "epoch": 0.08991229537259414, "grad_norm": 1.9375, "learning_rate": 0.00029520057479908465, "loss": 2.9164, "step": 315 }, { "epoch": 0.09019773123091984, "grad_norm": 2.234375, "learning_rate": 0.0002951664176120257, "loss": 2.9167, "step": 316 }, { "epoch": 0.09048316708924553, "grad_norm": 1.9375, "learning_rate": 0.00029513214129865456, "loss": 2.9398, "step": 317 }, { "epoch": 0.09076860294757123, "grad_norm": 2.21875, "learning_rate": 0.00029509774588709896, "loss": 2.9395, "step": 318 }, { "epoch": 0.09105403880589692, "grad_norm": 1.828125, "learning_rate": 0.00029506323140558445, "loss": 2.9478, "step": 319 }, { "epoch": 0.09133947466422263, "grad_norm": 1.9140625, "learning_rate": 0.0002950285978824343, "loss": 2.9216, "step": 320 }, { "epoch": 0.09162491052254831, "grad_norm": 1.7109375, "learning_rate": 0.00029499384534606936, "loss": 2.8959, "step": 321 }, { "epoch": 0.09191034638087402, "grad_norm": 1.75, "learning_rate": 0.00029495897382500827, "loss": 2.9072, "step": 322 }, { "epoch": 0.0921957822391997, "grad_norm": 1.5234375, "learning_rate": 0.00029492398334786727, "loss": 2.9121, "step": 323 }, { "epoch": 0.09248121809752541, "grad_norm": 2.09375, "learning_rate": 0.0002948888739433602, "loss": 2.9344, "step": 324 }, { "epoch": 0.0927666539558511, "grad_norm": 1.765625, "learning_rate": 0.0002948536456402985, "loss": 2.9211, "step": 325 }, { "epoch": 0.0930520898141768, "grad_norm": 1.9296875, "learning_rate": 0.00029481829846759116, "loss": 2.9041, "step": 326 }, { "epoch": 0.09333752567250249, "grad_norm": 2.265625, "learning_rate": 0.0002947828324542448, "loss": 2.9353, "step": 327 }, { "epoch": 0.09362296153082819, "grad_norm": 1.1328125, "learning_rate": 0.0002947472476293634, "loss": 2.9037, "step": 328 }, { "epoch": 0.09390839738915388, "grad_norm": 1.8359375, "learning_rate": 0.00029471154402214864, "loss": 2.9166, "step": 329 }, { "epoch": 0.09419383324747958, "grad_norm": 2.078125, "learning_rate": 0.00029467572166189956, "loss": 2.9074, "step": 330 }, { "epoch": 0.09447926910580527, "grad_norm": 2.015625, "learning_rate": 0.00029463978057801257, "loss": 2.9137, "step": 331 }, { "epoch": 0.09476470496413097, "grad_norm": 1.7734375, "learning_rate": 0.00029460372079998177, "loss": 2.8971, "step": 332 }, { "epoch": 0.09505014082245666, "grad_norm": 1.296875, "learning_rate": 0.00029456754235739833, "loss": 2.8784, "step": 333 }, { "epoch": 0.09533557668078237, "grad_norm": 2.203125, "learning_rate": 0.0002945312452799511, "loss": 2.9102, "step": 334 }, { "epoch": 0.09562101253910806, "grad_norm": 1.34375, "learning_rate": 0.00029449482959742604, "loss": 2.9096, "step": 335 }, { "epoch": 0.09590644839743376, "grad_norm": 1.3671875, "learning_rate": 0.0002944582953397067, "loss": 2.8925, "step": 336 }, { "epoch": 0.09619188425575945, "grad_norm": 1.9375, "learning_rate": 0.0002944216425367736, "loss": 2.9094, "step": 337 }, { "epoch": 0.09647732011408515, "grad_norm": 1.9140625, "learning_rate": 0.0002943848712187048, "loss": 2.9133, "step": 338 }, { "epoch": 0.09676275597241084, "grad_norm": 1.90625, "learning_rate": 0.0002943479814156756, "loss": 2.9073, "step": 339 }, { "epoch": 0.09704819183073654, "grad_norm": 1.2734375, "learning_rate": 0.00029431097315795834, "loss": 2.8993, "step": 340 }, { "epoch": 0.09733362768906223, "grad_norm": 2.296875, "learning_rate": 0.00029427384647592284, "loss": 2.8968, "step": 341 }, { "epoch": 0.09761906354738793, "grad_norm": 1.4609375, "learning_rate": 0.0002942366014000359, "loss": 2.9124, "step": 342 }, { "epoch": 0.09790449940571362, "grad_norm": 2.484375, "learning_rate": 0.0002941992379608615, "loss": 2.8816, "step": 343 }, { "epoch": 0.09818993526403932, "grad_norm": 1.609375, "learning_rate": 0.00029416175618906084, "loss": 2.9015, "step": 344 }, { "epoch": 0.09847537112236501, "grad_norm": 2.6875, "learning_rate": 0.00029412415611539214, "loss": 2.9286, "step": 345 }, { "epoch": 0.09876080698069072, "grad_norm": 2.140625, "learning_rate": 0.00029408643777071073, "loss": 2.9316, "step": 346 }, { "epoch": 0.0990462428390164, "grad_norm": 2.28125, "learning_rate": 0.00029404860118596905, "loss": 2.894, "step": 347 }, { "epoch": 0.09933167869734211, "grad_norm": 2.09375, "learning_rate": 0.00029401064639221643, "loss": 2.8946, "step": 348 }, { "epoch": 0.0996171145556678, "grad_norm": 2.21875, "learning_rate": 0.0002939725734205994, "loss": 2.9068, "step": 349 }, { "epoch": 0.0999025504139935, "grad_norm": 1.84375, "learning_rate": 0.00029393438230236124, "loss": 2.8898, "step": 350 }, { "epoch": 0.10018798627231919, "grad_norm": 1.7578125, "learning_rate": 0.0002938960730688424, "loss": 2.8922, "step": 351 }, { "epoch": 0.10047342213064489, "grad_norm": 1.640625, "learning_rate": 0.00029385764575148014, "loss": 2.8772, "step": 352 }, { "epoch": 0.10075885798897058, "grad_norm": 1.7890625, "learning_rate": 0.00029381910038180856, "loss": 2.8961, "step": 353 }, { "epoch": 0.10104429384729628, "grad_norm": 1.890625, "learning_rate": 0.00029378043699145886, "loss": 2.9052, "step": 354 }, { "epoch": 0.10132972970562197, "grad_norm": 1.9375, "learning_rate": 0.0002937416556121589, "loss": 2.8703, "step": 355 }, { "epoch": 0.10161516556394767, "grad_norm": 1.5625, "learning_rate": 0.0002937027562757334, "loss": 2.8967, "step": 356 }, { "epoch": 0.10190060142227336, "grad_norm": 1.6640625, "learning_rate": 0.00029366373901410387, "loss": 2.913, "step": 357 }, { "epoch": 0.10218603728059907, "grad_norm": 1.53125, "learning_rate": 0.0002936246038592886, "loss": 2.8944, "step": 358 }, { "epoch": 0.10247147313892475, "grad_norm": 2.390625, "learning_rate": 0.00029358535084340274, "loss": 2.8808, "step": 359 }, { "epoch": 0.10275690899725046, "grad_norm": 1.421875, "learning_rate": 0.000293545979998658, "loss": 2.9055, "step": 360 }, { "epoch": 0.10304234485557615, "grad_norm": 1.71875, "learning_rate": 0.0002935064913573628, "loss": 2.8925, "step": 361 }, { "epoch": 0.10332778071390183, "grad_norm": 2.0625, "learning_rate": 0.0002934668849519223, "loss": 2.8751, "step": 362 }, { "epoch": 0.10361321657222754, "grad_norm": 2.015625, "learning_rate": 0.00029342716081483825, "loss": 2.8836, "step": 363 }, { "epoch": 0.10389865243055323, "grad_norm": 1.4375, "learning_rate": 0.0002933873189787091, "loss": 2.8702, "step": 364 }, { "epoch": 0.10418408828887893, "grad_norm": 2.375, "learning_rate": 0.0002933473594762297, "loss": 2.8953, "step": 365 }, { "epoch": 0.10446952414720462, "grad_norm": 1.3671875, "learning_rate": 0.00029330728234019173, "loss": 2.8753, "step": 366 }, { "epoch": 0.10475496000553032, "grad_norm": 2.90625, "learning_rate": 0.0002932670876034831, "loss": 2.8844, "step": 367 }, { "epoch": 0.10504039586385601, "grad_norm": 2.109375, "learning_rate": 0.00029322677529908844, "loss": 2.9018, "step": 368 }, { "epoch": 0.10532583172218171, "grad_norm": 2.328125, "learning_rate": 0.0002931863454600888, "loss": 2.8967, "step": 369 }, { "epoch": 0.1056112675805074, "grad_norm": 2.0625, "learning_rate": 0.0002931457981196616, "loss": 2.882, "step": 370 }, { "epoch": 0.1058967034388331, "grad_norm": 2.1875, "learning_rate": 0.00029310513331108086, "loss": 2.8641, "step": 371 }, { "epoch": 0.10618213929715879, "grad_norm": 1.78125, "learning_rate": 0.0002930643510677168, "loss": 2.8808, "step": 372 }, { "epoch": 0.1064675751554845, "grad_norm": 1.703125, "learning_rate": 0.00029302345142303616, "loss": 2.8699, "step": 373 }, { "epoch": 0.10675301101381018, "grad_norm": 1.8984375, "learning_rate": 0.0002929824344106019, "loss": 2.8467, "step": 374 }, { "epoch": 0.10703844687213589, "grad_norm": 1.546875, "learning_rate": 0.0002929413000640735, "loss": 2.8674, "step": 375 }, { "epoch": 0.10732388273046158, "grad_norm": 2.265625, "learning_rate": 0.0002929000484172064, "loss": 2.8897, "step": 376 }, { "epoch": 0.10760931858878728, "grad_norm": 1.328125, "learning_rate": 0.00029285867950385255, "loss": 2.8601, "step": 377 }, { "epoch": 0.10789475444711297, "grad_norm": 2.65625, "learning_rate": 0.00029281719335796013, "loss": 2.89, "step": 378 }, { "epoch": 0.10818019030543867, "grad_norm": 2.046875, "learning_rate": 0.00029277559001357343, "loss": 2.9044, "step": 379 }, { "epoch": 0.10846562616376436, "grad_norm": 2.109375, "learning_rate": 0.00029273386950483287, "loss": 2.8765, "step": 380 }, { "epoch": 0.10875106202209006, "grad_norm": 1.7265625, "learning_rate": 0.00029269203186597513, "loss": 2.8911, "step": 381 }, { "epoch": 0.10903649788041575, "grad_norm": 2.296875, "learning_rate": 0.00029265007713133304, "loss": 2.8756, "step": 382 }, { "epoch": 0.10932193373874145, "grad_norm": 1.5703125, "learning_rate": 0.00029260800533533534, "loss": 2.889, "step": 383 }, { "epoch": 0.10960736959706714, "grad_norm": 2.609375, "learning_rate": 0.000292565816512507, "loss": 2.8758, "step": 384 }, { "epoch": 0.10989280545539284, "grad_norm": 2.125, "learning_rate": 0.000292523510697469, "loss": 2.8699, "step": 385 }, { "epoch": 0.11017824131371853, "grad_norm": 2.515625, "learning_rate": 0.0002924810879249382, "loss": 2.8935, "step": 386 }, { "epoch": 0.11046367717204424, "grad_norm": 2.015625, "learning_rate": 0.00029243854822972763, "loss": 2.8723, "step": 387 }, { "epoch": 0.11074911303036993, "grad_norm": 2.21875, "learning_rate": 0.0002923958916467461, "loss": 2.894, "step": 388 }, { "epoch": 0.11103454888869563, "grad_norm": 2.0625, "learning_rate": 0.00029235311821099847, "loss": 2.8676, "step": 389 }, { "epoch": 0.11131998474702132, "grad_norm": 1.75, "learning_rate": 0.00029231022795758537, "loss": 2.8786, "step": 390 }, { "epoch": 0.11160542060534702, "grad_norm": 1.671875, "learning_rate": 0.0002922672209217033, "loss": 2.867, "step": 391 }, { "epoch": 0.11189085646367271, "grad_norm": 1.8359375, "learning_rate": 0.00029222409713864484, "loss": 2.8938, "step": 392 }, { "epoch": 0.11217629232199841, "grad_norm": 1.6640625, "learning_rate": 0.00029218085664379806, "loss": 2.8601, "step": 393 }, { "epoch": 0.1124617281803241, "grad_norm": 1.5078125, "learning_rate": 0.0002921374994726469, "loss": 2.8817, "step": 394 }, { "epoch": 0.1127471640386498, "grad_norm": 1.6015625, "learning_rate": 0.0002920940256607711, "loss": 2.8482, "step": 395 }, { "epoch": 0.11303259989697549, "grad_norm": 1.859375, "learning_rate": 0.0002920504352438462, "loss": 2.8996, "step": 396 }, { "epoch": 0.1133180357553012, "grad_norm": 2.015625, "learning_rate": 0.00029200672825764314, "loss": 2.8592, "step": 397 }, { "epoch": 0.11360347161362688, "grad_norm": 1.46875, "learning_rate": 0.00029196290473802885, "loss": 2.8327, "step": 398 }, { "epoch": 0.11388890747195259, "grad_norm": 1.515625, "learning_rate": 0.0002919189647209656, "loss": 2.8438, "step": 399 }, { "epoch": 0.11417434333027827, "grad_norm": 1.3359375, "learning_rate": 0.00029187490824251154, "loss": 2.884, "step": 400 }, { "epoch": 0.11445977918860398, "grad_norm": 1.828125, "learning_rate": 0.00029183073533882025, "loss": 2.8601, "step": 401 }, { "epoch": 0.11474521504692967, "grad_norm": 1.96875, "learning_rate": 0.00029178644604614077, "loss": 2.8788, "step": 402 }, { "epoch": 0.11503065090525537, "grad_norm": 1.7890625, "learning_rate": 0.00029174204040081773, "loss": 2.8823, "step": 403 }, { "epoch": 0.11531608676358106, "grad_norm": 1.7265625, "learning_rate": 0.0002916975184392912, "loss": 2.8464, "step": 404 }, { "epoch": 0.11560152262190676, "grad_norm": 1.0625, "learning_rate": 0.0002916528801980969, "loss": 2.8377, "step": 405 }, { "epoch": 0.11588695848023245, "grad_norm": 2.109375, "learning_rate": 0.00029160812571386575, "loss": 2.8409, "step": 406 }, { "epoch": 0.11617239433855815, "grad_norm": 1.4609375, "learning_rate": 0.00029156325502332413, "loss": 2.8581, "step": 407 }, { "epoch": 0.11645783019688384, "grad_norm": 1.9296875, "learning_rate": 0.00029151826816329365, "loss": 2.865, "step": 408 }, { "epoch": 0.11674326605520954, "grad_norm": 1.71875, "learning_rate": 0.00029147316517069157, "loss": 2.8527, "step": 409 }, { "epoch": 0.11702870191353523, "grad_norm": 1.4375, "learning_rate": 0.00029142794608253016, "loss": 2.8494, "step": 410 }, { "epoch": 0.11731413777186094, "grad_norm": 3.875, "learning_rate": 0.0002913826109359171, "loss": 2.8461, "step": 411 }, { "epoch": 0.11759957363018662, "grad_norm": 1.875, "learning_rate": 0.00029133715976805525, "loss": 2.8565, "step": 412 }, { "epoch": 0.11788500948851233, "grad_norm": 3.125, "learning_rate": 0.0002912915926162427, "loss": 2.8667, "step": 413 }, { "epoch": 0.11817044534683802, "grad_norm": 2.0625, "learning_rate": 0.00029124590951787267, "loss": 2.8504, "step": 414 }, { "epoch": 0.11845588120516372, "grad_norm": 3.46875, "learning_rate": 0.0002912001105104337, "loss": 2.8719, "step": 415 }, { "epoch": 0.11874131706348941, "grad_norm": 2.171875, "learning_rate": 0.00029115419563150916, "loss": 2.8702, "step": 416 }, { "epoch": 0.11902675292181511, "grad_norm": 4.71875, "learning_rate": 0.0002911081649187778, "loss": 2.8971, "step": 417 }, { "epoch": 0.1193121887801408, "grad_norm": 3.828125, "learning_rate": 0.0002910620184100133, "loss": 2.9119, "step": 418 }, { "epoch": 0.1195976246384665, "grad_norm": 4.25, "learning_rate": 0.0002910157561430842, "loss": 2.8927, "step": 419 }, { "epoch": 0.11988306049679219, "grad_norm": 3.65625, "learning_rate": 0.0002909693781559544, "loss": 2.861, "step": 420 }, { "epoch": 0.1201684963551179, "grad_norm": 3.59375, "learning_rate": 0.0002909228844866824, "loss": 2.8826, "step": 421 }, { "epoch": 0.12045393221344358, "grad_norm": 3.265625, "learning_rate": 0.0002908762751734219, "loss": 2.8495, "step": 422 }, { "epoch": 0.12073936807176928, "grad_norm": 3.484375, "learning_rate": 0.0002908295502544213, "loss": 2.8707, "step": 423 }, { "epoch": 0.12102480393009497, "grad_norm": 2.828125, "learning_rate": 0.00029078270976802393, "loss": 2.8647, "step": 424 }, { "epoch": 0.12131023978842068, "grad_norm": 3.515625, "learning_rate": 0.00029073575375266806, "loss": 2.8505, "step": 425 }, { "epoch": 0.12159567564674637, "grad_norm": 2.5, "learning_rate": 0.0002906886822468867, "loss": 2.8821, "step": 426 }, { "epoch": 0.12188111150507207, "grad_norm": 5.4375, "learning_rate": 0.0002906414952893075, "loss": 2.8788, "step": 427 }, { "epoch": 0.12216654736339776, "grad_norm": 4.40625, "learning_rate": 0.00029059419291865314, "loss": 2.8715, "step": 428 }, { "epoch": 0.12245198322172346, "grad_norm": 3.6875, "learning_rate": 0.0002905467751737407, "loss": 2.846, "step": 429 }, { "epoch": 0.12273741908004915, "grad_norm": 3.765625, "learning_rate": 0.00029049924209348214, "loss": 2.856, "step": 430 }, { "epoch": 0.12302285493837485, "grad_norm": 2.90625, "learning_rate": 0.000290451593716884, "loss": 2.8646, "step": 431 }, { "epoch": 0.12330829079670054, "grad_norm": 2.890625, "learning_rate": 0.00029040383008304744, "loss": 2.8408, "step": 432 }, { "epoch": 0.12359372665502623, "grad_norm": 2.984375, "learning_rate": 0.00029035595123116817, "loss": 2.8501, "step": 433 }, { "epoch": 0.12387916251335193, "grad_norm": 2.53125, "learning_rate": 0.0002903079572005365, "loss": 2.8384, "step": 434 }, { "epoch": 0.12416459837167762, "grad_norm": 3.40625, "learning_rate": 0.00029025984803053735, "loss": 2.8436, "step": 435 }, { "epoch": 0.12445003423000332, "grad_norm": 2.984375, "learning_rate": 0.0002902116237606498, "loss": 2.8543, "step": 436 }, { "epoch": 0.12473547008832901, "grad_norm": 3.65625, "learning_rate": 0.0002901632844304478, "loss": 2.8469, "step": 437 }, { "epoch": 0.12502090594665471, "grad_norm": 3.40625, "learning_rate": 0.0002901148300795994, "loss": 2.8636, "step": 438 }, { "epoch": 0.1253063418049804, "grad_norm": 3.546875, "learning_rate": 0.0002900662607478672, "loss": 2.8424, "step": 439 }, { "epoch": 0.1255917776633061, "grad_norm": 3.265625, "learning_rate": 0.00029001757647510815, "loss": 2.8493, "step": 440 }, { "epoch": 0.1258772135216318, "grad_norm": 2.890625, "learning_rate": 0.0002899687773012734, "loss": 2.8214, "step": 441 }, { "epoch": 0.1261626493799575, "grad_norm": 2.75, "learning_rate": 0.0002899198632664086, "loss": 2.8492, "step": 442 }, { "epoch": 0.1264480852382832, "grad_norm": 3.6875, "learning_rate": 0.0002898708344106533, "loss": 2.8111, "step": 443 }, { "epoch": 0.12673352109660888, "grad_norm": 3.546875, "learning_rate": 0.0002898216907742418, "loss": 2.8513, "step": 444 }, { "epoch": 0.1270189569549346, "grad_norm": 2.671875, "learning_rate": 0.0002897724323975021, "loss": 2.8602, "step": 445 }, { "epoch": 0.12730439281326028, "grad_norm": 2.578125, "learning_rate": 0.0002897230593208567, "loss": 2.8462, "step": 446 }, { "epoch": 0.12758982867158597, "grad_norm": 2.828125, "learning_rate": 0.00028967357158482196, "loss": 2.8422, "step": 447 }, { "epoch": 0.12787526452991166, "grad_norm": 2.375, "learning_rate": 0.00028962396923000846, "loss": 2.8382, "step": 448 }, { "epoch": 0.12816070038823738, "grad_norm": 3.6875, "learning_rate": 0.0002895742522971209, "loss": 2.8544, "step": 449 }, { "epoch": 0.12844613624656306, "grad_norm": 3.640625, "learning_rate": 0.0002895244208269579, "loss": 2.8542, "step": 450 }, { "epoch": 0.12873157210488875, "grad_norm": 2.40625, "learning_rate": 0.0002894744748604121, "loss": 2.8417, "step": 451 }, { "epoch": 0.12901700796321444, "grad_norm": 2.375, "learning_rate": 0.0002894244144384701, "loss": 2.8588, "step": 452 }, { "epoch": 0.12930244382154016, "grad_norm": 2.765625, "learning_rate": 0.0002893742396022125, "loss": 2.8388, "step": 453 }, { "epoch": 0.12958787967986585, "grad_norm": 2.359375, "learning_rate": 0.0002893239503928137, "loss": 2.8559, "step": 454 }, { "epoch": 0.12987331553819154, "grad_norm": 3.96875, "learning_rate": 0.00028927354685154185, "loss": 2.8341, "step": 455 }, { "epoch": 0.13015875139651722, "grad_norm": 3.765625, "learning_rate": 0.0002892230290197592, "loss": 2.8267, "step": 456 }, { "epoch": 0.13044418725484294, "grad_norm": 2.078125, "learning_rate": 0.0002891723969389216, "loss": 2.8497, "step": 457 }, { "epoch": 0.13072962311316863, "grad_norm": 2.0, "learning_rate": 0.0002891216506505787, "loss": 2.8252, "step": 458 }, { "epoch": 0.13101505897149432, "grad_norm": 3.5625, "learning_rate": 0.0002890707901963738, "loss": 2.8563, "step": 459 }, { "epoch": 0.13130049482982, "grad_norm": 3.171875, "learning_rate": 0.00028901981561804403, "loss": 2.861, "step": 460 }, { "epoch": 0.13158593068814572, "grad_norm": 2.734375, "learning_rate": 0.0002889687269574201, "loss": 2.8336, "step": 461 }, { "epoch": 0.1318713665464714, "grad_norm": 2.78125, "learning_rate": 0.0002889175242564263, "loss": 2.8575, "step": 462 }, { "epoch": 0.1321568024047971, "grad_norm": 2.265625, "learning_rate": 0.00028886620755708045, "loss": 2.8301, "step": 463 }, { "epoch": 0.1324422382631228, "grad_norm": 2.015625, "learning_rate": 0.0002888147769014942, "loss": 2.8299, "step": 464 }, { "epoch": 0.1327276741214485, "grad_norm": 3.1875, "learning_rate": 0.0002887632323318723, "loss": 2.8261, "step": 465 }, { "epoch": 0.1330131099797742, "grad_norm": 2.84375, "learning_rate": 0.0002887115738905134, "loss": 2.8398, "step": 466 }, { "epoch": 0.13329854583809989, "grad_norm": 2.828125, "learning_rate": 0.0002886598016198093, "loss": 2.8414, "step": 467 }, { "epoch": 0.13358398169642557, "grad_norm": 2.640625, "learning_rate": 0.00028860791556224524, "loss": 2.8286, "step": 468 }, { "epoch": 0.1338694175547513, "grad_norm": 2.8125, "learning_rate": 0.00028855591576040004, "loss": 2.8641, "step": 469 }, { "epoch": 0.13415485341307698, "grad_norm": 2.65625, "learning_rate": 0.0002885038022569457, "loss": 2.8478, "step": 470 }, { "epoch": 0.13444028927140267, "grad_norm": 2.703125, "learning_rate": 0.0002884515750946474, "loss": 2.8215, "step": 471 }, { "epoch": 0.13472572512972836, "grad_norm": 2.515625, "learning_rate": 0.0002883992343163639, "loss": 2.8004, "step": 472 }, { "epoch": 0.13501116098805407, "grad_norm": 2.75, "learning_rate": 0.00028834677996504696, "loss": 2.8395, "step": 473 }, { "epoch": 0.13529659684637976, "grad_norm": 2.625, "learning_rate": 0.00028829421208374166, "loss": 2.8313, "step": 474 }, { "epoch": 0.13558203270470545, "grad_norm": 2.640625, "learning_rate": 0.0002882415307155862, "loss": 2.841, "step": 475 }, { "epoch": 0.13586746856303114, "grad_norm": 2.4375, "learning_rate": 0.00028818873590381183, "loss": 2.8614, "step": 476 }, { "epoch": 0.13615290442135686, "grad_norm": 2.71875, "learning_rate": 0.000288135827691743, "loss": 2.8482, "step": 477 }, { "epoch": 0.13643834027968255, "grad_norm": 2.5, "learning_rate": 0.0002880828061227973, "loss": 2.8532, "step": 478 }, { "epoch": 0.13672377613800824, "grad_norm": 2.75, "learning_rate": 0.0002880296712404851, "loss": 2.8337, "step": 479 }, { "epoch": 0.13700921199633392, "grad_norm": 2.578125, "learning_rate": 0.0002879764230884099, "loss": 2.8183, "step": 480 }, { "epoch": 0.13729464785465964, "grad_norm": 2.515625, "learning_rate": 0.00028792306171026823, "loss": 2.8161, "step": 481 }, { "epoch": 0.13758008371298533, "grad_norm": 2.390625, "learning_rate": 0.00028786958714984936, "loss": 2.8174, "step": 482 }, { "epoch": 0.13786551957131102, "grad_norm": 2.609375, "learning_rate": 0.0002878159994510356, "loss": 2.8075, "step": 483 }, { "epoch": 0.1381509554296367, "grad_norm": 2.5, "learning_rate": 0.00028776229865780205, "loss": 2.8157, "step": 484 }, { "epoch": 0.13843639128796242, "grad_norm": 2.453125, "learning_rate": 0.0002877084848142165, "loss": 2.8291, "step": 485 }, { "epoch": 0.1387218271462881, "grad_norm": 2.375, "learning_rate": 0.0002876545579644396, "loss": 2.8247, "step": 486 }, { "epoch": 0.1390072630046138, "grad_norm": 2.671875, "learning_rate": 0.0002876005181527249, "loss": 2.8366, "step": 487 }, { "epoch": 0.1392926988629395, "grad_norm": 2.546875, "learning_rate": 0.0002875463654234183, "loss": 2.8679, "step": 488 }, { "epoch": 0.1395781347212652, "grad_norm": 2.5625, "learning_rate": 0.0002874920998209587, "loss": 2.8432, "step": 489 }, { "epoch": 0.1398635705795909, "grad_norm": 2.40625, "learning_rate": 0.00028743772138987745, "loss": 2.8366, "step": 490 }, { "epoch": 0.14014900643791658, "grad_norm": 2.546875, "learning_rate": 0.0002873832301747985, "loss": 2.8279, "step": 491 }, { "epoch": 0.14043444229624227, "grad_norm": 2.28125, "learning_rate": 0.00028732862622043835, "loss": 2.7933, "step": 492 }, { "epoch": 0.140719878154568, "grad_norm": 2.671875, "learning_rate": 0.000287273909571606, "loss": 2.8563, "step": 493 }, { "epoch": 0.14100531401289368, "grad_norm": 2.546875, "learning_rate": 0.00028721908027320314, "loss": 2.858, "step": 494 }, { "epoch": 0.14129074987121937, "grad_norm": 2.390625, "learning_rate": 0.00028716413837022355, "loss": 2.7946, "step": 495 }, { "epoch": 0.14157618572954506, "grad_norm": 2.15625, "learning_rate": 0.0002871090839077537, "loss": 2.7874, "step": 496 }, { "epoch": 0.14186162158787077, "grad_norm": 2.546875, "learning_rate": 0.0002870539169309723, "loss": 2.8255, "step": 497 }, { "epoch": 0.14214705744619646, "grad_norm": 2.453125, "learning_rate": 0.0002869986374851504, "loss": 2.8218, "step": 498 }, { "epoch": 0.14243249330452215, "grad_norm": 2.46875, "learning_rate": 0.00028694324561565136, "loss": 2.8197, "step": 499 }, { "epoch": 0.14271792916284784, "grad_norm": 2.296875, "learning_rate": 0.00028688774136793085, "loss": 2.8208, "step": 500 }, { "epoch": 0.14271792916284784, "eval_loss": 2.6708414554595947, "eval_runtime": 6008.9725, "eval_samples_per_second": 10.698, "eval_steps_per_second": 10.698, "step": 500 }, { "epoch": 0.14300336502117356, "grad_norm": 2.34375, "learning_rate": 0.00028683212478753663, "loss": 2.8263, "step": 501 }, { "epoch": 0.14328880087949925, "grad_norm": 2.046875, "learning_rate": 0.00028677639592010874, "loss": 2.8395, "step": 502 }, { "epoch": 0.14357423673782493, "grad_norm": 2.828125, "learning_rate": 0.00028672055481137937, "loss": 2.815, "step": 503 }, { "epoch": 0.14385967259615062, "grad_norm": 2.53125, "learning_rate": 0.0002866646015071728, "loss": 2.8157, "step": 504 }, { "epoch": 0.1441451084544763, "grad_norm": 2.421875, "learning_rate": 0.0002866085360534053, "loss": 2.8449, "step": 505 }, { "epoch": 0.14443054431280203, "grad_norm": 2.203125, "learning_rate": 0.00028655235849608533, "loss": 2.7893, "step": 506 }, { "epoch": 0.14471598017112772, "grad_norm": 2.359375, "learning_rate": 0.00028649606888131327, "loss": 2.8099, "step": 507 }, { "epoch": 0.1450014160294534, "grad_norm": 1.9453125, "learning_rate": 0.00028643966725528134, "loss": 2.8032, "step": 508 }, { "epoch": 0.1452868518877791, "grad_norm": 2.921875, "learning_rate": 0.0002863831536642739, "loss": 2.8453, "step": 509 }, { "epoch": 0.1455722877461048, "grad_norm": 2.59375, "learning_rate": 0.0002863265281546669, "loss": 2.7995, "step": 510 }, { "epoch": 0.1458577236044305, "grad_norm": 2.1875, "learning_rate": 0.0002862697907729285, "loss": 2.8297, "step": 511 }, { "epoch": 0.1461431594627562, "grad_norm": 2.015625, "learning_rate": 0.00028621294156561843, "loss": 2.7948, "step": 512 }, { "epoch": 0.14642859532108188, "grad_norm": 2.5, "learning_rate": 0.0002861559805793881, "loss": 2.8182, "step": 513 }, { "epoch": 0.1467140311794076, "grad_norm": 2.203125, "learning_rate": 0.0002860989078609809, "loss": 2.8126, "step": 514 }, { "epoch": 0.14699946703773328, "grad_norm": 2.5625, "learning_rate": 0.00028604172345723174, "loss": 2.8018, "step": 515 }, { "epoch": 0.14728490289605897, "grad_norm": 2.421875, "learning_rate": 0.00028598442741506724, "loss": 2.8455, "step": 516 }, { "epoch": 0.14757033875438466, "grad_norm": 2.671875, "learning_rate": 0.0002859270197815056, "loss": 2.82, "step": 517 }, { "epoch": 0.14785577461271038, "grad_norm": 2.375, "learning_rate": 0.0002858695006036566, "loss": 2.8428, "step": 518 }, { "epoch": 0.14814121047103607, "grad_norm": 2.828125, "learning_rate": 0.0002858118699287216, "loss": 2.8128, "step": 519 }, { "epoch": 0.14842664632936176, "grad_norm": 2.53125, "learning_rate": 0.00028575412780399345, "loss": 2.8563, "step": 520 }, { "epoch": 0.14871208218768744, "grad_norm": 3.28125, "learning_rate": 0.00028569627427685627, "loss": 2.8428, "step": 521 }, { "epoch": 0.14899751804601316, "grad_norm": 2.5625, "learning_rate": 0.000285638309394786, "loss": 2.8274, "step": 522 }, { "epoch": 0.14928295390433885, "grad_norm": 3.84375, "learning_rate": 0.0002855802332053496, "loss": 2.8169, "step": 523 }, { "epoch": 0.14956838976266454, "grad_norm": 3.453125, "learning_rate": 0.00028552204575620543, "loss": 2.828, "step": 524 }, { "epoch": 0.14985382562099023, "grad_norm": 2.53125, "learning_rate": 0.0002854637470951033, "loss": 2.8265, "step": 525 }, { "epoch": 0.15013926147931594, "grad_norm": 2.484375, "learning_rate": 0.00028540533726988414, "loss": 2.853, "step": 526 }, { "epoch": 0.15042469733764163, "grad_norm": 2.328125, "learning_rate": 0.00028534681632848025, "loss": 2.8193, "step": 527 }, { "epoch": 0.15071013319596732, "grad_norm": 2.015625, "learning_rate": 0.0002852881843189149, "loss": 2.8112, "step": 528 }, { "epoch": 0.150995569054293, "grad_norm": 2.71875, "learning_rate": 0.0002852294412893027, "loss": 2.8376, "step": 529 }, { "epoch": 0.15128100491261873, "grad_norm": 2.421875, "learning_rate": 0.00028517058728784933, "loss": 2.8126, "step": 530 }, { "epoch": 0.15156644077094442, "grad_norm": 2.5625, "learning_rate": 0.0002851116223628514, "loss": 2.8375, "step": 531 }, { "epoch": 0.1518518766292701, "grad_norm": 2.40625, "learning_rate": 0.00028505254656269673, "loss": 2.8186, "step": 532 }, { "epoch": 0.1521373124875958, "grad_norm": 2.3125, "learning_rate": 0.00028499335993586403, "loss": 2.8437, "step": 533 }, { "epoch": 0.1524227483459215, "grad_norm": 1.96875, "learning_rate": 0.0002849340625309229, "loss": 2.7927, "step": 534 }, { "epoch": 0.1527081842042472, "grad_norm": 2.578125, "learning_rate": 0.000284874654396534, "loss": 2.8123, "step": 535 }, { "epoch": 0.1529936200625729, "grad_norm": 2.171875, "learning_rate": 0.0002848151355814487, "loss": 2.8459, "step": 536 }, { "epoch": 0.15327905592089858, "grad_norm": 2.953125, "learning_rate": 0.0002847555061345093, "loss": 2.8225, "step": 537 }, { "epoch": 0.1535644917792243, "grad_norm": 2.84375, "learning_rate": 0.0002846957661046488, "loss": 2.8028, "step": 538 }, { "epoch": 0.15384992763754998, "grad_norm": 2.03125, "learning_rate": 0.0002846359155408911, "loss": 2.8167, "step": 539 }, { "epoch": 0.15413536349587567, "grad_norm": 1.8984375, "learning_rate": 0.0002845759544923507, "loss": 2.83, "step": 540 }, { "epoch": 0.15442079935420136, "grad_norm": 2.578125, "learning_rate": 0.00028451588300823266, "loss": 2.8233, "step": 541 }, { "epoch": 0.15470623521252708, "grad_norm": 2.03125, "learning_rate": 0.0002844557011378328, "loss": 2.8076, "step": 542 }, { "epoch": 0.15499167107085277, "grad_norm": 2.734375, "learning_rate": 0.00028439540893053766, "loss": 2.8473, "step": 543 }, { "epoch": 0.15527710692917845, "grad_norm": 2.5625, "learning_rate": 0.000284335006435824, "loss": 2.8175, "step": 544 }, { "epoch": 0.15556254278750414, "grad_norm": 2.28125, "learning_rate": 0.00028427449370325937, "loss": 2.8237, "step": 545 }, { "epoch": 0.15584797864582986, "grad_norm": 1.9921875, "learning_rate": 0.0002842138707825015, "loss": 2.8176, "step": 546 }, { "epoch": 0.15613341450415555, "grad_norm": 2.421875, "learning_rate": 0.0002841531377232989, "loss": 2.8295, "step": 547 }, { "epoch": 0.15641885036248124, "grad_norm": 1.9453125, "learning_rate": 0.0002840922945754901, "loss": 2.8035, "step": 548 }, { "epoch": 0.15670428622080693, "grad_norm": 2.921875, "learning_rate": 0.00028403134138900427, "loss": 2.8217, "step": 549 }, { "epoch": 0.15698972207913264, "grad_norm": 2.546875, "learning_rate": 0.0002839702782138607, "loss": 2.8093, "step": 550 }, { "epoch": 0.15727515793745833, "grad_norm": 2.296875, "learning_rate": 0.00028390910510016896, "loss": 2.8026, "step": 551 }, { "epoch": 0.15756059379578402, "grad_norm": 2.34375, "learning_rate": 0.00028384782209812893, "loss": 2.8124, "step": 552 }, { "epoch": 0.1578460296541097, "grad_norm": 1.84375, "learning_rate": 0.0002837864292580305, "loss": 2.8342, "step": 553 }, { "epoch": 0.15813146551243543, "grad_norm": 1.75, "learning_rate": 0.00028372492663025393, "loss": 2.7897, "step": 554 }, { "epoch": 0.15841690137076112, "grad_norm": 1.703125, "learning_rate": 0.0002836633142652693, "loss": 2.8149, "step": 555 }, { "epoch": 0.1587023372290868, "grad_norm": 1.390625, "learning_rate": 0.00028360159221363704, "loss": 2.8298, "step": 556 }, { "epoch": 0.1589877730874125, "grad_norm": 2.453125, "learning_rate": 0.00028353976052600727, "loss": 2.8108, "step": 557 }, { "epoch": 0.1592732089457382, "grad_norm": 1.8125, "learning_rate": 0.0002834778192531204, "loss": 2.7943, "step": 558 }, { "epoch": 0.1595586448040639, "grad_norm": 2.890625, "learning_rate": 0.00028341576844580647, "loss": 2.8394, "step": 559 }, { "epoch": 0.1598440806623896, "grad_norm": 2.796875, "learning_rate": 0.00028335360815498565, "loss": 2.8056, "step": 560 }, { "epoch": 0.16012951652071528, "grad_norm": 1.8515625, "learning_rate": 0.00028329133843166786, "loss": 2.8123, "step": 561 }, { "epoch": 0.160414952379041, "grad_norm": 2.515625, "learning_rate": 0.0002832289593269527, "loss": 2.8239, "step": 562 }, { "epoch": 0.16070038823736668, "grad_norm": 1.8203125, "learning_rate": 0.00028316647089202975, "loss": 2.8298, "step": 563 }, { "epoch": 0.16098582409569237, "grad_norm": 2.875, "learning_rate": 0.0002831038731781782, "loss": 2.839, "step": 564 }, { "epoch": 0.16127125995401806, "grad_norm": 2.65625, "learning_rate": 0.00028304116623676685, "loss": 2.8498, "step": 565 }, { "epoch": 0.16155669581234378, "grad_norm": 2.21875, "learning_rate": 0.0002829783501192542, "loss": 2.8228, "step": 566 }, { "epoch": 0.16184213167066946, "grad_norm": 2.109375, "learning_rate": 0.0002829154248771885, "loss": 2.8171, "step": 567 }, { "epoch": 0.16212756752899515, "grad_norm": 1.9453125, "learning_rate": 0.00028285239056220724, "loss": 2.7826, "step": 568 }, { "epoch": 0.16241300338732084, "grad_norm": 1.5078125, "learning_rate": 0.0002827892472260376, "loss": 2.8087, "step": 569 }, { "epoch": 0.16269843924564656, "grad_norm": 2.046875, "learning_rate": 0.00028272599492049625, "loss": 2.7997, "step": 570 }, { "epoch": 0.16298387510397225, "grad_norm": 1.609375, "learning_rate": 0.00028266263369748916, "loss": 2.8093, "step": 571 }, { "epoch": 0.16326931096229794, "grad_norm": 1.765625, "learning_rate": 0.0002825991636090118, "loss": 2.7765, "step": 572 }, { "epoch": 0.16355474682062363, "grad_norm": 1.3984375, "learning_rate": 0.0002825355847071489, "loss": 2.8033, "step": 573 }, { "epoch": 0.16384018267894934, "grad_norm": 50.75, "learning_rate": 0.00028247189704407456, "loss": 2.8378, "step": 574 }, { "epoch": 0.16412561853727503, "grad_norm": 4.03125, "learning_rate": 0.000282408100672052, "loss": 2.8366, "step": 575 }, { "epoch": 0.16441105439560072, "grad_norm": 2.625, "learning_rate": 0.0002823441956434338, "loss": 2.8565, "step": 576 }, { "epoch": 0.1646964902539264, "grad_norm": 3.15625, "learning_rate": 0.0002822801820106617, "loss": 2.8216, "step": 577 }, { "epoch": 0.1649819261122521, "grad_norm": 2.84375, "learning_rate": 0.0002822160598262663, "loss": 2.8249, "step": 578 }, { "epoch": 0.16526736197057781, "grad_norm": 2.25, "learning_rate": 0.00028215182914286766, "loss": 2.8343, "step": 579 }, { "epoch": 0.1655527978289035, "grad_norm": 2.53125, "learning_rate": 0.0002820874900131746, "loss": 2.8027, "step": 580 }, { "epoch": 0.1658382336872292, "grad_norm": 2.21875, "learning_rate": 0.00028202304248998506, "loss": 2.8204, "step": 581 }, { "epoch": 0.16612366954555488, "grad_norm": 2.078125, "learning_rate": 0.0002819584866261859, "loss": 2.8122, "step": 582 }, { "epoch": 0.1664091054038806, "grad_norm": 1.828125, "learning_rate": 0.0002818938224747529, "loss": 2.816, "step": 583 }, { "epoch": 0.16669454126220629, "grad_norm": 1.5390625, "learning_rate": 0.0002818290500887506, "loss": 2.8286, "step": 584 }, { "epoch": 0.16697997712053197, "grad_norm": 2.0625, "learning_rate": 0.0002817641695213327, "loss": 2.8046, "step": 585 }, { "epoch": 0.16726541297885766, "grad_norm": 1.3359375, "learning_rate": 0.00028169918082574105, "loss": 2.8249, "step": 586 }, { "epoch": 0.16755084883718338, "grad_norm": 1.9921875, "learning_rate": 0.0002816340840553069, "loss": 2.8051, "step": 587 }, { "epoch": 0.16783628469550907, "grad_norm": 1.4140625, "learning_rate": 0.00028156887926344975, "loss": 2.8328, "step": 588 }, { "epoch": 0.16812172055383476, "grad_norm": 2.1875, "learning_rate": 0.00028150356650367796, "loss": 2.8087, "step": 589 }, { "epoch": 0.16840715641216045, "grad_norm": 1.7421875, "learning_rate": 0.00028143814582958827, "loss": 2.7976, "step": 590 }, { "epoch": 0.16869259227048616, "grad_norm": 2.671875, "learning_rate": 0.0002813726172948664, "loss": 2.8238, "step": 591 }, { "epoch": 0.16897802812881185, "grad_norm": 2.0625, "learning_rate": 0.000281306980953286, "loss": 2.8243, "step": 592 }, { "epoch": 0.16926346398713754, "grad_norm": 2.734375, "learning_rate": 0.0002812412368587097, "loss": 2.8078, "step": 593 }, { "epoch": 0.16954889984546323, "grad_norm": 2.328125, "learning_rate": 0.0002811753850650883, "loss": 2.7899, "step": 594 }, { "epoch": 0.16983433570378895, "grad_norm": 3.09375, "learning_rate": 0.000281109425626461, "loss": 2.8176, "step": 595 }, { "epoch": 0.17011977156211464, "grad_norm": 3.015625, "learning_rate": 0.00028104335859695543, "loss": 2.8235, "step": 596 }, { "epoch": 0.17040520742044032, "grad_norm": 2.125, "learning_rate": 0.0002809771840307873, "loss": 2.7986, "step": 597 }, { "epoch": 0.170690643278766, "grad_norm": 1.8984375, "learning_rate": 0.0002809109019822609, "loss": 2.7848, "step": 598 }, { "epoch": 0.17097607913709173, "grad_norm": 2.203125, "learning_rate": 0.00028084451250576844, "loss": 2.7914, "step": 599 }, { "epoch": 0.17126151499541742, "grad_norm": 1.625, "learning_rate": 0.00028077801565579033, "loss": 2.8036, "step": 600 }, { "epoch": 0.1715469508537431, "grad_norm": 2.84375, "learning_rate": 0.0002807114114868953, "loss": 2.8006, "step": 601 }, { "epoch": 0.1718323867120688, "grad_norm": 2.40625, "learning_rate": 0.0002806447000537398, "loss": 2.7898, "step": 602 }, { "epoch": 0.1721178225703945, "grad_norm": 2.78125, "learning_rate": 0.00028057788141106865, "loss": 2.7905, "step": 603 }, { "epoch": 0.1724032584287202, "grad_norm": 2.5625, "learning_rate": 0.0002805109556137144, "loss": 2.8129, "step": 604 }, { "epoch": 0.1726886942870459, "grad_norm": 2.375, "learning_rate": 0.0002804439227165977, "loss": 2.8151, "step": 605 }, { "epoch": 0.17297413014537158, "grad_norm": 2.140625, "learning_rate": 0.00028037678277472697, "loss": 2.7888, "step": 606 }, { "epoch": 0.1732595660036973, "grad_norm": 2.515625, "learning_rate": 0.0002803095358431985, "loss": 2.7996, "step": 607 }, { "epoch": 0.17354500186202299, "grad_norm": 2.234375, "learning_rate": 0.00028024218197719643, "loss": 2.7932, "step": 608 }, { "epoch": 0.17383043772034867, "grad_norm": 2.609375, "learning_rate": 0.0002801747212319926, "loss": 2.7972, "step": 609 }, { "epoch": 0.17411587357867436, "grad_norm": 2.375, "learning_rate": 0.0002801071536629466, "loss": 2.8141, "step": 610 }, { "epoch": 0.17440130943700008, "grad_norm": 2.5, "learning_rate": 0.0002800394793255056, "loss": 2.8014, "step": 611 }, { "epoch": 0.17468674529532577, "grad_norm": 2.40625, "learning_rate": 0.00027997169827520454, "loss": 2.8036, "step": 612 }, { "epoch": 0.17497218115365146, "grad_norm": 2.359375, "learning_rate": 0.0002799038105676658, "loss": 2.8235, "step": 613 }, { "epoch": 0.17525761701197715, "grad_norm": 2.109375, "learning_rate": 0.00027983581625859927, "loss": 2.7849, "step": 614 }, { "epoch": 0.17554305287030286, "grad_norm": 2.40625, "learning_rate": 0.0002797677154038024, "loss": 2.7964, "step": 615 }, { "epoch": 0.17582848872862855, "grad_norm": 2.15625, "learning_rate": 0.00027969950805916, "loss": 2.8027, "step": 616 }, { "epoch": 0.17611392458695424, "grad_norm": 2.5, "learning_rate": 0.0002796311942806444, "loss": 2.783, "step": 617 }, { "epoch": 0.17639936044527993, "grad_norm": 2.25, "learning_rate": 0.00027956277412431507, "loss": 2.7981, "step": 618 }, { "epoch": 0.17668479630360565, "grad_norm": 2.46875, "learning_rate": 0.00027949424764631896, "loss": 2.8145, "step": 619 }, { "epoch": 0.17697023216193133, "grad_norm": 2.265625, "learning_rate": 0.0002794256149028902, "loss": 2.83, "step": 620 }, { "epoch": 0.17725566802025702, "grad_norm": 2.375, "learning_rate": 0.00027935687595035015, "loss": 2.811, "step": 621 }, { "epoch": 0.1775411038785827, "grad_norm": 2.09375, "learning_rate": 0.00027928803084510716, "loss": 2.8016, "step": 622 }, { "epoch": 0.17782653973690843, "grad_norm": 2.421875, "learning_rate": 0.000279219079643657, "loss": 2.7996, "step": 623 }, { "epoch": 0.17811197559523412, "grad_norm": 2.203125, "learning_rate": 0.0002791500224025822, "loss": 2.817, "step": 624 }, { "epoch": 0.1783974114535598, "grad_norm": 2.40625, "learning_rate": 0.00027908085917855243, "loss": 2.8096, "step": 625 }, { "epoch": 0.1786828473118855, "grad_norm": 2.09375, "learning_rate": 0.0002790115900283245, "loss": 2.7852, "step": 626 }, { "epoch": 0.1789682831702112, "grad_norm": 2.28125, "learning_rate": 0.00027894221500874184, "loss": 2.8088, "step": 627 }, { "epoch": 0.1792537190285369, "grad_norm": 2.046875, "learning_rate": 0.0002788727341767349, "loss": 2.767, "step": 628 }, { "epoch": 0.1795391548868626, "grad_norm": 2.4375, "learning_rate": 0.0002788031475893211, "loss": 2.7955, "step": 629 }, { "epoch": 0.17982459074518828, "grad_norm": 2.125, "learning_rate": 0.00027873345530360436, "loss": 2.8143, "step": 630 }, { "epoch": 0.180110026603514, "grad_norm": 2.625, "learning_rate": 0.00027866365737677564, "loss": 2.777, "step": 631 }, { "epoch": 0.18039546246183968, "grad_norm": 2.234375, "learning_rate": 0.00027859375386611227, "loss": 2.8, "step": 632 }, { "epoch": 0.18068089832016537, "grad_norm": 2.65625, "learning_rate": 0.0002785237448289786, "loss": 2.7796, "step": 633 }, { "epoch": 0.18096633417849106, "grad_norm": 2.421875, "learning_rate": 0.00027845363032282514, "loss": 2.8042, "step": 634 }, { "epoch": 0.18125177003681678, "grad_norm": 2.171875, "learning_rate": 0.0002783834104051893, "loss": 2.8206, "step": 635 }, { "epoch": 0.18153720589514247, "grad_norm": 2.171875, "learning_rate": 0.00027831308513369494, "loss": 2.812, "step": 636 }, { "epoch": 0.18182264175346816, "grad_norm": 1.953125, "learning_rate": 0.00027824265456605224, "loss": 2.7804, "step": 637 }, { "epoch": 0.18210807761179384, "grad_norm": 1.859375, "learning_rate": 0.00027817211876005786, "loss": 2.7941, "step": 638 }, { "epoch": 0.18239351347011956, "grad_norm": 1.734375, "learning_rate": 0.0002781014777735948, "loss": 2.7842, "step": 639 }, { "epoch": 0.18267894932844525, "grad_norm": 1.671875, "learning_rate": 0.00027803073166463244, "loss": 2.7955, "step": 640 }, { "epoch": 0.18296438518677094, "grad_norm": 1.7578125, "learning_rate": 0.00027795988049122625, "loss": 2.7597, "step": 641 }, { "epoch": 0.18324982104509663, "grad_norm": 1.4453125, "learning_rate": 0.0002778889243115183, "loss": 2.811, "step": 642 }, { "epoch": 0.18353525690342234, "grad_norm": 1.7734375, "learning_rate": 0.00027781786318373627, "loss": 2.7948, "step": 643 }, { "epoch": 0.18382069276174803, "grad_norm": 1.4296875, "learning_rate": 0.0002777466971661945, "loss": 2.7811, "step": 644 }, { "epoch": 0.18410612862007372, "grad_norm": 2.0625, "learning_rate": 0.00027767542631729306, "loss": 2.7838, "step": 645 }, { "epoch": 0.1843915644783994, "grad_norm": 1.65625, "learning_rate": 0.0002776040506955182, "loss": 2.7958, "step": 646 }, { "epoch": 0.18467700033672513, "grad_norm": 2.1875, "learning_rate": 0.0002775325703594421, "loss": 2.7798, "step": 647 }, { "epoch": 0.18496243619505082, "grad_norm": 1.8984375, "learning_rate": 0.0002774609853677229, "loss": 2.7891, "step": 648 }, { "epoch": 0.1852478720533765, "grad_norm": 2.25, "learning_rate": 0.0002773892957791045, "loss": 2.8067, "step": 649 }, { "epoch": 0.1855333079117022, "grad_norm": 1.9140625, "learning_rate": 0.0002773175016524169, "loss": 2.7842, "step": 650 }, { "epoch": 0.18581874377002788, "grad_norm": 2.265625, "learning_rate": 0.00027724560304657553, "loss": 2.7706, "step": 651 }, { "epoch": 0.1861041796283536, "grad_norm": 2.03125, "learning_rate": 0.0002771736000205819, "loss": 2.7912, "step": 652 }, { "epoch": 0.1863896154866793, "grad_norm": 2.40625, "learning_rate": 0.000277101492633523, "loss": 2.7859, "step": 653 }, { "epoch": 0.18667505134500498, "grad_norm": 2.140625, "learning_rate": 0.0002770292809445715, "loss": 2.7637, "step": 654 }, { "epoch": 0.18696048720333067, "grad_norm": 2.359375, "learning_rate": 0.0002769569650129857, "loss": 2.7884, "step": 655 }, { "epoch": 0.18724592306165638, "grad_norm": 2.234375, "learning_rate": 0.00027688454489810946, "loss": 2.7858, "step": 656 }, { "epoch": 0.18753135891998207, "grad_norm": 1.9921875, "learning_rate": 0.00027681202065937203, "loss": 2.7677, "step": 657 }, { "epoch": 0.18781679477830776, "grad_norm": 1.796875, "learning_rate": 0.00027673939235628827, "loss": 2.7883, "step": 658 }, { "epoch": 0.18810223063663345, "grad_norm": 2.21875, "learning_rate": 0.00027666666004845823, "loss": 2.7624, "step": 659 }, { "epoch": 0.18838766649495917, "grad_norm": 1.9609375, "learning_rate": 0.0002765938237955674, "loss": 2.8089, "step": 660 }, { "epoch": 0.18867310235328486, "grad_norm": 2.328125, "learning_rate": 0.0002765208836573868, "loss": 2.7795, "step": 661 }, { "epoch": 0.18895853821161054, "grad_norm": 2.140625, "learning_rate": 0.0002764478396937722, "loss": 2.7722, "step": 662 }, { "epoch": 0.18924397406993623, "grad_norm": 2.171875, "learning_rate": 0.00027637469196466506, "loss": 2.7653, "step": 663 }, { "epoch": 0.18952940992826195, "grad_norm": 1.984375, "learning_rate": 0.00027630144053009174, "loss": 2.7717, "step": 664 }, { "epoch": 0.18981484578658764, "grad_norm": 2.15625, "learning_rate": 0.0002762280854501638, "loss": 2.762, "step": 665 }, { "epoch": 0.19010028164491333, "grad_norm": 2.03125, "learning_rate": 0.00027615462678507775, "loss": 2.7989, "step": 666 }, { "epoch": 0.19038571750323902, "grad_norm": 2.203125, "learning_rate": 0.00027608106459511513, "loss": 2.7851, "step": 667 }, { "epoch": 0.19067115336156473, "grad_norm": 2.15625, "learning_rate": 0.0002760073989406425, "loss": 2.7428, "step": 668 }, { "epoch": 0.19095658921989042, "grad_norm": 1.9921875, "learning_rate": 0.00027593362988211133, "loss": 2.7699, "step": 669 }, { "epoch": 0.1912420250782161, "grad_norm": 1.875, "learning_rate": 0.00027585975748005783, "loss": 2.7797, "step": 670 }, { "epoch": 0.1915274609365418, "grad_norm": 2.109375, "learning_rate": 0.0002757857817951032, "loss": 2.7656, "step": 671 }, { "epoch": 0.19181289679486752, "grad_norm": 2.078125, "learning_rate": 0.00027571170288795323, "loss": 2.7674, "step": 672 }, { "epoch": 0.1920983326531932, "grad_norm": 1.9765625, "learning_rate": 0.0002756375208193985, "loss": 2.7576, "step": 673 }, { "epoch": 0.1923837685115189, "grad_norm": 1.8984375, "learning_rate": 0.0002755632356503141, "loss": 2.7844, "step": 674 }, { "epoch": 0.19266920436984458, "grad_norm": 2.03125, "learning_rate": 0.00027548884744166, "loss": 2.7817, "step": 675 }, { "epoch": 0.1929546402281703, "grad_norm": 1.8671875, "learning_rate": 0.0002754143562544805, "loss": 2.7589, "step": 676 }, { "epoch": 0.193240076086496, "grad_norm": 2.125, "learning_rate": 0.0002753397621499045, "loss": 2.7841, "step": 677 }, { "epoch": 0.19352551194482168, "grad_norm": 1.9453125, "learning_rate": 0.00027526506518914533, "loss": 2.7945, "step": 678 }, { "epoch": 0.19381094780314737, "grad_norm": 2.140625, "learning_rate": 0.00027519026543350067, "loss": 2.7896, "step": 679 }, { "epoch": 0.19409638366147308, "grad_norm": 1.9609375, "learning_rate": 0.0002751153629443528, "loss": 2.7839, "step": 680 }, { "epoch": 0.19438181951979877, "grad_norm": 2.078125, "learning_rate": 0.0002750403577831679, "loss": 2.7684, "step": 681 }, { "epoch": 0.19466725537812446, "grad_norm": 1.828125, "learning_rate": 0.00027496525001149676, "loss": 2.7598, "step": 682 }, { "epoch": 0.19495269123645015, "grad_norm": 2.015625, "learning_rate": 0.00027489003969097416, "loss": 2.7652, "step": 683 }, { "epoch": 0.19523812709477587, "grad_norm": 1.9765625, "learning_rate": 0.00027481472688331923, "loss": 2.7909, "step": 684 }, { "epoch": 0.19552356295310155, "grad_norm": 1.984375, "learning_rate": 0.00027473931165033496, "loss": 2.7535, "step": 685 }, { "epoch": 0.19580899881142724, "grad_norm": 1.8671875, "learning_rate": 0.00027466379405390864, "loss": 2.763, "step": 686 }, { "epoch": 0.19609443466975293, "grad_norm": 1.96875, "learning_rate": 0.0002745881741560113, "loss": 2.8034, "step": 687 }, { "epoch": 0.19637987052807865, "grad_norm": 1.671875, "learning_rate": 0.0002745124520186981, "loss": 2.7538, "step": 688 }, { "epoch": 0.19666530638640434, "grad_norm": 2.109375, "learning_rate": 0.0002744366277041082, "loss": 2.7494, "step": 689 }, { "epoch": 0.19695074224473003, "grad_norm": 1.8125, "learning_rate": 0.0002743607012744643, "loss": 2.7578, "step": 690 }, { "epoch": 0.19723617810305571, "grad_norm": 1.984375, "learning_rate": 0.00027428467279207316, "loss": 2.7845, "step": 691 }, { "epoch": 0.19752161396138143, "grad_norm": 1.7890625, "learning_rate": 0.00027420854231932515, "loss": 2.7833, "step": 692 }, { "epoch": 0.19780704981970712, "grad_norm": 1.8671875, "learning_rate": 0.0002741323099186944, "loss": 2.7835, "step": 693 }, { "epoch": 0.1980924856780328, "grad_norm": 1.6640625, "learning_rate": 0.00027405597565273866, "loss": 2.7663, "step": 694 }, { "epoch": 0.1983779215363585, "grad_norm": 2.0, "learning_rate": 0.00027397953958409923, "loss": 2.7737, "step": 695 }, { "epoch": 0.19866335739468421, "grad_norm": 1.71875, "learning_rate": 0.00027390300177550106, "loss": 2.7501, "step": 696 }, { "epoch": 0.1989487932530099, "grad_norm": 1.9296875, "learning_rate": 0.0002738263622897525, "loss": 2.7862, "step": 697 }, { "epoch": 0.1992342291113356, "grad_norm": 1.71875, "learning_rate": 0.0002737496211897453, "loss": 2.7629, "step": 698 }, { "epoch": 0.19951966496966128, "grad_norm": 1.8515625, "learning_rate": 0.0002736727785384548, "loss": 2.7394, "step": 699 }, { "epoch": 0.199805100827987, "grad_norm": 1.7421875, "learning_rate": 0.00027359583439893944, "loss": 2.7867, "step": 700 }, { "epoch": 0.2000905366863127, "grad_norm": 1.765625, "learning_rate": 0.00027351878883434105, "loss": 2.7564, "step": 701 }, { "epoch": 0.20037597254463838, "grad_norm": 1.53125, "learning_rate": 0.0002734416419078847, "loss": 2.7623, "step": 702 }, { "epoch": 0.20066140840296406, "grad_norm": 2.0625, "learning_rate": 0.00027336439368287857, "loss": 2.7678, "step": 703 }, { "epoch": 0.20094684426128978, "grad_norm": 1.9375, "learning_rate": 0.0002732870442227141, "loss": 2.7727, "step": 704 }, { "epoch": 0.20123228011961547, "grad_norm": 1.828125, "learning_rate": 0.00027320959359086565, "loss": 2.7808, "step": 705 }, { "epoch": 0.20151771597794116, "grad_norm": 1.703125, "learning_rate": 0.0002731320418508907, "loss": 2.7509, "step": 706 }, { "epoch": 0.20180315183626685, "grad_norm": 1.8125, "learning_rate": 0.0002730543890664297, "loss": 2.7839, "step": 707 }, { "epoch": 0.20208858769459256, "grad_norm": 1.546875, "learning_rate": 0.0002729766353012059, "loss": 2.7573, "step": 708 }, { "epoch": 0.20237402355291825, "grad_norm": 2.296875, "learning_rate": 0.0002728987806190257, "loss": 2.7872, "step": 709 }, { "epoch": 0.20265945941124394, "grad_norm": 1.9609375, "learning_rate": 0.00027282082508377795, "loss": 2.7727, "step": 710 }, { "epoch": 0.20294489526956963, "grad_norm": 2.203125, "learning_rate": 0.0002727427687594345, "loss": 2.7632, "step": 711 }, { "epoch": 0.20323033112789535, "grad_norm": 2.03125, "learning_rate": 0.00027266461171004985, "loss": 2.7631, "step": 712 }, { "epoch": 0.20351576698622104, "grad_norm": 2.046875, "learning_rate": 0.00027258635399976115, "loss": 2.768, "step": 713 }, { "epoch": 0.20380120284454672, "grad_norm": 1.7890625, "learning_rate": 0.00027250799569278816, "loss": 2.7666, "step": 714 }, { "epoch": 0.2040866387028724, "grad_norm": 2.25, "learning_rate": 0.00027242953685343327, "loss": 2.7794, "step": 715 }, { "epoch": 0.20437207456119813, "grad_norm": 1.890625, "learning_rate": 0.0002723509775460811, "loss": 2.7449, "step": 716 }, { "epoch": 0.20465751041952382, "grad_norm": 2.140625, "learning_rate": 0.00027227231783519913, "loss": 2.7529, "step": 717 }, { "epoch": 0.2049429462778495, "grad_norm": 1.8984375, "learning_rate": 0.0002721935577853368, "loss": 2.7785, "step": 718 }, { "epoch": 0.2052283821361752, "grad_norm": 1.9765625, "learning_rate": 0.00027211469746112624, "loss": 2.7653, "step": 719 }, { "epoch": 0.2055138179945009, "grad_norm": 1.734375, "learning_rate": 0.00027203573692728174, "loss": 2.7664, "step": 720 }, { "epoch": 0.2057992538528266, "grad_norm": 2.0625, "learning_rate": 0.0002719566762485997, "loss": 2.7677, "step": 721 }, { "epoch": 0.2060846897111523, "grad_norm": 1.7578125, "learning_rate": 0.0002718775154899589, "loss": 2.7667, "step": 722 }, { "epoch": 0.20637012556947798, "grad_norm": 2.078125, "learning_rate": 0.0002717982547163201, "loss": 2.7674, "step": 723 }, { "epoch": 0.20665556142780367, "grad_norm": 1.84375, "learning_rate": 0.0002717188939927262, "loss": 2.7747, "step": 724 }, { "epoch": 0.20694099728612939, "grad_norm": 1.984375, "learning_rate": 0.00027163943338430214, "loss": 2.7299, "step": 725 }, { "epoch": 0.20722643314445507, "grad_norm": 1.8359375, "learning_rate": 0.0002715598729562548, "loss": 2.7672, "step": 726 }, { "epoch": 0.20751186900278076, "grad_norm": 1.875, "learning_rate": 0.000271480212773873, "loss": 2.7847, "step": 727 }, { "epoch": 0.20779730486110645, "grad_norm": 1.609375, "learning_rate": 0.0002714004529025273, "loss": 2.7886, "step": 728 }, { "epoch": 0.20808274071943217, "grad_norm": 1.890625, "learning_rate": 0.00027132059340767025, "loss": 2.7586, "step": 729 }, { "epoch": 0.20836817657775786, "grad_norm": 1.5859375, "learning_rate": 0.00027124063435483603, "loss": 2.779, "step": 730 }, { "epoch": 0.20865361243608355, "grad_norm": 2.0625, "learning_rate": 0.0002711605758096406, "loss": 2.7593, "step": 731 }, { "epoch": 0.20893904829440924, "grad_norm": 1.7109375, "learning_rate": 0.0002710804178377814, "loss": 2.7684, "step": 732 }, { "epoch": 0.20922448415273495, "grad_norm": 2.03125, "learning_rate": 0.0002710001605050377, "loss": 2.7542, "step": 733 }, { "epoch": 0.20950992001106064, "grad_norm": 1.7578125, "learning_rate": 0.00027091980387727014, "loss": 2.7644, "step": 734 }, { "epoch": 0.20979535586938633, "grad_norm": 2.15625, "learning_rate": 0.00027083934802042084, "loss": 2.7772, "step": 735 }, { "epoch": 0.21008079172771202, "grad_norm": 1.8359375, "learning_rate": 0.0002707587930005136, "loss": 2.7419, "step": 736 }, { "epoch": 0.21036622758603774, "grad_norm": 2.09375, "learning_rate": 0.0002706781388836531, "loss": 2.7889, "step": 737 }, { "epoch": 0.21065166344436342, "grad_norm": 1.765625, "learning_rate": 0.00027059738573602583, "loss": 2.768, "step": 738 }, { "epoch": 0.2109370993026891, "grad_norm": 2.25, "learning_rate": 0.00027051653362389935, "loss": 2.8016, "step": 739 }, { "epoch": 0.2112225351610148, "grad_norm": 1.8046875, "learning_rate": 0.0002704355826136224, "loss": 2.758, "step": 740 }, { "epoch": 0.21150797101934052, "grad_norm": 2.203125, "learning_rate": 0.0002703545327716249, "loss": 2.7658, "step": 741 }, { "epoch": 0.2117934068776662, "grad_norm": 1.859375, "learning_rate": 0.00027027338416441785, "loss": 2.7693, "step": 742 }, { "epoch": 0.2120788427359919, "grad_norm": 2.40625, "learning_rate": 0.0002701921368585934, "loss": 2.7948, "step": 743 }, { "epoch": 0.21236427859431758, "grad_norm": 2.0, "learning_rate": 0.0002701107909208246, "loss": 2.7832, "step": 744 }, { "epoch": 0.2126497144526433, "grad_norm": 2.25, "learning_rate": 0.00027002934641786545, "loss": 2.7851, "step": 745 }, { "epoch": 0.212935150310969, "grad_norm": 2.0625, "learning_rate": 0.00026994780341655093, "loss": 2.7461, "step": 746 }, { "epoch": 0.21322058616929468, "grad_norm": 2.015625, "learning_rate": 0.0002698661619837967, "loss": 2.7511, "step": 747 }, { "epoch": 0.21350602202762037, "grad_norm": 1.8125, "learning_rate": 0.0002697844221865993, "loss": 2.7562, "step": 748 }, { "epoch": 0.21379145788594608, "grad_norm": 2.078125, "learning_rate": 0.00026970258409203594, "loss": 2.729, "step": 749 }, { "epoch": 0.21407689374427177, "grad_norm": 1.7421875, "learning_rate": 0.00026962064776726445, "loss": 2.7467, "step": 750 }, { "epoch": 0.21407689374427177, "eval_loss": 2.6212494373321533, "eval_runtime": 5936.0633, "eval_samples_per_second": 10.83, "eval_steps_per_second": 10.83, "step": 750 }, { "epoch": 0.21436232960259746, "grad_norm": 2.0625, "learning_rate": 0.0002695386132795234, "loss": 2.7875, "step": 751 }, { "epoch": 0.21464776546092315, "grad_norm": 1.8828125, "learning_rate": 0.0002694564806961319, "loss": 2.7879, "step": 752 }, { "epoch": 0.21493320131924887, "grad_norm": 1.9140625, "learning_rate": 0.00026937425008448937, "loss": 2.7634, "step": 753 }, { "epoch": 0.21521863717757456, "grad_norm": 1.65625, "learning_rate": 0.0002692919215120759, "loss": 2.7563, "step": 754 }, { "epoch": 0.21550407303590025, "grad_norm": 1.953125, "learning_rate": 0.0002692094950464519, "loss": 2.7836, "step": 755 }, { "epoch": 0.21578950889422593, "grad_norm": 1.59375, "learning_rate": 0.000269126970755258, "loss": 2.7366, "step": 756 }, { "epoch": 0.21607494475255165, "grad_norm": 1.828125, "learning_rate": 0.00026904434870621524, "loss": 2.7813, "step": 757 }, { "epoch": 0.21636038061087734, "grad_norm": 1.53125, "learning_rate": 0.00026896162896712476, "loss": 2.7718, "step": 758 }, { "epoch": 0.21664581646920303, "grad_norm": 2.0, "learning_rate": 0.00026887881160586813, "loss": 2.7536, "step": 759 }, { "epoch": 0.21693125232752872, "grad_norm": 1.6953125, "learning_rate": 0.0002687958966904067, "loss": 2.7619, "step": 760 }, { "epoch": 0.21721668818585443, "grad_norm": 1.84375, "learning_rate": 0.00026871288428878206, "loss": 2.7672, "step": 761 }, { "epoch": 0.21750212404418012, "grad_norm": 1.6328125, "learning_rate": 0.0002686297744691158, "loss": 2.7571, "step": 762 }, { "epoch": 0.2177875599025058, "grad_norm": 2.015625, "learning_rate": 0.0002685465672996093, "loss": 2.7652, "step": 763 }, { "epoch": 0.2180729957608315, "grad_norm": 1.6953125, "learning_rate": 0.000268463262848544, "loss": 2.7748, "step": 764 }, { "epoch": 0.21835843161915722, "grad_norm": 1.7109375, "learning_rate": 0.0002683798611842812, "loss": 2.7583, "step": 765 }, { "epoch": 0.2186438674774829, "grad_norm": 1.59375, "learning_rate": 0.0002682963623752617, "loss": 2.7586, "step": 766 }, { "epoch": 0.2189293033358086, "grad_norm": 1.828125, "learning_rate": 0.0002682127664900064, "loss": 2.7338, "step": 767 }, { "epoch": 0.21921473919413428, "grad_norm": 1.40625, "learning_rate": 0.0002681290735971156, "loss": 2.752, "step": 768 }, { "epoch": 0.21950017505246, "grad_norm": 1.796875, "learning_rate": 0.0002680452837652691, "loss": 2.7629, "step": 769 }, { "epoch": 0.2197856109107857, "grad_norm": 1.5625, "learning_rate": 0.0002679613970632267, "loss": 2.7652, "step": 770 }, { "epoch": 0.22007104676911138, "grad_norm": 2.109375, "learning_rate": 0.0002678774135598272, "loss": 2.7537, "step": 771 }, { "epoch": 0.22035648262743707, "grad_norm": 1.6875, "learning_rate": 0.00026779333332398923, "loss": 2.7141, "step": 772 }, { "epoch": 0.22064191848576278, "grad_norm": 1.90625, "learning_rate": 0.0002677091564247105, "loss": 2.757, "step": 773 }, { "epoch": 0.22092735434408847, "grad_norm": 1.6875, "learning_rate": 0.0002676248829310682, "loss": 2.7454, "step": 774 }, { "epoch": 0.22121279020241416, "grad_norm": 1.8671875, "learning_rate": 0.0002675405129122188, "loss": 2.7545, "step": 775 }, { "epoch": 0.22149822606073985, "grad_norm": 1.4765625, "learning_rate": 0.0002674560464373979, "loss": 2.7331, "step": 776 }, { "epoch": 0.22178366191906557, "grad_norm": 1.9609375, "learning_rate": 0.0002673714835759202, "loss": 2.7603, "step": 777 }, { "epoch": 0.22206909777739126, "grad_norm": 1.515625, "learning_rate": 0.00026728682439717974, "loss": 2.7551, "step": 778 }, { "epoch": 0.22235453363571694, "grad_norm": 2.234375, "learning_rate": 0.0002672020689706493, "loss": 2.7814, "step": 779 }, { "epoch": 0.22263996949404263, "grad_norm": 1.765625, "learning_rate": 0.00026711721736588103, "loss": 2.7604, "step": 780 }, { "epoch": 0.22292540535236835, "grad_norm": 2.0625, "learning_rate": 0.00026703226965250546, "loss": 2.7551, "step": 781 }, { "epoch": 0.22321084121069404, "grad_norm": 1.890625, "learning_rate": 0.00026694722590023246, "loss": 2.7357, "step": 782 }, { "epoch": 0.22349627706901973, "grad_norm": 1.875, "learning_rate": 0.00026686208617885055, "loss": 2.7532, "step": 783 }, { "epoch": 0.22378171292734542, "grad_norm": 1.6875, "learning_rate": 0.0002667768505582269, "loss": 2.7388, "step": 784 }, { "epoch": 0.22406714878567113, "grad_norm": 1.6484375, "learning_rate": 0.0002666915191083076, "loss": 2.7594, "step": 785 }, { "epoch": 0.22435258464399682, "grad_norm": 1.4296875, "learning_rate": 0.00026660609189911724, "loss": 2.7504, "step": 786 }, { "epoch": 0.2246380205023225, "grad_norm": 1.515625, "learning_rate": 0.00026652056900075885, "loss": 2.7631, "step": 787 }, { "epoch": 0.2249234563606482, "grad_norm": 1.2578125, "learning_rate": 0.0002664349504834143, "loss": 2.7534, "step": 788 }, { "epoch": 0.22520889221897392, "grad_norm": 1.6875, "learning_rate": 0.00026634923641734374, "loss": 2.7584, "step": 789 }, { "epoch": 0.2254943280772996, "grad_norm": 1.3125, "learning_rate": 0.00026626342687288576, "loss": 2.7519, "step": 790 }, { "epoch": 0.2257797639356253, "grad_norm": 2.0625, "learning_rate": 0.0002661775219204572, "loss": 2.7477, "step": 791 }, { "epoch": 0.22606519979395098, "grad_norm": 1.7578125, "learning_rate": 0.0002660915216305534, "loss": 2.7484, "step": 792 }, { "epoch": 0.22635063565227667, "grad_norm": 1.734375, "learning_rate": 0.0002660054260737478, "loss": 2.7718, "step": 793 }, { "epoch": 0.2266360715106024, "grad_norm": 1.6015625, "learning_rate": 0.000265919235320692, "loss": 2.7437, "step": 794 }, { "epoch": 0.22692150736892808, "grad_norm": 1.6953125, "learning_rate": 0.00026583294944211583, "loss": 2.7564, "step": 795 }, { "epoch": 0.22720694322725377, "grad_norm": 1.3046875, "learning_rate": 0.00026574656850882706, "loss": 2.7322, "step": 796 }, { "epoch": 0.22749237908557945, "grad_norm": 1.8828125, "learning_rate": 0.0002656600925917116, "loss": 2.7623, "step": 797 }, { "epoch": 0.22777781494390517, "grad_norm": 1.46875, "learning_rate": 0.00026557352176173317, "loss": 2.7294, "step": 798 }, { "epoch": 0.22806325080223086, "grad_norm": 2.1875, "learning_rate": 0.00026548685608993337, "loss": 2.7457, "step": 799 }, { "epoch": 0.22834868666055655, "grad_norm": 2.015625, "learning_rate": 0.0002654000956474318, "loss": 2.7512, "step": 800 }, { "epoch": 0.22863412251888224, "grad_norm": 1.6953125, "learning_rate": 0.0002653132405054257, "loss": 2.7251, "step": 801 }, { "epoch": 0.22891955837720795, "grad_norm": 1.59375, "learning_rate": 0.00026522629073519, "loss": 2.7645, "step": 802 }, { "epoch": 0.22920499423553364, "grad_norm": 1.6328125, "learning_rate": 0.00026513924640807733, "loss": 2.7856, "step": 803 }, { "epoch": 0.22949043009385933, "grad_norm": 1.3203125, "learning_rate": 0.000265052107595518, "loss": 2.7234, "step": 804 }, { "epoch": 0.22977586595218502, "grad_norm": 1.84375, "learning_rate": 0.00026496487436901964, "loss": 2.7626, "step": 805 }, { "epoch": 0.23006130181051074, "grad_norm": 1.578125, "learning_rate": 0.00026487754680016765, "loss": 2.7252, "step": 806 }, { "epoch": 0.23034673766883643, "grad_norm": 1.890625, "learning_rate": 0.0002647901249606245, "loss": 2.7371, "step": 807 }, { "epoch": 0.23063217352716212, "grad_norm": 1.7109375, "learning_rate": 0.00026470260892213034, "loss": 2.7533, "step": 808 }, { "epoch": 0.2309176093854878, "grad_norm": 1.734375, "learning_rate": 0.00026461499875650245, "loss": 2.7512, "step": 809 }, { "epoch": 0.23120304524381352, "grad_norm": 1.59375, "learning_rate": 0.0002645272945356354, "loss": 2.7423, "step": 810 }, { "epoch": 0.2314884811021392, "grad_norm": 1.890625, "learning_rate": 0.0002644394963315009, "loss": 2.7495, "step": 811 }, { "epoch": 0.2317739169604649, "grad_norm": 1.6484375, "learning_rate": 0.00026435160421614784, "loss": 2.7378, "step": 812 }, { "epoch": 0.2320593528187906, "grad_norm": 1.859375, "learning_rate": 0.0002642636182617022, "loss": 2.7887, "step": 813 }, { "epoch": 0.2323447886771163, "grad_norm": 1.5390625, "learning_rate": 0.0002641755385403669, "loss": 2.7452, "step": 814 }, { "epoch": 0.232630224535442, "grad_norm": 2.0625, "learning_rate": 0.0002640873651244217, "loss": 2.7407, "step": 815 }, { "epoch": 0.23291566039376768, "grad_norm": 1.671875, "learning_rate": 0.0002639990980862236, "loss": 2.7571, "step": 816 }, { "epoch": 0.23320109625209337, "grad_norm": 2.15625, "learning_rate": 0.00026391073749820607, "loss": 2.7219, "step": 817 }, { "epoch": 0.2334865321104191, "grad_norm": 1.953125, "learning_rate": 0.00026382228343287947, "loss": 2.7314, "step": 818 }, { "epoch": 0.23377196796874478, "grad_norm": 2.15625, "learning_rate": 0.0002637337359628309, "loss": 2.7363, "step": 819 }, { "epoch": 0.23405740382707046, "grad_norm": 1.953125, "learning_rate": 0.00026364509516072415, "loss": 2.7455, "step": 820 }, { "epoch": 0.23434283968539615, "grad_norm": 1.921875, "learning_rate": 0.00026355636109929946, "loss": 2.7301, "step": 821 }, { "epoch": 0.23462827554372187, "grad_norm": 1.7578125, "learning_rate": 0.0002634675338513738, "loss": 2.733, "step": 822 }, { "epoch": 0.23491371140204756, "grad_norm": 1.71875, "learning_rate": 0.00026337861348984024, "loss": 2.7564, "step": 823 }, { "epoch": 0.23519914726037325, "grad_norm": 1.4609375, "learning_rate": 0.00026328960008766884, "loss": 2.7489, "step": 824 }, { "epoch": 0.23548458311869894, "grad_norm": 1.9140625, "learning_rate": 0.0002632004937179055, "loss": 2.7493, "step": 825 }, { "epoch": 0.23577001897702465, "grad_norm": 1.6328125, "learning_rate": 0.00026311129445367255, "loss": 2.7289, "step": 826 }, { "epoch": 0.23605545483535034, "grad_norm": 2.15625, "learning_rate": 0.0002630220023681687, "loss": 2.7193, "step": 827 }, { "epoch": 0.23634089069367603, "grad_norm": 2.0, "learning_rate": 0.0002629326175346687, "loss": 2.738, "step": 828 }, { "epoch": 0.23662632655200172, "grad_norm": 1.921875, "learning_rate": 0.0002628431400265235, "loss": 2.7497, "step": 829 }, { "epoch": 0.23691176241032744, "grad_norm": 1.8203125, "learning_rate": 0.00026275356991715986, "loss": 2.7239, "step": 830 }, { "epoch": 0.23719719826865313, "grad_norm": 1.71875, "learning_rate": 0.0002626639072800809, "loss": 2.7372, "step": 831 }, { "epoch": 0.23748263412697881, "grad_norm": 1.4921875, "learning_rate": 0.00026257415218886536, "loss": 2.7284, "step": 832 }, { "epoch": 0.2377680699853045, "grad_norm": 2.046875, "learning_rate": 0.00026248430471716795, "loss": 2.7515, "step": 833 }, { "epoch": 0.23805350584363022, "grad_norm": 1.8984375, "learning_rate": 0.0002623943649387194, "loss": 2.7412, "step": 834 }, { "epoch": 0.2383389417019559, "grad_norm": 1.84375, "learning_rate": 0.0002623043329273257, "loss": 2.7339, "step": 835 }, { "epoch": 0.2386243775602816, "grad_norm": 1.6953125, "learning_rate": 0.0002622142087568691, "loss": 2.7482, "step": 836 }, { "epoch": 0.2389098134186073, "grad_norm": 1.7890625, "learning_rate": 0.00026212399250130706, "loss": 2.7411, "step": 837 }, { "epoch": 0.239195249276933, "grad_norm": 1.5234375, "learning_rate": 0.0002620336842346728, "loss": 2.7394, "step": 838 }, { "epoch": 0.2394806851352587, "grad_norm": 1.9375, "learning_rate": 0.0002619432840310749, "loss": 2.6938, "step": 839 }, { "epoch": 0.23976612099358438, "grad_norm": 1.7265625, "learning_rate": 0.00026185279196469757, "loss": 2.7298, "step": 840 }, { "epoch": 0.24005155685191007, "grad_norm": 1.8828125, "learning_rate": 0.00026176220810980035, "loss": 2.7237, "step": 841 }, { "epoch": 0.2403369927102358, "grad_norm": 1.7890625, "learning_rate": 0.00026167153254071795, "loss": 2.742, "step": 842 }, { "epoch": 0.24062242856856147, "grad_norm": 1.6015625, "learning_rate": 0.0002615807653318605, "loss": 2.7514, "step": 843 }, { "epoch": 0.24090786442688716, "grad_norm": 1.4453125, "learning_rate": 0.0002614899065577133, "loss": 2.7606, "step": 844 }, { "epoch": 0.24119330028521285, "grad_norm": 1.8125, "learning_rate": 0.0002613989562928369, "loss": 2.7474, "step": 845 }, { "epoch": 0.24147873614353857, "grad_norm": 1.53125, "learning_rate": 0.00026130791461186656, "loss": 2.7309, "step": 846 }, { "epoch": 0.24176417200186426, "grad_norm": 1.984375, "learning_rate": 0.000261216781589513, "loss": 2.726, "step": 847 }, { "epoch": 0.24204960786018995, "grad_norm": 1.703125, "learning_rate": 0.0002611255573005617, "loss": 2.7471, "step": 848 }, { "epoch": 0.24233504371851564, "grad_norm": 1.90625, "learning_rate": 0.00026103424181987293, "loss": 2.7328, "step": 849 }, { "epoch": 0.24262047957684135, "grad_norm": 1.7578125, "learning_rate": 0.00026094283522238204, "loss": 2.755, "step": 850 }, { "epoch": 0.24290591543516704, "grad_norm": 1.7890625, "learning_rate": 0.00026085133758309883, "loss": 2.7581, "step": 851 }, { "epoch": 0.24319135129349273, "grad_norm": 1.59375, "learning_rate": 0.00026075974897710815, "loss": 2.7312, "step": 852 }, { "epoch": 0.24347678715181842, "grad_norm": 1.9296875, "learning_rate": 0.0002606680694795693, "loss": 2.7274, "step": 853 }, { "epoch": 0.24376222301014414, "grad_norm": 1.5, "learning_rate": 0.0002605762991657163, "loss": 2.7208, "step": 854 }, { "epoch": 0.24404765886846982, "grad_norm": 2.296875, "learning_rate": 0.00026048443811085744, "loss": 2.7326, "step": 855 }, { "epoch": 0.2443330947267955, "grad_norm": 1.9765625, "learning_rate": 0.00026039248639037575, "loss": 2.7559, "step": 856 }, { "epoch": 0.2446185305851212, "grad_norm": 2.1875, "learning_rate": 0.00026030044407972854, "loss": 2.7389, "step": 857 }, { "epoch": 0.24490396644344692, "grad_norm": 2.03125, "learning_rate": 0.00026020831125444745, "loss": 2.7434, "step": 858 }, { "epoch": 0.2451894023017726, "grad_norm": 1.8828125, "learning_rate": 0.0002601160879901384, "loss": 2.745, "step": 859 }, { "epoch": 0.2454748381600983, "grad_norm": 1.6640625, "learning_rate": 0.0002600237743624816, "loss": 2.74, "step": 860 }, { "epoch": 0.24576027401842399, "grad_norm": 2.125, "learning_rate": 0.00025993137044723135, "loss": 2.736, "step": 861 }, { "epoch": 0.2460457098767497, "grad_norm": 1.7734375, "learning_rate": 0.0002598388763202161, "loss": 2.7447, "step": 862 }, { "epoch": 0.2463311457350754, "grad_norm": 2.1875, "learning_rate": 0.0002597462920573381, "loss": 2.7457, "step": 863 }, { "epoch": 0.24661658159340108, "grad_norm": 2.0, "learning_rate": 0.000259653617734574, "loss": 2.7256, "step": 864 }, { "epoch": 0.24690201745172677, "grad_norm": 1.953125, "learning_rate": 0.00025956085342797395, "loss": 2.7233, "step": 865 }, { "epoch": 0.24718745331005246, "grad_norm": 1.7265625, "learning_rate": 0.00025946799921366205, "loss": 2.7471, "step": 866 }, { "epoch": 0.24747288916837817, "grad_norm": 1.9296875, "learning_rate": 0.0002593750551678364, "loss": 2.7426, "step": 867 }, { "epoch": 0.24775832502670386, "grad_norm": 1.4921875, "learning_rate": 0.00025928202136676855, "loss": 2.6968, "step": 868 }, { "epoch": 0.24804376088502955, "grad_norm": 2.09375, "learning_rate": 0.0002591888978868038, "loss": 2.7192, "step": 869 }, { "epoch": 0.24832919674335524, "grad_norm": 1.828125, "learning_rate": 0.000259095684804361, "loss": 2.7436, "step": 870 }, { "epoch": 0.24861463260168096, "grad_norm": 1.9609375, "learning_rate": 0.0002590023821959326, "loss": 2.7627, "step": 871 }, { "epoch": 0.24890006846000665, "grad_norm": 1.7421875, "learning_rate": 0.00025890899013808455, "loss": 2.7603, "step": 872 }, { "epoch": 0.24918550431833233, "grad_norm": 1.6640625, "learning_rate": 0.0002588155087074561, "loss": 2.7315, "step": 873 }, { "epoch": 0.24947094017665802, "grad_norm": 1.515625, "learning_rate": 0.00025872193798075985, "loss": 2.7302, "step": 874 }, { "epoch": 0.24975637603498374, "grad_norm": 1.453125, "learning_rate": 0.0002586282780347818, "loss": 2.7236, "step": 875 }, { "epoch": 0.25004181189330943, "grad_norm": 1.25, "learning_rate": 0.00025853452894638093, "loss": 2.7152, "step": 876 }, { "epoch": 0.2503272477516351, "grad_norm": 1.3515625, "learning_rate": 0.00025844069079248964, "loss": 2.7169, "step": 877 }, { "epoch": 0.2506126836099608, "grad_norm": 1.125, "learning_rate": 0.00025834676365011326, "loss": 2.7202, "step": 878 }, { "epoch": 0.2508981194682865, "grad_norm": 1.6484375, "learning_rate": 0.00025825274759633016, "loss": 2.7239, "step": 879 }, { "epoch": 0.2511835553266122, "grad_norm": 1.234375, "learning_rate": 0.0002581586427082918, "loss": 2.7023, "step": 880 }, { "epoch": 0.25146899118493793, "grad_norm": 1.90625, "learning_rate": 0.0002580644490632222, "loss": 2.7203, "step": 881 }, { "epoch": 0.2517544270432636, "grad_norm": 1.5234375, "learning_rate": 0.0002579701667384187, "loss": 2.7288, "step": 882 }, { "epoch": 0.2520398629015893, "grad_norm": 1.90625, "learning_rate": 0.00025787579581125107, "loss": 2.7284, "step": 883 }, { "epoch": 0.252325298759915, "grad_norm": 1.7265625, "learning_rate": 0.00025778133635916183, "loss": 2.7377, "step": 884 }, { "epoch": 0.2526107346182407, "grad_norm": 1.75, "learning_rate": 0.0002576867884596663, "loss": 2.7267, "step": 885 }, { "epoch": 0.2528961704765664, "grad_norm": 1.5859375, "learning_rate": 0.00025759215219035213, "loss": 2.723, "step": 886 }, { "epoch": 0.25318160633489206, "grad_norm": 1.7109375, "learning_rate": 0.00025749742762887977, "loss": 2.7178, "step": 887 }, { "epoch": 0.25346704219321775, "grad_norm": 1.3828125, "learning_rate": 0.00025740261485298195, "loss": 2.7387, "step": 888 }, { "epoch": 0.2537524780515435, "grad_norm": 1.984375, "learning_rate": 0.0002573077139404638, "loss": 2.7513, "step": 889 }, { "epoch": 0.2540379139098692, "grad_norm": 1.7265625, "learning_rate": 0.0002572127249692028, "loss": 2.7288, "step": 890 }, { "epoch": 0.2543233497681949, "grad_norm": 1.734375, "learning_rate": 0.00025711764801714874, "loss": 2.7322, "step": 891 }, { "epoch": 0.25460878562652056, "grad_norm": 1.6015625, "learning_rate": 0.00025702248316232355, "loss": 2.7598, "step": 892 }, { "epoch": 0.25489422148484625, "grad_norm": 1.671875, "learning_rate": 0.0002569272304828213, "loss": 2.7304, "step": 893 }, { "epoch": 0.25517965734317194, "grad_norm": 1.421875, "learning_rate": 0.00025683189005680827, "loss": 2.7288, "step": 894 }, { "epoch": 0.25546509320149763, "grad_norm": 1.8203125, "learning_rate": 0.0002567364619625224, "loss": 2.753, "step": 895 }, { "epoch": 0.2557505290598233, "grad_norm": 1.5390625, "learning_rate": 0.00025664094627827393, "loss": 2.7233, "step": 896 }, { "epoch": 0.25603596491814906, "grad_norm": 1.8046875, "learning_rate": 0.00025654534308244484, "loss": 2.731, "step": 897 }, { "epoch": 0.25632140077647475, "grad_norm": 1.6328125, "learning_rate": 0.0002564496524534888, "loss": 2.7177, "step": 898 }, { "epoch": 0.25660683663480044, "grad_norm": 2.015625, "learning_rate": 0.00025635387446993154, "loss": 2.7327, "step": 899 }, { "epoch": 0.25689227249312613, "grad_norm": 1.7890625, "learning_rate": 0.0002562580092103702, "loss": 2.7251, "step": 900 }, { "epoch": 0.2571777083514518, "grad_norm": 1.90625, "learning_rate": 0.00025616205675347355, "loss": 2.7005, "step": 901 }, { "epoch": 0.2574631442097775, "grad_norm": 1.8125, "learning_rate": 0.00025606601717798207, "loss": 2.7263, "step": 902 }, { "epoch": 0.2577485800681032, "grad_norm": 1.6953125, "learning_rate": 0.0002559698905627077, "loss": 2.6863, "step": 903 }, { "epoch": 0.2580340159264289, "grad_norm": 1.5703125, "learning_rate": 0.00025587367698653367, "loss": 2.718, "step": 904 }, { "epoch": 0.25831945178475463, "grad_norm": 1.6640625, "learning_rate": 0.0002557773765284148, "loss": 2.7263, "step": 905 }, { "epoch": 0.2586048876430803, "grad_norm": 1.546875, "learning_rate": 0.0002556809892673769, "loss": 2.7485, "step": 906 }, { "epoch": 0.258890323501406, "grad_norm": 1.8359375, "learning_rate": 0.0002555845152825173, "loss": 2.6922, "step": 907 }, { "epoch": 0.2591757593597317, "grad_norm": 1.71875, "learning_rate": 0.00025548795465300426, "loss": 2.7269, "step": 908 }, { "epoch": 0.2594611952180574, "grad_norm": 1.6875, "learning_rate": 0.0002553913074580774, "loss": 2.7466, "step": 909 }, { "epoch": 0.25974663107638307, "grad_norm": 1.6015625, "learning_rate": 0.00025529457377704713, "loss": 2.728, "step": 910 }, { "epoch": 0.26003206693470876, "grad_norm": 1.703125, "learning_rate": 0.0002551977536892951, "loss": 2.7171, "step": 911 }, { "epoch": 0.26031750279303445, "grad_norm": 1.4921875, "learning_rate": 0.0002551008472742735, "loss": 2.7028, "step": 912 }, { "epoch": 0.2606029386513602, "grad_norm": 1.7578125, "learning_rate": 0.00025500385461150565, "loss": 2.7107, "step": 913 }, { "epoch": 0.2608883745096859, "grad_norm": 1.65625, "learning_rate": 0.0002549067757805856, "loss": 2.7452, "step": 914 }, { "epoch": 0.26117381036801157, "grad_norm": 1.515625, "learning_rate": 0.00025480961086117815, "loss": 2.7045, "step": 915 }, { "epoch": 0.26145924622633726, "grad_norm": 1.4453125, "learning_rate": 0.0002547123599330185, "loss": 2.72, "step": 916 }, { "epoch": 0.26174468208466295, "grad_norm": 1.640625, "learning_rate": 0.00025461502307591274, "loss": 2.7136, "step": 917 }, { "epoch": 0.26203011794298864, "grad_norm": 1.515625, "learning_rate": 0.0002545176003697372, "loss": 2.7097, "step": 918 }, { "epoch": 0.2623155538013143, "grad_norm": 1.6328125, "learning_rate": 0.000254420091894439, "loss": 2.7218, "step": 919 }, { "epoch": 0.26260098965964, "grad_norm": 1.4921875, "learning_rate": 0.0002543224977300352, "loss": 2.6923, "step": 920 }, { "epoch": 0.26288642551796576, "grad_norm": 1.6015625, "learning_rate": 0.0002542248179566137, "loss": 2.735, "step": 921 }, { "epoch": 0.26317186137629145, "grad_norm": 1.375, "learning_rate": 0.0002541270526543321, "loss": 2.7211, "step": 922 }, { "epoch": 0.26345729723461714, "grad_norm": 1.8203125, "learning_rate": 0.00025402920190341864, "loss": 2.73, "step": 923 }, { "epoch": 0.2637427330929428, "grad_norm": 1.625, "learning_rate": 0.0002539312657841714, "loss": 2.7038, "step": 924 }, { "epoch": 0.2640281689512685, "grad_norm": 1.65625, "learning_rate": 0.0002538332443769587, "loss": 2.7209, "step": 925 }, { "epoch": 0.2643136048095942, "grad_norm": 2.875, "learning_rate": 0.0002537351377622187, "loss": 2.7053, "step": 926 }, { "epoch": 0.2645990406679199, "grad_norm": 0.88671875, "learning_rate": 0.00025363694602045957, "loss": 2.7378, "step": 927 }, { "epoch": 0.2648844765262456, "grad_norm": 2.1875, "learning_rate": 0.0002535386692322593, "loss": 2.7339, "step": 928 }, { "epoch": 0.2651699123845713, "grad_norm": 1.875, "learning_rate": 0.0002534403074782657, "loss": 2.7474, "step": 929 }, { "epoch": 0.265455348242897, "grad_norm": 1.7734375, "learning_rate": 0.00025334186083919623, "loss": 2.7283, "step": 930 }, { "epoch": 0.2657407841012227, "grad_norm": 1.71875, "learning_rate": 0.00025324332939583813, "loss": 2.7195, "step": 931 }, { "epoch": 0.2660262199595484, "grad_norm": 1.2578125, "learning_rate": 0.0002531447132290482, "loss": 2.7133, "step": 932 }, { "epoch": 0.2663116558178741, "grad_norm": 1.484375, "learning_rate": 0.00025304601241975266, "loss": 2.737, "step": 933 }, { "epoch": 0.26659709167619977, "grad_norm": 1.1171875, "learning_rate": 0.0002529472270489473, "loss": 2.7129, "step": 934 }, { "epoch": 0.26688252753452546, "grad_norm": 1.828125, "learning_rate": 0.0002528483571976973, "loss": 2.7195, "step": 935 }, { "epoch": 0.26716796339285115, "grad_norm": 1.28125, "learning_rate": 0.00025274940294713706, "loss": 2.694, "step": 936 }, { "epoch": 0.2674533992511769, "grad_norm": 2.171875, "learning_rate": 0.00025265036437847036, "loss": 2.739, "step": 937 }, { "epoch": 0.2677388351095026, "grad_norm": 1.8046875, "learning_rate": 0.0002525512415729701, "loss": 2.7, "step": 938 }, { "epoch": 0.26802427096782827, "grad_norm": 1.4453125, "learning_rate": 0.00025245203461197834, "loss": 2.7329, "step": 939 }, { "epoch": 0.26830970682615396, "grad_norm": 1.5390625, "learning_rate": 0.0002523527435769062, "loss": 2.7321, "step": 940 }, { "epoch": 0.26859514268447965, "grad_norm": 1.25, "learning_rate": 0.0002522533685492338, "loss": 2.7274, "step": 941 }, { "epoch": 0.26888057854280534, "grad_norm": 1.5703125, "learning_rate": 0.0002521539096105101, "loss": 2.719, "step": 942 }, { "epoch": 0.269166014401131, "grad_norm": 1.203125, "learning_rate": 0.00025205436684235313, "loss": 2.7257, "step": 943 }, { "epoch": 0.2694514502594567, "grad_norm": 1.703125, "learning_rate": 0.0002519547403264494, "loss": 2.7126, "step": 944 }, { "epoch": 0.2697368861177824, "grad_norm": 1.359375, "learning_rate": 0.00025185503014455443, "loss": 2.7297, "step": 945 }, { "epoch": 0.27002232197610815, "grad_norm": 1.8203125, "learning_rate": 0.00025175523637849224, "loss": 2.7324, "step": 946 }, { "epoch": 0.27030775783443384, "grad_norm": 1.296875, "learning_rate": 0.0002516553591101555, "loss": 2.7367, "step": 947 }, { "epoch": 0.2705931936927595, "grad_norm": 1.890625, "learning_rate": 0.00025155539842150535, "loss": 2.6977, "step": 948 }, { "epoch": 0.2708786295510852, "grad_norm": 1.3125, "learning_rate": 0.0002514553543945715, "loss": 2.6864, "step": 949 }, { "epoch": 0.2711640654094109, "grad_norm": 2.015625, "learning_rate": 0.00025135522711145197, "loss": 2.7111, "step": 950 }, { "epoch": 0.2714495012677366, "grad_norm": 1.7265625, "learning_rate": 0.000251255016654313, "loss": 2.7124, "step": 951 }, { "epoch": 0.2717349371260623, "grad_norm": 1.875, "learning_rate": 0.0002511547231053893, "loss": 2.6945, "step": 952 }, { "epoch": 0.27202037298438797, "grad_norm": 1.765625, "learning_rate": 0.00025105434654698356, "loss": 2.7364, "step": 953 }, { "epoch": 0.2723058088427137, "grad_norm": 1.53125, "learning_rate": 0.00025095388706146676, "loss": 2.7086, "step": 954 }, { "epoch": 0.2725912447010394, "grad_norm": 1.4765625, "learning_rate": 0.00025085334473127786, "loss": 2.7037, "step": 955 }, { "epoch": 0.2728766805593651, "grad_norm": 1.359375, "learning_rate": 0.0002507527196389238, "loss": 2.7295, "step": 956 }, { "epoch": 0.2731621164176908, "grad_norm": 1.25, "learning_rate": 0.0002506520118669794, "loss": 2.6829, "step": 957 }, { "epoch": 0.27344755227601647, "grad_norm": 1.3671875, "learning_rate": 0.0002505512214980873, "loss": 2.6869, "step": 958 }, { "epoch": 0.27373298813434216, "grad_norm": 1.0390625, "learning_rate": 0.0002504503486149581, "loss": 2.6919, "step": 959 }, { "epoch": 0.27401842399266785, "grad_norm": 1.65625, "learning_rate": 0.00025034939330037, "loss": 2.6851, "step": 960 }, { "epoch": 0.27430385985099354, "grad_norm": 1.34375, "learning_rate": 0.0002502483556371688, "loss": 2.7326, "step": 961 }, { "epoch": 0.2745892957093193, "grad_norm": 1.8515625, "learning_rate": 0.00025014723570826794, "loss": 2.7369, "step": 962 }, { "epoch": 0.27487473156764497, "grad_norm": 1.6640625, "learning_rate": 0.00025004603359664833, "loss": 2.7398, "step": 963 }, { "epoch": 0.27516016742597066, "grad_norm": 1.6875, "learning_rate": 0.0002499447493853583, "loss": 2.7145, "step": 964 }, { "epoch": 0.27544560328429635, "grad_norm": 1.4921875, "learning_rate": 0.00024984338315751366, "loss": 2.733, "step": 965 }, { "epoch": 0.27573103914262204, "grad_norm": 1.7578125, "learning_rate": 0.00024974193499629745, "loss": 2.707, "step": 966 }, { "epoch": 0.2760164750009477, "grad_norm": 1.6171875, "learning_rate": 0.00024964040498496, "loss": 2.7282, "step": 967 }, { "epoch": 0.2763019108592734, "grad_norm": 1.765625, "learning_rate": 0.00024953879320681853, "loss": 2.7208, "step": 968 }, { "epoch": 0.2765873467175991, "grad_norm": 1.5625, "learning_rate": 0.00024943709974525793, "loss": 2.7021, "step": 969 }, { "epoch": 0.27687278257592485, "grad_norm": 1.609375, "learning_rate": 0.00024933532468372955, "loss": 2.7056, "step": 970 }, { "epoch": 0.27715821843425054, "grad_norm": 1.515625, "learning_rate": 0.00024923346810575193, "loss": 2.7342, "step": 971 }, { "epoch": 0.2774436542925762, "grad_norm": 1.5703125, "learning_rate": 0.0002491315300949106, "loss": 2.7258, "step": 972 }, { "epoch": 0.2777290901509019, "grad_norm": 1.40625, "learning_rate": 0.00024902951073485784, "loss": 2.7053, "step": 973 }, { "epoch": 0.2780145260092276, "grad_norm": 1.609375, "learning_rate": 0.00024892741010931264, "loss": 2.7111, "step": 974 }, { "epoch": 0.2782999618675533, "grad_norm": 1.390625, "learning_rate": 0.0002488252283020606, "loss": 2.6961, "step": 975 }, { "epoch": 0.278585397725879, "grad_norm": 1.7421875, "learning_rate": 0.00024872296539695427, "loss": 2.7148, "step": 976 }, { "epoch": 0.27887083358420467, "grad_norm": 1.5078125, "learning_rate": 0.00024862062147791233, "loss": 2.7192, "step": 977 }, { "epoch": 0.2791562694425304, "grad_norm": 1.671875, "learning_rate": 0.00024851819662892016, "loss": 2.725, "step": 978 }, { "epoch": 0.2794417053008561, "grad_norm": 1.546875, "learning_rate": 0.0002484156909340296, "loss": 2.7303, "step": 979 }, { "epoch": 0.2797271411591818, "grad_norm": 1.703125, "learning_rate": 0.00024831310447735874, "loss": 2.6735, "step": 980 }, { "epoch": 0.2800125770175075, "grad_norm": 1.4921875, "learning_rate": 0.00024821043734309204, "loss": 2.6935, "step": 981 }, { "epoch": 0.28029801287583317, "grad_norm": 1.7890625, "learning_rate": 0.0002481076896154799, "loss": 2.7103, "step": 982 }, { "epoch": 0.28058344873415886, "grad_norm": 1.6328125, "learning_rate": 0.00024800486137883926, "loss": 2.7239, "step": 983 }, { "epoch": 0.28086888459248455, "grad_norm": 1.6875, "learning_rate": 0.00024790195271755277, "loss": 2.7289, "step": 984 }, { "epoch": 0.28115432045081024, "grad_norm": 1.5078125, "learning_rate": 0.0002477989637160694, "loss": 2.7095, "step": 985 }, { "epoch": 0.281439756309136, "grad_norm": 1.7578125, "learning_rate": 0.0002476958944589037, "loss": 2.6648, "step": 986 }, { "epoch": 0.28172519216746167, "grad_norm": 1.546875, "learning_rate": 0.0002475927450306363, "loss": 2.666, "step": 987 }, { "epoch": 0.28201062802578736, "grad_norm": 1.796875, "learning_rate": 0.00024748951551591364, "loss": 2.7152, "step": 988 }, { "epoch": 0.28229606388411305, "grad_norm": 1.578125, "learning_rate": 0.00024738620599944774, "loss": 2.7102, "step": 989 }, { "epoch": 0.28258149974243874, "grad_norm": 1.703125, "learning_rate": 0.0002472828165660164, "loss": 2.7055, "step": 990 }, { "epoch": 0.2828669356007644, "grad_norm": 1.5625, "learning_rate": 0.0002471793473004629, "loss": 2.7004, "step": 991 }, { "epoch": 0.2831523714590901, "grad_norm": 1.734375, "learning_rate": 0.0002470757982876961, "loss": 2.6998, "step": 992 }, { "epoch": 0.2834378073174158, "grad_norm": 1.5546875, "learning_rate": 0.00024697216961269035, "loss": 2.7259, "step": 993 }, { "epoch": 0.28372324317574155, "grad_norm": 1.6953125, "learning_rate": 0.0002468684613604852, "loss": 2.6939, "step": 994 }, { "epoch": 0.28400867903406724, "grad_norm": 1.6015625, "learning_rate": 0.00024676467361618563, "loss": 2.7005, "step": 995 }, { "epoch": 0.2842941148923929, "grad_norm": 1.6015625, "learning_rate": 0.00024666080646496187, "loss": 2.7153, "step": 996 }, { "epoch": 0.2845795507507186, "grad_norm": 1.4296875, "learning_rate": 0.0002465568599920493, "loss": 2.7052, "step": 997 }, { "epoch": 0.2848649866090443, "grad_norm": 1.6171875, "learning_rate": 0.0002464528342827482, "loss": 2.7191, "step": 998 }, { "epoch": 0.28515042246737, "grad_norm": 1.5546875, "learning_rate": 0.00024634872942242423, "loss": 2.7117, "step": 999 }, { "epoch": 0.2854358583256957, "grad_norm": 1.734375, "learning_rate": 0.0002462445454965077, "loss": 2.6923, "step": 1000 }, { "epoch": 0.2854358583256957, "eval_loss": 2.571556806564331, "eval_runtime": 5980.855, "eval_samples_per_second": 10.749, "eval_steps_per_second": 10.749, "step": 1000 }, { "epoch": 0.28572129418402137, "grad_norm": 1.578125, "learning_rate": 0.00024614028259049397, "loss": 2.6922, "step": 1001 }, { "epoch": 0.2860067300423471, "grad_norm": 1.5625, "learning_rate": 0.0002460359407899431, "loss": 2.7178, "step": 1002 }, { "epoch": 0.2862921659006728, "grad_norm": 1.4609375, "learning_rate": 0.00024593152018048, "loss": 2.696, "step": 1003 }, { "epoch": 0.2865776017589985, "grad_norm": 1.625, "learning_rate": 0.00024582702084779414, "loss": 2.6841, "step": 1004 }, { "epoch": 0.2868630376173242, "grad_norm": 1.4140625, "learning_rate": 0.00024572244287763976, "loss": 2.6869, "step": 1005 }, { "epoch": 0.28714847347564987, "grad_norm": 1.5546875, "learning_rate": 0.0002456177863558354, "loss": 2.7185, "step": 1006 }, { "epoch": 0.28743390933397556, "grad_norm": 1.4140625, "learning_rate": 0.00024551305136826424, "loss": 2.69, "step": 1007 }, { "epoch": 0.28771934519230125, "grad_norm": 1.6171875, "learning_rate": 0.00024540823800087386, "loss": 2.6593, "step": 1008 }, { "epoch": 0.28800478105062693, "grad_norm": 1.3984375, "learning_rate": 0.00024530334633967595, "loss": 2.6818, "step": 1009 }, { "epoch": 0.2882902169089526, "grad_norm": 1.5390625, "learning_rate": 0.00024519837647074674, "loss": 2.7043, "step": 1010 }, { "epoch": 0.28857565276727837, "grad_norm": 1.40625, "learning_rate": 0.00024509332848022636, "loss": 2.7057, "step": 1011 }, { "epoch": 0.28886108862560406, "grad_norm": 1.5, "learning_rate": 0.0002449882024543193, "loss": 2.6855, "step": 1012 }, { "epoch": 0.28914652448392975, "grad_norm": 1.3515625, "learning_rate": 0.00024488299847929385, "loss": 2.7012, "step": 1013 }, { "epoch": 0.28943196034225543, "grad_norm": 1.5390625, "learning_rate": 0.0002447777166414825, "loss": 2.7178, "step": 1014 }, { "epoch": 0.2897173962005811, "grad_norm": 1.5625, "learning_rate": 0.0002446723570272814, "loss": 2.6926, "step": 1015 }, { "epoch": 0.2900028320589068, "grad_norm": 1.21875, "learning_rate": 0.00024456691972315076, "loss": 2.6914, "step": 1016 }, { "epoch": 0.2902882679172325, "grad_norm": 1.0390625, "learning_rate": 0.0002444614048156144, "loss": 2.6794, "step": 1017 }, { "epoch": 0.2905737037755582, "grad_norm": 1.4375, "learning_rate": 0.00024435581239125987, "loss": 2.7046, "step": 1018 }, { "epoch": 0.29085913963388393, "grad_norm": 1.09375, "learning_rate": 0.0002442501425367382, "loss": 2.6849, "step": 1019 }, { "epoch": 0.2911445754922096, "grad_norm": 1.796875, "learning_rate": 0.0002441443953387642, "loss": 2.6808, "step": 1020 }, { "epoch": 0.2914300113505353, "grad_norm": 1.65625, "learning_rate": 0.000244038570884116, "loss": 2.6968, "step": 1021 }, { "epoch": 0.291715447208861, "grad_norm": 1.5, "learning_rate": 0.00024393266925963505, "loss": 2.6755, "step": 1022 }, { "epoch": 0.2920008830671867, "grad_norm": 1.4765625, "learning_rate": 0.00024382669055222634, "loss": 2.7195, "step": 1023 }, { "epoch": 0.2922863189255124, "grad_norm": 1.1484375, "learning_rate": 0.000243720634848858, "loss": 2.6943, "step": 1024 }, { "epoch": 0.29257175478383807, "grad_norm": 1.1640625, "learning_rate": 0.0002436145022365613, "loss": 2.7172, "step": 1025 }, { "epoch": 0.29285719064216376, "grad_norm": 1.390625, "learning_rate": 0.00024350829280243074, "loss": 2.7061, "step": 1026 }, { "epoch": 0.2931426265004895, "grad_norm": 1.3359375, "learning_rate": 0.00024340200663362368, "loss": 2.6897, "step": 1027 }, { "epoch": 0.2934280623588152, "grad_norm": 0.96484375, "learning_rate": 0.00024329564381736068, "loss": 2.691, "step": 1028 }, { "epoch": 0.2937134982171409, "grad_norm": 0.8828125, "learning_rate": 0.000243189204440925, "loss": 2.7367, "step": 1029 }, { "epoch": 0.29399893407546657, "grad_norm": 1.171875, "learning_rate": 0.0002430826885916629, "loss": 2.6964, "step": 1030 }, { "epoch": 0.29428436993379226, "grad_norm": 1.1796875, "learning_rate": 0.0002429760963569832, "loss": 2.7204, "step": 1031 }, { "epoch": 0.29456980579211794, "grad_norm": 1.90625, "learning_rate": 0.00024286942782435753, "loss": 2.7186, "step": 1032 }, { "epoch": 0.29485524165044363, "grad_norm": 1.1328125, "learning_rate": 0.0002427626830813202, "loss": 2.6901, "step": 1033 }, { "epoch": 0.2951406775087693, "grad_norm": 1.2890625, "learning_rate": 0.0002426558622154679, "loss": 2.7291, "step": 1034 }, { "epoch": 0.29542611336709507, "grad_norm": 1.875, "learning_rate": 0.0002425489653144598, "loss": 2.717, "step": 1035 }, { "epoch": 0.29571154922542076, "grad_norm": 0.71484375, "learning_rate": 0.0002424419924660176, "loss": 2.7074, "step": 1036 }, { "epoch": 0.29599698508374644, "grad_norm": 2.03125, "learning_rate": 0.00024233494375792524, "loss": 2.7174, "step": 1037 }, { "epoch": 0.29628242094207213, "grad_norm": 1.1640625, "learning_rate": 0.00024222781927802888, "loss": 2.6859, "step": 1038 }, { "epoch": 0.2965678568003978, "grad_norm": 2.421875, "learning_rate": 0.0002421206191142369, "loss": 2.6916, "step": 1039 }, { "epoch": 0.2968532926587235, "grad_norm": 1.8984375, "learning_rate": 0.00024201334335451988, "loss": 2.7098, "step": 1040 }, { "epoch": 0.2971387285170492, "grad_norm": 2.09375, "learning_rate": 0.0002419059920869102, "loss": 2.7105, "step": 1041 }, { "epoch": 0.2974241643753749, "grad_norm": 1.65625, "learning_rate": 0.0002417985653995024, "loss": 2.7329, "step": 1042 }, { "epoch": 0.29770960023370063, "grad_norm": 2.328125, "learning_rate": 0.0002416910633804529, "loss": 2.6864, "step": 1043 }, { "epoch": 0.2979950360920263, "grad_norm": 1.6640625, "learning_rate": 0.00024158348611797985, "loss": 2.6915, "step": 1044 }, { "epoch": 0.298280471950352, "grad_norm": 2.578125, "learning_rate": 0.0002414758337003632, "loss": 2.71, "step": 1045 }, { "epoch": 0.2985659078086777, "grad_norm": 2.421875, "learning_rate": 0.00024136810621594454, "loss": 2.7174, "step": 1046 }, { "epoch": 0.2988513436670034, "grad_norm": 1.2578125, "learning_rate": 0.0002412603037531271, "loss": 2.7106, "step": 1047 }, { "epoch": 0.2991367795253291, "grad_norm": 1.5390625, "learning_rate": 0.00024115242640037569, "loss": 2.7032, "step": 1048 }, { "epoch": 0.29942221538365477, "grad_norm": 1.2421875, "learning_rate": 0.0002410444742462164, "loss": 2.6975, "step": 1049 }, { "epoch": 0.29970765124198046, "grad_norm": 1.484375, "learning_rate": 0.00024093644737923682, "loss": 2.6909, "step": 1050 }, { "epoch": 0.2999930871003062, "grad_norm": 1.1484375, "learning_rate": 0.00024082834588808592, "loss": 2.7097, "step": 1051 }, { "epoch": 0.3002785229586319, "grad_norm": 1.640625, "learning_rate": 0.0002407201698614738, "loss": 2.7031, "step": 1052 }, { "epoch": 0.3005639588169576, "grad_norm": 1.2734375, "learning_rate": 0.0002406119193881718, "loss": 2.6834, "step": 1053 }, { "epoch": 0.30084939467528327, "grad_norm": 1.953125, "learning_rate": 0.00024050359455701217, "loss": 2.7092, "step": 1054 }, { "epoch": 0.30113483053360895, "grad_norm": 1.7734375, "learning_rate": 0.00024039519545688846, "loss": 2.6838, "step": 1055 }, { "epoch": 0.30142026639193464, "grad_norm": 1.7265625, "learning_rate": 0.00024028672217675493, "loss": 2.7051, "step": 1056 }, { "epoch": 0.30170570225026033, "grad_norm": 1.5625, "learning_rate": 0.00024017817480562686, "loss": 2.698, "step": 1057 }, { "epoch": 0.301991138108586, "grad_norm": 1.59375, "learning_rate": 0.00024006955343258032, "loss": 2.6918, "step": 1058 }, { "epoch": 0.30227657396691177, "grad_norm": 1.46875, "learning_rate": 0.00023996085814675198, "loss": 2.7027, "step": 1059 }, { "epoch": 0.30256200982523745, "grad_norm": 1.34375, "learning_rate": 0.0002398520890373393, "loss": 2.6585, "step": 1060 }, { "epoch": 0.30284744568356314, "grad_norm": 1.3671875, "learning_rate": 0.00023974324619360028, "loss": 2.7134, "step": 1061 }, { "epoch": 0.30313288154188883, "grad_norm": 1.1328125, "learning_rate": 0.00023963432970485333, "loss": 2.7017, "step": 1062 }, { "epoch": 0.3034183174002145, "grad_norm": 1.328125, "learning_rate": 0.0002395253396604775, "loss": 2.7121, "step": 1063 }, { "epoch": 0.3037037532585402, "grad_norm": 1.1015625, "learning_rate": 0.00023941627614991205, "loss": 2.6666, "step": 1064 }, { "epoch": 0.3039891891168659, "grad_norm": 1.3203125, "learning_rate": 0.00023930713926265652, "loss": 2.6927, "step": 1065 }, { "epoch": 0.3042746249751916, "grad_norm": 1.0546875, "learning_rate": 0.00023919792908827072, "loss": 2.6844, "step": 1066 }, { "epoch": 0.30456006083351733, "grad_norm": 1.3125, "learning_rate": 0.00023908864571637464, "loss": 2.6666, "step": 1067 }, { "epoch": 0.304845496691843, "grad_norm": 1.03125, "learning_rate": 0.00023897928923664825, "loss": 2.6676, "step": 1068 }, { "epoch": 0.3051309325501687, "grad_norm": 1.3671875, "learning_rate": 0.00023886985973883157, "loss": 2.7065, "step": 1069 }, { "epoch": 0.3054163684084944, "grad_norm": 1.09375, "learning_rate": 0.00023876035731272444, "loss": 2.6579, "step": 1070 }, { "epoch": 0.3057018042668201, "grad_norm": 1.65625, "learning_rate": 0.00023865078204818676, "loss": 2.6919, "step": 1071 }, { "epoch": 0.3059872401251458, "grad_norm": 1.3515625, "learning_rate": 0.0002385411340351379, "loss": 2.6779, "step": 1072 }, { "epoch": 0.30627267598347147, "grad_norm": 1.59375, "learning_rate": 0.00023843141336355725, "loss": 2.6798, "step": 1073 }, { "epoch": 0.30655811184179715, "grad_norm": 1.4609375, "learning_rate": 0.0002383216201234836, "loss": 2.6775, "step": 1074 }, { "epoch": 0.3068435477001229, "grad_norm": 1.4765625, "learning_rate": 0.00023821175440501535, "loss": 2.693, "step": 1075 }, { "epoch": 0.3071289835584486, "grad_norm": 1.328125, "learning_rate": 0.00023810181629831042, "loss": 2.6807, "step": 1076 }, { "epoch": 0.3074144194167743, "grad_norm": 1.3125, "learning_rate": 0.0002379918058935861, "loss": 2.6583, "step": 1077 }, { "epoch": 0.30769985527509996, "grad_norm": 1.1171875, "learning_rate": 0.00023788172328111903, "loss": 2.6784, "step": 1078 }, { "epoch": 0.30798529113342565, "grad_norm": 1.40625, "learning_rate": 0.00023777156855124505, "loss": 2.6992, "step": 1079 }, { "epoch": 0.30827072699175134, "grad_norm": 1.0390625, "learning_rate": 0.00023766134179435921, "loss": 2.7007, "step": 1080 }, { "epoch": 0.30855616285007703, "grad_norm": 1.5390625, "learning_rate": 0.0002375510431009157, "loss": 2.698, "step": 1081 }, { "epoch": 0.3088415987084027, "grad_norm": 1.2109375, "learning_rate": 0.00023744067256142775, "loss": 2.6982, "step": 1082 }, { "epoch": 0.3091270345667284, "grad_norm": 1.7421875, "learning_rate": 0.00023733023026646744, "loss": 2.732, "step": 1083 }, { "epoch": 0.30941247042505415, "grad_norm": 1.53125, "learning_rate": 0.00023721971630666589, "loss": 2.7234, "step": 1084 }, { "epoch": 0.30969790628337984, "grad_norm": 1.3828125, "learning_rate": 0.00023710913077271286, "loss": 2.6996, "step": 1085 }, { "epoch": 0.30998334214170553, "grad_norm": 1.3359375, "learning_rate": 0.00023699847375535698, "loss": 2.7038, "step": 1086 }, { "epoch": 0.3102687780000312, "grad_norm": 1.296875, "learning_rate": 0.00023688774534540554, "loss": 2.6705, "step": 1087 }, { "epoch": 0.3105542138583569, "grad_norm": 1.1484375, "learning_rate": 0.0002367769456337243, "loss": 2.6632, "step": 1088 }, { "epoch": 0.3108396497166826, "grad_norm": 1.296875, "learning_rate": 0.00023666607471123767, "loss": 2.6572, "step": 1089 }, { "epoch": 0.3111250855750083, "grad_norm": 1.09375, "learning_rate": 0.0002365551326689283, "loss": 2.68, "step": 1090 }, { "epoch": 0.311410521433334, "grad_norm": 1.625, "learning_rate": 0.0002364441195978375, "loss": 2.6704, "step": 1091 }, { "epoch": 0.3116959572916597, "grad_norm": 1.359375, "learning_rate": 0.0002363330355890646, "loss": 2.6514, "step": 1092 }, { "epoch": 0.3119813931499854, "grad_norm": 1.4765625, "learning_rate": 0.00023622188073376728, "loss": 2.6773, "step": 1093 }, { "epoch": 0.3122668290083111, "grad_norm": 1.34375, "learning_rate": 0.00023611065512316127, "loss": 2.6896, "step": 1094 }, { "epoch": 0.3125522648666368, "grad_norm": 1.3515625, "learning_rate": 0.00023599935884852045, "loss": 2.7068, "step": 1095 }, { "epoch": 0.3128377007249625, "grad_norm": 1.21875, "learning_rate": 0.00023588799200117662, "loss": 2.6837, "step": 1096 }, { "epoch": 0.31312313658328816, "grad_norm": 1.3828125, "learning_rate": 0.00023577655467251963, "loss": 2.6873, "step": 1097 }, { "epoch": 0.31340857244161385, "grad_norm": 1.234375, "learning_rate": 0.0002356650469539969, "loss": 2.6891, "step": 1098 }, { "epoch": 0.31369400829993954, "grad_norm": 1.296875, "learning_rate": 0.0002355534689371139, "loss": 2.6888, "step": 1099 }, { "epoch": 0.3139794441582653, "grad_norm": 1.1796875, "learning_rate": 0.00023544182071343363, "loss": 2.6745, "step": 1100 }, { "epoch": 0.314264880016591, "grad_norm": 1.3046875, "learning_rate": 0.00023533010237457674, "loss": 2.6668, "step": 1101 }, { "epoch": 0.31455031587491666, "grad_norm": 1.1171875, "learning_rate": 0.00023521831401222132, "loss": 2.6679, "step": 1102 }, { "epoch": 0.31483575173324235, "grad_norm": 1.578125, "learning_rate": 0.00023510645571810316, "loss": 2.693, "step": 1103 }, { "epoch": 0.31512118759156804, "grad_norm": 1.34375, "learning_rate": 0.00023499452758401525, "loss": 2.6966, "step": 1104 }, { "epoch": 0.31540662344989373, "grad_norm": 1.59375, "learning_rate": 0.00023488252970180792, "loss": 2.6786, "step": 1105 }, { "epoch": 0.3156920593082194, "grad_norm": 1.3984375, "learning_rate": 0.00023477046216338875, "loss": 2.6579, "step": 1106 }, { "epoch": 0.3159774951665451, "grad_norm": 1.515625, "learning_rate": 0.0002346583250607225, "loss": 2.6717, "step": 1107 }, { "epoch": 0.31626293102487085, "grad_norm": 1.421875, "learning_rate": 0.00023454611848583104, "loss": 2.6939, "step": 1108 }, { "epoch": 0.31654836688319654, "grad_norm": 1.390625, "learning_rate": 0.00023443384253079308, "loss": 2.658, "step": 1109 }, { "epoch": 0.31683380274152223, "grad_norm": 1.21875, "learning_rate": 0.00023432149728774455, "loss": 2.6733, "step": 1110 }, { "epoch": 0.3171192385998479, "grad_norm": 1.5546875, "learning_rate": 0.000234209082848878, "loss": 2.6814, "step": 1111 }, { "epoch": 0.3174046744581736, "grad_norm": 1.2421875, "learning_rate": 0.00023409659930644287, "loss": 2.67, "step": 1112 }, { "epoch": 0.3176901103164993, "grad_norm": 1.8359375, "learning_rate": 0.00023398404675274522, "loss": 2.6662, "step": 1113 }, { "epoch": 0.317975546174825, "grad_norm": 1.7578125, "learning_rate": 0.00023387142528014798, "loss": 2.6935, "step": 1114 }, { "epoch": 0.3182609820331507, "grad_norm": 1.296875, "learning_rate": 0.00023375873498107026, "loss": 2.6746, "step": 1115 }, { "epoch": 0.3185464178914764, "grad_norm": 1.3125, "learning_rate": 0.00023364597594798802, "loss": 2.6977, "step": 1116 }, { "epoch": 0.3188318537498021, "grad_norm": 1.453125, "learning_rate": 0.0002335331482734333, "loss": 2.6889, "step": 1117 }, { "epoch": 0.3191172896081278, "grad_norm": 1.09375, "learning_rate": 0.00023342025204999472, "loss": 2.6725, "step": 1118 }, { "epoch": 0.3194027254664535, "grad_norm": 1.8203125, "learning_rate": 0.0002333072873703171, "loss": 2.669, "step": 1119 }, { "epoch": 0.3196881613247792, "grad_norm": 1.640625, "learning_rate": 0.00023319425432710136, "loss": 2.691, "step": 1120 }, { "epoch": 0.31997359718310486, "grad_norm": 1.5859375, "learning_rate": 0.0002330811530131045, "loss": 2.6734, "step": 1121 }, { "epoch": 0.32025903304143055, "grad_norm": 1.53125, "learning_rate": 0.0002329679835211397, "loss": 2.6915, "step": 1122 }, { "epoch": 0.32054446889975624, "grad_norm": 1.421875, "learning_rate": 0.00023285474594407585, "loss": 2.6766, "step": 1123 }, { "epoch": 0.320829904758082, "grad_norm": 1.2890625, "learning_rate": 0.000232741440374838, "loss": 2.6737, "step": 1124 }, { "epoch": 0.3211153406164077, "grad_norm": 1.484375, "learning_rate": 0.00023262806690640673, "loss": 2.6618, "step": 1125 }, { "epoch": 0.32140077647473336, "grad_norm": 1.2578125, "learning_rate": 0.00023251462563181853, "loss": 2.7, "step": 1126 }, { "epoch": 0.32168621233305905, "grad_norm": 1.6484375, "learning_rate": 0.00023240111664416544, "loss": 2.6777, "step": 1127 }, { "epoch": 0.32197164819138474, "grad_norm": 1.4765625, "learning_rate": 0.0002322875400365951, "loss": 2.6749, "step": 1128 }, { "epoch": 0.32225708404971043, "grad_norm": 1.5703125, "learning_rate": 0.00023217389590231058, "loss": 2.6936, "step": 1129 }, { "epoch": 0.3225425199080361, "grad_norm": 1.3359375, "learning_rate": 0.00023206018433457045, "loss": 2.6419, "step": 1130 }, { "epoch": 0.3228279557663618, "grad_norm": 1.453125, "learning_rate": 0.00023194640542668855, "loss": 2.6704, "step": 1131 }, { "epoch": 0.32311339162468755, "grad_norm": 1.3125, "learning_rate": 0.00023183255927203405, "loss": 2.7011, "step": 1132 }, { "epoch": 0.32339882748301324, "grad_norm": 1.40625, "learning_rate": 0.00023171864596403116, "loss": 2.683, "step": 1133 }, { "epoch": 0.32368426334133893, "grad_norm": 1.1875, "learning_rate": 0.00023160466559615946, "loss": 2.7078, "step": 1134 }, { "epoch": 0.3239696991996646, "grad_norm": 1.2734375, "learning_rate": 0.00023149061826195327, "loss": 2.6919, "step": 1135 }, { "epoch": 0.3242551350579903, "grad_norm": 1.09375, "learning_rate": 0.00023137650405500202, "loss": 2.6554, "step": 1136 }, { "epoch": 0.324540570916316, "grad_norm": 1.3046875, "learning_rate": 0.00023126232306895, "loss": 2.6734, "step": 1137 }, { "epoch": 0.3248260067746417, "grad_norm": 1.1484375, "learning_rate": 0.0002311480753974963, "loss": 2.6794, "step": 1138 }, { "epoch": 0.3251114426329674, "grad_norm": 1.296875, "learning_rate": 0.00023103376113439472, "loss": 2.6802, "step": 1139 }, { "epoch": 0.3253968784912931, "grad_norm": 1.046875, "learning_rate": 0.0002309193803734537, "loss": 2.6811, "step": 1140 }, { "epoch": 0.3256823143496188, "grad_norm": 1.5, "learning_rate": 0.00023080493320853628, "loss": 2.671, "step": 1141 }, { "epoch": 0.3259677502079445, "grad_norm": 1.0859375, "learning_rate": 0.00023069041973355992, "loss": 2.6759, "step": 1142 }, { "epoch": 0.3262531860662702, "grad_norm": 1.5859375, "learning_rate": 0.00023057584004249662, "loss": 2.682, "step": 1143 }, { "epoch": 0.3265386219245959, "grad_norm": 1.3515625, "learning_rate": 0.00023046119422937258, "loss": 2.6591, "step": 1144 }, { "epoch": 0.32682405778292156, "grad_norm": 1.6171875, "learning_rate": 0.00023034648238826836, "loss": 2.6607, "step": 1145 }, { "epoch": 0.32710949364124725, "grad_norm": 1.4140625, "learning_rate": 0.00023023170461331863, "loss": 2.6512, "step": 1146 }, { "epoch": 0.32739492949957294, "grad_norm": 1.6171875, "learning_rate": 0.0002301168609987123, "loss": 2.6913, "step": 1147 }, { "epoch": 0.3276803653578987, "grad_norm": 1.46875, "learning_rate": 0.00023000195163869216, "loss": 2.6783, "step": 1148 }, { "epoch": 0.3279658012162244, "grad_norm": 1.5546875, "learning_rate": 0.0002298869766275549, "loss": 2.6467, "step": 1149 }, { "epoch": 0.32825123707455006, "grad_norm": 1.40625, "learning_rate": 0.00022977193605965143, "loss": 2.7, "step": 1150 }, { "epoch": 0.32853667293287575, "grad_norm": 1.4453125, "learning_rate": 0.000229656830029386, "loss": 2.6604, "step": 1151 }, { "epoch": 0.32882210879120144, "grad_norm": 1.328125, "learning_rate": 0.0002295416586312169, "loss": 2.6538, "step": 1152 }, { "epoch": 0.32910754464952713, "grad_norm": 1.28125, "learning_rate": 0.00022942642195965596, "loss": 2.69, "step": 1153 }, { "epoch": 0.3293929805078528, "grad_norm": 1.2109375, "learning_rate": 0.0002293111201092686, "loss": 2.6806, "step": 1154 }, { "epoch": 0.3296784163661785, "grad_norm": 1.203125, "learning_rate": 0.00022919575317467358, "loss": 2.6815, "step": 1155 }, { "epoch": 0.3299638522245042, "grad_norm": 1.0390625, "learning_rate": 0.0002290803212505433, "loss": 2.6887, "step": 1156 }, { "epoch": 0.33024928808282994, "grad_norm": 1.5078125, "learning_rate": 0.00022896482443160335, "loss": 2.6799, "step": 1157 }, { "epoch": 0.33053472394115563, "grad_norm": 1.34375, "learning_rate": 0.00022884926281263265, "loss": 2.6802, "step": 1158 }, { "epoch": 0.3308201597994813, "grad_norm": 1.3984375, "learning_rate": 0.00022873363648846318, "loss": 2.6585, "step": 1159 }, { "epoch": 0.331105595657807, "grad_norm": 1.3203125, "learning_rate": 0.00022861794555398016, "loss": 2.6746, "step": 1160 }, { "epoch": 0.3313910315161327, "grad_norm": 1.40625, "learning_rate": 0.0002285021901041217, "loss": 2.6856, "step": 1161 }, { "epoch": 0.3316764673744584, "grad_norm": 1.234375, "learning_rate": 0.000228386370233879, "loss": 2.6456, "step": 1162 }, { "epoch": 0.3319619032327841, "grad_norm": 1.484375, "learning_rate": 0.00022827048603829596, "loss": 2.6973, "step": 1163 }, { "epoch": 0.33224733909110976, "grad_norm": 1.3359375, "learning_rate": 0.0002281545376124694, "loss": 2.665, "step": 1164 }, { "epoch": 0.3325327749494355, "grad_norm": 1.515625, "learning_rate": 0.00022803852505154867, "loss": 2.666, "step": 1165 }, { "epoch": 0.3328182108077612, "grad_norm": 1.390625, "learning_rate": 0.00022792244845073608, "loss": 2.6748, "step": 1166 }, { "epoch": 0.3331036466660869, "grad_norm": 1.40625, "learning_rate": 0.00022780630790528617, "loss": 2.6593, "step": 1167 }, { "epoch": 0.33338908252441257, "grad_norm": 1.296875, "learning_rate": 0.00022769010351050606, "loss": 2.6485, "step": 1168 }, { "epoch": 0.33367451838273826, "grad_norm": 1.375, "learning_rate": 0.00022757383536175529, "loss": 2.6684, "step": 1169 }, { "epoch": 0.33395995424106395, "grad_norm": 1.203125, "learning_rate": 0.00022745750355444573, "loss": 2.6508, "step": 1170 }, { "epoch": 0.33424539009938964, "grad_norm": 1.4453125, "learning_rate": 0.00022734110818404144, "loss": 2.6546, "step": 1171 }, { "epoch": 0.3345308259577153, "grad_norm": 1.3828125, "learning_rate": 0.00022722464934605869, "loss": 2.6864, "step": 1172 }, { "epoch": 0.33481626181604107, "grad_norm": 1.5, "learning_rate": 0.00022710812713606582, "loss": 2.6611, "step": 1173 }, { "epoch": 0.33510169767436676, "grad_norm": 1.328125, "learning_rate": 0.00022699154164968307, "loss": 2.6822, "step": 1174 }, { "epoch": 0.33538713353269245, "grad_norm": 1.3671875, "learning_rate": 0.0002268748929825828, "loss": 2.6522, "step": 1175 }, { "epoch": 0.33567256939101814, "grad_norm": 1.25, "learning_rate": 0.0002267581812304891, "loss": 2.6546, "step": 1176 }, { "epoch": 0.3359580052493438, "grad_norm": 1.390625, "learning_rate": 0.00022664140648917782, "loss": 2.6711, "step": 1177 }, { "epoch": 0.3362434411076695, "grad_norm": 1.15625, "learning_rate": 0.00022652456885447652, "loss": 2.6533, "step": 1178 }, { "epoch": 0.3365288769659952, "grad_norm": 1.53125, "learning_rate": 0.0002264076684222644, "loss": 2.6659, "step": 1179 }, { "epoch": 0.3368143128243209, "grad_norm": 1.359375, "learning_rate": 0.00022629070528847216, "loss": 2.6843, "step": 1180 }, { "epoch": 0.33709974868264664, "grad_norm": 1.4375, "learning_rate": 0.00022617367954908194, "loss": 2.6654, "step": 1181 }, { "epoch": 0.3373851845409723, "grad_norm": 1.2109375, "learning_rate": 0.00022605659130012733, "loss": 2.6624, "step": 1182 }, { "epoch": 0.337670620399298, "grad_norm": 1.3828125, "learning_rate": 0.00022593944063769314, "loss": 2.6839, "step": 1183 }, { "epoch": 0.3379560562576237, "grad_norm": 1.21875, "learning_rate": 0.0002258222276579154, "loss": 2.6787, "step": 1184 }, { "epoch": 0.3382414921159494, "grad_norm": 1.375, "learning_rate": 0.00022570495245698128, "loss": 2.6928, "step": 1185 }, { "epoch": 0.3385269279742751, "grad_norm": 1.2109375, "learning_rate": 0.00022558761513112913, "loss": 2.6999, "step": 1186 }, { "epoch": 0.33881236383260077, "grad_norm": 1.3984375, "learning_rate": 0.00022547021577664814, "loss": 2.6904, "step": 1187 }, { "epoch": 0.33909779969092646, "grad_norm": 1.171875, "learning_rate": 0.00022535275448987832, "loss": 2.6623, "step": 1188 }, { "epoch": 0.3393832355492522, "grad_norm": 1.2734375, "learning_rate": 0.00022523523136721085, "loss": 2.6658, "step": 1189 }, { "epoch": 0.3396686714075779, "grad_norm": 1.1171875, "learning_rate": 0.00022511764650508728, "loss": 2.6547, "step": 1190 }, { "epoch": 0.3399541072659036, "grad_norm": 1.3359375, "learning_rate": 0.000225, "loss": 2.6677, "step": 1191 }, { "epoch": 0.34023954312422927, "grad_norm": 1.171875, "learning_rate": 0.00022488229194849192, "loss": 2.6869, "step": 1192 }, { "epoch": 0.34052497898255496, "grad_norm": 1.40625, "learning_rate": 0.00022476452244715663, "loss": 2.6773, "step": 1193 }, { "epoch": 0.34081041484088065, "grad_norm": 1.1875, "learning_rate": 0.00022464669159263793, "loss": 2.6669, "step": 1194 }, { "epoch": 0.34109585069920634, "grad_norm": 1.3515625, "learning_rate": 0.00022452879948162998, "loss": 2.64, "step": 1195 }, { "epoch": 0.341381286557532, "grad_norm": 1.203125, "learning_rate": 0.0002244108462108774, "loss": 2.6452, "step": 1196 }, { "epoch": 0.34166672241585777, "grad_norm": 1.3359375, "learning_rate": 0.00022429283187717485, "loss": 2.6339, "step": 1197 }, { "epoch": 0.34195215827418346, "grad_norm": 1.1640625, "learning_rate": 0.00022417475657736705, "loss": 2.6572, "step": 1198 }, { "epoch": 0.34223759413250915, "grad_norm": 1.3125, "learning_rate": 0.00022405662040834895, "loss": 2.646, "step": 1199 }, { "epoch": 0.34252302999083484, "grad_norm": 1.1796875, "learning_rate": 0.00022393842346706523, "loss": 2.6676, "step": 1200 }, { "epoch": 0.3428084658491605, "grad_norm": 1.171875, "learning_rate": 0.00022382016585051058, "loss": 2.6574, "step": 1201 }, { "epoch": 0.3430939017074862, "grad_norm": 1.1171875, "learning_rate": 0.00022370184765572944, "loss": 2.6481, "step": 1202 }, { "epoch": 0.3433793375658119, "grad_norm": 1.15625, "learning_rate": 0.00022358346897981596, "loss": 2.675, "step": 1203 }, { "epoch": 0.3436647734241376, "grad_norm": 1.09375, "learning_rate": 0.0002234650299199139, "loss": 2.6475, "step": 1204 }, { "epoch": 0.34395020928246334, "grad_norm": 1.0234375, "learning_rate": 0.00022334653057321663, "loss": 2.6372, "step": 1205 }, { "epoch": 0.344235645140789, "grad_norm": 0.90625, "learning_rate": 0.00022322797103696692, "loss": 2.657, "step": 1206 }, { "epoch": 0.3445210809991147, "grad_norm": 0.98828125, "learning_rate": 0.00022310935140845706, "loss": 2.6606, "step": 1207 }, { "epoch": 0.3448065168574404, "grad_norm": 0.8515625, "learning_rate": 0.0002229906717850284, "loss": 2.6751, "step": 1208 }, { "epoch": 0.3450919527157661, "grad_norm": 0.9140625, "learning_rate": 0.00022287193226407185, "loss": 2.6703, "step": 1209 }, { "epoch": 0.3453773885740918, "grad_norm": 0.81640625, "learning_rate": 0.00022275313294302726, "loss": 2.6554, "step": 1210 }, { "epoch": 0.34566282443241747, "grad_norm": 0.9375, "learning_rate": 0.00022263427391938358, "loss": 2.6401, "step": 1211 }, { "epoch": 0.34594826029074316, "grad_norm": 0.78515625, "learning_rate": 0.00022251535529067877, "loss": 2.6659, "step": 1212 }, { "epoch": 0.3462336961490689, "grad_norm": 0.984375, "learning_rate": 0.00022239637715449977, "loss": 2.6972, "step": 1213 }, { "epoch": 0.3465191320073946, "grad_norm": 0.82421875, "learning_rate": 0.0002222773396084822, "loss": 2.6545, "step": 1214 }, { "epoch": 0.3468045678657203, "grad_norm": 0.80859375, "learning_rate": 0.0002221582427503106, "loss": 2.6515, "step": 1215 }, { "epoch": 0.34709000372404597, "grad_norm": 0.6953125, "learning_rate": 0.00022203908667771808, "loss": 2.6517, "step": 1216 }, { "epoch": 0.34737543958237166, "grad_norm": 0.73828125, "learning_rate": 0.00022191987148848636, "loss": 2.6596, "step": 1217 }, { "epoch": 0.34766087544069735, "grad_norm": 0.6640625, "learning_rate": 0.0002218005972804457, "loss": 2.6795, "step": 1218 }, { "epoch": 0.34794631129902304, "grad_norm": 0.73828125, "learning_rate": 0.00022168126415147478, "loss": 2.6416, "step": 1219 }, { "epoch": 0.3482317471573487, "grad_norm": 0.71875, "learning_rate": 0.00022156187219950059, "loss": 2.6384, "step": 1220 }, { "epoch": 0.34851718301567447, "grad_norm": 0.69140625, "learning_rate": 0.0002214424215224985, "loss": 2.6574, "step": 1221 }, { "epoch": 0.34880261887400016, "grad_norm": 0.77734375, "learning_rate": 0.0002213229122184919, "loss": 2.6864, "step": 1222 }, { "epoch": 0.34908805473232585, "grad_norm": 0.796875, "learning_rate": 0.0002212033443855525, "loss": 2.6457, "step": 1223 }, { "epoch": 0.34937349059065154, "grad_norm": 0.7265625, "learning_rate": 0.0002210837181217998, "loss": 2.6441, "step": 1224 }, { "epoch": 0.3496589264489772, "grad_norm": 0.8203125, "learning_rate": 0.0002209640335254015, "loss": 2.6643, "step": 1225 }, { "epoch": 0.3499443623073029, "grad_norm": 0.703125, "learning_rate": 0.00022084429069457297, "loss": 2.6436, "step": 1226 }, { "epoch": 0.3502297981656286, "grad_norm": 0.80859375, "learning_rate": 0.0002207244897275775, "loss": 2.6485, "step": 1227 }, { "epoch": 0.3505152340239543, "grad_norm": 0.80859375, "learning_rate": 0.00022060463072272595, "loss": 2.6534, "step": 1228 }, { "epoch": 0.35080066988228, "grad_norm": 0.8203125, "learning_rate": 0.00022048471377837697, "loss": 2.6605, "step": 1229 }, { "epoch": 0.3510861057406057, "grad_norm": 0.9296875, "learning_rate": 0.0002203647389929367, "loss": 2.6603, "step": 1230 }, { "epoch": 0.3513715415989314, "grad_norm": 1.2578125, "learning_rate": 0.00022024470646485862, "loss": 2.6937, "step": 1231 }, { "epoch": 0.3516569774572571, "grad_norm": 0.96484375, "learning_rate": 0.0002201246162926437, "loss": 2.6643, "step": 1232 }, { "epoch": 0.3519424133155828, "grad_norm": 0.875, "learning_rate": 0.00022000446857484035, "loss": 2.6523, "step": 1233 }, { "epoch": 0.3522278491739085, "grad_norm": 0.7578125, "learning_rate": 0.0002198842634100439, "loss": 2.6739, "step": 1234 }, { "epoch": 0.35251328503223417, "grad_norm": 0.59765625, "learning_rate": 0.00021976400089689712, "loss": 2.6605, "step": 1235 }, { "epoch": 0.35279872089055986, "grad_norm": 0.7109375, "learning_rate": 0.00021964368113408959, "loss": 2.6868, "step": 1236 }, { "epoch": 0.35308415674888555, "grad_norm": 0.828125, "learning_rate": 0.00021952330422035803, "loss": 2.6759, "step": 1237 }, { "epoch": 0.3533695926072113, "grad_norm": 0.96875, "learning_rate": 0.0002194028702544861, "loss": 2.6735, "step": 1238 }, { "epoch": 0.353655028465537, "grad_norm": 0.97265625, "learning_rate": 0.00021928237933530403, "loss": 2.661, "step": 1239 }, { "epoch": 0.35394046432386267, "grad_norm": 1.0546875, "learning_rate": 0.00021916183156168908, "loss": 2.6457, "step": 1240 }, { "epoch": 0.35422590018218836, "grad_norm": 0.9453125, "learning_rate": 0.00021904122703256498, "loss": 2.6761, "step": 1241 }, { "epoch": 0.35451133604051405, "grad_norm": 0.80859375, "learning_rate": 0.00021892056584690213, "loss": 2.6441, "step": 1242 }, { "epoch": 0.35479677189883974, "grad_norm": 0.94921875, "learning_rate": 0.00021879984810371734, "loss": 2.6453, "step": 1243 }, { "epoch": 0.3550822077571654, "grad_norm": 1.015625, "learning_rate": 0.00021867907390207394, "loss": 2.6208, "step": 1244 }, { "epoch": 0.3553676436154911, "grad_norm": 0.98046875, "learning_rate": 0.00021855824334108143, "loss": 2.6572, "step": 1245 }, { "epoch": 0.35565307947381686, "grad_norm": 0.85546875, "learning_rate": 0.00021843735651989575, "loss": 2.6826, "step": 1246 }, { "epoch": 0.35593851533214255, "grad_norm": 0.8125, "learning_rate": 0.00021831641353771885, "loss": 2.6611, "step": 1247 }, { "epoch": 0.35622395119046824, "grad_norm": 0.83203125, "learning_rate": 0.00021819541449379892, "loss": 2.6597, "step": 1248 }, { "epoch": 0.3565093870487939, "grad_norm": 0.99609375, "learning_rate": 0.00021807435948742994, "loss": 2.635, "step": 1249 }, { "epoch": 0.3567948229071196, "grad_norm": 0.9296875, "learning_rate": 0.00021795324861795208, "loss": 2.6526, "step": 1250 }, { "epoch": 0.3567948229071196, "eval_loss": 2.5330393314361572, "eval_runtime": 5928.9133, "eval_samples_per_second": 10.843, "eval_steps_per_second": 10.843, "step": 1250 }, { "epoch": 0.3570802587654453, "grad_norm": 0.84375, "learning_rate": 0.00021783208198475107, "loss": 2.6512, "step": 1251 }, { "epoch": 0.357365694623771, "grad_norm": 0.7890625, "learning_rate": 0.00021771085968725864, "loss": 2.6381, "step": 1252 }, { "epoch": 0.3576511304820967, "grad_norm": 0.7265625, "learning_rate": 0.00021758958182495214, "loss": 2.6498, "step": 1253 }, { "epoch": 0.3579365663404224, "grad_norm": 1.171875, "learning_rate": 0.00021746824849735435, "loss": 2.6614, "step": 1254 }, { "epoch": 0.3582220021987481, "grad_norm": 0.72265625, "learning_rate": 0.00021734685980403376, "loss": 2.6483, "step": 1255 }, { "epoch": 0.3585074380570738, "grad_norm": 0.89453125, "learning_rate": 0.0002172254158446043, "loss": 2.6365, "step": 1256 }, { "epoch": 0.3587928739153995, "grad_norm": 0.86328125, "learning_rate": 0.00021710391671872514, "loss": 2.6484, "step": 1257 }, { "epoch": 0.3590783097737252, "grad_norm": 0.8984375, "learning_rate": 0.00021698236252610072, "loss": 2.6372, "step": 1258 }, { "epoch": 0.35936374563205087, "grad_norm": 0.80859375, "learning_rate": 0.00021686075336648075, "loss": 2.6554, "step": 1259 }, { "epoch": 0.35964918149037656, "grad_norm": 0.8359375, "learning_rate": 0.00021673908933965996, "loss": 2.6511, "step": 1260 }, { "epoch": 0.35993461734870225, "grad_norm": 0.81640625, "learning_rate": 0.00021661737054547826, "loss": 2.6473, "step": 1261 }, { "epoch": 0.360220053207028, "grad_norm": 0.7890625, "learning_rate": 0.00021649559708382027, "loss": 2.6396, "step": 1262 }, { "epoch": 0.3605054890653537, "grad_norm": 0.87890625, "learning_rate": 0.0002163737690546157, "loss": 2.6517, "step": 1263 }, { "epoch": 0.36079092492367937, "grad_norm": 0.9765625, "learning_rate": 0.00021625188655783893, "loss": 2.6126, "step": 1264 }, { "epoch": 0.36107636078200506, "grad_norm": 0.89453125, "learning_rate": 0.000216129949693509, "loss": 2.6551, "step": 1265 }, { "epoch": 0.36136179664033075, "grad_norm": 0.8671875, "learning_rate": 0.0002160079585616896, "loss": 2.6316, "step": 1266 }, { "epoch": 0.36164723249865643, "grad_norm": 0.89453125, "learning_rate": 0.000215885913262489, "loss": 2.6376, "step": 1267 }, { "epoch": 0.3619326683569821, "grad_norm": 0.78125, "learning_rate": 0.00021576381389605992, "loss": 2.6378, "step": 1268 }, { "epoch": 0.3622181042153078, "grad_norm": 0.78515625, "learning_rate": 0.00021564166056259936, "loss": 2.6742, "step": 1269 }, { "epoch": 0.36250354007363356, "grad_norm": 0.9453125, "learning_rate": 0.00021551945336234867, "loss": 2.6676, "step": 1270 }, { "epoch": 0.36278897593195925, "grad_norm": 0.7578125, "learning_rate": 0.00021539719239559336, "loss": 2.6604, "step": 1271 }, { "epoch": 0.36307441179028493, "grad_norm": 0.734375, "learning_rate": 0.00021527487776266317, "loss": 2.6459, "step": 1272 }, { "epoch": 0.3633598476486106, "grad_norm": 0.74609375, "learning_rate": 0.0002151525095639318, "loss": 2.6323, "step": 1273 }, { "epoch": 0.3636452835069363, "grad_norm": 0.80859375, "learning_rate": 0.0002150300878998168, "loss": 2.6476, "step": 1274 }, { "epoch": 0.363930719365262, "grad_norm": 0.73046875, "learning_rate": 0.0002149076128707798, "loss": 2.6378, "step": 1275 }, { "epoch": 0.3642161552235877, "grad_norm": 0.73828125, "learning_rate": 0.00021478508457732615, "loss": 2.654, "step": 1276 }, { "epoch": 0.3645015910819134, "grad_norm": 0.62109375, "learning_rate": 0.00021466250312000482, "loss": 2.6398, "step": 1277 }, { "epoch": 0.3647870269402391, "grad_norm": 0.74609375, "learning_rate": 0.00021453986859940852, "loss": 2.6306, "step": 1278 }, { "epoch": 0.3650724627985648, "grad_norm": 0.84765625, "learning_rate": 0.00021441718111617344, "loss": 2.6299, "step": 1279 }, { "epoch": 0.3653578986568905, "grad_norm": 0.8125, "learning_rate": 0.00021429444077097928, "loss": 2.6466, "step": 1280 }, { "epoch": 0.3656433345152162, "grad_norm": 0.75390625, "learning_rate": 0.00021417164766454903, "loss": 2.6788, "step": 1281 }, { "epoch": 0.3659287703735419, "grad_norm": 0.61328125, "learning_rate": 0.00021404880189764913, "loss": 2.6416, "step": 1282 }, { "epoch": 0.36621420623186757, "grad_norm": 0.63671875, "learning_rate": 0.00021392590357108905, "loss": 2.6469, "step": 1283 }, { "epoch": 0.36649964209019326, "grad_norm": 0.65625, "learning_rate": 0.00021380295278572155, "loss": 2.6422, "step": 1284 }, { "epoch": 0.36678507794851894, "grad_norm": 0.63671875, "learning_rate": 0.00021367994964244236, "loss": 2.6202, "step": 1285 }, { "epoch": 0.3670705138068447, "grad_norm": 0.640625, "learning_rate": 0.00021355689424219023, "loss": 2.6281, "step": 1286 }, { "epoch": 0.3673559496651704, "grad_norm": 0.60546875, "learning_rate": 0.00021343378668594662, "loss": 2.6181, "step": 1287 }, { "epoch": 0.36764138552349607, "grad_norm": 0.62890625, "learning_rate": 0.00021331062707473605, "loss": 2.6632, "step": 1288 }, { "epoch": 0.36792682138182176, "grad_norm": 0.59765625, "learning_rate": 0.00021318741550962556, "loss": 2.6296, "step": 1289 }, { "epoch": 0.36821225724014744, "grad_norm": 0.58984375, "learning_rate": 0.00021306415209172502, "loss": 2.654, "step": 1290 }, { "epoch": 0.36849769309847313, "grad_norm": 0.54296875, "learning_rate": 0.00021294083692218653, "loss": 2.6375, "step": 1291 }, { "epoch": 0.3687831289567988, "grad_norm": 0.61328125, "learning_rate": 0.00021281747010220496, "loss": 2.6488, "step": 1292 }, { "epoch": 0.3690685648151245, "grad_norm": 0.62109375, "learning_rate": 0.0002126940517330175, "loss": 2.6565, "step": 1293 }, { "epoch": 0.36935400067345026, "grad_norm": 0.58984375, "learning_rate": 0.00021257058191590354, "loss": 2.6622, "step": 1294 }, { "epoch": 0.36963943653177594, "grad_norm": 0.59765625, "learning_rate": 0.00021244706075218472, "loss": 2.6498, "step": 1295 }, { "epoch": 0.36992487239010163, "grad_norm": 0.72265625, "learning_rate": 0.00021232348834322495, "loss": 2.6525, "step": 1296 }, { "epoch": 0.3702103082484273, "grad_norm": 0.75, "learning_rate": 0.00021219986479043001, "loss": 2.6365, "step": 1297 }, { "epoch": 0.370495744106753, "grad_norm": 0.6484375, "learning_rate": 0.00021207619019524777, "loss": 2.6502, "step": 1298 }, { "epoch": 0.3707811799650787, "grad_norm": 0.5859375, "learning_rate": 0.00021195246465916792, "loss": 2.6183, "step": 1299 }, { "epoch": 0.3710666158234044, "grad_norm": 0.52734375, "learning_rate": 0.00021182868828372196, "loss": 2.6646, "step": 1300 }, { "epoch": 0.3713520516817301, "grad_norm": 0.6484375, "learning_rate": 0.00021170486117048315, "loss": 2.6203, "step": 1301 }, { "epoch": 0.37163748754005577, "grad_norm": 0.62109375, "learning_rate": 0.0002115809834210664, "loss": 2.625, "step": 1302 }, { "epoch": 0.3719229233983815, "grad_norm": 0.6171875, "learning_rate": 0.0002114570551371281, "loss": 2.671, "step": 1303 }, { "epoch": 0.3722083592567072, "grad_norm": 0.54296875, "learning_rate": 0.00021133307642036615, "loss": 2.6239, "step": 1304 }, { "epoch": 0.3724937951150329, "grad_norm": 0.64453125, "learning_rate": 0.0002112090473725198, "loss": 2.643, "step": 1305 }, { "epoch": 0.3727792309733586, "grad_norm": 0.5390625, "learning_rate": 0.00021108496809536974, "loss": 2.627, "step": 1306 }, { "epoch": 0.37306466683168427, "grad_norm": 0.56640625, "learning_rate": 0.00021096083869073765, "loss": 2.6038, "step": 1307 }, { "epoch": 0.37335010269000996, "grad_norm": 0.640625, "learning_rate": 0.0002108366592604866, "loss": 2.6223, "step": 1308 }, { "epoch": 0.37363553854833564, "grad_norm": 0.7109375, "learning_rate": 0.00021071242990652043, "loss": 2.6492, "step": 1309 }, { "epoch": 0.37392097440666133, "grad_norm": 0.625, "learning_rate": 0.00021058815073078422, "loss": 2.6534, "step": 1310 }, { "epoch": 0.3742064102649871, "grad_norm": 0.58984375, "learning_rate": 0.00021046382183526378, "loss": 2.6197, "step": 1311 }, { "epoch": 0.37449184612331277, "grad_norm": 0.7265625, "learning_rate": 0.0002103394433219858, "loss": 2.632, "step": 1312 }, { "epoch": 0.37477728198163845, "grad_norm": 0.59375, "learning_rate": 0.00021021501529301756, "loss": 2.639, "step": 1313 }, { "epoch": 0.37506271783996414, "grad_norm": 0.63671875, "learning_rate": 0.00021009053785046706, "loss": 2.6138, "step": 1314 }, { "epoch": 0.37534815369828983, "grad_norm": 0.61328125, "learning_rate": 0.0002099660110964829, "loss": 2.647, "step": 1315 }, { "epoch": 0.3756335895566155, "grad_norm": 0.60546875, "learning_rate": 0.00020984143513325416, "loss": 2.6299, "step": 1316 }, { "epoch": 0.3759190254149412, "grad_norm": 0.55078125, "learning_rate": 0.0002097168100630101, "loss": 2.6422, "step": 1317 }, { "epoch": 0.3762044612732669, "grad_norm": 0.62109375, "learning_rate": 0.0002095921359880204, "loss": 2.6092, "step": 1318 }, { "epoch": 0.37648989713159264, "grad_norm": 0.609375, "learning_rate": 0.00020946741301059514, "loss": 2.6118, "step": 1319 }, { "epoch": 0.37677533298991833, "grad_norm": 0.64453125, "learning_rate": 0.0002093426412330842, "loss": 2.6348, "step": 1320 }, { "epoch": 0.377060768848244, "grad_norm": 0.671875, "learning_rate": 0.00020921782075787777, "loss": 2.6552, "step": 1321 }, { "epoch": 0.3773462047065697, "grad_norm": 0.58203125, "learning_rate": 0.00020909295168740577, "loss": 2.6427, "step": 1322 }, { "epoch": 0.3776316405648954, "grad_norm": 0.5625, "learning_rate": 0.00020896803412413824, "loss": 2.626, "step": 1323 }, { "epoch": 0.3779170764232211, "grad_norm": 0.59375, "learning_rate": 0.00020884306817058482, "loss": 2.6509, "step": 1324 }, { "epoch": 0.3782025122815468, "grad_norm": 0.58984375, "learning_rate": 0.00020871805392929502, "loss": 2.6215, "step": 1325 }, { "epoch": 0.37848794813987247, "grad_norm": 0.55859375, "learning_rate": 0.00020859299150285786, "loss": 2.6605, "step": 1326 }, { "epoch": 0.3787733839981982, "grad_norm": 0.625, "learning_rate": 0.00020846788099390188, "loss": 2.6488, "step": 1327 }, { "epoch": 0.3790588198565239, "grad_norm": 0.5703125, "learning_rate": 0.00020834272250509523, "loss": 2.6607, "step": 1328 }, { "epoch": 0.3793442557148496, "grad_norm": 0.6015625, "learning_rate": 0.00020821751613914525, "loss": 2.6426, "step": 1329 }, { "epoch": 0.3796296915731753, "grad_norm": 0.57421875, "learning_rate": 0.0002080922619987987, "loss": 2.6458, "step": 1330 }, { "epoch": 0.37991512743150097, "grad_norm": 0.57421875, "learning_rate": 0.00020796696018684152, "loss": 2.6278, "step": 1331 }, { "epoch": 0.38020056328982665, "grad_norm": 0.54296875, "learning_rate": 0.00020784161080609868, "loss": 2.6603, "step": 1332 }, { "epoch": 0.38048599914815234, "grad_norm": 0.6015625, "learning_rate": 0.00020771621395943436, "loss": 2.6395, "step": 1333 }, { "epoch": 0.38077143500647803, "grad_norm": 0.55859375, "learning_rate": 0.00020759076974975144, "loss": 2.6346, "step": 1334 }, { "epoch": 0.3810568708648038, "grad_norm": 0.58203125, "learning_rate": 0.00020746527827999195, "loss": 2.6412, "step": 1335 }, { "epoch": 0.38134230672312946, "grad_norm": 0.6015625, "learning_rate": 0.00020733973965313655, "loss": 2.6311, "step": 1336 }, { "epoch": 0.38162774258145515, "grad_norm": 0.57421875, "learning_rate": 0.0002072141539722046, "loss": 2.6174, "step": 1337 }, { "epoch": 0.38191317843978084, "grad_norm": 0.5625, "learning_rate": 0.00020708852134025397, "loss": 2.6192, "step": 1338 }, { "epoch": 0.38219861429810653, "grad_norm": 0.5703125, "learning_rate": 0.0002069628418603814, "loss": 2.6467, "step": 1339 }, { "epoch": 0.3824840501564322, "grad_norm": 0.57421875, "learning_rate": 0.00020683711563572167, "loss": 2.6369, "step": 1340 }, { "epoch": 0.3827694860147579, "grad_norm": 0.55078125, "learning_rate": 0.00020671134276944815, "loss": 2.6372, "step": 1341 }, { "epoch": 0.3830549218730836, "grad_norm": 0.53515625, "learning_rate": 0.0002065855233647725, "loss": 2.6436, "step": 1342 }, { "epoch": 0.38334035773140934, "grad_norm": 0.58203125, "learning_rate": 0.00020645965752494444, "loss": 2.6342, "step": 1343 }, { "epoch": 0.38362579358973503, "grad_norm": 0.60546875, "learning_rate": 0.0002063337453532519, "loss": 2.637, "step": 1344 }, { "epoch": 0.3839112294480607, "grad_norm": 0.58203125, "learning_rate": 0.0002062077869530207, "loss": 2.6444, "step": 1345 }, { "epoch": 0.3841966653063864, "grad_norm": 0.54296875, "learning_rate": 0.00020608178242761483, "loss": 2.6339, "step": 1346 }, { "epoch": 0.3844821011647121, "grad_norm": 0.5703125, "learning_rate": 0.00020595573188043594, "loss": 2.6422, "step": 1347 }, { "epoch": 0.3847675370230378, "grad_norm": 0.6484375, "learning_rate": 0.00020582963541492343, "loss": 2.6472, "step": 1348 }, { "epoch": 0.3850529728813635, "grad_norm": 0.65625, "learning_rate": 0.00020570349313455452, "loss": 2.6081, "step": 1349 }, { "epoch": 0.38533840873968916, "grad_norm": 0.55078125, "learning_rate": 0.00020557730514284396, "loss": 2.6214, "step": 1350 }, { "epoch": 0.3856238445980149, "grad_norm": 0.53515625, "learning_rate": 0.00020545107154334397, "loss": 2.6263, "step": 1351 }, { "epoch": 0.3859092804563406, "grad_norm": 0.6328125, "learning_rate": 0.0002053247924396442, "loss": 2.6092, "step": 1352 }, { "epoch": 0.3861947163146663, "grad_norm": 0.5625, "learning_rate": 0.0002051984679353718, "loss": 2.6329, "step": 1353 }, { "epoch": 0.386480152172992, "grad_norm": 0.5546875, "learning_rate": 0.0002050720981341909, "loss": 2.6087, "step": 1354 }, { "epoch": 0.38676558803131766, "grad_norm": 0.70703125, "learning_rate": 0.00020494568313980305, "loss": 2.6249, "step": 1355 }, { "epoch": 0.38705102388964335, "grad_norm": 0.7578125, "learning_rate": 0.00020481922305594678, "loss": 2.6385, "step": 1356 }, { "epoch": 0.38733645974796904, "grad_norm": 0.70703125, "learning_rate": 0.0002046927179863976, "loss": 2.632, "step": 1357 }, { "epoch": 0.38762189560629473, "grad_norm": 0.54296875, "learning_rate": 0.00020456616803496796, "loss": 2.642, "step": 1358 }, { "epoch": 0.3879073314646205, "grad_norm": 0.7109375, "learning_rate": 0.00020443957330550718, "loss": 2.6268, "step": 1359 }, { "epoch": 0.38819276732294616, "grad_norm": 0.6640625, "learning_rate": 0.0002043129339019013, "loss": 2.6379, "step": 1360 }, { "epoch": 0.38847820318127185, "grad_norm": 0.51953125, "learning_rate": 0.00020418624992807295, "loss": 2.6577, "step": 1361 }, { "epoch": 0.38876363903959754, "grad_norm": 0.67578125, "learning_rate": 0.00020405952148798144, "loss": 2.6331, "step": 1362 }, { "epoch": 0.38904907489792323, "grad_norm": 0.55078125, "learning_rate": 0.00020393274868562254, "loss": 2.6376, "step": 1363 }, { "epoch": 0.3893345107562489, "grad_norm": 0.609375, "learning_rate": 0.00020380593162502844, "loss": 2.6041, "step": 1364 }, { "epoch": 0.3896199466145746, "grad_norm": 0.6796875, "learning_rate": 0.00020367907041026755, "loss": 2.6439, "step": 1365 }, { "epoch": 0.3899053824729003, "grad_norm": 0.5625, "learning_rate": 0.00020355216514544462, "loss": 2.6405, "step": 1366 }, { "epoch": 0.39019081833122604, "grad_norm": 0.56640625, "learning_rate": 0.0002034252159347005, "loss": 2.6451, "step": 1367 }, { "epoch": 0.39047625418955173, "grad_norm": 0.60546875, "learning_rate": 0.00020329822288221218, "loss": 2.637, "step": 1368 }, { "epoch": 0.3907616900478774, "grad_norm": 0.578125, "learning_rate": 0.00020317118609219253, "loss": 2.5896, "step": 1369 }, { "epoch": 0.3910471259062031, "grad_norm": 0.59375, "learning_rate": 0.00020304410566889027, "loss": 2.641, "step": 1370 }, { "epoch": 0.3913325617645288, "grad_norm": 0.62890625, "learning_rate": 0.0002029169817165901, "loss": 2.6245, "step": 1371 }, { "epoch": 0.3916179976228545, "grad_norm": 0.56640625, "learning_rate": 0.0002027898143396123, "loss": 2.6347, "step": 1372 }, { "epoch": 0.3919034334811802, "grad_norm": 0.56640625, "learning_rate": 0.00020266260364231286, "loss": 2.6158, "step": 1373 }, { "epoch": 0.39218886933950586, "grad_norm": 0.62890625, "learning_rate": 0.00020253534972908326, "loss": 2.6349, "step": 1374 }, { "epoch": 0.39247430519783155, "grad_norm": 0.7421875, "learning_rate": 0.00020240805270435044, "loss": 2.6329, "step": 1375 }, { "epoch": 0.3927597410561573, "grad_norm": 0.6875, "learning_rate": 0.00020228071267257687, "loss": 2.6633, "step": 1376 }, { "epoch": 0.393045176914483, "grad_norm": 0.703125, "learning_rate": 0.00020215332973826003, "loss": 2.6117, "step": 1377 }, { "epoch": 0.3933306127728087, "grad_norm": 0.91796875, "learning_rate": 0.00020202590400593285, "loss": 2.6286, "step": 1378 }, { "epoch": 0.39361604863113436, "grad_norm": 0.87109375, "learning_rate": 0.00020189843558016338, "loss": 2.6105, "step": 1379 }, { "epoch": 0.39390148448946005, "grad_norm": 0.98828125, "learning_rate": 0.0002017709245655545, "loss": 2.6128, "step": 1380 }, { "epoch": 0.39418692034778574, "grad_norm": 0.82421875, "learning_rate": 0.00020164337106674417, "loss": 2.6243, "step": 1381 }, { "epoch": 0.39447235620611143, "grad_norm": 0.69140625, "learning_rate": 0.0002015157751884053, "loss": 2.6557, "step": 1382 }, { "epoch": 0.3947577920644371, "grad_norm": 0.8984375, "learning_rate": 0.0002013881370352454, "loss": 2.624, "step": 1383 }, { "epoch": 0.39504322792276286, "grad_norm": 0.76171875, "learning_rate": 0.00020126045671200682, "loss": 2.6279, "step": 1384 }, { "epoch": 0.39532866378108855, "grad_norm": 0.5859375, "learning_rate": 0.00020113273432346632, "loss": 2.6363, "step": 1385 }, { "epoch": 0.39561409963941424, "grad_norm": 0.79296875, "learning_rate": 0.00020100496997443553, "loss": 2.6274, "step": 1386 }, { "epoch": 0.39589953549773993, "grad_norm": 0.6484375, "learning_rate": 0.00020087716376976014, "loss": 2.6191, "step": 1387 }, { "epoch": 0.3961849713560656, "grad_norm": 0.640625, "learning_rate": 0.00020074931581432035, "loss": 2.6355, "step": 1388 }, { "epoch": 0.3964704072143913, "grad_norm": 0.6875, "learning_rate": 0.0002006214262130307, "loss": 2.6386, "step": 1389 }, { "epoch": 0.396755843072717, "grad_norm": 0.6171875, "learning_rate": 0.0002004934950708397, "loss": 2.6345, "step": 1390 }, { "epoch": 0.3970412789310427, "grad_norm": 0.55078125, "learning_rate": 0.00020036552249273014, "loss": 2.6081, "step": 1391 }, { "epoch": 0.39732671478936843, "grad_norm": 0.65625, "learning_rate": 0.00020023750858371876, "loss": 2.6243, "step": 1392 }, { "epoch": 0.3976121506476941, "grad_norm": 0.55078125, "learning_rate": 0.00020010945344885615, "loss": 2.6405, "step": 1393 }, { "epoch": 0.3978975865060198, "grad_norm": 0.61328125, "learning_rate": 0.0001999813571932268, "loss": 2.5995, "step": 1394 }, { "epoch": 0.3981830223643455, "grad_norm": 0.5859375, "learning_rate": 0.00019985321992194892, "loss": 2.6225, "step": 1395 }, { "epoch": 0.3984684582226712, "grad_norm": 0.57421875, "learning_rate": 0.00019972504174017446, "loss": 2.6077, "step": 1396 }, { "epoch": 0.3987538940809969, "grad_norm": 0.59765625, "learning_rate": 0.00019959682275308869, "loss": 2.6165, "step": 1397 }, { "epoch": 0.39903932993932256, "grad_norm": 0.66796875, "learning_rate": 0.0001994685630659107, "loss": 2.601, "step": 1398 }, { "epoch": 0.39932476579764825, "grad_norm": 0.6171875, "learning_rate": 0.00019934026278389274, "loss": 2.6332, "step": 1399 }, { "epoch": 0.399610201655974, "grad_norm": 0.6015625, "learning_rate": 0.00019921192201232047, "loss": 2.6224, "step": 1400 }, { "epoch": 0.3998956375142997, "grad_norm": 0.58203125, "learning_rate": 0.0001990835408565127, "loss": 2.5961, "step": 1401 }, { "epoch": 0.4001810733726254, "grad_norm": 0.56640625, "learning_rate": 0.0001989551194218216, "loss": 2.6291, "step": 1402 }, { "epoch": 0.40046650923095106, "grad_norm": 0.56640625, "learning_rate": 0.00019882665781363208, "loss": 2.6164, "step": 1403 }, { "epoch": 0.40075194508927675, "grad_norm": 0.6015625, "learning_rate": 0.00019869815613736224, "loss": 2.6452, "step": 1404 }, { "epoch": 0.40103738094760244, "grad_norm": 0.6015625, "learning_rate": 0.00019856961449846294, "loss": 2.6502, "step": 1405 }, { "epoch": 0.40132281680592813, "grad_norm": 0.62109375, "learning_rate": 0.0001984410330024179, "loss": 2.6174, "step": 1406 }, { "epoch": 0.4016082526642538, "grad_norm": 0.56640625, "learning_rate": 0.0001983124117547436, "loss": 2.5982, "step": 1407 }, { "epoch": 0.40189368852257956, "grad_norm": 0.5625, "learning_rate": 0.00019818375086098897, "loss": 2.5949, "step": 1408 }, { "epoch": 0.40217912438090525, "grad_norm": 1.015625, "learning_rate": 0.00019805505042673564, "loss": 2.6337, "step": 1409 }, { "epoch": 0.40246456023923094, "grad_norm": 0.64453125, "learning_rate": 0.00019792631055759764, "loss": 2.6204, "step": 1410 }, { "epoch": 0.40274999609755663, "grad_norm": 0.61328125, "learning_rate": 0.00019779753135922126, "loss": 2.6416, "step": 1411 }, { "epoch": 0.4030354319558823, "grad_norm": 0.60546875, "learning_rate": 0.00019766871293728524, "loss": 2.6037, "step": 1412 }, { "epoch": 0.403320867814208, "grad_norm": 0.63671875, "learning_rate": 0.00019753985539750036, "loss": 2.6191, "step": 1413 }, { "epoch": 0.4036063036725337, "grad_norm": 0.578125, "learning_rate": 0.00019741095884560957, "loss": 2.6103, "step": 1414 }, { "epoch": 0.4038917395308594, "grad_norm": 0.57421875, "learning_rate": 0.00019728202338738785, "loss": 2.6346, "step": 1415 }, { "epoch": 0.40417717538918513, "grad_norm": 0.6796875, "learning_rate": 0.0001971530491286421, "loss": 2.6142, "step": 1416 }, { "epoch": 0.4044626112475108, "grad_norm": 0.51953125, "learning_rate": 0.00019702403617521093, "loss": 2.612, "step": 1417 }, { "epoch": 0.4047480471058365, "grad_norm": 0.625, "learning_rate": 0.00019689498463296487, "loss": 2.6237, "step": 1418 }, { "epoch": 0.4050334829641622, "grad_norm": 0.5625, "learning_rate": 0.00019676589460780616, "loss": 2.6104, "step": 1419 }, { "epoch": 0.4053189188224879, "grad_norm": 0.5703125, "learning_rate": 0.00019663676620566836, "loss": 2.6246, "step": 1420 }, { "epoch": 0.4056043546808136, "grad_norm": 0.6328125, "learning_rate": 0.00019650759953251677, "loss": 2.6212, "step": 1421 }, { "epoch": 0.40588979053913926, "grad_norm": 0.578125, "learning_rate": 0.00019637839469434804, "loss": 2.6268, "step": 1422 }, { "epoch": 0.40617522639746495, "grad_norm": 0.578125, "learning_rate": 0.00019624915179719004, "loss": 2.6045, "step": 1423 }, { "epoch": 0.4064606622557907, "grad_norm": 1.125, "learning_rate": 0.00019611987094710192, "loss": 2.5961, "step": 1424 }, { "epoch": 0.4067460981141164, "grad_norm": 0.5625, "learning_rate": 0.00019599055225017408, "loss": 2.5987, "step": 1425 }, { "epoch": 0.40703153397244207, "grad_norm": 0.6328125, "learning_rate": 0.00019586119581252781, "loss": 2.6394, "step": 1426 }, { "epoch": 0.40731696983076776, "grad_norm": 0.5859375, "learning_rate": 0.00019573180174031556, "loss": 2.5998, "step": 1427 }, { "epoch": 0.40760240568909345, "grad_norm": 0.6015625, "learning_rate": 0.00019560237013972046, "loss": 2.6149, "step": 1428 }, { "epoch": 0.40788784154741914, "grad_norm": 0.5859375, "learning_rate": 0.0001954729011169565, "loss": 2.6389, "step": 1429 }, { "epoch": 0.4081732774057448, "grad_norm": 0.59375, "learning_rate": 0.00019534339477826854, "loss": 2.6498, "step": 1430 }, { "epoch": 0.4084587132640705, "grad_norm": 0.60546875, "learning_rate": 0.00019521385122993185, "loss": 2.6256, "step": 1431 }, { "epoch": 0.40874414912239626, "grad_norm": 0.66015625, "learning_rate": 0.00019508427057825237, "loss": 2.614, "step": 1432 }, { "epoch": 0.40902958498072195, "grad_norm": 0.58203125, "learning_rate": 0.0001949546529295664, "loss": 2.5803, "step": 1433 }, { "epoch": 0.40931502083904764, "grad_norm": 0.625, "learning_rate": 0.00019482499839024062, "loss": 2.6267, "step": 1434 }, { "epoch": 0.4096004566973733, "grad_norm": 0.58203125, "learning_rate": 0.00019469530706667205, "loss": 2.627, "step": 1435 }, { "epoch": 0.409885892555699, "grad_norm": 0.5859375, "learning_rate": 0.0001945655790652878, "loss": 2.6262, "step": 1436 }, { "epoch": 0.4101713284140247, "grad_norm": 0.62890625, "learning_rate": 0.00019443581449254515, "loss": 2.6189, "step": 1437 }, { "epoch": 0.4104567642723504, "grad_norm": 0.55078125, "learning_rate": 0.00019430601345493136, "loss": 2.6023, "step": 1438 }, { "epoch": 0.4107422001306761, "grad_norm": 0.58203125, "learning_rate": 0.0001941761760589637, "loss": 2.6085, "step": 1439 }, { "epoch": 0.4110276359890018, "grad_norm": 0.5625, "learning_rate": 0.00019404630241118902, "loss": 2.6117, "step": 1440 }, { "epoch": 0.4113130718473275, "grad_norm": 0.58203125, "learning_rate": 0.00019391639261818428, "loss": 2.6289, "step": 1441 }, { "epoch": 0.4115985077056532, "grad_norm": 0.55859375, "learning_rate": 0.00019378644678655582, "loss": 2.6221, "step": 1442 }, { "epoch": 0.4118839435639789, "grad_norm": 0.5546875, "learning_rate": 0.00019365646502293962, "loss": 2.6028, "step": 1443 }, { "epoch": 0.4121693794223046, "grad_norm": 0.5546875, "learning_rate": 0.00019352644743400124, "loss": 2.599, "step": 1444 }, { "epoch": 0.41245481528063027, "grad_norm": 0.68359375, "learning_rate": 0.0001933963941264356, "loss": 2.6002, "step": 1445 }, { "epoch": 0.41274025113895596, "grad_norm": 0.53125, "learning_rate": 0.0001932663052069668, "loss": 2.6078, "step": 1446 }, { "epoch": 0.41302568699728165, "grad_norm": 0.62109375, "learning_rate": 0.00019313618078234843, "loss": 2.6375, "step": 1447 }, { "epoch": 0.41331112285560734, "grad_norm": 0.625, "learning_rate": 0.00019300602095936287, "loss": 2.6145, "step": 1448 }, { "epoch": 0.4135965587139331, "grad_norm": 1.578125, "learning_rate": 0.00019287582584482193, "loss": 2.6075, "step": 1449 }, { "epoch": 0.41388199457225877, "grad_norm": 1.3671875, "learning_rate": 0.00019274559554556604, "loss": 2.5988, "step": 1450 }, { "epoch": 0.41416743043058446, "grad_norm": 1.0, "learning_rate": 0.00019261533016846468, "loss": 2.6142, "step": 1451 }, { "epoch": 0.41445286628891015, "grad_norm": 0.65234375, "learning_rate": 0.00019248502982041613, "loss": 2.5849, "step": 1452 }, { "epoch": 0.41473830214723584, "grad_norm": 0.703125, "learning_rate": 0.00019235469460834732, "loss": 2.6181, "step": 1453 }, { "epoch": 0.4150237380055615, "grad_norm": 0.61328125, "learning_rate": 0.00019222432463921374, "loss": 2.5999, "step": 1454 }, { "epoch": 0.4153091738638872, "grad_norm": 0.75, "learning_rate": 0.0001920939200199995, "loss": 2.6166, "step": 1455 }, { "epoch": 0.4155946097222129, "grad_norm": 0.78515625, "learning_rate": 0.00019196348085771713, "loss": 2.6053, "step": 1456 }, { "epoch": 0.41588004558053865, "grad_norm": 0.69140625, "learning_rate": 0.0001918330072594074, "loss": 2.6113, "step": 1457 }, { "epoch": 0.41616548143886434, "grad_norm": 0.6484375, "learning_rate": 0.00019170249933213947, "loss": 2.6028, "step": 1458 }, { "epoch": 0.41645091729719, "grad_norm": 0.765625, "learning_rate": 0.00019157195718301067, "loss": 2.6048, "step": 1459 }, { "epoch": 0.4167363531555157, "grad_norm": 0.7421875, "learning_rate": 0.00019144138091914617, "loss": 2.6143, "step": 1460 }, { "epoch": 0.4170217890138414, "grad_norm": 0.5859375, "learning_rate": 0.00019131077064769953, "loss": 2.6159, "step": 1461 }, { "epoch": 0.4173072248721671, "grad_norm": 0.76171875, "learning_rate": 0.00019118012647585192, "loss": 2.5989, "step": 1462 }, { "epoch": 0.4175926607304928, "grad_norm": 0.7421875, "learning_rate": 0.00019104944851081244, "loss": 2.6203, "step": 1463 }, { "epoch": 0.41787809658881847, "grad_norm": 0.6015625, "learning_rate": 0.00019091873685981786, "loss": 2.596, "step": 1464 }, { "epoch": 0.4181635324471442, "grad_norm": 0.875, "learning_rate": 0.00019078799163013273, "loss": 2.5961, "step": 1465 }, { "epoch": 0.4184489683054699, "grad_norm": 0.8125, "learning_rate": 0.000190657212929049, "loss": 2.6254, "step": 1466 }, { "epoch": 0.4187344041637956, "grad_norm": 0.57421875, "learning_rate": 0.0001905264008638861, "loss": 2.616, "step": 1467 }, { "epoch": 0.4190198400221213, "grad_norm": 0.671875, "learning_rate": 0.00019039555554199099, "loss": 2.635, "step": 1468 }, { "epoch": 0.41930527588044697, "grad_norm": 0.609375, "learning_rate": 0.0001902646770707378, "loss": 2.5834, "step": 1469 }, { "epoch": 0.41959071173877266, "grad_norm": 0.58203125, "learning_rate": 0.00019013376555752782, "loss": 2.61, "step": 1470 }, { "epoch": 0.41987614759709835, "grad_norm": 0.6015625, "learning_rate": 0.00019000282110978958, "loss": 2.6072, "step": 1471 }, { "epoch": 0.42016158345542404, "grad_norm": 0.578125, "learning_rate": 0.00018987184383497855, "loss": 2.5803, "step": 1472 }, { "epoch": 0.4204470193137498, "grad_norm": 0.5546875, "learning_rate": 0.00018974083384057713, "loss": 2.639, "step": 1473 }, { "epoch": 0.42073245517207547, "grad_norm": 0.64453125, "learning_rate": 0.00018960979123409466, "loss": 2.5955, "step": 1474 }, { "epoch": 0.42101789103040116, "grad_norm": 0.5234375, "learning_rate": 0.0001894787161230672, "loss": 2.6356, "step": 1475 }, { "epoch": 0.42130332688872685, "grad_norm": 0.578125, "learning_rate": 0.0001893476086150574, "loss": 2.6224, "step": 1476 }, { "epoch": 0.42158876274705254, "grad_norm": 0.62109375, "learning_rate": 0.00018921646881765456, "loss": 2.6103, "step": 1477 }, { "epoch": 0.4218741986053782, "grad_norm": 0.578125, "learning_rate": 0.0001890852968384746, "loss": 2.6162, "step": 1478 }, { "epoch": 0.4221596344637039, "grad_norm": 0.52734375, "learning_rate": 0.0001889540927851596, "loss": 2.628, "step": 1479 }, { "epoch": 0.4224450703220296, "grad_norm": 0.65625, "learning_rate": 0.0001888228567653781, "loss": 2.6217, "step": 1480 }, { "epoch": 0.42273050618035535, "grad_norm": 0.52734375, "learning_rate": 0.00018869158888682494, "loss": 2.613, "step": 1481 }, { "epoch": 0.42301594203868104, "grad_norm": 0.5703125, "learning_rate": 0.00018856028925722104, "loss": 2.608, "step": 1482 }, { "epoch": 0.4233013778970067, "grad_norm": 0.57421875, "learning_rate": 0.00018842895798431327, "loss": 2.6083, "step": 1483 }, { "epoch": 0.4235868137553324, "grad_norm": 0.51171875, "learning_rate": 0.00018829759517587457, "loss": 2.6065, "step": 1484 }, { "epoch": 0.4238722496136581, "grad_norm": 0.62890625, "learning_rate": 0.00018816620093970387, "loss": 2.6158, "step": 1485 }, { "epoch": 0.4241576854719838, "grad_norm": 0.625, "learning_rate": 0.00018803477538362562, "loss": 2.628, "step": 1486 }, { "epoch": 0.4244431213303095, "grad_norm": 0.52734375, "learning_rate": 0.00018790331861549023, "loss": 2.6095, "step": 1487 }, { "epoch": 0.42472855718863517, "grad_norm": 0.58984375, "learning_rate": 0.00018777183074317349, "loss": 2.5987, "step": 1488 }, { "epoch": 0.4250139930469609, "grad_norm": 0.5625, "learning_rate": 0.000187640311874577, "loss": 2.5805, "step": 1489 }, { "epoch": 0.4252994289052866, "grad_norm": 0.515625, "learning_rate": 0.00018750876211762752, "loss": 2.6163, "step": 1490 }, { "epoch": 0.4255848647636123, "grad_norm": 0.53515625, "learning_rate": 0.00018737718158027734, "loss": 2.596, "step": 1491 }, { "epoch": 0.425870300621938, "grad_norm": 0.54296875, "learning_rate": 0.00018724557037050384, "loss": 2.6397, "step": 1492 }, { "epoch": 0.42615573648026367, "grad_norm": 0.53125, "learning_rate": 0.0001871139285963098, "loss": 2.6378, "step": 1493 }, { "epoch": 0.42644117233858936, "grad_norm": 0.546875, "learning_rate": 0.00018698225636572285, "loss": 2.6063, "step": 1494 }, { "epoch": 0.42672660819691505, "grad_norm": 0.5234375, "learning_rate": 0.0001868505537867958, "loss": 2.6003, "step": 1495 }, { "epoch": 0.42701204405524074, "grad_norm": 0.58984375, "learning_rate": 0.00018671882096760623, "loss": 2.595, "step": 1496 }, { "epoch": 0.4272974799135665, "grad_norm": 0.5546875, "learning_rate": 0.00018658705801625656, "loss": 2.5969, "step": 1497 }, { "epoch": 0.42758291577189217, "grad_norm": 0.515625, "learning_rate": 0.00018645526504087402, "loss": 2.6158, "step": 1498 }, { "epoch": 0.42786835163021786, "grad_norm": 0.5546875, "learning_rate": 0.00018632344214961045, "loss": 2.6027, "step": 1499 }, { "epoch": 0.42815378748854355, "grad_norm": 0.53515625, "learning_rate": 0.0001861915894506421, "loss": 2.6258, "step": 1500 }, { "epoch": 0.42815378748854355, "eval_loss": 2.498450517654419, "eval_runtime": 5960.8882, "eval_samples_per_second": 10.785, "eval_steps_per_second": 10.785, "step": 1500 }, { "epoch": 0.42843922334686924, "grad_norm": 0.578125, "learning_rate": 0.00018605970705216988, "loss": 2.5927, "step": 1501 }, { "epoch": 0.4287246592051949, "grad_norm": 0.51171875, "learning_rate": 0.00018592779506241902, "loss": 2.5965, "step": 1502 }, { "epoch": 0.4290100950635206, "grad_norm": 0.5625, "learning_rate": 0.00018579585358963885, "loss": 2.6102, "step": 1503 }, { "epoch": 0.4292955309218463, "grad_norm": 0.5390625, "learning_rate": 0.00018566388274210316, "loss": 2.5903, "step": 1504 }, { "epoch": 0.42958096678017205, "grad_norm": 0.515625, "learning_rate": 0.00018553188262810974, "loss": 2.6056, "step": 1505 }, { "epoch": 0.42986640263849774, "grad_norm": 0.56640625, "learning_rate": 0.00018539985335598033, "loss": 2.6157, "step": 1506 }, { "epoch": 0.4301518384968234, "grad_norm": 0.53125, "learning_rate": 0.00018526779503406059, "loss": 2.5769, "step": 1507 }, { "epoch": 0.4304372743551491, "grad_norm": 0.55078125, "learning_rate": 0.00018513570777072024, "loss": 2.6171, "step": 1508 }, { "epoch": 0.4307227102134748, "grad_norm": 0.52734375, "learning_rate": 0.0001850035916743525, "loss": 2.5859, "step": 1509 }, { "epoch": 0.4310081460718005, "grad_norm": 0.52734375, "learning_rate": 0.00018487144685337432, "loss": 2.5976, "step": 1510 }, { "epoch": 0.4312935819301262, "grad_norm": 0.5390625, "learning_rate": 0.00018473927341622627, "loss": 2.6144, "step": 1511 }, { "epoch": 0.43157901778845187, "grad_norm": 0.53125, "learning_rate": 0.0001846070714713724, "loss": 2.6233, "step": 1512 }, { "epoch": 0.4318644536467776, "grad_norm": 0.5390625, "learning_rate": 0.0001844748411273001, "loss": 2.6009, "step": 1513 }, { "epoch": 0.4321498895051033, "grad_norm": 0.578125, "learning_rate": 0.00018434258249252008, "loss": 2.6117, "step": 1514 }, { "epoch": 0.432435325363429, "grad_norm": 0.50390625, "learning_rate": 0.00018421029567556633, "loss": 2.6089, "step": 1515 }, { "epoch": 0.4327207612217547, "grad_norm": 0.5390625, "learning_rate": 0.00018407798078499588, "loss": 2.5967, "step": 1516 }, { "epoch": 0.43300619708008037, "grad_norm": 0.515625, "learning_rate": 0.0001839456379293889, "loss": 2.6026, "step": 1517 }, { "epoch": 0.43329163293840606, "grad_norm": 0.515625, "learning_rate": 0.00018381326721734833, "loss": 2.6104, "step": 1518 }, { "epoch": 0.43357706879673175, "grad_norm": 0.51953125, "learning_rate": 0.00018368086875750013, "loss": 2.6096, "step": 1519 }, { "epoch": 0.43386250465505743, "grad_norm": 0.486328125, "learning_rate": 0.00018354844265849307, "loss": 2.6035, "step": 1520 }, { "epoch": 0.4341479405133831, "grad_norm": 0.5234375, "learning_rate": 0.0001834159890289984, "loss": 2.6119, "step": 1521 }, { "epoch": 0.43443337637170887, "grad_norm": 0.4921875, "learning_rate": 0.00018328350797771018, "loss": 2.6295, "step": 1522 }, { "epoch": 0.43471881223003456, "grad_norm": 0.515625, "learning_rate": 0.0001831509996133447, "loss": 2.5938, "step": 1523 }, { "epoch": 0.43500424808836025, "grad_norm": 0.50390625, "learning_rate": 0.000183018464044641, "loss": 2.6174, "step": 1524 }, { "epoch": 0.43528968394668593, "grad_norm": 0.486328125, "learning_rate": 0.00018288590138036028, "loss": 2.6166, "step": 1525 }, { "epoch": 0.4355751198050116, "grad_norm": 0.50390625, "learning_rate": 0.00018275331172928587, "loss": 2.6148, "step": 1526 }, { "epoch": 0.4358605556633373, "grad_norm": 0.498046875, "learning_rate": 0.00018262069520022338, "loss": 2.5973, "step": 1527 }, { "epoch": 0.436145991521663, "grad_norm": 0.51953125, "learning_rate": 0.00018248805190200048, "loss": 2.5931, "step": 1528 }, { "epoch": 0.4364314273799887, "grad_norm": 0.51171875, "learning_rate": 0.0001823553819434668, "loss": 2.5844, "step": 1529 }, { "epoch": 0.43671686323831443, "grad_norm": 0.515625, "learning_rate": 0.00018222268543349374, "loss": 2.6187, "step": 1530 }, { "epoch": 0.4370022990966401, "grad_norm": 0.5234375, "learning_rate": 0.00018208996248097458, "loss": 2.5919, "step": 1531 }, { "epoch": 0.4372877349549658, "grad_norm": 0.53125, "learning_rate": 0.00018195721319482438, "loss": 2.6071, "step": 1532 }, { "epoch": 0.4375731708132915, "grad_norm": 0.515625, "learning_rate": 0.00018182443768397963, "loss": 2.6021, "step": 1533 }, { "epoch": 0.4378586066716172, "grad_norm": 0.5546875, "learning_rate": 0.00018169163605739845, "loss": 2.5948, "step": 1534 }, { "epoch": 0.4381440425299429, "grad_norm": 0.53515625, "learning_rate": 0.0001815588084240604, "loss": 2.6145, "step": 1535 }, { "epoch": 0.43842947838826857, "grad_norm": 0.55859375, "learning_rate": 0.0001814259548929663, "loss": 2.5996, "step": 1536 }, { "epoch": 0.43871491424659426, "grad_norm": 0.55078125, "learning_rate": 0.0001812930755731383, "loss": 2.6011, "step": 1537 }, { "epoch": 0.43900035010492, "grad_norm": 0.56640625, "learning_rate": 0.00018116017057361972, "loss": 2.6185, "step": 1538 }, { "epoch": 0.4392857859632457, "grad_norm": 0.609375, "learning_rate": 0.00018102724000347488, "loss": 2.5761, "step": 1539 }, { "epoch": 0.4395712218215714, "grad_norm": 0.51953125, "learning_rate": 0.00018089428397178908, "loss": 2.6193, "step": 1540 }, { "epoch": 0.43985665767989707, "grad_norm": 0.494140625, "learning_rate": 0.0001807613025876687, "loss": 2.6, "step": 1541 }, { "epoch": 0.44014209353822276, "grad_norm": 0.53515625, "learning_rate": 0.00018062829596024067, "loss": 2.5964, "step": 1542 }, { "epoch": 0.44042752939654845, "grad_norm": 0.5234375, "learning_rate": 0.0001804952641986527, "loss": 2.5884, "step": 1543 }, { "epoch": 0.44071296525487413, "grad_norm": 0.5, "learning_rate": 0.00018036220741207332, "loss": 2.5893, "step": 1544 }, { "epoch": 0.4409984011131998, "grad_norm": 0.484375, "learning_rate": 0.0001802291257096914, "loss": 2.5842, "step": 1545 }, { "epoch": 0.44128383697152557, "grad_norm": 0.498046875, "learning_rate": 0.00018009601920071624, "loss": 2.6291, "step": 1546 }, { "epoch": 0.44156927282985126, "grad_norm": 0.470703125, "learning_rate": 0.00017996288799437758, "loss": 2.6153, "step": 1547 }, { "epoch": 0.44185470868817694, "grad_norm": 0.50390625, "learning_rate": 0.00017982973219992548, "loss": 2.5752, "step": 1548 }, { "epoch": 0.44214014454650263, "grad_norm": 0.482421875, "learning_rate": 0.00017969655192663007, "loss": 2.5856, "step": 1549 }, { "epoch": 0.4424255804048283, "grad_norm": 0.48828125, "learning_rate": 0.00017956334728378158, "loss": 2.5989, "step": 1550 }, { "epoch": 0.442711016263154, "grad_norm": 0.5078125, "learning_rate": 0.00017943011838069021, "loss": 2.621, "step": 1551 }, { "epoch": 0.4429964521214797, "grad_norm": 0.51171875, "learning_rate": 0.0001792968653266863, "loss": 2.6003, "step": 1552 }, { "epoch": 0.4432818879798054, "grad_norm": 0.52734375, "learning_rate": 0.00017916358823111972, "loss": 2.6094, "step": 1553 }, { "epoch": 0.44356732383813113, "grad_norm": 0.490234375, "learning_rate": 0.0001790302872033601, "loss": 2.6167, "step": 1554 }, { "epoch": 0.4438527596964568, "grad_norm": 0.494140625, "learning_rate": 0.00017889696235279693, "loss": 2.576, "step": 1555 }, { "epoch": 0.4441381955547825, "grad_norm": 0.478515625, "learning_rate": 0.00017876361378883903, "loss": 2.5914, "step": 1556 }, { "epoch": 0.4444236314131082, "grad_norm": 0.515625, "learning_rate": 0.00017863024162091478, "loss": 2.591, "step": 1557 }, { "epoch": 0.4447090672714339, "grad_norm": 0.482421875, "learning_rate": 0.0001784968459584719, "loss": 2.6002, "step": 1558 }, { "epoch": 0.4449945031297596, "grad_norm": 0.482421875, "learning_rate": 0.00017836342691097742, "loss": 2.5826, "step": 1559 }, { "epoch": 0.44527993898808527, "grad_norm": 0.4921875, "learning_rate": 0.0001782299845879175, "loss": 2.5972, "step": 1560 }, { "epoch": 0.44556537484641096, "grad_norm": 0.48046875, "learning_rate": 0.00017809651909879749, "loss": 2.5984, "step": 1561 }, { "epoch": 0.4458508107047367, "grad_norm": 0.5, "learning_rate": 0.00017796303055314164, "loss": 2.5803, "step": 1562 }, { "epoch": 0.4461362465630624, "grad_norm": 0.515625, "learning_rate": 0.00017782951906049316, "loss": 2.6079, "step": 1563 }, { "epoch": 0.4464216824213881, "grad_norm": 0.486328125, "learning_rate": 0.00017769598473041422, "loss": 2.5998, "step": 1564 }, { "epoch": 0.44670711827971377, "grad_norm": 0.5234375, "learning_rate": 0.00017756242767248557, "loss": 2.5921, "step": 1565 }, { "epoch": 0.44699255413803946, "grad_norm": 0.52734375, "learning_rate": 0.0001774288479963066, "loss": 2.5799, "step": 1566 }, { "epoch": 0.44727798999636514, "grad_norm": 0.55078125, "learning_rate": 0.00017729524581149537, "loss": 2.639, "step": 1567 }, { "epoch": 0.44756342585469083, "grad_norm": 0.52734375, "learning_rate": 0.00017716162122768836, "loss": 2.613, "step": 1568 }, { "epoch": 0.4478488617130165, "grad_norm": 0.5, "learning_rate": 0.0001770279743545405, "loss": 2.6075, "step": 1569 }, { "epoch": 0.44813429757134227, "grad_norm": 0.515625, "learning_rate": 0.00017689430530172482, "loss": 2.5834, "step": 1570 }, { "epoch": 0.44841973342966795, "grad_norm": 0.51171875, "learning_rate": 0.00017676061417893274, "loss": 2.607, "step": 1571 }, { "epoch": 0.44870516928799364, "grad_norm": 0.51171875, "learning_rate": 0.00017662690109587382, "loss": 2.5996, "step": 1572 }, { "epoch": 0.44899060514631933, "grad_norm": 0.51171875, "learning_rate": 0.00017649316616227538, "loss": 2.5941, "step": 1573 }, { "epoch": 0.449276041004645, "grad_norm": 0.5234375, "learning_rate": 0.0001763594094878829, "loss": 2.5961, "step": 1574 }, { "epoch": 0.4495614768629707, "grad_norm": 0.50390625, "learning_rate": 0.00017622563118245972, "loss": 2.5923, "step": 1575 }, { "epoch": 0.4498469127212964, "grad_norm": 0.53515625, "learning_rate": 0.00017609183135578675, "loss": 2.5981, "step": 1576 }, { "epoch": 0.4501323485796221, "grad_norm": 0.5234375, "learning_rate": 0.00017595801011766274, "loss": 2.6039, "step": 1577 }, { "epoch": 0.45041778443794783, "grad_norm": 0.51171875, "learning_rate": 0.00017582416757790388, "loss": 2.587, "step": 1578 }, { "epoch": 0.4507032202962735, "grad_norm": 0.52734375, "learning_rate": 0.0001756903038463439, "loss": 2.5729, "step": 1579 }, { "epoch": 0.4509886561545992, "grad_norm": 0.47265625, "learning_rate": 0.0001755564190328339, "loss": 2.6028, "step": 1580 }, { "epoch": 0.4512740920129249, "grad_norm": 0.53125, "learning_rate": 0.00017542251324724237, "loss": 2.5784, "step": 1581 }, { "epoch": 0.4515595278712506, "grad_norm": 0.50390625, "learning_rate": 0.00017528858659945486, "loss": 2.6228, "step": 1582 }, { "epoch": 0.4518449637295763, "grad_norm": 0.51171875, "learning_rate": 0.00017515463919937413, "loss": 2.6181, "step": 1583 }, { "epoch": 0.45213039958790197, "grad_norm": 0.498046875, "learning_rate": 0.00017502067115691996, "loss": 2.5915, "step": 1584 }, { "epoch": 0.45241583544622765, "grad_norm": 0.462890625, "learning_rate": 0.0001748866825820291, "loss": 2.6104, "step": 1585 }, { "epoch": 0.45270127130455334, "grad_norm": 0.49609375, "learning_rate": 0.00017475267358465504, "loss": 2.5913, "step": 1586 }, { "epoch": 0.4529867071628791, "grad_norm": 0.484375, "learning_rate": 0.00017461864427476814, "loss": 2.6017, "step": 1587 }, { "epoch": 0.4532721430212048, "grad_norm": 0.55078125, "learning_rate": 0.0001744845947623554, "loss": 2.6186, "step": 1588 }, { "epoch": 0.45355757887953047, "grad_norm": 0.5078125, "learning_rate": 0.00017435052515742038, "loss": 2.5961, "step": 1589 }, { "epoch": 0.45384301473785615, "grad_norm": 0.54296875, "learning_rate": 0.00017421643556998312, "loss": 2.5929, "step": 1590 }, { "epoch": 0.45412845059618184, "grad_norm": 0.50390625, "learning_rate": 0.0001740823261100801, "loss": 2.5902, "step": 1591 }, { "epoch": 0.45441388645450753, "grad_norm": 0.5390625, "learning_rate": 0.0001739481968877641, "loss": 2.5817, "step": 1592 }, { "epoch": 0.4546993223128332, "grad_norm": 0.50390625, "learning_rate": 0.00017381404801310404, "loss": 2.5856, "step": 1593 }, { "epoch": 0.4549847581711589, "grad_norm": 0.515625, "learning_rate": 0.00017367987959618505, "loss": 2.5742, "step": 1594 }, { "epoch": 0.45527019402948465, "grad_norm": 0.51953125, "learning_rate": 0.00017354569174710834, "loss": 2.5916, "step": 1595 }, { "epoch": 0.45555562988781034, "grad_norm": 0.515625, "learning_rate": 0.00017341148457599096, "loss": 2.5964, "step": 1596 }, { "epoch": 0.45584106574613603, "grad_norm": 0.5625, "learning_rate": 0.00017327725819296576, "loss": 2.597, "step": 1597 }, { "epoch": 0.4561265016044617, "grad_norm": 0.48046875, "learning_rate": 0.0001731430127081816, "loss": 2.5921, "step": 1598 }, { "epoch": 0.4564119374627874, "grad_norm": 0.5390625, "learning_rate": 0.00017300874823180282, "loss": 2.61, "step": 1599 }, { "epoch": 0.4566973733211131, "grad_norm": 0.5703125, "learning_rate": 0.00017287446487400935, "loss": 2.5985, "step": 1600 }, { "epoch": 0.4569828091794388, "grad_norm": 0.494140625, "learning_rate": 0.00017274016274499665, "loss": 2.6079, "step": 1601 }, { "epoch": 0.4572682450377645, "grad_norm": 0.55078125, "learning_rate": 0.00017260584195497567, "loss": 2.5797, "step": 1602 }, { "epoch": 0.4575536808960902, "grad_norm": 0.494140625, "learning_rate": 0.00017247150261417255, "loss": 2.6106, "step": 1603 }, { "epoch": 0.4578391167544159, "grad_norm": 0.53125, "learning_rate": 0.0001723371448328287, "loss": 2.5846, "step": 1604 }, { "epoch": 0.4581245526127416, "grad_norm": 0.55078125, "learning_rate": 0.00017220276872120072, "loss": 2.5763, "step": 1605 }, { "epoch": 0.4584099884710673, "grad_norm": 0.52734375, "learning_rate": 0.00017206837438956004, "loss": 2.5878, "step": 1606 }, { "epoch": 0.458695424329393, "grad_norm": 0.5390625, "learning_rate": 0.00017193396194819328, "loss": 2.5931, "step": 1607 }, { "epoch": 0.45898086018771866, "grad_norm": 0.5078125, "learning_rate": 0.00017179953150740193, "loss": 2.5835, "step": 1608 }, { "epoch": 0.45926629604604435, "grad_norm": 0.5625, "learning_rate": 0.000171665083177502, "loss": 2.6094, "step": 1609 }, { "epoch": 0.45955173190437004, "grad_norm": 0.515625, "learning_rate": 0.00017153061706882443, "loss": 2.6024, "step": 1610 }, { "epoch": 0.4598371677626958, "grad_norm": 0.53515625, "learning_rate": 0.0001713961332917146, "loss": 2.618, "step": 1611 }, { "epoch": 0.4601226036210215, "grad_norm": 0.5625, "learning_rate": 0.00017126163195653254, "loss": 2.6115, "step": 1612 }, { "epoch": 0.46040803947934716, "grad_norm": 0.515625, "learning_rate": 0.00017112711317365247, "loss": 2.5529, "step": 1613 }, { "epoch": 0.46069347533767285, "grad_norm": 0.52734375, "learning_rate": 0.00017099257705346314, "loss": 2.6051, "step": 1614 }, { "epoch": 0.46097891119599854, "grad_norm": 0.56640625, "learning_rate": 0.00017085802370636743, "loss": 2.6073, "step": 1615 }, { "epoch": 0.46126434705432423, "grad_norm": 0.494140625, "learning_rate": 0.00017072345324278232, "loss": 2.5969, "step": 1616 }, { "epoch": 0.4615497829126499, "grad_norm": 0.54296875, "learning_rate": 0.00017058886577313892, "loss": 2.6139, "step": 1617 }, { "epoch": 0.4618352187709756, "grad_norm": 0.50390625, "learning_rate": 0.00017045426140788224, "loss": 2.5696, "step": 1618 }, { "epoch": 0.46212065462930135, "grad_norm": 0.53125, "learning_rate": 0.00017031964025747117, "loss": 2.5835, "step": 1619 }, { "epoch": 0.46240609048762704, "grad_norm": 0.515625, "learning_rate": 0.00017018500243237838, "loss": 2.5731, "step": 1620 }, { "epoch": 0.46269152634595273, "grad_norm": 0.482421875, "learning_rate": 0.00017005034804309027, "loss": 2.6096, "step": 1621 }, { "epoch": 0.4629769622042784, "grad_norm": 0.494140625, "learning_rate": 0.00016991567720010668, "loss": 2.6063, "step": 1622 }, { "epoch": 0.4632623980626041, "grad_norm": 0.453125, "learning_rate": 0.00016978099001394112, "loss": 2.6002, "step": 1623 }, { "epoch": 0.4635478339209298, "grad_norm": 0.5234375, "learning_rate": 0.00016964628659512046, "loss": 2.5955, "step": 1624 }, { "epoch": 0.4638332697792555, "grad_norm": 0.482421875, "learning_rate": 0.00016951156705418484, "loss": 2.5975, "step": 1625 }, { "epoch": 0.4641187056375812, "grad_norm": 0.609375, "learning_rate": 0.00016937683150168765, "loss": 2.5944, "step": 1626 }, { "epoch": 0.4644041414959069, "grad_norm": 0.9921875, "learning_rate": 0.0001692420800481955, "loss": 2.5734, "step": 1627 }, { "epoch": 0.4646895773542326, "grad_norm": 0.73828125, "learning_rate": 0.000169107312804288, "loss": 2.6232, "step": 1628 }, { "epoch": 0.4649750132125583, "grad_norm": 0.8046875, "learning_rate": 0.0001689725298805576, "loss": 2.5985, "step": 1629 }, { "epoch": 0.465260449070884, "grad_norm": 1.5234375, "learning_rate": 0.00016883773138760976, "loss": 2.578, "step": 1630 }, { "epoch": 0.4655458849292097, "grad_norm": 0.83203125, "learning_rate": 0.00016870291743606273, "loss": 2.5762, "step": 1631 }, { "epoch": 0.46583132078753536, "grad_norm": 0.8359375, "learning_rate": 0.0001685680881365474, "loss": 2.5714, "step": 1632 }, { "epoch": 0.46611675664586105, "grad_norm": 0.8671875, "learning_rate": 0.00016843324359970712, "loss": 2.5721, "step": 1633 }, { "epoch": 0.46640219250418674, "grad_norm": 0.7109375, "learning_rate": 0.00016829838393619796, "loss": 2.6092, "step": 1634 }, { "epoch": 0.4666876283625125, "grad_norm": 0.6875, "learning_rate": 0.00016816350925668837, "loss": 2.5973, "step": 1635 }, { "epoch": 0.4669730642208382, "grad_norm": 0.85546875, "learning_rate": 0.000168028619671859, "loss": 2.6003, "step": 1636 }, { "epoch": 0.46725850007916386, "grad_norm": 0.6171875, "learning_rate": 0.00016789371529240271, "loss": 2.612, "step": 1637 }, { "epoch": 0.46754393593748955, "grad_norm": 0.79296875, "learning_rate": 0.0001677587962290248, "loss": 2.5903, "step": 1638 }, { "epoch": 0.46782937179581524, "grad_norm": 0.58984375, "learning_rate": 0.00016762386259244224, "loss": 2.5791, "step": 1639 }, { "epoch": 0.46811480765414093, "grad_norm": 0.65234375, "learning_rate": 0.0001674889144933842, "loss": 2.6103, "step": 1640 }, { "epoch": 0.4684002435124666, "grad_norm": 0.63671875, "learning_rate": 0.00016735395204259162, "loss": 2.5757, "step": 1641 }, { "epoch": 0.4686856793707923, "grad_norm": 0.60546875, "learning_rate": 0.00016721897535081724, "loss": 2.5925, "step": 1642 }, { "epoch": 0.46897111522911805, "grad_norm": 0.66796875, "learning_rate": 0.00016708398452882552, "loss": 2.6213, "step": 1643 }, { "epoch": 0.46925655108744374, "grad_norm": 0.5546875, "learning_rate": 0.00016694897968739245, "loss": 2.5948, "step": 1644 }, { "epoch": 0.46954198694576943, "grad_norm": 0.6015625, "learning_rate": 0.0001668139609373056, "loss": 2.5849, "step": 1645 }, { "epoch": 0.4698274228040951, "grad_norm": 0.62109375, "learning_rate": 0.00016667892838936389, "loss": 2.6265, "step": 1646 }, { "epoch": 0.4701128586624208, "grad_norm": 0.57421875, "learning_rate": 0.00016654388215437755, "loss": 2.6059, "step": 1647 }, { "epoch": 0.4703982945207465, "grad_norm": 0.5078125, "learning_rate": 0.0001664088223431682, "loss": 2.6298, "step": 1648 }, { "epoch": 0.4706837303790722, "grad_norm": 0.5390625, "learning_rate": 0.0001662737490665683, "loss": 2.6045, "step": 1649 }, { "epoch": 0.4709691662373979, "grad_norm": 0.52734375, "learning_rate": 0.0001661386624354217, "loss": 2.6153, "step": 1650 }, { "epoch": 0.4712546020957236, "grad_norm": 0.498046875, "learning_rate": 0.00016600356256058296, "loss": 2.5974, "step": 1651 }, { "epoch": 0.4715400379540493, "grad_norm": 0.515625, "learning_rate": 0.00016586844955291768, "loss": 2.5846, "step": 1652 }, { "epoch": 0.471825473812375, "grad_norm": 0.515625, "learning_rate": 0.00016573332352330203, "loss": 2.5888, "step": 1653 }, { "epoch": 0.4721109096707007, "grad_norm": 0.515625, "learning_rate": 0.00016559818458262304, "loss": 2.5823, "step": 1654 }, { "epoch": 0.4723963455290264, "grad_norm": 0.50390625, "learning_rate": 0.00016546303284177837, "loss": 2.5973, "step": 1655 }, { "epoch": 0.47268178138735206, "grad_norm": 0.51953125, "learning_rate": 0.000165327868411676, "loss": 2.5688, "step": 1656 }, { "epoch": 0.47296721724567775, "grad_norm": 0.51171875, "learning_rate": 0.00016519269140323443, "loss": 2.584, "step": 1657 }, { "epoch": 0.47325265310400344, "grad_norm": 0.51953125, "learning_rate": 0.00016505750192738253, "loss": 2.5829, "step": 1658 }, { "epoch": 0.47353808896232913, "grad_norm": 0.50390625, "learning_rate": 0.00016492230009505928, "loss": 2.5653, "step": 1659 }, { "epoch": 0.4738235248206549, "grad_norm": 0.5078125, "learning_rate": 0.0001647870860172139, "loss": 2.6081, "step": 1660 }, { "epoch": 0.47410896067898056, "grad_norm": 0.49609375, "learning_rate": 0.00016465185980480562, "loss": 2.5732, "step": 1661 }, { "epoch": 0.47439439653730625, "grad_norm": 0.53515625, "learning_rate": 0.0001645166215688036, "loss": 2.5776, "step": 1662 }, { "epoch": 0.47467983239563194, "grad_norm": 0.5078125, "learning_rate": 0.000164381371420187, "loss": 2.5898, "step": 1663 }, { "epoch": 0.47496526825395763, "grad_norm": 0.53515625, "learning_rate": 0.00016424610946994453, "loss": 2.6061, "step": 1664 }, { "epoch": 0.4752507041122833, "grad_norm": 0.50390625, "learning_rate": 0.00016411083582907476, "loss": 2.5932, "step": 1665 }, { "epoch": 0.475536139970609, "grad_norm": 0.478515625, "learning_rate": 0.0001639755506085858, "loss": 2.5887, "step": 1666 }, { "epoch": 0.4758215758289347, "grad_norm": 0.484375, "learning_rate": 0.0001638402539194953, "loss": 2.597, "step": 1667 }, { "epoch": 0.47610701168726044, "grad_norm": 0.50390625, "learning_rate": 0.00016370494587283026, "loss": 2.5624, "step": 1668 }, { "epoch": 0.47639244754558613, "grad_norm": 0.44921875, "learning_rate": 0.00016356962657962693, "loss": 2.571, "step": 1669 }, { "epoch": 0.4766778834039118, "grad_norm": 0.51171875, "learning_rate": 0.00016343429615093104, "loss": 2.5971, "step": 1670 }, { "epoch": 0.4769633192622375, "grad_norm": 0.462890625, "learning_rate": 0.00016329895469779725, "loss": 2.5999, "step": 1671 }, { "epoch": 0.4772487551205632, "grad_norm": 0.48046875, "learning_rate": 0.00016316360233128933, "loss": 2.5949, "step": 1672 }, { "epoch": 0.4775341909788889, "grad_norm": 0.46484375, "learning_rate": 0.0001630282391624799, "loss": 2.599, "step": 1673 }, { "epoch": 0.4778196268372146, "grad_norm": 0.52734375, "learning_rate": 0.00016289286530245064, "loss": 2.5983, "step": 1674 }, { "epoch": 0.47810506269554026, "grad_norm": 0.4921875, "learning_rate": 0.00016275748086229193, "loss": 2.5857, "step": 1675 }, { "epoch": 0.478390498553866, "grad_norm": 0.44140625, "learning_rate": 0.0001626220859531027, "loss": 2.5945, "step": 1676 }, { "epoch": 0.4786759344121917, "grad_norm": 0.494140625, "learning_rate": 0.00016248668068599066, "loss": 2.6017, "step": 1677 }, { "epoch": 0.4789613702705174, "grad_norm": 0.46484375, "learning_rate": 0.0001623512651720719, "loss": 2.6014, "step": 1678 }, { "epoch": 0.4792468061288431, "grad_norm": 0.486328125, "learning_rate": 0.00016221583952247097, "loss": 2.5712, "step": 1679 }, { "epoch": 0.47953224198716876, "grad_norm": 0.458984375, "learning_rate": 0.00016208040384832072, "loss": 2.5989, "step": 1680 }, { "epoch": 0.47981767784549445, "grad_norm": 0.48828125, "learning_rate": 0.00016194495826076224, "loss": 2.5548, "step": 1681 }, { "epoch": 0.48010311370382014, "grad_norm": 0.47265625, "learning_rate": 0.0001618095028709447, "loss": 2.5883, "step": 1682 }, { "epoch": 0.48038854956214583, "grad_norm": 0.9296875, "learning_rate": 0.0001616740377900254, "loss": 2.6151, "step": 1683 }, { "epoch": 0.4806739854204716, "grad_norm": 0.50390625, "learning_rate": 0.00016153856312916957, "loss": 2.5432, "step": 1684 }, { "epoch": 0.48095942127879726, "grad_norm": 0.671875, "learning_rate": 0.00016140307899955024, "loss": 2.5735, "step": 1685 }, { "epoch": 0.48124485713712295, "grad_norm": 0.671875, "learning_rate": 0.00016126758551234825, "loss": 2.5766, "step": 1686 }, { "epoch": 0.48153029299544864, "grad_norm": 0.578125, "learning_rate": 0.0001611320827787522, "loss": 2.5697, "step": 1687 }, { "epoch": 0.4818157288537743, "grad_norm": 0.5859375, "learning_rate": 0.00016099657090995812, "loss": 2.5824, "step": 1688 }, { "epoch": 0.4821011647121, "grad_norm": 0.50390625, "learning_rate": 0.0001608610500171696, "loss": 2.5885, "step": 1689 }, { "epoch": 0.4823866005704257, "grad_norm": 0.5078125, "learning_rate": 0.00016072552021159775, "loss": 2.5984, "step": 1690 }, { "epoch": 0.4826720364287514, "grad_norm": 0.55078125, "learning_rate": 0.0001605899816044608, "loss": 2.6025, "step": 1691 }, { "epoch": 0.48295747228707714, "grad_norm": 0.5, "learning_rate": 0.00016045443430698437, "loss": 2.6107, "step": 1692 }, { "epoch": 0.4832429081454028, "grad_norm": 0.52734375, "learning_rate": 0.00016031887843040104, "loss": 2.5978, "step": 1693 }, { "epoch": 0.4835283440037285, "grad_norm": 0.53515625, "learning_rate": 0.00016018331408595063, "loss": 2.5974, "step": 1694 }, { "epoch": 0.4838137798620542, "grad_norm": 0.53515625, "learning_rate": 0.00016004774138487983, "loss": 2.6113, "step": 1695 }, { "epoch": 0.4840992157203799, "grad_norm": 0.51171875, "learning_rate": 0.00015991216043844208, "loss": 2.5766, "step": 1696 }, { "epoch": 0.4843846515787056, "grad_norm": 0.5, "learning_rate": 0.00015977657135789764, "loss": 2.5671, "step": 1697 }, { "epoch": 0.48467008743703127, "grad_norm": 0.54296875, "learning_rate": 0.0001596409742545136, "loss": 2.6138, "step": 1698 }, { "epoch": 0.48495552329535696, "grad_norm": 0.45703125, "learning_rate": 0.00015950536923956346, "loss": 2.5962, "step": 1699 }, { "epoch": 0.4852409591536827, "grad_norm": 0.50390625, "learning_rate": 0.00015936975642432725, "loss": 2.5992, "step": 1700 }, { "epoch": 0.4855263950120084, "grad_norm": 0.50390625, "learning_rate": 0.00015923413592009144, "loss": 2.5925, "step": 1701 }, { "epoch": 0.4858118308703341, "grad_norm": 0.462890625, "learning_rate": 0.00015909850783814874, "loss": 2.5949, "step": 1702 }, { "epoch": 0.48609726672865977, "grad_norm": 0.515625, "learning_rate": 0.00015896287228979816, "loss": 2.5671, "step": 1703 }, { "epoch": 0.48638270258698546, "grad_norm": 0.5, "learning_rate": 0.00015882722938634477, "loss": 2.5684, "step": 1704 }, { "epoch": 0.48666813844531115, "grad_norm": 0.482421875, "learning_rate": 0.00015869157923909978, "loss": 2.59, "step": 1705 }, { "epoch": 0.48695357430363684, "grad_norm": 0.515625, "learning_rate": 0.00015855592195938018, "loss": 2.587, "step": 1706 }, { "epoch": 0.4872390101619625, "grad_norm": 0.46875, "learning_rate": 0.00015842025765850894, "loss": 2.5942, "step": 1707 }, { "epoch": 0.48752444602028827, "grad_norm": 0.48046875, "learning_rate": 0.00015828458644781478, "loss": 2.604, "step": 1708 }, { "epoch": 0.48780988187861396, "grad_norm": 0.44140625, "learning_rate": 0.00015814890843863204, "loss": 2.5862, "step": 1709 }, { "epoch": 0.48809531773693965, "grad_norm": 0.486328125, "learning_rate": 0.00015801322374230068, "loss": 2.5813, "step": 1710 }, { "epoch": 0.48838075359526534, "grad_norm": 0.4453125, "learning_rate": 0.00015787753247016608, "loss": 2.5988, "step": 1711 }, { "epoch": 0.488666189453591, "grad_norm": 0.470703125, "learning_rate": 0.00015774183473357914, "loss": 2.5786, "step": 1712 }, { "epoch": 0.4889516253119167, "grad_norm": 0.48828125, "learning_rate": 0.00015760613064389595, "loss": 2.5616, "step": 1713 }, { "epoch": 0.4892370611702424, "grad_norm": 0.484375, "learning_rate": 0.00015747042031247785, "loss": 2.5828, "step": 1714 }, { "epoch": 0.4895224970285681, "grad_norm": 0.47265625, "learning_rate": 0.0001573347038506914, "loss": 2.565, "step": 1715 }, { "epoch": 0.48980793288689384, "grad_norm": 0.46875, "learning_rate": 0.00015719898136990794, "loss": 2.5747, "step": 1716 }, { "epoch": 0.4900933687452195, "grad_norm": 0.466796875, "learning_rate": 0.00015706325298150403, "loss": 2.5779, "step": 1717 }, { "epoch": 0.4903788046035452, "grad_norm": 0.4921875, "learning_rate": 0.00015692751879686095, "loss": 2.5682, "step": 1718 }, { "epoch": 0.4906642404618709, "grad_norm": 0.48828125, "learning_rate": 0.00015679177892736468, "loss": 2.5675, "step": 1719 }, { "epoch": 0.4909496763201966, "grad_norm": 0.4765625, "learning_rate": 0.00015665603348440595, "loss": 2.5824, "step": 1720 }, { "epoch": 0.4912351121785223, "grad_norm": 0.52734375, "learning_rate": 0.0001565202825793801, "loss": 2.5604, "step": 1721 }, { "epoch": 0.49152054803684797, "grad_norm": 0.5, "learning_rate": 0.0001563845263236868, "loss": 2.5612, "step": 1722 }, { "epoch": 0.49180598389517366, "grad_norm": 0.5234375, "learning_rate": 0.0001562487648287303, "loss": 2.6068, "step": 1723 }, { "epoch": 0.4920914197534994, "grad_norm": 0.47265625, "learning_rate": 0.000156112998205919, "loss": 2.5695, "step": 1724 }, { "epoch": 0.4923768556118251, "grad_norm": 0.51953125, "learning_rate": 0.00015597722656666554, "loss": 2.5929, "step": 1725 }, { "epoch": 0.4926622914701508, "grad_norm": 0.515625, "learning_rate": 0.00015584145002238677, "loss": 2.5656, "step": 1726 }, { "epoch": 0.49294772732847647, "grad_norm": 0.482421875, "learning_rate": 0.00015570566868450343, "loss": 2.5609, "step": 1727 }, { "epoch": 0.49323316318680216, "grad_norm": 0.5234375, "learning_rate": 0.00015556988266444028, "loss": 2.5954, "step": 1728 }, { "epoch": 0.49351859904512785, "grad_norm": 0.48828125, "learning_rate": 0.0001554340920736259, "loss": 2.5662, "step": 1729 }, { "epoch": 0.49380403490345354, "grad_norm": 0.4921875, "learning_rate": 0.00015529829702349266, "loss": 2.6074, "step": 1730 }, { "epoch": 0.4940894707617792, "grad_norm": 0.53515625, "learning_rate": 0.0001551624976254765, "loss": 2.593, "step": 1731 }, { "epoch": 0.4943749066201049, "grad_norm": 0.5, "learning_rate": 0.00015502669399101695, "loss": 2.6089, "step": 1732 }, { "epoch": 0.49466034247843066, "grad_norm": 0.5, "learning_rate": 0.00015489088623155716, "loss": 2.5917, "step": 1733 }, { "epoch": 0.49494577833675635, "grad_norm": 0.53515625, "learning_rate": 0.00015475507445854343, "loss": 2.566, "step": 1734 }, { "epoch": 0.49523121419508204, "grad_norm": 0.5, "learning_rate": 0.00015461925878342556, "loss": 2.5928, "step": 1735 }, { "epoch": 0.4955166500534077, "grad_norm": 0.55859375, "learning_rate": 0.00015448343931765635, "loss": 2.5719, "step": 1736 }, { "epoch": 0.4958020859117334, "grad_norm": 0.50390625, "learning_rate": 0.000154347616172692, "loss": 2.5568, "step": 1737 }, { "epoch": 0.4960875217700591, "grad_norm": 0.49609375, "learning_rate": 0.00015421178945999143, "loss": 2.5836, "step": 1738 }, { "epoch": 0.4963729576283848, "grad_norm": 0.498046875, "learning_rate": 0.00015407595929101665, "loss": 2.5957, "step": 1739 }, { "epoch": 0.4966583934867105, "grad_norm": 0.4609375, "learning_rate": 0.0001539401257772324, "loss": 2.6004, "step": 1740 }, { "epoch": 0.4969438293450362, "grad_norm": 0.51171875, "learning_rate": 0.0001538042890301064, "loss": 2.5866, "step": 1741 }, { "epoch": 0.4972292652033619, "grad_norm": 0.478515625, "learning_rate": 0.00015366844916110868, "loss": 2.5744, "step": 1742 }, { "epoch": 0.4975147010616876, "grad_norm": 0.474609375, "learning_rate": 0.00015353260628171212, "loss": 2.6165, "step": 1743 }, { "epoch": 0.4978001369200133, "grad_norm": 0.5, "learning_rate": 0.0001533967605033919, "loss": 2.5778, "step": 1744 }, { "epoch": 0.498085572778339, "grad_norm": 0.423828125, "learning_rate": 0.00015326091193762568, "loss": 2.5816, "step": 1745 }, { "epoch": 0.49837100863666467, "grad_norm": 0.5078125, "learning_rate": 0.00015312506069589335, "loss": 2.6123, "step": 1746 }, { "epoch": 0.49865644449499036, "grad_norm": 0.458984375, "learning_rate": 0.00015298920688967702, "loss": 2.5834, "step": 1747 }, { "epoch": 0.49894188035331605, "grad_norm": 0.51953125, "learning_rate": 0.00015285335063046089, "loss": 2.5644, "step": 1748 }, { "epoch": 0.4992273162116418, "grad_norm": 0.50390625, "learning_rate": 0.00015271749202973116, "loss": 2.5766, "step": 1749 }, { "epoch": 0.4995127520699675, "grad_norm": 0.52734375, "learning_rate": 0.000152581631198976, "loss": 2.5764, "step": 1750 }, { "epoch": 0.4995127520699675, "eval_loss": 2.4794108867645264, "eval_runtime": 6003.2988, "eval_samples_per_second": 10.708, "eval_steps_per_second": 10.708, "step": 1750 }, { "epoch": 0.49979818792829317, "grad_norm": 0.462890625, "learning_rate": 0.00015244576824968538, "loss": 2.5287, "step": 1751 }, { "epoch": 0.5000836237866189, "grad_norm": 0.486328125, "learning_rate": 0.000152309903293351, "loss": 2.5808, "step": 1752 }, { "epoch": 0.5003690596449446, "grad_norm": 0.4609375, "learning_rate": 0.00015217403644146626, "loss": 2.6024, "step": 1753 }, { "epoch": 0.5006544955032702, "grad_norm": 0.50390625, "learning_rate": 0.000152038167805526, "loss": 2.6072, "step": 1754 }, { "epoch": 0.500939931361596, "grad_norm": 0.50390625, "learning_rate": 0.00015190229749702664, "loss": 2.5662, "step": 1755 }, { "epoch": 0.5012253672199216, "grad_norm": 0.56640625, "learning_rate": 0.00015176642562746587, "loss": 2.5949, "step": 1756 }, { "epoch": 0.5015108030782474, "grad_norm": 0.5546875, "learning_rate": 0.0001516305523083428, "loss": 2.5952, "step": 1757 }, { "epoch": 0.501796238936573, "grad_norm": 0.58984375, "learning_rate": 0.00015149467765115764, "loss": 2.5761, "step": 1758 }, { "epoch": 0.5020816747948987, "grad_norm": 0.51953125, "learning_rate": 0.0001513588017674117, "loss": 2.5776, "step": 1759 }, { "epoch": 0.5023671106532244, "grad_norm": 0.5078125, "learning_rate": 0.0001512229247686072, "loss": 2.5913, "step": 1760 }, { "epoch": 0.5026525465115501, "grad_norm": 0.498046875, "learning_rate": 0.00015108704676624756, "loss": 2.6031, "step": 1761 }, { "epoch": 0.5029379823698759, "grad_norm": 0.55859375, "learning_rate": 0.00015095116787183668, "loss": 2.5457, "step": 1762 }, { "epoch": 0.5032234182282015, "grad_norm": 0.51171875, "learning_rate": 0.0001508152881968795, "loss": 2.5609, "step": 1763 }, { "epoch": 0.5035088540865272, "grad_norm": 0.498046875, "learning_rate": 0.00015067940785288135, "loss": 2.6055, "step": 1764 }, { "epoch": 0.5037942899448529, "grad_norm": 0.51171875, "learning_rate": 0.0001505435269513482, "loss": 2.597, "step": 1765 }, { "epoch": 0.5040797258031786, "grad_norm": 0.458984375, "learning_rate": 0.00015040764560378658, "loss": 2.5936, "step": 1766 }, { "epoch": 0.5043651616615042, "grad_norm": 0.578125, "learning_rate": 0.00015027176392170326, "loss": 2.5551, "step": 1767 }, { "epoch": 0.50465059751983, "grad_norm": 0.51953125, "learning_rate": 0.00015013588201660529, "loss": 2.5881, "step": 1768 }, { "epoch": 0.5049360333781557, "grad_norm": 0.515625, "learning_rate": 0.00015, "loss": 2.5998, "step": 1769 }, { "epoch": 0.5052214692364814, "grad_norm": 0.451171875, "learning_rate": 0.0001498641179833947, "loss": 2.58, "step": 1770 }, { "epoch": 0.5055069050948071, "grad_norm": 0.53515625, "learning_rate": 0.00014972823607829674, "loss": 2.5808, "step": 1771 }, { "epoch": 0.5057923409531327, "grad_norm": 0.455078125, "learning_rate": 0.00014959235439621343, "loss": 2.575, "step": 1772 }, { "epoch": 0.5060777768114585, "grad_norm": 0.5234375, "learning_rate": 0.00014945647304865175, "loss": 2.5957, "step": 1773 }, { "epoch": 0.5063632126697841, "grad_norm": 0.5, "learning_rate": 0.00014932059214711868, "loss": 2.5831, "step": 1774 }, { "epoch": 0.5066486485281099, "grad_norm": 0.59375, "learning_rate": 0.00014918471180312053, "loss": 2.5812, "step": 1775 }, { "epoch": 0.5069340843864355, "grad_norm": 0.52734375, "learning_rate": 0.0001490488321281633, "loss": 2.5925, "step": 1776 }, { "epoch": 0.5072195202447612, "grad_norm": 0.494140625, "learning_rate": 0.00014891295323375244, "loss": 2.5934, "step": 1777 }, { "epoch": 0.507504956103087, "grad_norm": 0.5078125, "learning_rate": 0.0001487770752313928, "loss": 2.5923, "step": 1778 }, { "epoch": 0.5077903919614126, "grad_norm": 0.466796875, "learning_rate": 0.00014864119823258836, "loss": 2.5811, "step": 1779 }, { "epoch": 0.5080758278197384, "grad_norm": 0.490234375, "learning_rate": 0.00014850532234884236, "loss": 2.5726, "step": 1780 }, { "epoch": 0.508361263678064, "grad_norm": 0.53515625, "learning_rate": 0.00014836944769165716, "loss": 2.57, "step": 1781 }, { "epoch": 0.5086466995363897, "grad_norm": 0.51171875, "learning_rate": 0.0001482335743725341, "loss": 2.584, "step": 1782 }, { "epoch": 0.5089321353947154, "grad_norm": 0.48828125, "learning_rate": 0.00014809770250297336, "loss": 2.5903, "step": 1783 }, { "epoch": 0.5092175712530411, "grad_norm": 0.51171875, "learning_rate": 0.000147961832194474, "loss": 2.6009, "step": 1784 }, { "epoch": 0.5095030071113669, "grad_norm": 0.478515625, "learning_rate": 0.00014782596355853374, "loss": 2.6057, "step": 1785 }, { "epoch": 0.5097884429696925, "grad_norm": 0.49609375, "learning_rate": 0.00014769009670664897, "loss": 2.5661, "step": 1786 }, { "epoch": 0.5100738788280182, "grad_norm": 0.447265625, "learning_rate": 0.0001475542317503146, "loss": 2.5986, "step": 1787 }, { "epoch": 0.5103593146863439, "grad_norm": 0.5234375, "learning_rate": 0.000147418368801024, "loss": 2.5837, "step": 1788 }, { "epoch": 0.5106447505446696, "grad_norm": 0.474609375, "learning_rate": 0.0001472825079702688, "loss": 2.5738, "step": 1789 }, { "epoch": 0.5109301864029953, "grad_norm": 0.48828125, "learning_rate": 0.0001471466493695391, "loss": 2.5681, "step": 1790 }, { "epoch": 0.511215622261321, "grad_norm": 0.466796875, "learning_rate": 0.00014701079311032298, "loss": 2.5817, "step": 1791 }, { "epoch": 0.5115010581196466, "grad_norm": 0.48828125, "learning_rate": 0.00014687493930410663, "loss": 2.5813, "step": 1792 }, { "epoch": 0.5117864939779724, "grad_norm": 0.478515625, "learning_rate": 0.00014673908806237432, "loss": 2.5893, "step": 1793 }, { "epoch": 0.5120719298362981, "grad_norm": 0.498046875, "learning_rate": 0.0001466032394966081, "loss": 2.6104, "step": 1794 }, { "epoch": 0.5123573656946238, "grad_norm": 0.5078125, "learning_rate": 0.0001464673937182879, "loss": 2.6105, "step": 1795 }, { "epoch": 0.5126428015529495, "grad_norm": 0.494140625, "learning_rate": 0.00014633155083889132, "loss": 2.6015, "step": 1796 }, { "epoch": 0.5129282374112751, "grad_norm": 0.5, "learning_rate": 0.00014619571096989359, "loss": 2.578, "step": 1797 }, { "epoch": 0.5132136732696009, "grad_norm": 0.47265625, "learning_rate": 0.00014605987422276756, "loss": 2.5755, "step": 1798 }, { "epoch": 0.5134991091279265, "grad_norm": 0.50390625, "learning_rate": 0.00014592404070898335, "loss": 2.5822, "step": 1799 }, { "epoch": 0.5137845449862523, "grad_norm": 0.474609375, "learning_rate": 0.00014578821054000854, "loss": 2.5701, "step": 1800 }, { "epoch": 0.514069980844578, "grad_norm": 0.51953125, "learning_rate": 0.000145652383827308, "loss": 2.5652, "step": 1801 }, { "epoch": 0.5143554167029036, "grad_norm": 0.5078125, "learning_rate": 0.00014551656068234362, "loss": 2.5589, "step": 1802 }, { "epoch": 0.5146408525612294, "grad_norm": 0.4609375, "learning_rate": 0.00014538074121657447, "loss": 2.5928, "step": 1803 }, { "epoch": 0.514926288419555, "grad_norm": 0.48046875, "learning_rate": 0.00014524492554145657, "loss": 2.5787, "step": 1804 }, { "epoch": 0.5152117242778808, "grad_norm": 0.474609375, "learning_rate": 0.0001451091137684428, "loss": 2.6031, "step": 1805 }, { "epoch": 0.5154971601362064, "grad_norm": 0.478515625, "learning_rate": 0.00014497330600898297, "loss": 2.6, "step": 1806 }, { "epoch": 0.5157825959945321, "grad_norm": 0.4609375, "learning_rate": 0.0001448375023745235, "loss": 2.5984, "step": 1807 }, { "epoch": 0.5160680318528578, "grad_norm": 0.45703125, "learning_rate": 0.00014470170297650734, "loss": 2.5901, "step": 1808 }, { "epoch": 0.5163534677111835, "grad_norm": 0.5078125, "learning_rate": 0.00014456590792637407, "loss": 2.555, "step": 1809 }, { "epoch": 0.5166389035695093, "grad_norm": 0.4453125, "learning_rate": 0.0001444301173355597, "loss": 2.5745, "step": 1810 }, { "epoch": 0.5169243394278349, "grad_norm": 0.4765625, "learning_rate": 0.0001442943313154966, "loss": 2.5377, "step": 1811 }, { "epoch": 0.5172097752861606, "grad_norm": 0.455078125, "learning_rate": 0.00014415854997761328, "loss": 2.5617, "step": 1812 }, { "epoch": 0.5174952111444863, "grad_norm": 0.46875, "learning_rate": 0.0001440227734333344, "loss": 2.5987, "step": 1813 }, { "epoch": 0.517780647002812, "grad_norm": 0.44921875, "learning_rate": 0.000143887001794081, "loss": 2.5686, "step": 1814 }, { "epoch": 0.5180660828611376, "grad_norm": 0.427734375, "learning_rate": 0.00014375123517126968, "loss": 2.5911, "step": 1815 }, { "epoch": 0.5183515187194634, "grad_norm": 0.43359375, "learning_rate": 0.00014361547367631317, "loss": 2.5687, "step": 1816 }, { "epoch": 0.518636954577789, "grad_norm": 0.447265625, "learning_rate": 0.00014347971742061989, "loss": 2.6098, "step": 1817 }, { "epoch": 0.5189223904361148, "grad_norm": 0.474609375, "learning_rate": 0.00014334396651559405, "loss": 2.5648, "step": 1818 }, { "epoch": 0.5192078262944405, "grad_norm": 0.40625, "learning_rate": 0.00014320822107263532, "loss": 2.583, "step": 1819 }, { "epoch": 0.5194932621527661, "grad_norm": 0.50390625, "learning_rate": 0.00014307248120313908, "loss": 2.5763, "step": 1820 }, { "epoch": 0.5197786980110919, "grad_norm": 0.44140625, "learning_rate": 0.00014293674701849595, "loss": 2.5835, "step": 1821 }, { "epoch": 0.5200641338694175, "grad_norm": 0.478515625, "learning_rate": 0.00014280101863009203, "loss": 2.5738, "step": 1822 }, { "epoch": 0.5203495697277433, "grad_norm": 0.447265625, "learning_rate": 0.0001426652961493086, "loss": 2.5956, "step": 1823 }, { "epoch": 0.5206350055860689, "grad_norm": 0.5390625, "learning_rate": 0.00014252957968752212, "loss": 2.5553, "step": 1824 }, { "epoch": 0.5209204414443946, "grad_norm": 0.484375, "learning_rate": 0.00014239386935610405, "loss": 2.5876, "step": 1825 }, { "epoch": 0.5212058773027204, "grad_norm": 0.53515625, "learning_rate": 0.00014225816526642086, "loss": 2.592, "step": 1826 }, { "epoch": 0.521491313161046, "grad_norm": 0.4609375, "learning_rate": 0.00014212246752983392, "loss": 2.5715, "step": 1827 }, { "epoch": 0.5217767490193718, "grad_norm": 0.4765625, "learning_rate": 0.00014198677625769937, "loss": 2.5873, "step": 1828 }, { "epoch": 0.5220621848776974, "grad_norm": 0.46875, "learning_rate": 0.0001418510915613679, "loss": 2.5964, "step": 1829 }, { "epoch": 0.5223476207360231, "grad_norm": 0.470703125, "learning_rate": 0.0001417154135521852, "loss": 2.5588, "step": 1830 }, { "epoch": 0.5226330565943488, "grad_norm": 0.478515625, "learning_rate": 0.00014157974234149103, "loss": 2.5652, "step": 1831 }, { "epoch": 0.5229184924526745, "grad_norm": 0.46484375, "learning_rate": 0.00014144407804061982, "loss": 2.6088, "step": 1832 }, { "epoch": 0.5232039283110002, "grad_norm": 0.494140625, "learning_rate": 0.00014130842076090023, "loss": 2.5847, "step": 1833 }, { "epoch": 0.5234893641693259, "grad_norm": 0.439453125, "learning_rate": 0.0001411727706136552, "loss": 2.5664, "step": 1834 }, { "epoch": 0.5237748000276516, "grad_norm": 0.458984375, "learning_rate": 0.00014103712771020187, "loss": 2.5667, "step": 1835 }, { "epoch": 0.5240602358859773, "grad_norm": 0.447265625, "learning_rate": 0.00014090149216185123, "loss": 2.5789, "step": 1836 }, { "epoch": 0.524345671744303, "grad_norm": 0.55859375, "learning_rate": 0.00014076586407990856, "loss": 2.5775, "step": 1837 }, { "epoch": 0.5246311076026287, "grad_norm": 0.48046875, "learning_rate": 0.00014063024357567275, "loss": 2.5817, "step": 1838 }, { "epoch": 0.5249165434609544, "grad_norm": 0.453125, "learning_rate": 0.00014049463076043652, "loss": 2.6099, "step": 1839 }, { "epoch": 0.52520197931928, "grad_norm": 0.453125, "learning_rate": 0.00014035902574548637, "loss": 2.5589, "step": 1840 }, { "epoch": 0.5254874151776058, "grad_norm": 0.44921875, "learning_rate": 0.00014022342864210234, "loss": 2.5884, "step": 1841 }, { "epoch": 0.5257728510359315, "grad_norm": 0.458984375, "learning_rate": 0.00014008783956155797, "loss": 2.606, "step": 1842 }, { "epoch": 0.5260582868942572, "grad_norm": 0.474609375, "learning_rate": 0.0001399522586151202, "loss": 2.5597, "step": 1843 }, { "epoch": 0.5263437227525829, "grad_norm": 0.478515625, "learning_rate": 0.00013981668591404932, "loss": 2.5987, "step": 1844 }, { "epoch": 0.5266291586109085, "grad_norm": 0.48046875, "learning_rate": 0.00013968112156959893, "loss": 2.5708, "step": 1845 }, { "epoch": 0.5269145944692343, "grad_norm": 0.43359375, "learning_rate": 0.00013954556569301563, "loss": 2.5932, "step": 1846 }, { "epoch": 0.5272000303275599, "grad_norm": 0.478515625, "learning_rate": 0.0001394100183955392, "loss": 2.6022, "step": 1847 }, { "epoch": 0.5274854661858857, "grad_norm": 0.43359375, "learning_rate": 0.00013927447978840225, "loss": 2.5497, "step": 1848 }, { "epoch": 0.5277709020442113, "grad_norm": 0.515625, "learning_rate": 0.00013913894998283038, "loss": 2.5742, "step": 1849 }, { "epoch": 0.528056337902537, "grad_norm": 0.486328125, "learning_rate": 0.00013900342909004188, "loss": 2.624, "step": 1850 }, { "epoch": 0.5283417737608628, "grad_norm": 0.5, "learning_rate": 0.00013886791722124783, "loss": 2.5814, "step": 1851 }, { "epoch": 0.5286272096191884, "grad_norm": 0.44921875, "learning_rate": 0.00013873241448765167, "loss": 2.5622, "step": 1852 }, { "epoch": 0.5289126454775142, "grad_norm": 0.474609375, "learning_rate": 0.00013859692100044973, "loss": 2.5673, "step": 1853 }, { "epoch": 0.5291980813358398, "grad_norm": 0.4765625, "learning_rate": 0.00013846143687083043, "loss": 2.5758, "step": 1854 }, { "epoch": 0.5294835171941655, "grad_norm": 0.4765625, "learning_rate": 0.00013832596220997458, "loss": 2.5934, "step": 1855 }, { "epoch": 0.5297689530524912, "grad_norm": 0.455078125, "learning_rate": 0.0001381904971290553, "loss": 2.5529, "step": 1856 }, { "epoch": 0.5300543889108169, "grad_norm": 0.447265625, "learning_rate": 0.00013805504173923776, "loss": 2.5794, "step": 1857 }, { "epoch": 0.5303398247691427, "grad_norm": 0.466796875, "learning_rate": 0.0001379195961516793, "loss": 2.5519, "step": 1858 }, { "epoch": 0.5306252606274683, "grad_norm": 0.482421875, "learning_rate": 0.00013778416047752903, "loss": 2.5965, "step": 1859 }, { "epoch": 0.530910696485794, "grad_norm": 0.455078125, "learning_rate": 0.0001376487348279281, "loss": 2.5725, "step": 1860 }, { "epoch": 0.5311961323441197, "grad_norm": 0.484375, "learning_rate": 0.0001375133193140093, "loss": 2.5638, "step": 1861 }, { "epoch": 0.5314815682024454, "grad_norm": 0.46875, "learning_rate": 0.00013737791404689728, "loss": 2.5935, "step": 1862 }, { "epoch": 0.531767004060771, "grad_norm": 0.470703125, "learning_rate": 0.00013724251913770807, "loss": 2.6033, "step": 1863 }, { "epoch": 0.5320524399190968, "grad_norm": 0.44921875, "learning_rate": 0.00013710713469754934, "loss": 2.5982, "step": 1864 }, { "epoch": 0.5323378757774224, "grad_norm": 0.5078125, "learning_rate": 0.00013697176083752008, "loss": 2.5374, "step": 1865 }, { "epoch": 0.5326233116357482, "grad_norm": 0.443359375, "learning_rate": 0.0001368363976687107, "loss": 2.5623, "step": 1866 }, { "epoch": 0.5329087474940739, "grad_norm": 0.494140625, "learning_rate": 0.00013670104530220275, "loss": 2.574, "step": 1867 }, { "epoch": 0.5331941833523995, "grad_norm": 0.45703125, "learning_rate": 0.0001365657038490689, "loss": 2.5917, "step": 1868 }, { "epoch": 0.5334796192107253, "grad_norm": 0.490234375, "learning_rate": 0.000136430373420373, "loss": 2.5844, "step": 1869 }, { "epoch": 0.5337650550690509, "grad_norm": 0.419921875, "learning_rate": 0.00013629505412716974, "loss": 2.6019, "step": 1870 }, { "epoch": 0.5340504909273767, "grad_norm": 0.478515625, "learning_rate": 0.0001361597460805047, "loss": 2.5718, "step": 1871 }, { "epoch": 0.5343359267857023, "grad_norm": 0.46484375, "learning_rate": 0.0001360244493914142, "loss": 2.5665, "step": 1872 }, { "epoch": 0.534621362644028, "grad_norm": 0.45703125, "learning_rate": 0.0001358891641709252, "loss": 2.5814, "step": 1873 }, { "epoch": 0.5349067985023538, "grad_norm": 0.478515625, "learning_rate": 0.00013575389053005547, "loss": 2.5467, "step": 1874 }, { "epoch": 0.5351922343606794, "grad_norm": 0.66015625, "learning_rate": 0.00013561862857981304, "loss": 2.5697, "step": 1875 }, { "epoch": 0.5354776702190052, "grad_norm": 0.55078125, "learning_rate": 0.00013548337843119634, "loss": 2.5856, "step": 1876 }, { "epoch": 0.5357631060773308, "grad_norm": 0.5625, "learning_rate": 0.00013534814019519438, "loss": 2.5662, "step": 1877 }, { "epoch": 0.5360485419356565, "grad_norm": 0.5625, "learning_rate": 0.00013521291398278608, "loss": 2.5983, "step": 1878 }, { "epoch": 0.5363339777939822, "grad_norm": 0.57421875, "learning_rate": 0.00013507769990494072, "loss": 2.5893, "step": 1879 }, { "epoch": 0.5366194136523079, "grad_norm": 0.671875, "learning_rate": 0.00013494249807261748, "loss": 2.5852, "step": 1880 }, { "epoch": 0.5369048495106336, "grad_norm": 0.55078125, "learning_rate": 0.00013480730859676557, "loss": 2.5667, "step": 1881 }, { "epoch": 0.5371902853689593, "grad_norm": 0.82421875, "learning_rate": 0.00013467213158832402, "loss": 2.5674, "step": 1882 }, { "epoch": 0.537475721227285, "grad_norm": 0.5078125, "learning_rate": 0.00013453696715822163, "loss": 2.5955, "step": 1883 }, { "epoch": 0.5377611570856107, "grad_norm": 0.67578125, "learning_rate": 0.0001344018154173769, "loss": 2.5681, "step": 1884 }, { "epoch": 0.5380465929439364, "grad_norm": 0.55859375, "learning_rate": 0.00013426667647669795, "loss": 2.6069, "step": 1885 }, { "epoch": 0.538332028802262, "grad_norm": 0.609375, "learning_rate": 0.00013413155044708232, "loss": 2.5682, "step": 1886 }, { "epoch": 0.5386174646605878, "grad_norm": 0.53125, "learning_rate": 0.00013399643743941701, "loss": 2.5783, "step": 1887 }, { "epoch": 0.5389029005189134, "grad_norm": 0.59375, "learning_rate": 0.0001338613375645783, "loss": 2.5545, "step": 1888 }, { "epoch": 0.5391883363772392, "grad_norm": 0.57421875, "learning_rate": 0.00013372625093343167, "loss": 2.5683, "step": 1889 }, { "epoch": 0.5394737722355648, "grad_norm": 0.52734375, "learning_rate": 0.00013359117765683183, "loss": 2.5635, "step": 1890 }, { "epoch": 0.5397592080938906, "grad_norm": 0.546875, "learning_rate": 0.00013345611784562245, "loss": 2.5851, "step": 1891 }, { "epoch": 0.5400446439522163, "grad_norm": 0.578125, "learning_rate": 0.0001333210716106361, "loss": 2.5822, "step": 1892 }, { "epoch": 0.5403300798105419, "grad_norm": 0.46484375, "learning_rate": 0.00013318603906269436, "loss": 2.587, "step": 1893 }, { "epoch": 0.5406155156688677, "grad_norm": 0.62890625, "learning_rate": 0.00013305102031260755, "loss": 2.5887, "step": 1894 }, { "epoch": 0.5409009515271933, "grad_norm": 0.443359375, "learning_rate": 0.00013291601547117448, "loss": 2.5895, "step": 1895 }, { "epoch": 0.541186387385519, "grad_norm": 0.56640625, "learning_rate": 0.00013278102464918276, "loss": 2.5535, "step": 1896 }, { "epoch": 0.5414718232438447, "grad_norm": 0.447265625, "learning_rate": 0.00013264604795740838, "loss": 2.5836, "step": 1897 }, { "epoch": 0.5417572591021704, "grad_norm": 0.5390625, "learning_rate": 0.00013251108550661585, "loss": 2.5933, "step": 1898 }, { "epoch": 0.5420426949604962, "grad_norm": 0.45703125, "learning_rate": 0.0001323761374075578, "loss": 2.5745, "step": 1899 }, { "epoch": 0.5423281308188218, "grad_norm": 0.490234375, "learning_rate": 0.0001322412037709752, "loss": 2.5632, "step": 1900 }, { "epoch": 0.5426135666771476, "grad_norm": 0.5, "learning_rate": 0.00013210628470759726, "loss": 2.5525, "step": 1901 }, { "epoch": 0.5428990025354732, "grad_norm": 0.5078125, "learning_rate": 0.000131971380328141, "loss": 2.6075, "step": 1902 }, { "epoch": 0.5431844383937989, "grad_norm": 0.447265625, "learning_rate": 0.0001318364907433116, "loss": 2.5948, "step": 1903 }, { "epoch": 0.5434698742521246, "grad_norm": 0.53125, "learning_rate": 0.00013170161606380204, "loss": 2.6039, "step": 1904 }, { "epoch": 0.5437553101104503, "grad_norm": 0.453125, "learning_rate": 0.00013156675640029289, "loss": 2.5849, "step": 1905 }, { "epoch": 0.5440407459687759, "grad_norm": 0.546875, "learning_rate": 0.00013143191186345266, "loss": 2.5805, "step": 1906 }, { "epoch": 0.5443261818271017, "grad_norm": 0.431640625, "learning_rate": 0.00013129708256393724, "loss": 2.5466, "step": 1907 }, { "epoch": 0.5446116176854274, "grad_norm": 0.515625, "learning_rate": 0.00013116226861239019, "loss": 2.5889, "step": 1908 }, { "epoch": 0.5448970535437531, "grad_norm": 0.45703125, "learning_rate": 0.00013102747011944238, "loss": 2.5744, "step": 1909 }, { "epoch": 0.5451824894020788, "grad_norm": 0.484375, "learning_rate": 0.000130892687195712, "loss": 2.5408, "step": 1910 }, { "epoch": 0.5454679252604044, "grad_norm": 0.470703125, "learning_rate": 0.00013075791995180447, "loss": 2.5915, "step": 1911 }, { "epoch": 0.5457533611187302, "grad_norm": 0.439453125, "learning_rate": 0.00013062316849831232, "loss": 2.5739, "step": 1912 }, { "epoch": 0.5460387969770558, "grad_norm": 0.458984375, "learning_rate": 0.00013048843294581516, "loss": 2.5662, "step": 1913 }, { "epoch": 0.5463242328353816, "grad_norm": 0.447265625, "learning_rate": 0.00013035371340487954, "loss": 2.5486, "step": 1914 }, { "epoch": 0.5466096686937073, "grad_norm": 0.47265625, "learning_rate": 0.00013021900998605885, "loss": 2.5508, "step": 1915 }, { "epoch": 0.5468951045520329, "grad_norm": 0.45703125, "learning_rate": 0.0001300843227998933, "loss": 2.5886, "step": 1916 }, { "epoch": 0.5471805404103587, "grad_norm": 0.455078125, "learning_rate": 0.00012994965195690976, "loss": 2.5568, "step": 1917 }, { "epoch": 0.5474659762686843, "grad_norm": 0.443359375, "learning_rate": 0.0001298149975676216, "loss": 2.5776, "step": 1918 }, { "epoch": 0.5477514121270101, "grad_norm": 0.4296875, "learning_rate": 0.0001296803597425288, "loss": 2.5829, "step": 1919 }, { "epoch": 0.5480368479853357, "grad_norm": 0.455078125, "learning_rate": 0.00012954573859211773, "loss": 2.5828, "step": 1920 }, { "epoch": 0.5483222838436614, "grad_norm": 0.408203125, "learning_rate": 0.00012941113422686108, "loss": 2.5825, "step": 1921 }, { "epoch": 0.5486077197019871, "grad_norm": 0.474609375, "learning_rate": 0.0001292765467572177, "loss": 2.5706, "step": 1922 }, { "epoch": 0.5488931555603128, "grad_norm": 0.435546875, "learning_rate": 0.00012914197629363257, "loss": 2.546, "step": 1923 }, { "epoch": 0.5491785914186386, "grad_norm": 0.4609375, "learning_rate": 0.00012900742294653684, "loss": 2.6005, "step": 1924 }, { "epoch": 0.5494640272769642, "grad_norm": 0.494140625, "learning_rate": 0.0001288728868263475, "loss": 2.5664, "step": 1925 }, { "epoch": 0.5497494631352899, "grad_norm": 0.439453125, "learning_rate": 0.00012873836804346746, "loss": 2.5662, "step": 1926 }, { "epoch": 0.5500348989936156, "grad_norm": 0.486328125, "learning_rate": 0.00012860386670828538, "loss": 2.5691, "step": 1927 }, { "epoch": 0.5503203348519413, "grad_norm": 0.458984375, "learning_rate": 0.0001284693829311756, "loss": 2.556, "step": 1928 }, { "epoch": 0.550605770710267, "grad_norm": 0.494140625, "learning_rate": 0.00012833491682249802, "loss": 2.5723, "step": 1929 }, { "epoch": 0.5508912065685927, "grad_norm": 0.439453125, "learning_rate": 0.0001282004684925981, "loss": 2.5932, "step": 1930 }, { "epoch": 0.5511766424269184, "grad_norm": 0.52734375, "learning_rate": 0.00012806603805180666, "loss": 2.5586, "step": 1931 }, { "epoch": 0.5514620782852441, "grad_norm": 0.4453125, "learning_rate": 0.00012793162561043994, "loss": 2.6137, "step": 1932 }, { "epoch": 0.5517475141435698, "grad_norm": 0.474609375, "learning_rate": 0.0001277972312787993, "loss": 2.5864, "step": 1933 }, { "epoch": 0.5520329500018955, "grad_norm": 0.44140625, "learning_rate": 0.0001276628551671713, "loss": 2.5684, "step": 1934 }, { "epoch": 0.5523183858602212, "grad_norm": 0.470703125, "learning_rate": 0.00012752849738582745, "loss": 2.5812, "step": 1935 }, { "epoch": 0.5526038217185468, "grad_norm": 0.44921875, "learning_rate": 0.0001273941580450243, "loss": 2.5645, "step": 1936 }, { "epoch": 0.5528892575768726, "grad_norm": 0.49609375, "learning_rate": 0.00012725983725500332, "loss": 2.5597, "step": 1937 }, { "epoch": 0.5531746934351982, "grad_norm": 0.43359375, "learning_rate": 0.0001271255351259907, "loss": 2.5787, "step": 1938 }, { "epoch": 0.553460129293524, "grad_norm": 0.466796875, "learning_rate": 0.00012699125176819716, "loss": 2.5669, "step": 1939 }, { "epoch": 0.5537455651518497, "grad_norm": 0.78125, "learning_rate": 0.00012685698729181837, "loss": 2.5653, "step": 1940 }, { "epoch": 0.5540310010101753, "grad_norm": 0.48828125, "learning_rate": 0.0001267227418070342, "loss": 2.5713, "step": 1941 }, { "epoch": 0.5543164368685011, "grad_norm": 0.46484375, "learning_rate": 0.00012658851542400907, "loss": 2.5643, "step": 1942 }, { "epoch": 0.5546018727268267, "grad_norm": 0.431640625, "learning_rate": 0.00012645430825289163, "loss": 2.5536, "step": 1943 }, { "epoch": 0.5548873085851525, "grad_norm": 0.53515625, "learning_rate": 0.00012632012040381493, "loss": 2.5869, "step": 1944 }, { "epoch": 0.5551727444434781, "grad_norm": 0.44921875, "learning_rate": 0.00012618595198689596, "loss": 2.5626, "step": 1945 }, { "epoch": 0.5554581803018038, "grad_norm": 0.484375, "learning_rate": 0.0001260518031122359, "loss": 2.5907, "step": 1946 }, { "epoch": 0.5557436161601295, "grad_norm": 0.431640625, "learning_rate": 0.00012591767388991985, "loss": 2.5852, "step": 1947 }, { "epoch": 0.5560290520184552, "grad_norm": 0.458984375, "learning_rate": 0.00012578356443001683, "loss": 2.557, "step": 1948 }, { "epoch": 0.556314487876781, "grad_norm": 0.453125, "learning_rate": 0.0001256494748425796, "loss": 2.581, "step": 1949 }, { "epoch": 0.5565999237351066, "grad_norm": 0.451171875, "learning_rate": 0.00012551540523764458, "loss": 2.5861, "step": 1950 }, { "epoch": 0.5568853595934323, "grad_norm": 0.49609375, "learning_rate": 0.00012538135572523183, "loss": 2.5701, "step": 1951 }, { "epoch": 0.557170795451758, "grad_norm": 0.482421875, "learning_rate": 0.00012524732641534496, "loss": 2.5348, "step": 1952 }, { "epoch": 0.5574562313100837, "grad_norm": 0.458984375, "learning_rate": 0.00012511331741797092, "loss": 2.5597, "step": 1953 }, { "epoch": 0.5577416671684093, "grad_norm": 0.4921875, "learning_rate": 0.00012497932884308002, "loss": 2.5808, "step": 1954 }, { "epoch": 0.5580271030267351, "grad_norm": 0.439453125, "learning_rate": 0.00012484536080062581, "loss": 2.5469, "step": 1955 }, { "epoch": 0.5583125388850608, "grad_norm": 0.54296875, "learning_rate": 0.00012471141340054508, "loss": 2.5758, "step": 1956 }, { "epoch": 0.5585979747433865, "grad_norm": 0.43359375, "learning_rate": 0.00012457748675275763, "loss": 2.5819, "step": 1957 }, { "epoch": 0.5588834106017122, "grad_norm": 0.494140625, "learning_rate": 0.00012444358096716607, "loss": 2.5616, "step": 1958 }, { "epoch": 0.5591688464600378, "grad_norm": 0.43359375, "learning_rate": 0.0001243096961536561, "loss": 2.5502, "step": 1959 }, { "epoch": 0.5594542823183636, "grad_norm": 0.421875, "learning_rate": 0.00012417583242209612, "loss": 2.5667, "step": 1960 }, { "epoch": 0.5597397181766892, "grad_norm": 0.478515625, "learning_rate": 0.00012404198988233729, "loss": 2.5661, "step": 1961 }, { "epoch": 0.560025154035015, "grad_norm": 0.447265625, "learning_rate": 0.00012390816864421325, "loss": 2.5755, "step": 1962 }, { "epoch": 0.5603105898933406, "grad_norm": 0.466796875, "learning_rate": 0.00012377436881754025, "loss": 2.5679, "step": 1963 }, { "epoch": 0.5605960257516663, "grad_norm": 0.4296875, "learning_rate": 0.00012364059051211707, "loss": 2.5471, "step": 1964 }, { "epoch": 0.5608814616099921, "grad_norm": 0.455078125, "learning_rate": 0.00012350683383772462, "loss": 2.5443, "step": 1965 }, { "epoch": 0.5611668974683177, "grad_norm": 0.46875, "learning_rate": 0.00012337309890412618, "loss": 2.5963, "step": 1966 }, { "epoch": 0.5614523333266435, "grad_norm": 0.443359375, "learning_rate": 0.00012323938582106724, "loss": 2.5735, "step": 1967 }, { "epoch": 0.5617377691849691, "grad_norm": 0.48046875, "learning_rate": 0.00012310569469827518, "loss": 2.5885, "step": 1968 }, { "epoch": 0.5620232050432948, "grad_norm": 0.458984375, "learning_rate": 0.00012297202564545953, "loss": 2.5558, "step": 1969 }, { "epoch": 0.5623086409016205, "grad_norm": 0.419921875, "learning_rate": 0.0001228383787723116, "loss": 2.5914, "step": 1970 }, { "epoch": 0.5625940767599462, "grad_norm": 0.458984375, "learning_rate": 0.0001227047541885046, "loss": 2.5518, "step": 1971 }, { "epoch": 0.562879512618272, "grad_norm": 0.431640625, "learning_rate": 0.00012257115200369338, "loss": 2.541, "step": 1972 }, { "epoch": 0.5631649484765976, "grad_norm": 0.4453125, "learning_rate": 0.0001224375723275144, "loss": 2.5672, "step": 1973 }, { "epoch": 0.5634503843349233, "grad_norm": 0.4140625, "learning_rate": 0.00012230401526958578, "loss": 2.579, "step": 1974 }, { "epoch": 0.563735820193249, "grad_norm": 0.431640625, "learning_rate": 0.0001221704809395068, "loss": 2.5442, "step": 1975 }, { "epoch": 0.5640212560515747, "grad_norm": 0.447265625, "learning_rate": 0.00012203696944685838, "loss": 2.582, "step": 1976 }, { "epoch": 0.5643066919099003, "grad_norm": 0.41015625, "learning_rate": 0.00012190348090120253, "loss": 2.5607, "step": 1977 }, { "epoch": 0.5645921277682261, "grad_norm": 0.41796875, "learning_rate": 0.00012177001541208247, "loss": 2.5668, "step": 1978 }, { "epoch": 0.5648775636265517, "grad_norm": 0.423828125, "learning_rate": 0.00012163657308902254, "loss": 2.5663, "step": 1979 }, { "epoch": 0.5651629994848775, "grad_norm": 0.40625, "learning_rate": 0.00012150315404152809, "loss": 2.575, "step": 1980 }, { "epoch": 0.5654484353432032, "grad_norm": 0.458984375, "learning_rate": 0.00012136975837908521, "loss": 2.5806, "step": 1981 }, { "epoch": 0.5657338712015288, "grad_norm": 0.43359375, "learning_rate": 0.00012123638621116096, "loss": 2.5632, "step": 1982 }, { "epoch": 0.5660193070598546, "grad_norm": 0.451171875, "learning_rate": 0.00012110303764720305, "loss": 2.5993, "step": 1983 }, { "epoch": 0.5663047429181802, "grad_norm": 0.42578125, "learning_rate": 0.00012096971279663991, "loss": 2.5778, "step": 1984 }, { "epoch": 0.566590178776506, "grad_norm": 0.462890625, "learning_rate": 0.00012083641176888034, "loss": 2.5656, "step": 1985 }, { "epoch": 0.5668756146348316, "grad_norm": 0.419921875, "learning_rate": 0.00012070313467331368, "loss": 2.5657, "step": 1986 }, { "epoch": 0.5671610504931573, "grad_norm": 0.427734375, "learning_rate": 0.00012056988161930973, "loss": 2.5606, "step": 1987 }, { "epoch": 0.5674464863514831, "grad_norm": 0.44140625, "learning_rate": 0.00012043665271621843, "loss": 2.5621, "step": 1988 }, { "epoch": 0.5677319222098087, "grad_norm": 0.455078125, "learning_rate": 0.00012030344807336993, "loss": 2.5528, "step": 1989 }, { "epoch": 0.5680173580681345, "grad_norm": 0.416015625, "learning_rate": 0.00012017026780007452, "loss": 2.5568, "step": 1990 }, { "epoch": 0.5683027939264601, "grad_norm": 0.46484375, "learning_rate": 0.00012003711200562242, "loss": 2.5495, "step": 1991 }, { "epoch": 0.5685882297847858, "grad_norm": 0.412109375, "learning_rate": 0.00011990398079928378, "loss": 2.5533, "step": 1992 }, { "epoch": 0.5688736656431115, "grad_norm": 0.447265625, "learning_rate": 0.00011977087429030862, "loss": 2.55, "step": 1993 }, { "epoch": 0.5691591015014372, "grad_norm": 0.50390625, "learning_rate": 0.00011963779258792664, "loss": 2.5533, "step": 1994 }, { "epoch": 0.5694445373597629, "grad_norm": 0.453125, "learning_rate": 0.00011950473580134723, "loss": 2.567, "step": 1995 }, { "epoch": 0.5697299732180886, "grad_norm": 0.50390625, "learning_rate": 0.00011937170403975933, "loss": 2.5419, "step": 1996 }, { "epoch": 0.5700154090764143, "grad_norm": 0.42578125, "learning_rate": 0.00011923869741233131, "loss": 2.56, "step": 1997 }, { "epoch": 0.57030084493474, "grad_norm": 0.486328125, "learning_rate": 0.00011910571602821089, "loss": 2.571, "step": 1998 }, { "epoch": 0.5705862807930657, "grad_norm": 0.40625, "learning_rate": 0.00011897275999652513, "loss": 2.5794, "step": 1999 }, { "epoch": 0.5708717166513914, "grad_norm": 0.455078125, "learning_rate": 0.00011883982942638028, "loss": 2.5708, "step": 2000 }, { "epoch": 0.5708717166513914, "eval_loss": 2.470252513885498, "eval_runtime": 5925.0122, "eval_samples_per_second": 10.85, "eval_steps_per_second": 10.85, "step": 2000 }, { "epoch": 0.5711571525097171, "grad_norm": 0.435546875, "learning_rate": 0.00011870692442686172, "loss": 2.5898, "step": 2001 }, { "epoch": 0.5714425883680427, "grad_norm": 0.423828125, "learning_rate": 0.00011857404510703366, "loss": 2.5845, "step": 2002 }, { "epoch": 0.5717280242263685, "grad_norm": 0.5, "learning_rate": 0.0001184411915759396, "loss": 2.5365, "step": 2003 }, { "epoch": 0.5720134600846942, "grad_norm": 0.4140625, "learning_rate": 0.00011830836394260153, "loss": 2.562, "step": 2004 }, { "epoch": 0.5722988959430199, "grad_norm": 0.4453125, "learning_rate": 0.00011817556231602037, "loss": 2.5718, "step": 2005 }, { "epoch": 0.5725843318013456, "grad_norm": 0.416015625, "learning_rate": 0.00011804278680517561, "loss": 2.5428, "step": 2006 }, { "epoch": 0.5728697676596712, "grad_norm": 0.439453125, "learning_rate": 0.00011791003751902542, "loss": 2.5839, "step": 2007 }, { "epoch": 0.573155203517997, "grad_norm": 0.4609375, "learning_rate": 0.00011777731456650629, "loss": 2.5791, "step": 2008 }, { "epoch": 0.5734406393763226, "grad_norm": 0.43359375, "learning_rate": 0.00011764461805653324, "loss": 2.5559, "step": 2009 }, { "epoch": 0.5737260752346484, "grad_norm": 0.484375, "learning_rate": 0.00011751194809799949, "loss": 2.5588, "step": 2010 }, { "epoch": 0.574011511092974, "grad_norm": 0.47265625, "learning_rate": 0.00011737930479977658, "loss": 2.597, "step": 2011 }, { "epoch": 0.5742969469512997, "grad_norm": 0.474609375, "learning_rate": 0.00011724668827071413, "loss": 2.5619, "step": 2012 }, { "epoch": 0.5745823828096255, "grad_norm": 0.458984375, "learning_rate": 0.00011711409861963971, "loss": 2.5595, "step": 2013 }, { "epoch": 0.5748678186679511, "grad_norm": 0.478515625, "learning_rate": 0.00011698153595535897, "loss": 2.5641, "step": 2014 }, { "epoch": 0.5751532545262769, "grad_norm": 0.435546875, "learning_rate": 0.0001168490003866553, "loss": 2.5707, "step": 2015 }, { "epoch": 0.5754386903846025, "grad_norm": 0.490234375, "learning_rate": 0.00011671649202228988, "loss": 2.5486, "step": 2016 }, { "epoch": 0.5757241262429282, "grad_norm": 0.453125, "learning_rate": 0.00011658401097100161, "loss": 2.5753, "step": 2017 }, { "epoch": 0.5760095621012539, "grad_norm": 0.50390625, "learning_rate": 0.0001164515573415069, "loss": 2.5995, "step": 2018 }, { "epoch": 0.5762949979595796, "grad_norm": 0.4609375, "learning_rate": 0.00011631913124249981, "loss": 2.587, "step": 2019 }, { "epoch": 0.5765804338179052, "grad_norm": 0.439453125, "learning_rate": 0.00011618673278265168, "loss": 2.5885, "step": 2020 }, { "epoch": 0.576865869676231, "grad_norm": 0.435546875, "learning_rate": 0.00011605436207061112, "loss": 2.5741, "step": 2021 }, { "epoch": 0.5771513055345567, "grad_norm": 0.431640625, "learning_rate": 0.00011592201921500408, "loss": 2.5782, "step": 2022 }, { "epoch": 0.5774367413928824, "grad_norm": 0.42578125, "learning_rate": 0.00011578970432443364, "loss": 2.5819, "step": 2023 }, { "epoch": 0.5777221772512081, "grad_norm": 0.427734375, "learning_rate": 0.00011565741750747992, "loss": 2.5745, "step": 2024 }, { "epoch": 0.5780076131095337, "grad_norm": 0.455078125, "learning_rate": 0.00011552515887269992, "loss": 2.5694, "step": 2025 }, { "epoch": 0.5782930489678595, "grad_norm": 0.416015625, "learning_rate": 0.00011539292852862757, "loss": 2.5542, "step": 2026 }, { "epoch": 0.5785784848261851, "grad_norm": 0.396484375, "learning_rate": 0.0001152607265837737, "loss": 2.5776, "step": 2027 }, { "epoch": 0.5788639206845109, "grad_norm": 0.431640625, "learning_rate": 0.00011512855314662566, "loss": 2.555, "step": 2028 }, { "epoch": 0.5791493565428366, "grad_norm": 0.71484375, "learning_rate": 0.00011499640832564749, "loss": 2.5699, "step": 2029 }, { "epoch": 0.5794347924011622, "grad_norm": 0.44140625, "learning_rate": 0.00011486429222927976, "loss": 2.5698, "step": 2030 }, { "epoch": 0.579720228259488, "grad_norm": 0.427734375, "learning_rate": 0.00011473220496593937, "loss": 2.546, "step": 2031 }, { "epoch": 0.5800056641178136, "grad_norm": 0.439453125, "learning_rate": 0.0001146001466440197, "loss": 2.563, "step": 2032 }, { "epoch": 0.5802910999761394, "grad_norm": 0.4296875, "learning_rate": 0.00011446811737189029, "loss": 2.5682, "step": 2033 }, { "epoch": 0.580576535834465, "grad_norm": 0.44921875, "learning_rate": 0.0001143361172578968, "loss": 2.5643, "step": 2034 }, { "epoch": 0.5808619716927907, "grad_norm": 0.416015625, "learning_rate": 0.00011420414641036111, "loss": 2.5385, "step": 2035 }, { "epoch": 0.5811474075511164, "grad_norm": 0.453125, "learning_rate": 0.00011407220493758099, "loss": 2.5788, "step": 2036 }, { "epoch": 0.5814328434094421, "grad_norm": 0.4375, "learning_rate": 0.00011394029294783011, "loss": 2.5717, "step": 2037 }, { "epoch": 0.5817182792677679, "grad_norm": 0.46484375, "learning_rate": 0.00011380841054935789, "loss": 2.595, "step": 2038 }, { "epoch": 0.5820037151260935, "grad_norm": 0.484375, "learning_rate": 0.00011367655785038957, "loss": 2.5678, "step": 2039 }, { "epoch": 0.5822891509844192, "grad_norm": 0.427734375, "learning_rate": 0.00011354473495912596, "loss": 2.5785, "step": 2040 }, { "epoch": 0.5825745868427449, "grad_norm": 0.4453125, "learning_rate": 0.00011341294198374341, "loss": 2.5803, "step": 2041 }, { "epoch": 0.5828600227010706, "grad_norm": 0.451171875, "learning_rate": 0.00011328117903239376, "loss": 2.5802, "step": 2042 }, { "epoch": 0.5831454585593963, "grad_norm": 0.44140625, "learning_rate": 0.00011314944621320421, "loss": 2.5512, "step": 2043 }, { "epoch": 0.583430894417722, "grad_norm": 0.447265625, "learning_rate": 0.00011301774363427714, "loss": 2.5891, "step": 2044 }, { "epoch": 0.5837163302760477, "grad_norm": 0.4453125, "learning_rate": 0.00011288607140369021, "loss": 2.5855, "step": 2045 }, { "epoch": 0.5840017661343734, "grad_norm": 0.451171875, "learning_rate": 0.00011275442962949613, "loss": 2.5551, "step": 2046 }, { "epoch": 0.5842872019926991, "grad_norm": 0.4296875, "learning_rate": 0.00011262281841972272, "loss": 2.5605, "step": 2047 }, { "epoch": 0.5845726378510248, "grad_norm": 0.48046875, "learning_rate": 0.0001124912378823725, "loss": 2.5974, "step": 2048 }, { "epoch": 0.5848580737093505, "grad_norm": 0.482421875, "learning_rate": 0.00011235968812542298, "loss": 2.5483, "step": 2049 }, { "epoch": 0.5851435095676761, "grad_norm": 0.474609375, "learning_rate": 0.00011222816925682647, "loss": 2.5846, "step": 2050 }, { "epoch": 0.5854289454260019, "grad_norm": 0.490234375, "learning_rate": 0.00011209668138450979, "loss": 2.572, "step": 2051 }, { "epoch": 0.5857143812843275, "grad_norm": 0.466796875, "learning_rate": 0.00011196522461637439, "loss": 2.5609, "step": 2052 }, { "epoch": 0.5859998171426533, "grad_norm": 0.52734375, "learning_rate": 0.00011183379906029615, "loss": 2.5499, "step": 2053 }, { "epoch": 0.586285253000979, "grad_norm": 0.490234375, "learning_rate": 0.00011170240482412542, "loss": 2.5417, "step": 2054 }, { "epoch": 0.5865706888593046, "grad_norm": 0.5390625, "learning_rate": 0.00011157104201568677, "loss": 2.5613, "step": 2055 }, { "epoch": 0.5868561247176304, "grad_norm": 0.4609375, "learning_rate": 0.000111439710742779, "loss": 2.5377, "step": 2056 }, { "epoch": 0.587141560575956, "grad_norm": 0.5703125, "learning_rate": 0.00011130841111317501, "loss": 2.5511, "step": 2057 }, { "epoch": 0.5874269964342818, "grad_norm": 0.4296875, "learning_rate": 0.00011117714323462186, "loss": 2.581, "step": 2058 }, { "epoch": 0.5877124322926074, "grad_norm": 0.4921875, "learning_rate": 0.0001110459072148404, "loss": 2.556, "step": 2059 }, { "epoch": 0.5879978681509331, "grad_norm": 0.44140625, "learning_rate": 0.00011091470316152543, "loss": 2.5631, "step": 2060 }, { "epoch": 0.5882833040092589, "grad_norm": 0.4609375, "learning_rate": 0.00011078353118234542, "loss": 2.5587, "step": 2061 }, { "epoch": 0.5885687398675845, "grad_norm": 0.486328125, "learning_rate": 0.00011065239138494263, "loss": 2.5622, "step": 2062 }, { "epoch": 0.5888541757259103, "grad_norm": 0.421875, "learning_rate": 0.0001105212838769328, "loss": 2.5687, "step": 2063 }, { "epoch": 0.5891396115842359, "grad_norm": 0.458984375, "learning_rate": 0.00011039020876590535, "loss": 2.5541, "step": 2064 }, { "epoch": 0.5894250474425616, "grad_norm": 0.44140625, "learning_rate": 0.00011025916615942281, "loss": 2.5607, "step": 2065 }, { "epoch": 0.5897104833008873, "grad_norm": 0.423828125, "learning_rate": 0.00011012815616502145, "loss": 2.5617, "step": 2066 }, { "epoch": 0.589995919159213, "grad_norm": 0.46875, "learning_rate": 0.00010999717889021042, "loss": 2.5915, "step": 2067 }, { "epoch": 0.5902813550175386, "grad_norm": 0.408203125, "learning_rate": 0.00010986623444247216, "loss": 2.5686, "step": 2068 }, { "epoch": 0.5905667908758644, "grad_norm": 0.45703125, "learning_rate": 0.0001097353229292622, "loss": 2.5715, "step": 2069 }, { "epoch": 0.5908522267341901, "grad_norm": 0.44140625, "learning_rate": 0.00010960444445800901, "loss": 2.5551, "step": 2070 }, { "epoch": 0.5911376625925158, "grad_norm": 0.4140625, "learning_rate": 0.0001094735991361139, "loss": 2.5485, "step": 2071 }, { "epoch": 0.5914230984508415, "grad_norm": 0.453125, "learning_rate": 0.00010934278707095103, "loss": 2.5534, "step": 2072 }, { "epoch": 0.5917085343091671, "grad_norm": 0.427734375, "learning_rate": 0.00010921200836986727, "loss": 2.56, "step": 2073 }, { "epoch": 0.5919939701674929, "grad_norm": 0.435546875, "learning_rate": 0.00010908126314018212, "loss": 2.5518, "step": 2074 }, { "epoch": 0.5922794060258185, "grad_norm": 0.455078125, "learning_rate": 0.00010895055148918756, "loss": 2.587, "step": 2075 }, { "epoch": 0.5925648418841443, "grad_norm": 0.419921875, "learning_rate": 0.00010881987352414806, "loss": 2.5573, "step": 2076 }, { "epoch": 0.59285027774247, "grad_norm": 0.439453125, "learning_rate": 0.00010868922935230049, "loss": 2.5569, "step": 2077 }, { "epoch": 0.5931357136007956, "grad_norm": 0.462890625, "learning_rate": 0.00010855861908085383, "loss": 2.5437, "step": 2078 }, { "epoch": 0.5934211494591214, "grad_norm": 0.4296875, "learning_rate": 0.00010842804281698937, "loss": 2.554, "step": 2079 }, { "epoch": 0.593706585317447, "grad_norm": 0.46875, "learning_rate": 0.00010829750066786052, "loss": 2.5834, "step": 2080 }, { "epoch": 0.5939920211757728, "grad_norm": 0.4140625, "learning_rate": 0.00010816699274059255, "loss": 2.5947, "step": 2081 }, { "epoch": 0.5942774570340984, "grad_norm": 0.470703125, "learning_rate": 0.00010803651914228285, "loss": 2.557, "step": 2082 }, { "epoch": 0.5945628928924241, "grad_norm": 0.400390625, "learning_rate": 0.00010790607998000048, "loss": 2.5781, "step": 2083 }, { "epoch": 0.5948483287507498, "grad_norm": 0.455078125, "learning_rate": 0.00010777567536078623, "loss": 2.57, "step": 2084 }, { "epoch": 0.5951337646090755, "grad_norm": 0.42578125, "learning_rate": 0.0001076453053916527, "loss": 2.5555, "step": 2085 }, { "epoch": 0.5954192004674013, "grad_norm": 0.4296875, "learning_rate": 0.00010751497017958385, "loss": 2.6032, "step": 2086 }, { "epoch": 0.5957046363257269, "grad_norm": 0.5546875, "learning_rate": 0.00010738466983153533, "loss": 2.5711, "step": 2087 }, { "epoch": 0.5959900721840526, "grad_norm": 0.439453125, "learning_rate": 0.000107254404454434, "loss": 2.5851, "step": 2088 }, { "epoch": 0.5962755080423783, "grad_norm": 0.49609375, "learning_rate": 0.00010712417415517808, "loss": 2.5805, "step": 2089 }, { "epoch": 0.596560943900704, "grad_norm": 0.451171875, "learning_rate": 0.00010699397904063708, "loss": 2.5809, "step": 2090 }, { "epoch": 0.5968463797590297, "grad_norm": 0.57421875, "learning_rate": 0.00010686381921765158, "loss": 2.5796, "step": 2091 }, { "epoch": 0.5971318156173554, "grad_norm": 0.462890625, "learning_rate": 0.00010673369479303315, "loss": 2.5641, "step": 2092 }, { "epoch": 0.597417251475681, "grad_norm": 0.42578125, "learning_rate": 0.00010660360587356438, "loss": 2.5651, "step": 2093 }, { "epoch": 0.5977026873340068, "grad_norm": 0.44921875, "learning_rate": 0.00010647355256599877, "loss": 2.5639, "step": 2094 }, { "epoch": 0.5979881231923325, "grad_norm": 0.423828125, "learning_rate": 0.00010634353497706037, "loss": 2.5482, "step": 2095 }, { "epoch": 0.5982735590506582, "grad_norm": 0.439453125, "learning_rate": 0.0001062135532134442, "loss": 2.5762, "step": 2096 }, { "epoch": 0.5985589949089839, "grad_norm": 0.419921875, "learning_rate": 0.0001060836073818157, "loss": 2.573, "step": 2097 }, { "epoch": 0.5988444307673095, "grad_norm": 0.4453125, "learning_rate": 0.00010595369758881091, "loss": 2.5582, "step": 2098 }, { "epoch": 0.5991298666256353, "grad_norm": 0.455078125, "learning_rate": 0.00010582382394103628, "loss": 2.6, "step": 2099 }, { "epoch": 0.5994153024839609, "grad_norm": 0.400390625, "learning_rate": 0.0001056939865450686, "loss": 2.573, "step": 2100 }, { "epoch": 0.5997007383422867, "grad_norm": 0.419921875, "learning_rate": 0.00010556418550745482, "loss": 2.5422, "step": 2101 }, { "epoch": 0.5999861742006124, "grad_norm": 0.427734375, "learning_rate": 0.00010543442093471218, "loss": 2.5682, "step": 2102 }, { "epoch": 0.600271610058938, "grad_norm": 0.451171875, "learning_rate": 0.00010530469293332797, "loss": 2.563, "step": 2103 }, { "epoch": 0.6005570459172638, "grad_norm": 0.41015625, "learning_rate": 0.00010517500160975935, "loss": 2.5584, "step": 2104 }, { "epoch": 0.6008424817755894, "grad_norm": 0.4296875, "learning_rate": 0.00010504534707043357, "loss": 2.5646, "step": 2105 }, { "epoch": 0.6011279176339152, "grad_norm": 0.447265625, "learning_rate": 0.00010491572942174763, "loss": 2.5812, "step": 2106 }, { "epoch": 0.6014133534922408, "grad_norm": 0.46875, "learning_rate": 0.00010478614877006813, "loss": 2.5652, "step": 2107 }, { "epoch": 0.6016987893505665, "grad_norm": 0.443359375, "learning_rate": 0.00010465660522173144, "loss": 2.5468, "step": 2108 }, { "epoch": 0.6019842252088922, "grad_norm": 0.4140625, "learning_rate": 0.00010452709888304347, "loss": 2.5424, "step": 2109 }, { "epoch": 0.6022696610672179, "grad_norm": 0.43359375, "learning_rate": 0.0001043976298602796, "loss": 2.579, "step": 2110 }, { "epoch": 0.6025550969255437, "grad_norm": 0.45703125, "learning_rate": 0.00010426819825968449, "loss": 2.5618, "step": 2111 }, { "epoch": 0.6028405327838693, "grad_norm": 0.421875, "learning_rate": 0.00010413880418747215, "loss": 2.5656, "step": 2112 }, { "epoch": 0.603125968642195, "grad_norm": 0.4609375, "learning_rate": 0.00010400944774982593, "loss": 2.5724, "step": 2113 }, { "epoch": 0.6034114045005207, "grad_norm": 0.435546875, "learning_rate": 0.00010388012905289808, "loss": 2.5452, "step": 2114 }, { "epoch": 0.6036968403588464, "grad_norm": 0.41796875, "learning_rate": 0.00010375084820280998, "loss": 2.5538, "step": 2115 }, { "epoch": 0.603982276217172, "grad_norm": 0.4296875, "learning_rate": 0.00010362160530565197, "loss": 2.5399, "step": 2116 }, { "epoch": 0.6042677120754978, "grad_norm": 0.42578125, "learning_rate": 0.00010349240046748324, "loss": 2.5613, "step": 2117 }, { "epoch": 0.6045531479338235, "grad_norm": 0.412109375, "learning_rate": 0.00010336323379433165, "loss": 2.5742, "step": 2118 }, { "epoch": 0.6048385837921492, "grad_norm": 0.41015625, "learning_rate": 0.00010323410539219388, "loss": 2.5627, "step": 2119 }, { "epoch": 0.6051240196504749, "grad_norm": 0.412109375, "learning_rate": 0.00010310501536703507, "loss": 2.5675, "step": 2120 }, { "epoch": 0.6054094555088005, "grad_norm": 0.412109375, "learning_rate": 0.00010297596382478906, "loss": 2.5845, "step": 2121 }, { "epoch": 0.6056948913671263, "grad_norm": 0.419921875, "learning_rate": 0.00010284695087135791, "loss": 2.5579, "step": 2122 }, { "epoch": 0.6059803272254519, "grad_norm": 0.423828125, "learning_rate": 0.00010271797661261215, "loss": 2.5864, "step": 2123 }, { "epoch": 0.6062657630837777, "grad_norm": 0.390625, "learning_rate": 0.0001025890411543904, "loss": 2.5851, "step": 2124 }, { "epoch": 0.6065511989421033, "grad_norm": 0.412109375, "learning_rate": 0.00010246014460249964, "loss": 2.5753, "step": 2125 }, { "epoch": 0.606836634800429, "grad_norm": 0.404296875, "learning_rate": 0.00010233128706271475, "loss": 2.5756, "step": 2126 }, { "epoch": 0.6071220706587548, "grad_norm": 0.380859375, "learning_rate": 0.00010220246864077875, "loss": 2.5755, "step": 2127 }, { "epoch": 0.6074075065170804, "grad_norm": 0.384765625, "learning_rate": 0.00010207368944240234, "loss": 2.5598, "step": 2128 }, { "epoch": 0.6076929423754062, "grad_norm": 0.4140625, "learning_rate": 0.00010194494957326434, "loss": 2.564, "step": 2129 }, { "epoch": 0.6079783782337318, "grad_norm": 0.388671875, "learning_rate": 0.00010181624913901099, "loss": 2.5546, "step": 2130 }, { "epoch": 0.6082638140920575, "grad_norm": 0.38671875, "learning_rate": 0.0001016875882452564, "loss": 2.5709, "step": 2131 }, { "epoch": 0.6085492499503832, "grad_norm": 0.42578125, "learning_rate": 0.00010155896699758206, "loss": 2.5293, "step": 2132 }, { "epoch": 0.6088346858087089, "grad_norm": 0.384765625, "learning_rate": 0.00010143038550153703, "loss": 2.5746, "step": 2133 }, { "epoch": 0.6091201216670347, "grad_norm": 0.45703125, "learning_rate": 0.0001013018438626378, "loss": 2.5632, "step": 2134 }, { "epoch": 0.6094055575253603, "grad_norm": 0.408203125, "learning_rate": 0.00010117334218636793, "loss": 2.5465, "step": 2135 }, { "epoch": 0.609690993383686, "grad_norm": 0.400390625, "learning_rate": 0.00010104488057817839, "loss": 2.5461, "step": 2136 }, { "epoch": 0.6099764292420117, "grad_norm": 0.408203125, "learning_rate": 0.00010091645914348724, "loss": 2.5891, "step": 2137 }, { "epoch": 0.6102618651003374, "grad_norm": 0.412109375, "learning_rate": 0.00010078807798767953, "loss": 2.5954, "step": 2138 }, { "epoch": 0.610547300958663, "grad_norm": 0.4140625, "learning_rate": 0.00010065973721610727, "loss": 2.5611, "step": 2139 }, { "epoch": 0.6108327368169888, "grad_norm": 0.392578125, "learning_rate": 0.00010053143693408932, "loss": 2.5958, "step": 2140 }, { "epoch": 0.6111181726753144, "grad_norm": 0.41015625, "learning_rate": 0.00010040317724691133, "loss": 2.5734, "step": 2141 }, { "epoch": 0.6114036085336402, "grad_norm": 0.40625, "learning_rate": 0.00010027495825982558, "loss": 2.5665, "step": 2142 }, { "epoch": 0.6116890443919659, "grad_norm": 0.388671875, "learning_rate": 0.00010014678007805106, "loss": 2.5597, "step": 2143 }, { "epoch": 0.6119744802502916, "grad_norm": 0.4140625, "learning_rate": 0.00010001864280677316, "loss": 2.5883, "step": 2144 }, { "epoch": 0.6122599161086173, "grad_norm": 0.41015625, "learning_rate": 9.989054655114383e-05, "loss": 2.5357, "step": 2145 }, { "epoch": 0.6125453519669429, "grad_norm": 0.40625, "learning_rate": 9.976249141628124e-05, "loss": 2.5692, "step": 2146 }, { "epoch": 0.6128307878252687, "grad_norm": 0.4296875, "learning_rate": 9.963447750726984e-05, "loss": 2.5544, "step": 2147 }, { "epoch": 0.6131162236835943, "grad_norm": 0.390625, "learning_rate": 9.95065049291603e-05, "loss": 2.5472, "step": 2148 }, { "epoch": 0.61340165954192, "grad_norm": 0.3984375, "learning_rate": 9.937857378696932e-05, "loss": 2.6036, "step": 2149 }, { "epoch": 0.6136870954002458, "grad_norm": 0.40234375, "learning_rate": 9.925068418567967e-05, "loss": 2.5645, "step": 2150 }, { "epoch": 0.6139725312585714, "grad_norm": 0.396484375, "learning_rate": 9.912283623023988e-05, "loss": 2.5646, "step": 2151 }, { "epoch": 0.6142579671168972, "grad_norm": 0.4140625, "learning_rate": 9.899503002556442e-05, "loss": 2.5792, "step": 2152 }, { "epoch": 0.6145434029752228, "grad_norm": 0.39453125, "learning_rate": 9.886726567653362e-05, "loss": 2.5629, "step": 2153 }, { "epoch": 0.6148288388335486, "grad_norm": 0.4375, "learning_rate": 9.87395432879932e-05, "loss": 2.5558, "step": 2154 }, { "epoch": 0.6151142746918742, "grad_norm": 0.416015625, "learning_rate": 9.861186296475458e-05, "loss": 2.5663, "step": 2155 }, { "epoch": 0.6153997105501999, "grad_norm": 0.390625, "learning_rate": 9.84842248115947e-05, "loss": 2.5347, "step": 2156 }, { "epoch": 0.6156851464085256, "grad_norm": 0.3828125, "learning_rate": 9.835662893325584e-05, "loss": 2.5608, "step": 2157 }, { "epoch": 0.6159705822668513, "grad_norm": 0.3984375, "learning_rate": 9.822907543444553e-05, "loss": 2.5695, "step": 2158 }, { "epoch": 0.616256018125177, "grad_norm": 0.376953125, "learning_rate": 9.810156441983665e-05, "loss": 2.5549, "step": 2159 }, { "epoch": 0.6165414539835027, "grad_norm": 0.41015625, "learning_rate": 9.797409599406709e-05, "loss": 2.5916, "step": 2160 }, { "epoch": 0.6168268898418284, "grad_norm": 0.4140625, "learning_rate": 9.784667026173993e-05, "loss": 2.546, "step": 2161 }, { "epoch": 0.6171123257001541, "grad_norm": 0.380859375, "learning_rate": 9.771928732742313e-05, "loss": 2.5728, "step": 2162 }, { "epoch": 0.6173977615584798, "grad_norm": 0.376953125, "learning_rate": 9.759194729564954e-05, "loss": 2.5711, "step": 2163 }, { "epoch": 0.6176831974168054, "grad_norm": 0.421875, "learning_rate": 9.746465027091676e-05, "loss": 2.5335, "step": 2164 }, { "epoch": 0.6179686332751312, "grad_norm": 0.376953125, "learning_rate": 9.733739635768714e-05, "loss": 2.5583, "step": 2165 }, { "epoch": 0.6182540691334568, "grad_norm": 0.404296875, "learning_rate": 9.721018566038767e-05, "loss": 2.537, "step": 2166 }, { "epoch": 0.6185395049917826, "grad_norm": 0.421875, "learning_rate": 9.708301828340993e-05, "loss": 2.5576, "step": 2167 }, { "epoch": 0.6188249408501083, "grad_norm": 0.388671875, "learning_rate": 9.695589433110968e-05, "loss": 2.5786, "step": 2168 }, { "epoch": 0.6191103767084339, "grad_norm": 0.37890625, "learning_rate": 9.682881390780749e-05, "loss": 2.584, "step": 2169 }, { "epoch": 0.6193958125667597, "grad_norm": 0.41796875, "learning_rate": 9.67017771177878e-05, "loss": 2.5681, "step": 2170 }, { "epoch": 0.6196812484250853, "grad_norm": 0.392578125, "learning_rate": 9.657478406529946e-05, "loss": 2.553, "step": 2171 }, { "epoch": 0.6199666842834111, "grad_norm": 0.390625, "learning_rate": 9.644783485455537e-05, "loss": 2.5665, "step": 2172 }, { "epoch": 0.6202521201417367, "grad_norm": 0.39453125, "learning_rate": 9.632092958973246e-05, "loss": 2.5572, "step": 2173 }, { "epoch": 0.6205375560000624, "grad_norm": 0.40234375, "learning_rate": 9.61940683749716e-05, "loss": 2.5576, "step": 2174 }, { "epoch": 0.6208229918583882, "grad_norm": 0.3828125, "learning_rate": 9.606725131437739e-05, "loss": 2.5667, "step": 2175 }, { "epoch": 0.6211084277167138, "grad_norm": 0.400390625, "learning_rate": 9.594047851201855e-05, "loss": 2.5688, "step": 2176 }, { "epoch": 0.6213938635750396, "grad_norm": 0.38671875, "learning_rate": 9.581375007192705e-05, "loss": 2.5627, "step": 2177 }, { "epoch": 0.6216792994333652, "grad_norm": 0.400390625, "learning_rate": 9.568706609809872e-05, "loss": 2.5918, "step": 2178 }, { "epoch": 0.6219647352916909, "grad_norm": 0.396484375, "learning_rate": 9.556042669449281e-05, "loss": 2.5662, "step": 2179 }, { "epoch": 0.6222501711500166, "grad_norm": 0.396484375, "learning_rate": 9.543383196503206e-05, "loss": 2.5345, "step": 2180 }, { "epoch": 0.6225356070083423, "grad_norm": 0.40234375, "learning_rate": 9.530728201360244e-05, "loss": 2.5612, "step": 2181 }, { "epoch": 0.622821042866668, "grad_norm": 0.390625, "learning_rate": 9.518077694405322e-05, "loss": 2.5691, "step": 2182 }, { "epoch": 0.6231064787249937, "grad_norm": 0.40234375, "learning_rate": 9.505431686019692e-05, "loss": 2.5599, "step": 2183 }, { "epoch": 0.6233919145833194, "grad_norm": 0.39453125, "learning_rate": 9.492790186580906e-05, "loss": 2.5384, "step": 2184 }, { "epoch": 0.6236773504416451, "grad_norm": 0.388671875, "learning_rate": 9.480153206462817e-05, "loss": 2.5833, "step": 2185 }, { "epoch": 0.6239627862999708, "grad_norm": 0.3828125, "learning_rate": 9.467520756035575e-05, "loss": 2.5582, "step": 2186 }, { "epoch": 0.6242482221582965, "grad_norm": 0.390625, "learning_rate": 9.454892845665603e-05, "loss": 2.5327, "step": 2187 }, { "epoch": 0.6245336580166222, "grad_norm": 0.41015625, "learning_rate": 9.442269485715602e-05, "loss": 2.5675, "step": 2188 }, { "epoch": 0.6248190938749478, "grad_norm": 0.38671875, "learning_rate": 9.429650686544546e-05, "loss": 2.5706, "step": 2189 }, { "epoch": 0.6251045297332736, "grad_norm": 0.41015625, "learning_rate": 9.417036458507658e-05, "loss": 2.5732, "step": 2190 }, { "epoch": 0.6253899655915993, "grad_norm": 0.40234375, "learning_rate": 9.404426811956404e-05, "loss": 2.57, "step": 2191 }, { "epoch": 0.625675401449925, "grad_norm": 0.40234375, "learning_rate": 9.391821757238511e-05, "loss": 2.5336, "step": 2192 }, { "epoch": 0.6259608373082507, "grad_norm": 0.40625, "learning_rate": 9.379221304697925e-05, "loss": 2.5533, "step": 2193 }, { "epoch": 0.6262462731665763, "grad_norm": 0.40234375, "learning_rate": 9.366625464674811e-05, "loss": 2.5648, "step": 2194 }, { "epoch": 0.6265317090249021, "grad_norm": 0.40625, "learning_rate": 9.354034247505556e-05, "loss": 2.5672, "step": 2195 }, { "epoch": 0.6268171448832277, "grad_norm": 0.40234375, "learning_rate": 9.341447663522749e-05, "loss": 2.5789, "step": 2196 }, { "epoch": 0.6271025807415535, "grad_norm": 0.384765625, "learning_rate": 9.328865723055185e-05, "loss": 2.5557, "step": 2197 }, { "epoch": 0.6273880165998791, "grad_norm": 0.431640625, "learning_rate": 9.316288436427834e-05, "loss": 2.5479, "step": 2198 }, { "epoch": 0.6276734524582048, "grad_norm": 0.40234375, "learning_rate": 9.30371581396186e-05, "loss": 2.5853, "step": 2199 }, { "epoch": 0.6279588883165306, "grad_norm": 0.380859375, "learning_rate": 9.291147865974599e-05, "loss": 2.588, "step": 2200 }, { "epoch": 0.6282443241748562, "grad_norm": 0.37890625, "learning_rate": 9.278584602779541e-05, "loss": 2.5675, "step": 2201 }, { "epoch": 0.628529760033182, "grad_norm": 0.396484375, "learning_rate": 9.266026034686341e-05, "loss": 2.59, "step": 2202 }, { "epoch": 0.6288151958915076, "grad_norm": 0.44140625, "learning_rate": 9.253472172000802e-05, "loss": 2.5578, "step": 2203 }, { "epoch": 0.6291006317498333, "grad_norm": 0.40234375, "learning_rate": 9.240923025024853e-05, "loss": 2.5348, "step": 2204 }, { "epoch": 0.629386067608159, "grad_norm": 0.423828125, "learning_rate": 9.228378604056568e-05, "loss": 2.5759, "step": 2205 }, { "epoch": 0.6296715034664847, "grad_norm": 0.416015625, "learning_rate": 9.215838919390132e-05, "loss": 2.5559, "step": 2206 }, { "epoch": 0.6299569393248104, "grad_norm": 0.41015625, "learning_rate": 9.203303981315847e-05, "loss": 2.5611, "step": 2207 }, { "epoch": 0.6302423751831361, "grad_norm": 0.41015625, "learning_rate": 9.190773800120126e-05, "loss": 2.5746, "step": 2208 }, { "epoch": 0.6305278110414618, "grad_norm": 0.396484375, "learning_rate": 9.178248386085474e-05, "loss": 2.5519, "step": 2209 }, { "epoch": 0.6308132468997875, "grad_norm": 0.408203125, "learning_rate": 9.165727749490477e-05, "loss": 2.5576, "step": 2210 }, { "epoch": 0.6310986827581132, "grad_norm": 0.408203125, "learning_rate": 9.15321190060981e-05, "loss": 2.5854, "step": 2211 }, { "epoch": 0.6313841186164388, "grad_norm": 0.404296875, "learning_rate": 9.140700849714216e-05, "loss": 2.5661, "step": 2212 }, { "epoch": 0.6316695544747646, "grad_norm": 0.41015625, "learning_rate": 9.128194607070498e-05, "loss": 2.5572, "step": 2213 }, { "epoch": 0.6319549903330902, "grad_norm": 0.404296875, "learning_rate": 9.115693182941518e-05, "loss": 2.5889, "step": 2214 }, { "epoch": 0.632240426191416, "grad_norm": 0.421875, "learning_rate": 9.103196587586172e-05, "loss": 2.5474, "step": 2215 }, { "epoch": 0.6325258620497417, "grad_norm": 0.412109375, "learning_rate": 9.090704831259422e-05, "loss": 2.5664, "step": 2216 }, { "epoch": 0.6328112979080673, "grad_norm": 0.376953125, "learning_rate": 9.078217924212224e-05, "loss": 2.5648, "step": 2217 }, { "epoch": 0.6330967337663931, "grad_norm": 0.412109375, "learning_rate": 9.065735876691578e-05, "loss": 2.5675, "step": 2218 }, { "epoch": 0.6333821696247187, "grad_norm": 0.39453125, "learning_rate": 9.053258698940484e-05, "loss": 2.5783, "step": 2219 }, { "epoch": 0.6336676054830445, "grad_norm": 0.4140625, "learning_rate": 9.040786401197957e-05, "loss": 2.561, "step": 2220 }, { "epoch": 0.6339530413413701, "grad_norm": 0.390625, "learning_rate": 9.028318993698993e-05, "loss": 2.5814, "step": 2221 }, { "epoch": 0.6342384771996958, "grad_norm": 0.421875, "learning_rate": 9.015856486674587e-05, "loss": 2.6124, "step": 2222 }, { "epoch": 0.6345239130580216, "grad_norm": 0.458984375, "learning_rate": 9.003398890351704e-05, "loss": 2.5395, "step": 2223 }, { "epoch": 0.6348093489163472, "grad_norm": 0.400390625, "learning_rate": 8.99094621495329e-05, "loss": 2.5417, "step": 2224 }, { "epoch": 0.635094784774673, "grad_norm": 0.388671875, "learning_rate": 8.978498470698244e-05, "loss": 2.5751, "step": 2225 }, { "epoch": 0.6353802206329986, "grad_norm": 0.439453125, "learning_rate": 8.966055667801422e-05, "loss": 2.5614, "step": 2226 }, { "epoch": 0.6356656564913243, "grad_norm": 0.423828125, "learning_rate": 8.95361781647362e-05, "loss": 2.5633, "step": 2227 }, { "epoch": 0.63595109234965, "grad_norm": 0.396484375, "learning_rate": 8.941184926921576e-05, "loss": 2.5668, "step": 2228 }, { "epoch": 0.6362365282079757, "grad_norm": 0.384765625, "learning_rate": 8.928757009347956e-05, "loss": 2.5793, "step": 2229 }, { "epoch": 0.6365219640663013, "grad_norm": 0.373046875, "learning_rate": 8.916334073951345e-05, "loss": 2.5548, "step": 2230 }, { "epoch": 0.6368073999246271, "grad_norm": 0.419921875, "learning_rate": 8.90391613092623e-05, "loss": 2.5783, "step": 2231 }, { "epoch": 0.6370928357829528, "grad_norm": 0.419921875, "learning_rate": 8.891503190463024e-05, "loss": 2.5809, "step": 2232 }, { "epoch": 0.6373782716412785, "grad_norm": 0.390625, "learning_rate": 8.879095262748018e-05, "loss": 2.5614, "step": 2233 }, { "epoch": 0.6376637074996042, "grad_norm": 0.41796875, "learning_rate": 8.866692357963387e-05, "loss": 2.5739, "step": 2234 }, { "epoch": 0.6379491433579298, "grad_norm": 0.416015625, "learning_rate": 8.854294486287188e-05, "loss": 2.5764, "step": 2235 }, { "epoch": 0.6382345792162556, "grad_norm": 0.4375, "learning_rate": 8.84190165789336e-05, "loss": 2.5702, "step": 2236 }, { "epoch": 0.6385200150745812, "grad_norm": 0.40625, "learning_rate": 8.829513882951686e-05, "loss": 2.5682, "step": 2237 }, { "epoch": 0.638805450932907, "grad_norm": 0.423828125, "learning_rate": 8.8171311716278e-05, "loss": 2.5557, "step": 2238 }, { "epoch": 0.6390908867912326, "grad_norm": 0.42578125, "learning_rate": 8.804753534083208e-05, "loss": 2.5917, "step": 2239 }, { "epoch": 0.6393763226495583, "grad_norm": 0.390625, "learning_rate": 8.79238098047522e-05, "loss": 2.5776, "step": 2240 }, { "epoch": 0.6396617585078841, "grad_norm": 0.3984375, "learning_rate": 8.780013520956996e-05, "loss": 2.5412, "step": 2241 }, { "epoch": 0.6399471943662097, "grad_norm": 0.423828125, "learning_rate": 8.767651165677502e-05, "loss": 2.572, "step": 2242 }, { "epoch": 0.6402326302245355, "grad_norm": 0.388671875, "learning_rate": 8.755293924781523e-05, "loss": 2.5363, "step": 2243 }, { "epoch": 0.6405180660828611, "grad_norm": 0.390625, "learning_rate": 8.742941808409647e-05, "loss": 2.5623, "step": 2244 }, { "epoch": 0.6408035019411868, "grad_norm": 0.404296875, "learning_rate": 8.730594826698253e-05, "loss": 2.551, "step": 2245 }, { "epoch": 0.6410889377995125, "grad_norm": 0.37109375, "learning_rate": 8.718252989779496e-05, "loss": 2.5181, "step": 2246 }, { "epoch": 0.6413743736578382, "grad_norm": 0.396484375, "learning_rate": 8.705916307781344e-05, "loss": 2.5543, "step": 2247 }, { "epoch": 0.641659809516164, "grad_norm": 0.392578125, "learning_rate": 8.6935847908275e-05, "loss": 2.5636, "step": 2248 }, { "epoch": 0.6419452453744896, "grad_norm": 0.416015625, "learning_rate": 8.681258449037438e-05, "loss": 2.5439, "step": 2249 }, { "epoch": 0.6422306812328153, "grad_norm": 0.396484375, "learning_rate": 8.668937292526394e-05, "loss": 2.5287, "step": 2250 }, { "epoch": 0.6422306812328153, "eval_loss": 2.4652860164642334, "eval_runtime": 6001.1587, "eval_samples_per_second": 10.712, "eval_steps_per_second": 10.712, "step": 2250 }, { "epoch": 0.642516117091141, "grad_norm": 0.400390625, "learning_rate": 8.656621331405339e-05, "loss": 2.5401, "step": 2251 }, { "epoch": 0.6428015529494667, "grad_norm": 0.373046875, "learning_rate": 8.644310575780979e-05, "loss": 2.5709, "step": 2252 }, { "epoch": 0.6430869888077924, "grad_norm": 0.37890625, "learning_rate": 8.632005035755766e-05, "loss": 2.6213, "step": 2253 }, { "epoch": 0.6433724246661181, "grad_norm": 0.38671875, "learning_rate": 8.619704721427843e-05, "loss": 2.5512, "step": 2254 }, { "epoch": 0.6436578605244437, "grad_norm": 0.376953125, "learning_rate": 8.607409642891091e-05, "loss": 2.563, "step": 2255 }, { "epoch": 0.6439432963827695, "grad_norm": 0.39453125, "learning_rate": 8.595119810235088e-05, "loss": 2.5438, "step": 2256 }, { "epoch": 0.6442287322410952, "grad_norm": 0.38671875, "learning_rate": 8.582835233545093e-05, "loss": 2.5563, "step": 2257 }, { "epoch": 0.6445141680994209, "grad_norm": 0.38671875, "learning_rate": 8.570555922902074e-05, "loss": 2.5278, "step": 2258 }, { "epoch": 0.6447996039577466, "grad_norm": 0.388671875, "learning_rate": 8.558281888382659e-05, "loss": 2.5753, "step": 2259 }, { "epoch": 0.6450850398160722, "grad_norm": 0.380859375, "learning_rate": 8.546013140059148e-05, "loss": 2.5751, "step": 2260 }, { "epoch": 0.645370475674398, "grad_norm": 0.37890625, "learning_rate": 8.53374968799952e-05, "loss": 2.5553, "step": 2261 }, { "epoch": 0.6456559115327236, "grad_norm": 0.3828125, "learning_rate": 8.521491542267386e-05, "loss": 2.5534, "step": 2262 }, { "epoch": 0.6459413473910494, "grad_norm": 0.37890625, "learning_rate": 8.509238712922014e-05, "loss": 2.5781, "step": 2263 }, { "epoch": 0.6462267832493751, "grad_norm": 0.365234375, "learning_rate": 8.496991210018319e-05, "loss": 2.5595, "step": 2264 }, { "epoch": 0.6465122191077007, "grad_norm": 0.390625, "learning_rate": 8.484749043606824e-05, "loss": 2.5502, "step": 2265 }, { "epoch": 0.6467976549660265, "grad_norm": 0.3671875, "learning_rate": 8.472512223733679e-05, "loss": 2.5458, "step": 2266 }, { "epoch": 0.6470830908243521, "grad_norm": 0.375, "learning_rate": 8.460280760440664e-05, "loss": 2.5653, "step": 2267 }, { "epoch": 0.6473685266826779, "grad_norm": 0.361328125, "learning_rate": 8.448054663765135e-05, "loss": 2.5727, "step": 2268 }, { "epoch": 0.6476539625410035, "grad_norm": 0.390625, "learning_rate": 8.435833943740064e-05, "loss": 2.5665, "step": 2269 }, { "epoch": 0.6479393983993292, "grad_norm": 0.390625, "learning_rate": 8.423618610394004e-05, "loss": 2.5411, "step": 2270 }, { "epoch": 0.6482248342576549, "grad_norm": 0.375, "learning_rate": 8.411408673751096e-05, "loss": 2.5636, "step": 2271 }, { "epoch": 0.6485102701159806, "grad_norm": 0.369140625, "learning_rate": 8.399204143831036e-05, "loss": 2.5729, "step": 2272 }, { "epoch": 0.6487957059743064, "grad_norm": 0.37890625, "learning_rate": 8.387005030649102e-05, "loss": 2.5837, "step": 2273 }, { "epoch": 0.649081141832632, "grad_norm": 0.375, "learning_rate": 8.374811344216105e-05, "loss": 2.5646, "step": 2274 }, { "epoch": 0.6493665776909577, "grad_norm": 0.380859375, "learning_rate": 8.362623094538428e-05, "loss": 2.5886, "step": 2275 }, { "epoch": 0.6496520135492834, "grad_norm": 0.39453125, "learning_rate": 8.350440291617974e-05, "loss": 2.5494, "step": 2276 }, { "epoch": 0.6499374494076091, "grad_norm": 0.400390625, "learning_rate": 8.338262945452176e-05, "loss": 2.5577, "step": 2277 }, { "epoch": 0.6502228852659347, "grad_norm": 0.369140625, "learning_rate": 8.326091066033998e-05, "loss": 2.5796, "step": 2278 }, { "epoch": 0.6505083211242605, "grad_norm": 0.376953125, "learning_rate": 8.313924663351926e-05, "loss": 2.574, "step": 2279 }, { "epoch": 0.6507937569825862, "grad_norm": 0.38671875, "learning_rate": 8.301763747389925e-05, "loss": 2.5544, "step": 2280 }, { "epoch": 0.6510791928409119, "grad_norm": 0.36328125, "learning_rate": 8.289608328127483e-05, "loss": 2.5358, "step": 2281 }, { "epoch": 0.6513646286992376, "grad_norm": 0.38671875, "learning_rate": 8.277458415539569e-05, "loss": 2.5567, "step": 2282 }, { "epoch": 0.6516500645575632, "grad_norm": 0.375, "learning_rate": 8.265314019596617e-05, "loss": 2.5566, "step": 2283 }, { "epoch": 0.651935500415889, "grad_norm": 0.369140625, "learning_rate": 8.253175150264565e-05, "loss": 2.5591, "step": 2284 }, { "epoch": 0.6522209362742146, "grad_norm": 0.375, "learning_rate": 8.241041817504791e-05, "loss": 2.5519, "step": 2285 }, { "epoch": 0.6525063721325404, "grad_norm": 0.380859375, "learning_rate": 8.228914031274128e-05, "loss": 2.5378, "step": 2286 }, { "epoch": 0.652791807990866, "grad_norm": 0.392578125, "learning_rate": 8.21679180152489e-05, "loss": 2.5576, "step": 2287 }, { "epoch": 0.6530772438491917, "grad_norm": 0.361328125, "learning_rate": 8.204675138204794e-05, "loss": 2.5636, "step": 2288 }, { "epoch": 0.6533626797075175, "grad_norm": 0.37109375, "learning_rate": 8.192564051257001e-05, "loss": 2.5682, "step": 2289 }, { "epoch": 0.6536481155658431, "grad_norm": 0.376953125, "learning_rate": 8.180458550620109e-05, "loss": 2.5616, "step": 2290 }, { "epoch": 0.6539335514241689, "grad_norm": 0.3671875, "learning_rate": 8.168358646228115e-05, "loss": 2.5503, "step": 2291 }, { "epoch": 0.6542189872824945, "grad_norm": 0.3828125, "learning_rate": 8.156264348010425e-05, "loss": 2.548, "step": 2292 }, { "epoch": 0.6545044231408202, "grad_norm": 0.365234375, "learning_rate": 8.144175665891858e-05, "loss": 2.5327, "step": 2293 }, { "epoch": 0.6547898589991459, "grad_norm": 0.369140625, "learning_rate": 8.132092609792608e-05, "loss": 2.5491, "step": 2294 }, { "epoch": 0.6550752948574716, "grad_norm": 0.373046875, "learning_rate": 8.120015189628259e-05, "loss": 2.5576, "step": 2295 }, { "epoch": 0.6553607307157974, "grad_norm": 0.375, "learning_rate": 8.107943415309786e-05, "loss": 2.5687, "step": 2296 }, { "epoch": 0.655646166574123, "grad_norm": 0.388671875, "learning_rate": 8.095877296743497e-05, "loss": 2.5506, "step": 2297 }, { "epoch": 0.6559316024324487, "grad_norm": 0.361328125, "learning_rate": 8.083816843831091e-05, "loss": 2.5609, "step": 2298 }, { "epoch": 0.6562170382907744, "grad_norm": 0.35546875, "learning_rate": 8.071762066469598e-05, "loss": 2.5515, "step": 2299 }, { "epoch": 0.6565024741491001, "grad_norm": 0.3671875, "learning_rate": 8.059712974551392e-05, "loss": 2.5587, "step": 2300 }, { "epoch": 0.6567879100074258, "grad_norm": 0.384765625, "learning_rate": 8.047669577964197e-05, "loss": 2.5523, "step": 2301 }, { "epoch": 0.6570733458657515, "grad_norm": 0.384765625, "learning_rate": 8.03563188659104e-05, "loss": 2.5321, "step": 2302 }, { "epoch": 0.6573587817240771, "grad_norm": 0.36328125, "learning_rate": 8.023599910310287e-05, "loss": 2.5848, "step": 2303 }, { "epoch": 0.6576442175824029, "grad_norm": 0.353515625, "learning_rate": 8.011573658995606e-05, "loss": 2.539, "step": 2304 }, { "epoch": 0.6579296534407286, "grad_norm": 0.384765625, "learning_rate": 7.999553142515969e-05, "loss": 2.5545, "step": 2305 }, { "epoch": 0.6582150892990543, "grad_norm": 0.373046875, "learning_rate": 7.987538370735624e-05, "loss": 2.5481, "step": 2306 }, { "epoch": 0.65850052515738, "grad_norm": 0.373046875, "learning_rate": 7.975529353514141e-05, "loss": 2.5889, "step": 2307 }, { "epoch": 0.6587859610157056, "grad_norm": 0.37109375, "learning_rate": 7.963526100706337e-05, "loss": 2.5113, "step": 2308 }, { "epoch": 0.6590713968740314, "grad_norm": 0.361328125, "learning_rate": 7.951528622162297e-05, "loss": 2.5789, "step": 2309 }, { "epoch": 0.659356832732357, "grad_norm": 0.36328125, "learning_rate": 7.9395369277274e-05, "loss": 2.546, "step": 2310 }, { "epoch": 0.6596422685906828, "grad_norm": 0.3671875, "learning_rate": 7.927551027242252e-05, "loss": 2.5322, "step": 2311 }, { "epoch": 0.6599277044490084, "grad_norm": 0.384765625, "learning_rate": 7.9155709305427e-05, "loss": 2.5277, "step": 2312 }, { "epoch": 0.6602131403073341, "grad_norm": 0.384765625, "learning_rate": 7.90359664745985e-05, "loss": 2.5684, "step": 2313 }, { "epoch": 0.6604985761656599, "grad_norm": 0.369140625, "learning_rate": 7.891628187820021e-05, "loss": 2.5712, "step": 2314 }, { "epoch": 0.6607840120239855, "grad_norm": 0.384765625, "learning_rate": 7.87966556144475e-05, "loss": 2.5458, "step": 2315 }, { "epoch": 0.6610694478823113, "grad_norm": 0.40234375, "learning_rate": 7.867708778150812e-05, "loss": 2.572, "step": 2316 }, { "epoch": 0.6613548837406369, "grad_norm": 0.376953125, "learning_rate": 7.855757847750151e-05, "loss": 2.553, "step": 2317 }, { "epoch": 0.6616403195989626, "grad_norm": 0.38671875, "learning_rate": 7.843812780049935e-05, "loss": 2.5738, "step": 2318 }, { "epoch": 0.6619257554572883, "grad_norm": 0.375, "learning_rate": 7.831873584852522e-05, "loss": 2.5652, "step": 2319 }, { "epoch": 0.662211191315614, "grad_norm": 0.37890625, "learning_rate": 7.819940271955425e-05, "loss": 2.5447, "step": 2320 }, { "epoch": 0.6624966271739398, "grad_norm": 0.375, "learning_rate": 7.808012851151362e-05, "loss": 2.5698, "step": 2321 }, { "epoch": 0.6627820630322654, "grad_norm": 0.3828125, "learning_rate": 7.796091332228193e-05, "loss": 2.54, "step": 2322 }, { "epoch": 0.6630674988905911, "grad_norm": 0.3515625, "learning_rate": 7.784175724968939e-05, "loss": 2.5497, "step": 2323 }, { "epoch": 0.6633529347489168, "grad_norm": 0.376953125, "learning_rate": 7.772266039151781e-05, "loss": 2.5507, "step": 2324 }, { "epoch": 0.6636383706072425, "grad_norm": 3.140625, "learning_rate": 7.760362284550024e-05, "loss": 2.5712, "step": 2325 }, { "epoch": 0.6639238064655681, "grad_norm": 0.67578125, "learning_rate": 7.748464470932117e-05, "loss": 2.5554, "step": 2326 }, { "epoch": 0.6642092423238939, "grad_norm": 1.328125, "learning_rate": 7.73657260806164e-05, "loss": 2.5577, "step": 2327 }, { "epoch": 0.6644946781822195, "grad_norm": 0.38671875, "learning_rate": 7.724686705697274e-05, "loss": 2.5744, "step": 2328 }, { "epoch": 0.6647801140405453, "grad_norm": 0.431640625, "learning_rate": 7.712806773592811e-05, "loss": 2.547, "step": 2329 }, { "epoch": 0.665065549898871, "grad_norm": 0.400390625, "learning_rate": 7.700932821497157e-05, "loss": 2.558, "step": 2330 }, { "epoch": 0.6653509857571966, "grad_norm": 0.39453125, "learning_rate": 7.689064859154299e-05, "loss": 2.5383, "step": 2331 }, { "epoch": 0.6656364216155224, "grad_norm": 0.3671875, "learning_rate": 7.677202896303307e-05, "loss": 2.6, "step": 2332 }, { "epoch": 0.665921857473848, "grad_norm": 0.3828125, "learning_rate": 7.665346942678335e-05, "loss": 2.5926, "step": 2333 }, { "epoch": 0.6662072933321738, "grad_norm": 0.384765625, "learning_rate": 7.653497008008611e-05, "loss": 2.5573, "step": 2334 }, { "epoch": 0.6664927291904994, "grad_norm": 0.3828125, "learning_rate": 7.641653102018402e-05, "loss": 2.5838, "step": 2335 }, { "epoch": 0.6667781650488251, "grad_norm": 0.380859375, "learning_rate": 7.629815234427057e-05, "loss": 2.5812, "step": 2336 }, { "epoch": 0.6670636009071509, "grad_norm": 0.41015625, "learning_rate": 7.617983414948937e-05, "loss": 2.5533, "step": 2337 }, { "epoch": 0.6673490367654765, "grad_norm": 0.376953125, "learning_rate": 7.606157653293476e-05, "loss": 2.5459, "step": 2338 }, { "epoch": 0.6676344726238023, "grad_norm": 0.419921875, "learning_rate": 7.594337959165107e-05, "loss": 2.5619, "step": 2339 }, { "epoch": 0.6679199084821279, "grad_norm": 0.380859375, "learning_rate": 7.582524342263292e-05, "loss": 2.5708, "step": 2340 }, { "epoch": 0.6682053443404536, "grad_norm": 0.392578125, "learning_rate": 7.570716812282512e-05, "loss": 2.5465, "step": 2341 }, { "epoch": 0.6684907801987793, "grad_norm": 0.388671875, "learning_rate": 7.558915378912257e-05, "loss": 2.5456, "step": 2342 }, { "epoch": 0.668776216057105, "grad_norm": 0.3828125, "learning_rate": 7.547120051836996e-05, "loss": 2.5814, "step": 2343 }, { "epoch": 0.6690616519154307, "grad_norm": 0.3984375, "learning_rate": 7.535330840736209e-05, "loss": 2.5684, "step": 2344 }, { "epoch": 0.6693470877737564, "grad_norm": 0.357421875, "learning_rate": 7.523547755284337e-05, "loss": 2.5622, "step": 2345 }, { "epoch": 0.6696325236320821, "grad_norm": 0.392578125, "learning_rate": 7.511770805150802e-05, "loss": 2.5668, "step": 2346 }, { "epoch": 0.6699179594904078, "grad_norm": 0.390625, "learning_rate": 7.500000000000002e-05, "loss": 2.5299, "step": 2347 }, { "epoch": 0.6702033953487335, "grad_norm": 0.384765625, "learning_rate": 7.488235349491278e-05, "loss": 2.546, "step": 2348 }, { "epoch": 0.6704888312070592, "grad_norm": 0.388671875, "learning_rate": 7.47647686327891e-05, "loss": 2.5488, "step": 2349 }, { "epoch": 0.6707742670653849, "grad_norm": 0.419921875, "learning_rate": 7.464724551012161e-05, "loss": 2.5425, "step": 2350 }, { "epoch": 0.6710597029237105, "grad_norm": 0.365234375, "learning_rate": 7.45297842233519e-05, "loss": 2.5346, "step": 2351 }, { "epoch": 0.6713451387820363, "grad_norm": 0.373046875, "learning_rate": 7.441238486887083e-05, "loss": 2.5254, "step": 2352 }, { "epoch": 0.671630574640362, "grad_norm": 0.380859375, "learning_rate": 7.42950475430187e-05, "loss": 2.5561, "step": 2353 }, { "epoch": 0.6719160104986877, "grad_norm": 0.376953125, "learning_rate": 7.417777234208463e-05, "loss": 2.5601, "step": 2354 }, { "epoch": 0.6722014463570134, "grad_norm": 0.3671875, "learning_rate": 7.406055936230687e-05, "loss": 2.5617, "step": 2355 }, { "epoch": 0.672486882215339, "grad_norm": 0.39453125, "learning_rate": 7.394340869987267e-05, "loss": 2.5633, "step": 2356 }, { "epoch": 0.6727723180736648, "grad_norm": 0.380859375, "learning_rate": 7.382632045091803e-05, "loss": 2.5703, "step": 2357 }, { "epoch": 0.6730577539319904, "grad_norm": 0.37109375, "learning_rate": 7.37092947115278e-05, "loss": 2.5611, "step": 2358 }, { "epoch": 0.6733431897903162, "grad_norm": 0.369140625, "learning_rate": 7.359233157773557e-05, "loss": 2.5762, "step": 2359 }, { "epoch": 0.6736286256486418, "grad_norm": 0.373046875, "learning_rate": 7.347543114552343e-05, "loss": 2.5665, "step": 2360 }, { "epoch": 0.6739140615069675, "grad_norm": 0.40234375, "learning_rate": 7.335859351082217e-05, "loss": 2.548, "step": 2361 }, { "epoch": 0.6741994973652933, "grad_norm": 0.365234375, "learning_rate": 7.324181876951092e-05, "loss": 2.5389, "step": 2362 }, { "epoch": 0.6744849332236189, "grad_norm": 0.390625, "learning_rate": 7.312510701741717e-05, "loss": 2.5481, "step": 2363 }, { "epoch": 0.6747703690819447, "grad_norm": 0.3671875, "learning_rate": 7.300845835031693e-05, "loss": 2.5571, "step": 2364 }, { "epoch": 0.6750558049402703, "grad_norm": 0.3828125, "learning_rate": 7.28918728639342e-05, "loss": 2.5809, "step": 2365 }, { "epoch": 0.675341240798596, "grad_norm": 0.384765625, "learning_rate": 7.277535065394127e-05, "loss": 2.5644, "step": 2366 }, { "epoch": 0.6756266766569217, "grad_norm": 0.359375, "learning_rate": 7.265889181595853e-05, "loss": 2.5799, "step": 2367 }, { "epoch": 0.6759121125152474, "grad_norm": 0.373046875, "learning_rate": 7.254249644555429e-05, "loss": 2.5631, "step": 2368 }, { "epoch": 0.6761975483735732, "grad_norm": 0.36328125, "learning_rate": 7.242616463824469e-05, "loss": 2.5673, "step": 2369 }, { "epoch": 0.6764829842318988, "grad_norm": 0.37109375, "learning_rate": 7.230989648949396e-05, "loss": 2.5697, "step": 2370 }, { "epoch": 0.6767684200902245, "grad_norm": 0.36328125, "learning_rate": 7.219369209471387e-05, "loss": 2.569, "step": 2371 }, { "epoch": 0.6770538559485502, "grad_norm": 0.357421875, "learning_rate": 7.207755154926386e-05, "loss": 2.5493, "step": 2372 }, { "epoch": 0.6773392918068759, "grad_norm": 0.357421875, "learning_rate": 7.196147494845127e-05, "loss": 2.5515, "step": 2373 }, { "epoch": 0.6776247276652015, "grad_norm": 0.396484375, "learning_rate": 7.184546238753064e-05, "loss": 2.5449, "step": 2374 }, { "epoch": 0.6779101635235273, "grad_norm": 0.36328125, "learning_rate": 7.172951396170402e-05, "loss": 2.5657, "step": 2375 }, { "epoch": 0.6781955993818529, "grad_norm": 0.376953125, "learning_rate": 7.1613629766121e-05, "loss": 2.5615, "step": 2376 }, { "epoch": 0.6784810352401787, "grad_norm": 0.39453125, "learning_rate": 7.149780989587825e-05, "loss": 2.5787, "step": 2377 }, { "epoch": 0.6787664710985044, "grad_norm": 0.359375, "learning_rate": 7.138205444601985e-05, "loss": 2.5632, "step": 2378 }, { "epoch": 0.67905190695683, "grad_norm": 0.375, "learning_rate": 7.126636351153684e-05, "loss": 2.5594, "step": 2379 }, { "epoch": 0.6793373428151558, "grad_norm": 0.373046875, "learning_rate": 7.115073718736735e-05, "loss": 2.55, "step": 2380 }, { "epoch": 0.6796227786734814, "grad_norm": 0.357421875, "learning_rate": 7.10351755683966e-05, "loss": 2.5493, "step": 2381 }, { "epoch": 0.6799082145318072, "grad_norm": 0.3671875, "learning_rate": 7.09196787494567e-05, "loss": 2.54, "step": 2382 }, { "epoch": 0.6801936503901328, "grad_norm": 0.35546875, "learning_rate": 7.08042468253264e-05, "loss": 2.5681, "step": 2383 }, { "epoch": 0.6804790862484585, "grad_norm": 0.375, "learning_rate": 7.068887989073143e-05, "loss": 2.5505, "step": 2384 }, { "epoch": 0.6807645221067842, "grad_norm": 0.388671875, "learning_rate": 7.057357804034404e-05, "loss": 2.5489, "step": 2385 }, { "epoch": 0.6810499579651099, "grad_norm": 0.373046875, "learning_rate": 7.045834136878308e-05, "loss": 2.5669, "step": 2386 }, { "epoch": 0.6813353938234357, "grad_norm": 0.373046875, "learning_rate": 7.0343169970614e-05, "loss": 2.5354, "step": 2387 }, { "epoch": 0.6816208296817613, "grad_norm": 0.359375, "learning_rate": 7.022806394034856e-05, "loss": 2.5571, "step": 2388 }, { "epoch": 0.681906265540087, "grad_norm": 0.369140625, "learning_rate": 7.0113023372445e-05, "loss": 2.5556, "step": 2389 }, { "epoch": 0.6821917013984127, "grad_norm": 0.36328125, "learning_rate": 6.999804836130784e-05, "loss": 2.5822, "step": 2390 }, { "epoch": 0.6824771372567384, "grad_norm": 0.365234375, "learning_rate": 6.988313900128769e-05, "loss": 2.5923, "step": 2391 }, { "epoch": 0.682762573115064, "grad_norm": 0.384765625, "learning_rate": 6.97682953866813e-05, "loss": 2.5303, "step": 2392 }, { "epoch": 0.6830480089733898, "grad_norm": 0.37109375, "learning_rate": 6.965351761173165e-05, "loss": 2.5794, "step": 2393 }, { "epoch": 0.6833334448317155, "grad_norm": 0.35546875, "learning_rate": 6.953880577062745e-05, "loss": 2.582, "step": 2394 }, { "epoch": 0.6836188806900412, "grad_norm": 0.37109375, "learning_rate": 6.94241599575034e-05, "loss": 2.5485, "step": 2395 }, { "epoch": 0.6839043165483669, "grad_norm": 0.361328125, "learning_rate": 6.930958026644005e-05, "loss": 2.5524, "step": 2396 }, { "epoch": 0.6841897524066926, "grad_norm": 0.36328125, "learning_rate": 6.919506679146372e-05, "loss": 2.5754, "step": 2397 }, { "epoch": 0.6844751882650183, "grad_norm": 0.357421875, "learning_rate": 6.908061962654626e-05, "loss": 2.5647, "step": 2398 }, { "epoch": 0.6847606241233439, "grad_norm": 0.373046875, "learning_rate": 6.896623886560528e-05, "loss": 2.567, "step": 2399 }, { "epoch": 0.6850460599816697, "grad_norm": 0.36328125, "learning_rate": 6.885192460250366e-05, "loss": 2.5596, "step": 2400 }, { "epoch": 0.6853314958399953, "grad_norm": 0.40234375, "learning_rate": 6.873767693105e-05, "loss": 2.5652, "step": 2401 }, { "epoch": 0.685616931698321, "grad_norm": 0.369140625, "learning_rate": 6.8623495944998e-05, "loss": 2.5612, "step": 2402 }, { "epoch": 0.6859023675566468, "grad_norm": 0.37109375, "learning_rate": 6.850938173804672e-05, "loss": 2.5595, "step": 2403 }, { "epoch": 0.6861878034149724, "grad_norm": 0.380859375, "learning_rate": 6.839533440384051e-05, "loss": 2.5805, "step": 2404 }, { "epoch": 0.6864732392732982, "grad_norm": 0.353515625, "learning_rate": 6.82813540359688e-05, "loss": 2.5742, "step": 2405 }, { "epoch": 0.6867586751316238, "grad_norm": 0.365234375, "learning_rate": 6.816744072796592e-05, "loss": 2.5801, "step": 2406 }, { "epoch": 0.6870441109899496, "grad_norm": 0.365234375, "learning_rate": 6.805359457331144e-05, "loss": 2.5545, "step": 2407 }, { "epoch": 0.6873295468482752, "grad_norm": 0.369140625, "learning_rate": 6.793981566542957e-05, "loss": 2.553, "step": 2408 }, { "epoch": 0.6876149827066009, "grad_norm": 0.365234375, "learning_rate": 6.78261040976894e-05, "loss": 2.5477, "step": 2409 }, { "epoch": 0.6879004185649267, "grad_norm": 0.36328125, "learning_rate": 6.771245996340491e-05, "loss": 2.5584, "step": 2410 }, { "epoch": 0.6881858544232523, "grad_norm": 0.4453125, "learning_rate": 6.759888335583458e-05, "loss": 2.5786, "step": 2411 }, { "epoch": 0.688471290281578, "grad_norm": 0.34765625, "learning_rate": 6.748537436818142e-05, "loss": 2.5663, "step": 2412 }, { "epoch": 0.6887567261399037, "grad_norm": 0.38671875, "learning_rate": 6.737193309359324e-05, "loss": 2.5402, "step": 2413 }, { "epoch": 0.6890421619982294, "grad_norm": 0.353515625, "learning_rate": 6.7258559625162e-05, "loss": 2.5748, "step": 2414 }, { "epoch": 0.6893275978565551, "grad_norm": 0.357421875, "learning_rate": 6.714525405592412e-05, "loss": 2.5759, "step": 2415 }, { "epoch": 0.6896130337148808, "grad_norm": 0.3828125, "learning_rate": 6.703201647886034e-05, "loss": 2.5636, "step": 2416 }, { "epoch": 0.6898984695732064, "grad_norm": 0.4765625, "learning_rate": 6.691884698689548e-05, "loss": 2.5573, "step": 2417 }, { "epoch": 0.6901839054315322, "grad_norm": 0.369140625, "learning_rate": 6.680574567289864e-05, "loss": 2.5802, "step": 2418 }, { "epoch": 0.6904693412898579, "grad_norm": 0.373046875, "learning_rate": 6.66927126296829e-05, "loss": 2.5497, "step": 2419 }, { "epoch": 0.6907547771481836, "grad_norm": 0.36328125, "learning_rate": 6.657974795000525e-05, "loss": 2.5806, "step": 2420 }, { "epoch": 0.6910402130065093, "grad_norm": 0.37109375, "learning_rate": 6.646685172656667e-05, "loss": 2.5485, "step": 2421 }, { "epoch": 0.6913256488648349, "grad_norm": 0.37109375, "learning_rate": 6.6354024052012e-05, "loss": 2.5518, "step": 2422 }, { "epoch": 0.6916110847231607, "grad_norm": 0.373046875, "learning_rate": 6.62412650189297e-05, "loss": 2.5628, "step": 2423 }, { "epoch": 0.6918965205814863, "grad_norm": 0.349609375, "learning_rate": 6.612857471985203e-05, "loss": 2.5364, "step": 2424 }, { "epoch": 0.6921819564398121, "grad_norm": 0.365234375, "learning_rate": 6.601595324725474e-05, "loss": 2.5879, "step": 2425 }, { "epoch": 0.6924673922981378, "grad_norm": 0.353515625, "learning_rate": 6.590340069355713e-05, "loss": 2.5652, "step": 2426 }, { "epoch": 0.6927528281564634, "grad_norm": 0.37109375, "learning_rate": 6.579091715112201e-05, "loss": 2.544, "step": 2427 }, { "epoch": 0.6930382640147892, "grad_norm": 0.384765625, "learning_rate": 6.567850271225543e-05, "loss": 2.5717, "step": 2428 }, { "epoch": 0.6933236998731148, "grad_norm": 0.37109375, "learning_rate": 6.556615746920685e-05, "loss": 2.5632, "step": 2429 }, { "epoch": 0.6936091357314406, "grad_norm": 0.3515625, "learning_rate": 6.545388151416896e-05, "loss": 2.544, "step": 2430 }, { "epoch": 0.6938945715897662, "grad_norm": 0.36328125, "learning_rate": 6.534167493927748e-05, "loss": 2.5697, "step": 2431 }, { "epoch": 0.6941800074480919, "grad_norm": 0.35546875, "learning_rate": 6.522953783661121e-05, "loss": 2.5455, "step": 2432 }, { "epoch": 0.6944654433064176, "grad_norm": 0.404296875, "learning_rate": 6.511747029819207e-05, "loss": 2.5844, "step": 2433 }, { "epoch": 0.6947508791647433, "grad_norm": 0.36328125, "learning_rate": 6.500547241598478e-05, "loss": 2.5579, "step": 2434 }, { "epoch": 0.6950363150230691, "grad_norm": 0.3828125, "learning_rate": 6.489354428189683e-05, "loss": 2.5542, "step": 2435 }, { "epoch": 0.6953217508813947, "grad_norm": 0.3671875, "learning_rate": 6.478168598777864e-05, "loss": 2.5787, "step": 2436 }, { "epoch": 0.6956071867397204, "grad_norm": 0.39453125, "learning_rate": 6.466989762542332e-05, "loss": 2.5676, "step": 2437 }, { "epoch": 0.6958926225980461, "grad_norm": 0.3671875, "learning_rate": 6.455817928656636e-05, "loss": 2.5601, "step": 2438 }, { "epoch": 0.6961780584563718, "grad_norm": 0.33984375, "learning_rate": 6.444653106288612e-05, "loss": 2.5721, "step": 2439 }, { "epoch": 0.6964634943146975, "grad_norm": 0.42578125, "learning_rate": 6.433495304600306e-05, "loss": 2.5427, "step": 2440 }, { "epoch": 0.6967489301730232, "grad_norm": 0.361328125, "learning_rate": 6.422344532748039e-05, "loss": 2.5505, "step": 2441 }, { "epoch": 0.6970343660313489, "grad_norm": 0.384765625, "learning_rate": 6.411200799882338e-05, "loss": 2.5491, "step": 2442 }, { "epoch": 0.6973198018896746, "grad_norm": 0.36328125, "learning_rate": 6.400064115147955e-05, "loss": 2.5645, "step": 2443 }, { "epoch": 0.6976052377480003, "grad_norm": 0.34765625, "learning_rate": 6.38893448768387e-05, "loss": 2.5374, "step": 2444 }, { "epoch": 0.697890673606326, "grad_norm": 0.3515625, "learning_rate": 6.377811926623273e-05, "loss": 2.5343, "step": 2445 }, { "epoch": 0.6981761094646517, "grad_norm": 0.345703125, "learning_rate": 6.366696441093536e-05, "loss": 2.6022, "step": 2446 }, { "epoch": 0.6984615453229773, "grad_norm": 0.365234375, "learning_rate": 6.355588040216248e-05, "loss": 2.5745, "step": 2447 }, { "epoch": 0.6987469811813031, "grad_norm": 0.390625, "learning_rate": 6.344486733107168e-05, "loss": 2.5623, "step": 2448 }, { "epoch": 0.6990324170396287, "grad_norm": 0.353515625, "learning_rate": 6.333392528876233e-05, "loss": 2.567, "step": 2449 }, { "epoch": 0.6993178528979545, "grad_norm": 0.359375, "learning_rate": 6.32230543662757e-05, "loss": 2.5734, "step": 2450 }, { "epoch": 0.6996032887562802, "grad_norm": 0.38671875, "learning_rate": 6.311225465459442e-05, "loss": 2.5358, "step": 2451 }, { "epoch": 0.6998887246146058, "grad_norm": 0.369140625, "learning_rate": 6.300152624464296e-05, "loss": 2.5494, "step": 2452 }, { "epoch": 0.7001741604729316, "grad_norm": 0.3515625, "learning_rate": 6.289086922728712e-05, "loss": 2.5602, "step": 2453 }, { "epoch": 0.7004595963312572, "grad_norm": 0.3515625, "learning_rate": 6.278028369333413e-05, "loss": 2.5788, "step": 2454 }, { "epoch": 0.700745032189583, "grad_norm": 0.392578125, "learning_rate": 6.266976973353252e-05, "loss": 2.5591, "step": 2455 }, { "epoch": 0.7010304680479086, "grad_norm": 0.3671875, "learning_rate": 6.255932743857226e-05, "loss": 2.5517, "step": 2456 }, { "epoch": 0.7013159039062343, "grad_norm": 0.353515625, "learning_rate": 6.244895689908426e-05, "loss": 2.5502, "step": 2457 }, { "epoch": 0.70160133976456, "grad_norm": 0.373046875, "learning_rate": 6.233865820564079e-05, "loss": 2.5815, "step": 2458 }, { "epoch": 0.7018867756228857, "grad_norm": 0.353515625, "learning_rate": 6.222843144875492e-05, "loss": 2.5633, "step": 2459 }, { "epoch": 0.7021722114812115, "grad_norm": 0.373046875, "learning_rate": 6.211827671888098e-05, "loss": 2.5513, "step": 2460 }, { "epoch": 0.7024576473395371, "grad_norm": 0.380859375, "learning_rate": 6.200819410641385e-05, "loss": 2.569, "step": 2461 }, { "epoch": 0.7027430831978628, "grad_norm": 0.37109375, "learning_rate": 6.189818370168956e-05, "loss": 2.559, "step": 2462 }, { "epoch": 0.7030285190561885, "grad_norm": 0.369140625, "learning_rate": 6.17882455949846e-05, "loss": 2.5625, "step": 2463 }, { "epoch": 0.7033139549145142, "grad_norm": 0.359375, "learning_rate": 6.16783798765164e-05, "loss": 2.552, "step": 2464 }, { "epoch": 0.7035993907728398, "grad_norm": 0.365234375, "learning_rate": 6.156858663644277e-05, "loss": 2.5329, "step": 2465 }, { "epoch": 0.7038848266311656, "grad_norm": 0.33984375, "learning_rate": 6.145886596486208e-05, "loss": 2.5371, "step": 2466 }, { "epoch": 0.7041702624894913, "grad_norm": 0.337890625, "learning_rate": 6.134921795181324e-05, "loss": 2.561, "step": 2467 }, { "epoch": 0.704455698347817, "grad_norm": 0.34765625, "learning_rate": 6.123964268727554e-05, "loss": 2.5607, "step": 2468 }, { "epoch": 0.7047411342061427, "grad_norm": 0.3515625, "learning_rate": 6.113014026116841e-05, "loss": 2.5781, "step": 2469 }, { "epoch": 0.7050265700644683, "grad_norm": 0.369140625, "learning_rate": 6.102071076335173e-05, "loss": 2.5742, "step": 2470 }, { "epoch": 0.7053120059227941, "grad_norm": 0.341796875, "learning_rate": 6.091135428362536e-05, "loss": 2.5736, "step": 2471 }, { "epoch": 0.7055974417811197, "grad_norm": 0.36328125, "learning_rate": 6.0802070911729246e-05, "loss": 2.5795, "step": 2472 }, { "epoch": 0.7058828776394455, "grad_norm": 0.357421875, "learning_rate": 6.06928607373435e-05, "loss": 2.5563, "step": 2473 }, { "epoch": 0.7061683134977711, "grad_norm": 0.357421875, "learning_rate": 6.058372385008801e-05, "loss": 2.5287, "step": 2474 }, { "epoch": 0.7064537493560968, "grad_norm": 0.34765625, "learning_rate": 6.047466033952245e-05, "loss": 2.5752, "step": 2475 }, { "epoch": 0.7067391852144226, "grad_norm": 0.34765625, "learning_rate": 6.036567029514665e-05, "loss": 2.5511, "step": 2476 }, { "epoch": 0.7070246210727482, "grad_norm": 0.357421875, "learning_rate": 6.025675380639976e-05, "loss": 2.5685, "step": 2477 }, { "epoch": 0.707310056931074, "grad_norm": 0.357421875, "learning_rate": 6.0147910962660684e-05, "loss": 2.577, "step": 2478 }, { "epoch": 0.7075954927893996, "grad_norm": 0.3671875, "learning_rate": 6.003914185324802e-05, "loss": 2.5451, "step": 2479 }, { "epoch": 0.7078809286477253, "grad_norm": 0.349609375, "learning_rate": 5.993044656741965e-05, "loss": 2.5405, "step": 2480 }, { "epoch": 0.708166364506051, "grad_norm": 0.34765625, "learning_rate": 5.982182519437311e-05, "loss": 2.5569, "step": 2481 }, { "epoch": 0.7084518003643767, "grad_norm": 0.373046875, "learning_rate": 5.971327782324508e-05, "loss": 2.5454, "step": 2482 }, { "epoch": 0.7087372362227025, "grad_norm": 0.369140625, "learning_rate": 5.960480454311155e-05, "loss": 2.5725, "step": 2483 }, { "epoch": 0.7090226720810281, "grad_norm": 0.34375, "learning_rate": 5.949640544298779e-05, "loss": 2.5612, "step": 2484 }, { "epoch": 0.7093081079393538, "grad_norm": 0.3359375, "learning_rate": 5.938808061182823e-05, "loss": 2.5581, "step": 2485 }, { "epoch": 0.7095935437976795, "grad_norm": 0.34765625, "learning_rate": 5.927983013852614e-05, "loss": 2.5476, "step": 2486 }, { "epoch": 0.7098789796560052, "grad_norm": 0.359375, "learning_rate": 5.917165411191405e-05, "loss": 2.5592, "step": 2487 }, { "epoch": 0.7101644155143308, "grad_norm": 0.36328125, "learning_rate": 5.906355262076317e-05, "loss": 2.5649, "step": 2488 }, { "epoch": 0.7104498513726566, "grad_norm": 0.3515625, "learning_rate": 5.895552575378361e-05, "loss": 2.5849, "step": 2489 }, { "epoch": 0.7107352872309822, "grad_norm": 0.34765625, "learning_rate": 5.8847573599624335e-05, "loss": 2.5812, "step": 2490 }, { "epoch": 0.711020723089308, "grad_norm": 0.365234375, "learning_rate": 5.8739696246872853e-05, "loss": 2.5425, "step": 2491 }, { "epoch": 0.7113061589476337, "grad_norm": 0.353515625, "learning_rate": 5.863189378405541e-05, "loss": 2.554, "step": 2492 }, { "epoch": 0.7115915948059593, "grad_norm": 0.361328125, "learning_rate": 5.8524166299636785e-05, "loss": 2.5374, "step": 2493 }, { "epoch": 0.7118770306642851, "grad_norm": 0.353515625, "learning_rate": 5.841651388202015e-05, "loss": 2.5079, "step": 2494 }, { "epoch": 0.7121624665226107, "grad_norm": 0.380859375, "learning_rate": 5.8308936619547076e-05, "loss": 2.5421, "step": 2495 }, { "epoch": 0.7124479023809365, "grad_norm": 0.376953125, "learning_rate": 5.820143460049759e-05, "loss": 2.5617, "step": 2496 }, { "epoch": 0.7127333382392621, "grad_norm": 0.3515625, "learning_rate": 5.809400791308978e-05, "loss": 2.5253, "step": 2497 }, { "epoch": 0.7130187740975878, "grad_norm": 0.34765625, "learning_rate": 5.798665664548015e-05, "loss": 2.5518, "step": 2498 }, { "epoch": 0.7133042099559136, "grad_norm": 0.369140625, "learning_rate": 5.787938088576305e-05, "loss": 2.5575, "step": 2499 }, { "epoch": 0.7135896458142392, "grad_norm": 0.359375, "learning_rate": 5.777218072197113e-05, "loss": 2.5604, "step": 2500 }, { "epoch": 0.7135896458142392, "eval_loss": 2.4628705978393555, "eval_runtime": 5982.5105, "eval_samples_per_second": 10.746, "eval_steps_per_second": 10.746, "step": 2500 } ], "logging_steps": 1, "max_steps": 3503, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "total_flos": 9.70632734441472e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }