{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04718650465966733, "eval_steps": 200, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011796626164916834, "grad_norm": 93.0, "learning_rate": 1.8912529550827425e-07, "loss": 7.9641, "num_input_tokens_seen": 1179648, "step": 1 }, { "epoch": 0.0002359325232983367, "grad_norm": 95.0, "learning_rate": 3.782505910165485e-07, "loss": 7.9866, "num_input_tokens_seen": 2359296, "step": 2 }, { "epoch": 0.000353898784947505, "grad_norm": 97.5, "learning_rate": 5.673758865248227e-07, "loss": 7.983, "num_input_tokens_seen": 3538944, "step": 3 }, { "epoch": 0.0004718650465966734, "grad_norm": 92.0, "learning_rate": 7.56501182033097e-07, "loss": 7.9018, "num_input_tokens_seen": 4718592, "step": 4 }, { "epoch": 0.0005898313082458417, "grad_norm": 102.5, "learning_rate": 9.456264775413712e-07, "loss": 8.0784, "num_input_tokens_seen": 5898240, "step": 5 }, { "epoch": 0.00070779756989501, "grad_norm": 95.0, "learning_rate": 1.1347517730496454e-06, "loss": 7.9578, "num_input_tokens_seen": 7077888, "step": 6 }, { "epoch": 0.0008257638315441783, "grad_norm": 93.5, "learning_rate": 1.3238770685579196e-06, "loss": 7.975, "num_input_tokens_seen": 8257536, "step": 7 }, { "epoch": 0.0009437300931933467, "grad_norm": 85.0, "learning_rate": 1.513002364066194e-06, "loss": 7.8405, "num_input_tokens_seen": 9437184, "step": 8 }, { "epoch": 0.001061696354842515, "grad_norm": 83.0, "learning_rate": 1.7021276595744682e-06, "loss": 7.841, "num_input_tokens_seen": 10616832, "step": 9 }, { "epoch": 0.0011796626164916834, "grad_norm": 82.5, "learning_rate": 1.8912529550827423e-06, "loss": 7.9491, "num_input_tokens_seen": 11796480, "step": 10 }, { "epoch": 0.0012976288781408518, "grad_norm": 68.0, "learning_rate": 2.0803782505910165e-06, "loss": 7.5612, "num_input_tokens_seen": 12976128, "step": 11 }, { "epoch": 0.00141559513979002, "grad_norm": 66.5, "learning_rate": 2.269503546099291e-06, "loss": 7.5808, "num_input_tokens_seen": 14155776, "step": 12 }, { "epoch": 0.0015335614014391884, "grad_norm": 57.5, "learning_rate": 2.4586288416075653e-06, "loss": 7.4494, "num_input_tokens_seen": 15335424, "step": 13 }, { "epoch": 0.0016515276630883566, "grad_norm": 51.25, "learning_rate": 2.6477541371158392e-06, "loss": 7.4606, "num_input_tokens_seen": 16515072, "step": 14 }, { "epoch": 0.001769493924737525, "grad_norm": 42.75, "learning_rate": 2.836879432624114e-06, "loss": 7.2804, "num_input_tokens_seen": 17694720, "step": 15 }, { "epoch": 0.0018874601863866935, "grad_norm": 38.0, "learning_rate": 3.026004728132388e-06, "loss": 7.2173, "num_input_tokens_seen": 18874368, "step": 16 }, { "epoch": 0.0020054264480358617, "grad_norm": 33.75, "learning_rate": 3.2151300236406624e-06, "loss": 7.0934, "num_input_tokens_seen": 20054016, "step": 17 }, { "epoch": 0.00212339270968503, "grad_norm": 30.5, "learning_rate": 3.4042553191489363e-06, "loss": 7.1284, "num_input_tokens_seen": 21233664, "step": 18 }, { "epoch": 0.0022413589713341986, "grad_norm": 26.5, "learning_rate": 3.5933806146572107e-06, "loss": 6.932, "num_input_tokens_seen": 22413312, "step": 19 }, { "epoch": 0.0023593252329833668, "grad_norm": 23.875, "learning_rate": 3.7825059101654847e-06, "loss": 6.9163, "num_input_tokens_seen": 23592960, "step": 20 }, { "epoch": 0.002477291494632535, "grad_norm": 21.625, "learning_rate": 3.9716312056737595e-06, "loss": 6.8335, "num_input_tokens_seen": 24772608, "step": 21 }, { "epoch": 0.0025952577562817036, "grad_norm": 18.625, "learning_rate": 4.160756501182033e-06, "loss": 6.6761, "num_input_tokens_seen": 25952256, "step": 22 }, { "epoch": 0.002713224017930872, "grad_norm": 16.0, "learning_rate": 4.349881796690308e-06, "loss": 6.6474, "num_input_tokens_seen": 27131904, "step": 23 }, { "epoch": 0.00283119027958004, "grad_norm": 18.0, "learning_rate": 4.539007092198582e-06, "loss": 6.6062, "num_input_tokens_seen": 28311552, "step": 24 }, { "epoch": 0.0029491565412292082, "grad_norm": 17.5, "learning_rate": 4.728132387706856e-06, "loss": 6.6435, "num_input_tokens_seen": 29491200, "step": 25 }, { "epoch": 0.003067122802878377, "grad_norm": 14.25, "learning_rate": 4.9172576832151305e-06, "loss": 6.57, "num_input_tokens_seen": 30670848, "step": 26 }, { "epoch": 0.003185089064527545, "grad_norm": 13.3125, "learning_rate": 5.106382978723404e-06, "loss": 6.4037, "num_input_tokens_seen": 31850496, "step": 27 }, { "epoch": 0.0033030553261767133, "grad_norm": 10.8125, "learning_rate": 5.2955082742316784e-06, "loss": 6.2927, "num_input_tokens_seen": 33030144, "step": 28 }, { "epoch": 0.003421021587825882, "grad_norm": 9.75, "learning_rate": 5.484633569739954e-06, "loss": 6.2084, "num_input_tokens_seen": 34209792, "step": 29 }, { "epoch": 0.00353898784947505, "grad_norm": 10.25, "learning_rate": 5.673758865248228e-06, "loss": 6.2349, "num_input_tokens_seen": 35389440, "step": 30 }, { "epoch": 0.0036569541111242184, "grad_norm": 10.8125, "learning_rate": 5.862884160756502e-06, "loss": 6.155, "num_input_tokens_seen": 36569088, "step": 31 }, { "epoch": 0.003774920372773387, "grad_norm": 10.0, "learning_rate": 6.052009456264776e-06, "loss": 6.1846, "num_input_tokens_seen": 37748736, "step": 32 }, { "epoch": 0.003892886634422555, "grad_norm": 8.0, "learning_rate": 6.24113475177305e-06, "loss": 6.0656, "num_input_tokens_seen": 38928384, "step": 33 }, { "epoch": 0.004010852896071723, "grad_norm": 8.0625, "learning_rate": 6.430260047281325e-06, "loss": 5.9873, "num_input_tokens_seen": 40108032, "step": 34 }, { "epoch": 0.004128819157720892, "grad_norm": 8.875, "learning_rate": 6.619385342789598e-06, "loss": 6.0008, "num_input_tokens_seen": 41287680, "step": 35 }, { "epoch": 0.00424678541937006, "grad_norm": 7.34375, "learning_rate": 6.808510638297873e-06, "loss": 6.0852, "num_input_tokens_seen": 42467328, "step": 36 }, { "epoch": 0.004364751681019229, "grad_norm": 6.28125, "learning_rate": 6.997635933806147e-06, "loss": 5.8264, "num_input_tokens_seen": 43646976, "step": 37 }, { "epoch": 0.004482717942668397, "grad_norm": 6.75, "learning_rate": 7.186761229314421e-06, "loss": 5.9511, "num_input_tokens_seen": 44826624, "step": 38 }, { "epoch": 0.004600684204317565, "grad_norm": 6.96875, "learning_rate": 7.375886524822695e-06, "loss": 5.8042, "num_input_tokens_seen": 46006272, "step": 39 }, { "epoch": 0.0047186504659667335, "grad_norm": 5.59375, "learning_rate": 7.565011820330969e-06, "loss": 5.8955, "num_input_tokens_seen": 47185920, "step": 40 }, { "epoch": 0.004836616727615902, "grad_norm": 5.3125, "learning_rate": 7.754137115839244e-06, "loss": 5.7764, "num_input_tokens_seen": 48365568, "step": 41 }, { "epoch": 0.00495458298926507, "grad_norm": 5.40625, "learning_rate": 7.943262411347519e-06, "loss": 5.7784, "num_input_tokens_seen": 49545216, "step": 42 }, { "epoch": 0.005072549250914238, "grad_norm": 5.03125, "learning_rate": 8.132387706855792e-06, "loss": 5.7388, "num_input_tokens_seen": 50724864, "step": 43 }, { "epoch": 0.005190515512563407, "grad_norm": 4.84375, "learning_rate": 8.321513002364066e-06, "loss": 5.7821, "num_input_tokens_seen": 51904512, "step": 44 }, { "epoch": 0.0053084817742125754, "grad_norm": 4.78125, "learning_rate": 8.510638297872341e-06, "loss": 5.6466, "num_input_tokens_seen": 53084160, "step": 45 }, { "epoch": 0.005426448035861744, "grad_norm": 4.125, "learning_rate": 8.699763593380616e-06, "loss": 5.6224, "num_input_tokens_seen": 54263808, "step": 46 }, { "epoch": 0.005544414297510912, "grad_norm": 5.0, "learning_rate": 8.888888888888888e-06, "loss": 5.9019, "num_input_tokens_seen": 55443456, "step": 47 }, { "epoch": 0.00566238055916008, "grad_norm": 4.0, "learning_rate": 9.078014184397164e-06, "loss": 5.5812, "num_input_tokens_seen": 56623104, "step": 48 }, { "epoch": 0.005780346820809248, "grad_norm": 3.796875, "learning_rate": 9.267139479905439e-06, "loss": 5.633, "num_input_tokens_seen": 57802752, "step": 49 }, { "epoch": 0.0058983130824584165, "grad_norm": 3.84375, "learning_rate": 9.456264775413712e-06, "loss": 5.5031, "num_input_tokens_seen": 58982400, "step": 50 }, { "epoch": 0.0060162793441075856, "grad_norm": 3.4375, "learning_rate": 9.645390070921986e-06, "loss": 5.5146, "num_input_tokens_seen": 60162048, "step": 51 }, { "epoch": 0.006134245605756754, "grad_norm": 3.609375, "learning_rate": 9.834515366430261e-06, "loss": 5.3805, "num_input_tokens_seen": 61341696, "step": 52 }, { "epoch": 0.006252211867405922, "grad_norm": 3.234375, "learning_rate": 1.0023640661938535e-05, "loss": 5.4098, "num_input_tokens_seen": 62521344, "step": 53 }, { "epoch": 0.00637017812905509, "grad_norm": 3.40625, "learning_rate": 1.0212765957446808e-05, "loss": 5.3773, "num_input_tokens_seen": 63700992, "step": 54 }, { "epoch": 0.006488144390704258, "grad_norm": 3.1875, "learning_rate": 1.0401891252955083e-05, "loss": 5.3287, "num_input_tokens_seen": 64880640, "step": 55 }, { "epoch": 0.006606110652353427, "grad_norm": 3.046875, "learning_rate": 1.0591016548463357e-05, "loss": 5.2282, "num_input_tokens_seen": 66060288, "step": 56 }, { "epoch": 0.006724076914002596, "grad_norm": 3.0625, "learning_rate": 1.0780141843971632e-05, "loss": 5.2967, "num_input_tokens_seen": 67239936, "step": 57 }, { "epoch": 0.006842043175651764, "grad_norm": 2.859375, "learning_rate": 1.0969267139479907e-05, "loss": 5.1126, "num_input_tokens_seen": 68419584, "step": 58 }, { "epoch": 0.006960009437300932, "grad_norm": 3.0625, "learning_rate": 1.1158392434988181e-05, "loss": 5.3309, "num_input_tokens_seen": 69599232, "step": 59 }, { "epoch": 0.0070779756989501, "grad_norm": 3.0625, "learning_rate": 1.1347517730496456e-05, "loss": 5.2134, "num_input_tokens_seen": 70778880, "step": 60 }, { "epoch": 0.0071959419605992685, "grad_norm": 3.546875, "learning_rate": 1.153664302600473e-05, "loss": 5.1972, "num_input_tokens_seen": 71958528, "step": 61 }, { "epoch": 0.007313908222248437, "grad_norm": 2.828125, "learning_rate": 1.1725768321513003e-05, "loss": 5.0123, "num_input_tokens_seen": 73138176, "step": 62 }, { "epoch": 0.007431874483897605, "grad_norm": 3.203125, "learning_rate": 1.1914893617021277e-05, "loss": 5.1108, "num_input_tokens_seen": 74317824, "step": 63 }, { "epoch": 0.007549840745546774, "grad_norm": 3.609375, "learning_rate": 1.2104018912529552e-05, "loss": 5.2287, "num_input_tokens_seen": 75497472, "step": 64 }, { "epoch": 0.007667807007195942, "grad_norm": 2.703125, "learning_rate": 1.2293144208037825e-05, "loss": 5.0902, "num_input_tokens_seen": 76677120, "step": 65 }, { "epoch": 0.00778577326884511, "grad_norm": 3.1875, "learning_rate": 1.24822695035461e-05, "loss": 5.1526, "num_input_tokens_seen": 77856768, "step": 66 }, { "epoch": 0.007903739530494279, "grad_norm": 3.359375, "learning_rate": 1.2671394799054376e-05, "loss": 4.8855, "num_input_tokens_seen": 79036416, "step": 67 }, { "epoch": 0.008021705792143447, "grad_norm": 2.765625, "learning_rate": 1.286052009456265e-05, "loss": 5.0226, "num_input_tokens_seen": 80216064, "step": 68 }, { "epoch": 0.008139672053792615, "grad_norm": 3.078125, "learning_rate": 1.3049645390070925e-05, "loss": 5.1366, "num_input_tokens_seen": 81395712, "step": 69 }, { "epoch": 0.008257638315441783, "grad_norm": 3.71875, "learning_rate": 1.3238770685579197e-05, "loss": 5.0749, "num_input_tokens_seen": 82575360, "step": 70 }, { "epoch": 0.008375604577090951, "grad_norm": 2.765625, "learning_rate": 1.3427895981087472e-05, "loss": 4.8899, "num_input_tokens_seen": 83755008, "step": 71 }, { "epoch": 0.00849357083874012, "grad_norm": 3.140625, "learning_rate": 1.3617021276595745e-05, "loss": 4.8809, "num_input_tokens_seen": 84934656, "step": 72 }, { "epoch": 0.008611537100389288, "grad_norm": 3.296875, "learning_rate": 1.380614657210402e-05, "loss": 4.8839, "num_input_tokens_seen": 86114304, "step": 73 }, { "epoch": 0.008729503362038458, "grad_norm": 2.953125, "learning_rate": 1.3995271867612294e-05, "loss": 4.9931, "num_input_tokens_seen": 87293952, "step": 74 }, { "epoch": 0.008847469623687626, "grad_norm": 2.9375, "learning_rate": 1.418439716312057e-05, "loss": 4.8041, "num_input_tokens_seen": 88473600, "step": 75 }, { "epoch": 0.008965435885336794, "grad_norm": 3.5625, "learning_rate": 1.4373522458628843e-05, "loss": 4.8611, "num_input_tokens_seen": 89653248, "step": 76 }, { "epoch": 0.009083402146985962, "grad_norm": 3.734375, "learning_rate": 1.4562647754137118e-05, "loss": 4.7747, "num_input_tokens_seen": 90832896, "step": 77 }, { "epoch": 0.00920136840863513, "grad_norm": 2.984375, "learning_rate": 1.475177304964539e-05, "loss": 4.6884, "num_input_tokens_seen": 92012544, "step": 78 }, { "epoch": 0.009319334670284299, "grad_norm": 2.671875, "learning_rate": 1.4940898345153665e-05, "loss": 4.6617, "num_input_tokens_seen": 93192192, "step": 79 }, { "epoch": 0.009437300931933467, "grad_norm": 2.84375, "learning_rate": 1.5130023640661939e-05, "loss": 4.6174, "num_input_tokens_seen": 94371840, "step": 80 }, { "epoch": 0.009555267193582635, "grad_norm": 3.71875, "learning_rate": 1.5319148936170214e-05, "loss": 4.5483, "num_input_tokens_seen": 95551488, "step": 81 }, { "epoch": 0.009673233455231803, "grad_norm": 4.34375, "learning_rate": 1.5508274231678487e-05, "loss": 4.604, "num_input_tokens_seen": 96731136, "step": 82 }, { "epoch": 0.009791199716880972, "grad_norm": 3.09375, "learning_rate": 1.5697399527186764e-05, "loss": 4.5806, "num_input_tokens_seen": 97910784, "step": 83 }, { "epoch": 0.00990916597853014, "grad_norm": 3.203125, "learning_rate": 1.5886524822695038e-05, "loss": 4.4723, "num_input_tokens_seen": 99090432, "step": 84 }, { "epoch": 0.010027132240179308, "grad_norm": 3.8125, "learning_rate": 1.607565011820331e-05, "loss": 4.4723, "num_input_tokens_seen": 100270080, "step": 85 }, { "epoch": 0.010145098501828476, "grad_norm": 3.03125, "learning_rate": 1.6264775413711585e-05, "loss": 4.4519, "num_input_tokens_seen": 101449728, "step": 86 }, { "epoch": 0.010263064763477646, "grad_norm": 5.53125, "learning_rate": 1.645390070921986e-05, "loss": 4.4077, "num_input_tokens_seen": 102629376, "step": 87 }, { "epoch": 0.010381031025126814, "grad_norm": 4.03125, "learning_rate": 1.6643026004728132e-05, "loss": 4.5295, "num_input_tokens_seen": 103809024, "step": 88 }, { "epoch": 0.010498997286775983, "grad_norm": 3.484375, "learning_rate": 1.683215130023641e-05, "loss": 4.3235, "num_input_tokens_seen": 104988672, "step": 89 }, { "epoch": 0.010616963548425151, "grad_norm": 5.78125, "learning_rate": 1.7021276595744682e-05, "loss": 4.3245, "num_input_tokens_seen": 106168320, "step": 90 }, { "epoch": 0.010734929810074319, "grad_norm": 3.0625, "learning_rate": 1.7210401891252956e-05, "loss": 4.2554, "num_input_tokens_seen": 107347968, "step": 91 }, { "epoch": 0.010852896071723487, "grad_norm": 3.671875, "learning_rate": 1.7399527186761233e-05, "loss": 4.2572, "num_input_tokens_seen": 108527616, "step": 92 }, { "epoch": 0.010970862333372655, "grad_norm": 5.0625, "learning_rate": 1.7588652482269506e-05, "loss": 4.2442, "num_input_tokens_seen": 109707264, "step": 93 }, { "epoch": 0.011088828595021824, "grad_norm": 2.6875, "learning_rate": 1.7777777777777777e-05, "loss": 4.3006, "num_input_tokens_seen": 110886912, "step": 94 }, { "epoch": 0.011206794856670992, "grad_norm": 3.171875, "learning_rate": 1.7966903073286054e-05, "loss": 4.2813, "num_input_tokens_seen": 112066560, "step": 95 }, { "epoch": 0.01132476111832016, "grad_norm": 2.765625, "learning_rate": 1.8156028368794327e-05, "loss": 4.1186, "num_input_tokens_seen": 113246208, "step": 96 }, { "epoch": 0.011442727379969328, "grad_norm": 3.1875, "learning_rate": 1.83451536643026e-05, "loss": 4.1652, "num_input_tokens_seen": 114425856, "step": 97 }, { "epoch": 0.011560693641618497, "grad_norm": 4.71875, "learning_rate": 1.8534278959810878e-05, "loss": 4.1923, "num_input_tokens_seen": 115605504, "step": 98 }, { "epoch": 0.011678659903267665, "grad_norm": 3.359375, "learning_rate": 1.872340425531915e-05, "loss": 4.0728, "num_input_tokens_seen": 116785152, "step": 99 }, { "epoch": 0.011796626164916833, "grad_norm": 2.484375, "learning_rate": 1.8912529550827425e-05, "loss": 4.2189, "num_input_tokens_seen": 117964800, "step": 100 }, { "epoch": 0.011914592426566003, "grad_norm": 3.109375, "learning_rate": 1.91016548463357e-05, "loss": 4.0632, "num_input_tokens_seen": 119144448, "step": 101 }, { "epoch": 0.012032558688215171, "grad_norm": 4.40625, "learning_rate": 1.929078014184397e-05, "loss": 3.8775, "num_input_tokens_seen": 120324096, "step": 102 }, { "epoch": 0.01215052494986434, "grad_norm": 2.5625, "learning_rate": 1.9479905437352245e-05, "loss": 3.8658, "num_input_tokens_seen": 121503744, "step": 103 }, { "epoch": 0.012268491211513508, "grad_norm": 3.703125, "learning_rate": 1.9669030732860522e-05, "loss": 3.9888, "num_input_tokens_seen": 122683392, "step": 104 }, { "epoch": 0.012386457473162676, "grad_norm": 2.78125, "learning_rate": 1.9858156028368796e-05, "loss": 3.9439, "num_input_tokens_seen": 123863040, "step": 105 }, { "epoch": 0.012504423734811844, "grad_norm": 3.4375, "learning_rate": 2.004728132387707e-05, "loss": 4.0098, "num_input_tokens_seen": 125042688, "step": 106 }, { "epoch": 0.012622389996461012, "grad_norm": 2.71875, "learning_rate": 2.0236406619385343e-05, "loss": 3.9454, "num_input_tokens_seen": 126222336, "step": 107 }, { "epoch": 0.01274035625811018, "grad_norm": 3.03125, "learning_rate": 2.0425531914893616e-05, "loss": 3.8491, "num_input_tokens_seen": 127401984, "step": 108 }, { "epoch": 0.012858322519759349, "grad_norm": 3.59375, "learning_rate": 2.0614657210401893e-05, "loss": 3.9602, "num_input_tokens_seen": 128581632, "step": 109 }, { "epoch": 0.012976288781408517, "grad_norm": 5.25, "learning_rate": 2.0803782505910167e-05, "loss": 3.9542, "num_input_tokens_seen": 129761280, "step": 110 }, { "epoch": 0.013094255043057685, "grad_norm": 2.34375, "learning_rate": 2.099290780141844e-05, "loss": 3.7846, "num_input_tokens_seen": 130940928, "step": 111 }, { "epoch": 0.013212221304706853, "grad_norm": 3.484375, "learning_rate": 2.1182033096926714e-05, "loss": 3.9128, "num_input_tokens_seen": 132120576, "step": 112 }, { "epoch": 0.013330187566356021, "grad_norm": 7.4375, "learning_rate": 2.137115839243499e-05, "loss": 3.7365, "num_input_tokens_seen": 133300224, "step": 113 }, { "epoch": 0.013448153828005191, "grad_norm": 3.6875, "learning_rate": 2.1560283687943264e-05, "loss": 3.8298, "num_input_tokens_seen": 134479872, "step": 114 }, { "epoch": 0.01356612008965436, "grad_norm": 11.25, "learning_rate": 2.1749408983451538e-05, "loss": 3.9192, "num_input_tokens_seen": 135659520, "step": 115 }, { "epoch": 0.013684086351303528, "grad_norm": 9.875, "learning_rate": 2.1938534278959815e-05, "loss": 3.7628, "num_input_tokens_seen": 136839168, "step": 116 }, { "epoch": 0.013802052612952696, "grad_norm": 4.90625, "learning_rate": 2.2127659574468088e-05, "loss": 3.7212, "num_input_tokens_seen": 138018816, "step": 117 }, { "epoch": 0.013920018874601864, "grad_norm": 6.875, "learning_rate": 2.2316784869976362e-05, "loss": 3.8291, "num_input_tokens_seen": 139198464, "step": 118 }, { "epoch": 0.014037985136251032, "grad_norm": 3.984375, "learning_rate": 2.2505910165484635e-05, "loss": 3.7104, "num_input_tokens_seen": 140378112, "step": 119 }, { "epoch": 0.0141559513979002, "grad_norm": 5.0625, "learning_rate": 2.2695035460992912e-05, "loss": 3.6898, "num_input_tokens_seen": 141557760, "step": 120 }, { "epoch": 0.014273917659549369, "grad_norm": 3.828125, "learning_rate": 2.2884160756501186e-05, "loss": 3.61, "num_input_tokens_seen": 142737408, "step": 121 }, { "epoch": 0.014391883921198537, "grad_norm": 4.875, "learning_rate": 2.307328605200946e-05, "loss": 3.6886, "num_input_tokens_seen": 143917056, "step": 122 }, { "epoch": 0.014509850182847705, "grad_norm": 4.75, "learning_rate": 2.326241134751773e-05, "loss": 3.6435, "num_input_tokens_seen": 145096704, "step": 123 }, { "epoch": 0.014627816444496873, "grad_norm": 3.75, "learning_rate": 2.3451536643026006e-05, "loss": 3.6619, "num_input_tokens_seen": 146276352, "step": 124 }, { "epoch": 0.014745782706146042, "grad_norm": 5.375, "learning_rate": 2.364066193853428e-05, "loss": 3.631, "num_input_tokens_seen": 147456000, "step": 125 }, { "epoch": 0.01486374896779521, "grad_norm": 4.125, "learning_rate": 2.3829787234042553e-05, "loss": 3.6482, "num_input_tokens_seen": 148635648, "step": 126 }, { "epoch": 0.014981715229444378, "grad_norm": 7.78125, "learning_rate": 2.4018912529550827e-05, "loss": 3.6785, "num_input_tokens_seen": 149815296, "step": 127 }, { "epoch": 0.015099681491093548, "grad_norm": 7.0, "learning_rate": 2.4208037825059104e-05, "loss": 3.6113, "num_input_tokens_seen": 150994944, "step": 128 }, { "epoch": 0.015217647752742716, "grad_norm": 4.46875, "learning_rate": 2.4397163120567377e-05, "loss": 3.6015, "num_input_tokens_seen": 152174592, "step": 129 }, { "epoch": 0.015335614014391884, "grad_norm": 4.1875, "learning_rate": 2.458628841607565e-05, "loss": 3.5241, "num_input_tokens_seen": 153354240, "step": 130 }, { "epoch": 0.015453580276041053, "grad_norm": 4.0625, "learning_rate": 2.4775413711583928e-05, "loss": 3.6007, "num_input_tokens_seen": 154533888, "step": 131 }, { "epoch": 0.01557154653769022, "grad_norm": 3.046875, "learning_rate": 2.49645390070922e-05, "loss": 3.5949, "num_input_tokens_seen": 155713536, "step": 132 }, { "epoch": 0.01568951279933939, "grad_norm": 3.265625, "learning_rate": 2.5153664302600475e-05, "loss": 3.5403, "num_input_tokens_seen": 156893184, "step": 133 }, { "epoch": 0.015807479060988557, "grad_norm": 2.859375, "learning_rate": 2.5342789598108752e-05, "loss": 3.5399, "num_input_tokens_seen": 158072832, "step": 134 }, { "epoch": 0.015925445322637725, "grad_norm": 2.6875, "learning_rate": 2.5531914893617025e-05, "loss": 3.5408, "num_input_tokens_seen": 159252480, "step": 135 }, { "epoch": 0.016043411584286894, "grad_norm": 2.03125, "learning_rate": 2.57210401891253e-05, "loss": 3.4678, "num_input_tokens_seen": 160432128, "step": 136 }, { "epoch": 0.016161377845936062, "grad_norm": 2.796875, "learning_rate": 2.5910165484633572e-05, "loss": 3.6019, "num_input_tokens_seen": 161611776, "step": 137 }, { "epoch": 0.01627934410758523, "grad_norm": 1.96875, "learning_rate": 2.609929078014185e-05, "loss": 3.4525, "num_input_tokens_seen": 162791424, "step": 138 }, { "epoch": 0.0163973103692344, "grad_norm": 2.390625, "learning_rate": 2.628841607565012e-05, "loss": 3.557, "num_input_tokens_seen": 163971072, "step": 139 }, { "epoch": 0.016515276630883566, "grad_norm": 2.734375, "learning_rate": 2.6477541371158393e-05, "loss": 3.5324, "num_input_tokens_seen": 165150720, "step": 140 }, { "epoch": 0.016633242892532735, "grad_norm": 3.703125, "learning_rate": 2.6666666666666667e-05, "loss": 3.4707, "num_input_tokens_seen": 166330368, "step": 141 }, { "epoch": 0.016751209154181903, "grad_norm": 3.53125, "learning_rate": 2.6855791962174944e-05, "loss": 3.3545, "num_input_tokens_seen": 167510016, "step": 142 }, { "epoch": 0.01686917541583107, "grad_norm": 1.7890625, "learning_rate": 2.7044917257683217e-05, "loss": 3.4051, "num_input_tokens_seen": 168689664, "step": 143 }, { "epoch": 0.01698714167748024, "grad_norm": 4.78125, "learning_rate": 2.723404255319149e-05, "loss": 3.4119, "num_input_tokens_seen": 169869312, "step": 144 }, { "epoch": 0.017105107939129408, "grad_norm": 3.0625, "learning_rate": 2.7423167848699764e-05, "loss": 3.5149, "num_input_tokens_seen": 171048960, "step": 145 }, { "epoch": 0.017223074200778576, "grad_norm": 3.671875, "learning_rate": 2.761229314420804e-05, "loss": 3.3442, "num_input_tokens_seen": 172228608, "step": 146 }, { "epoch": 0.017341040462427744, "grad_norm": 3.453125, "learning_rate": 2.7801418439716315e-05, "loss": 3.3277, "num_input_tokens_seen": 173408256, "step": 147 }, { "epoch": 0.017459006724076916, "grad_norm": 3.4375, "learning_rate": 2.7990543735224588e-05, "loss": 3.3905, "num_input_tokens_seen": 174587904, "step": 148 }, { "epoch": 0.017576972985726084, "grad_norm": 2.0, "learning_rate": 2.8179669030732865e-05, "loss": 3.276, "num_input_tokens_seen": 175767552, "step": 149 }, { "epoch": 0.017694939247375252, "grad_norm": 4.0, "learning_rate": 2.836879432624114e-05, "loss": 3.3211, "num_input_tokens_seen": 176947200, "step": 150 }, { "epoch": 0.01781290550902442, "grad_norm": 4.03125, "learning_rate": 2.8557919621749412e-05, "loss": 3.3483, "num_input_tokens_seen": 178126848, "step": 151 }, { "epoch": 0.01793087177067359, "grad_norm": 2.53125, "learning_rate": 2.8747044917257686e-05, "loss": 3.3391, "num_input_tokens_seen": 179306496, "step": 152 }, { "epoch": 0.018048838032322757, "grad_norm": 5.5625, "learning_rate": 2.8936170212765963e-05, "loss": 3.4719, "num_input_tokens_seen": 180486144, "step": 153 }, { "epoch": 0.018166804293971925, "grad_norm": 3.171875, "learning_rate": 2.9125295508274236e-05, "loss": 3.2727, "num_input_tokens_seen": 181665792, "step": 154 }, { "epoch": 0.018284770555621093, "grad_norm": 5.09375, "learning_rate": 2.9314420803782506e-05, "loss": 3.284, "num_input_tokens_seen": 182845440, "step": 155 }, { "epoch": 0.01840273681727026, "grad_norm": 3.25, "learning_rate": 2.950354609929078e-05, "loss": 3.2279, "num_input_tokens_seen": 184025088, "step": 156 }, { "epoch": 0.01852070307891943, "grad_norm": 3.71875, "learning_rate": 2.9692671394799057e-05, "loss": 3.2438, "num_input_tokens_seen": 185204736, "step": 157 }, { "epoch": 0.018638669340568598, "grad_norm": 3.875, "learning_rate": 2.988179669030733e-05, "loss": 3.3257, "num_input_tokens_seen": 186384384, "step": 158 }, { "epoch": 0.018756635602217766, "grad_norm": 2.21875, "learning_rate": 3.0070921985815604e-05, "loss": 3.2727, "num_input_tokens_seen": 187564032, "step": 159 }, { "epoch": 0.018874601863866934, "grad_norm": 4.125, "learning_rate": 3.0260047281323877e-05, "loss": 3.245, "num_input_tokens_seen": 188743680, "step": 160 }, { "epoch": 0.018992568125516102, "grad_norm": 3.640625, "learning_rate": 3.0449172576832154e-05, "loss": 3.1904, "num_input_tokens_seen": 189923328, "step": 161 }, { "epoch": 0.01911053438716527, "grad_norm": 2.625, "learning_rate": 3.063829787234043e-05, "loss": 3.2754, "num_input_tokens_seen": 191102976, "step": 162 }, { "epoch": 0.01922850064881444, "grad_norm": 3.578125, "learning_rate": 3.0827423167848705e-05, "loss": 3.1889, "num_input_tokens_seen": 192282624, "step": 163 }, { "epoch": 0.019346466910463607, "grad_norm": 2.953125, "learning_rate": 3.1016548463356975e-05, "loss": 3.2809, "num_input_tokens_seen": 193462272, "step": 164 }, { "epoch": 0.019464433172112775, "grad_norm": 2.234375, "learning_rate": 3.120567375886525e-05, "loss": 3.1929, "num_input_tokens_seen": 194641920, "step": 165 }, { "epoch": 0.019582399433761943, "grad_norm": 2.515625, "learning_rate": 3.139479905437353e-05, "loss": 3.1474, "num_input_tokens_seen": 195821568, "step": 166 }, { "epoch": 0.01970036569541111, "grad_norm": 4.09375, "learning_rate": 3.15839243498818e-05, "loss": 3.0433, "num_input_tokens_seen": 197001216, "step": 167 }, { "epoch": 0.01981833195706028, "grad_norm": 2.6875, "learning_rate": 3.1773049645390076e-05, "loss": 3.1527, "num_input_tokens_seen": 198180864, "step": 168 }, { "epoch": 0.019936298218709448, "grad_norm": 3.5, "learning_rate": 3.196217494089835e-05, "loss": 3.2145, "num_input_tokens_seen": 199360512, "step": 169 }, { "epoch": 0.020054264480358616, "grad_norm": 4.9375, "learning_rate": 3.215130023640662e-05, "loss": 3.2642, "num_input_tokens_seen": 200540160, "step": 170 }, { "epoch": 0.020172230742007784, "grad_norm": 2.484375, "learning_rate": 3.234042553191489e-05, "loss": 3.1286, "num_input_tokens_seen": 201719808, "step": 171 }, { "epoch": 0.020290197003656953, "grad_norm": 8.1875, "learning_rate": 3.252955082742317e-05, "loss": 3.1274, "num_input_tokens_seen": 202899456, "step": 172 }, { "epoch": 0.02040816326530612, "grad_norm": 6.375, "learning_rate": 3.271867612293144e-05, "loss": 3.096, "num_input_tokens_seen": 204079104, "step": 173 }, { "epoch": 0.020526129526955292, "grad_norm": 5.8125, "learning_rate": 3.290780141843972e-05, "loss": 3.1647, "num_input_tokens_seen": 205258752, "step": 174 }, { "epoch": 0.02064409578860446, "grad_norm": 5.6875, "learning_rate": 3.3096926713947994e-05, "loss": 3.269, "num_input_tokens_seen": 206438400, "step": 175 }, { "epoch": 0.02076206205025363, "grad_norm": 4.4375, "learning_rate": 3.3286052009456264e-05, "loss": 3.2106, "num_input_tokens_seen": 207618048, "step": 176 }, { "epoch": 0.020880028311902797, "grad_norm": 3.109375, "learning_rate": 3.347517730496454e-05, "loss": 3.1545, "num_input_tokens_seen": 208797696, "step": 177 }, { "epoch": 0.020997994573551965, "grad_norm": 6.6875, "learning_rate": 3.366430260047282e-05, "loss": 3.1045, "num_input_tokens_seen": 209977344, "step": 178 }, { "epoch": 0.021115960835201134, "grad_norm": 6.0625, "learning_rate": 3.385342789598109e-05, "loss": 3.0496, "num_input_tokens_seen": 211156992, "step": 179 }, { "epoch": 0.021233927096850302, "grad_norm": 4.03125, "learning_rate": 3.4042553191489365e-05, "loss": 3.081, "num_input_tokens_seen": 212336640, "step": 180 }, { "epoch": 0.02135189335849947, "grad_norm": 3.921875, "learning_rate": 3.423167848699764e-05, "loss": 3.0552, "num_input_tokens_seen": 213516288, "step": 181 }, { "epoch": 0.021469859620148638, "grad_norm": 4.5, "learning_rate": 3.442080378250591e-05, "loss": 3.0172, "num_input_tokens_seen": 214695936, "step": 182 }, { "epoch": 0.021587825881797806, "grad_norm": 4.0625, "learning_rate": 3.460992907801419e-05, "loss": 3.1379, "num_input_tokens_seen": 215875584, "step": 183 }, { "epoch": 0.021705792143446975, "grad_norm": 4.9375, "learning_rate": 3.4799054373522466e-05, "loss": 3.1235, "num_input_tokens_seen": 217055232, "step": 184 }, { "epoch": 0.021823758405096143, "grad_norm": 3.78125, "learning_rate": 3.4988179669030736e-05, "loss": 3.1189, "num_input_tokens_seen": 218234880, "step": 185 }, { "epoch": 0.02194172466674531, "grad_norm": 5.0625, "learning_rate": 3.517730496453901e-05, "loss": 3.0035, "num_input_tokens_seen": 219414528, "step": 186 }, { "epoch": 0.02205969092839448, "grad_norm": 4.28125, "learning_rate": 3.536643026004728e-05, "loss": 3.0478, "num_input_tokens_seen": 220594176, "step": 187 }, { "epoch": 0.022177657190043647, "grad_norm": 5.8125, "learning_rate": 3.555555555555555e-05, "loss": 3.0777, "num_input_tokens_seen": 221773824, "step": 188 }, { "epoch": 0.022295623451692816, "grad_norm": 5.0625, "learning_rate": 3.574468085106383e-05, "loss": 3.0665, "num_input_tokens_seen": 222953472, "step": 189 }, { "epoch": 0.022413589713341984, "grad_norm": 4.25, "learning_rate": 3.593380614657211e-05, "loss": 3.0271, "num_input_tokens_seen": 224133120, "step": 190 }, { "epoch": 0.022531555974991152, "grad_norm": 4.03125, "learning_rate": 3.612293144208038e-05, "loss": 3.033, "num_input_tokens_seen": 225312768, "step": 191 }, { "epoch": 0.02264952223664032, "grad_norm": 5.09375, "learning_rate": 3.6312056737588654e-05, "loss": 3.143, "num_input_tokens_seen": 226492416, "step": 192 }, { "epoch": 0.02276748849828949, "grad_norm": 4.09375, "learning_rate": 3.650118203309693e-05, "loss": 3.0347, "num_input_tokens_seen": 227672064, "step": 193 }, { "epoch": 0.022885454759938657, "grad_norm": 4.65625, "learning_rate": 3.66903073286052e-05, "loss": 3.07, "num_input_tokens_seen": 228851712, "step": 194 }, { "epoch": 0.023003421021587825, "grad_norm": 3.90625, "learning_rate": 3.687943262411348e-05, "loss": 3.0225, "num_input_tokens_seen": 230031360, "step": 195 }, { "epoch": 0.023121387283236993, "grad_norm": 4.96875, "learning_rate": 3.7068557919621755e-05, "loss": 3.0222, "num_input_tokens_seen": 231211008, "step": 196 }, { "epoch": 0.02323935354488616, "grad_norm": 3.84375, "learning_rate": 3.7257683215130025e-05, "loss": 2.9755, "num_input_tokens_seen": 232390656, "step": 197 }, { "epoch": 0.02335731980653533, "grad_norm": 5.0, "learning_rate": 3.74468085106383e-05, "loss": 3.0163, "num_input_tokens_seen": 233570304, "step": 198 }, { "epoch": 0.023475286068184498, "grad_norm": 4.53125, "learning_rate": 3.763593380614658e-05, "loss": 2.9553, "num_input_tokens_seen": 234749952, "step": 199 }, { "epoch": 0.023593252329833666, "grad_norm": 4.6875, "learning_rate": 3.782505910165485e-05, "loss": 3.015, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.9615590572357178, "eval_wikipedia_runtime": 172.3085, "eval_wikipedia_samples_per_second": 4.074, "eval_wikipedia_steps_per_second": 0.174, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.73836088180542, "eval_toxicity_runtime": 0.999, "eval_toxicity_samples_per_second": 2.002, "eval_toxicity_steps_per_second": 1.001, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023711218591482838, "grad_norm": 4.0, "learning_rate": 3.8014184397163126e-05, "loss": 2.9448, "num_input_tokens_seen": 237109248, "step": 201 }, { "epoch": 0.023829184853132006, "grad_norm": 4.78125, "learning_rate": 3.82033096926714e-05, "loss": 3.1131, "num_input_tokens_seen": 238288896, "step": 202 }, { "epoch": 0.023947151114781174, "grad_norm": 3.828125, "learning_rate": 3.839243498817967e-05, "loss": 2.9217, "num_input_tokens_seen": 239468544, "step": 203 }, { "epoch": 0.024065117376430342, "grad_norm": 4.46875, "learning_rate": 3.858156028368794e-05, "loss": 2.9658, "num_input_tokens_seen": 240648192, "step": 204 }, { "epoch": 0.02418308363807951, "grad_norm": 3.703125, "learning_rate": 3.877068557919622e-05, "loss": 2.9857, "num_input_tokens_seen": 241827840, "step": 205 }, { "epoch": 0.02430104989972868, "grad_norm": 4.9375, "learning_rate": 3.895981087470449e-05, "loss": 2.9761, "num_input_tokens_seen": 243007488, "step": 206 }, { "epoch": 0.024419016161377847, "grad_norm": 4.28125, "learning_rate": 3.914893617021277e-05, "loss": 2.981, "num_input_tokens_seen": 244187136, "step": 207 }, { "epoch": 0.024536982423027015, "grad_norm": 4.3125, "learning_rate": 3.9338061465721044e-05, "loss": 3.0308, "num_input_tokens_seen": 245366784, "step": 208 }, { "epoch": 0.024654948684676183, "grad_norm": 4.0, "learning_rate": 3.9527186761229314e-05, "loss": 2.9529, "num_input_tokens_seen": 246546432, "step": 209 }, { "epoch": 0.02477291494632535, "grad_norm": 4.28125, "learning_rate": 3.971631205673759e-05, "loss": 2.9297, "num_input_tokens_seen": 247726080, "step": 210 }, { "epoch": 0.02489088120797452, "grad_norm": 3.328125, "learning_rate": 3.990543735224587e-05, "loss": 2.8328, "num_input_tokens_seen": 248905728, "step": 211 }, { "epoch": 0.025008847469623688, "grad_norm": 4.6875, "learning_rate": 4.009456264775414e-05, "loss": 2.9565, "num_input_tokens_seen": 250085376, "step": 212 }, { "epoch": 0.025126813731272856, "grad_norm": 3.734375, "learning_rate": 4.028368794326241e-05, "loss": 2.9276, "num_input_tokens_seen": 251265024, "step": 213 }, { "epoch": 0.025244779992922024, "grad_norm": 4.90625, "learning_rate": 4.0472813238770685e-05, "loss": 2.8872, "num_input_tokens_seen": 252444672, "step": 214 }, { "epoch": 0.025362746254571193, "grad_norm": 3.984375, "learning_rate": 4.066193853427896e-05, "loss": 2.9423, "num_input_tokens_seen": 253624320, "step": 215 }, { "epoch": 0.02548071251622036, "grad_norm": 4.28125, "learning_rate": 4.085106382978723e-05, "loss": 2.9417, "num_input_tokens_seen": 254803968, "step": 216 }, { "epoch": 0.02559867877786953, "grad_norm": 3.53125, "learning_rate": 4.104018912529551e-05, "loss": 2.878, "num_input_tokens_seen": 255983616, "step": 217 }, { "epoch": 0.025716645039518697, "grad_norm": 3.71875, "learning_rate": 4.1229314420803786e-05, "loss": 2.8925, "num_input_tokens_seen": 257163264, "step": 218 }, { "epoch": 0.025834611301167865, "grad_norm": 3.109375, "learning_rate": 4.1418439716312056e-05, "loss": 2.9432, "num_input_tokens_seen": 258342912, "step": 219 }, { "epoch": 0.025952577562817034, "grad_norm": 4.875, "learning_rate": 4.1607565011820333e-05, "loss": 2.8552, "num_input_tokens_seen": 259522560, "step": 220 }, { "epoch": 0.026070543824466202, "grad_norm": 3.09375, "learning_rate": 4.1796690307328604e-05, "loss": 2.8685, "num_input_tokens_seen": 260702208, "step": 221 }, { "epoch": 0.02618851008611537, "grad_norm": 4.6875, "learning_rate": 4.198581560283688e-05, "loss": 2.9284, "num_input_tokens_seen": 261881856, "step": 222 }, { "epoch": 0.026306476347764538, "grad_norm": 3.265625, "learning_rate": 4.217494089834516e-05, "loss": 2.8862, "num_input_tokens_seen": 263061504, "step": 223 }, { "epoch": 0.026424442609413706, "grad_norm": 4.5625, "learning_rate": 4.236406619385343e-05, "loss": 2.8105, "num_input_tokens_seen": 264241152, "step": 224 }, { "epoch": 0.026542408871062875, "grad_norm": 3.546875, "learning_rate": 4.2553191489361704e-05, "loss": 2.9406, "num_input_tokens_seen": 265420800, "step": 225 }, { "epoch": 0.026660375132712043, "grad_norm": 3.640625, "learning_rate": 4.274231678486998e-05, "loss": 2.907, "num_input_tokens_seen": 266600448, "step": 226 }, { "epoch": 0.02677834139436121, "grad_norm": 3.015625, "learning_rate": 4.293144208037825e-05, "loss": 3.0143, "num_input_tokens_seen": 267780096, "step": 227 }, { "epoch": 0.026896307656010383, "grad_norm": 4.25, "learning_rate": 4.312056737588653e-05, "loss": 2.9126, "num_input_tokens_seen": 268959744, "step": 228 }, { "epoch": 0.02701427391765955, "grad_norm": 2.5, "learning_rate": 4.3309692671394805e-05, "loss": 2.8879, "num_input_tokens_seen": 270139392, "step": 229 }, { "epoch": 0.02713224017930872, "grad_norm": 3.34375, "learning_rate": 4.3498817966903076e-05, "loss": 2.8174, "num_input_tokens_seen": 271319040, "step": 230 }, { "epoch": 0.027250206440957887, "grad_norm": 4.40625, "learning_rate": 4.368794326241135e-05, "loss": 2.895, "num_input_tokens_seen": 272498688, "step": 231 }, { "epoch": 0.027368172702607056, "grad_norm": 2.140625, "learning_rate": 4.387706855791963e-05, "loss": 2.8573, "num_input_tokens_seen": 273678336, "step": 232 }, { "epoch": 0.027486138964256224, "grad_norm": 3.609375, "learning_rate": 4.40661938534279e-05, "loss": 2.7662, "num_input_tokens_seen": 274857984, "step": 233 }, { "epoch": 0.027604105225905392, "grad_norm": 3.484375, "learning_rate": 4.4255319148936176e-05, "loss": 2.8103, "num_input_tokens_seen": 276037632, "step": 234 }, { "epoch": 0.02772207148755456, "grad_norm": 1.9765625, "learning_rate": 4.444444444444445e-05, "loss": 2.8271, "num_input_tokens_seen": 277217280, "step": 235 }, { "epoch": 0.02784003774920373, "grad_norm": 3.328125, "learning_rate": 4.4633569739952723e-05, "loss": 2.8881, "num_input_tokens_seen": 278396928, "step": 236 }, { "epoch": 0.027958004010852897, "grad_norm": 3.9375, "learning_rate": 4.4822695035461e-05, "loss": 2.9018, "num_input_tokens_seen": 279576576, "step": 237 }, { "epoch": 0.028075970272502065, "grad_norm": 5.25, "learning_rate": 4.501182033096927e-05, "loss": 2.8584, "num_input_tokens_seen": 280756224, "step": 238 }, { "epoch": 0.028193936534151233, "grad_norm": 2.328125, "learning_rate": 4.520094562647755e-05, "loss": 2.8435, "num_input_tokens_seen": 281935872, "step": 239 }, { "epoch": 0.0283119027958004, "grad_norm": 3.375, "learning_rate": 4.5390070921985824e-05, "loss": 2.8827, "num_input_tokens_seen": 283115520, "step": 240 }, { "epoch": 0.02842986905744957, "grad_norm": 2.3125, "learning_rate": 4.5579196217494095e-05, "loss": 2.8415, "num_input_tokens_seen": 284295168, "step": 241 }, { "epoch": 0.028547835319098738, "grad_norm": 3.828125, "learning_rate": 4.576832151300237e-05, "loss": 2.8305, "num_input_tokens_seen": 285474816, "step": 242 }, { "epoch": 0.028665801580747906, "grad_norm": 2.5625, "learning_rate": 4.595744680851065e-05, "loss": 2.7947, "num_input_tokens_seen": 286654464, "step": 243 }, { "epoch": 0.028783767842397074, "grad_norm": 2.375, "learning_rate": 4.614657210401892e-05, "loss": 2.812, "num_input_tokens_seen": 287834112, "step": 244 }, { "epoch": 0.028901734104046242, "grad_norm": 2.578125, "learning_rate": 4.633569739952719e-05, "loss": 2.8157, "num_input_tokens_seen": 289013760, "step": 245 }, { "epoch": 0.02901970036569541, "grad_norm": 3.625, "learning_rate": 4.652482269503546e-05, "loss": 2.8768, "num_input_tokens_seen": 290193408, "step": 246 }, { "epoch": 0.02913766662734458, "grad_norm": 2.328125, "learning_rate": 4.6713947990543736e-05, "loss": 2.834, "num_input_tokens_seen": 291373056, "step": 247 }, { "epoch": 0.029255632888993747, "grad_norm": 2.1875, "learning_rate": 4.690307328605201e-05, "loss": 2.8125, "num_input_tokens_seen": 292552704, "step": 248 }, { "epoch": 0.029373599150642915, "grad_norm": 2.921875, "learning_rate": 4.709219858156028e-05, "loss": 2.7906, "num_input_tokens_seen": 293732352, "step": 249 }, { "epoch": 0.029491565412292083, "grad_norm": 1.921875, "learning_rate": 4.728132387706856e-05, "loss": 2.8163, "num_input_tokens_seen": 294912000, "step": 250 }, { "epoch": 0.02960953167394125, "grad_norm": 2.75, "learning_rate": 4.747044917257684e-05, "loss": 2.7826, "num_input_tokens_seen": 296091648, "step": 251 }, { "epoch": 0.02972749793559042, "grad_norm": 2.671875, "learning_rate": 4.765957446808511e-05, "loss": 2.8368, "num_input_tokens_seen": 297271296, "step": 252 }, { "epoch": 0.029845464197239588, "grad_norm": 2.796875, "learning_rate": 4.7848699763593384e-05, "loss": 2.7188, "num_input_tokens_seen": 298450944, "step": 253 }, { "epoch": 0.029963430458888756, "grad_norm": 1.984375, "learning_rate": 4.8037825059101654e-05, "loss": 2.7609, "num_input_tokens_seen": 299630592, "step": 254 }, { "epoch": 0.030081396720537928, "grad_norm": 2.484375, "learning_rate": 4.822695035460993e-05, "loss": 2.742, "num_input_tokens_seen": 300810240, "step": 255 }, { "epoch": 0.030199362982187096, "grad_norm": 1.84375, "learning_rate": 4.841607565011821e-05, "loss": 2.7751, "num_input_tokens_seen": 301989888, "step": 256 }, { "epoch": 0.030317329243836264, "grad_norm": 3.078125, "learning_rate": 4.860520094562648e-05, "loss": 2.6949, "num_input_tokens_seen": 303169536, "step": 257 }, { "epoch": 0.030435295505485432, "grad_norm": 3.25, "learning_rate": 4.8794326241134755e-05, "loss": 2.8174, "num_input_tokens_seen": 304349184, "step": 258 }, { "epoch": 0.0305532617671346, "grad_norm": 1.4921875, "learning_rate": 4.898345153664303e-05, "loss": 2.6962, "num_input_tokens_seen": 305528832, "step": 259 }, { "epoch": 0.03067122802878377, "grad_norm": 3.046875, "learning_rate": 4.91725768321513e-05, "loss": 2.8484, "num_input_tokens_seen": 306708480, "step": 260 }, { "epoch": 0.030789194290432937, "grad_norm": 2.78125, "learning_rate": 4.936170212765958e-05, "loss": 2.632, "num_input_tokens_seen": 307888128, "step": 261 }, { "epoch": 0.030907160552082105, "grad_norm": 2.546875, "learning_rate": 4.9550827423167856e-05, "loss": 2.8744, "num_input_tokens_seen": 309067776, "step": 262 }, { "epoch": 0.031025126813731273, "grad_norm": 2.359375, "learning_rate": 4.9739952718676126e-05, "loss": 2.7714, "num_input_tokens_seen": 310247424, "step": 263 }, { "epoch": 0.03114309307538044, "grad_norm": 1.8515625, "learning_rate": 4.99290780141844e-05, "loss": 2.8042, "num_input_tokens_seen": 311427072, "step": 264 }, { "epoch": 0.031261059337029606, "grad_norm": 2.265625, "learning_rate": 5.011820330969268e-05, "loss": 2.7448, "num_input_tokens_seen": 312606720, "step": 265 }, { "epoch": 0.03137902559867878, "grad_norm": 1.78125, "learning_rate": 5.030732860520095e-05, "loss": 2.678, "num_input_tokens_seen": 313786368, "step": 266 }, { "epoch": 0.03149699186032794, "grad_norm": 2.203125, "learning_rate": 5.049645390070923e-05, "loss": 2.676, "num_input_tokens_seen": 314966016, "step": 267 }, { "epoch": 0.031614958121977114, "grad_norm": 1.90625, "learning_rate": 5.0685579196217504e-05, "loss": 2.6778, "num_input_tokens_seen": 316145664, "step": 268 }, { "epoch": 0.031732924383626286, "grad_norm": 2.203125, "learning_rate": 5.0874704491725774e-05, "loss": 2.7624, "num_input_tokens_seen": 317325312, "step": 269 }, { "epoch": 0.03185089064527545, "grad_norm": 3.53125, "learning_rate": 5.106382978723405e-05, "loss": 2.6462, "num_input_tokens_seen": 318504960, "step": 270 }, { "epoch": 0.03196885690692462, "grad_norm": 2.1875, "learning_rate": 5.125295508274232e-05, "loss": 2.6475, "num_input_tokens_seen": 319684608, "step": 271 }, { "epoch": 0.03208682316857379, "grad_norm": 1.703125, "learning_rate": 5.14420803782506e-05, "loss": 2.7214, "num_input_tokens_seen": 320864256, "step": 272 }, { "epoch": 0.03220478943022296, "grad_norm": 2.25, "learning_rate": 5.1631205673758875e-05, "loss": 2.6597, "num_input_tokens_seen": 322043904, "step": 273 }, { "epoch": 0.032322755691872124, "grad_norm": 2.171875, "learning_rate": 5.1820330969267145e-05, "loss": 2.6412, "num_input_tokens_seen": 323223552, "step": 274 }, { "epoch": 0.032440721953521295, "grad_norm": 2.25, "learning_rate": 5.200945626477542e-05, "loss": 2.7664, "num_input_tokens_seen": 324403200, "step": 275 }, { "epoch": 0.03255868821517046, "grad_norm": 3.296875, "learning_rate": 5.21985815602837e-05, "loss": 2.7168, "num_input_tokens_seen": 325582848, "step": 276 }, { "epoch": 0.03267665447681963, "grad_norm": 1.515625, "learning_rate": 5.238770685579196e-05, "loss": 2.698, "num_input_tokens_seen": 326762496, "step": 277 }, { "epoch": 0.0327946207384688, "grad_norm": 1.75, "learning_rate": 5.257683215130024e-05, "loss": 2.7068, "num_input_tokens_seen": 327942144, "step": 278 }, { "epoch": 0.03291258700011797, "grad_norm": 2.859375, "learning_rate": 5.276595744680851e-05, "loss": 2.6487, "num_input_tokens_seen": 329121792, "step": 279 }, { "epoch": 0.03303055326176713, "grad_norm": 2.515625, "learning_rate": 5.2955082742316786e-05, "loss": 2.5925, "num_input_tokens_seen": 330301440, "step": 280 }, { "epoch": 0.033148519523416305, "grad_norm": 2.40625, "learning_rate": 5.314420803782506e-05, "loss": 2.6826, "num_input_tokens_seen": 331481088, "step": 281 }, { "epoch": 0.03326648578506547, "grad_norm": 2.015625, "learning_rate": 5.333333333333333e-05, "loss": 2.6997, "num_input_tokens_seen": 332660736, "step": 282 }, { "epoch": 0.03338445204671464, "grad_norm": 2.234375, "learning_rate": 5.352245862884161e-05, "loss": 2.6378, "num_input_tokens_seen": 333840384, "step": 283 }, { "epoch": 0.033502418308363806, "grad_norm": 1.984375, "learning_rate": 5.371158392434989e-05, "loss": 2.588, "num_input_tokens_seen": 335020032, "step": 284 }, { "epoch": 0.03362038457001298, "grad_norm": 2.546875, "learning_rate": 5.390070921985816e-05, "loss": 2.6051, "num_input_tokens_seen": 336199680, "step": 285 }, { "epoch": 0.03373835083166214, "grad_norm": 1.7421875, "learning_rate": 5.4089834515366434e-05, "loss": 2.6874, "num_input_tokens_seen": 337379328, "step": 286 }, { "epoch": 0.033856317093311314, "grad_norm": 2.09375, "learning_rate": 5.4278959810874704e-05, "loss": 2.7186, "num_input_tokens_seen": 338558976, "step": 287 }, { "epoch": 0.03397428335496048, "grad_norm": 1.796875, "learning_rate": 5.446808510638298e-05, "loss": 2.6698, "num_input_tokens_seen": 339738624, "step": 288 }, { "epoch": 0.03409224961660965, "grad_norm": 2.3125, "learning_rate": 5.465721040189126e-05, "loss": 2.603, "num_input_tokens_seen": 340918272, "step": 289 }, { "epoch": 0.034210215878258815, "grad_norm": 2.140625, "learning_rate": 5.484633569739953e-05, "loss": 2.6479, "num_input_tokens_seen": 342097920, "step": 290 }, { "epoch": 0.03432818213990799, "grad_norm": 2.140625, "learning_rate": 5.5035460992907805e-05, "loss": 2.7037, "num_input_tokens_seen": 343277568, "step": 291 }, { "epoch": 0.03444614840155715, "grad_norm": 1.90625, "learning_rate": 5.522458628841608e-05, "loss": 2.681, "num_input_tokens_seen": 344457216, "step": 292 }, { "epoch": 0.03456411466320632, "grad_norm": 2.046875, "learning_rate": 5.541371158392435e-05, "loss": 2.5947, "num_input_tokens_seen": 345636864, "step": 293 }, { "epoch": 0.03468208092485549, "grad_norm": 1.734375, "learning_rate": 5.560283687943263e-05, "loss": 2.6439, "num_input_tokens_seen": 346816512, "step": 294 }, { "epoch": 0.03480004718650466, "grad_norm": 2.28125, "learning_rate": 5.5791962174940906e-05, "loss": 2.6404, "num_input_tokens_seen": 347996160, "step": 295 }, { "epoch": 0.03491801344815383, "grad_norm": 2.03125, "learning_rate": 5.5981087470449176e-05, "loss": 2.6686, "num_input_tokens_seen": 349175808, "step": 296 }, { "epoch": 0.035035979709802996, "grad_norm": 1.7265625, "learning_rate": 5.617021276595745e-05, "loss": 2.666, "num_input_tokens_seen": 350355456, "step": 297 }, { "epoch": 0.03515394597145217, "grad_norm": 2.109375, "learning_rate": 5.635933806146573e-05, "loss": 2.7266, "num_input_tokens_seen": 351535104, "step": 298 }, { "epoch": 0.03527191223310133, "grad_norm": 2.03125, "learning_rate": 5.6548463356974e-05, "loss": 2.6655, "num_input_tokens_seen": 352714752, "step": 299 }, { "epoch": 0.035389878494750504, "grad_norm": 1.9296875, "learning_rate": 5.673758865248228e-05, "loss": 2.6975, "num_input_tokens_seen": 353894400, "step": 300 }, { "epoch": 0.03550784475639967, "grad_norm": 2.15625, "learning_rate": 5.692671394799055e-05, "loss": 2.6232, "num_input_tokens_seen": 355074048, "step": 301 }, { "epoch": 0.03562581101804884, "grad_norm": 2.140625, "learning_rate": 5.7115839243498824e-05, "loss": 2.6344, "num_input_tokens_seen": 356253696, "step": 302 }, { "epoch": 0.035743777279698005, "grad_norm": 1.7109375, "learning_rate": 5.73049645390071e-05, "loss": 2.5703, "num_input_tokens_seen": 357433344, "step": 303 }, { "epoch": 0.03586174354134718, "grad_norm": 2.125, "learning_rate": 5.749408983451537e-05, "loss": 2.5405, "num_input_tokens_seen": 358612992, "step": 304 }, { "epoch": 0.03597970980299634, "grad_norm": 2.546875, "learning_rate": 5.768321513002365e-05, "loss": 2.6079, "num_input_tokens_seen": 359792640, "step": 305 }, { "epoch": 0.03609767606464551, "grad_norm": 2.015625, "learning_rate": 5.7872340425531925e-05, "loss": 2.6958, "num_input_tokens_seen": 360972288, "step": 306 }, { "epoch": 0.03621564232629468, "grad_norm": 1.46875, "learning_rate": 5.8061465721040195e-05, "loss": 2.5541, "num_input_tokens_seen": 362151936, "step": 307 }, { "epoch": 0.03633360858794385, "grad_norm": 1.796875, "learning_rate": 5.825059101654847e-05, "loss": 2.6427, "num_input_tokens_seen": 363331584, "step": 308 }, { "epoch": 0.036451574849593014, "grad_norm": 2.171875, "learning_rate": 5.843971631205675e-05, "loss": 2.5853, "num_input_tokens_seen": 364511232, "step": 309 }, { "epoch": 0.036569541111242186, "grad_norm": 1.8046875, "learning_rate": 5.862884160756501e-05, "loss": 2.5426, "num_input_tokens_seen": 365690880, "step": 310 }, { "epoch": 0.03668750737289135, "grad_norm": 1.625, "learning_rate": 5.881796690307329e-05, "loss": 2.6835, "num_input_tokens_seen": 366870528, "step": 311 }, { "epoch": 0.03680547363454052, "grad_norm": 2.640625, "learning_rate": 5.900709219858156e-05, "loss": 2.626, "num_input_tokens_seen": 368050176, "step": 312 }, { "epoch": 0.03692343989618969, "grad_norm": 1.4765625, "learning_rate": 5.9196217494089836e-05, "loss": 2.5361, "num_input_tokens_seen": 369229824, "step": 313 }, { "epoch": 0.03704140615783886, "grad_norm": 3.03125, "learning_rate": 5.938534278959811e-05, "loss": 2.5762, "num_input_tokens_seen": 370409472, "step": 314 }, { "epoch": 0.037159372419488024, "grad_norm": 1.890625, "learning_rate": 5.9574468085106384e-05, "loss": 2.5874, "num_input_tokens_seen": 371589120, "step": 315 }, { "epoch": 0.037277338681137195, "grad_norm": 2.6875, "learning_rate": 5.976359338061466e-05, "loss": 2.5874, "num_input_tokens_seen": 372768768, "step": 316 }, { "epoch": 0.03739530494278636, "grad_norm": 1.8359375, "learning_rate": 5.995271867612294e-05, "loss": 2.4888, "num_input_tokens_seen": 373948416, "step": 317 }, { "epoch": 0.03751327120443553, "grad_norm": 2.25, "learning_rate": 6.014184397163121e-05, "loss": 2.5391, "num_input_tokens_seen": 375128064, "step": 318 }, { "epoch": 0.0376312374660847, "grad_norm": 1.890625, "learning_rate": 6.0330969267139484e-05, "loss": 2.6462, "num_input_tokens_seen": 376307712, "step": 319 }, { "epoch": 0.03774920372773387, "grad_norm": 1.875, "learning_rate": 6.0520094562647755e-05, "loss": 2.4726, "num_input_tokens_seen": 377487360, "step": 320 }, { "epoch": 0.03786716998938304, "grad_norm": 2.296875, "learning_rate": 6.070921985815603e-05, "loss": 2.5759, "num_input_tokens_seen": 378667008, "step": 321 }, { "epoch": 0.037985136251032205, "grad_norm": 2.703125, "learning_rate": 6.089834515366431e-05, "loss": 2.5199, "num_input_tokens_seen": 379846656, "step": 322 }, { "epoch": 0.038103102512681376, "grad_norm": 2.390625, "learning_rate": 6.108747044917259e-05, "loss": 2.6243, "num_input_tokens_seen": 381026304, "step": 323 }, { "epoch": 0.03822106877433054, "grad_norm": 1.65625, "learning_rate": 6.127659574468086e-05, "loss": 2.5922, "num_input_tokens_seen": 382205952, "step": 324 }, { "epoch": 0.03833903503597971, "grad_norm": 1.640625, "learning_rate": 6.146572104018913e-05, "loss": 2.5376, "num_input_tokens_seen": 383385600, "step": 325 }, { "epoch": 0.03845700129762888, "grad_norm": 2.03125, "learning_rate": 6.165484633569741e-05, "loss": 2.547, "num_input_tokens_seen": 384565248, "step": 326 }, { "epoch": 0.03857496755927805, "grad_norm": 2.765625, "learning_rate": 6.184397163120568e-05, "loss": 2.5095, "num_input_tokens_seen": 385744896, "step": 327 }, { "epoch": 0.038692933820927214, "grad_norm": 1.40625, "learning_rate": 6.203309692671395e-05, "loss": 2.5431, "num_input_tokens_seen": 386924544, "step": 328 }, { "epoch": 0.038810900082576386, "grad_norm": 2.203125, "learning_rate": 6.222222222222223e-05, "loss": 2.5744, "num_input_tokens_seen": 388104192, "step": 329 }, { "epoch": 0.03892886634422555, "grad_norm": 1.9921875, "learning_rate": 6.24113475177305e-05, "loss": 2.6511, "num_input_tokens_seen": 389283840, "step": 330 }, { "epoch": 0.03904683260587472, "grad_norm": 3.15625, "learning_rate": 6.260047281323877e-05, "loss": 2.5556, "num_input_tokens_seen": 390463488, "step": 331 }, { "epoch": 0.03916479886752389, "grad_norm": 1.953125, "learning_rate": 6.278959810874706e-05, "loss": 2.5616, "num_input_tokens_seen": 391643136, "step": 332 }, { "epoch": 0.03928276512917306, "grad_norm": 1.6484375, "learning_rate": 6.297872340425533e-05, "loss": 2.5747, "num_input_tokens_seen": 392822784, "step": 333 }, { "epoch": 0.03940073139082222, "grad_norm": 3.15625, "learning_rate": 6.31678486997636e-05, "loss": 2.4747, "num_input_tokens_seen": 394002432, "step": 334 }, { "epoch": 0.039518697652471395, "grad_norm": 1.765625, "learning_rate": 6.335697399527188e-05, "loss": 2.5124, "num_input_tokens_seen": 395182080, "step": 335 }, { "epoch": 0.03963666391412056, "grad_norm": 2.15625, "learning_rate": 6.354609929078015e-05, "loss": 2.552, "num_input_tokens_seen": 396361728, "step": 336 }, { "epoch": 0.03975463017576973, "grad_norm": 1.8125, "learning_rate": 6.373522458628842e-05, "loss": 2.4541, "num_input_tokens_seen": 397541376, "step": 337 }, { "epoch": 0.039872596437418896, "grad_norm": 3.15625, "learning_rate": 6.39243498817967e-05, "loss": 2.4496, "num_input_tokens_seen": 398721024, "step": 338 }, { "epoch": 0.03999056269906807, "grad_norm": 2.015625, "learning_rate": 6.411347517730498e-05, "loss": 2.5182, "num_input_tokens_seen": 399900672, "step": 339 }, { "epoch": 0.04010852896071723, "grad_norm": 1.8671875, "learning_rate": 6.430260047281325e-05, "loss": 2.5118, "num_input_tokens_seen": 401080320, "step": 340 }, { "epoch": 0.040226495222366404, "grad_norm": 4.46875, "learning_rate": 6.449172576832153e-05, "loss": 2.5161, "num_input_tokens_seen": 402259968, "step": 341 }, { "epoch": 0.04034446148401557, "grad_norm": 1.8828125, "learning_rate": 6.468085106382979e-05, "loss": 2.5411, "num_input_tokens_seen": 403439616, "step": 342 }, { "epoch": 0.04046242774566474, "grad_norm": 6.53125, "learning_rate": 6.486997635933806e-05, "loss": 2.5341, "num_input_tokens_seen": 404619264, "step": 343 }, { "epoch": 0.040580394007313905, "grad_norm": 5.0, "learning_rate": 6.505910165484634e-05, "loss": 2.5139, "num_input_tokens_seen": 405798912, "step": 344 }, { "epoch": 0.04069836026896308, "grad_norm": 6.9375, "learning_rate": 6.524822695035461e-05, "loss": 2.542, "num_input_tokens_seen": 406978560, "step": 345 }, { "epoch": 0.04081632653061224, "grad_norm": 6.625, "learning_rate": 6.543735224586288e-05, "loss": 2.6481, "num_input_tokens_seen": 408158208, "step": 346 }, { "epoch": 0.04093429279226141, "grad_norm": 1.9375, "learning_rate": 6.562647754137116e-05, "loss": 2.5413, "num_input_tokens_seen": 409337856, "step": 347 }, { "epoch": 0.041052259053910585, "grad_norm": 3.25, "learning_rate": 6.581560283687943e-05, "loss": 2.5869, "num_input_tokens_seen": 410517504, "step": 348 }, { "epoch": 0.04117022531555975, "grad_norm": 2.171875, "learning_rate": 6.60047281323877e-05, "loss": 2.5717, "num_input_tokens_seen": 411697152, "step": 349 }, { "epoch": 0.04128819157720892, "grad_norm": 2.296875, "learning_rate": 6.619385342789599e-05, "loss": 2.569, "num_input_tokens_seen": 412876800, "step": 350 }, { "epoch": 0.041406157838858086, "grad_norm": 2.53125, "learning_rate": 6.638297872340426e-05, "loss": 2.5656, "num_input_tokens_seen": 414056448, "step": 351 }, { "epoch": 0.04152412410050726, "grad_norm": 1.5625, "learning_rate": 6.657210401891253e-05, "loss": 2.4558, "num_input_tokens_seen": 415236096, "step": 352 }, { "epoch": 0.04164209036215642, "grad_norm": 3.046875, "learning_rate": 6.676122931442081e-05, "loss": 2.5107, "num_input_tokens_seen": 416415744, "step": 353 }, { "epoch": 0.041760056623805594, "grad_norm": 2.03125, "learning_rate": 6.695035460992908e-05, "loss": 2.5576, "num_input_tokens_seen": 417595392, "step": 354 }, { "epoch": 0.04187802288545476, "grad_norm": 2.015625, "learning_rate": 6.713947990543735e-05, "loss": 2.5029, "num_input_tokens_seen": 418775040, "step": 355 }, { "epoch": 0.04199598914710393, "grad_norm": 1.84375, "learning_rate": 6.732860520094564e-05, "loss": 2.547, "num_input_tokens_seen": 419954688, "step": 356 }, { "epoch": 0.042113955408753095, "grad_norm": 1.9453125, "learning_rate": 6.75177304964539e-05, "loss": 2.4942, "num_input_tokens_seen": 421134336, "step": 357 }, { "epoch": 0.04223192167040227, "grad_norm": 2.109375, "learning_rate": 6.770685579196218e-05, "loss": 2.4546, "num_input_tokens_seen": 422313984, "step": 358 }, { "epoch": 0.04234988793205143, "grad_norm": 1.875, "learning_rate": 6.789598108747046e-05, "loss": 2.6115, "num_input_tokens_seen": 423493632, "step": 359 }, { "epoch": 0.042467854193700603, "grad_norm": 1.5, "learning_rate": 6.808510638297873e-05, "loss": 2.4977, "num_input_tokens_seen": 424673280, "step": 360 }, { "epoch": 0.04258582045534977, "grad_norm": 1.84375, "learning_rate": 6.8274231678487e-05, "loss": 2.4208, "num_input_tokens_seen": 425852928, "step": 361 }, { "epoch": 0.04270378671699894, "grad_norm": 1.421875, "learning_rate": 6.846335697399528e-05, "loss": 2.5416, "num_input_tokens_seen": 427032576, "step": 362 }, { "epoch": 0.042821752978648105, "grad_norm": 1.78125, "learning_rate": 6.865248226950355e-05, "loss": 2.4938, "num_input_tokens_seen": 428212224, "step": 363 }, { "epoch": 0.042939719240297276, "grad_norm": 1.5859375, "learning_rate": 6.884160756501182e-05, "loss": 2.5203, "num_input_tokens_seen": 429391872, "step": 364 }, { "epoch": 0.04305768550194644, "grad_norm": 1.9296875, "learning_rate": 6.903073286052011e-05, "loss": 2.515, "num_input_tokens_seen": 430571520, "step": 365 }, { "epoch": 0.04317565176359561, "grad_norm": 2.25, "learning_rate": 6.921985815602838e-05, "loss": 2.5211, "num_input_tokens_seen": 431751168, "step": 366 }, { "epoch": 0.04329361802524478, "grad_norm": 1.796875, "learning_rate": 6.940898345153665e-05, "loss": 2.4887, "num_input_tokens_seen": 432930816, "step": 367 }, { "epoch": 0.04341158428689395, "grad_norm": 1.5234375, "learning_rate": 6.959810874704493e-05, "loss": 2.4896, "num_input_tokens_seen": 434110464, "step": 368 }, { "epoch": 0.043529550548543114, "grad_norm": 1.6171875, "learning_rate": 6.97872340425532e-05, "loss": 2.4172, "num_input_tokens_seen": 435290112, "step": 369 }, { "epoch": 0.043647516810192286, "grad_norm": 2.1875, "learning_rate": 6.997635933806147e-05, "loss": 2.4538, "num_input_tokens_seen": 436469760, "step": 370 }, { "epoch": 0.04376548307184145, "grad_norm": 2.015625, "learning_rate": 7.016548463356976e-05, "loss": 2.4439, "num_input_tokens_seen": 437649408, "step": 371 }, { "epoch": 0.04388344933349062, "grad_norm": 1.828125, "learning_rate": 7.035460992907803e-05, "loss": 2.4898, "num_input_tokens_seen": 438829056, "step": 372 }, { "epoch": 0.04400141559513979, "grad_norm": 1.6171875, "learning_rate": 7.05437352245863e-05, "loss": 2.4337, "num_input_tokens_seen": 440008704, "step": 373 }, { "epoch": 0.04411938185678896, "grad_norm": 1.53125, "learning_rate": 7.073286052009457e-05, "loss": 2.4729, "num_input_tokens_seen": 441188352, "step": 374 }, { "epoch": 0.04423734811843813, "grad_norm": 1.9765625, "learning_rate": 7.092198581560284e-05, "loss": 2.4801, "num_input_tokens_seen": 442368000, "step": 375 }, { "epoch": 0.044355314380087295, "grad_norm": 2.0, "learning_rate": 7.11111111111111e-05, "loss": 2.5573, "num_input_tokens_seen": 443547648, "step": 376 }, { "epoch": 0.044473280641736467, "grad_norm": 1.640625, "learning_rate": 7.130023640661939e-05, "loss": 2.4675, "num_input_tokens_seen": 444727296, "step": 377 }, { "epoch": 0.04459124690338563, "grad_norm": 1.8984375, "learning_rate": 7.148936170212766e-05, "loss": 2.4965, "num_input_tokens_seen": 445906944, "step": 378 }, { "epoch": 0.0447092131650348, "grad_norm": 1.2578125, "learning_rate": 7.167848699763593e-05, "loss": 2.4161, "num_input_tokens_seen": 447086592, "step": 379 }, { "epoch": 0.04482717942668397, "grad_norm": 1.8359375, "learning_rate": 7.186761229314421e-05, "loss": 2.4437, "num_input_tokens_seen": 448266240, "step": 380 }, { "epoch": 0.04494514568833314, "grad_norm": 1.8046875, "learning_rate": 7.205673758865248e-05, "loss": 2.4978, "num_input_tokens_seen": 449445888, "step": 381 }, { "epoch": 0.045063111949982304, "grad_norm": 1.53125, "learning_rate": 7.224586288416075e-05, "loss": 2.4316, "num_input_tokens_seen": 450625536, "step": 382 }, { "epoch": 0.045181078211631476, "grad_norm": 2.0625, "learning_rate": 7.243498817966904e-05, "loss": 2.5152, "num_input_tokens_seen": 451805184, "step": 383 }, { "epoch": 0.04529904447328064, "grad_norm": 1.65625, "learning_rate": 7.262411347517731e-05, "loss": 2.4161, "num_input_tokens_seen": 452984832, "step": 384 }, { "epoch": 0.04541701073492981, "grad_norm": 1.8515625, "learning_rate": 7.281323877068558e-05, "loss": 2.4274, "num_input_tokens_seen": 454164480, "step": 385 }, { "epoch": 0.04553497699657898, "grad_norm": 2.0625, "learning_rate": 7.300236406619386e-05, "loss": 2.4693, "num_input_tokens_seen": 455344128, "step": 386 }, { "epoch": 0.04565294325822815, "grad_norm": 1.765625, "learning_rate": 7.319148936170213e-05, "loss": 2.4907, "num_input_tokens_seen": 456523776, "step": 387 }, { "epoch": 0.04577090951987731, "grad_norm": 1.71875, "learning_rate": 7.33806146572104e-05, "loss": 2.4603, "num_input_tokens_seen": 457703424, "step": 388 }, { "epoch": 0.045888875781526485, "grad_norm": 1.7734375, "learning_rate": 7.356973995271869e-05, "loss": 2.4651, "num_input_tokens_seen": 458883072, "step": 389 }, { "epoch": 0.04600684204317565, "grad_norm": 1.9765625, "learning_rate": 7.375886524822696e-05, "loss": 2.4041, "num_input_tokens_seen": 460062720, "step": 390 }, { "epoch": 0.04612480830482482, "grad_norm": 2.90625, "learning_rate": 7.394799054373523e-05, "loss": 2.3896, "num_input_tokens_seen": 461242368, "step": 391 }, { "epoch": 0.046242774566473986, "grad_norm": 1.515625, "learning_rate": 7.413711583924351e-05, "loss": 2.3966, "num_input_tokens_seen": 462422016, "step": 392 }, { "epoch": 0.04636074082812316, "grad_norm": 1.984375, "learning_rate": 7.432624113475178e-05, "loss": 2.5009, "num_input_tokens_seen": 463601664, "step": 393 }, { "epoch": 0.04647870708977232, "grad_norm": 1.8984375, "learning_rate": 7.451536643026005e-05, "loss": 2.4524, "num_input_tokens_seen": 464781312, "step": 394 }, { "epoch": 0.046596673351421494, "grad_norm": 2.46875, "learning_rate": 7.470449172576833e-05, "loss": 2.4169, "num_input_tokens_seen": 465960960, "step": 395 }, { "epoch": 0.04671463961307066, "grad_norm": 1.609375, "learning_rate": 7.48936170212766e-05, "loss": 2.5313, "num_input_tokens_seen": 467140608, "step": 396 }, { "epoch": 0.04683260587471983, "grad_norm": 1.7890625, "learning_rate": 7.508274231678487e-05, "loss": 2.5893, "num_input_tokens_seen": 468320256, "step": 397 }, { "epoch": 0.046950572136368995, "grad_norm": 1.9453125, "learning_rate": 7.527186761229316e-05, "loss": 2.453, "num_input_tokens_seen": 469499904, "step": 398 }, { "epoch": 0.04706853839801817, "grad_norm": 1.78125, "learning_rate": 7.546099290780143e-05, "loss": 2.4462, "num_input_tokens_seen": 470679552, "step": 399 }, { "epoch": 0.04718650465966733, "grad_norm": 2.140625, "learning_rate": 7.56501182033097e-05, "loss": 2.5393, "num_input_tokens_seen": 471859200, "step": 400 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.4841043949127197, "eval_wikipedia_runtime": 173.8643, "eval_wikipedia_samples_per_second": 4.038, "eval_wikipedia_steps_per_second": 0.173, "num_input_tokens_seen": 471859200, "step": 400 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 4.2398834228515625, "eval_toxicity_runtime": 0.9943, "eval_toxicity_samples_per_second": 2.011, "eval_toxicity_steps_per_second": 1.006, "num_input_tokens_seen": 471859200, "step": 400 }, { "epoch": 0.047304470921316503, "grad_norm": 1.9609375, "learning_rate": 7.583924349881798e-05, "loss": 2.4697, "num_input_tokens_seen": 473038848, "step": 401 }, { "epoch": 0.047422437182965675, "grad_norm": 1.703125, "learning_rate": 7.602836879432625e-05, "loss": 2.4191, "num_input_tokens_seen": 474218496, "step": 402 }, { "epoch": 0.04754040344461484, "grad_norm": 1.2421875, "learning_rate": 7.621749408983452e-05, "loss": 2.3892, "num_input_tokens_seen": 475398144, "step": 403 }, { "epoch": 0.04765836970626401, "grad_norm": 1.6640625, "learning_rate": 7.64066193853428e-05, "loss": 2.4324, "num_input_tokens_seen": 476577792, "step": 404 }, { "epoch": 0.047776335967913176, "grad_norm": 2.015625, "learning_rate": 7.659574468085108e-05, "loss": 2.4291, "num_input_tokens_seen": 477757440, "step": 405 }, { "epoch": 0.04789430222956235, "grad_norm": 2.65625, "learning_rate": 7.678486997635935e-05, "loss": 2.4577, "num_input_tokens_seen": 478937088, "step": 406 }, { "epoch": 0.04801226849121151, "grad_norm": 1.2265625, "learning_rate": 7.697399527186762e-05, "loss": 2.4554, "num_input_tokens_seen": 480116736, "step": 407 }, { "epoch": 0.048130234752860684, "grad_norm": 3.453125, "learning_rate": 7.716312056737589e-05, "loss": 2.3692, "num_input_tokens_seen": 481296384, "step": 408 }, { "epoch": 0.04824820101450985, "grad_norm": 1.953125, "learning_rate": 7.735224586288416e-05, "loss": 2.518, "num_input_tokens_seen": 482476032, "step": 409 }, { "epoch": 0.04836616727615902, "grad_norm": 2.046875, "learning_rate": 7.754137115839244e-05, "loss": 2.495, "num_input_tokens_seen": 483655680, "step": 410 }, { "epoch": 0.048484133537808186, "grad_norm": 1.9765625, "learning_rate": 7.773049645390071e-05, "loss": 2.4344, "num_input_tokens_seen": 484835328, "step": 411 }, { "epoch": 0.04860209979945736, "grad_norm": 1.9296875, "learning_rate": 7.791962174940898e-05, "loss": 2.4095, "num_input_tokens_seen": 486014976, "step": 412 }, { "epoch": 0.04872006606110652, "grad_norm": 2.234375, "learning_rate": 7.810874704491726e-05, "loss": 2.4562, "num_input_tokens_seen": 487194624, "step": 413 }, { "epoch": 0.048838032322755694, "grad_norm": 1.2109375, "learning_rate": 7.829787234042553e-05, "loss": 2.3863, "num_input_tokens_seen": 488374272, "step": 414 }, { "epoch": 0.04895599858440486, "grad_norm": 1.96875, "learning_rate": 7.84869976359338e-05, "loss": 2.4577, "num_input_tokens_seen": 489553920, "step": 415 }, { "epoch": 0.04907396484605403, "grad_norm": 1.375, "learning_rate": 7.867612293144209e-05, "loss": 2.4166, "num_input_tokens_seen": 490733568, "step": 416 }, { "epoch": 0.049191931107703195, "grad_norm": 1.453125, "learning_rate": 7.886524822695036e-05, "loss": 2.4524, "num_input_tokens_seen": 491913216, "step": 417 }, { "epoch": 0.049309897369352367, "grad_norm": 1.390625, "learning_rate": 7.905437352245863e-05, "loss": 2.396, "num_input_tokens_seen": 493092864, "step": 418 }, { "epoch": 0.04942786363100153, "grad_norm": 1.21875, "learning_rate": 7.924349881796691e-05, "loss": 2.3969, "num_input_tokens_seen": 494272512, "step": 419 }, { "epoch": 0.0495458298926507, "grad_norm": 1.625, "learning_rate": 7.943262411347518e-05, "loss": 2.4247, "num_input_tokens_seen": 495452160, "step": 420 }, { "epoch": 0.04966379615429987, "grad_norm": 2.3125, "learning_rate": 7.962174940898345e-05, "loss": 2.5668, "num_input_tokens_seen": 496631808, "step": 421 }, { "epoch": 0.04978176241594904, "grad_norm": 2.078125, "learning_rate": 7.981087470449174e-05, "loss": 2.3802, "num_input_tokens_seen": 497811456, "step": 422 }, { "epoch": 0.049899728677598204, "grad_norm": 1.1953125, "learning_rate": 8e-05, "loss": 2.4226, "num_input_tokens_seen": 498991104, "step": 423 }, { "epoch": 0.050017694939247376, "grad_norm": 1.453125, "learning_rate": 7.999999695696824e-05, "loss": 2.4269, "num_input_tokens_seen": 500170752, "step": 424 }, { "epoch": 0.05013566120089654, "grad_norm": 1.4609375, "learning_rate": 7.999998782787343e-05, "loss": 2.4702, "num_input_tokens_seen": 501350400, "step": 425 }, { "epoch": 0.05025362746254571, "grad_norm": 1.96875, "learning_rate": 7.999997261271695e-05, "loss": 2.3265, "num_input_tokens_seen": 502530048, "step": 426 }, { "epoch": 0.05037159372419488, "grad_norm": 2.265625, "learning_rate": 7.999995131150112e-05, "loss": 2.4, "num_input_tokens_seen": 503709696, "step": 427 }, { "epoch": 0.05048955998584405, "grad_norm": 1.3359375, "learning_rate": 7.999992392422917e-05, "loss": 2.4076, "num_input_tokens_seen": 504889344, "step": 428 }, { "epoch": 0.05060752624749322, "grad_norm": 1.140625, "learning_rate": 7.999989045090529e-05, "loss": 2.3864, "num_input_tokens_seen": 506068992, "step": 429 }, { "epoch": 0.050725492509142385, "grad_norm": 1.4765625, "learning_rate": 7.999985089153455e-05, "loss": 2.4194, "num_input_tokens_seen": 507248640, "step": 430 }, { "epoch": 0.05084345877079156, "grad_norm": 2.0625, "learning_rate": 7.999980524612299e-05, "loss": 2.4248, "num_input_tokens_seen": 508428288, "step": 431 }, { "epoch": 0.05096142503244072, "grad_norm": 1.953125, "learning_rate": 7.999975351467752e-05, "loss": 2.3898, "num_input_tokens_seen": 509607936, "step": 432 }, { "epoch": 0.05107939129408989, "grad_norm": 1.421875, "learning_rate": 7.999969569720605e-05, "loss": 2.3805, "num_input_tokens_seen": 510787584, "step": 433 }, { "epoch": 0.05119735755573906, "grad_norm": 1.1328125, "learning_rate": 7.999963179371735e-05, "loss": 2.4378, "num_input_tokens_seen": 511967232, "step": 434 }, { "epoch": 0.05131532381738823, "grad_norm": 1.3984375, "learning_rate": 7.999956180422118e-05, "loss": 2.4174, "num_input_tokens_seen": 513146880, "step": 435 }, { "epoch": 0.051433290079037394, "grad_norm": 1.6171875, "learning_rate": 7.999948572872813e-05, "loss": 2.3132, "num_input_tokens_seen": 514326528, "step": 436 }, { "epoch": 0.051551256340686566, "grad_norm": 2.0, "learning_rate": 7.999940356724983e-05, "loss": 2.3784, "num_input_tokens_seen": 515506176, "step": 437 }, { "epoch": 0.05166922260233573, "grad_norm": 1.3359375, "learning_rate": 7.999931531979876e-05, "loss": 2.3361, "num_input_tokens_seen": 516685824, "step": 438 }, { "epoch": 0.0517871888639849, "grad_norm": 1.328125, "learning_rate": 7.999922098638835e-05, "loss": 2.4646, "num_input_tokens_seen": 517865472, "step": 439 }, { "epoch": 0.05190515512563407, "grad_norm": 1.453125, "learning_rate": 7.999912056703294e-05, "loss": 2.3374, "num_input_tokens_seen": 519045120, "step": 440 }, { "epoch": 0.05202312138728324, "grad_norm": 1.9296875, "learning_rate": 7.999901406174781e-05, "loss": 2.3778, "num_input_tokens_seen": 520224768, "step": 441 }, { "epoch": 0.052141087648932403, "grad_norm": 1.3203125, "learning_rate": 7.999890147054918e-05, "loss": 2.328, "num_input_tokens_seen": 521404416, "step": 442 }, { "epoch": 0.052259053910581575, "grad_norm": 1.453125, "learning_rate": 7.999878279345418e-05, "loss": 2.3145, "num_input_tokens_seen": 522584064, "step": 443 }, { "epoch": 0.05237702017223074, "grad_norm": 1.3046875, "learning_rate": 7.999865803048087e-05, "loss": 2.3785, "num_input_tokens_seen": 523763712, "step": 444 }, { "epoch": 0.05249498643387991, "grad_norm": 1.3359375, "learning_rate": 7.99985271816482e-05, "loss": 2.3462, "num_input_tokens_seen": 524943360, "step": 445 }, { "epoch": 0.052612952695529076, "grad_norm": 1.453125, "learning_rate": 7.999839024697612e-05, "loss": 2.5261, "num_input_tokens_seen": 526123008, "step": 446 }, { "epoch": 0.05273091895717825, "grad_norm": 1.4140625, "learning_rate": 7.999824722648545e-05, "loss": 2.3022, "num_input_tokens_seen": 527302656, "step": 447 }, { "epoch": 0.05284888521882741, "grad_norm": 1.3359375, "learning_rate": 7.999809812019794e-05, "loss": 2.4073, "num_input_tokens_seen": 528482304, "step": 448 }, { "epoch": 0.052966851480476584, "grad_norm": 1.1015625, "learning_rate": 7.999794292813627e-05, "loss": 2.3876, "num_input_tokens_seen": 529661952, "step": 449 }, { "epoch": 0.05308481774212575, "grad_norm": 1.8203125, "learning_rate": 7.99977816503241e-05, "loss": 2.3443, "num_input_tokens_seen": 530841600, "step": 450 }, { "epoch": 0.05320278400377492, "grad_norm": 2.375, "learning_rate": 7.999761428678591e-05, "loss": 2.4333, "num_input_tokens_seen": 532021248, "step": 451 }, { "epoch": 0.053320750265424086, "grad_norm": 1.796875, "learning_rate": 7.999744083754721e-05, "loss": 2.3346, "num_input_tokens_seen": 533200896, "step": 452 }, { "epoch": 0.05343871652707326, "grad_norm": 1.046875, "learning_rate": 7.999726130263437e-05, "loss": 2.356, "num_input_tokens_seen": 534380544, "step": 453 }, { "epoch": 0.05355668278872242, "grad_norm": 1.578125, "learning_rate": 7.99970756820747e-05, "loss": 2.2924, "num_input_tokens_seen": 535560192, "step": 454 }, { "epoch": 0.053674649050371594, "grad_norm": 1.9375, "learning_rate": 7.999688397589647e-05, "loss": 2.3714, "num_input_tokens_seen": 536739840, "step": 455 }, { "epoch": 0.053792615312020765, "grad_norm": 1.546875, "learning_rate": 7.99966861841288e-05, "loss": 2.3435, "num_input_tokens_seen": 537919488, "step": 456 }, { "epoch": 0.05391058157366993, "grad_norm": 1.4375, "learning_rate": 7.999648230680182e-05, "loss": 2.4296, "num_input_tokens_seen": 539099136, "step": 457 }, { "epoch": 0.0540285478353191, "grad_norm": 1.2890625, "learning_rate": 7.999627234394656e-05, "loss": 2.4, "num_input_tokens_seen": 540278784, "step": 458 }, { "epoch": 0.054146514096968267, "grad_norm": 1.5625, "learning_rate": 7.999605629559493e-05, "loss": 2.3882, "num_input_tokens_seen": 541458432, "step": 459 }, { "epoch": 0.05426448035861744, "grad_norm": 1.5234375, "learning_rate": 7.999583416177985e-05, "loss": 2.4006, "num_input_tokens_seen": 542638080, "step": 460 }, { "epoch": 0.0543824466202666, "grad_norm": 1.953125, "learning_rate": 7.999560594253506e-05, "loss": 2.3288, "num_input_tokens_seen": 543817728, "step": 461 }, { "epoch": 0.054500412881915775, "grad_norm": 1.2421875, "learning_rate": 7.999537163789534e-05, "loss": 2.3453, "num_input_tokens_seen": 544997376, "step": 462 }, { "epoch": 0.05461837914356494, "grad_norm": 1.015625, "learning_rate": 7.999513124789631e-05, "loss": 2.3976, "num_input_tokens_seen": 546177024, "step": 463 }, { "epoch": 0.05473634540521411, "grad_norm": 1.0078125, "learning_rate": 7.999488477257453e-05, "loss": 2.3466, "num_input_tokens_seen": 547356672, "step": 464 }, { "epoch": 0.054854311666863276, "grad_norm": 1.1171875, "learning_rate": 7.999463221196753e-05, "loss": 2.3788, "num_input_tokens_seen": 548536320, "step": 465 }, { "epoch": 0.05497227792851245, "grad_norm": 1.1171875, "learning_rate": 7.999437356611375e-05, "loss": 2.2683, "num_input_tokens_seen": 549715968, "step": 466 }, { "epoch": 0.05509024419016161, "grad_norm": 1.109375, "learning_rate": 7.99941088350525e-05, "loss": 2.2787, "num_input_tokens_seen": 550895616, "step": 467 }, { "epoch": 0.055208210451810784, "grad_norm": 1.5, "learning_rate": 7.999383801882408e-05, "loss": 2.3686, "num_input_tokens_seen": 552075264, "step": 468 }, { "epoch": 0.05532617671345995, "grad_norm": 1.8203125, "learning_rate": 7.99935611174697e-05, "loss": 2.283, "num_input_tokens_seen": 553254912, "step": 469 }, { "epoch": 0.05544414297510912, "grad_norm": 1.2890625, "learning_rate": 7.999327813103149e-05, "loss": 2.3282, "num_input_tokens_seen": 554434560, "step": 470 }, { "epoch": 0.055562109236758285, "grad_norm": 1.390625, "learning_rate": 7.999298905955251e-05, "loss": 2.5573, "num_input_tokens_seen": 555614208, "step": 471 }, { "epoch": 0.05568007549840746, "grad_norm": 1.1640625, "learning_rate": 7.999269390307672e-05, "loss": 2.3396, "num_input_tokens_seen": 556793856, "step": 472 }, { "epoch": 0.05579804176005662, "grad_norm": 1.484375, "learning_rate": 7.999239266164906e-05, "loss": 2.424, "num_input_tokens_seen": 557973504, "step": 473 }, { "epoch": 0.05591600802170579, "grad_norm": 1.40625, "learning_rate": 7.999208533531534e-05, "loss": 2.2834, "num_input_tokens_seen": 559153152, "step": 474 }, { "epoch": 0.05603397428335496, "grad_norm": 1.1953125, "learning_rate": 7.999177192412233e-05, "loss": 2.3158, "num_input_tokens_seen": 560332800, "step": 475 }, { "epoch": 0.05615194054500413, "grad_norm": 1.5703125, "learning_rate": 7.999145242811773e-05, "loss": 2.3454, "num_input_tokens_seen": 561512448, "step": 476 }, { "epoch": 0.056269906806653294, "grad_norm": 1.796875, "learning_rate": 7.999112684735013e-05, "loss": 2.3211, "num_input_tokens_seen": 562692096, "step": 477 }, { "epoch": 0.056387873068302466, "grad_norm": 1.046875, "learning_rate": 7.999079518186908e-05, "loss": 2.2571, "num_input_tokens_seen": 563871744, "step": 478 }, { "epoch": 0.05650583932995163, "grad_norm": 1.4140625, "learning_rate": 7.999045743172504e-05, "loss": 2.2938, "num_input_tokens_seen": 565051392, "step": 479 }, { "epoch": 0.0566238055916008, "grad_norm": 1.6328125, "learning_rate": 7.99901135969694e-05, "loss": 2.4742, "num_input_tokens_seen": 566231040, "step": 480 }, { "epoch": 0.05674177185324997, "grad_norm": 1.484375, "learning_rate": 7.998976367765446e-05, "loss": 2.3188, "num_input_tokens_seen": 567410688, "step": 481 }, { "epoch": 0.05685973811489914, "grad_norm": 1.7109375, "learning_rate": 7.998940767383348e-05, "loss": 2.2454, "num_input_tokens_seen": 568590336, "step": 482 }, { "epoch": 0.05697770437654831, "grad_norm": 1.1328125, "learning_rate": 7.998904558556062e-05, "loss": 2.3923, "num_input_tokens_seen": 569769984, "step": 483 }, { "epoch": 0.057095670638197475, "grad_norm": 1.421875, "learning_rate": 7.998867741289097e-05, "loss": 2.3465, "num_input_tokens_seen": 570949632, "step": 484 }, { "epoch": 0.05721363689984665, "grad_norm": 1.0078125, "learning_rate": 7.998830315588056e-05, "loss": 2.3016, "num_input_tokens_seen": 572129280, "step": 485 }, { "epoch": 0.05733160316149581, "grad_norm": 1.734375, "learning_rate": 7.998792281458631e-05, "loss": 2.2886, "num_input_tokens_seen": 573308928, "step": 486 }, { "epoch": 0.05744956942314498, "grad_norm": 1.140625, "learning_rate": 7.998753638906613e-05, "loss": 2.3398, "num_input_tokens_seen": 574488576, "step": 487 }, { "epoch": 0.05756753568479415, "grad_norm": 2.0625, "learning_rate": 7.998714387937875e-05, "loss": 2.3558, "num_input_tokens_seen": 575668224, "step": 488 }, { "epoch": 0.05768550194644332, "grad_norm": 1.7578125, "learning_rate": 7.998674528558397e-05, "loss": 2.3123, "num_input_tokens_seen": 576847872, "step": 489 }, { "epoch": 0.057803468208092484, "grad_norm": 1.109375, "learning_rate": 7.998634060774239e-05, "loss": 2.4113, "num_input_tokens_seen": 578027520, "step": 490 }, { "epoch": 0.057921434469741656, "grad_norm": 2.125, "learning_rate": 7.998592984591557e-05, "loss": 2.3259, "num_input_tokens_seen": 579207168, "step": 491 }, { "epoch": 0.05803940073139082, "grad_norm": 1.390625, "learning_rate": 7.998551300016603e-05, "loss": 2.39, "num_input_tokens_seen": 580386816, "step": 492 }, { "epoch": 0.05815736699303999, "grad_norm": 1.078125, "learning_rate": 7.99850900705572e-05, "loss": 2.3108, "num_input_tokens_seen": 581566464, "step": 493 }, { "epoch": 0.05827533325468916, "grad_norm": 1.0625, "learning_rate": 7.998466105715342e-05, "loss": 2.2948, "num_input_tokens_seen": 582746112, "step": 494 }, { "epoch": 0.05839329951633833, "grad_norm": 1.0390625, "learning_rate": 7.998422596001997e-05, "loss": 2.3144, "num_input_tokens_seen": 583925760, "step": 495 }, { "epoch": 0.058511265777987494, "grad_norm": 1.2734375, "learning_rate": 7.998378477922303e-05, "loss": 2.3439, "num_input_tokens_seen": 585105408, "step": 496 }, { "epoch": 0.058629232039636665, "grad_norm": 1.0546875, "learning_rate": 7.998333751482976e-05, "loss": 2.3061, "num_input_tokens_seen": 586285056, "step": 497 }, { "epoch": 0.05874719830128583, "grad_norm": 1.0703125, "learning_rate": 7.99828841669082e-05, "loss": 2.2375, "num_input_tokens_seen": 587464704, "step": 498 }, { "epoch": 0.058865164562935, "grad_norm": 1.078125, "learning_rate": 7.998242473552732e-05, "loss": 2.2959, "num_input_tokens_seen": 588644352, "step": 499 }, { "epoch": 0.058983130824584167, "grad_norm": 1.546875, "learning_rate": 7.9981959220757e-05, "loss": 2.3324, "num_input_tokens_seen": 589824000, "step": 500 }, { "epoch": 0.05910109708623334, "grad_norm": 1.7734375, "learning_rate": 7.998148762266812e-05, "loss": 2.267, "num_input_tokens_seen": 591003648, "step": 501 }, { "epoch": 0.0592190633478825, "grad_norm": 1.0859375, "learning_rate": 7.99810099413324e-05, "loss": 2.376, "num_input_tokens_seen": 592183296, "step": 502 }, { "epoch": 0.059337029609531675, "grad_norm": 0.91015625, "learning_rate": 7.998052617682253e-05, "loss": 2.2746, "num_input_tokens_seen": 593362944, "step": 503 }, { "epoch": 0.05945499587118084, "grad_norm": 1.1015625, "learning_rate": 7.998003632921212e-05, "loss": 2.3335, "num_input_tokens_seen": 594542592, "step": 504 }, { "epoch": 0.05957296213283001, "grad_norm": 1.4921875, "learning_rate": 7.997954039857569e-05, "loss": 2.2334, "num_input_tokens_seen": 595722240, "step": 505 }, { "epoch": 0.059690928394479176, "grad_norm": 1.578125, "learning_rate": 7.99790383849887e-05, "loss": 2.3015, "num_input_tokens_seen": 596901888, "step": 506 }, { "epoch": 0.05980889465612835, "grad_norm": 1.390625, "learning_rate": 7.997853028852755e-05, "loss": 2.2773, "num_input_tokens_seen": 598081536, "step": 507 }, { "epoch": 0.05992686091777751, "grad_norm": 1.0, "learning_rate": 7.997801610926952e-05, "loss": 2.2407, "num_input_tokens_seen": 599261184, "step": 508 }, { "epoch": 0.060044827179426684, "grad_norm": 1.0625, "learning_rate": 7.997749584729286e-05, "loss": 2.4037, "num_input_tokens_seen": 600440832, "step": 509 }, { "epoch": 0.060162793441075856, "grad_norm": 1.3125, "learning_rate": 7.997696950267672e-05, "loss": 2.3178, "num_input_tokens_seen": 601620480, "step": 510 }, { "epoch": 0.06028075970272502, "grad_norm": 1.53125, "learning_rate": 7.997643707550117e-05, "loss": 2.2891, "num_input_tokens_seen": 602800128, "step": 511 }, { "epoch": 0.06039872596437419, "grad_norm": 1.4765625, "learning_rate": 7.997589856584725e-05, "loss": 2.3047, "num_input_tokens_seen": 603979776, "step": 512 }, { "epoch": 0.06051669222602336, "grad_norm": 1.0234375, "learning_rate": 7.99753539737969e-05, "loss": 2.2871, "num_input_tokens_seen": 605159424, "step": 513 }, { "epoch": 0.06063465848767253, "grad_norm": 1.796875, "learning_rate": 7.997480329943294e-05, "loss": 2.2966, "num_input_tokens_seen": 606339072, "step": 514 }, { "epoch": 0.06075262474932169, "grad_norm": 0.95703125, "learning_rate": 7.99742465428392e-05, "loss": 2.1938, "num_input_tokens_seen": 607518720, "step": 515 }, { "epoch": 0.060870591010970865, "grad_norm": 0.87890625, "learning_rate": 7.997368370410035e-05, "loss": 2.2878, "num_input_tokens_seen": 608698368, "step": 516 }, { "epoch": 0.06098855727262003, "grad_norm": 1.046875, "learning_rate": 7.997311478330206e-05, "loss": 2.2645, "num_input_tokens_seen": 609878016, "step": 517 }, { "epoch": 0.0611065235342692, "grad_norm": 0.99609375, "learning_rate": 7.997253978053087e-05, "loss": 2.2939, "num_input_tokens_seen": 611057664, "step": 518 }, { "epoch": 0.061224489795918366, "grad_norm": 0.99609375, "learning_rate": 7.997195869587428e-05, "loss": 2.2763, "num_input_tokens_seen": 612237312, "step": 519 }, { "epoch": 0.06134245605756754, "grad_norm": 1.671875, "learning_rate": 7.997137152942071e-05, "loss": 2.3172, "num_input_tokens_seen": 613416960, "step": 520 }, { "epoch": 0.0614604223192167, "grad_norm": 1.8203125, "learning_rate": 7.997077828125948e-05, "loss": 2.2249, "num_input_tokens_seen": 614596608, "step": 521 }, { "epoch": 0.061578388580865874, "grad_norm": 0.78515625, "learning_rate": 7.997017895148088e-05, "loss": 2.3344, "num_input_tokens_seen": 615776256, "step": 522 }, { "epoch": 0.06169635484251504, "grad_norm": 1.90625, "learning_rate": 7.996957354017605e-05, "loss": 2.2967, "num_input_tokens_seen": 616955904, "step": 523 }, { "epoch": 0.06181432110416421, "grad_norm": 1.4765625, "learning_rate": 7.996896204743716e-05, "loss": 2.2287, "num_input_tokens_seen": 618135552, "step": 524 }, { "epoch": 0.061932287365813375, "grad_norm": 0.8671875, "learning_rate": 7.996834447335722e-05, "loss": 2.2897, "num_input_tokens_seen": 619315200, "step": 525 }, { "epoch": 0.06205025362746255, "grad_norm": 1.2578125, "learning_rate": 7.99677208180302e-05, "loss": 2.3208, "num_input_tokens_seen": 620494848, "step": 526 }, { "epoch": 0.06216821988911171, "grad_norm": 2.078125, "learning_rate": 7.996709108155098e-05, "loss": 2.2778, "num_input_tokens_seen": 621674496, "step": 527 }, { "epoch": 0.06228618615076088, "grad_norm": 0.91015625, "learning_rate": 7.99664552640154e-05, "loss": 2.1945, "num_input_tokens_seen": 622854144, "step": 528 }, { "epoch": 0.06240415241241005, "grad_norm": 2.53125, "learning_rate": 7.996581336552018e-05, "loss": 2.3752, "num_input_tokens_seen": 624033792, "step": 529 }, { "epoch": 0.06252211867405921, "grad_norm": 1.140625, "learning_rate": 7.996516538616299e-05, "loss": 2.3217, "num_input_tokens_seen": 625213440, "step": 530 }, { "epoch": 0.06264008493570838, "grad_norm": 1.890625, "learning_rate": 7.996451132604242e-05, "loss": 2.3374, "num_input_tokens_seen": 626393088, "step": 531 }, { "epoch": 0.06275805119735756, "grad_norm": 1.328125, "learning_rate": 7.9963851185258e-05, "loss": 2.2926, "num_input_tokens_seen": 627572736, "step": 532 }, { "epoch": 0.06287601745900673, "grad_norm": 1.1328125, "learning_rate": 7.996318496391015e-05, "loss": 2.2946, "num_input_tokens_seen": 628752384, "step": 533 }, { "epoch": 0.06299398372065589, "grad_norm": 1.4375, "learning_rate": 7.996251266210025e-05, "loss": 2.2977, "num_input_tokens_seen": 629932032, "step": 534 }, { "epoch": 0.06311194998230506, "grad_norm": 1.2109375, "learning_rate": 7.996183427993058e-05, "loss": 2.2898, "num_input_tokens_seen": 631111680, "step": 535 }, { "epoch": 0.06322991624395423, "grad_norm": 1.203125, "learning_rate": 7.996114981750438e-05, "loss": 2.2557, "num_input_tokens_seen": 632291328, "step": 536 }, { "epoch": 0.0633478825056034, "grad_norm": 1.0546875, "learning_rate": 7.996045927492578e-05, "loss": 2.3759, "num_input_tokens_seen": 633470976, "step": 537 }, { "epoch": 0.06346584876725257, "grad_norm": 1.1015625, "learning_rate": 7.995976265229984e-05, "loss": 2.2283, "num_input_tokens_seen": 634650624, "step": 538 }, { "epoch": 0.06358381502890173, "grad_norm": 1.1875, "learning_rate": 7.995905994973257e-05, "loss": 2.4155, "num_input_tokens_seen": 635830272, "step": 539 }, { "epoch": 0.0637017812905509, "grad_norm": 1.2890625, "learning_rate": 7.995835116733086e-05, "loss": 2.3187, "num_input_tokens_seen": 637009920, "step": 540 }, { "epoch": 0.06381974755220007, "grad_norm": 1.671875, "learning_rate": 7.995763630520257e-05, "loss": 2.2263, "num_input_tokens_seen": 638189568, "step": 541 }, { "epoch": 0.06393771381384925, "grad_norm": 1.0546875, "learning_rate": 7.995691536345647e-05, "loss": 2.2788, "num_input_tokens_seen": 639369216, "step": 542 }, { "epoch": 0.0640556800754984, "grad_norm": 1.0703125, "learning_rate": 7.995618834220223e-05, "loss": 2.2173, "num_input_tokens_seen": 640548864, "step": 543 }, { "epoch": 0.06417364633714757, "grad_norm": 1.1953125, "learning_rate": 7.99554552415505e-05, "loss": 2.2742, "num_input_tokens_seen": 641728512, "step": 544 }, { "epoch": 0.06429161259879675, "grad_norm": 1.453125, "learning_rate": 7.99547160616128e-05, "loss": 2.2516, "num_input_tokens_seen": 642908160, "step": 545 }, { "epoch": 0.06440957886044592, "grad_norm": 1.0078125, "learning_rate": 7.995397080250162e-05, "loss": 2.33, "num_input_tokens_seen": 644087808, "step": 546 }, { "epoch": 0.06452754512209508, "grad_norm": 1.0703125, "learning_rate": 7.995321946433033e-05, "loss": 2.2447, "num_input_tokens_seen": 645267456, "step": 547 }, { "epoch": 0.06464551138374425, "grad_norm": 1.7421875, "learning_rate": 7.995246204721324e-05, "loss": 2.369, "num_input_tokens_seen": 646447104, "step": 548 }, { "epoch": 0.06476347764539342, "grad_norm": 1.25, "learning_rate": 7.995169855126561e-05, "loss": 2.2753, "num_input_tokens_seen": 647626752, "step": 549 }, { "epoch": 0.06488144390704259, "grad_norm": 1.6171875, "learning_rate": 7.995092897660363e-05, "loss": 2.2828, "num_input_tokens_seen": 648806400, "step": 550 }, { "epoch": 0.06499941016869175, "grad_norm": 1.015625, "learning_rate": 7.995015332334433e-05, "loss": 2.244, "num_input_tokens_seen": 649986048, "step": 551 }, { "epoch": 0.06511737643034092, "grad_norm": 1.5859375, "learning_rate": 7.994937159160578e-05, "loss": 2.2447, "num_input_tokens_seen": 651165696, "step": 552 }, { "epoch": 0.06523534269199009, "grad_norm": 1.0234375, "learning_rate": 7.99485837815069e-05, "loss": 2.2295, "num_input_tokens_seen": 652345344, "step": 553 }, { "epoch": 0.06535330895363926, "grad_norm": 1.1875, "learning_rate": 7.994778989316757e-05, "loss": 2.2049, "num_input_tokens_seen": 653524992, "step": 554 }, { "epoch": 0.06547127521528842, "grad_norm": 0.9609375, "learning_rate": 7.994698992670855e-05, "loss": 2.257, "num_input_tokens_seen": 654704640, "step": 555 }, { "epoch": 0.0655892414769376, "grad_norm": 1.0625, "learning_rate": 7.99461838822516e-05, "loss": 2.1423, "num_input_tokens_seen": 655884288, "step": 556 }, { "epoch": 0.06570720773858676, "grad_norm": 1.109375, "learning_rate": 7.994537175991934e-05, "loss": 2.2261, "num_input_tokens_seen": 657063936, "step": 557 }, { "epoch": 0.06582517400023594, "grad_norm": 1.5859375, "learning_rate": 7.994455355983532e-05, "loss": 2.2693, "num_input_tokens_seen": 658243584, "step": 558 }, { "epoch": 0.0659431402618851, "grad_norm": 1.125, "learning_rate": 7.994372928212406e-05, "loss": 2.3283, "num_input_tokens_seen": 659423232, "step": 559 }, { "epoch": 0.06606110652353427, "grad_norm": 0.9609375, "learning_rate": 7.994289892691094e-05, "loss": 2.2631, "num_input_tokens_seen": 660602880, "step": 560 }, { "epoch": 0.06617907278518344, "grad_norm": 0.93359375, "learning_rate": 7.994206249432234e-05, "loss": 2.1609, "num_input_tokens_seen": 661782528, "step": 561 }, { "epoch": 0.06629703904683261, "grad_norm": 1.4375, "learning_rate": 7.994121998448549e-05, "loss": 2.1385, "num_input_tokens_seen": 662962176, "step": 562 }, { "epoch": 0.06641500530848178, "grad_norm": 1.625, "learning_rate": 7.99403713975286e-05, "loss": 2.22, "num_input_tokens_seen": 664141824, "step": 563 }, { "epoch": 0.06653297157013094, "grad_norm": 0.8125, "learning_rate": 7.993951673358078e-05, "loss": 2.2546, "num_input_tokens_seen": 665321472, "step": 564 }, { "epoch": 0.06665093783178011, "grad_norm": 1.2734375, "learning_rate": 7.993865599277206e-05, "loss": 2.217, "num_input_tokens_seen": 666501120, "step": 565 }, { "epoch": 0.06676890409342928, "grad_norm": 1.6484375, "learning_rate": 7.993778917523343e-05, "loss": 2.1732, "num_input_tokens_seen": 667680768, "step": 566 }, { "epoch": 0.06688687035507845, "grad_norm": 0.96484375, "learning_rate": 7.993691628109673e-05, "loss": 2.2929, "num_input_tokens_seen": 668860416, "step": 567 }, { "epoch": 0.06700483661672761, "grad_norm": 1.203125, "learning_rate": 7.993603731049481e-05, "loss": 2.3019, "num_input_tokens_seen": 670040064, "step": 568 }, { "epoch": 0.06712280287837678, "grad_norm": 1.25, "learning_rate": 7.99351522635614e-05, "loss": 2.2712, "num_input_tokens_seen": 671219712, "step": 569 }, { "epoch": 0.06724076914002595, "grad_norm": 1.078125, "learning_rate": 7.993426114043115e-05, "loss": 2.1995, "num_input_tokens_seen": 672399360, "step": 570 }, { "epoch": 0.06735873540167513, "grad_norm": 0.99609375, "learning_rate": 7.993336394123965e-05, "loss": 2.189, "num_input_tokens_seen": 673579008, "step": 571 }, { "epoch": 0.06747670166332428, "grad_norm": 1.0390625, "learning_rate": 7.993246066612343e-05, "loss": 2.2997, "num_input_tokens_seen": 674758656, "step": 572 }, { "epoch": 0.06759466792497346, "grad_norm": 0.95703125, "learning_rate": 7.993155131521991e-05, "loss": 2.2692, "num_input_tokens_seen": 675938304, "step": 573 }, { "epoch": 0.06771263418662263, "grad_norm": 1.125, "learning_rate": 7.993063588866742e-05, "loss": 2.243, "num_input_tokens_seen": 677117952, "step": 574 }, { "epoch": 0.0678306004482718, "grad_norm": 1.09375, "learning_rate": 7.992971438660529e-05, "loss": 2.184, "num_input_tokens_seen": 678297600, "step": 575 }, { "epoch": 0.06794856670992096, "grad_norm": 0.94921875, "learning_rate": 7.99287868091737e-05, "loss": 2.1793, "num_input_tokens_seen": 679477248, "step": 576 }, { "epoch": 0.06806653297157013, "grad_norm": 1.171875, "learning_rate": 7.99278531565138e-05, "loss": 2.3607, "num_input_tokens_seen": 680656896, "step": 577 }, { "epoch": 0.0681844992332193, "grad_norm": 0.96484375, "learning_rate": 7.992691342876765e-05, "loss": 2.2439, "num_input_tokens_seen": 681836544, "step": 578 }, { "epoch": 0.06830246549486847, "grad_norm": 1.09375, "learning_rate": 7.99259676260782e-05, "loss": 2.2214, "num_input_tokens_seen": 683016192, "step": 579 }, { "epoch": 0.06842043175651763, "grad_norm": 1.1015625, "learning_rate": 7.992501574858937e-05, "loss": 2.1635, "num_input_tokens_seen": 684195840, "step": 580 }, { "epoch": 0.0685383980181668, "grad_norm": 1.25, "learning_rate": 7.9924057796446e-05, "loss": 2.1955, "num_input_tokens_seen": 685375488, "step": 581 }, { "epoch": 0.06865636427981597, "grad_norm": 0.8515625, "learning_rate": 7.992309376979385e-05, "loss": 2.1876, "num_input_tokens_seen": 686555136, "step": 582 }, { "epoch": 0.06877433054146515, "grad_norm": 0.9296875, "learning_rate": 7.992212366877959e-05, "loss": 2.1243, "num_input_tokens_seen": 687734784, "step": 583 }, { "epoch": 0.0688922968031143, "grad_norm": 1.1875, "learning_rate": 7.992114749355079e-05, "loss": 2.2922, "num_input_tokens_seen": 688914432, "step": 584 }, { "epoch": 0.06901026306476347, "grad_norm": 1.1796875, "learning_rate": 7.992016524425603e-05, "loss": 2.1665, "num_input_tokens_seen": 690094080, "step": 585 }, { "epoch": 0.06912822932641265, "grad_norm": 0.84375, "learning_rate": 7.991917692104473e-05, "loss": 2.207, "num_input_tokens_seen": 691273728, "step": 586 }, { "epoch": 0.06924619558806182, "grad_norm": 0.8671875, "learning_rate": 7.991818252406726e-05, "loss": 2.1341, "num_input_tokens_seen": 692453376, "step": 587 }, { "epoch": 0.06936416184971098, "grad_norm": 1.109375, "learning_rate": 7.991718205347494e-05, "loss": 2.1968, "num_input_tokens_seen": 693633024, "step": 588 }, { "epoch": 0.06948212811136015, "grad_norm": 1.078125, "learning_rate": 7.991617550941998e-05, "loss": 2.1095, "num_input_tokens_seen": 694812672, "step": 589 }, { "epoch": 0.06960009437300932, "grad_norm": 0.89453125, "learning_rate": 7.991516289205554e-05, "loss": 2.2366, "num_input_tokens_seen": 695992320, "step": 590 }, { "epoch": 0.06971806063465849, "grad_norm": 1.0703125, "learning_rate": 7.991414420153569e-05, "loss": 2.2333, "num_input_tokens_seen": 697171968, "step": 591 }, { "epoch": 0.06983602689630766, "grad_norm": 0.8125, "learning_rate": 7.991311943801539e-05, "loss": 2.1065, "num_input_tokens_seen": 698351616, "step": 592 }, { "epoch": 0.06995399315795682, "grad_norm": 1.0703125, "learning_rate": 7.99120886016506e-05, "loss": 2.1005, "num_input_tokens_seen": 699531264, "step": 593 }, { "epoch": 0.07007195941960599, "grad_norm": 1.046875, "learning_rate": 7.991105169259815e-05, "loss": 2.2168, "num_input_tokens_seen": 700710912, "step": 594 }, { "epoch": 0.07018992568125516, "grad_norm": 1.0078125, "learning_rate": 7.991000871101581e-05, "loss": 2.3617, "num_input_tokens_seen": 701890560, "step": 595 }, { "epoch": 0.07030789194290434, "grad_norm": 1.703125, "learning_rate": 7.990895965706227e-05, "loss": 2.2199, "num_input_tokens_seen": 703070208, "step": 596 }, { "epoch": 0.0704258582045535, "grad_norm": 1.0078125, "learning_rate": 7.990790453089714e-05, "loss": 2.1908, "num_input_tokens_seen": 704249856, "step": 597 }, { "epoch": 0.07054382446620266, "grad_norm": 0.87109375, "learning_rate": 7.990684333268097e-05, "loss": 2.1718, "num_input_tokens_seen": 705429504, "step": 598 }, { "epoch": 0.07066179072785184, "grad_norm": 0.80078125, "learning_rate": 7.99057760625752e-05, "loss": 2.1296, "num_input_tokens_seen": 706609152, "step": 599 }, { "epoch": 0.07077975698950101, "grad_norm": 0.84375, "learning_rate": 7.990470272074225e-05, "loss": 2.1698, "num_input_tokens_seen": 707788800, "step": 600 }, { "epoch": 0.07077975698950101, "eval_wikipedia_loss": 2.343088150024414, "eval_wikipedia_runtime": 167.6638, "eval_wikipedia_samples_per_second": 4.187, "eval_wikipedia_steps_per_second": 0.179, "num_input_tokens_seen": 707788800, "step": 600 }, { "epoch": 0.07077975698950101, "eval_toxicity_loss": 4.090839862823486, "eval_toxicity_runtime": 1.2034, "eval_toxicity_samples_per_second": 1.662, "eval_toxicity_steps_per_second": 0.831, "num_input_tokens_seen": 707788800, "step": 600 }, { "epoch": 0.00011796626164916834, "grad_norm": 1.2421875, "learning_rate": 7.99036233073454e-05, "loss": 2.1019, "num_input_tokens_seen": 708968448, "step": 601 }, { "epoch": 0.0002359325232983367, "grad_norm": 1.328125, "learning_rate": 7.99025378225489e-05, "loss": 2.1446, "num_input_tokens_seen": 710148096, "step": 602 }, { "epoch": 0.000353898784947505, "grad_norm": 1.203125, "learning_rate": 7.990144626651791e-05, "loss": 2.1271, "num_input_tokens_seen": 711327744, "step": 603 }, { "epoch": 0.0004718650465966734, "grad_norm": 1.3359375, "learning_rate": 7.990034863941851e-05, "loss": 2.1123, "num_input_tokens_seen": 712507392, "step": 604 }, { "epoch": 0.0005898313082458417, "grad_norm": 0.8203125, "learning_rate": 7.989924494141771e-05, "loss": 2.1567, "num_input_tokens_seen": 713687040, "step": 605 }, { "epoch": 0.00070779756989501, "grad_norm": 0.91796875, "learning_rate": 7.989813517268343e-05, "loss": 2.1786, "num_input_tokens_seen": 714866688, "step": 606 }, { "epoch": 0.0008257638315441783, "grad_norm": 1.078125, "learning_rate": 7.989701933338453e-05, "loss": 2.2267, "num_input_tokens_seen": 716046336, "step": 607 }, { "epoch": 0.0009437300931933467, "grad_norm": 1.578125, "learning_rate": 7.989589742369077e-05, "loss": 2.1507, "num_input_tokens_seen": 717225984, "step": 608 }, { "epoch": 0.001061696354842515, "grad_norm": 1.0546875, "learning_rate": 7.989476944377286e-05, "loss": 2.1274, "num_input_tokens_seen": 718405632, "step": 609 }, { "epoch": 0.0011796626164916834, "grad_norm": 0.984375, "learning_rate": 7.989363539380245e-05, "loss": 2.3054, "num_input_tokens_seen": 719585280, "step": 610 }, { "epoch": 0.0012976288781408518, "grad_norm": 1.0, "learning_rate": 7.989249527395205e-05, "loss": 2.1765, "num_input_tokens_seen": 720764928, "step": 611 }, { "epoch": 0.00141559513979002, "grad_norm": 1.078125, "learning_rate": 7.989134908439515e-05, "loss": 2.1669, "num_input_tokens_seen": 721944576, "step": 612 }, { "epoch": 0.0015335614014391884, "grad_norm": 0.97265625, "learning_rate": 7.989019682530614e-05, "loss": 2.1091, "num_input_tokens_seen": 723124224, "step": 613 }, { "epoch": 0.0016515276630883566, "grad_norm": 1.265625, "learning_rate": 7.988903849686033e-05, "loss": 2.0888, "num_input_tokens_seen": 724303872, "step": 614 }, { "epoch": 0.001769493924737525, "grad_norm": 0.94921875, "learning_rate": 7.988787409923398e-05, "loss": 2.1604, "num_input_tokens_seen": 725483520, "step": 615 }, { "epoch": 0.0018874601863866935, "grad_norm": 0.96875, "learning_rate": 7.988670363260425e-05, "loss": 2.1344, "num_input_tokens_seen": 726663168, "step": 616 }, { "epoch": 0.0020054264480358617, "grad_norm": 0.85546875, "learning_rate": 7.988552709714921e-05, "loss": 2.2063, "num_input_tokens_seen": 727842816, "step": 617 }, { "epoch": 0.00212339270968503, "grad_norm": 1.21875, "learning_rate": 7.98843444930479e-05, "loss": 2.2359, "num_input_tokens_seen": 729022464, "step": 618 }, { "epoch": 0.0022413589713341986, "grad_norm": 1.4609375, "learning_rate": 7.988315582048022e-05, "loss": 2.1383, "num_input_tokens_seen": 730202112, "step": 619 }, { "epoch": 0.0023593252329833668, "grad_norm": 0.99609375, "learning_rate": 7.988196107962707e-05, "loss": 2.2032, "num_input_tokens_seen": 731381760, "step": 620 }, { "epoch": 0.002477291494632535, "grad_norm": 0.828125, "learning_rate": 7.98807602706702e-05, "loss": 2.2634, "num_input_tokens_seen": 732561408, "step": 621 }, { "epoch": 0.0025952577562817036, "grad_norm": 0.90234375, "learning_rate": 7.987955339379234e-05, "loss": 2.177, "num_input_tokens_seen": 733741056, "step": 622 }, { "epoch": 0.002713224017930872, "grad_norm": 1.4375, "learning_rate": 7.987834044917709e-05, "loss": 2.1918, "num_input_tokens_seen": 734920704, "step": 623 }, { "epoch": 0.00283119027958004, "grad_norm": 0.98828125, "learning_rate": 7.9877121437009e-05, "loss": 2.2417, "num_input_tokens_seen": 736100352, "step": 624 }, { "epoch": 0.0029491565412292082, "grad_norm": 0.8203125, "learning_rate": 7.987589635747359e-05, "loss": 2.1719, "num_input_tokens_seen": 737280000, "step": 625 }, { "epoch": 0.003067122802878377, "grad_norm": 1.0, "learning_rate": 7.987466521075722e-05, "loss": 2.2465, "num_input_tokens_seen": 738459648, "step": 626 }, { "epoch": 0.003185089064527545, "grad_norm": 1.0, "learning_rate": 7.987342799704721e-05, "loss": 2.1183, "num_input_tokens_seen": 739639296, "step": 627 }, { "epoch": 0.0033030553261767133, "grad_norm": 1.1796875, "learning_rate": 7.987218471653181e-05, "loss": 2.0837, "num_input_tokens_seen": 740818944, "step": 628 }, { "epoch": 0.003421021587825882, "grad_norm": 1.2734375, "learning_rate": 7.987093536940019e-05, "loss": 2.1209, "num_input_tokens_seen": 741998592, "step": 629 }, { "epoch": 0.00353898784947505, "grad_norm": 0.71875, "learning_rate": 7.986967995584245e-05, "loss": 2.137, "num_input_tokens_seen": 743178240, "step": 630 }, { "epoch": 0.0036569541111242184, "grad_norm": 0.91015625, "learning_rate": 7.986841847604958e-05, "loss": 2.1377, "num_input_tokens_seen": 744357888, "step": 631 }, { "epoch": 0.003774920372773387, "grad_norm": 1.5625, "learning_rate": 7.986715093021353e-05, "loss": 2.2592, "num_input_tokens_seen": 745537536, "step": 632 }, { "epoch": 0.003892886634422555, "grad_norm": 0.99609375, "learning_rate": 7.986587731852717e-05, "loss": 2.1295, "num_input_tokens_seen": 746717184, "step": 633 }, { "epoch": 0.004010852896071723, "grad_norm": 0.85546875, "learning_rate": 7.986459764118427e-05, "loss": 2.1378, "num_input_tokens_seen": 747896832, "step": 634 }, { "epoch": 0.004128819157720892, "grad_norm": 1.53125, "learning_rate": 7.986331189837952e-05, "loss": 2.1956, "num_input_tokens_seen": 749076480, "step": 635 }, { "epoch": 0.00424678541937006, "grad_norm": 0.9296875, "learning_rate": 7.986202009030858e-05, "loss": 2.2073, "num_input_tokens_seen": 750256128, "step": 636 }, { "epoch": 0.004364751681019229, "grad_norm": 1.0703125, "learning_rate": 7.986072221716798e-05, "loss": 2.1139, "num_input_tokens_seen": 751435776, "step": 637 }, { "epoch": 0.004482717942668397, "grad_norm": 1.0390625, "learning_rate": 7.985941827915519e-05, "loss": 2.2407, "num_input_tokens_seen": 752615424, "step": 638 }, { "epoch": 0.004600684204317565, "grad_norm": 1.2578125, "learning_rate": 7.985810827646862e-05, "loss": 2.1899, "num_input_tokens_seen": 753795072, "step": 639 }, { "epoch": 0.0047186504659667335, "grad_norm": 1.0625, "learning_rate": 7.985679220930758e-05, "loss": 2.2486, "num_input_tokens_seen": 754974720, "step": 640 }, { "epoch": 0.004836616727615902, "grad_norm": 0.875, "learning_rate": 7.985547007787231e-05, "loss": 2.1191, "num_input_tokens_seen": 756154368, "step": 641 }, { "epoch": 0.00495458298926507, "grad_norm": 0.8515625, "learning_rate": 7.985414188236398e-05, "loss": 2.0786, "num_input_tokens_seen": 757334016, "step": 642 }, { "epoch": 0.005072549250914238, "grad_norm": 1.0078125, "learning_rate": 7.985280762298468e-05, "loss": 2.2075, "num_input_tokens_seen": 758513664, "step": 643 }, { "epoch": 0.005190515512563407, "grad_norm": 1.0546875, "learning_rate": 7.985146729993741e-05, "loss": 2.1767, "num_input_tokens_seen": 759693312, "step": 644 }, { "epoch": 0.0053084817742125754, "grad_norm": 0.9921875, "learning_rate": 7.985012091342611e-05, "loss": 2.1301, "num_input_tokens_seen": 760872960, "step": 645 }, { "epoch": 0.005426448035861744, "grad_norm": 0.96484375, "learning_rate": 7.984876846365564e-05, "loss": 2.1197, "num_input_tokens_seen": 762052608, "step": 646 }, { "epoch": 0.005544414297510912, "grad_norm": 1.25, "learning_rate": 7.984740995083175e-05, "loss": 2.4474, "num_input_tokens_seen": 763232256, "step": 647 }, { "epoch": 0.00566238055916008, "grad_norm": 1.1796875, "learning_rate": 7.984604537516118e-05, "loss": 2.1303, "num_input_tokens_seen": 764411904, "step": 648 }, { "epoch": 0.005780346820809248, "grad_norm": 1.09375, "learning_rate": 7.984467473685153e-05, "loss": 2.2679, "num_input_tokens_seen": 765591552, "step": 649 }, { "epoch": 0.0058983130824584165, "grad_norm": 0.87109375, "learning_rate": 7.984329803611133e-05, "loss": 2.1687, "num_input_tokens_seen": 766771200, "step": 650 }, { "epoch": 0.0060162793441075856, "grad_norm": 1.1015625, "learning_rate": 7.984191527315009e-05, "loss": 2.1592, "num_input_tokens_seen": 767950848, "step": 651 }, { "epoch": 0.006134245605756754, "grad_norm": 0.9140625, "learning_rate": 7.984052644817815e-05, "loss": 2.1808, "num_input_tokens_seen": 769130496, "step": 652 }, { "epoch": 0.006252211867405922, "grad_norm": 0.80078125, "learning_rate": 7.983913156140685e-05, "loss": 2.209, "num_input_tokens_seen": 770310144, "step": 653 }, { "epoch": 0.00637017812905509, "grad_norm": 1.1640625, "learning_rate": 7.983773061304843e-05, "loss": 2.1247, "num_input_tokens_seen": 771489792, "step": 654 }, { "epoch": 0.006488144390704258, "grad_norm": 1.0078125, "learning_rate": 7.983632360331603e-05, "loss": 2.1624, "num_input_tokens_seen": 772669440, "step": 655 }, { "epoch": 0.006606110652353427, "grad_norm": 1.28125, "learning_rate": 7.983491053242373e-05, "loss": 2.1374, "num_input_tokens_seen": 773849088, "step": 656 }, { "epoch": 0.006724076914002596, "grad_norm": 1.078125, "learning_rate": 7.983349140058654e-05, "loss": 2.1465, "num_input_tokens_seen": 775028736, "step": 657 }, { "epoch": 0.006842043175651764, "grad_norm": 1.109375, "learning_rate": 7.983206620802038e-05, "loss": 2.1039, "num_input_tokens_seen": 776208384, "step": 658 }, { "epoch": 0.006960009437300932, "grad_norm": 0.90234375, "learning_rate": 7.98306349549421e-05, "loss": 2.2616, "num_input_tokens_seen": 777388032, "step": 659 }, { "epoch": 0.0070779756989501, "grad_norm": 1.1484375, "learning_rate": 7.982919764156945e-05, "loss": 2.1215, "num_input_tokens_seen": 778567680, "step": 660 }, { "epoch": 0.0071959419605992685, "grad_norm": 1.0078125, "learning_rate": 7.982775426812114e-05, "loss": 2.1266, "num_input_tokens_seen": 779747328, "step": 661 }, { "epoch": 0.007313908222248437, "grad_norm": 1.2890625, "learning_rate": 7.982630483481678e-05, "loss": 2.1429, "num_input_tokens_seen": 780926976, "step": 662 }, { "epoch": 0.007431874483897605, "grad_norm": 0.90234375, "learning_rate": 7.982484934187687e-05, "loss": 2.0725, "num_input_tokens_seen": 782106624, "step": 663 }, { "epoch": 0.007549840745546774, "grad_norm": 1.015625, "learning_rate": 7.982338778952292e-05, "loss": 2.2623, "num_input_tokens_seen": 783286272, "step": 664 }, { "epoch": 0.007667807007195942, "grad_norm": 1.515625, "learning_rate": 7.982192017797727e-05, "loss": 2.1474, "num_input_tokens_seen": 784465920, "step": 665 }, { "epoch": 0.00778577326884511, "grad_norm": 1.0390625, "learning_rate": 7.982044650746321e-05, "loss": 2.213, "num_input_tokens_seen": 785645568, "step": 666 }, { "epoch": 0.007903739530494279, "grad_norm": 0.94921875, "learning_rate": 7.9818966778205e-05, "loss": 2.1183, "num_input_tokens_seen": 786825216, "step": 667 }, { "epoch": 0.008021705792143447, "grad_norm": 1.453125, "learning_rate": 7.981748099042777e-05, "loss": 2.203, "num_input_tokens_seen": 788004864, "step": 668 }, { "epoch": 0.008139672053792615, "grad_norm": 1.1640625, "learning_rate": 7.981598914435756e-05, "loss": 2.247, "num_input_tokens_seen": 789184512, "step": 669 }, { "epoch": 0.008257638315441783, "grad_norm": 0.78515625, "learning_rate": 7.98144912402214e-05, "loss": 2.1766, "num_input_tokens_seen": 790364160, "step": 670 }, { "epoch": 0.008375604577090951, "grad_norm": 1.46875, "learning_rate": 7.981298727824715e-05, "loss": 2.1032, "num_input_tokens_seen": 791543808, "step": 671 }, { "epoch": 0.00849357083874012, "grad_norm": 1.0078125, "learning_rate": 7.981147725866367e-05, "loss": 2.0328, "num_input_tokens_seen": 792723456, "step": 672 }, { "epoch": 0.008611537100389288, "grad_norm": 0.98828125, "learning_rate": 7.980996118170071e-05, "loss": 2.156, "num_input_tokens_seen": 793903104, "step": 673 }, { "epoch": 0.008729503362038458, "grad_norm": 1.515625, "learning_rate": 7.980843904758894e-05, "loss": 2.2632, "num_input_tokens_seen": 795082752, "step": 674 }, { "epoch": 0.008847469623687626, "grad_norm": 0.98828125, "learning_rate": 7.980691085655995e-05, "loss": 2.1247, "num_input_tokens_seen": 796262400, "step": 675 }, { "epoch": 0.008965435885336794, "grad_norm": 0.85546875, "learning_rate": 7.980537660884625e-05, "loss": 2.1808, "num_input_tokens_seen": 797442048, "step": 676 }, { "epoch": 0.009083402146985962, "grad_norm": 1.375, "learning_rate": 7.980383630468132e-05, "loss": 2.1755, "num_input_tokens_seen": 798621696, "step": 677 }, { "epoch": 0.00920136840863513, "grad_norm": 1.0546875, "learning_rate": 7.980228994429947e-05, "loss": 2.1093, "num_input_tokens_seen": 799801344, "step": 678 }, { "epoch": 0.009319334670284299, "grad_norm": 0.78515625, "learning_rate": 7.9800737527936e-05, "loss": 2.0614, "num_input_tokens_seen": 800980992, "step": 679 }, { "epoch": 0.009437300931933467, "grad_norm": 0.78125, "learning_rate": 7.979917905582712e-05, "loss": 2.0862, "num_input_tokens_seen": 802160640, "step": 680 }, { "epoch": 0.009555267193582635, "grad_norm": 1.140625, "learning_rate": 7.979761452820993e-05, "loss": 2.1008, "num_input_tokens_seen": 803340288, "step": 681 }, { "epoch": 0.009673233455231803, "grad_norm": 1.0859375, "learning_rate": 7.979604394532251e-05, "loss": 2.1894, "num_input_tokens_seen": 804519936, "step": 682 }, { "epoch": 0.009791199716880972, "grad_norm": 0.921875, "learning_rate": 7.979446730740381e-05, "loss": 2.1344, "num_input_tokens_seen": 805699584, "step": 683 }, { "epoch": 0.00990916597853014, "grad_norm": 1.1171875, "learning_rate": 7.979288461469371e-05, "loss": 2.1419, "num_input_tokens_seen": 806879232, "step": 684 }, { "epoch": 0.010027132240179308, "grad_norm": 0.8046875, "learning_rate": 7.9791295867433e-05, "loss": 2.1314, "num_input_tokens_seen": 808058880, "step": 685 }, { "epoch": 0.010145098501828476, "grad_norm": 0.89453125, "learning_rate": 7.978970106586347e-05, "loss": 2.1138, "num_input_tokens_seen": 809238528, "step": 686 }, { "epoch": 0.010263064763477646, "grad_norm": 0.78125, "learning_rate": 7.978810021022773e-05, "loss": 2.1573, "num_input_tokens_seen": 810418176, "step": 687 }, { "epoch": 0.010381031025126814, "grad_norm": 0.96875, "learning_rate": 7.978649330076936e-05, "loss": 2.239, "num_input_tokens_seen": 811597824, "step": 688 }, { "epoch": 0.010498997286775983, "grad_norm": 1.109375, "learning_rate": 7.978488033773285e-05, "loss": 2.0574, "num_input_tokens_seen": 812777472, "step": 689 }, { "epoch": 0.010616963548425151, "grad_norm": 1.046875, "learning_rate": 7.978326132136364e-05, "loss": 2.0513, "num_input_tokens_seen": 813957120, "step": 690 }, { "epoch": 0.010734929810074319, "grad_norm": 1.0390625, "learning_rate": 7.978163625190803e-05, "loss": 2.1201, "num_input_tokens_seen": 815136768, "step": 691 }, { "epoch": 0.010852896071723487, "grad_norm": 0.93359375, "learning_rate": 7.978000512961329e-05, "loss": 2.0681, "num_input_tokens_seen": 816316416, "step": 692 }, { "epoch": 0.010970862333372655, "grad_norm": 1.1015625, "learning_rate": 7.977836795472761e-05, "loss": 2.0907, "num_input_tokens_seen": 817496064, "step": 693 }, { "epoch": 0.011088828595021824, "grad_norm": 0.94140625, "learning_rate": 7.977672472750006e-05, "loss": 2.1739, "num_input_tokens_seen": 818675712, "step": 694 }, { "epoch": 0.011206794856670992, "grad_norm": 0.84765625, "learning_rate": 7.977507544818069e-05, "loss": 2.1536, "num_input_tokens_seen": 819855360, "step": 695 }, { "epoch": 0.01132476111832016, "grad_norm": 1.4453125, "learning_rate": 7.977342011702043e-05, "loss": 2.0564, "num_input_tokens_seen": 821035008, "step": 696 }, { "epoch": 0.011442727379969328, "grad_norm": 0.90234375, "learning_rate": 7.977175873427114e-05, "loss": 2.0971, "num_input_tokens_seen": 822214656, "step": 697 }, { "epoch": 0.011560693641618497, "grad_norm": 0.81640625, "learning_rate": 7.977009130018561e-05, "loss": 2.1439, "num_input_tokens_seen": 823394304, "step": 698 }, { "epoch": 0.011678659903267665, "grad_norm": 0.91796875, "learning_rate": 7.976841781501751e-05, "loss": 2.0159, "num_input_tokens_seen": 824573952, "step": 699 }, { "epoch": 0.011796626164916833, "grad_norm": 0.8515625, "learning_rate": 7.97667382790215e-05, "loss": 2.2684, "num_input_tokens_seen": 825753600, "step": 700 }, { "epoch": 0.011914592426566003, "grad_norm": 0.79296875, "learning_rate": 7.976505269245314e-05, "loss": 2.0875, "num_input_tokens_seen": 826933248, "step": 701 }, { "epoch": 0.012032558688215171, "grad_norm": 0.76171875, "learning_rate": 7.976336105556884e-05, "loss": 2.0368, "num_input_tokens_seen": 828112896, "step": 702 }, { "epoch": 0.01215052494986434, "grad_norm": 0.83203125, "learning_rate": 7.976166336862602e-05, "loss": 2.0681, "num_input_tokens_seen": 829292544, "step": 703 }, { "epoch": 0.012268491211513508, "grad_norm": 1.1953125, "learning_rate": 7.975995963188297e-05, "loss": 2.1092, "num_input_tokens_seen": 830472192, "step": 704 }, { "epoch": 0.012386457473162676, "grad_norm": 1.8046875, "learning_rate": 7.975824984559893e-05, "loss": 2.1025, "num_input_tokens_seen": 831651840, "step": 705 }, { "epoch": 0.012504423734811844, "grad_norm": 0.77734375, "learning_rate": 7.975653401003404e-05, "loss": 2.1325, "num_input_tokens_seen": 832831488, "step": 706 }, { "epoch": 0.012622389996461012, "grad_norm": 1.4921875, "learning_rate": 7.975481212544938e-05, "loss": 2.1961, "num_input_tokens_seen": 834011136, "step": 707 }, { "epoch": 0.01274035625811018, "grad_norm": 1.4296875, "learning_rate": 7.97530841921069e-05, "loss": 2.0702, "num_input_tokens_seen": 835190784, "step": 708 }, { "epoch": 0.012858322519759349, "grad_norm": 0.90234375, "learning_rate": 7.975135021026956e-05, "loss": 2.165, "num_input_tokens_seen": 836370432, "step": 709 }, { "epoch": 0.012976288781408517, "grad_norm": 1.984375, "learning_rate": 7.974961018020115e-05, "loss": 2.1599, "num_input_tokens_seen": 837550080, "step": 710 }, { "epoch": 0.013094255043057685, "grad_norm": 0.99609375, "learning_rate": 7.974786410216643e-05, "loss": 2.0392, "num_input_tokens_seen": 838729728, "step": 711 }, { "epoch": 0.013212221304706853, "grad_norm": 2.28125, "learning_rate": 7.974611197643108e-05, "loss": 2.2548, "num_input_tokens_seen": 839909376, "step": 712 }, { "epoch": 0.013330187566356021, "grad_norm": 1.2890625, "learning_rate": 7.974435380326166e-05, "loss": 2.0817, "num_input_tokens_seen": 841089024, "step": 713 }, { "epoch": 0.013448153828005191, "grad_norm": 2.46875, "learning_rate": 7.97425895829257e-05, "loss": 2.1358, "num_input_tokens_seen": 842268672, "step": 714 }, { "epoch": 0.01356612008965436, "grad_norm": 1.8828125, "learning_rate": 7.974081931569163e-05, "loss": 2.207, "num_input_tokens_seen": 843448320, "step": 715 }, { "epoch": 0.013684086351303528, "grad_norm": 2.296875, "learning_rate": 7.97390430018288e-05, "loss": 2.106, "num_input_tokens_seen": 844627968, "step": 716 }, { "epoch": 0.013802052612952696, "grad_norm": 1.8125, "learning_rate": 7.973726064160746e-05, "loss": 2.0721, "num_input_tokens_seen": 845807616, "step": 717 }, { "epoch": 0.013920018874601864, "grad_norm": 1.984375, "learning_rate": 7.973547223529882e-05, "loss": 2.1795, "num_input_tokens_seen": 846987264, "step": 718 }, { "epoch": 0.014037985136251032, "grad_norm": 1.703125, "learning_rate": 7.973367778317497e-05, "loss": 2.1144, "num_input_tokens_seen": 848166912, "step": 719 }, { "epoch": 0.0141559513979002, "grad_norm": 2.046875, "learning_rate": 7.973187728550897e-05, "loss": 2.0617, "num_input_tokens_seen": 849346560, "step": 720 }, { "epoch": 0.014273917659549369, "grad_norm": 1.6875, "learning_rate": 7.973007074257472e-05, "loss": 1.9967, "num_input_tokens_seen": 850526208, "step": 721 }, { "epoch": 0.014391883921198537, "grad_norm": 1.6328125, "learning_rate": 7.972825815464713e-05, "loss": 2.0598, "num_input_tokens_seen": 851705856, "step": 722 }, { "epoch": 0.014509850182847705, "grad_norm": 1.6875, "learning_rate": 7.972643952200198e-05, "loss": 2.0309, "num_input_tokens_seen": 852885504, "step": 723 }, { "epoch": 0.014627816444496873, "grad_norm": 1.3828125, "learning_rate": 7.972461484491597e-05, "loss": 2.0394, "num_input_tokens_seen": 854065152, "step": 724 }, { "epoch": 0.014745782706146042, "grad_norm": 1.4765625, "learning_rate": 7.972278412366672e-05, "loss": 2.0426, "num_input_tokens_seen": 855244800, "step": 725 }, { "epoch": 0.01486374896779521, "grad_norm": 1.421875, "learning_rate": 7.97209473585328e-05, "loss": 2.0396, "num_input_tokens_seen": 856424448, "step": 726 }, { "epoch": 0.014981715229444378, "grad_norm": 1.09375, "learning_rate": 7.971910454979367e-05, "loss": 2.0892, "num_input_tokens_seen": 857604096, "step": 727 }, { "epoch": 0.015099681491093548, "grad_norm": 1.8515625, "learning_rate": 7.971725569772968e-05, "loss": 2.1033, "num_input_tokens_seen": 858783744, "step": 728 }, { "epoch": 0.015217647752742716, "grad_norm": 1.5234375, "learning_rate": 7.97154008026222e-05, "loss": 1.975, "num_input_tokens_seen": 859963392, "step": 729 }, { "epoch": 0.015335614014391884, "grad_norm": 1.703125, "learning_rate": 7.97135398647534e-05, "loss": 2.0152, "num_input_tokens_seen": 861143040, "step": 730 }, { "epoch": 0.015453580276041053, "grad_norm": 1.4453125, "learning_rate": 7.971167288440646e-05, "loss": 2.0447, "num_input_tokens_seen": 862322688, "step": 731 }, { "epoch": 0.01557154653769022, "grad_norm": 1.4921875, "learning_rate": 7.970979986186541e-05, "loss": 2.0918, "num_input_tokens_seen": 863502336, "step": 732 }, { "epoch": 0.01568951279933939, "grad_norm": 1.2734375, "learning_rate": 7.970792079741527e-05, "loss": 2.0034, "num_input_tokens_seen": 864681984, "step": 733 }, { "epoch": 0.015807479060988557, "grad_norm": 1.28125, "learning_rate": 7.970603569134192e-05, "loss": 2.0668, "num_input_tokens_seen": 865861632, "step": 734 }, { "epoch": 0.015925445322637725, "grad_norm": 1.171875, "learning_rate": 7.970414454393218e-05, "loss": 1.9964, "num_input_tokens_seen": 867041280, "step": 735 }, { "epoch": 0.016043411584286894, "grad_norm": 1.703125, "learning_rate": 7.970224735547382e-05, "loss": 1.9982, "num_input_tokens_seen": 868220928, "step": 736 }, { "epoch": 0.016161377845936062, "grad_norm": 1.4921875, "learning_rate": 7.970034412625547e-05, "loss": 2.1421, "num_input_tokens_seen": 869400576, "step": 737 }, { "epoch": 0.01627934410758523, "grad_norm": 1.5078125, "learning_rate": 7.96984348565667e-05, "loss": 2.0866, "num_input_tokens_seen": 870580224, "step": 738 }, { "epoch": 0.0163973103692344, "grad_norm": 1.296875, "learning_rate": 7.969651954669805e-05, "loss": 2.1172, "num_input_tokens_seen": 871759872, "step": 739 }, { "epoch": 0.016515276630883566, "grad_norm": 1.515625, "learning_rate": 7.96945981969409e-05, "loss": 2.1618, "num_input_tokens_seen": 872939520, "step": 740 }, { "epoch": 0.016633242892532735, "grad_norm": 1.1640625, "learning_rate": 7.96926708075876e-05, "loss": 2.0966, "num_input_tokens_seen": 874119168, "step": 741 }, { "epoch": 0.016751209154181903, "grad_norm": 1.609375, "learning_rate": 7.969073737893142e-05, "loss": 1.9877, "num_input_tokens_seen": 875298816, "step": 742 }, { "epoch": 0.01686917541583107, "grad_norm": 1.2421875, "learning_rate": 7.968879791126652e-05, "loss": 2.09, "num_input_tokens_seen": 876478464, "step": 743 }, { "epoch": 0.01698714167748024, "grad_norm": 1.5703125, "learning_rate": 7.968685240488798e-05, "loss": 2.0698, "num_input_tokens_seen": 877658112, "step": 744 }, { "epoch": 0.017105107939129408, "grad_norm": 1.296875, "learning_rate": 7.968490086009184e-05, "loss": 2.1101, "num_input_tokens_seen": 878837760, "step": 745 }, { "epoch": 0.017223074200778576, "grad_norm": 1.1640625, "learning_rate": 7.9682943277175e-05, "loss": 1.96, "num_input_tokens_seen": 880017408, "step": 746 }, { "epoch": 0.017341040462427744, "grad_norm": 1.0546875, "learning_rate": 7.968097965643533e-05, "loss": 1.9999, "num_input_tokens_seen": 881197056, "step": 747 }, { "epoch": 0.017459006724076916, "grad_norm": 1.140625, "learning_rate": 7.96790099981716e-05, "loss": 2.0833, "num_input_tokens_seen": 882376704, "step": 748 }, { "epoch": 0.017576972985726084, "grad_norm": 0.87890625, "learning_rate": 7.967703430268349e-05, "loss": 1.9634, "num_input_tokens_seen": 883556352, "step": 749 }, { "epoch": 0.017694939247375252, "grad_norm": 1.359375, "learning_rate": 7.967505257027158e-05, "loss": 2.1086, "num_input_tokens_seen": 884736000, "step": 750 }, { "epoch": 0.01781290550902442, "grad_norm": 0.875, "learning_rate": 7.967306480123745e-05, "loss": 2.0866, "num_input_tokens_seen": 885915648, "step": 751 }, { "epoch": 0.01793087177067359, "grad_norm": 1.2734375, "learning_rate": 7.967107099588349e-05, "loss": 2.1024, "num_input_tokens_seen": 887095296, "step": 752 }, { "epoch": 0.018048838032322757, "grad_norm": 0.91015625, "learning_rate": 7.966907115451311e-05, "loss": 2.1854, "num_input_tokens_seen": 888274944, "step": 753 }, { "epoch": 0.018166804293971925, "grad_norm": 1.5625, "learning_rate": 7.966706527743052e-05, "loss": 2.0433, "num_input_tokens_seen": 889454592, "step": 754 }, { "epoch": 0.018284770555621093, "grad_norm": 1.1640625, "learning_rate": 7.966505336494098e-05, "loss": 2.0656, "num_input_tokens_seen": 890634240, "step": 755 }, { "epoch": 0.01840273681727026, "grad_norm": 1.546875, "learning_rate": 7.96630354173506e-05, "loss": 1.9556, "num_input_tokens_seen": 891813888, "step": 756 }, { "epoch": 0.01852070307891943, "grad_norm": 1.421875, "learning_rate": 7.966101143496637e-05, "loss": 2.0212, "num_input_tokens_seen": 892993536, "step": 757 }, { "epoch": 0.018638669340568598, "grad_norm": 1.4765625, "learning_rate": 7.965898141809629e-05, "loss": 2.1419, "num_input_tokens_seen": 894173184, "step": 758 }, { "epoch": 0.018756635602217766, "grad_norm": 1.2265625, "learning_rate": 7.96569453670492e-05, "loss": 2.0694, "num_input_tokens_seen": 895352832, "step": 759 }, { "epoch": 0.018874601863866934, "grad_norm": 1.2890625, "learning_rate": 7.96549032821349e-05, "loss": 2.0899, "num_input_tokens_seen": 896532480, "step": 760 }, { "epoch": 0.018992568125516102, "grad_norm": 0.9375, "learning_rate": 7.96528551636641e-05, "loss": 2.0442, "num_input_tokens_seen": 897712128, "step": 761 }, { "epoch": 0.01911053438716527, "grad_norm": 1.171875, "learning_rate": 7.96508010119484e-05, "loss": 2.0397, "num_input_tokens_seen": 898891776, "step": 762 }, { "epoch": 0.01922850064881444, "grad_norm": 0.86328125, "learning_rate": 7.964874082730039e-05, "loss": 2.0424, "num_input_tokens_seen": 900071424, "step": 763 }, { "epoch": 0.019346466910463607, "grad_norm": 0.9140625, "learning_rate": 7.964667461003347e-05, "loss": 2.1146, "num_input_tokens_seen": 901251072, "step": 764 }, { "epoch": 0.019464433172112775, "grad_norm": 0.80078125, "learning_rate": 7.964460236046209e-05, "loss": 2.0401, "num_input_tokens_seen": 902430720, "step": 765 }, { "epoch": 0.019582399433761943, "grad_norm": 1.0078125, "learning_rate": 7.964252407890147e-05, "loss": 1.9694, "num_input_tokens_seen": 903610368, "step": 766 }, { "epoch": 0.01970036569541111, "grad_norm": 1.0234375, "learning_rate": 7.964043976566787e-05, "loss": 1.9582, "num_input_tokens_seen": 904790016, "step": 767 }, { "epoch": 0.01981833195706028, "grad_norm": 1.3515625, "learning_rate": 7.963834942107843e-05, "loss": 2.0266, "num_input_tokens_seen": 905969664, "step": 768 }, { "epoch": 0.019936298218709448, "grad_norm": 0.8203125, "learning_rate": 7.963625304545115e-05, "loss": 2.1604, "num_input_tokens_seen": 907149312, "step": 769 }, { "epoch": 0.020054264480358616, "grad_norm": 0.90234375, "learning_rate": 7.963415063910505e-05, "loss": 2.1845, "num_input_tokens_seen": 908328960, "step": 770 }, { "epoch": 0.020172230742007784, "grad_norm": 0.828125, "learning_rate": 7.963204220236e-05, "loss": 2.0049, "num_input_tokens_seen": 909508608, "step": 771 }, { "epoch": 0.020290197003656953, "grad_norm": 1.3359375, "learning_rate": 7.962992773553678e-05, "loss": 2.0349, "num_input_tokens_seen": 910688256, "step": 772 }, { "epoch": 0.02040816326530612, "grad_norm": 0.99609375, "learning_rate": 7.962780723895712e-05, "loss": 1.9531, "num_input_tokens_seen": 911867904, "step": 773 }, { "epoch": 0.020526129526955292, "grad_norm": 0.9609375, "learning_rate": 7.962568071294368e-05, "loss": 2.0456, "num_input_tokens_seen": 913047552, "step": 774 }, { "epoch": 0.02064409578860446, "grad_norm": 1.1796875, "learning_rate": 7.962354815781999e-05, "loss": 2.1398, "num_input_tokens_seen": 914227200, "step": 775 }, { "epoch": 0.02076206205025363, "grad_norm": 1.0546875, "learning_rate": 7.96214095739105e-05, "loss": 2.1565, "num_input_tokens_seen": 915406848, "step": 776 }, { "epoch": 0.020880028311902797, "grad_norm": 1.4296875, "learning_rate": 7.961926496154066e-05, "loss": 2.0268, "num_input_tokens_seen": 916586496, "step": 777 }, { "epoch": 0.020997994573551965, "grad_norm": 1.015625, "learning_rate": 7.961711432103672e-05, "loss": 2.0407, "num_input_tokens_seen": 917766144, "step": 778 }, { "epoch": 0.021115960835201134, "grad_norm": 1.2890625, "learning_rate": 7.961495765272593e-05, "loss": 1.9779, "num_input_tokens_seen": 918945792, "step": 779 }, { "epoch": 0.021233927096850302, "grad_norm": 1.0234375, "learning_rate": 7.961279495693644e-05, "loss": 2.0524, "num_input_tokens_seen": 920125440, "step": 780 }, { "epoch": 0.02135189335849947, "grad_norm": 0.90234375, "learning_rate": 7.961062623399728e-05, "loss": 2.012, "num_input_tokens_seen": 921305088, "step": 781 }, { "epoch": 0.021469859620148638, "grad_norm": 0.8125, "learning_rate": 7.960845148423844e-05, "loss": 1.9574, "num_input_tokens_seen": 922484736, "step": 782 }, { "epoch": 0.021587825881797806, "grad_norm": 0.78515625, "learning_rate": 7.960627070799081e-05, "loss": 2.0983, "num_input_tokens_seen": 923664384, "step": 783 }, { "epoch": 0.021705792143446975, "grad_norm": 1.1640625, "learning_rate": 7.96040839055862e-05, "loss": 2.0531, "num_input_tokens_seen": 924844032, "step": 784 }, { "epoch": 0.021823758405096143, "grad_norm": 1.0703125, "learning_rate": 7.960189107735734e-05, "loss": 2.0866, "num_input_tokens_seen": 926023680, "step": 785 }, { "epoch": 0.02194172466674531, "grad_norm": 0.88671875, "learning_rate": 7.959969222363786e-05, "loss": 2.012, "num_input_tokens_seen": 927203328, "step": 786 }, { "epoch": 0.02205969092839448, "grad_norm": 0.77734375, "learning_rate": 7.959748734476231e-05, "loss": 2.0116, "num_input_tokens_seen": 928382976, "step": 787 }, { "epoch": 0.022177657190043647, "grad_norm": 0.96875, "learning_rate": 7.95952764410662e-05, "loss": 2.0609, "num_input_tokens_seen": 929562624, "step": 788 }, { "epoch": 0.022295623451692816, "grad_norm": 0.86328125, "learning_rate": 7.95930595128859e-05, "loss": 2.0261, "num_input_tokens_seen": 930742272, "step": 789 }, { "epoch": 0.022413589713341984, "grad_norm": 0.984375, "learning_rate": 7.959083656055872e-05, "loss": 1.9831, "num_input_tokens_seen": 931921920, "step": 790 }, { "epoch": 0.022531555974991152, "grad_norm": 1.5078125, "learning_rate": 7.958860758442289e-05, "loss": 2.0684, "num_input_tokens_seen": 933101568, "step": 791 }, { "epoch": 0.02264952223664032, "grad_norm": 0.85546875, "learning_rate": 7.958637258481755e-05, "loss": 2.114, "num_input_tokens_seen": 934281216, "step": 792 }, { "epoch": 0.02276748849828949, "grad_norm": 0.80859375, "learning_rate": 7.958413156208275e-05, "loss": 2.0545, "num_input_tokens_seen": 935460864, "step": 793 }, { "epoch": 0.022885454759938657, "grad_norm": 0.8046875, "learning_rate": 7.958188451655949e-05, "loss": 2.1056, "num_input_tokens_seen": 936640512, "step": 794 }, { "epoch": 0.023003421021587825, "grad_norm": 0.8359375, "learning_rate": 7.957963144858964e-05, "loss": 1.9934, "num_input_tokens_seen": 937820160, "step": 795 }, { "epoch": 0.023121387283236993, "grad_norm": 0.82421875, "learning_rate": 7.957737235851602e-05, "loss": 2.0787, "num_input_tokens_seen": 938999808, "step": 796 }, { "epoch": 0.02323935354488616, "grad_norm": 0.76171875, "learning_rate": 7.957510724668234e-05, "loss": 2.0125, "num_input_tokens_seen": 940179456, "step": 797 }, { "epoch": 0.02335731980653533, "grad_norm": 0.7734375, "learning_rate": 7.957283611343325e-05, "loss": 1.9983, "num_input_tokens_seen": 941359104, "step": 798 }, { "epoch": 0.023475286068184498, "grad_norm": 0.73046875, "learning_rate": 7.95705589591143e-05, "loss": 2.0102, "num_input_tokens_seen": 942538752, "step": 799 }, { "epoch": 0.023593252329833666, "grad_norm": 0.921875, "learning_rate": 7.956827578407198e-05, "loss": 2.0244, "num_input_tokens_seen": 943718400, "step": 800 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.2881267070770264, "eval_wikipedia_runtime": 163.2839, "eval_wikipedia_samples_per_second": 4.299, "eval_wikipedia_steps_per_second": 0.184, "num_input_tokens_seen": 943718400, "step": 800 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.0411834716796875, "eval_toxicity_runtime": 0.9512, "eval_toxicity_samples_per_second": 2.103, "eval_toxicity_steps_per_second": 1.051, "num_input_tokens_seen": 943718400, "step": 800 }, { "epoch": 0.023711218591482838, "grad_norm": 0.80859375, "learning_rate": 7.956598658865366e-05, "loss": 2.0013, "num_input_tokens_seen": 944898048, "step": 801 }, { "epoch": 0.023829184853132006, "grad_norm": 0.8203125, "learning_rate": 7.956369137320765e-05, "loss": 2.1992, "num_input_tokens_seen": 946077696, "step": 802 }, { "epoch": 0.023947151114781174, "grad_norm": 0.95703125, "learning_rate": 7.956139013808317e-05, "loss": 1.9885, "num_input_tokens_seen": 947257344, "step": 803 }, { "epoch": 0.024065117376430342, "grad_norm": 1.515625, "learning_rate": 7.955908288363037e-05, "loss": 2.0842, "num_input_tokens_seen": 948436992, "step": 804 }, { "epoch": 0.02418308363807951, "grad_norm": 1.0859375, "learning_rate": 7.955676961020027e-05, "loss": 2.056, "num_input_tokens_seen": 949616640, "step": 805 }, { "epoch": 0.02430104989972868, "grad_norm": 0.7578125, "learning_rate": 7.955445031814487e-05, "loss": 1.9946, "num_input_tokens_seen": 950796288, "step": 806 }, { "epoch": 0.024419016161377847, "grad_norm": 1.078125, "learning_rate": 7.955212500781705e-05, "loss": 2.0638, "num_input_tokens_seen": 951975936, "step": 807 }, { "epoch": 0.024536982423027015, "grad_norm": 1.3515625, "learning_rate": 7.954979367957059e-05, "loss": 2.0086, "num_input_tokens_seen": 953155584, "step": 808 }, { "epoch": 0.024654948684676183, "grad_norm": 0.90625, "learning_rate": 7.954745633376022e-05, "loss": 1.9927, "num_input_tokens_seen": 954335232, "step": 809 }, { "epoch": 0.02477291494632535, "grad_norm": 1.0390625, "learning_rate": 7.954511297074158e-05, "loss": 2.074, "num_input_tokens_seen": 955514880, "step": 810 }, { "epoch": 0.02489088120797452, "grad_norm": 1.15625, "learning_rate": 7.95427635908712e-05, "loss": 2.0084, "num_input_tokens_seen": 956694528, "step": 811 }, { "epoch": 0.025008847469623688, "grad_norm": 0.8984375, "learning_rate": 7.954040819450654e-05, "loss": 2.0414, "num_input_tokens_seen": 957874176, "step": 812 }, { "epoch": 0.025126813731272856, "grad_norm": 1.0078125, "learning_rate": 7.953804678200599e-05, "loss": 2.0282, "num_input_tokens_seen": 959053824, "step": 813 }, { "epoch": 0.025244779992922024, "grad_norm": 1.078125, "learning_rate": 7.953567935372883e-05, "loss": 2.0436, "num_input_tokens_seen": 960233472, "step": 814 }, { "epoch": 0.025362746254571193, "grad_norm": 0.75, "learning_rate": 7.953330591003528e-05, "loss": 2.0458, "num_input_tokens_seen": 961413120, "step": 815 }, { "epoch": 0.02548071251622036, "grad_norm": 0.94140625, "learning_rate": 7.953092645128646e-05, "loss": 1.9329, "num_input_tokens_seen": 962592768, "step": 816 }, { "epoch": 0.02559867877786953, "grad_norm": 1.0703125, "learning_rate": 7.95285409778444e-05, "loss": 1.9266, "num_input_tokens_seen": 963772416, "step": 817 }, { "epoch": 0.025716645039518697, "grad_norm": 0.96484375, "learning_rate": 7.952614949007208e-05, "loss": 1.9903, "num_input_tokens_seen": 964952064, "step": 818 }, { "epoch": 0.025834611301167865, "grad_norm": 0.765625, "learning_rate": 7.952375198833333e-05, "loss": 2.1298, "num_input_tokens_seen": 966131712, "step": 819 }, { "epoch": 0.025952577562817034, "grad_norm": 0.8671875, "learning_rate": 7.952134847299296e-05, "loss": 2.0372, "num_input_tokens_seen": 967311360, "step": 820 }, { "epoch": 0.026070543824466202, "grad_norm": 0.90625, "learning_rate": 7.951893894441665e-05, "loss": 1.9484, "num_input_tokens_seen": 968491008, "step": 821 }, { "epoch": 0.02618851008611537, "grad_norm": 0.87109375, "learning_rate": 7.951652340297105e-05, "loss": 1.9851, "num_input_tokens_seen": 969670656, "step": 822 }, { "epoch": 0.026306476347764538, "grad_norm": 0.8046875, "learning_rate": 7.951410184902364e-05, "loss": 2.0184, "num_input_tokens_seen": 970850304, "step": 823 }, { "epoch": 0.026424442609413706, "grad_norm": 0.7578125, "learning_rate": 7.951167428294288e-05, "loss": 1.9319, "num_input_tokens_seen": 972029952, "step": 824 }, { "epoch": 0.026542408871062875, "grad_norm": 0.71875, "learning_rate": 7.950924070509816e-05, "loss": 2.0554, "num_input_tokens_seen": 973209600, "step": 825 }, { "epoch": 0.026660375132712043, "grad_norm": 0.76953125, "learning_rate": 7.950680111585972e-05, "loss": 2.0083, "num_input_tokens_seen": 974389248, "step": 826 }, { "epoch": 0.02677834139436121, "grad_norm": 0.74609375, "learning_rate": 7.950435551559875e-05, "loss": 2.1506, "num_input_tokens_seen": 975568896, "step": 827 }, { "epoch": 0.026896307656010383, "grad_norm": 0.796875, "learning_rate": 7.950190390468734e-05, "loss": 2.0305, "num_input_tokens_seen": 976748544, "step": 828 }, { "epoch": 0.02701427391765955, "grad_norm": 0.8515625, "learning_rate": 7.949944628349855e-05, "loss": 2.0154, "num_input_tokens_seen": 977928192, "step": 829 }, { "epoch": 0.02713224017930872, "grad_norm": 0.796875, "learning_rate": 7.949698265240626e-05, "loss": 1.918, "num_input_tokens_seen": 979107840, "step": 830 }, { "epoch": 0.027250206440957887, "grad_norm": 0.875, "learning_rate": 7.949451301178536e-05, "loss": 1.9816, "num_input_tokens_seen": 980287488, "step": 831 }, { "epoch": 0.027368172702607056, "grad_norm": 0.80859375, "learning_rate": 7.949203736201159e-05, "loss": 2.0066, "num_input_tokens_seen": 981467136, "step": 832 }, { "epoch": 0.027486138964256224, "grad_norm": 0.91015625, "learning_rate": 7.948955570346161e-05, "loss": 1.9281, "num_input_tokens_seen": 982646784, "step": 833 }, { "epoch": 0.027604105225905392, "grad_norm": 0.8125, "learning_rate": 7.948706803651302e-05, "loss": 1.9352, "num_input_tokens_seen": 983826432, "step": 834 }, { "epoch": 0.02772207148755456, "grad_norm": 1.0078125, "learning_rate": 7.948457436154434e-05, "loss": 1.9934, "num_input_tokens_seen": 985006080, "step": 835 }, { "epoch": 0.02784003774920373, "grad_norm": 0.8359375, "learning_rate": 7.948207467893496e-05, "loss": 2.0736, "num_input_tokens_seen": 986185728, "step": 836 }, { "epoch": 0.027958004010852897, "grad_norm": 0.859375, "learning_rate": 7.947956898906523e-05, "loss": 2.0934, "num_input_tokens_seen": 987365376, "step": 837 }, { "epoch": 0.028075970272502065, "grad_norm": 0.875, "learning_rate": 7.947705729231637e-05, "loss": 2.0025, "num_input_tokens_seen": 988545024, "step": 838 }, { "epoch": 0.028193936534151233, "grad_norm": 0.7734375, "learning_rate": 7.947453958907057e-05, "loss": 1.964, "num_input_tokens_seen": 989724672, "step": 839 }, { "epoch": 0.0283119027958004, "grad_norm": 0.7734375, "learning_rate": 7.947201587971088e-05, "loss": 2.0628, "num_input_tokens_seen": 990904320, "step": 840 }, { "epoch": 0.02842986905744957, "grad_norm": 0.88671875, "learning_rate": 7.946948616462129e-05, "loss": 1.9837, "num_input_tokens_seen": 992083968, "step": 841 }, { "epoch": 0.028547835319098738, "grad_norm": 1.09375, "learning_rate": 7.94669504441867e-05, "loss": 1.942, "num_input_tokens_seen": 993263616, "step": 842 }, { "epoch": 0.028665801580747906, "grad_norm": 1.1640625, "learning_rate": 7.946440871879295e-05, "loss": 1.9283, "num_input_tokens_seen": 994443264, "step": 843 }, { "epoch": 0.028783767842397074, "grad_norm": 0.8515625, "learning_rate": 7.946186098882672e-05, "loss": 1.969, "num_input_tokens_seen": 995622912, "step": 844 }, { "epoch": 0.028901734104046242, "grad_norm": 0.7578125, "learning_rate": 7.94593072546757e-05, "loss": 1.9535, "num_input_tokens_seen": 996802560, "step": 845 }, { "epoch": 0.02901970036569541, "grad_norm": 0.83984375, "learning_rate": 7.94567475167284e-05, "loss": 2.0129, "num_input_tokens_seen": 997982208, "step": 846 }, { "epoch": 0.02913766662734458, "grad_norm": 0.94921875, "learning_rate": 7.945418177537432e-05, "loss": 1.9015, "num_input_tokens_seen": 999161856, "step": 847 }, { "epoch": 0.029255632888993747, "grad_norm": 1.4921875, "learning_rate": 7.945161003100383e-05, "loss": 1.9187, "num_input_tokens_seen": 1000341504, "step": 848 }, { "epoch": 0.029373599150642915, "grad_norm": 1.0, "learning_rate": 7.944903228400822e-05, "loss": 1.9865, "num_input_tokens_seen": 1001521152, "step": 849 }, { "epoch": 0.029491565412292083, "grad_norm": 0.703125, "learning_rate": 7.944644853477972e-05, "loss": 2.0223, "num_input_tokens_seen": 1002700800, "step": 850 }, { "epoch": 0.02960953167394125, "grad_norm": 0.84375, "learning_rate": 7.944385878371143e-05, "loss": 1.9971, "num_input_tokens_seen": 1003880448, "step": 851 }, { "epoch": 0.02972749793559042, "grad_norm": 0.8046875, "learning_rate": 7.944126303119738e-05, "loss": 1.9669, "num_input_tokens_seen": 1005060096, "step": 852 }, { "epoch": 0.029845464197239588, "grad_norm": 0.7421875, "learning_rate": 7.943866127763253e-05, "loss": 1.9804, "num_input_tokens_seen": 1006239744, "step": 853 }, { "epoch": 0.029963430458888756, "grad_norm": 0.98046875, "learning_rate": 7.943605352341274e-05, "loss": 1.8757, "num_input_tokens_seen": 1007419392, "step": 854 }, { "epoch": 0.030081396720537928, "grad_norm": 1.046875, "learning_rate": 7.943343976893479e-05, "loss": 1.9259, "num_input_tokens_seen": 1008599040, "step": 855 }, { "epoch": 0.030199362982187096, "grad_norm": 0.90234375, "learning_rate": 7.943082001459636e-05, "loss": 2.0436, "num_input_tokens_seen": 1009778688, "step": 856 }, { "epoch": 0.030317329243836264, "grad_norm": 0.8125, "learning_rate": 7.942819426079605e-05, "loss": 1.987, "num_input_tokens_seen": 1010958336, "step": 857 }, { "epoch": 0.030435295505485432, "grad_norm": 0.75390625, "learning_rate": 7.942556250793337e-05, "loss": 2.1196, "num_input_tokens_seen": 1012137984, "step": 858 }, { "epoch": 0.0305532617671346, "grad_norm": 0.76953125, "learning_rate": 7.942292475640875e-05, "loss": 1.9059, "num_input_tokens_seen": 1013317632, "step": 859 }, { "epoch": 0.03067122802878377, "grad_norm": 0.82421875, "learning_rate": 7.942028100662351e-05, "loss": 2.0198, "num_input_tokens_seen": 1014497280, "step": 860 }, { "epoch": 0.030789194290432937, "grad_norm": 0.83984375, "learning_rate": 7.941763125897995e-05, "loss": 1.9257, "num_input_tokens_seen": 1015676928, "step": 861 }, { "epoch": 0.030907160552082105, "grad_norm": 0.9375, "learning_rate": 7.941497551388116e-05, "loss": 2.0592, "num_input_tokens_seen": 1016856576, "step": 862 }, { "epoch": 0.031025126813731273, "grad_norm": 1.1796875, "learning_rate": 7.941231377173129e-05, "loss": 1.9876, "num_input_tokens_seen": 1018036224, "step": 863 }, { "epoch": 0.03114309307538044, "grad_norm": 1.2734375, "learning_rate": 7.940964603293528e-05, "loss": 2.0618, "num_input_tokens_seen": 1019215872, "step": 864 }, { "epoch": 0.031261059337029606, "grad_norm": 0.9609375, "learning_rate": 7.940697229789903e-05, "loss": 1.9937, "num_input_tokens_seen": 1020395520, "step": 865 }, { "epoch": 0.03137902559867878, "grad_norm": 0.75, "learning_rate": 7.940429256702939e-05, "loss": 1.9126, "num_input_tokens_seen": 1021575168, "step": 866 }, { "epoch": 0.03149699186032794, "grad_norm": 0.92578125, "learning_rate": 7.940160684073406e-05, "loss": 1.97, "num_input_tokens_seen": 1022754816, "step": 867 }, { "epoch": 0.031614958121977114, "grad_norm": 1.1640625, "learning_rate": 7.939891511942168e-05, "loss": 1.9342, "num_input_tokens_seen": 1023934464, "step": 868 }, { "epoch": 0.031732924383626286, "grad_norm": 0.94921875, "learning_rate": 7.939621740350179e-05, "loss": 1.9973, "num_input_tokens_seen": 1025114112, "step": 869 }, { "epoch": 0.03185089064527545, "grad_norm": 0.91015625, "learning_rate": 7.939351369338487e-05, "loss": 1.9705, "num_input_tokens_seen": 1026293760, "step": 870 }, { "epoch": 0.03196885690692462, "grad_norm": 0.75, "learning_rate": 7.939080398948229e-05, "loss": 1.9954, "num_input_tokens_seen": 1027473408, "step": 871 }, { "epoch": 0.03208682316857379, "grad_norm": 0.96484375, "learning_rate": 7.938808829220632e-05, "loss": 2.0211, "num_input_tokens_seen": 1028653056, "step": 872 }, { "epoch": 0.03220478943022296, "grad_norm": 1.2265625, "learning_rate": 7.938536660197018e-05, "loss": 1.9562, "num_input_tokens_seen": 1029832704, "step": 873 }, { "epoch": 0.032322755691872124, "grad_norm": 1.1328125, "learning_rate": 7.938263891918796e-05, "loss": 1.927, "num_input_tokens_seen": 1031012352, "step": 874 }, { "epoch": 0.032440721953521295, "grad_norm": 0.83203125, "learning_rate": 7.93799052442747e-05, "loss": 2.042, "num_input_tokens_seen": 1032192000, "step": 875 }, { "epoch": 0.03255868821517046, "grad_norm": 1.1328125, "learning_rate": 7.937716557764632e-05, "loss": 1.9994, "num_input_tokens_seen": 1033371648, "step": 876 }, { "epoch": 0.03267665447681963, "grad_norm": 1.0703125, "learning_rate": 7.937441991971966e-05, "loss": 1.9857, "num_input_tokens_seen": 1034551296, "step": 877 }, { "epoch": 0.0327946207384688, "grad_norm": 0.84375, "learning_rate": 7.93716682709125e-05, "loss": 1.9931, "num_input_tokens_seen": 1035730944, "step": 878 }, { "epoch": 0.03291258700011797, "grad_norm": 0.8046875, "learning_rate": 7.936891063164346e-05, "loss": 1.8531, "num_input_tokens_seen": 1036910592, "step": 879 }, { "epoch": 0.03303055326176713, "grad_norm": 0.875, "learning_rate": 7.936614700233218e-05, "loss": 1.8975, "num_input_tokens_seen": 1038090240, "step": 880 }, { "epoch": 0.033148519523416305, "grad_norm": 0.72265625, "learning_rate": 7.936337738339911e-05, "loss": 1.9755, "num_input_tokens_seen": 1039269888, "step": 881 }, { "epoch": 0.03326648578506547, "grad_norm": 0.79296875, "learning_rate": 7.936060177526567e-05, "loss": 1.9892, "num_input_tokens_seen": 1040449536, "step": 882 }, { "epoch": 0.03338445204671464, "grad_norm": 0.91796875, "learning_rate": 7.935782017835417e-05, "loss": 1.8774, "num_input_tokens_seen": 1041629184, "step": 883 }, { "epoch": 0.033502418308363806, "grad_norm": 0.98828125, "learning_rate": 7.935503259308782e-05, "loss": 1.8458, "num_input_tokens_seen": 1042808832, "step": 884 }, { "epoch": 0.03362038457001298, "grad_norm": 0.89453125, "learning_rate": 7.935223901989078e-05, "loss": 1.9332, "num_input_tokens_seen": 1043988480, "step": 885 }, { "epoch": 0.03373835083166214, "grad_norm": 0.87109375, "learning_rate": 7.934943945918806e-05, "loss": 1.9231, "num_input_tokens_seen": 1045168128, "step": 886 }, { "epoch": 0.033856317093311314, "grad_norm": 1.421875, "learning_rate": 7.934663391140566e-05, "loss": 1.9749, "num_input_tokens_seen": 1046347776, "step": 887 }, { "epoch": 0.03397428335496048, "grad_norm": 0.8046875, "learning_rate": 7.934382237697043e-05, "loss": 1.9468, "num_input_tokens_seen": 1047527424, "step": 888 }, { "epoch": 0.03409224961660965, "grad_norm": 0.90234375, "learning_rate": 7.934100485631014e-05, "loss": 1.9363, "num_input_tokens_seen": 1048707072, "step": 889 }, { "epoch": 0.034210215878258815, "grad_norm": 0.85546875, "learning_rate": 7.933818134985349e-05, "loss": 1.9687, "num_input_tokens_seen": 1049886720, "step": 890 }, { "epoch": 0.03432818213990799, "grad_norm": 0.8125, "learning_rate": 7.933535185803008e-05, "loss": 2.0288, "num_input_tokens_seen": 1051066368, "step": 891 }, { "epoch": 0.03444614840155715, "grad_norm": 0.8125, "learning_rate": 7.933251638127042e-05, "loss": 1.9486, "num_input_tokens_seen": 1052246016, "step": 892 }, { "epoch": 0.03456411466320632, "grad_norm": 0.84375, "learning_rate": 7.932967492000594e-05, "loss": 1.8646, "num_input_tokens_seen": 1053425664, "step": 893 }, { "epoch": 0.03468208092485549, "grad_norm": 0.8515625, "learning_rate": 7.932682747466896e-05, "loss": 1.9055, "num_input_tokens_seen": 1054605312, "step": 894 }, { "epoch": 0.03480004718650466, "grad_norm": 0.8203125, "learning_rate": 7.932397404569274e-05, "loss": 1.9025, "num_input_tokens_seen": 1055784960, "step": 895 }, { "epoch": 0.03491801344815383, "grad_norm": 0.75390625, "learning_rate": 7.932111463351142e-05, "loss": 1.9739, "num_input_tokens_seen": 1056964608, "step": 896 }, { "epoch": 0.035035979709802996, "grad_norm": 0.73828125, "learning_rate": 7.931824923856006e-05, "loss": 2.0355, "num_input_tokens_seen": 1058144256, "step": 897 }, { "epoch": 0.03515394597145217, "grad_norm": 0.94921875, "learning_rate": 7.931537786127464e-05, "loss": 2.0128, "num_input_tokens_seen": 1059323904, "step": 898 }, { "epoch": 0.03527191223310133, "grad_norm": 0.7578125, "learning_rate": 7.931250050209206e-05, "loss": 1.9691, "num_input_tokens_seen": 1060503552, "step": 899 }, { "epoch": 0.035389878494750504, "grad_norm": 0.74609375, "learning_rate": 7.93096171614501e-05, "loss": 2.0682, "num_input_tokens_seen": 1061683200, "step": 900 }, { "epoch": 0.03550784475639967, "grad_norm": 0.83203125, "learning_rate": 7.930672783978745e-05, "loss": 1.9375, "num_input_tokens_seen": 1062862848, "step": 901 }, { "epoch": 0.03562581101804884, "grad_norm": 1.1015625, "learning_rate": 7.930383253754376e-05, "loss": 1.945, "num_input_tokens_seen": 1064042496, "step": 902 }, { "epoch": 0.035743777279698005, "grad_norm": 1.140625, "learning_rate": 7.930093125515954e-05, "loss": 1.9439, "num_input_tokens_seen": 1065222144, "step": 903 }, { "epoch": 0.03586174354134718, "grad_norm": 0.984375, "learning_rate": 7.92980239930762e-05, "loss": 1.8896, "num_input_tokens_seen": 1066401792, "step": 904 }, { "epoch": 0.03597970980299634, "grad_norm": 1.03125, "learning_rate": 7.929511075173612e-05, "loss": 1.9784, "num_input_tokens_seen": 1067581440, "step": 905 }, { "epoch": 0.03609767606464551, "grad_norm": 0.84765625, "learning_rate": 7.929219153158253e-05, "loss": 2.0275, "num_input_tokens_seen": 1068761088, "step": 906 }, { "epoch": 0.03621564232629468, "grad_norm": 0.95703125, "learning_rate": 7.928926633305962e-05, "loss": 1.822, "num_input_tokens_seen": 1069940736, "step": 907 }, { "epoch": 0.03633360858794385, "grad_norm": 1.015625, "learning_rate": 7.928633515661242e-05, "loss": 1.9427, "num_input_tokens_seen": 1071120384, "step": 908 }, { "epoch": 0.036451574849593014, "grad_norm": 1.0859375, "learning_rate": 7.928339800268697e-05, "loss": 1.9247, "num_input_tokens_seen": 1072300032, "step": 909 }, { "epoch": 0.036569541111242186, "grad_norm": 0.9921875, "learning_rate": 7.928045487173013e-05, "loss": 1.8571, "num_input_tokens_seen": 1073479680, "step": 910 }, { "epoch": 0.03668750737289135, "grad_norm": 0.875, "learning_rate": 7.927750576418968e-05, "loss": 1.987, "num_input_tokens_seen": 1074659328, "step": 911 }, { "epoch": 0.03680547363454052, "grad_norm": 0.9296875, "learning_rate": 7.927455068051437e-05, "loss": 2.0172, "num_input_tokens_seen": 1075838976, "step": 912 }, { "epoch": 0.03692343989618969, "grad_norm": 0.78125, "learning_rate": 7.927158962115382e-05, "loss": 1.9209, "num_input_tokens_seen": 1077018624, "step": 913 }, { "epoch": 0.03704140615783886, "grad_norm": 0.9140625, "learning_rate": 7.926862258655853e-05, "loss": 1.8831, "num_input_tokens_seen": 1078198272, "step": 914 }, { "epoch": 0.037159372419488024, "grad_norm": 0.91015625, "learning_rate": 7.926564957717998e-05, "loss": 1.8794, "num_input_tokens_seen": 1079377920, "step": 915 }, { "epoch": 0.037277338681137195, "grad_norm": 0.8828125, "learning_rate": 7.926267059347046e-05, "loss": 1.8865, "num_input_tokens_seen": 1080557568, "step": 916 }, { "epoch": 0.03739530494278636, "grad_norm": 0.9140625, "learning_rate": 7.925968563588328e-05, "loss": 1.83, "num_input_tokens_seen": 1081737216, "step": 917 }, { "epoch": 0.03751327120443553, "grad_norm": 0.92578125, "learning_rate": 7.925669470487258e-05, "loss": 1.9207, "num_input_tokens_seen": 1082916864, "step": 918 }, { "epoch": 0.0376312374660847, "grad_norm": 1.0078125, "learning_rate": 7.925369780089345e-05, "loss": 1.9191, "num_input_tokens_seen": 1084096512, "step": 919 }, { "epoch": 0.03774920372773387, "grad_norm": 0.79296875, "learning_rate": 7.925069492440188e-05, "loss": 1.8717, "num_input_tokens_seen": 1085276160, "step": 920 }, { "epoch": 0.03786716998938304, "grad_norm": 0.83203125, "learning_rate": 7.924768607585472e-05, "loss": 1.9212, "num_input_tokens_seen": 1086455808, "step": 921 }, { "epoch": 0.037985136251032205, "grad_norm": 0.765625, "learning_rate": 7.924467125570982e-05, "loss": 1.9191, "num_input_tokens_seen": 1087635456, "step": 922 }, { "epoch": 0.038103102512681376, "grad_norm": 0.91796875, "learning_rate": 7.924165046442586e-05, "loss": 1.9994, "num_input_tokens_seen": 1088815104, "step": 923 }, { "epoch": 0.03822106877433054, "grad_norm": 0.75390625, "learning_rate": 7.923862370246247e-05, "loss": 1.9119, "num_input_tokens_seen": 1089994752, "step": 924 }, { "epoch": 0.03833903503597971, "grad_norm": 0.78125, "learning_rate": 7.923559097028017e-05, "loss": 1.9034, "num_input_tokens_seen": 1091174400, "step": 925 }, { "epoch": 0.03845700129762888, "grad_norm": 0.90234375, "learning_rate": 7.92325522683404e-05, "loss": 1.9324, "num_input_tokens_seen": 1092354048, "step": 926 }, { "epoch": 0.03857496755927805, "grad_norm": 0.765625, "learning_rate": 7.92295075971055e-05, "loss": 1.9267, "num_input_tokens_seen": 1093533696, "step": 927 }, { "epoch": 0.038692933820927214, "grad_norm": 0.828125, "learning_rate": 7.922645695703873e-05, "loss": 1.9072, "num_input_tokens_seen": 1094713344, "step": 928 }, { "epoch": 0.038810900082576386, "grad_norm": 0.9609375, "learning_rate": 7.922340034860424e-05, "loss": 1.9171, "num_input_tokens_seen": 1095892992, "step": 929 }, { "epoch": 0.03892886634422555, "grad_norm": 0.9296875, "learning_rate": 7.922033777226711e-05, "loss": 1.9862, "num_input_tokens_seen": 1097072640, "step": 930 }, { "epoch": 0.03904683260587472, "grad_norm": 0.95703125, "learning_rate": 7.92172692284933e-05, "loss": 1.9236, "num_input_tokens_seen": 1098252288, "step": 931 }, { "epoch": 0.03916479886752389, "grad_norm": 0.7890625, "learning_rate": 7.92141947177497e-05, "loss": 1.9795, "num_input_tokens_seen": 1099431936, "step": 932 }, { "epoch": 0.03928276512917306, "grad_norm": 0.875, "learning_rate": 7.92111142405041e-05, "loss": 1.9381, "num_input_tokens_seen": 1100611584, "step": 933 }, { "epoch": 0.03940073139082222, "grad_norm": 0.8046875, "learning_rate": 7.92080277972252e-05, "loss": 1.8348, "num_input_tokens_seen": 1101791232, "step": 934 }, { "epoch": 0.039518697652471395, "grad_norm": 0.8203125, "learning_rate": 7.920493538838262e-05, "loss": 1.8595, "num_input_tokens_seen": 1102970880, "step": 935 }, { "epoch": 0.03963666391412056, "grad_norm": 0.921875, "learning_rate": 7.920183701444686e-05, "loss": 1.8674, "num_input_tokens_seen": 1104150528, "step": 936 }, { "epoch": 0.03975463017576973, "grad_norm": 0.88671875, "learning_rate": 7.919873267588936e-05, "loss": 1.8276, "num_input_tokens_seen": 1105330176, "step": 937 }, { "epoch": 0.039872596437418896, "grad_norm": 0.78515625, "learning_rate": 7.919562237318243e-05, "loss": 1.8478, "num_input_tokens_seen": 1106509824, "step": 938 }, { "epoch": 0.03999056269906807, "grad_norm": 0.765625, "learning_rate": 7.91925061067993e-05, "loss": 1.9069, "num_input_tokens_seen": 1107689472, "step": 939 }, { "epoch": 0.04010852896071723, "grad_norm": 0.79296875, "learning_rate": 7.918938387721413e-05, "loss": 1.8889, "num_input_tokens_seen": 1108869120, "step": 940 }, { "epoch": 0.040226495222366404, "grad_norm": 0.828125, "learning_rate": 7.918625568490199e-05, "loss": 1.9663, "num_input_tokens_seen": 1110048768, "step": 941 }, { "epoch": 0.04034446148401557, "grad_norm": 0.953125, "learning_rate": 7.918312153033882e-05, "loss": 1.9711, "num_input_tokens_seen": 1111228416, "step": 942 }, { "epoch": 0.04046242774566474, "grad_norm": 1.046875, "learning_rate": 7.917998141400147e-05, "loss": 1.9236, "num_input_tokens_seen": 1112408064, "step": 943 }, { "epoch": 0.040580394007313905, "grad_norm": 1.0703125, "learning_rate": 7.917683533636773e-05, "loss": 1.9647, "num_input_tokens_seen": 1113587712, "step": 944 }, { "epoch": 0.04069836026896308, "grad_norm": 0.97265625, "learning_rate": 7.91736832979163e-05, "loss": 1.9309, "num_input_tokens_seen": 1114767360, "step": 945 }, { "epoch": 0.04081632653061224, "grad_norm": 0.81640625, "learning_rate": 7.917052529912675e-05, "loss": 2.0155, "num_input_tokens_seen": 1115947008, "step": 946 }, { "epoch": 0.04093429279226141, "grad_norm": 0.8359375, "learning_rate": 7.916736134047956e-05, "loss": 1.8469, "num_input_tokens_seen": 1117126656, "step": 947 }, { "epoch": 0.041052259053910585, "grad_norm": 0.890625, "learning_rate": 7.916419142245615e-05, "loss": 1.9435, "num_input_tokens_seen": 1118306304, "step": 948 }, { "epoch": 0.04117022531555975, "grad_norm": 1.15625, "learning_rate": 7.916101554553882e-05, "loss": 1.8697, "num_input_tokens_seen": 1119485952, "step": 949 }, { "epoch": 0.04128819157720892, "grad_norm": 0.95703125, "learning_rate": 7.91578337102108e-05, "loss": 1.979, "num_input_tokens_seen": 1120665600, "step": 950 }, { "epoch": 0.041406157838858086, "grad_norm": 0.71875, "learning_rate": 7.91546459169562e-05, "loss": 1.9996, "num_input_tokens_seen": 1121845248, "step": 951 }, { "epoch": 0.04152412410050726, "grad_norm": 0.734375, "learning_rate": 7.915145216626004e-05, "loss": 1.9028, "num_input_tokens_seen": 1123024896, "step": 952 }, { "epoch": 0.04164209036215642, "grad_norm": 0.77734375, "learning_rate": 7.914825245860827e-05, "loss": 1.9244, "num_input_tokens_seen": 1124204544, "step": 953 }, { "epoch": 0.041760056623805594, "grad_norm": 0.79296875, "learning_rate": 7.914504679448771e-05, "loss": 2.0091, "num_input_tokens_seen": 1125384192, "step": 954 }, { "epoch": 0.04187802288545476, "grad_norm": 0.7109375, "learning_rate": 7.914183517438612e-05, "loss": 1.881, "num_input_tokens_seen": 1126563840, "step": 955 }, { "epoch": 0.04199598914710393, "grad_norm": 1.03125, "learning_rate": 7.913861759879215e-05, "loss": 1.9797, "num_input_tokens_seen": 1127743488, "step": 956 }, { "epoch": 0.042113955408753095, "grad_norm": 1.0078125, "learning_rate": 7.913539406819537e-05, "loss": 1.8792, "num_input_tokens_seen": 1128923136, "step": 957 }, { "epoch": 0.04223192167040227, "grad_norm": 0.921875, "learning_rate": 7.913216458308624e-05, "loss": 1.8002, "num_input_tokens_seen": 1130102784, "step": 958 }, { "epoch": 0.04234988793205143, "grad_norm": 1.0703125, "learning_rate": 7.912892914395612e-05, "loss": 1.9734, "num_input_tokens_seen": 1131282432, "step": 959 }, { "epoch": 0.042467854193700603, "grad_norm": 1.0078125, "learning_rate": 7.91256877512973e-05, "loss": 1.8323, "num_input_tokens_seen": 1132462080, "step": 960 }, { "epoch": 0.04258582045534977, "grad_norm": 0.87109375, "learning_rate": 7.912244040560296e-05, "loss": 1.8318, "num_input_tokens_seen": 1133641728, "step": 961 }, { "epoch": 0.04270378671699894, "grad_norm": 0.87890625, "learning_rate": 7.911918710736718e-05, "loss": 1.8872, "num_input_tokens_seen": 1134821376, "step": 962 }, { "epoch": 0.042821752978648105, "grad_norm": 1.0078125, "learning_rate": 7.911592785708496e-05, "loss": 1.9225, "num_input_tokens_seen": 1136001024, "step": 963 }, { "epoch": 0.042939719240297276, "grad_norm": 1.1015625, "learning_rate": 7.911266265525221e-05, "loss": 1.7839, "num_input_tokens_seen": 1137180672, "step": 964 }, { "epoch": 0.04305768550194644, "grad_norm": 0.88671875, "learning_rate": 7.910939150236573e-05, "loss": 1.8849, "num_input_tokens_seen": 1138360320, "step": 965 }, { "epoch": 0.04317565176359561, "grad_norm": 1.3125, "learning_rate": 7.910611439892322e-05, "loss": 1.8538, "num_input_tokens_seen": 1139539968, "step": 966 }, { "epoch": 0.04329361802524478, "grad_norm": 1.1171875, "learning_rate": 7.91028313454233e-05, "loss": 1.8658, "num_input_tokens_seen": 1140719616, "step": 967 }, { "epoch": 0.04341158428689395, "grad_norm": 0.89453125, "learning_rate": 7.909954234236551e-05, "loss": 1.859, "num_input_tokens_seen": 1141899264, "step": 968 }, { "epoch": 0.043529550548543114, "grad_norm": 0.890625, "learning_rate": 7.909624739025026e-05, "loss": 1.8266, "num_input_tokens_seen": 1143078912, "step": 969 }, { "epoch": 0.043647516810192286, "grad_norm": 0.87109375, "learning_rate": 7.909294648957889e-05, "loss": 1.819, "num_input_tokens_seen": 1144258560, "step": 970 }, { "epoch": 0.04376548307184145, "grad_norm": 1.1875, "learning_rate": 7.908963964085362e-05, "loss": 1.8053, "num_input_tokens_seen": 1145438208, "step": 971 }, { "epoch": 0.04388344933349062, "grad_norm": 0.92578125, "learning_rate": 7.908632684457762e-05, "loss": 1.8585, "num_input_tokens_seen": 1146617856, "step": 972 }, { "epoch": 0.04400141559513979, "grad_norm": 0.80859375, "learning_rate": 7.908300810125491e-05, "loss": 1.8612, "num_input_tokens_seen": 1147797504, "step": 973 }, { "epoch": 0.04411938185678896, "grad_norm": 0.953125, "learning_rate": 7.907968341139046e-05, "loss": 1.7937, "num_input_tokens_seen": 1148977152, "step": 974 }, { "epoch": 0.04423734811843813, "grad_norm": 0.85546875, "learning_rate": 7.907635277549013e-05, "loss": 1.9481, "num_input_tokens_seen": 1150156800, "step": 975 }, { "epoch": 0.044355314380087295, "grad_norm": 0.85546875, "learning_rate": 7.907301619406067e-05, "loss": 2.0073, "num_input_tokens_seen": 1151336448, "step": 976 }, { "epoch": 0.044473280641736467, "grad_norm": 0.9375, "learning_rate": 7.906967366760974e-05, "loss": 1.8315, "num_input_tokens_seen": 1152516096, "step": 977 }, { "epoch": 0.04459124690338563, "grad_norm": 0.9453125, "learning_rate": 7.906632519664592e-05, "loss": 1.9134, "num_input_tokens_seen": 1153695744, "step": 978 }, { "epoch": 0.0447092131650348, "grad_norm": 1.15625, "learning_rate": 7.90629707816787e-05, "loss": 1.8331, "num_input_tokens_seen": 1154875392, "step": 979 }, { "epoch": 0.04482717942668397, "grad_norm": 1.1484375, "learning_rate": 7.905961042321842e-05, "loss": 1.8492, "num_input_tokens_seen": 1156055040, "step": 980 }, { "epoch": 0.04494514568833314, "grad_norm": 0.890625, "learning_rate": 7.905624412177642e-05, "loss": 1.9344, "num_input_tokens_seen": 1157234688, "step": 981 }, { "epoch": 0.045063111949982304, "grad_norm": 0.96484375, "learning_rate": 7.905287187786483e-05, "loss": 1.8134, "num_input_tokens_seen": 1158414336, "step": 982 }, { "epoch": 0.045181078211631476, "grad_norm": 1.03125, "learning_rate": 7.904949369199678e-05, "loss": 1.9497, "num_input_tokens_seen": 1159593984, "step": 983 }, { "epoch": 0.04529904447328064, "grad_norm": 0.953125, "learning_rate": 7.904610956468626e-05, "loss": 1.7974, "num_input_tokens_seen": 1160773632, "step": 984 }, { "epoch": 0.04541701073492981, "grad_norm": 0.8359375, "learning_rate": 7.904271949644816e-05, "loss": 1.8029, "num_input_tokens_seen": 1161953280, "step": 985 }, { "epoch": 0.04553497699657898, "grad_norm": 0.8046875, "learning_rate": 7.903932348779829e-05, "loss": 1.9024, "num_input_tokens_seen": 1163132928, "step": 986 }, { "epoch": 0.04565294325822815, "grad_norm": 1.1875, "learning_rate": 7.903592153925336e-05, "loss": 1.8734, "num_input_tokens_seen": 1164312576, "step": 987 }, { "epoch": 0.04577090951987731, "grad_norm": 0.78125, "learning_rate": 7.903251365133098e-05, "loss": 1.8746, "num_input_tokens_seen": 1165492224, "step": 988 }, { "epoch": 0.045888875781526485, "grad_norm": 0.79296875, "learning_rate": 7.902909982454966e-05, "loss": 1.9698, "num_input_tokens_seen": 1166671872, "step": 989 }, { "epoch": 0.04600684204317565, "grad_norm": 0.8359375, "learning_rate": 7.902568005942882e-05, "loss": 1.8322, "num_input_tokens_seen": 1167851520, "step": 990 }, { "epoch": 0.04612480830482482, "grad_norm": 0.88671875, "learning_rate": 7.902225435648881e-05, "loss": 1.8545, "num_input_tokens_seen": 1169031168, "step": 991 }, { "epoch": 0.046242774566473986, "grad_norm": 0.79296875, "learning_rate": 7.901882271625082e-05, "loss": 1.8321, "num_input_tokens_seen": 1170210816, "step": 992 }, { "epoch": 0.04636074082812316, "grad_norm": 0.83984375, "learning_rate": 7.901538513923699e-05, "loss": 1.8958, "num_input_tokens_seen": 1171390464, "step": 993 }, { "epoch": 0.04647870708977232, "grad_norm": 1.0078125, "learning_rate": 7.901194162597036e-05, "loss": 1.8725, "num_input_tokens_seen": 1172570112, "step": 994 }, { "epoch": 0.046596673351421494, "grad_norm": 0.85546875, "learning_rate": 7.900849217697486e-05, "loss": 1.9313, "num_input_tokens_seen": 1173749760, "step": 995 }, { "epoch": 0.04671463961307066, "grad_norm": 1.0078125, "learning_rate": 7.900503679277534e-05, "loss": 1.9648, "num_input_tokens_seen": 1174929408, "step": 996 }, { "epoch": 0.04683260587471983, "grad_norm": 0.7265625, "learning_rate": 7.900157547389752e-05, "loss": 2.073, "num_input_tokens_seen": 1176109056, "step": 997 }, { "epoch": 0.046950572136368995, "grad_norm": 0.98828125, "learning_rate": 7.899810822086806e-05, "loss": 1.8845, "num_input_tokens_seen": 1177288704, "step": 998 }, { "epoch": 0.04706853839801817, "grad_norm": 0.921875, "learning_rate": 7.899463503421451e-05, "loss": 1.8769, "num_input_tokens_seen": 1178468352, "step": 999 }, { "epoch": 0.04718650465966733, "grad_norm": 0.98046875, "learning_rate": 7.899115591446533e-05, "loss": 1.9261, "num_input_tokens_seen": 1179648000, "step": 1000 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.2636232376098633, "eval_wikipedia_runtime": 162.9449, "eval_wikipedia_samples_per_second": 4.308, "eval_wikipedia_steps_per_second": 0.184, "num_input_tokens_seen": 1179648000, "step": 1000 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 3.9912185668945312, "eval_toxicity_runtime": 0.9818, "eval_toxicity_samples_per_second": 2.037, "eval_toxicity_steps_per_second": 1.018, "num_input_tokens_seen": 1179648000, "step": 1000 }, { "epoch": 0.00011796626164916834, "grad_norm": 1.125, "learning_rate": 7.898767086214984e-05, "loss": 1.693, "num_input_tokens_seen": 1180827648, "step": 1001 }, { "epoch": 0.0002359325232983367, "grad_norm": 1.15625, "learning_rate": 7.898417987779834e-05, "loss": 1.7842, "num_input_tokens_seen": 1182007296, "step": 1002 }, { "epoch": 0.000353898784947505, "grad_norm": 1.1953125, "learning_rate": 7.898068296194194e-05, "loss": 1.6674, "num_input_tokens_seen": 1183186944, "step": 1003 }, { "epoch": 0.0004718650465966734, "grad_norm": 0.9453125, "learning_rate": 7.897718011511275e-05, "loss": 1.6872, "num_input_tokens_seen": 1184366592, "step": 1004 }, { "epoch": 0.0005898313082458417, "grad_norm": 1.0234375, "learning_rate": 7.89736713378437e-05, "loss": 1.7766, "num_input_tokens_seen": 1185546240, "step": 1005 }, { "epoch": 0.00070779756989501, "grad_norm": 1.1015625, "learning_rate": 7.897015663066865e-05, "loss": 1.7041, "num_input_tokens_seen": 1186725888, "step": 1006 }, { "epoch": 0.0008257638315441783, "grad_norm": 1.1484375, "learning_rate": 7.896663599412241e-05, "loss": 1.8234, "num_input_tokens_seen": 1187905536, "step": 1007 }, { "epoch": 0.0009437300931933467, "grad_norm": 1.3515625, "learning_rate": 7.896310942874061e-05, "loss": 1.6623, "num_input_tokens_seen": 1189085184, "step": 1008 }, { "epoch": 0.001061696354842515, "grad_norm": 1.0625, "learning_rate": 7.895957693505987e-05, "loss": 1.7256, "num_input_tokens_seen": 1190264832, "step": 1009 }, { "epoch": 0.0011796626164916834, "grad_norm": 1.3203125, "learning_rate": 7.89560385136176e-05, "loss": 1.832, "num_input_tokens_seen": 1191444480, "step": 1010 }, { "epoch": 0.0012976288781408518, "grad_norm": 1.1953125, "learning_rate": 7.895249416495223e-05, "loss": 1.7132, "num_input_tokens_seen": 1192624128, "step": 1011 }, { "epoch": 0.00141559513979002, "grad_norm": 0.98046875, "learning_rate": 7.8948943889603e-05, "loss": 1.7064, "num_input_tokens_seen": 1193803776, "step": 1012 }, { "epoch": 0.0015335614014391884, "grad_norm": 1.1171875, "learning_rate": 7.894538768811011e-05, "loss": 1.5843, "num_input_tokens_seen": 1194983424, "step": 1013 }, { "epoch": 0.0016515276630883566, "grad_norm": 0.99609375, "learning_rate": 7.894182556101465e-05, "loss": 1.6346, "num_input_tokens_seen": 1196163072, "step": 1014 }, { "epoch": 0.001769493924737525, "grad_norm": 0.8984375, "learning_rate": 7.893825750885859e-05, "loss": 1.6726, "num_input_tokens_seen": 1197342720, "step": 1015 }, { "epoch": 0.0018874601863866935, "grad_norm": 0.9921875, "learning_rate": 7.89346835321848e-05, "loss": 1.6584, "num_input_tokens_seen": 1198522368, "step": 1016 }, { "epoch": 0.0020054264480358617, "grad_norm": 1.2421875, "learning_rate": 7.89311036315371e-05, "loss": 1.7154, "num_input_tokens_seen": 1199702016, "step": 1017 }, { "epoch": 0.00212339270968503, "grad_norm": 0.8828125, "learning_rate": 7.892751780746016e-05, "loss": 1.8305, "num_input_tokens_seen": 1200881664, "step": 1018 }, { "epoch": 0.0022413589713341986, "grad_norm": 0.85546875, "learning_rate": 7.892392606049957e-05, "loss": 1.7607, "num_input_tokens_seen": 1202061312, "step": 1019 }, { "epoch": 0.0023593252329833668, "grad_norm": 1.109375, "learning_rate": 7.892032839120181e-05, "loss": 1.7711, "num_input_tokens_seen": 1203240960, "step": 1020 }, { "epoch": 0.002477291494632535, "grad_norm": 1.21875, "learning_rate": 7.89167248001143e-05, "loss": 1.7846, "num_input_tokens_seen": 1204420608, "step": 1021 }, { "epoch": 0.0025952577562817036, "grad_norm": 0.9453125, "learning_rate": 7.89131152877853e-05, "loss": 1.7185, "num_input_tokens_seen": 1205600256, "step": 1022 }, { "epoch": 0.002713224017930872, "grad_norm": 1.3203125, "learning_rate": 7.890949985476401e-05, "loss": 1.6928, "num_input_tokens_seen": 1206779904, "step": 1023 }, { "epoch": 0.00283119027958004, "grad_norm": 0.9453125, "learning_rate": 7.890587850160054e-05, "loss": 1.844, "num_input_tokens_seen": 1207959552, "step": 1024 }, { "epoch": 0.0029491565412292082, "grad_norm": 1.09375, "learning_rate": 7.890225122884587e-05, "loss": 1.7122, "num_input_tokens_seen": 1209139200, "step": 1025 }, { "epoch": 0.003067122802878377, "grad_norm": 1.296875, "learning_rate": 7.88986180370519e-05, "loss": 1.7787, "num_input_tokens_seen": 1210318848, "step": 1026 }, { "epoch": 0.003185089064527545, "grad_norm": 0.97265625, "learning_rate": 7.889497892677144e-05, "loss": 1.6745, "num_input_tokens_seen": 1211498496, "step": 1027 }, { "epoch": 0.0033030553261767133, "grad_norm": 1.2421875, "learning_rate": 7.889133389855816e-05, "loss": 1.6877, "num_input_tokens_seen": 1212678144, "step": 1028 }, { "epoch": 0.003421021587825882, "grad_norm": 0.96875, "learning_rate": 7.888768295296667e-05, "loss": 1.6812, "num_input_tokens_seen": 1213857792, "step": 1029 }, { "epoch": 0.00353898784947505, "grad_norm": 1.40625, "learning_rate": 7.888402609055247e-05, "loss": 1.6829, "num_input_tokens_seen": 1215037440, "step": 1030 }, { "epoch": 0.0036569541111242184, "grad_norm": 0.9609375, "learning_rate": 7.888036331187195e-05, "loss": 1.698, "num_input_tokens_seen": 1216217088, "step": 1031 }, { "epoch": 0.003774920372773387, "grad_norm": 1.5234375, "learning_rate": 7.887669461748241e-05, "loss": 1.8116, "num_input_tokens_seen": 1217396736, "step": 1032 }, { "epoch": 0.003892886634422555, "grad_norm": 0.94140625, "learning_rate": 7.887302000794204e-05, "loss": 1.7049, "num_input_tokens_seen": 1218576384, "step": 1033 }, { "epoch": 0.004010852896071723, "grad_norm": 1.0703125, "learning_rate": 7.886933948380997e-05, "loss": 1.6622, "num_input_tokens_seen": 1219756032, "step": 1034 }, { "epoch": 0.004128819157720892, "grad_norm": 1.078125, "learning_rate": 7.886565304564616e-05, "loss": 1.7226, "num_input_tokens_seen": 1220935680, "step": 1035 }, { "epoch": 0.00424678541937006, "grad_norm": 0.7890625, "learning_rate": 7.886196069401153e-05, "loss": 1.8099, "num_input_tokens_seen": 1222115328, "step": 1036 }, { "epoch": 0.004364751681019229, "grad_norm": 1.140625, "learning_rate": 7.885826242946786e-05, "loss": 1.6033, "num_input_tokens_seen": 1223294976, "step": 1037 }, { "epoch": 0.004482717942668397, "grad_norm": 0.87109375, "learning_rate": 7.885455825257785e-05, "loss": 1.8159, "num_input_tokens_seen": 1224474624, "step": 1038 }, { "epoch": 0.004600684204317565, "grad_norm": 1.0078125, "learning_rate": 7.885084816390511e-05, "loss": 1.7535, "num_input_tokens_seen": 1225654272, "step": 1039 }, { "epoch": 0.0047186504659667335, "grad_norm": 0.9140625, "learning_rate": 7.884713216401412e-05, "loss": 1.8334, "num_input_tokens_seen": 1226833920, "step": 1040 }, { "epoch": 0.004836616727615902, "grad_norm": 0.9609375, "learning_rate": 7.88434102534703e-05, "loss": 1.6799, "num_input_tokens_seen": 1228013568, "step": 1041 }, { "epoch": 0.00495458298926507, "grad_norm": 0.99609375, "learning_rate": 7.883968243283992e-05, "loss": 1.6823, "num_input_tokens_seen": 1229193216, "step": 1042 }, { "epoch": 0.005072549250914238, "grad_norm": 1.125, "learning_rate": 7.883594870269019e-05, "loss": 1.7959, "num_input_tokens_seen": 1230372864, "step": 1043 }, { "epoch": 0.005190515512563407, "grad_norm": 1.078125, "learning_rate": 7.883220906358917e-05, "loss": 1.7708, "num_input_tokens_seen": 1231552512, "step": 1044 }, { "epoch": 0.0053084817742125754, "grad_norm": 0.89453125, "learning_rate": 7.88284635161059e-05, "loss": 1.7716, "num_input_tokens_seen": 1232732160, "step": 1045 }, { "epoch": 0.005426448035861744, "grad_norm": 0.97265625, "learning_rate": 7.882471206081023e-05, "loss": 1.7232, "num_input_tokens_seen": 1233911808, "step": 1046 }, { "epoch": 0.005544414297510912, "grad_norm": 1.0234375, "learning_rate": 7.882095469827299e-05, "loss": 2.0954, "num_input_tokens_seen": 1235091456, "step": 1047 }, { "epoch": 0.00566238055916008, "grad_norm": 0.94140625, "learning_rate": 7.881719142906584e-05, "loss": 1.7849, "num_input_tokens_seen": 1236271104, "step": 1048 }, { "epoch": 0.005780346820809248, "grad_norm": 0.94921875, "learning_rate": 7.881342225376137e-05, "loss": 1.9185, "num_input_tokens_seen": 1237450752, "step": 1049 }, { "epoch": 0.0058983130824584165, "grad_norm": 1.0234375, "learning_rate": 7.880964717293307e-05, "loss": 1.7487, "num_input_tokens_seen": 1238630400, "step": 1050 }, { "epoch": 0.0060162793441075856, "grad_norm": 0.921875, "learning_rate": 7.880586618715532e-05, "loss": 1.7824, "num_input_tokens_seen": 1239810048, "step": 1051 }, { "epoch": 0.006134245605756754, "grad_norm": 1.1171875, "learning_rate": 7.88020792970034e-05, "loss": 1.8, "num_input_tokens_seen": 1240989696, "step": 1052 }, { "epoch": 0.006252211867405922, "grad_norm": 0.9453125, "learning_rate": 7.87982865030535e-05, "loss": 1.8677, "num_input_tokens_seen": 1242169344, "step": 1053 }, { "epoch": 0.00637017812905509, "grad_norm": 1.0625, "learning_rate": 7.879448780588271e-05, "loss": 1.7398, "num_input_tokens_seen": 1243348992, "step": 1054 }, { "epoch": 0.006488144390704258, "grad_norm": 1.390625, "learning_rate": 7.879068320606899e-05, "loss": 1.7627, "num_input_tokens_seen": 1244528640, "step": 1055 }, { "epoch": 0.006606110652353427, "grad_norm": 1.0859375, "learning_rate": 7.878687270419122e-05, "loss": 1.854, "num_input_tokens_seen": 1245708288, "step": 1056 }, { "epoch": 0.006724076914002596, "grad_norm": 1.109375, "learning_rate": 7.878305630082919e-05, "loss": 1.7986, "num_input_tokens_seen": 1246887936, "step": 1057 }, { "epoch": 0.006842043175651764, "grad_norm": 1.078125, "learning_rate": 7.877923399656354e-05, "loss": 1.785, "num_input_tokens_seen": 1248067584, "step": 1058 }, { "epoch": 0.006960009437300932, "grad_norm": 0.94921875, "learning_rate": 7.877540579197586e-05, "loss": 1.9014, "num_input_tokens_seen": 1249247232, "step": 1059 }, { "epoch": 0.0070779756989501, "grad_norm": 1.109375, "learning_rate": 7.877157168764863e-05, "loss": 1.8105, "num_input_tokens_seen": 1250426880, "step": 1060 }, { "epoch": 0.0071959419605992685, "grad_norm": 0.99609375, "learning_rate": 7.876773168416519e-05, "loss": 1.801, "num_input_tokens_seen": 1251606528, "step": 1061 }, { "epoch": 0.007313908222248437, "grad_norm": 0.94140625, "learning_rate": 7.87638857821098e-05, "loss": 1.8477, "num_input_tokens_seen": 1252786176, "step": 1062 }, { "epoch": 0.007431874483897605, "grad_norm": 1.046875, "learning_rate": 7.876003398206765e-05, "loss": 1.7174, "num_input_tokens_seen": 1253965824, "step": 1063 }, { "epoch": 0.007549840745546774, "grad_norm": 0.9375, "learning_rate": 7.875617628462477e-05, "loss": 1.9367, "num_input_tokens_seen": 1255145472, "step": 1064 }, { "epoch": 0.007667807007195942, "grad_norm": 1.2265625, "learning_rate": 7.875231269036813e-05, "loss": 1.8491, "num_input_tokens_seen": 1256325120, "step": 1065 }, { "epoch": 0.00778577326884511, "grad_norm": 1.0625, "learning_rate": 7.874844319988556e-05, "loss": 1.8657, "num_input_tokens_seen": 1257504768, "step": 1066 }, { "epoch": 0.007903739530494279, "grad_norm": 0.94140625, "learning_rate": 7.874456781376584e-05, "loss": 1.8064, "num_input_tokens_seen": 1258684416, "step": 1067 }, { "epoch": 0.008021705792143447, "grad_norm": 1.09375, "learning_rate": 7.874068653259861e-05, "loss": 1.93, "num_input_tokens_seen": 1259864064, "step": 1068 }, { "epoch": 0.008139672053792615, "grad_norm": 0.96875, "learning_rate": 7.87367993569744e-05, "loss": 1.9201, "num_input_tokens_seen": 1261043712, "step": 1069 }, { "epoch": 0.008257638315441783, "grad_norm": 0.984375, "learning_rate": 7.873290628748465e-05, "loss": 1.826, "num_input_tokens_seen": 1262223360, "step": 1070 }, { "epoch": 0.008375604577090951, "grad_norm": 0.98828125, "learning_rate": 7.87290073247217e-05, "loss": 1.7763, "num_input_tokens_seen": 1263403008, "step": 1071 }, { "epoch": 0.00849357083874012, "grad_norm": 0.859375, "learning_rate": 7.872510246927879e-05, "loss": 1.6945, "num_input_tokens_seen": 1264582656, "step": 1072 }, { "epoch": 0.008611537100389288, "grad_norm": 1.046875, "learning_rate": 7.872119172175006e-05, "loss": 1.8033, "num_input_tokens_seen": 1265762304, "step": 1073 }, { "epoch": 0.008729503362038458, "grad_norm": 0.9453125, "learning_rate": 7.871727508273049e-05, "loss": 1.9424, "num_input_tokens_seen": 1266941952, "step": 1074 }, { "epoch": 0.008847469623687626, "grad_norm": 0.90625, "learning_rate": 7.871335255281606e-05, "loss": 1.7735, "num_input_tokens_seen": 1268121600, "step": 1075 }, { "epoch": 0.008965435885336794, "grad_norm": 0.9453125, "learning_rate": 7.870942413260354e-05, "loss": 1.8556, "num_input_tokens_seen": 1269301248, "step": 1076 }, { "epoch": 0.009083402146985962, "grad_norm": 0.92578125, "learning_rate": 7.87054898226907e-05, "loss": 1.8768, "num_input_tokens_seen": 1270480896, "step": 1077 }, { "epoch": 0.00920136840863513, "grad_norm": 1.0859375, "learning_rate": 7.87015496236761e-05, "loss": 1.7556, "num_input_tokens_seen": 1271660544, "step": 1078 }, { "epoch": 0.009319334670284299, "grad_norm": 0.9453125, "learning_rate": 7.869760353615926e-05, "loss": 1.7153, "num_input_tokens_seen": 1272840192, "step": 1079 }, { "epoch": 0.009437300931933467, "grad_norm": 0.9921875, "learning_rate": 7.869365156074061e-05, "loss": 1.7451, "num_input_tokens_seen": 1274019840, "step": 1080 }, { "epoch": 0.009555267193582635, "grad_norm": 0.99609375, "learning_rate": 7.868969369802142e-05, "loss": 1.732, "num_input_tokens_seen": 1275199488, "step": 1081 }, { "epoch": 0.009673233455231803, "grad_norm": 1.015625, "learning_rate": 7.86857299486039e-05, "loss": 1.8516, "num_input_tokens_seen": 1276379136, "step": 1082 }, { "epoch": 0.009791199716880972, "grad_norm": 1.15625, "learning_rate": 7.868176031309115e-05, "loss": 1.7259, "num_input_tokens_seen": 1277558784, "step": 1083 }, { "epoch": 0.00990916597853014, "grad_norm": 1.03125, "learning_rate": 7.867778479208712e-05, "loss": 1.8024, "num_input_tokens_seen": 1278738432, "step": 1084 }, { "epoch": 0.010027132240179308, "grad_norm": 1.0703125, "learning_rate": 7.867380338619672e-05, "loss": 1.7921, "num_input_tokens_seen": 1279918080, "step": 1085 }, { "epoch": 0.010145098501828476, "grad_norm": 1.0234375, "learning_rate": 7.866981609602572e-05, "loss": 1.7405, "num_input_tokens_seen": 1281097728, "step": 1086 }, { "epoch": 0.010263064763477646, "grad_norm": 0.92578125, "learning_rate": 7.86658229221808e-05, "loss": 1.8575, "num_input_tokens_seen": 1282277376, "step": 1087 }, { "epoch": 0.010381031025126814, "grad_norm": 1.046875, "learning_rate": 7.866182386526954e-05, "loss": 1.9094, "num_input_tokens_seen": 1283457024, "step": 1088 }, { "epoch": 0.010498997286775983, "grad_norm": 1.25, "learning_rate": 7.865781892590036e-05, "loss": 1.7406, "num_input_tokens_seen": 1284636672, "step": 1089 }, { "epoch": 0.010616963548425151, "grad_norm": 1.3125, "learning_rate": 7.865380810468265e-05, "loss": 1.6847, "num_input_tokens_seen": 1285816320, "step": 1090 }, { "epoch": 0.010734929810074319, "grad_norm": 1.0, "learning_rate": 7.864979140222666e-05, "loss": 1.7862, "num_input_tokens_seen": 1286995968, "step": 1091 }, { "epoch": 0.010852896071723487, "grad_norm": 0.98046875, "learning_rate": 7.864576881914354e-05, "loss": 1.7013, "num_input_tokens_seen": 1288175616, "step": 1092 }, { "epoch": 0.010970862333372655, "grad_norm": 0.91015625, "learning_rate": 7.86417403560453e-05, "loss": 1.735, "num_input_tokens_seen": 1289355264, "step": 1093 }, { "epoch": 0.011088828595021824, "grad_norm": 0.9609375, "learning_rate": 7.863770601354492e-05, "loss": 1.8639, "num_input_tokens_seen": 1290534912, "step": 1094 }, { "epoch": 0.011206794856670992, "grad_norm": 1.046875, "learning_rate": 7.863366579225622e-05, "loss": 1.8047, "num_input_tokens_seen": 1291714560, "step": 1095 }, { "epoch": 0.01132476111832016, "grad_norm": 0.8046875, "learning_rate": 7.862961969279391e-05, "loss": 1.7503, "num_input_tokens_seen": 1292894208, "step": 1096 }, { "epoch": 0.011442727379969328, "grad_norm": 1.1171875, "learning_rate": 7.862556771577363e-05, "loss": 1.7217, "num_input_tokens_seen": 1294073856, "step": 1097 }, { "epoch": 0.011560693641618497, "grad_norm": 1.078125, "learning_rate": 7.862150986181187e-05, "loss": 1.7873, "num_input_tokens_seen": 1295253504, "step": 1098 }, { "epoch": 0.011678659903267665, "grad_norm": 0.98828125, "learning_rate": 7.861744613152609e-05, "loss": 1.6499, "num_input_tokens_seen": 1296433152, "step": 1099 }, { "epoch": 0.011796626164916833, "grad_norm": 0.9375, "learning_rate": 7.861337652553452e-05, "loss": 1.9022, "num_input_tokens_seen": 1297612800, "step": 1100 }, { "epoch": 0.011914592426566003, "grad_norm": 1.1015625, "learning_rate": 7.86093010444564e-05, "loss": 1.7538, "num_input_tokens_seen": 1298792448, "step": 1101 }, { "epoch": 0.012032558688215171, "grad_norm": 0.9296875, "learning_rate": 7.860521968891183e-05, "loss": 1.6857, "num_input_tokens_seen": 1299972096, "step": 1102 }, { "epoch": 0.01215052494986434, "grad_norm": 0.95703125, "learning_rate": 7.860113245952179e-05, "loss": 1.7463, "num_input_tokens_seen": 1301151744, "step": 1103 }, { "epoch": 0.012268491211513508, "grad_norm": 0.96875, "learning_rate": 7.859703935690812e-05, "loss": 1.7936, "num_input_tokens_seen": 1302331392, "step": 1104 }, { "epoch": 0.012386457473162676, "grad_norm": 0.93359375, "learning_rate": 7.859294038169364e-05, "loss": 1.8009, "num_input_tokens_seen": 1303511040, "step": 1105 }, { "epoch": 0.012504423734811844, "grad_norm": 0.99609375, "learning_rate": 7.858883553450199e-05, "loss": 1.8159, "num_input_tokens_seen": 1304690688, "step": 1106 }, { "epoch": 0.012622389996461012, "grad_norm": 0.9765625, "learning_rate": 7.858472481595775e-05, "loss": 1.9189, "num_input_tokens_seen": 1305870336, "step": 1107 }, { "epoch": 0.01274035625811018, "grad_norm": 0.921875, "learning_rate": 7.858060822668634e-05, "loss": 1.7446, "num_input_tokens_seen": 1307049984, "step": 1108 }, { "epoch": 0.012858322519759349, "grad_norm": 0.953125, "learning_rate": 7.857648576731412e-05, "loss": 1.8123, "num_input_tokens_seen": 1308229632, "step": 1109 }, { "epoch": 0.012976288781408517, "grad_norm": 0.91796875, "learning_rate": 7.857235743846834e-05, "loss": 1.8646, "num_input_tokens_seen": 1309409280, "step": 1110 }, { "epoch": 0.013094255043057685, "grad_norm": 0.94140625, "learning_rate": 7.856822324077713e-05, "loss": 1.6761, "num_input_tokens_seen": 1310588928, "step": 1111 }, { "epoch": 0.013212221304706853, "grad_norm": 1.0703125, "learning_rate": 7.856408317486951e-05, "loss": 2.004, "num_input_tokens_seen": 1311768576, "step": 1112 }, { "epoch": 0.013330187566356021, "grad_norm": 1.0390625, "learning_rate": 7.855993724137539e-05, "loss": 1.7488, "num_input_tokens_seen": 1312948224, "step": 1113 }, { "epoch": 0.013448153828005191, "grad_norm": 1.078125, "learning_rate": 7.855578544092559e-05, "loss": 1.8885, "num_input_tokens_seen": 1314127872, "step": 1114 }, { "epoch": 0.01356612008965436, "grad_norm": 0.921875, "learning_rate": 7.85516277741518e-05, "loss": 1.9571, "num_input_tokens_seen": 1315307520, "step": 1115 }, { "epoch": 0.013684086351303528, "grad_norm": 1.046875, "learning_rate": 7.854746424168664e-05, "loss": 1.8484, "num_input_tokens_seen": 1316487168, "step": 1116 }, { "epoch": 0.013802052612952696, "grad_norm": 0.87109375, "learning_rate": 7.854329484416358e-05, "loss": 1.8106, "num_input_tokens_seen": 1317666816, "step": 1117 }, { "epoch": 0.013920018874601864, "grad_norm": 1.0390625, "learning_rate": 7.853911958221699e-05, "loss": 1.9163, "num_input_tokens_seen": 1318846464, "step": 1118 }, { "epoch": 0.014037985136251032, "grad_norm": 1.0390625, "learning_rate": 7.853493845648218e-05, "loss": 1.8216, "num_input_tokens_seen": 1320026112, "step": 1119 }, { "epoch": 0.0141559513979002, "grad_norm": 0.85546875, "learning_rate": 7.853075146759527e-05, "loss": 1.8019, "num_input_tokens_seen": 1321205760, "step": 1120 }, { "epoch": 0.014273917659549369, "grad_norm": 0.828125, "learning_rate": 7.852655861619336e-05, "loss": 1.7269, "num_input_tokens_seen": 1322385408, "step": 1121 }, { "epoch": 0.014391883921198537, "grad_norm": 0.9140625, "learning_rate": 7.852235990291436e-05, "loss": 1.7582, "num_input_tokens_seen": 1323565056, "step": 1122 }, { "epoch": 0.014509850182847705, "grad_norm": 0.92578125, "learning_rate": 7.851815532839713e-05, "loss": 1.7213, "num_input_tokens_seen": 1324744704, "step": 1123 }, { "epoch": 0.014627816444496873, "grad_norm": 0.90234375, "learning_rate": 7.85139448932814e-05, "loss": 1.703, "num_input_tokens_seen": 1325924352, "step": 1124 }, { "epoch": 0.014745782706146042, "grad_norm": 0.86328125, "learning_rate": 7.85097285982078e-05, "loss": 1.7028, "num_input_tokens_seen": 1327104000, "step": 1125 }, { "epoch": 0.01486374896779521, "grad_norm": 0.90625, "learning_rate": 7.850550644381784e-05, "loss": 1.7196, "num_input_tokens_seen": 1328283648, "step": 1126 }, { "epoch": 0.014981715229444378, "grad_norm": 1.0078125, "learning_rate": 7.850127843075394e-05, "loss": 1.6825, "num_input_tokens_seen": 1329463296, "step": 1127 }, { "epoch": 0.015099681491093548, "grad_norm": 1.4765625, "learning_rate": 7.849704455965937e-05, "loss": 1.7549, "num_input_tokens_seen": 1330642944, "step": 1128 }, { "epoch": 0.015217647752742716, "grad_norm": 1.078125, "learning_rate": 7.849280483117834e-05, "loss": 1.6149, "num_input_tokens_seen": 1331822592, "step": 1129 }, { "epoch": 0.015335614014391884, "grad_norm": 1.1484375, "learning_rate": 7.848855924595594e-05, "loss": 1.634, "num_input_tokens_seen": 1333002240, "step": 1130 }, { "epoch": 0.015453580276041053, "grad_norm": 1.0625, "learning_rate": 7.848430780463814e-05, "loss": 1.6845, "num_input_tokens_seen": 1334181888, "step": 1131 }, { "epoch": 0.01557154653769022, "grad_norm": 1.1640625, "learning_rate": 7.848005050787178e-05, "loss": 1.7053, "num_input_tokens_seen": 1335361536, "step": 1132 }, { "epoch": 0.01568951279933939, "grad_norm": 0.9921875, "learning_rate": 7.847578735630464e-05, "loss": 1.6666, "num_input_tokens_seen": 1336541184, "step": 1133 }, { "epoch": 0.015807479060988557, "grad_norm": 0.9921875, "learning_rate": 7.847151835058534e-05, "loss": 1.6833, "num_input_tokens_seen": 1337720832, "step": 1134 }, { "epoch": 0.015925445322637725, "grad_norm": 0.96875, "learning_rate": 7.846724349136344e-05, "loss": 1.587, "num_input_tokens_seen": 1338900480, "step": 1135 }, { "epoch": 0.016043411584286894, "grad_norm": 1.0703125, "learning_rate": 7.846296277928937e-05, "loss": 1.6853, "num_input_tokens_seen": 1340080128, "step": 1136 }, { "epoch": 0.016161377845936062, "grad_norm": 0.8515625, "learning_rate": 7.845867621501442e-05, "loss": 1.836, "num_input_tokens_seen": 1341259776, "step": 1137 }, { "epoch": 0.01627934410758523, "grad_norm": 1.0859375, "learning_rate": 7.845438379919081e-05, "loss": 1.7539, "num_input_tokens_seen": 1342439424, "step": 1138 }, { "epoch": 0.0163973103692344, "grad_norm": 0.98046875, "learning_rate": 7.845008553247166e-05, "loss": 1.749, "num_input_tokens_seen": 1343619072, "step": 1139 }, { "epoch": 0.016515276630883566, "grad_norm": 1.2734375, "learning_rate": 7.844578141551092e-05, "loss": 1.8204, "num_input_tokens_seen": 1344798720, "step": 1140 }, { "epoch": 0.016633242892532735, "grad_norm": 0.9921875, "learning_rate": 7.844147144896349e-05, "loss": 1.7323, "num_input_tokens_seen": 1345978368, "step": 1141 }, { "epoch": 0.016751209154181903, "grad_norm": 0.90234375, "learning_rate": 7.843715563348515e-05, "loss": 1.6755, "num_input_tokens_seen": 1347158016, "step": 1142 }, { "epoch": 0.01686917541583107, "grad_norm": 0.91796875, "learning_rate": 7.843283396973253e-05, "loss": 1.7822, "num_input_tokens_seen": 1348337664, "step": 1143 }, { "epoch": 0.01698714167748024, "grad_norm": 1.1015625, "learning_rate": 7.842850645836319e-05, "loss": 1.7529, "num_input_tokens_seen": 1349517312, "step": 1144 }, { "epoch": 0.017105107939129408, "grad_norm": 0.98046875, "learning_rate": 7.842417310003557e-05, "loss": 1.7434, "num_input_tokens_seen": 1350696960, "step": 1145 }, { "epoch": 0.017223074200778576, "grad_norm": 1.0390625, "learning_rate": 7.8419833895409e-05, "loss": 1.5889, "num_input_tokens_seen": 1351876608, "step": 1146 }, { "epoch": 0.017341040462427744, "grad_norm": 1.0, "learning_rate": 7.84154888451437e-05, "loss": 1.5818, "num_input_tokens_seen": 1353056256, "step": 1147 }, { "epoch": 0.017459006724076916, "grad_norm": 1.0, "learning_rate": 7.841113794990076e-05, "loss": 1.7138, "num_input_tokens_seen": 1354235904, "step": 1148 }, { "epoch": 0.017576972985726084, "grad_norm": 0.8984375, "learning_rate": 7.840678121034219e-05, "loss": 1.6106, "num_input_tokens_seen": 1355415552, "step": 1149 }, { "epoch": 0.017694939247375252, "grad_norm": 1.0546875, "learning_rate": 7.840241862713087e-05, "loss": 1.7624, "num_input_tokens_seen": 1356595200, "step": 1150 }, { "epoch": 0.01781290550902442, "grad_norm": 1.0625, "learning_rate": 7.839805020093057e-05, "loss": 1.6687, "num_input_tokens_seen": 1357774848, "step": 1151 }, { "epoch": 0.01793087177067359, "grad_norm": 1.109375, "learning_rate": 7.839367593240596e-05, "loss": 1.6976, "num_input_tokens_seen": 1358954496, "step": 1152 }, { "epoch": 0.018048838032322757, "grad_norm": 0.87890625, "learning_rate": 7.838929582222259e-05, "loss": 1.8581, "num_input_tokens_seen": 1360134144, "step": 1153 }, { "epoch": 0.018166804293971925, "grad_norm": 1.1640625, "learning_rate": 7.83849098710469e-05, "loss": 1.6766, "num_input_tokens_seen": 1361313792, "step": 1154 }, { "epoch": 0.018284770555621093, "grad_norm": 0.9140625, "learning_rate": 7.838051807954621e-05, "loss": 1.7232, "num_input_tokens_seen": 1362493440, "step": 1155 }, { "epoch": 0.01840273681727026, "grad_norm": 0.91796875, "learning_rate": 7.837612044838877e-05, "loss": 1.6636, "num_input_tokens_seen": 1363673088, "step": 1156 }, { "epoch": 0.01852070307891943, "grad_norm": 1.0078125, "learning_rate": 7.837171697824363e-05, "loss": 1.6333, "num_input_tokens_seen": 1364852736, "step": 1157 }, { "epoch": 0.018638669340568598, "grad_norm": 0.984375, "learning_rate": 7.836730766978085e-05, "loss": 1.8061, "num_input_tokens_seen": 1366032384, "step": 1158 }, { "epoch": 0.018756635602217766, "grad_norm": 1.03125, "learning_rate": 7.836289252367125e-05, "loss": 1.6617, "num_input_tokens_seen": 1367212032, "step": 1159 }, { "epoch": 0.018874601863866934, "grad_norm": 0.859375, "learning_rate": 7.835847154058666e-05, "loss": 1.8089, "num_input_tokens_seen": 1368391680, "step": 1160 }, { "epoch": 0.018992568125516102, "grad_norm": 0.92578125, "learning_rate": 7.835404472119971e-05, "loss": 1.697, "num_input_tokens_seen": 1369571328, "step": 1161 }, { "epoch": 0.01911053438716527, "grad_norm": 0.984375, "learning_rate": 7.834961206618396e-05, "loss": 1.6818, "num_input_tokens_seen": 1370750976, "step": 1162 }, { "epoch": 0.01922850064881444, "grad_norm": 0.86328125, "learning_rate": 7.834517357621381e-05, "loss": 1.6874, "num_input_tokens_seen": 1371930624, "step": 1163 }, { "epoch": 0.019346466910463607, "grad_norm": 0.9375, "learning_rate": 7.834072925196463e-05, "loss": 1.7766, "num_input_tokens_seen": 1373110272, "step": 1164 }, { "epoch": 0.019464433172112775, "grad_norm": 1.0, "learning_rate": 7.83362790941126e-05, "loss": 1.7272, "num_input_tokens_seen": 1374289920, "step": 1165 }, { "epoch": 0.019582399433761943, "grad_norm": 1.2734375, "learning_rate": 7.833182310333483e-05, "loss": 1.5872, "num_input_tokens_seen": 1375469568, "step": 1166 }, { "epoch": 0.01970036569541111, "grad_norm": 1.0234375, "learning_rate": 7.83273612803093e-05, "loss": 1.6183, "num_input_tokens_seen": 1376649216, "step": 1167 }, { "epoch": 0.01981833195706028, "grad_norm": 0.86328125, "learning_rate": 7.832289362571489e-05, "loss": 1.7423, "num_input_tokens_seen": 1377828864, "step": 1168 }, { "epoch": 0.019936298218709448, "grad_norm": 0.8515625, "learning_rate": 7.831842014023137e-05, "loss": 1.8817, "num_input_tokens_seen": 1379008512, "step": 1169 }, { "epoch": 0.020054264480358616, "grad_norm": 0.9765625, "learning_rate": 7.831394082453938e-05, "loss": 1.8874, "num_input_tokens_seen": 1380188160, "step": 1170 }, { "epoch": 0.020172230742007784, "grad_norm": 1.125, "learning_rate": 7.830945567932043e-05, "loss": 1.6719, "num_input_tokens_seen": 1381367808, "step": 1171 }, { "epoch": 0.020290197003656953, "grad_norm": 1.0859375, "learning_rate": 7.830496470525699e-05, "loss": 1.744, "num_input_tokens_seen": 1382547456, "step": 1172 }, { "epoch": 0.02040816326530612, "grad_norm": 0.98046875, "learning_rate": 7.830046790303232e-05, "loss": 1.6126, "num_input_tokens_seen": 1383727104, "step": 1173 }, { "epoch": 0.020526129526955292, "grad_norm": 1.1015625, "learning_rate": 7.829596527333065e-05, "loss": 1.7485, "num_input_tokens_seen": 1384906752, "step": 1174 }, { "epoch": 0.02064409578860446, "grad_norm": 1.0, "learning_rate": 7.829145681683706e-05, "loss": 1.8069, "num_input_tokens_seen": 1386086400, "step": 1175 }, { "epoch": 0.02076206205025363, "grad_norm": 1.1171875, "learning_rate": 7.82869425342375e-05, "loss": 1.837, "num_input_tokens_seen": 1387266048, "step": 1176 }, { "epoch": 0.020880028311902797, "grad_norm": 1.3984375, "learning_rate": 7.828242242621884e-05, "loss": 1.7099, "num_input_tokens_seen": 1388445696, "step": 1177 }, { "epoch": 0.020997994573551965, "grad_norm": 1.078125, "learning_rate": 7.82778964934688e-05, "loss": 1.7364, "num_input_tokens_seen": 1389625344, "step": 1178 }, { "epoch": 0.021115960835201134, "grad_norm": 1.46875, "learning_rate": 7.827336473667604e-05, "loss": 1.6631, "num_input_tokens_seen": 1390804992, "step": 1179 }, { "epoch": 0.021233927096850302, "grad_norm": 1.0703125, "learning_rate": 7.826882715653005e-05, "loss": 1.7466, "num_input_tokens_seen": 1391984640, "step": 1180 }, { "epoch": 0.02135189335849947, "grad_norm": 1.6953125, "learning_rate": 7.826428375372125e-05, "loss": 1.7107, "num_input_tokens_seen": 1393164288, "step": 1181 }, { "epoch": 0.021469859620148638, "grad_norm": 1.296875, "learning_rate": 7.825973452894091e-05, "loss": 1.6893, "num_input_tokens_seen": 1394343936, "step": 1182 }, { "epoch": 0.021587825881797806, "grad_norm": 1.2890625, "learning_rate": 7.82551794828812e-05, "loss": 1.8002, "num_input_tokens_seen": 1395523584, "step": 1183 }, { "epoch": 0.021705792143446975, "grad_norm": 1.0625, "learning_rate": 7.82506186162352e-05, "loss": 1.7513, "num_input_tokens_seen": 1396703232, "step": 1184 }, { "epoch": 0.021823758405096143, "grad_norm": 1.09375, "learning_rate": 7.824605192969681e-05, "loss": 1.8065, "num_input_tokens_seen": 1397882880, "step": 1185 }, { "epoch": 0.02194172466674531, "grad_norm": 1.078125, "learning_rate": 7.824147942396091e-05, "loss": 1.7148, "num_input_tokens_seen": 1399062528, "step": 1186 }, { "epoch": 0.02205969092839448, "grad_norm": 0.921875, "learning_rate": 7.823690109972317e-05, "loss": 1.7104, "num_input_tokens_seen": 1400242176, "step": 1187 }, { "epoch": 0.022177657190043647, "grad_norm": 1.0859375, "learning_rate": 7.823231695768023e-05, "loss": 1.7102, "num_input_tokens_seen": 1401421824, "step": 1188 }, { "epoch": 0.022295623451692816, "grad_norm": 0.83984375, "learning_rate": 7.822772699852954e-05, "loss": 1.7314, "num_input_tokens_seen": 1402601472, "step": 1189 }, { "epoch": 0.022413589713341984, "grad_norm": 0.99609375, "learning_rate": 7.822313122296947e-05, "loss": 1.6595, "num_input_tokens_seen": 1403781120, "step": 1190 }, { "epoch": 0.022531555974991152, "grad_norm": 1.046875, "learning_rate": 7.821852963169931e-05, "loss": 1.789, "num_input_tokens_seen": 1404960768, "step": 1191 }, { "epoch": 0.02264952223664032, "grad_norm": 0.98046875, "learning_rate": 7.821392222541918e-05, "loss": 1.8047, "num_input_tokens_seen": 1406140416, "step": 1192 }, { "epoch": 0.02276748849828949, "grad_norm": 0.875, "learning_rate": 7.820930900483009e-05, "loss": 1.7661, "num_input_tokens_seen": 1407320064, "step": 1193 }, { "epoch": 0.022885454759938657, "grad_norm": 0.921875, "learning_rate": 7.820468997063396e-05, "loss": 1.7989, "num_input_tokens_seen": 1408499712, "step": 1194 }, { "epoch": 0.023003421021587825, "grad_norm": 1.03125, "learning_rate": 7.820006512353358e-05, "loss": 1.6524, "num_input_tokens_seen": 1409679360, "step": 1195 }, { "epoch": 0.023121387283236993, "grad_norm": 0.8671875, "learning_rate": 7.819543446423264e-05, "loss": 1.7819, "num_input_tokens_seen": 1410859008, "step": 1196 }, { "epoch": 0.02323935354488616, "grad_norm": 0.88671875, "learning_rate": 7.819079799343568e-05, "loss": 1.723, "num_input_tokens_seen": 1412038656, "step": 1197 }, { "epoch": 0.02335731980653533, "grad_norm": 0.8203125, "learning_rate": 7.818615571184816e-05, "loss": 1.7317, "num_input_tokens_seen": 1413218304, "step": 1198 }, { "epoch": 0.023475286068184498, "grad_norm": 0.87109375, "learning_rate": 7.818150762017642e-05, "loss": 1.7398, "num_input_tokens_seen": 1414397952, "step": 1199 }, { "epoch": 0.023593252329833666, "grad_norm": 0.94921875, "learning_rate": 7.817685371912765e-05, "loss": 1.7017, "num_input_tokens_seen": 1415577600, "step": 1200 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.2694852352142334, "eval_wikipedia_runtime": 161.0095, "eval_wikipedia_samples_per_second": 4.36, "eval_wikipedia_steps_per_second": 0.186, "num_input_tokens_seen": 1415577600, "step": 1200 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.077282905578613, "eval_toxicity_runtime": 1.0499, "eval_toxicity_samples_per_second": 1.905, "eval_toxicity_steps_per_second": 0.952, "num_input_tokens_seen": 1415577600, "step": 1200 }, { "epoch": 0.023711218591482838, "grad_norm": 0.86328125, "learning_rate": 7.817219400940997e-05, "loss": 1.7017, "num_input_tokens_seen": 1416757248, "step": 1201 }, { "epoch": 0.023829184853132006, "grad_norm": 0.88671875, "learning_rate": 7.816752849173236e-05, "loss": 1.9035, "num_input_tokens_seen": 1417936896, "step": 1202 }, { "epoch": 0.023947151114781174, "grad_norm": 0.92578125, "learning_rate": 7.816285716680469e-05, "loss": 1.6969, "num_input_tokens_seen": 1419116544, "step": 1203 }, { "epoch": 0.024065117376430342, "grad_norm": 0.9609375, "learning_rate": 7.815818003533769e-05, "loss": 1.857, "num_input_tokens_seen": 1420296192, "step": 1204 }, { "epoch": 0.02418308363807951, "grad_norm": 0.84765625, "learning_rate": 7.815349709804301e-05, "loss": 1.794, "num_input_tokens_seen": 1421475840, "step": 1205 }, { "epoch": 0.02430104989972868, "grad_norm": 0.953125, "learning_rate": 7.814880835563315e-05, "loss": 1.7131, "num_input_tokens_seen": 1422655488, "step": 1206 }, { "epoch": 0.024419016161377847, "grad_norm": 0.828125, "learning_rate": 7.814411380882151e-05, "loss": 1.7848, "num_input_tokens_seen": 1423835136, "step": 1207 }, { "epoch": 0.024536982423027015, "grad_norm": 0.97265625, "learning_rate": 7.813941345832242e-05, "loss": 1.7348, "num_input_tokens_seen": 1425014784, "step": 1208 }, { "epoch": 0.024654948684676183, "grad_norm": 1.1171875, "learning_rate": 7.813470730485098e-05, "loss": 1.6488, "num_input_tokens_seen": 1426194432, "step": 1209 }, { "epoch": 0.02477291494632535, "grad_norm": 0.93359375, "learning_rate": 7.812999534912326e-05, "loss": 1.7997, "num_input_tokens_seen": 1427374080, "step": 1210 }, { "epoch": 0.02489088120797452, "grad_norm": 0.796875, "learning_rate": 7.812527759185621e-05, "loss": 1.7712, "num_input_tokens_seen": 1428553728, "step": 1211 }, { "epoch": 0.025008847469623688, "grad_norm": 1.078125, "learning_rate": 7.812055403376764e-05, "loss": 1.7391, "num_input_tokens_seen": 1429733376, "step": 1212 }, { "epoch": 0.025126813731272856, "grad_norm": 0.8984375, "learning_rate": 7.811582467557623e-05, "loss": 1.737, "num_input_tokens_seen": 1430913024, "step": 1213 }, { "epoch": 0.025244779992922024, "grad_norm": 0.94140625, "learning_rate": 7.811108951800157e-05, "loss": 1.7396, "num_input_tokens_seen": 1432092672, "step": 1214 }, { "epoch": 0.025362746254571193, "grad_norm": 1.109375, "learning_rate": 7.810634856176413e-05, "loss": 1.7338, "num_input_tokens_seen": 1433272320, "step": 1215 }, { "epoch": 0.02548071251622036, "grad_norm": 0.9375, "learning_rate": 7.810160180758523e-05, "loss": 1.6398, "num_input_tokens_seen": 1434451968, "step": 1216 }, { "epoch": 0.02559867877786953, "grad_norm": 0.96484375, "learning_rate": 7.809684925618712e-05, "loss": 1.6268, "num_input_tokens_seen": 1435631616, "step": 1217 }, { "epoch": 0.025716645039518697, "grad_norm": 1.1328125, "learning_rate": 7.80920909082929e-05, "loss": 1.6998, "num_input_tokens_seen": 1436811264, "step": 1218 }, { "epoch": 0.025834611301167865, "grad_norm": 0.91015625, "learning_rate": 7.808732676462655e-05, "loss": 1.8721, "num_input_tokens_seen": 1437990912, "step": 1219 }, { "epoch": 0.025952577562817034, "grad_norm": 0.8671875, "learning_rate": 7.808255682591295e-05, "loss": 1.7911, "num_input_tokens_seen": 1439170560, "step": 1220 }, { "epoch": 0.026070543824466202, "grad_norm": 1.1328125, "learning_rate": 7.807778109287787e-05, "loss": 1.6226, "num_input_tokens_seen": 1440350208, "step": 1221 }, { "epoch": 0.02618851008611537, "grad_norm": 0.9140625, "learning_rate": 7.807299956624792e-05, "loss": 1.6958, "num_input_tokens_seen": 1441529856, "step": 1222 }, { "epoch": 0.026306476347764538, "grad_norm": 0.9765625, "learning_rate": 7.806821224675062e-05, "loss": 1.7473, "num_input_tokens_seen": 1442709504, "step": 1223 }, { "epoch": 0.026424442609413706, "grad_norm": 0.90234375, "learning_rate": 7.80634191351144e-05, "loss": 1.6486, "num_input_tokens_seen": 1443889152, "step": 1224 }, { "epoch": 0.026542408871062875, "grad_norm": 0.921875, "learning_rate": 7.80586202320685e-05, "loss": 1.7505, "num_input_tokens_seen": 1445068800, "step": 1225 }, { "epoch": 0.026660375132712043, "grad_norm": 0.94140625, "learning_rate": 7.80538155383431e-05, "loss": 1.7118, "num_input_tokens_seen": 1446248448, "step": 1226 }, { "epoch": 0.02677834139436121, "grad_norm": 0.91796875, "learning_rate": 7.804900505466923e-05, "loss": 1.8553, "num_input_tokens_seen": 1447428096, "step": 1227 }, { "epoch": 0.026896307656010383, "grad_norm": 0.9375, "learning_rate": 7.804418878177882e-05, "loss": 1.7392, "num_input_tokens_seen": 1448607744, "step": 1228 }, { "epoch": 0.02701427391765955, "grad_norm": 0.90625, "learning_rate": 7.803936672040468e-05, "loss": 1.7218, "num_input_tokens_seen": 1449787392, "step": 1229 }, { "epoch": 0.02713224017930872, "grad_norm": 1.046875, "learning_rate": 7.803453887128049e-05, "loss": 1.5923, "num_input_tokens_seen": 1450967040, "step": 1230 }, { "epoch": 0.027250206440957887, "grad_norm": 0.83984375, "learning_rate": 7.802970523514081e-05, "loss": 1.6976, "num_input_tokens_seen": 1452146688, "step": 1231 }, { "epoch": 0.027368172702607056, "grad_norm": 1.0078125, "learning_rate": 7.802486581272109e-05, "loss": 1.7327, "num_input_tokens_seen": 1453326336, "step": 1232 }, { "epoch": 0.027486138964256224, "grad_norm": 0.78515625, "learning_rate": 7.802002060475765e-05, "loss": 1.6797, "num_input_tokens_seen": 1454505984, "step": 1233 }, { "epoch": 0.027604105225905392, "grad_norm": 1.1015625, "learning_rate": 7.801516961198771e-05, "loss": 1.6193, "num_input_tokens_seen": 1455685632, "step": 1234 }, { "epoch": 0.02772207148755456, "grad_norm": 0.953125, "learning_rate": 7.801031283514933e-05, "loss": 1.6832, "num_input_tokens_seen": 1456865280, "step": 1235 }, { "epoch": 0.02784003774920373, "grad_norm": 1.1484375, "learning_rate": 7.800545027498151e-05, "loss": 1.8275, "num_input_tokens_seen": 1458044928, "step": 1236 }, { "epoch": 0.027958004010852897, "grad_norm": 0.984375, "learning_rate": 7.800058193222407e-05, "loss": 1.808, "num_input_tokens_seen": 1459224576, "step": 1237 }, { "epoch": 0.028075970272502065, "grad_norm": 1.2578125, "learning_rate": 7.799570780761775e-05, "loss": 1.6494, "num_input_tokens_seen": 1460404224, "step": 1238 }, { "epoch": 0.028193936534151233, "grad_norm": 0.96875, "learning_rate": 7.799082790190414e-05, "loss": 1.6716, "num_input_tokens_seen": 1461583872, "step": 1239 }, { "epoch": 0.0283119027958004, "grad_norm": 1.125, "learning_rate": 7.798594221582575e-05, "loss": 1.7444, "num_input_tokens_seen": 1462763520, "step": 1240 }, { "epoch": 0.02842986905744957, "grad_norm": 1.015625, "learning_rate": 7.798105075012591e-05, "loss": 1.6863, "num_input_tokens_seen": 1463943168, "step": 1241 }, { "epoch": 0.028547835319098738, "grad_norm": 0.87109375, "learning_rate": 7.797615350554892e-05, "loss": 1.6711, "num_input_tokens_seen": 1465122816, "step": 1242 }, { "epoch": 0.028665801580747906, "grad_norm": 1.0703125, "learning_rate": 7.797125048283985e-05, "loss": 1.612, "num_input_tokens_seen": 1466302464, "step": 1243 }, { "epoch": 0.028783767842397074, "grad_norm": 0.9609375, "learning_rate": 7.796634168274473e-05, "loss": 1.6763, "num_input_tokens_seen": 1467482112, "step": 1244 }, { "epoch": 0.028901734104046242, "grad_norm": 1.03125, "learning_rate": 7.796142710601041e-05, "loss": 1.6243, "num_input_tokens_seen": 1468661760, "step": 1245 }, { "epoch": 0.02901970036569541, "grad_norm": 1.0, "learning_rate": 7.795650675338471e-05, "loss": 1.6789, "num_input_tokens_seen": 1469841408, "step": 1246 }, { "epoch": 0.02913766662734458, "grad_norm": 1.125, "learning_rate": 7.795158062561622e-05, "loss": 1.5523, "num_input_tokens_seen": 1471021056, "step": 1247 }, { "epoch": 0.029255632888993747, "grad_norm": 1.2578125, "learning_rate": 7.794664872345446e-05, "loss": 1.5118, "num_input_tokens_seen": 1472200704, "step": 1248 }, { "epoch": 0.029373599150642915, "grad_norm": 1.0859375, "learning_rate": 7.794171104764984e-05, "loss": 1.6745, "num_input_tokens_seen": 1473380352, "step": 1249 }, { "epoch": 0.029491565412292083, "grad_norm": 1.0390625, "learning_rate": 7.793676759895363e-05, "loss": 1.7021, "num_input_tokens_seen": 1474560000, "step": 1250 }, { "epoch": 0.02960953167394125, "grad_norm": 0.921875, "learning_rate": 7.793181837811801e-05, "loss": 1.7004, "num_input_tokens_seen": 1475739648, "step": 1251 }, { "epoch": 0.02972749793559042, "grad_norm": 1.1640625, "learning_rate": 7.792686338589598e-05, "loss": 1.6231, "num_input_tokens_seen": 1476919296, "step": 1252 }, { "epoch": 0.029845464197239588, "grad_norm": 0.80859375, "learning_rate": 7.792190262304146e-05, "loss": 1.7307, "num_input_tokens_seen": 1478098944, "step": 1253 }, { "epoch": 0.029963430458888756, "grad_norm": 1.25, "learning_rate": 7.791693609030922e-05, "loss": 1.4952, "num_input_tokens_seen": 1479278592, "step": 1254 }, { "epoch": 0.030081396720537928, "grad_norm": 1.046875, "learning_rate": 7.791196378845495e-05, "loss": 1.6335, "num_input_tokens_seen": 1480458240, "step": 1255 }, { "epoch": 0.030199362982187096, "grad_norm": 1.140625, "learning_rate": 7.79069857182352e-05, "loss": 1.7717, "num_input_tokens_seen": 1481637888, "step": 1256 }, { "epoch": 0.030317329243836264, "grad_norm": 1.078125, "learning_rate": 7.790200188040737e-05, "loss": 1.6776, "num_input_tokens_seen": 1482817536, "step": 1257 }, { "epoch": 0.030435295505485432, "grad_norm": 0.98828125, "learning_rate": 7.789701227572977e-05, "loss": 1.8233, "num_input_tokens_seen": 1483997184, "step": 1258 }, { "epoch": 0.0305532617671346, "grad_norm": 1.0390625, "learning_rate": 7.789201690496157e-05, "loss": 1.563, "num_input_tokens_seen": 1485176832, "step": 1259 }, { "epoch": 0.03067122802878377, "grad_norm": 1.0859375, "learning_rate": 7.788701576886283e-05, "loss": 1.6999, "num_input_tokens_seen": 1486356480, "step": 1260 }, { "epoch": 0.030789194290432937, "grad_norm": 1.0, "learning_rate": 7.788200886819447e-05, "loss": 1.627, "num_input_tokens_seen": 1487536128, "step": 1261 }, { "epoch": 0.030907160552082105, "grad_norm": 0.9296875, "learning_rate": 7.787699620371831e-05, "loss": 1.7276, "num_input_tokens_seen": 1488715776, "step": 1262 }, { "epoch": 0.031025126813731273, "grad_norm": 1.0625, "learning_rate": 7.787197777619703e-05, "loss": 1.6639, "num_input_tokens_seen": 1489895424, "step": 1263 }, { "epoch": 0.03114309307538044, "grad_norm": 1.1015625, "learning_rate": 7.786695358639419e-05, "loss": 1.7681, "num_input_tokens_seen": 1491075072, "step": 1264 }, { "epoch": 0.031261059337029606, "grad_norm": 1.046875, "learning_rate": 7.786192363507423e-05, "loss": 1.6764, "num_input_tokens_seen": 1492254720, "step": 1265 }, { "epoch": 0.03137902559867878, "grad_norm": 1.0, "learning_rate": 7.785688792300247e-05, "loss": 1.626, "num_input_tokens_seen": 1493434368, "step": 1266 }, { "epoch": 0.03149699186032794, "grad_norm": 0.94140625, "learning_rate": 7.785184645094509e-05, "loss": 1.6718, "num_input_tokens_seen": 1494614016, "step": 1267 }, { "epoch": 0.031614958121977114, "grad_norm": 0.96875, "learning_rate": 7.784679921966916e-05, "loss": 1.6437, "num_input_tokens_seen": 1495793664, "step": 1268 }, { "epoch": 0.031732924383626286, "grad_norm": 0.9921875, "learning_rate": 7.784174622994265e-05, "loss": 1.6631, "num_input_tokens_seen": 1496973312, "step": 1269 }, { "epoch": 0.03185089064527545, "grad_norm": 0.97265625, "learning_rate": 7.783668748253435e-05, "loss": 1.6691, "num_input_tokens_seen": 1498152960, "step": 1270 }, { "epoch": 0.03196885690692462, "grad_norm": 0.8984375, "learning_rate": 7.783162297821396e-05, "loss": 1.7088, "num_input_tokens_seen": 1499332608, "step": 1271 }, { "epoch": 0.03208682316857379, "grad_norm": 1.1015625, "learning_rate": 7.782655271775206e-05, "loss": 1.6545, "num_input_tokens_seen": 1500512256, "step": 1272 }, { "epoch": 0.03220478943022296, "grad_norm": 0.7890625, "learning_rate": 7.78214767019201e-05, "loss": 1.701, "num_input_tokens_seen": 1501691904, "step": 1273 }, { "epoch": 0.032322755691872124, "grad_norm": 1.03125, "learning_rate": 7.781639493149041e-05, "loss": 1.5984, "num_input_tokens_seen": 1502871552, "step": 1274 }, { "epoch": 0.032440721953521295, "grad_norm": 0.90234375, "learning_rate": 7.781130740723616e-05, "loss": 1.7417, "num_input_tokens_seen": 1504051200, "step": 1275 }, { "epoch": 0.03255868821517046, "grad_norm": 1.125, "learning_rate": 7.780621412993146e-05, "loss": 1.6722, "num_input_tokens_seen": 1505230848, "step": 1276 }, { "epoch": 0.03267665447681963, "grad_norm": 0.99609375, "learning_rate": 7.780111510035124e-05, "loss": 1.6619, "num_input_tokens_seen": 1506410496, "step": 1277 }, { "epoch": 0.0327946207384688, "grad_norm": 0.86328125, "learning_rate": 7.779601031927133e-05, "loss": 1.7004, "num_input_tokens_seen": 1507590144, "step": 1278 }, { "epoch": 0.03291258700011797, "grad_norm": 1.03125, "learning_rate": 7.779089978746844e-05, "loss": 1.5487, "num_input_tokens_seen": 1508769792, "step": 1279 }, { "epoch": 0.03303055326176713, "grad_norm": 0.9765625, "learning_rate": 7.778578350572012e-05, "loss": 1.5906, "num_input_tokens_seen": 1509949440, "step": 1280 }, { "epoch": 0.033148519523416305, "grad_norm": 0.91796875, "learning_rate": 7.778066147480484e-05, "loss": 1.7207, "num_input_tokens_seen": 1511129088, "step": 1281 }, { "epoch": 0.03326648578506547, "grad_norm": 1.0078125, "learning_rate": 7.777553369550193e-05, "loss": 1.7121, "num_input_tokens_seen": 1512308736, "step": 1282 }, { "epoch": 0.03338445204671464, "grad_norm": 1.015625, "learning_rate": 7.777040016859158e-05, "loss": 1.6065, "num_input_tokens_seen": 1513488384, "step": 1283 }, { "epoch": 0.033502418308363806, "grad_norm": 1.0234375, "learning_rate": 7.776526089485487e-05, "loss": 1.5206, "num_input_tokens_seen": 1514668032, "step": 1284 }, { "epoch": 0.03362038457001298, "grad_norm": 0.9609375, "learning_rate": 7.776011587507374e-05, "loss": 1.6641, "num_input_tokens_seen": 1515847680, "step": 1285 }, { "epoch": 0.03373835083166214, "grad_norm": 1.03125, "learning_rate": 7.775496511003101e-05, "loss": 1.6269, "num_input_tokens_seen": 1517027328, "step": 1286 }, { "epoch": 0.033856317093311314, "grad_norm": 1.0546875, "learning_rate": 7.774980860051039e-05, "loss": 1.7265, "num_input_tokens_seen": 1518206976, "step": 1287 }, { "epoch": 0.03397428335496048, "grad_norm": 0.90234375, "learning_rate": 7.774464634729645e-05, "loss": 1.6951, "num_input_tokens_seen": 1519386624, "step": 1288 }, { "epoch": 0.03409224961660965, "grad_norm": 0.92578125, "learning_rate": 7.773947835117464e-05, "loss": 1.6725, "num_input_tokens_seen": 1520566272, "step": 1289 }, { "epoch": 0.034210215878258815, "grad_norm": 1.015625, "learning_rate": 7.773430461293124e-05, "loss": 1.6742, "num_input_tokens_seen": 1521745920, "step": 1290 }, { "epoch": 0.03432818213990799, "grad_norm": 1.0, "learning_rate": 7.77291251333535e-05, "loss": 1.7581, "num_input_tokens_seen": 1522925568, "step": 1291 }, { "epoch": 0.03444614840155715, "grad_norm": 1.0703125, "learning_rate": 7.772393991322946e-05, "loss": 1.668, "num_input_tokens_seen": 1524105216, "step": 1292 }, { "epoch": 0.03456411466320632, "grad_norm": 1.1875, "learning_rate": 7.771874895334805e-05, "loss": 1.5293, "num_input_tokens_seen": 1525284864, "step": 1293 }, { "epoch": 0.03468208092485549, "grad_norm": 0.8828125, "learning_rate": 7.771355225449908e-05, "loss": 1.6373, "num_input_tokens_seen": 1526464512, "step": 1294 }, { "epoch": 0.03480004718650466, "grad_norm": 0.92578125, "learning_rate": 7.770834981747326e-05, "loss": 1.6129, "num_input_tokens_seen": 1527644160, "step": 1295 }, { "epoch": 0.03491801344815383, "grad_norm": 1.1875, "learning_rate": 7.770314164306213e-05, "loss": 1.693, "num_input_tokens_seen": 1528823808, "step": 1296 }, { "epoch": 0.035035979709802996, "grad_norm": 0.859375, "learning_rate": 7.769792773205813e-05, "loss": 1.7876, "num_input_tokens_seen": 1530003456, "step": 1297 }, { "epoch": 0.03515394597145217, "grad_norm": 0.953125, "learning_rate": 7.769270808525456e-05, "loss": 1.7498, "num_input_tokens_seen": 1531183104, "step": 1298 }, { "epoch": 0.03527191223310133, "grad_norm": 0.88671875, "learning_rate": 7.76874827034456e-05, "loss": 1.7111, "num_input_tokens_seen": 1532362752, "step": 1299 }, { "epoch": 0.035389878494750504, "grad_norm": 0.875, "learning_rate": 7.76822515874263e-05, "loss": 1.8188, "num_input_tokens_seen": 1533542400, "step": 1300 }, { "epoch": 0.03550784475639967, "grad_norm": 0.86328125, "learning_rate": 7.767701473799259e-05, "loss": 1.6833, "num_input_tokens_seen": 1534722048, "step": 1301 }, { "epoch": 0.03562581101804884, "grad_norm": 0.95703125, "learning_rate": 7.767177215594125e-05, "loss": 1.6609, "num_input_tokens_seen": 1535901696, "step": 1302 }, { "epoch": 0.035743777279698005, "grad_norm": 0.8671875, "learning_rate": 7.766652384206994e-05, "loss": 1.7225, "num_input_tokens_seen": 1537081344, "step": 1303 }, { "epoch": 0.03586174354134718, "grad_norm": 0.9375, "learning_rate": 7.766126979717725e-05, "loss": 1.6236, "num_input_tokens_seen": 1538260992, "step": 1304 }, { "epoch": 0.03597970980299634, "grad_norm": 0.8828125, "learning_rate": 7.765601002206253e-05, "loss": 1.6974, "num_input_tokens_seen": 1539440640, "step": 1305 }, { "epoch": 0.03609767606464551, "grad_norm": 0.96484375, "learning_rate": 7.76507445175261e-05, "loss": 1.8008, "num_input_tokens_seen": 1540620288, "step": 1306 }, { "epoch": 0.03621564232629468, "grad_norm": 1.0390625, "learning_rate": 7.764547328436909e-05, "loss": 1.5457, "num_input_tokens_seen": 1541799936, "step": 1307 }, { "epoch": 0.03633360858794385, "grad_norm": 1.0078125, "learning_rate": 7.764019632339355e-05, "loss": 1.6641, "num_input_tokens_seen": 1542979584, "step": 1308 }, { "epoch": 0.036451574849593014, "grad_norm": 1.1796875, "learning_rate": 7.763491363540237e-05, "loss": 1.6156, "num_input_tokens_seen": 1544159232, "step": 1309 }, { "epoch": 0.036569541111242186, "grad_norm": 1.1015625, "learning_rate": 7.762962522119931e-05, "loss": 1.5664, "num_input_tokens_seen": 1545338880, "step": 1310 }, { "epoch": 0.03668750737289135, "grad_norm": 1.0, "learning_rate": 7.762433108158903e-05, "loss": 1.7106, "num_input_tokens_seen": 1546518528, "step": 1311 }, { "epoch": 0.03680547363454052, "grad_norm": 1.046875, "learning_rate": 7.761903121737702e-05, "loss": 1.7238, "num_input_tokens_seen": 1547698176, "step": 1312 }, { "epoch": 0.03692343989618969, "grad_norm": 0.94140625, "learning_rate": 7.761372562936966e-05, "loss": 1.6679, "num_input_tokens_seen": 1548877824, "step": 1313 }, { "epoch": 0.03704140615783886, "grad_norm": 1.234375, "learning_rate": 7.760841431837424e-05, "loss": 1.5861, "num_input_tokens_seen": 1550057472, "step": 1314 }, { "epoch": 0.037159372419488024, "grad_norm": 1.0546875, "learning_rate": 7.760309728519884e-05, "loss": 1.5827, "num_input_tokens_seen": 1551237120, "step": 1315 }, { "epoch": 0.037277338681137195, "grad_norm": 1.0, "learning_rate": 7.75977745306525e-05, "loss": 1.6241, "num_input_tokens_seen": 1552416768, "step": 1316 }, { "epoch": 0.03739530494278636, "grad_norm": 0.94921875, "learning_rate": 7.759244605554504e-05, "loss": 1.5647, "num_input_tokens_seen": 1553596416, "step": 1317 }, { "epoch": 0.03751327120443553, "grad_norm": 1.0390625, "learning_rate": 7.758711186068723e-05, "loss": 1.6642, "num_input_tokens_seen": 1554776064, "step": 1318 }, { "epoch": 0.0376312374660847, "grad_norm": 1.0, "learning_rate": 7.758177194689065e-05, "loss": 1.6156, "num_input_tokens_seen": 1555955712, "step": 1319 }, { "epoch": 0.03774920372773387, "grad_norm": 0.98046875, "learning_rate": 7.75764263149678e-05, "loss": 1.6103, "num_input_tokens_seen": 1557135360, "step": 1320 }, { "epoch": 0.03786716998938304, "grad_norm": 0.98828125, "learning_rate": 7.757107496573202e-05, "loss": 1.6254, "num_input_tokens_seen": 1558315008, "step": 1321 }, { "epoch": 0.037985136251032205, "grad_norm": 0.96484375, "learning_rate": 7.756571789999751e-05, "loss": 1.6553, "num_input_tokens_seen": 1559494656, "step": 1322 }, { "epoch": 0.038103102512681376, "grad_norm": 1.046875, "learning_rate": 7.756035511857937e-05, "loss": 1.7126, "num_input_tokens_seen": 1560674304, "step": 1323 }, { "epoch": 0.03822106877433054, "grad_norm": 0.87890625, "learning_rate": 7.755498662229356e-05, "loss": 1.6802, "num_input_tokens_seen": 1561853952, "step": 1324 }, { "epoch": 0.03833903503597971, "grad_norm": 0.984375, "learning_rate": 7.754961241195689e-05, "loss": 1.6442, "num_input_tokens_seen": 1563033600, "step": 1325 }, { "epoch": 0.03845700129762888, "grad_norm": 0.9453125, "learning_rate": 7.754423248838708e-05, "loss": 1.648, "num_input_tokens_seen": 1564213248, "step": 1326 }, { "epoch": 0.03857496755927805, "grad_norm": 1.046875, "learning_rate": 7.753884685240267e-05, "loss": 1.6544, "num_input_tokens_seen": 1565392896, "step": 1327 }, { "epoch": 0.038692933820927214, "grad_norm": 0.99609375, "learning_rate": 7.75334555048231e-05, "loss": 1.6477, "num_input_tokens_seen": 1566572544, "step": 1328 }, { "epoch": 0.038810900082576386, "grad_norm": 1.0625, "learning_rate": 7.752805844646867e-05, "loss": 1.6371, "num_input_tokens_seen": 1567752192, "step": 1329 }, { "epoch": 0.03892886634422555, "grad_norm": 1.171875, "learning_rate": 7.752265567816055e-05, "loss": 1.7027, "num_input_tokens_seen": 1568931840, "step": 1330 }, { "epoch": 0.03904683260587472, "grad_norm": 0.9140625, "learning_rate": 7.75172472007208e-05, "loss": 1.6617, "num_input_tokens_seen": 1570111488, "step": 1331 }, { "epoch": 0.03916479886752389, "grad_norm": 0.9140625, "learning_rate": 7.75118330149723e-05, "loss": 1.7437, "num_input_tokens_seen": 1571291136, "step": 1332 }, { "epoch": 0.03928276512917306, "grad_norm": 1.015625, "learning_rate": 7.750641312173886e-05, "loss": 1.6968, "num_input_tokens_seen": 1572470784, "step": 1333 }, { "epoch": 0.03940073139082222, "grad_norm": 1.0234375, "learning_rate": 7.750098752184507e-05, "loss": 1.5903, "num_input_tokens_seen": 1573650432, "step": 1334 }, { "epoch": 0.039518697652471395, "grad_norm": 0.85546875, "learning_rate": 7.74955562161165e-05, "loss": 1.6188, "num_input_tokens_seen": 1574830080, "step": 1335 }, { "epoch": 0.03963666391412056, "grad_norm": 1.09375, "learning_rate": 7.749011920537951e-05, "loss": 1.5991, "num_input_tokens_seen": 1576009728, "step": 1336 }, { "epoch": 0.03975463017576973, "grad_norm": 0.94140625, "learning_rate": 7.748467649046135e-05, "loss": 1.5751, "num_input_tokens_seen": 1577189376, "step": 1337 }, { "epoch": 0.039872596437418896, "grad_norm": 0.90625, "learning_rate": 7.747922807219012e-05, "loss": 1.6036, "num_input_tokens_seen": 1578369024, "step": 1338 }, { "epoch": 0.03999056269906807, "grad_norm": 0.9375, "learning_rate": 7.747377395139484e-05, "loss": 1.6377, "num_input_tokens_seen": 1579548672, "step": 1339 }, { "epoch": 0.04010852896071723, "grad_norm": 1.0390625, "learning_rate": 7.746831412890534e-05, "loss": 1.6241, "num_input_tokens_seen": 1580728320, "step": 1340 }, { "epoch": 0.040226495222366404, "grad_norm": 0.875, "learning_rate": 7.746284860555235e-05, "loss": 1.6902, "num_input_tokens_seen": 1581907968, "step": 1341 }, { "epoch": 0.04034446148401557, "grad_norm": 0.98828125, "learning_rate": 7.745737738216745e-05, "loss": 1.7062, "num_input_tokens_seen": 1583087616, "step": 1342 }, { "epoch": 0.04046242774566474, "grad_norm": 0.9375, "learning_rate": 7.74519004595831e-05, "loss": 1.6637, "num_input_tokens_seen": 1584267264, "step": 1343 }, { "epoch": 0.040580394007313905, "grad_norm": 0.8359375, "learning_rate": 7.744641783863263e-05, "loss": 1.725, "num_input_tokens_seen": 1585446912, "step": 1344 }, { "epoch": 0.04069836026896308, "grad_norm": 1.046875, "learning_rate": 7.744092952015023e-05, "loss": 1.6461, "num_input_tokens_seen": 1586626560, "step": 1345 }, { "epoch": 0.04081632653061224, "grad_norm": 0.97265625, "learning_rate": 7.743543550497093e-05, "loss": 1.7593, "num_input_tokens_seen": 1587806208, "step": 1346 }, { "epoch": 0.04093429279226141, "grad_norm": 0.9609375, "learning_rate": 7.742993579393067e-05, "loss": 1.5863, "num_input_tokens_seen": 1588985856, "step": 1347 }, { "epoch": 0.041052259053910585, "grad_norm": 0.9140625, "learning_rate": 7.742443038786625e-05, "loss": 1.6812, "num_input_tokens_seen": 1590165504, "step": 1348 }, { "epoch": 0.04117022531555975, "grad_norm": 1.0703125, "learning_rate": 7.741891928761533e-05, "loss": 1.5854, "num_input_tokens_seen": 1591345152, "step": 1349 }, { "epoch": 0.04128819157720892, "grad_norm": 0.90625, "learning_rate": 7.74134024940164e-05, "loss": 1.7451, "num_input_tokens_seen": 1592524800, "step": 1350 }, { "epoch": 0.041406157838858086, "grad_norm": 0.859375, "learning_rate": 7.740788000790888e-05, "loss": 1.7746, "num_input_tokens_seen": 1593704448, "step": 1351 }, { "epoch": 0.04152412410050726, "grad_norm": 0.8828125, "learning_rate": 7.740235183013301e-05, "loss": 1.6677, "num_input_tokens_seen": 1594884096, "step": 1352 }, { "epoch": 0.04164209036215642, "grad_norm": 1.109375, "learning_rate": 7.739681796152992e-05, "loss": 1.6485, "num_input_tokens_seen": 1596063744, "step": 1353 }, { "epoch": 0.041760056623805594, "grad_norm": 0.98046875, "learning_rate": 7.739127840294159e-05, "loss": 1.7374, "num_input_tokens_seen": 1597243392, "step": 1354 }, { "epoch": 0.04187802288545476, "grad_norm": 0.91796875, "learning_rate": 7.738573315521088e-05, "loss": 1.6461, "num_input_tokens_seen": 1598423040, "step": 1355 }, { "epoch": 0.04199598914710393, "grad_norm": 0.99609375, "learning_rate": 7.738018221918148e-05, "loss": 1.7057, "num_input_tokens_seen": 1599602688, "step": 1356 }, { "epoch": 0.042113955408753095, "grad_norm": 1.2109375, "learning_rate": 7.737462559569803e-05, "loss": 1.5765, "num_input_tokens_seen": 1600782336, "step": 1357 }, { "epoch": 0.04223192167040227, "grad_norm": 0.921875, "learning_rate": 7.736906328560593e-05, "loss": 1.5281, "num_input_tokens_seen": 1601961984, "step": 1358 }, { "epoch": 0.04234988793205143, "grad_norm": 1.1484375, "learning_rate": 7.736349528975151e-05, "loss": 1.7288, "num_input_tokens_seen": 1603141632, "step": 1359 }, { "epoch": 0.042467854193700603, "grad_norm": 1.078125, "learning_rate": 7.735792160898194e-05, "loss": 1.5428, "num_input_tokens_seen": 1604321280, "step": 1360 }, { "epoch": 0.04258582045534977, "grad_norm": 0.9140625, "learning_rate": 7.735234224414528e-05, "loss": 1.5481, "num_input_tokens_seen": 1605500928, "step": 1361 }, { "epoch": 0.04270378671699894, "grad_norm": 1.0078125, "learning_rate": 7.734675719609046e-05, "loss": 1.6189, "num_input_tokens_seen": 1606680576, "step": 1362 }, { "epoch": 0.042821752978648105, "grad_norm": 0.7890625, "learning_rate": 7.73411664656672e-05, "loss": 1.6773, "num_input_tokens_seen": 1607860224, "step": 1363 }, { "epoch": 0.042939719240297276, "grad_norm": 1.1640625, "learning_rate": 7.733557005372616e-05, "loss": 1.482, "num_input_tokens_seen": 1609039872, "step": 1364 }, { "epoch": 0.04305768550194644, "grad_norm": 0.94921875, "learning_rate": 7.732996796111887e-05, "loss": 1.6277, "num_input_tokens_seen": 1610219520, "step": 1365 }, { "epoch": 0.04317565176359561, "grad_norm": 1.1015625, "learning_rate": 7.732436018869765e-05, "loss": 1.5224, "num_input_tokens_seen": 1611399168, "step": 1366 }, { "epoch": 0.04329361802524478, "grad_norm": 1.0625, "learning_rate": 7.731874673731579e-05, "loss": 1.581, "num_input_tokens_seen": 1612578816, "step": 1367 }, { "epoch": 0.04341158428689395, "grad_norm": 1.1171875, "learning_rate": 7.731312760782732e-05, "loss": 1.5601, "num_input_tokens_seen": 1613758464, "step": 1368 }, { "epoch": 0.043529550548543114, "grad_norm": 0.87890625, "learning_rate": 7.730750280108725e-05, "loss": 1.5671, "num_input_tokens_seen": 1614938112, "step": 1369 }, { "epoch": 0.043647516810192286, "grad_norm": 0.9296875, "learning_rate": 7.730187231795139e-05, "loss": 1.5482, "num_input_tokens_seen": 1616117760, "step": 1370 }, { "epoch": 0.04376548307184145, "grad_norm": 1.046875, "learning_rate": 7.72962361592764e-05, "loss": 1.516, "num_input_tokens_seen": 1617297408, "step": 1371 }, { "epoch": 0.04388344933349062, "grad_norm": 1.046875, "learning_rate": 7.729059432591989e-05, "loss": 1.5741, "num_input_tokens_seen": 1618477056, "step": 1372 }, { "epoch": 0.04400141559513979, "grad_norm": 0.90625, "learning_rate": 7.72849468187402e-05, "loss": 1.5939, "num_input_tokens_seen": 1619656704, "step": 1373 }, { "epoch": 0.04411938185678896, "grad_norm": 1.21875, "learning_rate": 7.727929363859668e-05, "loss": 1.503, "num_input_tokens_seen": 1620836352, "step": 1374 }, { "epoch": 0.04423734811843813, "grad_norm": 0.984375, "learning_rate": 7.727363478634941e-05, "loss": 1.6708, "num_input_tokens_seen": 1622016000, "step": 1375 }, { "epoch": 0.044355314380087295, "grad_norm": 1.0, "learning_rate": 7.726797026285941e-05, "loss": 1.7104, "num_input_tokens_seen": 1623195648, "step": 1376 }, { "epoch": 0.044473280641736467, "grad_norm": 1.015625, "learning_rate": 7.726230006898857e-05, "loss": 1.5368, "num_input_tokens_seen": 1624375296, "step": 1377 }, { "epoch": 0.04459124690338563, "grad_norm": 1.0390625, "learning_rate": 7.725662420559961e-05, "loss": 1.5915, "num_input_tokens_seen": 1625554944, "step": 1378 }, { "epoch": 0.0447092131650348, "grad_norm": 0.97265625, "learning_rate": 7.725094267355611e-05, "loss": 1.5744, "num_input_tokens_seen": 1626734592, "step": 1379 }, { "epoch": 0.04482717942668397, "grad_norm": 0.98046875, "learning_rate": 7.724525547372251e-05, "loss": 1.5304, "num_input_tokens_seen": 1627914240, "step": 1380 }, { "epoch": 0.04494514568833314, "grad_norm": 1.0, "learning_rate": 7.723956260696416e-05, "loss": 1.6273, "num_input_tokens_seen": 1629093888, "step": 1381 }, { "epoch": 0.045063111949982304, "grad_norm": 0.875, "learning_rate": 7.723386407414723e-05, "loss": 1.5527, "num_input_tokens_seen": 1630273536, "step": 1382 }, { "epoch": 0.045181078211631476, "grad_norm": 0.90625, "learning_rate": 7.722815987613875e-05, "loss": 1.6727, "num_input_tokens_seen": 1631453184, "step": 1383 }, { "epoch": 0.04529904447328064, "grad_norm": 0.9765625, "learning_rate": 7.722245001380663e-05, "loss": 1.5183, "num_input_tokens_seen": 1632632832, "step": 1384 }, { "epoch": 0.04541701073492981, "grad_norm": 0.86328125, "learning_rate": 7.721673448801963e-05, "loss": 1.5318, "num_input_tokens_seen": 1633812480, "step": 1385 }, { "epoch": 0.04553497699657898, "grad_norm": 0.98046875, "learning_rate": 7.721101329964737e-05, "loss": 1.5919, "num_input_tokens_seen": 1634992128, "step": 1386 }, { "epoch": 0.04565294325822815, "grad_norm": 1.140625, "learning_rate": 7.720528644956035e-05, "loss": 1.566, "num_input_tokens_seen": 1636171776, "step": 1387 }, { "epoch": 0.04577090951987731, "grad_norm": 0.92578125, "learning_rate": 7.719955393862993e-05, "loss": 1.5904, "num_input_tokens_seen": 1637351424, "step": 1388 }, { "epoch": 0.045888875781526485, "grad_norm": 0.83203125, "learning_rate": 7.719381576772829e-05, "loss": 1.7284, "num_input_tokens_seen": 1638531072, "step": 1389 }, { "epoch": 0.04600684204317565, "grad_norm": 0.98828125, "learning_rate": 7.718807193772853e-05, "loss": 1.5317, "num_input_tokens_seen": 1639710720, "step": 1390 }, { "epoch": 0.04612480830482482, "grad_norm": 0.96875, "learning_rate": 7.718232244950455e-05, "loss": 1.5254, "num_input_tokens_seen": 1640890368, "step": 1391 }, { "epoch": 0.046242774566473986, "grad_norm": 0.99609375, "learning_rate": 7.717656730393119e-05, "loss": 1.5747, "num_input_tokens_seen": 1642070016, "step": 1392 }, { "epoch": 0.04636074082812316, "grad_norm": 1.0, "learning_rate": 7.717080650188407e-05, "loss": 1.606, "num_input_tokens_seen": 1643249664, "step": 1393 }, { "epoch": 0.04647870708977232, "grad_norm": 1.0234375, "learning_rate": 7.716504004423972e-05, "loss": 1.5735, "num_input_tokens_seen": 1644429312, "step": 1394 }, { "epoch": 0.046596673351421494, "grad_norm": 1.015625, "learning_rate": 7.715926793187551e-05, "loss": 1.6629, "num_input_tokens_seen": 1645608960, "step": 1395 }, { "epoch": 0.04671463961307066, "grad_norm": 1.109375, "learning_rate": 7.715349016566968e-05, "loss": 1.6771, "num_input_tokens_seen": 1646788608, "step": 1396 }, { "epoch": 0.04683260587471983, "grad_norm": 0.87890625, "learning_rate": 7.71477067465013e-05, "loss": 1.8481, "num_input_tokens_seen": 1647968256, "step": 1397 }, { "epoch": 0.046950572136368995, "grad_norm": 1.1171875, "learning_rate": 7.714191767525036e-05, "loss": 1.6027, "num_input_tokens_seen": 1649147904, "step": 1398 }, { "epoch": 0.04706853839801817, "grad_norm": 1.015625, "learning_rate": 7.713612295279767e-05, "loss": 1.6044, "num_input_tokens_seen": 1650327552, "step": 1399 }, { "epoch": 0.04718650465966733, "grad_norm": 1.0, "learning_rate": 7.713032258002491e-05, "loss": 1.6524, "num_input_tokens_seen": 1651507200, "step": 1400 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.2798447608947754, "eval_wikipedia_runtime": 161.5268, "eval_wikipedia_samples_per_second": 4.346, "eval_wikipedia_steps_per_second": 0.186, "num_input_tokens_seen": 1651507200, "step": 1400 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 4.040336608886719, "eval_toxicity_runtime": 0.9662, "eval_toxicity_samples_per_second": 2.07, "eval_toxicity_steps_per_second": 1.035, "num_input_tokens_seen": 1651507200, "step": 1400 }, { "epoch": 0.047304470921316503, "grad_norm": 1.03125, "learning_rate": 7.712451655781459e-05, "loss": 1.7851, "num_input_tokens_seen": 1652686848, "step": 1401 }, { "epoch": 0.047422437182965675, "grad_norm": 1.0859375, "learning_rate": 7.711870488705014e-05, "loss": 1.6764, "num_input_tokens_seen": 1653866496, "step": 1402 }, { "epoch": 0.04754040344461484, "grad_norm": 1.015625, "learning_rate": 7.711288756861579e-05, "loss": 1.7252, "num_input_tokens_seen": 1655046144, "step": 1403 }, { "epoch": 0.04765836970626401, "grad_norm": 0.94140625, "learning_rate": 7.710706460339665e-05, "loss": 1.7524, "num_input_tokens_seen": 1656225792, "step": 1404 }, { "epoch": 0.047776335967913176, "grad_norm": 1.0625, "learning_rate": 7.710123599227873e-05, "loss": 1.7855, "num_input_tokens_seen": 1657405440, "step": 1405 }, { "epoch": 0.04789430222956235, "grad_norm": 0.81640625, "learning_rate": 7.709540173614883e-05, "loss": 1.77, "num_input_tokens_seen": 1658585088, "step": 1406 }, { "epoch": 0.04801226849121151, "grad_norm": 1.0390625, "learning_rate": 7.708956183589466e-05, "loss": 1.7499, "num_input_tokens_seen": 1659764736, "step": 1407 }, { "epoch": 0.048130234752860684, "grad_norm": 0.953125, "learning_rate": 7.708371629240474e-05, "loss": 1.5442, "num_input_tokens_seen": 1660944384, "step": 1408 }, { "epoch": 0.04824820101450985, "grad_norm": 0.953125, "learning_rate": 7.707786510656849e-05, "loss": 1.7384, "num_input_tokens_seen": 1662124032, "step": 1409 }, { "epoch": 0.04836616727615902, "grad_norm": 0.88671875, "learning_rate": 7.70720082792762e-05, "loss": 1.7988, "num_input_tokens_seen": 1663303680, "step": 1410 }, { "epoch": 0.048484133537808186, "grad_norm": 0.8828125, "learning_rate": 7.706614581141898e-05, "loss": 1.6953, "num_input_tokens_seen": 1664483328, "step": 1411 }, { "epoch": 0.04860209979945736, "grad_norm": 0.82421875, "learning_rate": 7.706027770388881e-05, "loss": 1.7006, "num_input_tokens_seen": 1665662976, "step": 1412 }, { "epoch": 0.04872006606110652, "grad_norm": 0.85546875, "learning_rate": 7.705440395757855e-05, "loss": 1.8123, "num_input_tokens_seen": 1666842624, "step": 1413 }, { "epoch": 0.048838032322755694, "grad_norm": 0.87890625, "learning_rate": 7.704852457338188e-05, "loss": 1.5978, "num_input_tokens_seen": 1668022272, "step": 1414 }, { "epoch": 0.04895599858440486, "grad_norm": 0.7421875, "learning_rate": 7.704263955219336e-05, "loss": 1.8147, "num_input_tokens_seen": 1669201920, "step": 1415 }, { "epoch": 0.04907396484605403, "grad_norm": 0.87890625, "learning_rate": 7.703674889490842e-05, "loss": 1.6817, "num_input_tokens_seen": 1670381568, "step": 1416 }, { "epoch": 0.049191931107703195, "grad_norm": 0.8359375, "learning_rate": 7.703085260242331e-05, "loss": 1.7774, "num_input_tokens_seen": 1671561216, "step": 1417 }, { "epoch": 0.049309897369352367, "grad_norm": 0.77734375, "learning_rate": 7.702495067563519e-05, "loss": 1.7312, "num_input_tokens_seen": 1672740864, "step": 1418 }, { "epoch": 0.04942786363100153, "grad_norm": 0.82421875, "learning_rate": 7.701904311544202e-05, "loss": 1.6888, "num_input_tokens_seen": 1673920512, "step": 1419 }, { "epoch": 0.0495458298926507, "grad_norm": 0.7578125, "learning_rate": 7.701312992274265e-05, "loss": 1.7375, "num_input_tokens_seen": 1675100160, "step": 1420 }, { "epoch": 0.04966379615429987, "grad_norm": 0.875, "learning_rate": 7.700721109843679e-05, "loss": 1.8633, "num_input_tokens_seen": 1676279808, "step": 1421 }, { "epoch": 0.04978176241594904, "grad_norm": 0.640625, "learning_rate": 7.7001286643425e-05, "loss": 1.7872, "num_input_tokens_seen": 1677459456, "step": 1422 }, { "epoch": 0.049899728677598204, "grad_norm": 0.8828125, "learning_rate": 7.699535655860868e-05, "loss": 1.7318, "num_input_tokens_seen": 1678639104, "step": 1423 }, { "epoch": 0.050017694939247376, "grad_norm": 0.76171875, "learning_rate": 7.698942084489012e-05, "loss": 1.7188, "num_input_tokens_seen": 1679818752, "step": 1424 }, { "epoch": 0.05013566120089654, "grad_norm": 0.859375, "learning_rate": 7.698347950317244e-05, "loss": 1.7577, "num_input_tokens_seen": 1680998400, "step": 1425 }, { "epoch": 0.05025362746254571, "grad_norm": 0.69921875, "learning_rate": 7.697753253435962e-05, "loss": 1.7042, "num_input_tokens_seen": 1682178048, "step": 1426 }, { "epoch": 0.05037159372419488, "grad_norm": 0.76953125, "learning_rate": 7.697157993935651e-05, "loss": 1.7375, "num_input_tokens_seen": 1683357696, "step": 1427 }, { "epoch": 0.05048955998584405, "grad_norm": 0.87109375, "learning_rate": 7.696562171906881e-05, "loss": 1.6805, "num_input_tokens_seen": 1684537344, "step": 1428 }, { "epoch": 0.05060752624749322, "grad_norm": 0.8515625, "learning_rate": 7.695965787440305e-05, "loss": 1.6456, "num_input_tokens_seen": 1685716992, "step": 1429 }, { "epoch": 0.050725492509142385, "grad_norm": 0.93359375, "learning_rate": 7.695368840626666e-05, "loss": 1.7703, "num_input_tokens_seen": 1686896640, "step": 1430 }, { "epoch": 0.05084345877079156, "grad_norm": 0.89453125, "learning_rate": 7.694771331556792e-05, "loss": 1.6603, "num_input_tokens_seen": 1688076288, "step": 1431 }, { "epoch": 0.05096142503244072, "grad_norm": 0.96875, "learning_rate": 7.694173260321589e-05, "loss": 1.6734, "num_input_tokens_seen": 1689255936, "step": 1432 }, { "epoch": 0.05107939129408989, "grad_norm": 0.82421875, "learning_rate": 7.693574627012059e-05, "loss": 1.705, "num_input_tokens_seen": 1690435584, "step": 1433 }, { "epoch": 0.05119735755573906, "grad_norm": 0.97265625, "learning_rate": 7.692975431719285e-05, "loss": 1.795, "num_input_tokens_seen": 1691615232, "step": 1434 }, { "epoch": 0.05131532381738823, "grad_norm": 0.90234375, "learning_rate": 7.692375674534436e-05, "loss": 1.7853, "num_input_tokens_seen": 1692794880, "step": 1435 }, { "epoch": 0.051433290079037394, "grad_norm": 0.76171875, "learning_rate": 7.691775355548763e-05, "loss": 1.7208, "num_input_tokens_seen": 1693974528, "step": 1436 }, { "epoch": 0.051551256340686566, "grad_norm": 0.83984375, "learning_rate": 7.691174474853608e-05, "loss": 1.7193, "num_input_tokens_seen": 1695154176, "step": 1437 }, { "epoch": 0.05166922260233573, "grad_norm": 0.83984375, "learning_rate": 7.690573032540395e-05, "loss": 1.6211, "num_input_tokens_seen": 1696333824, "step": 1438 }, { "epoch": 0.0517871888639849, "grad_norm": 0.83984375, "learning_rate": 7.689971028700635e-05, "loss": 1.8099, "num_input_tokens_seen": 1697513472, "step": 1439 }, { "epoch": 0.05190515512563407, "grad_norm": 0.8671875, "learning_rate": 7.689368463425922e-05, "loss": 1.6112, "num_input_tokens_seen": 1698693120, "step": 1440 }, { "epoch": 0.05202312138728324, "grad_norm": 0.79296875, "learning_rate": 7.68876533680794e-05, "loss": 1.7277, "num_input_tokens_seen": 1699872768, "step": 1441 }, { "epoch": 0.052141087648932403, "grad_norm": 0.87109375, "learning_rate": 7.688161648938454e-05, "loss": 1.6243, "num_input_tokens_seen": 1701052416, "step": 1442 }, { "epoch": 0.052259053910581575, "grad_norm": 0.80078125, "learning_rate": 7.687557399909317e-05, "loss": 1.6371, "num_input_tokens_seen": 1702232064, "step": 1443 }, { "epoch": 0.05237702017223074, "grad_norm": 0.94140625, "learning_rate": 7.686952589812465e-05, "loss": 1.605, "num_input_tokens_seen": 1703411712, "step": 1444 }, { "epoch": 0.05249498643387991, "grad_norm": 0.765625, "learning_rate": 7.686347218739922e-05, "loss": 1.731, "num_input_tokens_seen": 1704591360, "step": 1445 }, { "epoch": 0.052612952695529076, "grad_norm": 0.83984375, "learning_rate": 7.685741286783795e-05, "loss": 1.838, "num_input_tokens_seen": 1705771008, "step": 1446 }, { "epoch": 0.05273091895717825, "grad_norm": 0.81640625, "learning_rate": 7.68513479403628e-05, "loss": 1.6418, "num_input_tokens_seen": 1706950656, "step": 1447 }, { "epoch": 0.05284888521882741, "grad_norm": 0.78125, "learning_rate": 7.684527740589654e-05, "loss": 1.7607, "num_input_tokens_seen": 1708130304, "step": 1448 }, { "epoch": 0.052966851480476584, "grad_norm": 0.859375, "learning_rate": 7.68392012653628e-05, "loss": 1.7545, "num_input_tokens_seen": 1709309952, "step": 1449 }, { "epoch": 0.05308481774212575, "grad_norm": 0.78125, "learning_rate": 7.68331195196861e-05, "loss": 1.712, "num_input_tokens_seen": 1710489600, "step": 1450 }, { "epoch": 0.05320278400377492, "grad_norm": 0.83984375, "learning_rate": 7.682703216979178e-05, "loss": 1.7965, "num_input_tokens_seen": 1711669248, "step": 1451 }, { "epoch": 0.053320750265424086, "grad_norm": 0.80078125, "learning_rate": 7.682093921660603e-05, "loss": 1.6449, "num_input_tokens_seen": 1712848896, "step": 1452 }, { "epoch": 0.05343871652707326, "grad_norm": 0.8046875, "learning_rate": 7.68148406610559e-05, "loss": 1.6439, "num_input_tokens_seen": 1714028544, "step": 1453 }, { "epoch": 0.05355668278872242, "grad_norm": 1.0078125, "learning_rate": 7.680873650406931e-05, "loss": 1.5848, "num_input_tokens_seen": 1715208192, "step": 1454 }, { "epoch": 0.053674649050371594, "grad_norm": 0.828125, "learning_rate": 7.6802626746575e-05, "loss": 1.6039, "num_input_tokens_seen": 1716387840, "step": 1455 }, { "epoch": 0.053792615312020765, "grad_norm": 0.890625, "learning_rate": 7.679651138950259e-05, "loss": 1.6822, "num_input_tokens_seen": 1717567488, "step": 1456 }, { "epoch": 0.05391058157366993, "grad_norm": 0.92578125, "learning_rate": 7.679039043378256e-05, "loss": 1.8276, "num_input_tokens_seen": 1718747136, "step": 1457 }, { "epoch": 0.0540285478353191, "grad_norm": 0.95703125, "learning_rate": 7.678426388034618e-05, "loss": 1.7017, "num_input_tokens_seen": 1719926784, "step": 1458 }, { "epoch": 0.054146514096968267, "grad_norm": 0.96484375, "learning_rate": 7.677813173012566e-05, "loss": 1.6949, "num_input_tokens_seen": 1721106432, "step": 1459 }, { "epoch": 0.05426448035861744, "grad_norm": 1.0546875, "learning_rate": 7.677199398405397e-05, "loss": 1.7601, "num_input_tokens_seen": 1722286080, "step": 1460 }, { "epoch": 0.0543824466202666, "grad_norm": 1.0078125, "learning_rate": 7.676585064306503e-05, "loss": 1.6369, "num_input_tokens_seen": 1723465728, "step": 1461 }, { "epoch": 0.054500412881915775, "grad_norm": 0.78125, "learning_rate": 7.67597017080935e-05, "loss": 1.7194, "num_input_tokens_seen": 1724645376, "step": 1462 }, { "epoch": 0.05461837914356494, "grad_norm": 1.109375, "learning_rate": 7.675354718007501e-05, "loss": 1.6558, "num_input_tokens_seen": 1725825024, "step": 1463 }, { "epoch": 0.05473634540521411, "grad_norm": 0.92578125, "learning_rate": 7.674738705994595e-05, "loss": 1.6609, "num_input_tokens_seen": 1727004672, "step": 1464 }, { "epoch": 0.054854311666863276, "grad_norm": 0.8359375, "learning_rate": 7.674122134864359e-05, "loss": 1.7041, "num_input_tokens_seen": 1728184320, "step": 1465 }, { "epoch": 0.05497227792851245, "grad_norm": 0.86328125, "learning_rate": 7.673505004710604e-05, "loss": 1.563, "num_input_tokens_seen": 1729363968, "step": 1466 }, { "epoch": 0.05509024419016161, "grad_norm": 0.7109375, "learning_rate": 7.672887315627232e-05, "loss": 1.6657, "num_input_tokens_seen": 1730543616, "step": 1467 }, { "epoch": 0.055208210451810784, "grad_norm": 0.82421875, "learning_rate": 7.67226906770822e-05, "loss": 1.7601, "num_input_tokens_seen": 1731723264, "step": 1468 }, { "epoch": 0.05532617671345995, "grad_norm": 0.75390625, "learning_rate": 7.67165026104764e-05, "loss": 1.6698, "num_input_tokens_seen": 1732902912, "step": 1469 }, { "epoch": 0.05544414297510912, "grad_norm": 0.84765625, "learning_rate": 7.671030895739641e-05, "loss": 1.6914, "num_input_tokens_seen": 1734082560, "step": 1470 }, { "epoch": 0.055562109236758285, "grad_norm": 0.765625, "learning_rate": 7.670410971878464e-05, "loss": 2.0048, "num_input_tokens_seen": 1735262208, "step": 1471 }, { "epoch": 0.05568007549840746, "grad_norm": 0.93359375, "learning_rate": 7.669790489558426e-05, "loss": 1.6403, "num_input_tokens_seen": 1736441856, "step": 1472 }, { "epoch": 0.05579804176005662, "grad_norm": 0.92578125, "learning_rate": 7.66916944887394e-05, "loss": 1.78, "num_input_tokens_seen": 1737621504, "step": 1473 }, { "epoch": 0.05591600802170579, "grad_norm": 0.81640625, "learning_rate": 7.668547849919494e-05, "loss": 1.667, "num_input_tokens_seen": 1738801152, "step": 1474 }, { "epoch": 0.05603397428335496, "grad_norm": 0.87109375, "learning_rate": 7.667925692789668e-05, "loss": 1.683, "num_input_tokens_seen": 1739980800, "step": 1475 }, { "epoch": 0.05615194054500413, "grad_norm": 0.921875, "learning_rate": 7.667302977579124e-05, "loss": 1.6781, "num_input_tokens_seen": 1741160448, "step": 1476 }, { "epoch": 0.056269906806653294, "grad_norm": 0.81640625, "learning_rate": 7.666679704382607e-05, "loss": 1.6615, "num_input_tokens_seen": 1742340096, "step": 1477 }, { "epoch": 0.056387873068302466, "grad_norm": 0.83203125, "learning_rate": 7.66605587329495e-05, "loss": 1.6439, "num_input_tokens_seen": 1743519744, "step": 1478 }, { "epoch": 0.05650583932995163, "grad_norm": 0.80859375, "learning_rate": 7.665431484411072e-05, "loss": 1.6852, "num_input_tokens_seen": 1744699392, "step": 1479 }, { "epoch": 0.0566238055916008, "grad_norm": 0.97265625, "learning_rate": 7.664806537825972e-05, "loss": 1.7587, "num_input_tokens_seen": 1745879040, "step": 1480 }, { "epoch": 0.05674177185324997, "grad_norm": 0.82421875, "learning_rate": 7.664181033634738e-05, "loss": 1.6441, "num_input_tokens_seen": 1747058688, "step": 1481 }, { "epoch": 0.05685973811489914, "grad_norm": 0.69921875, "learning_rate": 7.66355497193254e-05, "loss": 1.7134, "num_input_tokens_seen": 1748238336, "step": 1482 }, { "epoch": 0.05697770437654831, "grad_norm": 0.8828125, "learning_rate": 7.662928352814638e-05, "loss": 1.6675, "num_input_tokens_seen": 1749417984, "step": 1483 }, { "epoch": 0.057095670638197475, "grad_norm": 0.8515625, "learning_rate": 7.662301176376367e-05, "loss": 1.657, "num_input_tokens_seen": 1750597632, "step": 1484 }, { "epoch": 0.05721363689984665, "grad_norm": 0.875, "learning_rate": 7.661673442713159e-05, "loss": 1.6178, "num_input_tokens_seen": 1751777280, "step": 1485 }, { "epoch": 0.05733160316149581, "grad_norm": 0.93359375, "learning_rate": 7.661045151920522e-05, "loss": 1.5022, "num_input_tokens_seen": 1752956928, "step": 1486 }, { "epoch": 0.05744956942314498, "grad_norm": 1.0, "learning_rate": 7.66041630409405e-05, "loss": 1.5419, "num_input_tokens_seen": 1754136576, "step": 1487 }, { "epoch": 0.05756753568479415, "grad_norm": 0.9609375, "learning_rate": 7.659786899329426e-05, "loss": 1.6542, "num_input_tokens_seen": 1755316224, "step": 1488 }, { "epoch": 0.05768550194644332, "grad_norm": 0.83984375, "learning_rate": 7.659156937722413e-05, "loss": 1.6363, "num_input_tokens_seen": 1756495872, "step": 1489 }, { "epoch": 0.057803468208092484, "grad_norm": 0.953125, "learning_rate": 7.658526419368863e-05, "loss": 1.7117, "num_input_tokens_seen": 1757675520, "step": 1490 }, { "epoch": 0.057921434469741656, "grad_norm": 0.94140625, "learning_rate": 7.657895344364707e-05, "loss": 1.6056, "num_input_tokens_seen": 1758855168, "step": 1491 }, { "epoch": 0.05803940073139082, "grad_norm": 0.7890625, "learning_rate": 7.657263712805966e-05, "loss": 1.7623, "num_input_tokens_seen": 1760034816, "step": 1492 }, { "epoch": 0.05815736699303999, "grad_norm": 0.93359375, "learning_rate": 7.656631524788743e-05, "loss": 1.5595, "num_input_tokens_seen": 1761214464, "step": 1493 }, { "epoch": 0.05827533325468916, "grad_norm": 0.70703125, "learning_rate": 7.655998780409228e-05, "loss": 1.7031, "num_input_tokens_seen": 1762394112, "step": 1494 }, { "epoch": 0.05839329951633833, "grad_norm": 0.98828125, "learning_rate": 7.655365479763693e-05, "loss": 1.5904, "num_input_tokens_seen": 1763573760, "step": 1495 }, { "epoch": 0.058511265777987494, "grad_norm": 0.83203125, "learning_rate": 7.654731622948494e-05, "loss": 1.7596, "num_input_tokens_seen": 1764753408, "step": 1496 }, { "epoch": 0.058629232039636665, "grad_norm": 0.87109375, "learning_rate": 7.654097210060076e-05, "loss": 1.7201, "num_input_tokens_seen": 1765933056, "step": 1497 }, { "epoch": 0.05874719830128583, "grad_norm": 0.7578125, "learning_rate": 7.653462241194965e-05, "loss": 1.5919, "num_input_tokens_seen": 1767112704, "step": 1498 }, { "epoch": 0.058865164562935, "grad_norm": 0.77734375, "learning_rate": 7.652826716449773e-05, "loss": 1.718, "num_input_tokens_seen": 1768292352, "step": 1499 }, { "epoch": 0.058983130824584167, "grad_norm": 0.9609375, "learning_rate": 7.652190635921195e-05, "loss": 1.6418, "num_input_tokens_seen": 1769472000, "step": 1500 }, { "epoch": 0.05910109708623334, "grad_norm": 0.78125, "learning_rate": 7.651553999706011e-05, "loss": 1.601, "num_input_tokens_seen": 1770651648, "step": 1501 }, { "epoch": 0.0592190633478825, "grad_norm": 0.94921875, "learning_rate": 7.65091680790109e-05, "loss": 1.7251, "num_input_tokens_seen": 1771831296, "step": 1502 }, { "epoch": 0.059337029609531675, "grad_norm": 0.8828125, "learning_rate": 7.650279060603376e-05, "loss": 1.611, "num_input_tokens_seen": 1773010944, "step": 1503 }, { "epoch": 0.05945499587118084, "grad_norm": 0.8671875, "learning_rate": 7.649640757909907e-05, "loss": 1.6156, "num_input_tokens_seen": 1774190592, "step": 1504 }, { "epoch": 0.05957296213283001, "grad_norm": 0.796875, "learning_rate": 7.649001899917804e-05, "loss": 1.6175, "num_input_tokens_seen": 1775370240, "step": 1505 }, { "epoch": 0.059690928394479176, "grad_norm": 0.9140625, "learning_rate": 7.648362486724266e-05, "loss": 1.6087, "num_input_tokens_seen": 1776549888, "step": 1506 }, { "epoch": 0.05980889465612835, "grad_norm": 0.82421875, "learning_rate": 7.64772251842658e-05, "loss": 1.6076, "num_input_tokens_seen": 1777729536, "step": 1507 }, { "epoch": 0.05992686091777751, "grad_norm": 0.953125, "learning_rate": 7.647081995122122e-05, "loss": 1.5446, "num_input_tokens_seen": 1778909184, "step": 1508 }, { "epoch": 0.060044827179426684, "grad_norm": 0.76953125, "learning_rate": 7.646440916908347e-05, "loss": 1.8014, "num_input_tokens_seen": 1780088832, "step": 1509 }, { "epoch": 0.060162793441075856, "grad_norm": 0.88671875, "learning_rate": 7.645799283882797e-05, "loss": 1.736, "num_input_tokens_seen": 1781268480, "step": 1510 }, { "epoch": 0.06028075970272502, "grad_norm": 0.97265625, "learning_rate": 7.645157096143096e-05, "loss": 1.6802, "num_input_tokens_seen": 1782448128, "step": 1511 }, { "epoch": 0.06039872596437419, "grad_norm": 0.92578125, "learning_rate": 7.644514353786954e-05, "loss": 1.7258, "num_input_tokens_seen": 1783627776, "step": 1512 }, { "epoch": 0.06051669222602336, "grad_norm": 0.9140625, "learning_rate": 7.643871056912165e-05, "loss": 1.5829, "num_input_tokens_seen": 1784807424, "step": 1513 }, { "epoch": 0.06063465848767253, "grad_norm": 0.96484375, "learning_rate": 7.64322720561661e-05, "loss": 1.6509, "num_input_tokens_seen": 1785987072, "step": 1514 }, { "epoch": 0.06075262474932169, "grad_norm": 0.81640625, "learning_rate": 7.642582799998249e-05, "loss": 1.6295, "num_input_tokens_seen": 1787166720, "step": 1515 }, { "epoch": 0.060870591010970865, "grad_norm": 0.8671875, "learning_rate": 7.641937840155132e-05, "loss": 1.6291, "num_input_tokens_seen": 1788346368, "step": 1516 }, { "epoch": 0.06098855727262003, "grad_norm": 0.90234375, "learning_rate": 7.641292326185389e-05, "loss": 1.6605, "num_input_tokens_seen": 1789526016, "step": 1517 }, { "epoch": 0.0611065235342692, "grad_norm": 0.80078125, "learning_rate": 7.640646258187236e-05, "loss": 1.6616, "num_input_tokens_seen": 1790705664, "step": 1518 }, { "epoch": 0.061224489795918366, "grad_norm": 0.75, "learning_rate": 7.639999636258972e-05, "loss": 1.6898, "num_input_tokens_seen": 1791885312, "step": 1519 }, { "epoch": 0.06134245605756754, "grad_norm": 0.875, "learning_rate": 7.639352460498985e-05, "loss": 1.7334, "num_input_tokens_seen": 1793064960, "step": 1520 }, { "epoch": 0.0614604223192167, "grad_norm": 0.83203125, "learning_rate": 7.638704731005741e-05, "loss": 1.5819, "num_input_tokens_seen": 1794244608, "step": 1521 }, { "epoch": 0.061578388580865874, "grad_norm": 0.7890625, "learning_rate": 7.638056447877795e-05, "loss": 1.7682, "num_input_tokens_seen": 1795424256, "step": 1522 }, { "epoch": 0.06169635484251504, "grad_norm": 0.88671875, "learning_rate": 7.637407611213783e-05, "loss": 1.6932, "num_input_tokens_seen": 1796603904, "step": 1523 }, { "epoch": 0.06181432110416421, "grad_norm": 0.734375, "learning_rate": 7.636758221112428e-05, "loss": 1.6488, "num_input_tokens_seen": 1797783552, "step": 1524 }, { "epoch": 0.061932287365813375, "grad_norm": 0.84765625, "learning_rate": 7.636108277672533e-05, "loss": 1.7371, "num_input_tokens_seen": 1798963200, "step": 1525 }, { "epoch": 0.06205025362746255, "grad_norm": 0.80078125, "learning_rate": 7.635457780992989e-05, "loss": 1.7218, "num_input_tokens_seen": 1800142848, "step": 1526 }, { "epoch": 0.06216821988911171, "grad_norm": 0.81640625, "learning_rate": 7.634806731172772e-05, "loss": 1.7231, "num_input_tokens_seen": 1801322496, "step": 1527 }, { "epoch": 0.06228618615076088, "grad_norm": 0.81640625, "learning_rate": 7.634155128310938e-05, "loss": 1.6153, "num_input_tokens_seen": 1802502144, "step": 1528 }, { "epoch": 0.06240415241241005, "grad_norm": 0.92578125, "learning_rate": 7.63350297250663e-05, "loss": 1.6892, "num_input_tokens_seen": 1803681792, "step": 1529 }, { "epoch": 0.06252211867405921, "grad_norm": 0.890625, "learning_rate": 7.632850263859077e-05, "loss": 1.5461, "num_input_tokens_seen": 1804861440, "step": 1530 }, { "epoch": 0.06264008493570838, "grad_norm": 0.87890625, "learning_rate": 7.632197002467586e-05, "loss": 1.7642, "num_input_tokens_seen": 1806041088, "step": 1531 }, { "epoch": 0.06275805119735756, "grad_norm": 0.8671875, "learning_rate": 7.631543188431552e-05, "loss": 1.6223, "num_input_tokens_seen": 1807220736, "step": 1532 }, { "epoch": 0.06287601745900673, "grad_norm": 0.86328125, "learning_rate": 7.630888821850457e-05, "loss": 1.5815, "num_input_tokens_seen": 1808400384, "step": 1533 }, { "epoch": 0.06299398372065589, "grad_norm": 0.8828125, "learning_rate": 7.630233902823862e-05, "loss": 1.5879, "num_input_tokens_seen": 1809580032, "step": 1534 }, { "epoch": 0.06311194998230506, "grad_norm": 0.7578125, "learning_rate": 7.629578431451415e-05, "loss": 1.6571, "num_input_tokens_seen": 1810759680, "step": 1535 }, { "epoch": 0.06322991624395423, "grad_norm": 0.78125, "learning_rate": 7.628922407832843e-05, "loss": 1.6632, "num_input_tokens_seen": 1811939328, "step": 1536 }, { "epoch": 0.0633478825056034, "grad_norm": 0.82421875, "learning_rate": 7.628265832067967e-05, "loss": 1.8045, "num_input_tokens_seen": 1813118976, "step": 1537 }, { "epoch": 0.06346584876725257, "grad_norm": 0.828125, "learning_rate": 7.627608704256681e-05, "loss": 1.6111, "num_input_tokens_seen": 1814298624, "step": 1538 }, { "epoch": 0.06358381502890173, "grad_norm": 1.0234375, "learning_rate": 7.62695102449897e-05, "loss": 1.7442, "num_input_tokens_seen": 1815478272, "step": 1539 }, { "epoch": 0.0637017812905509, "grad_norm": 0.8203125, "learning_rate": 7.626292792894902e-05, "loss": 1.7246, "num_input_tokens_seen": 1816657920, "step": 1540 }, { "epoch": 0.06381974755220007, "grad_norm": 0.7890625, "learning_rate": 7.625634009544627e-05, "loss": 1.7061, "num_input_tokens_seen": 1817837568, "step": 1541 }, { "epoch": 0.06393771381384925, "grad_norm": 1.03125, "learning_rate": 7.62497467454838e-05, "loss": 1.5667, "num_input_tokens_seen": 1819017216, "step": 1542 }, { "epoch": 0.0640556800754984, "grad_norm": 0.82421875, "learning_rate": 7.62431478800648e-05, "loss": 1.6419, "num_input_tokens_seen": 1820196864, "step": 1543 }, { "epoch": 0.06417364633714757, "grad_norm": 0.8984375, "learning_rate": 7.62365435001933e-05, "loss": 1.6755, "num_input_tokens_seen": 1821376512, "step": 1544 }, { "epoch": 0.06429161259879675, "grad_norm": 0.84765625, "learning_rate": 7.622993360687416e-05, "loss": 1.6076, "num_input_tokens_seen": 1822556160, "step": 1545 }, { "epoch": 0.06440957886044592, "grad_norm": 0.84765625, "learning_rate": 7.622331820111309e-05, "loss": 1.7897, "num_input_tokens_seen": 1823735808, "step": 1546 }, { "epoch": 0.06452754512209508, "grad_norm": 0.78125, "learning_rate": 7.621669728391663e-05, "loss": 1.7062, "num_input_tokens_seen": 1824915456, "step": 1547 }, { "epoch": 0.06464551138374425, "grad_norm": 0.95703125, "learning_rate": 7.621007085629218e-05, "loss": 1.8787, "num_input_tokens_seen": 1826095104, "step": 1548 }, { "epoch": 0.06476347764539342, "grad_norm": 0.90625, "learning_rate": 7.620343891924794e-05, "loss": 1.6609, "num_input_tokens_seen": 1827274752, "step": 1549 }, { "epoch": 0.06488144390704259, "grad_norm": 0.8671875, "learning_rate": 7.619680147379297e-05, "loss": 1.6619, "num_input_tokens_seen": 1828454400, "step": 1550 }, { "epoch": 0.06499941016869175, "grad_norm": 0.94140625, "learning_rate": 7.61901585209372e-05, "loss": 1.5511, "num_input_tokens_seen": 1829634048, "step": 1551 }, { "epoch": 0.06511737643034092, "grad_norm": 0.82421875, "learning_rate": 7.618351006169133e-05, "loss": 1.6867, "num_input_tokens_seen": 1830813696, "step": 1552 }, { "epoch": 0.06523534269199009, "grad_norm": 0.8125, "learning_rate": 7.617685609706693e-05, "loss": 1.6889, "num_input_tokens_seen": 1831993344, "step": 1553 }, { "epoch": 0.06535330895363926, "grad_norm": 0.84765625, "learning_rate": 7.617019662807645e-05, "loss": 1.6725, "num_input_tokens_seen": 1833172992, "step": 1554 }, { "epoch": 0.06547127521528842, "grad_norm": 0.921875, "learning_rate": 7.61635316557331e-05, "loss": 1.6894, "num_input_tokens_seen": 1834352640, "step": 1555 }, { "epoch": 0.0655892414769376, "grad_norm": 0.734375, "learning_rate": 7.6156861181051e-05, "loss": 1.596, "num_input_tokens_seen": 1835532288, "step": 1556 }, { "epoch": 0.06570720773858676, "grad_norm": 0.9140625, "learning_rate": 7.615018520504503e-05, "loss": 1.6515, "num_input_tokens_seen": 1836711936, "step": 1557 }, { "epoch": 0.06582517400023594, "grad_norm": 0.98046875, "learning_rate": 7.6143503728731e-05, "loss": 1.642, "num_input_tokens_seen": 1837891584, "step": 1558 }, { "epoch": 0.0659431402618851, "grad_norm": 0.88671875, "learning_rate": 7.613681675312548e-05, "loss": 1.7575, "num_input_tokens_seen": 1839071232, "step": 1559 }, { "epoch": 0.06606110652353427, "grad_norm": 1.0, "learning_rate": 7.61301242792459e-05, "loss": 1.571, "num_input_tokens_seen": 1840250880, "step": 1560 }, { "epoch": 0.06617907278518344, "grad_norm": 0.875, "learning_rate": 7.612342630811053e-05, "loss": 1.495, "num_input_tokens_seen": 1841430528, "step": 1561 }, { "epoch": 0.06629703904683261, "grad_norm": 1.0546875, "learning_rate": 7.61167228407385e-05, "loss": 1.5052, "num_input_tokens_seen": 1842610176, "step": 1562 }, { "epoch": 0.06641500530848178, "grad_norm": 0.7734375, "learning_rate": 7.611001387814973e-05, "loss": 1.6763, "num_input_tokens_seen": 1843789824, "step": 1563 }, { "epoch": 0.06653297157013094, "grad_norm": 0.921875, "learning_rate": 7.610329942136501e-05, "loss": 1.683, "num_input_tokens_seen": 1844969472, "step": 1564 }, { "epoch": 0.06665093783178011, "grad_norm": 0.8359375, "learning_rate": 7.609657947140594e-05, "loss": 1.5549, "num_input_tokens_seen": 1846149120, "step": 1565 }, { "epoch": 0.06676890409342928, "grad_norm": 0.890625, "learning_rate": 7.608985402929499e-05, "loss": 1.556, "num_input_tokens_seen": 1847328768, "step": 1566 }, { "epoch": 0.06688687035507845, "grad_norm": 0.82421875, "learning_rate": 7.608312309605545e-05, "loss": 1.6592, "num_input_tokens_seen": 1848508416, "step": 1567 }, { "epoch": 0.06700483661672761, "grad_norm": 0.75390625, "learning_rate": 7.607638667271142e-05, "loss": 1.7267, "num_input_tokens_seen": 1849688064, "step": 1568 }, { "epoch": 0.06712280287837678, "grad_norm": 0.77734375, "learning_rate": 7.606964476028788e-05, "loss": 1.709, "num_input_tokens_seen": 1850867712, "step": 1569 }, { "epoch": 0.06724076914002595, "grad_norm": 0.80859375, "learning_rate": 7.60628973598106e-05, "loss": 1.5965, "num_input_tokens_seen": 1852047360, "step": 1570 }, { "epoch": 0.06735873540167513, "grad_norm": 0.71875, "learning_rate": 7.605614447230622e-05, "loss": 1.6316, "num_input_tokens_seen": 1853227008, "step": 1571 }, { "epoch": 0.06747670166332428, "grad_norm": 0.828125, "learning_rate": 7.604938609880221e-05, "loss": 1.7218, "num_input_tokens_seen": 1854406656, "step": 1572 }, { "epoch": 0.06759466792497346, "grad_norm": 0.7890625, "learning_rate": 7.604262224032687e-05, "loss": 1.6131, "num_input_tokens_seen": 1855586304, "step": 1573 }, { "epoch": 0.06771263418662263, "grad_norm": 0.90625, "learning_rate": 7.603585289790931e-05, "loss": 1.5626, "num_input_tokens_seen": 1856765952, "step": 1574 }, { "epoch": 0.0678306004482718, "grad_norm": 0.79296875, "learning_rate": 7.60290780725795e-05, "loss": 1.6016, "num_input_tokens_seen": 1857945600, "step": 1575 }, { "epoch": 0.06794856670992096, "grad_norm": 0.72265625, "learning_rate": 7.602229776536826e-05, "loss": 1.6785, "num_input_tokens_seen": 1859125248, "step": 1576 }, { "epoch": 0.06806653297157013, "grad_norm": 0.765625, "learning_rate": 7.60155119773072e-05, "loss": 1.824, "num_input_tokens_seen": 1860304896, "step": 1577 }, { "epoch": 0.0681844992332193, "grad_norm": 0.796875, "learning_rate": 7.600872070942882e-05, "loss": 1.6268, "num_input_tokens_seen": 1861484544, "step": 1578 }, { "epoch": 0.06830246549486847, "grad_norm": 0.78515625, "learning_rate": 7.600192396276638e-05, "loss": 1.6131, "num_input_tokens_seen": 1862664192, "step": 1579 }, { "epoch": 0.06842043175651763, "grad_norm": 0.7734375, "learning_rate": 7.599512173835406e-05, "loss": 1.6428, "num_input_tokens_seen": 1863843840, "step": 1580 }, { "epoch": 0.0685383980181668, "grad_norm": 0.86328125, "learning_rate": 7.598831403722681e-05, "loss": 1.6124, "num_input_tokens_seen": 1865023488, "step": 1581 }, { "epoch": 0.06865636427981597, "grad_norm": 0.70703125, "learning_rate": 7.598150086042042e-05, "loss": 1.6437, "num_input_tokens_seen": 1866203136, "step": 1582 }, { "epoch": 0.06877433054146515, "grad_norm": 0.8671875, "learning_rate": 7.597468220897154e-05, "loss": 1.4808, "num_input_tokens_seen": 1867382784, "step": 1583 }, { "epoch": 0.0688922968031143, "grad_norm": 0.83984375, "learning_rate": 7.596785808391764e-05, "loss": 1.6543, "num_input_tokens_seen": 1868562432, "step": 1584 }, { "epoch": 0.06901026306476347, "grad_norm": 0.8515625, "learning_rate": 7.596102848629701e-05, "loss": 1.5045, "num_input_tokens_seen": 1869742080, "step": 1585 }, { "epoch": 0.06912822932641265, "grad_norm": 0.78125, "learning_rate": 7.595419341714879e-05, "loss": 1.6585, "num_input_tokens_seen": 1870921728, "step": 1586 }, { "epoch": 0.06924619558806182, "grad_norm": 0.80078125, "learning_rate": 7.594735287751293e-05, "loss": 1.5746, "num_input_tokens_seen": 1872101376, "step": 1587 }, { "epoch": 0.06936416184971098, "grad_norm": 0.78125, "learning_rate": 7.594050686843028e-05, "loss": 1.5959, "num_input_tokens_seen": 1873281024, "step": 1588 }, { "epoch": 0.06948212811136015, "grad_norm": 0.8671875, "learning_rate": 7.593365539094242e-05, "loss": 1.4867, "num_input_tokens_seen": 1874460672, "step": 1589 }, { "epoch": 0.06960009437300932, "grad_norm": 0.875, "learning_rate": 7.592679844609184e-05, "loss": 1.5991, "num_input_tokens_seen": 1875640320, "step": 1590 }, { "epoch": 0.06971806063465849, "grad_norm": 0.7109375, "learning_rate": 7.591993603492182e-05, "loss": 1.7018, "num_input_tokens_seen": 1876819968, "step": 1591 }, { "epoch": 0.06983602689630766, "grad_norm": 0.77734375, "learning_rate": 7.591306815847649e-05, "loss": 1.5462, "num_input_tokens_seen": 1877999616, "step": 1592 }, { "epoch": 0.06995399315795682, "grad_norm": 0.84765625, "learning_rate": 7.590619481780081e-05, "loss": 1.4772, "num_input_tokens_seen": 1879179264, "step": 1593 }, { "epoch": 0.07007195941960599, "grad_norm": 0.765625, "learning_rate": 7.589931601394057e-05, "loss": 1.622, "num_input_tokens_seen": 1880358912, "step": 1594 }, { "epoch": 0.07018992568125516, "grad_norm": 0.8984375, "learning_rate": 7.58924317479424e-05, "loss": 1.7494, "num_input_tokens_seen": 1881538560, "step": 1595 }, { "epoch": 0.07030789194290434, "grad_norm": 0.828125, "learning_rate": 7.588554202085375e-05, "loss": 1.6065, "num_input_tokens_seen": 1882718208, "step": 1596 }, { "epoch": 0.0704258582045535, "grad_norm": 0.828125, "learning_rate": 7.587864683372288e-05, "loss": 1.6051, "num_input_tokens_seen": 1883897856, "step": 1597 }, { "epoch": 0.07054382446620266, "grad_norm": 0.828125, "learning_rate": 7.587174618759893e-05, "loss": 1.4877, "num_input_tokens_seen": 1885077504, "step": 1598 }, { "epoch": 0.07066179072785184, "grad_norm": 0.8125, "learning_rate": 7.586484008353182e-05, "loss": 1.519, "num_input_tokens_seen": 1886257152, "step": 1599 }, { "epoch": 0.07077975698950101, "grad_norm": 0.79296875, "learning_rate": 7.585792852257236e-05, "loss": 1.5906, "num_input_tokens_seen": 1887436800, "step": 1600 }, { "epoch": 0.07077975698950101, "eval_wikipedia_loss": 2.2380566596984863, "eval_wikipedia_runtime": 161.8513, "eval_wikipedia_samples_per_second": 4.337, "eval_wikipedia_steps_per_second": 0.185, "num_input_tokens_seen": 1887436800, "step": 1600 }, { "epoch": 0.07077975698950101, "eval_toxicity_loss": 3.9978690147399902, "eval_toxicity_runtime": 0.9108, "eval_toxicity_samples_per_second": 2.196, "eval_toxicity_steps_per_second": 1.098, "num_input_tokens_seen": 1887436800, "step": 1600 }, { "epoch": 0.00011796626164916834, "grad_norm": 0.69921875, "learning_rate": 7.585101150577211e-05, "loss": 1.6086, "num_input_tokens_seen": 1888616448, "step": 1601 }, { "epoch": 0.0002359325232983367, "grad_norm": 0.640625, "learning_rate": 7.584408903418355e-05, "loss": 1.4421, "num_input_tokens_seen": 1889796096, "step": 1602 }, { "epoch": 0.000353898784947505, "grad_norm": 0.69140625, "learning_rate": 7.583716110885992e-05, "loss": 1.3924, "num_input_tokens_seen": 1890975744, "step": 1603 }, { "epoch": 0.0004718650465966734, "grad_norm": 0.83203125, "learning_rate": 7.583022773085532e-05, "loss": 1.4947, "num_input_tokens_seen": 1892155392, "step": 1604 }, { "epoch": 0.0005898313082458417, "grad_norm": 0.79296875, "learning_rate": 7.582328890122466e-05, "loss": 1.5896, "num_input_tokens_seen": 1893335040, "step": 1605 }, { "epoch": 0.00070779756989501, "grad_norm": 0.73828125, "learning_rate": 7.581634462102373e-05, "loss": 1.4465, "num_input_tokens_seen": 1894514688, "step": 1606 }, { "epoch": 0.0008257638315441783, "grad_norm": 0.78515625, "learning_rate": 7.580939489130906e-05, "loss": 1.48, "num_input_tokens_seen": 1895694336, "step": 1607 }, { "epoch": 0.0009437300931933467, "grad_norm": 0.7109375, "learning_rate": 7.580243971313811e-05, "loss": 1.5052, "num_input_tokens_seen": 1896873984, "step": 1608 }, { "epoch": 0.001061696354842515, "grad_norm": 0.8046875, "learning_rate": 7.579547908756911e-05, "loss": 1.5373, "num_input_tokens_seen": 1898053632, "step": 1609 }, { "epoch": 0.0011796626164916834, "grad_norm": 0.828125, "learning_rate": 7.578851301566112e-05, "loss": 1.4559, "num_input_tokens_seen": 1899233280, "step": 1610 }, { "epoch": 0.0012976288781408518, "grad_norm": 1.015625, "learning_rate": 7.578154149847404e-05, "loss": 1.5937, "num_input_tokens_seen": 1900412928, "step": 1611 }, { "epoch": 0.00141559513979002, "grad_norm": 0.85546875, "learning_rate": 7.57745645370686e-05, "loss": 1.6112, "num_input_tokens_seen": 1901592576, "step": 1612 }, { "epoch": 0.0015335614014391884, "grad_norm": 0.83203125, "learning_rate": 7.576758213250638e-05, "loss": 1.4209, "num_input_tokens_seen": 1902772224, "step": 1613 }, { "epoch": 0.0016515276630883566, "grad_norm": 0.94921875, "learning_rate": 7.576059428584972e-05, "loss": 1.6171, "num_input_tokens_seen": 1903951872, "step": 1614 }, { "epoch": 0.001769493924737525, "grad_norm": 0.9453125, "learning_rate": 7.575360099816185e-05, "loss": 1.4181, "num_input_tokens_seen": 1905131520, "step": 1615 }, { "epoch": 0.0018874601863866935, "grad_norm": 1.0078125, "learning_rate": 7.574660227050681e-05, "loss": 1.529, "num_input_tokens_seen": 1906311168, "step": 1616 }, { "epoch": 0.0020054264480358617, "grad_norm": 0.91015625, "learning_rate": 7.573959810394948e-05, "loss": 1.6515, "num_input_tokens_seen": 1907490816, "step": 1617 }, { "epoch": 0.00212339270968503, "grad_norm": 1.015625, "learning_rate": 7.573258849955555e-05, "loss": 1.6023, "num_input_tokens_seen": 1908670464, "step": 1618 }, { "epoch": 0.0022413589713341986, "grad_norm": 0.890625, "learning_rate": 7.572557345839153e-05, "loss": 1.4418, "num_input_tokens_seen": 1909850112, "step": 1619 }, { "epoch": 0.0023593252329833668, "grad_norm": 1.0703125, "learning_rate": 7.571855298152477e-05, "loss": 1.5494, "num_input_tokens_seen": 1911029760, "step": 1620 }, { "epoch": 0.002477291494632535, "grad_norm": 0.84375, "learning_rate": 7.571152707002347e-05, "loss": 1.5324, "num_input_tokens_seen": 1912209408, "step": 1621 }, { "epoch": 0.0025952577562817036, "grad_norm": 1.0078125, "learning_rate": 7.57044957249566e-05, "loss": 1.5491, "num_input_tokens_seen": 1913389056, "step": 1622 }, { "epoch": 0.002713224017930872, "grad_norm": 0.81640625, "learning_rate": 7.569745894739402e-05, "loss": 1.5509, "num_input_tokens_seen": 1914568704, "step": 1623 }, { "epoch": 0.00283119027958004, "grad_norm": 0.97265625, "learning_rate": 7.569041673840637e-05, "loss": 1.5173, "num_input_tokens_seen": 1915748352, "step": 1624 }, { "epoch": 0.0029491565412292082, "grad_norm": 0.69921875, "learning_rate": 7.568336909906514e-05, "loss": 1.6629, "num_input_tokens_seen": 1916928000, "step": 1625 }, { "epoch": 0.003067122802878377, "grad_norm": 0.91015625, "learning_rate": 7.567631603044264e-05, "loss": 1.5456, "num_input_tokens_seen": 1918107648, "step": 1626 }, { "epoch": 0.003185089064527545, "grad_norm": 0.87890625, "learning_rate": 7.5669257533612e-05, "loss": 1.4868, "num_input_tokens_seen": 1919287296, "step": 1627 }, { "epoch": 0.0033030553261767133, "grad_norm": 0.8671875, "learning_rate": 7.566219360964719e-05, "loss": 1.475, "num_input_tokens_seen": 1920466944, "step": 1628 }, { "epoch": 0.003421021587825882, "grad_norm": 0.84375, "learning_rate": 7.565512425962298e-05, "loss": 1.4149, "num_input_tokens_seen": 1921646592, "step": 1629 }, { "epoch": 0.00353898784947505, "grad_norm": 0.84375, "learning_rate": 7.5648049484615e-05, "loss": 1.4973, "num_input_tokens_seen": 1922826240, "step": 1630 }, { "epoch": 0.0036569541111242184, "grad_norm": 0.859375, "learning_rate": 7.564096928569969e-05, "loss": 1.5576, "num_input_tokens_seen": 1924005888, "step": 1631 }, { "epoch": 0.003774920372773387, "grad_norm": 0.83984375, "learning_rate": 7.56338836639543e-05, "loss": 1.5025, "num_input_tokens_seen": 1925185536, "step": 1632 }, { "epoch": 0.003892886634422555, "grad_norm": 0.87109375, "learning_rate": 7.562679262045692e-05, "loss": 1.5112, "num_input_tokens_seen": 1926365184, "step": 1633 }, { "epoch": 0.004010852896071723, "grad_norm": 0.6875, "learning_rate": 7.561969615628649e-05, "loss": 1.6745, "num_input_tokens_seen": 1927544832, "step": 1634 }, { "epoch": 0.004128819157720892, "grad_norm": 0.85546875, "learning_rate": 7.56125942725227e-05, "loss": 1.5378, "num_input_tokens_seen": 1928724480, "step": 1635 }, { "epoch": 0.00424678541937006, "grad_norm": 0.7890625, "learning_rate": 7.560548697024616e-05, "loss": 1.4558, "num_input_tokens_seen": 1929904128, "step": 1636 }, { "epoch": 0.004364751681019229, "grad_norm": 0.8984375, "learning_rate": 7.559837425053822e-05, "loss": 1.5246, "num_input_tokens_seen": 1931083776, "step": 1637 }, { "epoch": 0.004482717942668397, "grad_norm": 0.7890625, "learning_rate": 7.559125611448112e-05, "loss": 1.6329, "num_input_tokens_seen": 1932263424, "step": 1638 }, { "epoch": 0.004600684204317565, "grad_norm": 0.8515625, "learning_rate": 7.558413256315788e-05, "loss": 1.6571, "num_input_tokens_seen": 1933443072, "step": 1639 }, { "epoch": 0.0047186504659667335, "grad_norm": 0.7109375, "learning_rate": 7.557700359765238e-05, "loss": 1.75, "num_input_tokens_seen": 1934622720, "step": 1640 }, { "epoch": 0.004836616727615902, "grad_norm": 0.78515625, "learning_rate": 7.556986921904927e-05, "loss": 1.7082, "num_input_tokens_seen": 1935802368, "step": 1641 }, { "epoch": 0.00495458298926507, "grad_norm": 0.75390625, "learning_rate": 7.556272942843407e-05, "loss": 1.6003, "num_input_tokens_seen": 1936982016, "step": 1642 }, { "epoch": 0.005072549250914238, "grad_norm": 0.80859375, "learning_rate": 7.555558422689312e-05, "loss": 1.6011, "num_input_tokens_seen": 1938161664, "step": 1643 }, { "epoch": 0.005190515512563407, "grad_norm": 0.7109375, "learning_rate": 7.554843361551357e-05, "loss": 1.5739, "num_input_tokens_seen": 1939341312, "step": 1644 }, { "epoch": 0.0053084817742125754, "grad_norm": 0.7890625, "learning_rate": 7.55412775953834e-05, "loss": 1.5394, "num_input_tokens_seen": 1940520960, "step": 1645 }, { "epoch": 0.005426448035861744, "grad_norm": 0.9609375, "learning_rate": 7.553411616759141e-05, "loss": 1.3772, "num_input_tokens_seen": 1941700608, "step": 1646 }, { "epoch": 0.005544414297510912, "grad_norm": 0.81640625, "learning_rate": 7.55269493332272e-05, "loss": 1.4369, "num_input_tokens_seen": 1942880256, "step": 1647 }, { "epoch": 0.00566238055916008, "grad_norm": 0.73828125, "learning_rate": 7.551977709338125e-05, "loss": 1.7093, "num_input_tokens_seen": 1944059904, "step": 1648 }, { "epoch": 0.005780346820809248, "grad_norm": 0.7734375, "learning_rate": 7.55125994491448e-05, "loss": 1.5003, "num_input_tokens_seen": 1945239552, "step": 1649 }, { "epoch": 0.0058983130824584165, "grad_norm": 0.81640625, "learning_rate": 7.550541640160996e-05, "loss": 1.6225, "num_input_tokens_seen": 1946419200, "step": 1650 }, { "epoch": 0.0060162793441075856, "grad_norm": 0.86328125, "learning_rate": 7.549822795186963e-05, "loss": 1.4609, "num_input_tokens_seen": 1947598848, "step": 1651 }, { "epoch": 0.006134245605756754, "grad_norm": 0.80859375, "learning_rate": 7.549103410101754e-05, "loss": 1.5634, "num_input_tokens_seen": 1948778496, "step": 1652 }, { "epoch": 0.006252211867405922, "grad_norm": 0.77734375, "learning_rate": 7.548383485014826e-05, "loss": 1.5238, "num_input_tokens_seen": 1949958144, "step": 1653 }, { "epoch": 0.00637017812905509, "grad_norm": 0.78125, "learning_rate": 7.547663020035717e-05, "loss": 1.6528, "num_input_tokens_seen": 1951137792, "step": 1654 }, { "epoch": 0.006488144390704258, "grad_norm": 0.765625, "learning_rate": 7.546942015274046e-05, "loss": 1.6072, "num_input_tokens_seen": 1952317440, "step": 1655 }, { "epoch": 0.006606110652353427, "grad_norm": 0.7890625, "learning_rate": 7.546220470839512e-05, "loss": 1.4707, "num_input_tokens_seen": 1953497088, "step": 1656 }, { "epoch": 0.006724076914002596, "grad_norm": 0.7109375, "learning_rate": 7.545498386841904e-05, "loss": 1.678, "num_input_tokens_seen": 1954676736, "step": 1657 }, { "epoch": 0.006842043175651764, "grad_norm": 0.92578125, "learning_rate": 7.544775763391086e-05, "loss": 1.5031, "num_input_tokens_seen": 1955856384, "step": 1658 }, { "epoch": 0.006960009437300932, "grad_norm": 0.75390625, "learning_rate": 7.544052600597009e-05, "loss": 1.4664, "num_input_tokens_seen": 1957036032, "step": 1659 }, { "epoch": 0.0070779756989501, "grad_norm": 1.0078125, "learning_rate": 7.543328898569698e-05, "loss": 1.5593, "num_input_tokens_seen": 1958215680, "step": 1660 }, { "epoch": 0.0071959419605992685, "grad_norm": 0.87109375, "learning_rate": 7.542604657419268e-05, "loss": 1.5318, "num_input_tokens_seen": 1959395328, "step": 1661 }, { "epoch": 0.007313908222248437, "grad_norm": 0.80859375, "learning_rate": 7.541879877255915e-05, "loss": 1.6062, "num_input_tokens_seen": 1960574976, "step": 1662 }, { "epoch": 0.007431874483897605, "grad_norm": 0.8125, "learning_rate": 7.541154558189914e-05, "loss": 1.5415, "num_input_tokens_seen": 1961754624, "step": 1663 }, { "epoch": 0.007549840745546774, "grad_norm": 0.80078125, "learning_rate": 7.540428700331625e-05, "loss": 1.4204, "num_input_tokens_seen": 1962934272, "step": 1664 }, { "epoch": 0.007667807007195942, "grad_norm": 0.76171875, "learning_rate": 7.539702303791486e-05, "loss": 1.5433, "num_input_tokens_seen": 1964113920, "step": 1665 }, { "epoch": 0.00778577326884511, "grad_norm": 0.80859375, "learning_rate": 7.53897536868002e-05, "loss": 1.6512, "num_input_tokens_seen": 1965293568, "step": 1666 }, { "epoch": 0.007903739530494279, "grad_norm": 0.81640625, "learning_rate": 7.538247895107835e-05, "loss": 1.5139, "num_input_tokens_seen": 1966473216, "step": 1667 }, { "epoch": 0.008021705792143447, "grad_norm": 0.74609375, "learning_rate": 7.53751988318561e-05, "loss": 1.7058, "num_input_tokens_seen": 1967652864, "step": 1668 }, { "epoch": 0.008139672053792615, "grad_norm": 0.828125, "learning_rate": 7.53679133302412e-05, "loss": 1.5792, "num_input_tokens_seen": 1968832512, "step": 1669 }, { "epoch": 0.008257638315441783, "grad_norm": 0.8046875, "learning_rate": 7.536062244734212e-05, "loss": 1.4524, "num_input_tokens_seen": 1970012160, "step": 1670 }, { "epoch": 0.008375604577090951, "grad_norm": 0.75390625, "learning_rate": 7.535332618426816e-05, "loss": 1.6209, "num_input_tokens_seen": 1971191808, "step": 1671 }, { "epoch": 0.00849357083874012, "grad_norm": 0.71484375, "learning_rate": 7.53460245421295e-05, "loss": 1.5364, "num_input_tokens_seen": 1972371456, "step": 1672 }, { "epoch": 0.008611537100389288, "grad_norm": 0.734375, "learning_rate": 7.533871752203708e-05, "loss": 1.6611, "num_input_tokens_seen": 1973551104, "step": 1673 }, { "epoch": 0.008729503362038458, "grad_norm": 0.82421875, "learning_rate": 7.533140512510267e-05, "loss": 1.5247, "num_input_tokens_seen": 1974730752, "step": 1674 }, { "epoch": 0.008847469623687626, "grad_norm": 0.703125, "learning_rate": 7.532408735243887e-05, "loss": 1.6105, "num_input_tokens_seen": 1975910400, "step": 1675 }, { "epoch": 0.008965435885336794, "grad_norm": 0.71484375, "learning_rate": 7.531676420515908e-05, "loss": 1.621, "num_input_tokens_seen": 1977090048, "step": 1676 }, { "epoch": 0.009083402146985962, "grad_norm": 0.71484375, "learning_rate": 7.530943568437753e-05, "loss": 1.5137, "num_input_tokens_seen": 1978269696, "step": 1677 }, { "epoch": 0.00920136840863513, "grad_norm": 0.7265625, "learning_rate": 7.530210179120927e-05, "loss": 1.4861, "num_input_tokens_seen": 1979449344, "step": 1678 }, { "epoch": 0.009319334670284299, "grad_norm": 0.74609375, "learning_rate": 7.529476252677016e-05, "loss": 1.6147, "num_input_tokens_seen": 1980628992, "step": 1679 }, { "epoch": 0.009437300931933467, "grad_norm": 0.8046875, "learning_rate": 7.528741789217692e-05, "loss": 1.506, "num_input_tokens_seen": 1981808640, "step": 1680 }, { "epoch": 0.009555267193582635, "grad_norm": 0.73046875, "learning_rate": 7.528006788854697e-05, "loss": 1.5949, "num_input_tokens_seen": 1982988288, "step": 1681 }, { "epoch": 0.009673233455231803, "grad_norm": 0.79296875, "learning_rate": 7.527271251699867e-05, "loss": 1.5086, "num_input_tokens_seen": 1984167936, "step": 1682 }, { "epoch": 0.009791199716880972, "grad_norm": 0.79296875, "learning_rate": 7.526535177865118e-05, "loss": 1.5301, "num_input_tokens_seen": 1985347584, "step": 1683 }, { "epoch": 0.00990916597853014, "grad_norm": 0.76953125, "learning_rate": 7.525798567462439e-05, "loss": 1.6647, "num_input_tokens_seen": 1986527232, "step": 1684 }, { "epoch": 0.010027132240179308, "grad_norm": 0.77734375, "learning_rate": 7.525061420603911e-05, "loss": 1.6048, "num_input_tokens_seen": 1987706880, "step": 1685 }, { "epoch": 0.010145098501828476, "grad_norm": 0.87109375, "learning_rate": 7.524323737401688e-05, "loss": 1.7124, "num_input_tokens_seen": 1988886528, "step": 1686 }, { "epoch": 0.010263064763477646, "grad_norm": 0.765625, "learning_rate": 7.523585517968013e-05, "loss": 1.6371, "num_input_tokens_seen": 1990066176, "step": 1687 }, { "epoch": 0.010381031025126814, "grad_norm": 0.79296875, "learning_rate": 7.522846762415207e-05, "loss": 1.5074, "num_input_tokens_seen": 1991245824, "step": 1688 }, { "epoch": 0.010498997286775983, "grad_norm": 0.734375, "learning_rate": 7.52210747085567e-05, "loss": 1.5505, "num_input_tokens_seen": 1992425472, "step": 1689 }, { "epoch": 0.010616963548425151, "grad_norm": 0.81640625, "learning_rate": 7.521367643401889e-05, "loss": 1.5206, "num_input_tokens_seen": 1993605120, "step": 1690 }, { "epoch": 0.010734929810074319, "grad_norm": 0.66796875, "learning_rate": 7.52062728016643e-05, "loss": 1.6137, "num_input_tokens_seen": 1994784768, "step": 1691 }, { "epoch": 0.010852896071723487, "grad_norm": 0.765625, "learning_rate": 7.519886381261938e-05, "loss": 1.5537, "num_input_tokens_seen": 1995964416, "step": 1692 }, { "epoch": 0.010970862333372655, "grad_norm": 0.74609375, "learning_rate": 7.519144946801145e-05, "loss": 1.5487, "num_input_tokens_seen": 1997144064, "step": 1693 }, { "epoch": 0.011088828595021824, "grad_norm": 0.80078125, "learning_rate": 7.518402976896861e-05, "loss": 1.5808, "num_input_tokens_seen": 1998323712, "step": 1694 }, { "epoch": 0.011206794856670992, "grad_norm": 0.7890625, "learning_rate": 7.517660471661976e-05, "loss": 1.5318, "num_input_tokens_seen": 1999503360, "step": 1695 }, { "epoch": 0.01132476111832016, "grad_norm": 0.703125, "learning_rate": 7.516917431209462e-05, "loss": 1.6044, "num_input_tokens_seen": 2000683008, "step": 1696 }, { "epoch": 0.011442727379969328, "grad_norm": 0.72265625, "learning_rate": 7.51617385565238e-05, "loss": 1.6691, "num_input_tokens_seen": 2001862656, "step": 1697 }, { "epoch": 0.011560693641618497, "grad_norm": 0.8359375, "learning_rate": 7.51542974510386e-05, "loss": 1.4109, "num_input_tokens_seen": 2003042304, "step": 1698 }, { "epoch": 0.011678659903267665, "grad_norm": 0.8359375, "learning_rate": 7.514685099677122e-05, "loss": 1.5029, "num_input_tokens_seen": 2004221952, "step": 1699 }, { "epoch": 0.011796626164916833, "grad_norm": 0.85546875, "learning_rate": 7.513939919485466e-05, "loss": 1.5524, "num_input_tokens_seen": 2005401600, "step": 1700 }, { "epoch": 0.011914592426566003, "grad_norm": 0.82421875, "learning_rate": 7.51319420464227e-05, "loss": 1.4823, "num_input_tokens_seen": 2006581248, "step": 1701 }, { "epoch": 0.012032558688215171, "grad_norm": 0.75390625, "learning_rate": 7.512447955260998e-05, "loss": 1.66, "num_input_tokens_seen": 2007760896, "step": 1702 }, { "epoch": 0.01215052494986434, "grad_norm": 0.80078125, "learning_rate": 7.51170117145519e-05, "loss": 1.6474, "num_input_tokens_seen": 2008940544, "step": 1703 }, { "epoch": 0.012268491211513508, "grad_norm": 0.6875, "learning_rate": 7.510953853338474e-05, "loss": 1.6331, "num_input_tokens_seen": 2010120192, "step": 1704 }, { "epoch": 0.012386457473162676, "grad_norm": 0.79296875, "learning_rate": 7.510206001024554e-05, "loss": 1.7039, "num_input_tokens_seen": 2011299840, "step": 1705 }, { "epoch": 0.012504423734811844, "grad_norm": 0.765625, "learning_rate": 7.509457614627217e-05, "loss": 1.5544, "num_input_tokens_seen": 2012479488, "step": 1706 }, { "epoch": 0.012622389996461012, "grad_norm": 0.765625, "learning_rate": 7.50870869426033e-05, "loss": 1.5711, "num_input_tokens_seen": 2013659136, "step": 1707 }, { "epoch": 0.01274035625811018, "grad_norm": 0.77734375, "learning_rate": 7.507959240037844e-05, "loss": 1.5601, "num_input_tokens_seen": 2014838784, "step": 1708 }, { "epoch": 0.012858322519759349, "grad_norm": 0.70703125, "learning_rate": 7.50720925207379e-05, "loss": 1.7537, "num_input_tokens_seen": 2016018432, "step": 1709 }, { "epoch": 0.012976288781408517, "grad_norm": 0.7421875, "learning_rate": 7.506458730482277e-05, "loss": 1.5504, "num_input_tokens_seen": 2017198080, "step": 1710 }, { "epoch": 0.013094255043057685, "grad_norm": 0.71484375, "learning_rate": 7.505707675377502e-05, "loss": 1.6148, "num_input_tokens_seen": 2018377728, "step": 1711 }, { "epoch": 0.013212221304706853, "grad_norm": 0.71484375, "learning_rate": 7.504956086873735e-05, "loss": 1.541, "num_input_tokens_seen": 2019557376, "step": 1712 }, { "epoch": 0.013330187566356021, "grad_norm": 0.765625, "learning_rate": 7.504203965085335e-05, "loss": 1.6243, "num_input_tokens_seen": 2020737024, "step": 1713 }, { "epoch": 0.013448153828005191, "grad_norm": 0.796875, "learning_rate": 7.503451310126738e-05, "loss": 1.4932, "num_input_tokens_seen": 2021916672, "step": 1714 }, { "epoch": 0.01356612008965436, "grad_norm": 0.765625, "learning_rate": 7.50269812211246e-05, "loss": 1.7139, "num_input_tokens_seen": 2023096320, "step": 1715 }, { "epoch": 0.013684086351303528, "grad_norm": 0.8515625, "learning_rate": 7.5019444011571e-05, "loss": 1.5194, "num_input_tokens_seen": 2024275968, "step": 1716 }, { "epoch": 0.013802052612952696, "grad_norm": 0.76171875, "learning_rate": 7.501190147375338e-05, "loss": 1.6824, "num_input_tokens_seen": 2025455616, "step": 1717 }, { "epoch": 0.013920018874601864, "grad_norm": 0.73046875, "learning_rate": 7.500435360881937e-05, "loss": 1.6044, "num_input_tokens_seen": 2026635264, "step": 1718 }, { "epoch": 0.014037985136251032, "grad_norm": 0.66796875, "learning_rate": 7.499680041791737e-05, "loss": 1.5265, "num_input_tokens_seen": 2027814912, "step": 1719 }, { "epoch": 0.0141559513979002, "grad_norm": 0.82421875, "learning_rate": 7.49892419021966e-05, "loss": 1.5371, "num_input_tokens_seen": 2028994560, "step": 1720 }, { "epoch": 0.014273917659549369, "grad_norm": 0.7109375, "learning_rate": 7.498167806280712e-05, "loss": 1.5835, "num_input_tokens_seen": 2030174208, "step": 1721 }, { "epoch": 0.014391883921198537, "grad_norm": 0.7109375, "learning_rate": 7.497410890089976e-05, "loss": 1.5378, "num_input_tokens_seen": 2031353856, "step": 1722 }, { "epoch": 0.014509850182847705, "grad_norm": 0.69140625, "learning_rate": 7.496653441762621e-05, "loss": 1.6016, "num_input_tokens_seen": 2032533504, "step": 1723 }, { "epoch": 0.014627816444496873, "grad_norm": 0.7421875, "learning_rate": 7.495895461413891e-05, "loss": 1.5343, "num_input_tokens_seen": 2033713152, "step": 1724 }, { "epoch": 0.014745782706146042, "grad_norm": 0.71484375, "learning_rate": 7.495136949159117e-05, "loss": 1.6498, "num_input_tokens_seen": 2034892800, "step": 1725 }, { "epoch": 0.01486374896779521, "grad_norm": 0.80859375, "learning_rate": 7.494377905113704e-05, "loss": 1.4332, "num_input_tokens_seen": 2036072448, "step": 1726 }, { "epoch": 0.014981715229444378, "grad_norm": 0.66796875, "learning_rate": 7.493618329393145e-05, "loss": 1.6878, "num_input_tokens_seen": 2037252096, "step": 1727 }, { "epoch": 0.015099681491093548, "grad_norm": 0.69921875, "learning_rate": 7.49285822211301e-05, "loss": 1.557, "num_input_tokens_seen": 2038431744, "step": 1728 }, { "epoch": 0.015217647752742716, "grad_norm": 0.7109375, "learning_rate": 7.492097583388949e-05, "loss": 1.6027, "num_input_tokens_seen": 2039611392, "step": 1729 }, { "epoch": 0.015335614014391884, "grad_norm": 0.73828125, "learning_rate": 7.491336413336695e-05, "loss": 1.5635, "num_input_tokens_seen": 2040791040, "step": 1730 }, { "epoch": 0.015453580276041053, "grad_norm": 0.7890625, "learning_rate": 7.490574712072063e-05, "loss": 1.5244, "num_input_tokens_seen": 2041970688, "step": 1731 }, { "epoch": 0.01557154653769022, "grad_norm": 0.75, "learning_rate": 7.489812479710944e-05, "loss": 1.4318, "num_input_tokens_seen": 2043150336, "step": 1732 }, { "epoch": 0.01568951279933939, "grad_norm": 0.7265625, "learning_rate": 7.489049716369316e-05, "loss": 1.5501, "num_input_tokens_seen": 2044329984, "step": 1733 }, { "epoch": 0.015807479060988557, "grad_norm": 0.62890625, "learning_rate": 7.488286422163232e-05, "loss": 1.6179, "num_input_tokens_seen": 2045509632, "step": 1734 }, { "epoch": 0.015925445322637725, "grad_norm": 0.6796875, "learning_rate": 7.48752259720883e-05, "loss": 1.5711, "num_input_tokens_seen": 2046689280, "step": 1735 }, { "epoch": 0.016043411584286894, "grad_norm": 0.71875, "learning_rate": 7.486758241622327e-05, "loss": 1.5267, "num_input_tokens_seen": 2047868928, "step": 1736 }, { "epoch": 0.016161377845936062, "grad_norm": 0.72265625, "learning_rate": 7.48599335552002e-05, "loss": 1.622, "num_input_tokens_seen": 2049048576, "step": 1737 }, { "epoch": 0.01627934410758523, "grad_norm": 0.98828125, "learning_rate": 7.485227939018287e-05, "loss": 1.5862, "num_input_tokens_seen": 2050228224, "step": 1738 }, { "epoch": 0.0163973103692344, "grad_norm": 0.78125, "learning_rate": 7.484461992233592e-05, "loss": 1.5703, "num_input_tokens_seen": 2051407872, "step": 1739 }, { "epoch": 0.016515276630883566, "grad_norm": 0.80078125, "learning_rate": 7.48369551528247e-05, "loss": 1.6746, "num_input_tokens_seen": 2052587520, "step": 1740 }, { "epoch": 0.016633242892532735, "grad_norm": 0.85546875, "learning_rate": 7.482928508281543e-05, "loss": 1.4853, "num_input_tokens_seen": 2053767168, "step": 1741 }, { "epoch": 0.016751209154181903, "grad_norm": 0.7265625, "learning_rate": 7.482160971347514e-05, "loss": 1.5609, "num_input_tokens_seen": 2054946816, "step": 1742 }, { "epoch": 0.01686917541583107, "grad_norm": 0.796875, "learning_rate": 7.481392904597161e-05, "loss": 1.4062, "num_input_tokens_seen": 2056126464, "step": 1743 }, { "epoch": 0.01698714167748024, "grad_norm": 0.6796875, "learning_rate": 7.480624308147352e-05, "loss": 1.6638, "num_input_tokens_seen": 2057306112, "step": 1744 }, { "epoch": 0.017105107939129408, "grad_norm": 0.84375, "learning_rate": 7.479855182115027e-05, "loss": 1.4458, "num_input_tokens_seen": 2058485760, "step": 1745 }, { "epoch": 0.017223074200778576, "grad_norm": 0.65625, "learning_rate": 7.479085526617209e-05, "loss": 1.7725, "num_input_tokens_seen": 2059665408, "step": 1746 }, { "epoch": 0.017341040462427744, "grad_norm": 0.80078125, "learning_rate": 7.478315341771003e-05, "loss": 1.5117, "num_input_tokens_seen": 2060845056, "step": 1747 }, { "epoch": 0.017459006724076916, "grad_norm": 0.68359375, "learning_rate": 7.477544627693595e-05, "loss": 1.5183, "num_input_tokens_seen": 2062024704, "step": 1748 }, { "epoch": 0.017576972985726084, "grad_norm": 0.828125, "learning_rate": 7.476773384502251e-05, "loss": 1.5633, "num_input_tokens_seen": 2063204352, "step": 1749 }, { "epoch": 0.017694939247375252, "grad_norm": 0.66796875, "learning_rate": 7.476001612314314e-05, "loss": 1.5814, "num_input_tokens_seen": 2064384000, "step": 1750 }, { "epoch": 0.01781290550902442, "grad_norm": 0.80859375, "learning_rate": 7.475229311247212e-05, "loss": 1.5078, "num_input_tokens_seen": 2065563648, "step": 1751 }, { "epoch": 0.01793087177067359, "grad_norm": 0.7421875, "learning_rate": 7.474456481418452e-05, "loss": 1.5594, "num_input_tokens_seen": 2066743296, "step": 1752 }, { "epoch": 0.018048838032322757, "grad_norm": 0.83203125, "learning_rate": 7.473683122945622e-05, "loss": 1.5369, "num_input_tokens_seen": 2067922944, "step": 1753 }, { "epoch": 0.018166804293971925, "grad_norm": 0.69921875, "learning_rate": 7.472909235946388e-05, "loss": 1.5476, "num_input_tokens_seen": 2069102592, "step": 1754 }, { "epoch": 0.018284770555621093, "grad_norm": 0.71875, "learning_rate": 7.472134820538498e-05, "loss": 1.525, "num_input_tokens_seen": 2070282240, "step": 1755 }, { "epoch": 0.01840273681727026, "grad_norm": 0.75, "learning_rate": 7.471359876839782e-05, "loss": 1.6491, "num_input_tokens_seen": 2071461888, "step": 1756 }, { "epoch": 0.01852070307891943, "grad_norm": 0.7109375, "learning_rate": 7.470584404968149e-05, "loss": 1.5732, "num_input_tokens_seen": 2072641536, "step": 1757 }, { "epoch": 0.018638669340568598, "grad_norm": 0.71875, "learning_rate": 7.469808405041587e-05, "loss": 1.5811, "num_input_tokens_seen": 2073821184, "step": 1758 }, { "epoch": 0.018756635602217766, "grad_norm": 0.65625, "learning_rate": 7.469031877178167e-05, "loss": 1.6869, "num_input_tokens_seen": 2075000832, "step": 1759 }, { "epoch": 0.018874601863866934, "grad_norm": 0.64453125, "learning_rate": 7.468254821496037e-05, "loss": 1.7133, "num_input_tokens_seen": 2076180480, "step": 1760 }, { "epoch": 0.018992568125516102, "grad_norm": 0.6953125, "learning_rate": 7.46747723811343e-05, "loss": 1.5979, "num_input_tokens_seen": 2077360128, "step": 1761 }, { "epoch": 0.01911053438716527, "grad_norm": 0.64453125, "learning_rate": 7.466699127148653e-05, "loss": 1.6148, "num_input_tokens_seen": 2078539776, "step": 1762 }, { "epoch": 0.01922850064881444, "grad_norm": 0.67578125, "learning_rate": 7.465920488720099e-05, "loss": 1.5162, "num_input_tokens_seen": 2079719424, "step": 1763 }, { "epoch": 0.019346466910463607, "grad_norm": 0.7265625, "learning_rate": 7.465141322946241e-05, "loss": 1.4266, "num_input_tokens_seen": 2080899072, "step": 1764 }, { "epoch": 0.019464433172112775, "grad_norm": 0.703125, "learning_rate": 7.464361629945625e-05, "loss": 1.6248, "num_input_tokens_seen": 2082078720, "step": 1765 }, { "epoch": 0.019582399433761943, "grad_norm": 0.70703125, "learning_rate": 7.463581409836888e-05, "loss": 1.4722, "num_input_tokens_seen": 2083258368, "step": 1766 }, { "epoch": 0.01970036569541111, "grad_norm": 0.73046875, "learning_rate": 7.462800662738739e-05, "loss": 1.5744, "num_input_tokens_seen": 2084438016, "step": 1767 }, { "epoch": 0.01981833195706028, "grad_norm": 0.703125, "learning_rate": 7.46201938876997e-05, "loss": 1.5756, "num_input_tokens_seen": 2085617664, "step": 1768 }, { "epoch": 0.019936298218709448, "grad_norm": 0.68359375, "learning_rate": 7.461237588049453e-05, "loss": 1.4826, "num_input_tokens_seen": 2086797312, "step": 1769 }, { "epoch": 0.020054264480358616, "grad_norm": 0.74609375, "learning_rate": 7.460455260696142e-05, "loss": 1.7039, "num_input_tokens_seen": 2087976960, "step": 1770 }, { "epoch": 0.020172230742007784, "grad_norm": 0.75390625, "learning_rate": 7.459672406829068e-05, "loss": 1.4206, "num_input_tokens_seen": 2089156608, "step": 1771 }, { "epoch": 0.020290197003656953, "grad_norm": 0.71875, "learning_rate": 7.458889026567342e-05, "loss": 1.6331, "num_input_tokens_seen": 2090336256, "step": 1772 }, { "epoch": 0.02040816326530612, "grad_norm": 0.7421875, "learning_rate": 7.458105120030159e-05, "loss": 1.5279, "num_input_tokens_seen": 2091515904, "step": 1773 }, { "epoch": 0.020526129526955292, "grad_norm": 0.6640625, "learning_rate": 7.457320687336792e-05, "loss": 1.6853, "num_input_tokens_seen": 2092695552, "step": 1774 }, { "epoch": 0.02064409578860446, "grad_norm": 0.80859375, "learning_rate": 7.45653572860659e-05, "loss": 1.5579, "num_input_tokens_seen": 2093875200, "step": 1775 }, { "epoch": 0.02076206205025363, "grad_norm": 0.6953125, "learning_rate": 7.455750243958989e-05, "loss": 1.6844, "num_input_tokens_seen": 2095054848, "step": 1776 }, { "epoch": 0.020880028311902797, "grad_norm": 0.76953125, "learning_rate": 7.4549642335135e-05, "loss": 1.6191, "num_input_tokens_seen": 2096234496, "step": 1777 }, { "epoch": 0.020997994573551965, "grad_norm": 0.83984375, "learning_rate": 7.454177697389717e-05, "loss": 1.5507, "num_input_tokens_seen": 2097414144, "step": 1778 }, { "epoch": 0.021115960835201134, "grad_norm": 0.82421875, "learning_rate": 7.453390635707312e-05, "loss": 1.4706, "num_input_tokens_seen": 2098593792, "step": 1779 }, { "epoch": 0.021233927096850302, "grad_norm": 0.6484375, "learning_rate": 7.452603048586039e-05, "loss": 1.5721, "num_input_tokens_seen": 2099773440, "step": 1780 }, { "epoch": 0.02135189335849947, "grad_norm": 0.8671875, "learning_rate": 7.451814936145728e-05, "loss": 1.344, "num_input_tokens_seen": 2100953088, "step": 1781 }, { "epoch": 0.021469859620148638, "grad_norm": 0.6953125, "learning_rate": 7.451026298506294e-05, "loss": 1.5718, "num_input_tokens_seen": 2102132736, "step": 1782 }, { "epoch": 0.021587825881797806, "grad_norm": 0.79296875, "learning_rate": 7.450237135787728e-05, "loss": 1.5887, "num_input_tokens_seen": 2103312384, "step": 1783 }, { "epoch": 0.021705792143446975, "grad_norm": 0.77734375, "learning_rate": 7.449447448110104e-05, "loss": 1.5317, "num_input_tokens_seen": 2104492032, "step": 1784 }, { "epoch": 0.021823758405096143, "grad_norm": 0.78515625, "learning_rate": 7.448657235593572e-05, "loss": 1.645, "num_input_tokens_seen": 2105671680, "step": 1785 }, { "epoch": 0.02194172466674531, "grad_norm": 0.76953125, "learning_rate": 7.447866498358366e-05, "loss": 1.533, "num_input_tokens_seen": 2106851328, "step": 1786 }, { "epoch": 0.02205969092839448, "grad_norm": 0.71875, "learning_rate": 7.447075236524797e-05, "loss": 1.6696, "num_input_tokens_seen": 2108030976, "step": 1787 }, { "epoch": 0.022177657190043647, "grad_norm": 0.87890625, "learning_rate": 7.446283450213256e-05, "loss": 1.4246, "num_input_tokens_seen": 2109210624, "step": 1788 }, { "epoch": 0.022295623451692816, "grad_norm": 0.703125, "learning_rate": 7.445491139544218e-05, "loss": 1.6228, "num_input_tokens_seen": 2110390272, "step": 1789 }, { "epoch": 0.022413589713341984, "grad_norm": 0.828125, "learning_rate": 7.444698304638229e-05, "loss": 1.6991, "num_input_tokens_seen": 2111569920, "step": 1790 }, { "epoch": 0.022531555974991152, "grad_norm": 0.8359375, "learning_rate": 7.443904945615924e-05, "loss": 1.5197, "num_input_tokens_seen": 2112749568, "step": 1791 }, { "epoch": 0.02264952223664032, "grad_norm": 0.7890625, "learning_rate": 7.443111062598013e-05, "loss": 1.5836, "num_input_tokens_seen": 2113929216, "step": 1792 }, { "epoch": 0.02276748849828949, "grad_norm": 0.8203125, "learning_rate": 7.442316655705285e-05, "loss": 1.6029, "num_input_tokens_seen": 2115108864, "step": 1793 }, { "epoch": 0.022885454759938657, "grad_norm": 0.8125, "learning_rate": 7.441521725058613e-05, "loss": 1.6011, "num_input_tokens_seen": 2116288512, "step": 1794 }, { "epoch": 0.023003421021587825, "grad_norm": 0.75390625, "learning_rate": 7.440726270778945e-05, "loss": 1.6227, "num_input_tokens_seen": 2117468160, "step": 1795 }, { "epoch": 0.023121387283236993, "grad_norm": 0.8046875, "learning_rate": 7.43993029298731e-05, "loss": 1.4413, "num_input_tokens_seen": 2118647808, "step": 1796 }, { "epoch": 0.02323935354488616, "grad_norm": 0.69140625, "learning_rate": 7.43913379180482e-05, "loss": 1.5052, "num_input_tokens_seen": 2119827456, "step": 1797 }, { "epoch": 0.02335731980653533, "grad_norm": 0.71484375, "learning_rate": 7.438336767352662e-05, "loss": 1.5939, "num_input_tokens_seen": 2121007104, "step": 1798 }, { "epoch": 0.023475286068184498, "grad_norm": 0.68359375, "learning_rate": 7.437539219752105e-05, "loss": 1.649, "num_input_tokens_seen": 2122186752, "step": 1799 }, { "epoch": 0.023593252329833666, "grad_norm": 0.76953125, "learning_rate": 7.436741149124496e-05, "loss": 1.5859, "num_input_tokens_seen": 2123366400, "step": 1800 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.2597548961639404, "eval_wikipedia_runtime": 162.0396, "eval_wikipedia_samples_per_second": 4.332, "eval_wikipedia_steps_per_second": 0.185, "num_input_tokens_seen": 2123366400, "step": 1800 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.024182319641113, "eval_toxicity_runtime": 0.9591, "eval_toxicity_samples_per_second": 2.085, "eval_toxicity_steps_per_second": 1.043, "num_input_tokens_seen": 2123366400, "step": 1800 }, { "epoch": 0.023711218591482838, "grad_norm": 0.73046875, "learning_rate": 7.435942555591265e-05, "loss": 1.5798, "num_input_tokens_seen": 2124546048, "step": 1801 }, { "epoch": 0.023829184853132006, "grad_norm": 0.8203125, "learning_rate": 7.435143439273918e-05, "loss": 1.6395, "num_input_tokens_seen": 2125725696, "step": 1802 }, { "epoch": 0.023947151114781174, "grad_norm": 0.7421875, "learning_rate": 7.434343800294041e-05, "loss": 1.6122, "num_input_tokens_seen": 2126905344, "step": 1803 }, { "epoch": 0.024065117376430342, "grad_norm": 0.76171875, "learning_rate": 7.433543638773302e-05, "loss": 1.5967, "num_input_tokens_seen": 2128084992, "step": 1804 }, { "epoch": 0.02418308363807951, "grad_norm": 0.7734375, "learning_rate": 7.432742954833445e-05, "loss": 1.5463, "num_input_tokens_seen": 2129264640, "step": 1805 }, { "epoch": 0.02430104989972868, "grad_norm": 0.76171875, "learning_rate": 7.431941748596297e-05, "loss": 1.5567, "num_input_tokens_seen": 2130444288, "step": 1806 }, { "epoch": 0.024419016161377847, "grad_norm": 0.70703125, "learning_rate": 7.431140020183763e-05, "loss": 1.5494, "num_input_tokens_seen": 2131623936, "step": 1807 }, { "epoch": 0.024536982423027015, "grad_norm": 0.76171875, "learning_rate": 7.430337769717824e-05, "loss": 1.6508, "num_input_tokens_seen": 2132803584, "step": 1808 }, { "epoch": 0.024654948684676183, "grad_norm": 0.65234375, "learning_rate": 7.429534997320546e-05, "loss": 1.7077, "num_input_tokens_seen": 2133983232, "step": 1809 }, { "epoch": 0.02477291494632535, "grad_norm": 0.8046875, "learning_rate": 7.428731703114075e-05, "loss": 1.4564, "num_input_tokens_seen": 2135162880, "step": 1810 }, { "epoch": 0.02489088120797452, "grad_norm": 0.68359375, "learning_rate": 7.427927887220629e-05, "loss": 1.6478, "num_input_tokens_seen": 2136342528, "step": 1811 }, { "epoch": 0.025008847469623688, "grad_norm": 0.6953125, "learning_rate": 7.427123549762511e-05, "loss": 1.6442, "num_input_tokens_seen": 2137522176, "step": 1812 }, { "epoch": 0.025126813731272856, "grad_norm": 0.77734375, "learning_rate": 7.426318690862104e-05, "loss": 1.4693, "num_input_tokens_seen": 2138701824, "step": 1813 }, { "epoch": 0.025244779992922024, "grad_norm": 0.69140625, "learning_rate": 7.425513310641867e-05, "loss": 1.5982, "num_input_tokens_seen": 2139881472, "step": 1814 }, { "epoch": 0.025362746254571193, "grad_norm": 0.7421875, "learning_rate": 7.42470740922434e-05, "loss": 1.4796, "num_input_tokens_seen": 2141061120, "step": 1815 }, { "epoch": 0.02548071251622036, "grad_norm": 0.8203125, "learning_rate": 7.423900986732143e-05, "loss": 1.4444, "num_input_tokens_seen": 2142240768, "step": 1816 }, { "epoch": 0.02559867877786953, "grad_norm": 0.6640625, "learning_rate": 7.423094043287974e-05, "loss": 1.5656, "num_input_tokens_seen": 2143420416, "step": 1817 }, { "epoch": 0.025716645039518697, "grad_norm": 0.78515625, "learning_rate": 7.422286579014609e-05, "loss": 1.4905, "num_input_tokens_seen": 2144600064, "step": 1818 }, { "epoch": 0.025834611301167865, "grad_norm": 0.68359375, "learning_rate": 7.421478594034907e-05, "loss": 1.688, "num_input_tokens_seen": 2145779712, "step": 1819 }, { "epoch": 0.025952577562817034, "grad_norm": 0.82421875, "learning_rate": 7.420670088471803e-05, "loss": 1.5098, "num_input_tokens_seen": 2146959360, "step": 1820 }, { "epoch": 0.026070543824466202, "grad_norm": 0.7109375, "learning_rate": 7.419861062448314e-05, "loss": 1.5118, "num_input_tokens_seen": 2148139008, "step": 1821 }, { "epoch": 0.02618851008611537, "grad_norm": 0.7734375, "learning_rate": 7.419051516087535e-05, "loss": 1.5599, "num_input_tokens_seen": 2149318656, "step": 1822 }, { "epoch": 0.026306476347764538, "grad_norm": 0.72265625, "learning_rate": 7.418241449512638e-05, "loss": 1.5937, "num_input_tokens_seen": 2150498304, "step": 1823 }, { "epoch": 0.026424442609413706, "grad_norm": 0.72265625, "learning_rate": 7.417430862846875e-05, "loss": 1.5222, "num_input_tokens_seen": 2151677952, "step": 1824 }, { "epoch": 0.026542408871062875, "grad_norm": 0.76171875, "learning_rate": 7.416619756213581e-05, "loss": 1.5716, "num_input_tokens_seen": 2152857600, "step": 1825 }, { "epoch": 0.026660375132712043, "grad_norm": 0.74609375, "learning_rate": 7.415808129736164e-05, "loss": 1.5728, "num_input_tokens_seen": 2154037248, "step": 1826 }, { "epoch": 0.02677834139436121, "grad_norm": 0.6484375, "learning_rate": 7.414995983538116e-05, "loss": 1.7184, "num_input_tokens_seen": 2155216896, "step": 1827 }, { "epoch": 0.026896307656010383, "grad_norm": 0.72265625, "learning_rate": 7.414183317743008e-05, "loss": 1.5229, "num_input_tokens_seen": 2156396544, "step": 1828 }, { "epoch": 0.02701427391765955, "grad_norm": 0.6875, "learning_rate": 7.413370132474485e-05, "loss": 1.5489, "num_input_tokens_seen": 2157576192, "step": 1829 }, { "epoch": 0.02713224017930872, "grad_norm": 0.6484375, "learning_rate": 7.412556427856276e-05, "loss": 1.642, "num_input_tokens_seen": 2158755840, "step": 1830 }, { "epoch": 0.027250206440957887, "grad_norm": 0.6640625, "learning_rate": 7.411742204012188e-05, "loss": 1.6251, "num_input_tokens_seen": 2159935488, "step": 1831 }, { "epoch": 0.027368172702607056, "grad_norm": 0.73046875, "learning_rate": 7.410927461066107e-05, "loss": 1.5202, "num_input_tokens_seen": 2161115136, "step": 1832 }, { "epoch": 0.027486138964256224, "grad_norm": 0.82421875, "learning_rate": 7.410112199141994e-05, "loss": 1.5337, "num_input_tokens_seen": 2162294784, "step": 1833 }, { "epoch": 0.027604105225905392, "grad_norm": 0.80859375, "learning_rate": 7.409296418363897e-05, "loss": 1.4675, "num_input_tokens_seen": 2163474432, "step": 1834 }, { "epoch": 0.02772207148755456, "grad_norm": 0.7109375, "learning_rate": 7.408480118855935e-05, "loss": 1.6147, "num_input_tokens_seen": 2164654080, "step": 1835 }, { "epoch": 0.02784003774920373, "grad_norm": 0.95703125, "learning_rate": 7.407663300742309e-05, "loss": 1.4314, "num_input_tokens_seen": 2165833728, "step": 1836 }, { "epoch": 0.027958004010852897, "grad_norm": 0.80078125, "learning_rate": 7.406845964147303e-05, "loss": 1.5776, "num_input_tokens_seen": 2167013376, "step": 1837 }, { "epoch": 0.028075970272502065, "grad_norm": 0.78125, "learning_rate": 7.406028109195273e-05, "loss": 1.6119, "num_input_tokens_seen": 2168193024, "step": 1838 }, { "epoch": 0.028193936534151233, "grad_norm": 0.828125, "learning_rate": 7.405209736010656e-05, "loss": 1.5874, "num_input_tokens_seen": 2169372672, "step": 1839 }, { "epoch": 0.0283119027958004, "grad_norm": 0.75, "learning_rate": 7.404390844717971e-05, "loss": 1.583, "num_input_tokens_seen": 2170552320, "step": 1840 }, { "epoch": 0.02842986905744957, "grad_norm": 0.6328125, "learning_rate": 7.403571435441814e-05, "loss": 1.6481, "num_input_tokens_seen": 2171731968, "step": 1841 }, { "epoch": 0.028547835319098738, "grad_norm": 0.71875, "learning_rate": 7.402751508306858e-05, "loss": 1.5372, "num_input_tokens_seen": 2172911616, "step": 1842 }, { "epoch": 0.028665801580747906, "grad_norm": 0.6484375, "learning_rate": 7.401931063437855e-05, "loss": 1.5178, "num_input_tokens_seen": 2174091264, "step": 1843 }, { "epoch": 0.028783767842397074, "grad_norm": 0.68359375, "learning_rate": 7.40111010095964e-05, "loss": 1.5221, "num_input_tokens_seen": 2175270912, "step": 1844 }, { "epoch": 0.028901734104046242, "grad_norm": 0.65625, "learning_rate": 7.400288620997123e-05, "loss": 1.7355, "num_input_tokens_seen": 2176450560, "step": 1845 }, { "epoch": 0.02901970036569541, "grad_norm": 0.68359375, "learning_rate": 7.399466623675292e-05, "loss": 1.5432, "num_input_tokens_seen": 2177630208, "step": 1846 }, { "epoch": 0.02913766662734458, "grad_norm": 0.6796875, "learning_rate": 7.398644109119214e-05, "loss": 1.4819, "num_input_tokens_seen": 2178809856, "step": 1847 }, { "epoch": 0.029255632888993747, "grad_norm": 0.703125, "learning_rate": 7.39782107745404e-05, "loss": 1.5471, "num_input_tokens_seen": 2179989504, "step": 1848 }, { "epoch": 0.029373599150642915, "grad_norm": 0.7265625, "learning_rate": 7.396997528804994e-05, "loss": 1.6242, "num_input_tokens_seen": 2181169152, "step": 1849 }, { "epoch": 0.029491565412292083, "grad_norm": 0.6328125, "learning_rate": 7.396173463297379e-05, "loss": 1.5332, "num_input_tokens_seen": 2182348800, "step": 1850 }, { "epoch": 0.02960953167394125, "grad_norm": 0.66796875, "learning_rate": 7.395348881056578e-05, "loss": 1.4848, "num_input_tokens_seen": 2183528448, "step": 1851 }, { "epoch": 0.02972749793559042, "grad_norm": 0.71484375, "learning_rate": 7.394523782208053e-05, "loss": 1.5674, "num_input_tokens_seen": 2184708096, "step": 1852 }, { "epoch": 0.029845464197239588, "grad_norm": 0.703125, "learning_rate": 7.393698166877346e-05, "loss": 1.7141, "num_input_tokens_seen": 2185887744, "step": 1853 }, { "epoch": 0.029963430458888756, "grad_norm": 0.68359375, "learning_rate": 7.39287203519007e-05, "loss": 1.5141, "num_input_tokens_seen": 2187067392, "step": 1854 }, { "epoch": 0.030081396720537928, "grad_norm": 0.6640625, "learning_rate": 7.39204538727193e-05, "loss": 1.6283, "num_input_tokens_seen": 2188247040, "step": 1855 }, { "epoch": 0.030199362982187096, "grad_norm": 0.66015625, "learning_rate": 7.391218223248695e-05, "loss": 1.7063, "num_input_tokens_seen": 2189426688, "step": 1856 }, { "epoch": 0.030317329243836264, "grad_norm": 0.765625, "learning_rate": 7.390390543246224e-05, "loss": 1.5577, "num_input_tokens_seen": 2190606336, "step": 1857 }, { "epoch": 0.030435295505485432, "grad_norm": 0.7578125, "learning_rate": 7.389562347390447e-05, "loss": 1.5518, "num_input_tokens_seen": 2191785984, "step": 1858 }, { "epoch": 0.0305532617671346, "grad_norm": 0.7890625, "learning_rate": 7.388733635807378e-05, "loss": 1.5015, "num_input_tokens_seen": 2192965632, "step": 1859 }, { "epoch": 0.03067122802878377, "grad_norm": 0.6640625, "learning_rate": 7.387904408623103e-05, "loss": 1.4798, "num_input_tokens_seen": 2194145280, "step": 1860 }, { "epoch": 0.030789194290432937, "grad_norm": 0.7578125, "learning_rate": 7.387074665963794e-05, "loss": 1.5943, "num_input_tokens_seen": 2195324928, "step": 1861 }, { "epoch": 0.030907160552082105, "grad_norm": 0.6796875, "learning_rate": 7.386244407955695e-05, "loss": 1.6022, "num_input_tokens_seen": 2196504576, "step": 1862 }, { "epoch": 0.031025126813731273, "grad_norm": 0.7421875, "learning_rate": 7.385413634725132e-05, "loss": 1.5654, "num_input_tokens_seen": 2197684224, "step": 1863 }, { "epoch": 0.03114309307538044, "grad_norm": 0.77734375, "learning_rate": 7.384582346398509e-05, "loss": 1.597, "num_input_tokens_seen": 2198863872, "step": 1864 }, { "epoch": 0.031261059337029606, "grad_norm": 0.765625, "learning_rate": 7.383750543102308e-05, "loss": 1.4255, "num_input_tokens_seen": 2200043520, "step": 1865 }, { "epoch": 0.03137902559867878, "grad_norm": 0.72265625, "learning_rate": 7.382918224963087e-05, "loss": 1.6604, "num_input_tokens_seen": 2201223168, "step": 1866 }, { "epoch": 0.03149699186032794, "grad_norm": 0.765625, "learning_rate": 7.382085392107486e-05, "loss": 1.6227, "num_input_tokens_seen": 2202402816, "step": 1867 }, { "epoch": 0.031614958121977114, "grad_norm": 0.94140625, "learning_rate": 7.381252044662223e-05, "loss": 1.5137, "num_input_tokens_seen": 2203582464, "step": 1868 }, { "epoch": 0.031732924383626286, "grad_norm": 0.71875, "learning_rate": 7.380418182754093e-05, "loss": 1.6039, "num_input_tokens_seen": 2204762112, "step": 1869 }, { "epoch": 0.03185089064527545, "grad_norm": 0.76953125, "learning_rate": 7.379583806509967e-05, "loss": 1.4421, "num_input_tokens_seen": 2205941760, "step": 1870 }, { "epoch": 0.03196885690692462, "grad_norm": 0.66796875, "learning_rate": 7.378748916056798e-05, "loss": 1.6332, "num_input_tokens_seen": 2207121408, "step": 1871 }, { "epoch": 0.03208682316857379, "grad_norm": 0.71484375, "learning_rate": 7.377913511521617e-05, "loss": 1.5797, "num_input_tokens_seen": 2208301056, "step": 1872 }, { "epoch": 0.03220478943022296, "grad_norm": 0.66796875, "learning_rate": 7.377077593031531e-05, "loss": 1.598, "num_input_tokens_seen": 2209480704, "step": 1873 }, { "epoch": 0.032322755691872124, "grad_norm": 0.73046875, "learning_rate": 7.376241160713727e-05, "loss": 1.5616, "num_input_tokens_seen": 2210660352, "step": 1874 }, { "epoch": 0.032440721953521295, "grad_norm": 0.765625, "learning_rate": 7.37540421469547e-05, "loss": 1.4614, "num_input_tokens_seen": 2211840000, "step": 1875 }, { "epoch": 0.03255868821517046, "grad_norm": 0.6796875, "learning_rate": 7.374566755104098e-05, "loss": 1.5495, "num_input_tokens_seen": 2213019648, "step": 1876 }, { "epoch": 0.03267665447681963, "grad_norm": 0.81640625, "learning_rate": 7.373728782067038e-05, "loss": 1.358, "num_input_tokens_seen": 2214199296, "step": 1877 }, { "epoch": 0.0327946207384688, "grad_norm": 0.79296875, "learning_rate": 7.372890295711786e-05, "loss": 1.4047, "num_input_tokens_seen": 2215378944, "step": 1878 }, { "epoch": 0.03291258700011797, "grad_norm": 0.7578125, "learning_rate": 7.37205129616592e-05, "loss": 1.4799, "num_input_tokens_seen": 2216558592, "step": 1879 }, { "epoch": 0.03303055326176713, "grad_norm": 0.71875, "learning_rate": 7.371211783557095e-05, "loss": 1.3531, "num_input_tokens_seen": 2217738240, "step": 1880 }, { "epoch": 0.033148519523416305, "grad_norm": 0.70703125, "learning_rate": 7.370371758013042e-05, "loss": 1.5988, "num_input_tokens_seen": 2218917888, "step": 1881 }, { "epoch": 0.03326648578506547, "grad_norm": 0.69140625, "learning_rate": 7.369531219661575e-05, "loss": 1.5523, "num_input_tokens_seen": 2220097536, "step": 1882 }, { "epoch": 0.03338445204671464, "grad_norm": 0.6484375, "learning_rate": 7.368690168630582e-05, "loss": 1.5946, "num_input_tokens_seen": 2221277184, "step": 1883 }, { "epoch": 0.033502418308363806, "grad_norm": 0.7109375, "learning_rate": 7.367848605048031e-05, "loss": 1.4925, "num_input_tokens_seen": 2222456832, "step": 1884 }, { "epoch": 0.03362038457001298, "grad_norm": 0.76171875, "learning_rate": 7.367006529041967e-05, "loss": 1.5686, "num_input_tokens_seen": 2223636480, "step": 1885 }, { "epoch": 0.03373835083166214, "grad_norm": 0.8359375, "learning_rate": 7.366163940740511e-05, "loss": 1.5127, "num_input_tokens_seen": 2224816128, "step": 1886 }, { "epoch": 0.033856317093311314, "grad_norm": 0.83203125, "learning_rate": 7.365320840271867e-05, "loss": 1.5116, "num_input_tokens_seen": 2225995776, "step": 1887 }, { "epoch": 0.03397428335496048, "grad_norm": 0.796875, "learning_rate": 7.364477227764314e-05, "loss": 1.486, "num_input_tokens_seen": 2227175424, "step": 1888 }, { "epoch": 0.03409224961660965, "grad_norm": 1.046875, "learning_rate": 7.363633103346207e-05, "loss": 1.6249, "num_input_tokens_seen": 2228355072, "step": 1889 }, { "epoch": 0.034210215878258815, "grad_norm": 0.8984375, "learning_rate": 7.362788467145983e-05, "loss": 1.6136, "num_input_tokens_seen": 2229534720, "step": 1890 }, { "epoch": 0.03432818213990799, "grad_norm": 0.84765625, "learning_rate": 7.361943319292153e-05, "loss": 1.5335, "num_input_tokens_seen": 2230714368, "step": 1891 }, { "epoch": 0.03444614840155715, "grad_norm": 0.7734375, "learning_rate": 7.36109765991331e-05, "loss": 1.4603, "num_input_tokens_seen": 2231894016, "step": 1892 }, { "epoch": 0.03456411466320632, "grad_norm": 0.83984375, "learning_rate": 7.360251489138119e-05, "loss": 1.4003, "num_input_tokens_seen": 2233073664, "step": 1893 }, { "epoch": 0.03468208092485549, "grad_norm": 0.73828125, "learning_rate": 7.359404807095328e-05, "loss": 1.5943, "num_input_tokens_seen": 2234253312, "step": 1894 }, { "epoch": 0.03480004718650466, "grad_norm": 0.8203125, "learning_rate": 7.358557613913761e-05, "loss": 1.5785, "num_input_tokens_seen": 2235432960, "step": 1895 }, { "epoch": 0.03491801344815383, "grad_norm": 0.8359375, "learning_rate": 7.357709909722319e-05, "loss": 1.3838, "num_input_tokens_seen": 2236612608, "step": 1896 }, { "epoch": 0.035035979709802996, "grad_norm": 0.69140625, "learning_rate": 7.356861694649985e-05, "loss": 1.5458, "num_input_tokens_seen": 2237792256, "step": 1897 }, { "epoch": 0.03515394597145217, "grad_norm": 0.73828125, "learning_rate": 7.35601296882581e-05, "loss": 1.5471, "num_input_tokens_seen": 2238971904, "step": 1898 }, { "epoch": 0.03527191223310133, "grad_norm": 0.8046875, "learning_rate": 7.355163732378937e-05, "loss": 1.4163, "num_input_tokens_seen": 2240151552, "step": 1899 }, { "epoch": 0.035389878494750504, "grad_norm": 0.703125, "learning_rate": 7.35431398543857e-05, "loss": 1.5865, "num_input_tokens_seen": 2241331200, "step": 1900 }, { "epoch": 0.03550784475639967, "grad_norm": 0.8203125, "learning_rate": 7.353463728134005e-05, "loss": 1.3711, "num_input_tokens_seen": 2242510848, "step": 1901 }, { "epoch": 0.03562581101804884, "grad_norm": 0.68359375, "learning_rate": 7.352612960594609e-05, "loss": 1.5282, "num_input_tokens_seen": 2243690496, "step": 1902 }, { "epoch": 0.035743777279698005, "grad_norm": 0.765625, "learning_rate": 7.351761682949827e-05, "loss": 1.6066, "num_input_tokens_seen": 2244870144, "step": 1903 }, { "epoch": 0.03586174354134718, "grad_norm": 0.76953125, "learning_rate": 7.350909895329183e-05, "loss": 1.5216, "num_input_tokens_seen": 2246049792, "step": 1904 }, { "epoch": 0.03597970980299634, "grad_norm": 0.7421875, "learning_rate": 7.350057597862277e-05, "loss": 1.5401, "num_input_tokens_seen": 2247229440, "step": 1905 }, { "epoch": 0.03609767606464551, "grad_norm": 0.77734375, "learning_rate": 7.349204790678788e-05, "loss": 1.4897, "num_input_tokens_seen": 2248409088, "step": 1906 }, { "epoch": 0.03621564232629468, "grad_norm": 0.8359375, "learning_rate": 7.348351473908469e-05, "loss": 1.5963, "num_input_tokens_seen": 2249588736, "step": 1907 }, { "epoch": 0.03633360858794385, "grad_norm": 0.765625, "learning_rate": 7.34749764768116e-05, "loss": 1.6018, "num_input_tokens_seen": 2250768384, "step": 1908 }, { "epoch": 0.036451574849593014, "grad_norm": 0.84765625, "learning_rate": 7.346643312126766e-05, "loss": 1.5299, "num_input_tokens_seen": 2251948032, "step": 1909 }, { "epoch": 0.036569541111242186, "grad_norm": 0.68359375, "learning_rate": 7.345788467375278e-05, "loss": 1.4308, "num_input_tokens_seen": 2253127680, "step": 1910 }, { "epoch": 0.03668750737289135, "grad_norm": 0.984375, "learning_rate": 7.344933113556762e-05, "loss": 1.6228, "num_input_tokens_seen": 2254307328, "step": 1911 }, { "epoch": 0.03680547363454052, "grad_norm": 0.78515625, "learning_rate": 7.344077250801361e-05, "loss": 1.5428, "num_input_tokens_seen": 2255486976, "step": 1912 }, { "epoch": 0.03692343989618969, "grad_norm": 0.8828125, "learning_rate": 7.343220879239296e-05, "loss": 1.4764, "num_input_tokens_seen": 2256666624, "step": 1913 }, { "epoch": 0.03704140615783886, "grad_norm": 0.9140625, "learning_rate": 7.342363999000865e-05, "loss": 1.4045, "num_input_tokens_seen": 2257846272, "step": 1914 }, { "epoch": 0.037159372419488024, "grad_norm": 0.87109375, "learning_rate": 7.341506610216445e-05, "loss": 1.4639, "num_input_tokens_seen": 2259025920, "step": 1915 }, { "epoch": 0.037277338681137195, "grad_norm": 0.8125, "learning_rate": 7.340648713016487e-05, "loss": 1.4876, "num_input_tokens_seen": 2260205568, "step": 1916 }, { "epoch": 0.03739530494278636, "grad_norm": 0.7890625, "learning_rate": 7.339790307531523e-05, "loss": 1.4313, "num_input_tokens_seen": 2261385216, "step": 1917 }, { "epoch": 0.03751327120443553, "grad_norm": 0.77734375, "learning_rate": 7.33893139389216e-05, "loss": 1.6842, "num_input_tokens_seen": 2262564864, "step": 1918 }, { "epoch": 0.0376312374660847, "grad_norm": 0.765625, "learning_rate": 7.338071972229083e-05, "loss": 1.3915, "num_input_tokens_seen": 2263744512, "step": 1919 }, { "epoch": 0.03774920372773387, "grad_norm": 0.765625, "learning_rate": 7.337212042673055e-05, "loss": 1.4723, "num_input_tokens_seen": 2264924160, "step": 1920 }, { "epoch": 0.03786716998938304, "grad_norm": 0.828125, "learning_rate": 7.336351605354916e-05, "loss": 1.3676, "num_input_tokens_seen": 2266103808, "step": 1921 }, { "epoch": 0.037985136251032205, "grad_norm": 0.66796875, "learning_rate": 7.335490660405581e-05, "loss": 1.512, "num_input_tokens_seen": 2267283456, "step": 1922 }, { "epoch": 0.038103102512681376, "grad_norm": 0.703125, "learning_rate": 7.334629207956046e-05, "loss": 1.4637, "num_input_tokens_seen": 2268463104, "step": 1923 }, { "epoch": 0.03822106877433054, "grad_norm": 0.66796875, "learning_rate": 7.333767248137382e-05, "loss": 1.5311, "num_input_tokens_seen": 2269642752, "step": 1924 }, { "epoch": 0.03833903503597971, "grad_norm": 0.6875, "learning_rate": 7.332904781080736e-05, "loss": 1.4918, "num_input_tokens_seen": 2270822400, "step": 1925 }, { "epoch": 0.03845700129762888, "grad_norm": 0.67578125, "learning_rate": 7.332041806917337e-05, "loss": 1.5913, "num_input_tokens_seen": 2272002048, "step": 1926 }, { "epoch": 0.03857496755927805, "grad_norm": 0.6953125, "learning_rate": 7.331178325778485e-05, "loss": 1.3829, "num_input_tokens_seen": 2273181696, "step": 1927 }, { "epoch": 0.038692933820927214, "grad_norm": 0.6875, "learning_rate": 7.330314337795561e-05, "loss": 1.4821, "num_input_tokens_seen": 2274361344, "step": 1928 }, { "epoch": 0.038810900082576386, "grad_norm": 0.7421875, "learning_rate": 7.329449843100022e-05, "loss": 1.5275, "num_input_tokens_seen": 2275540992, "step": 1929 }, { "epoch": 0.03892886634422555, "grad_norm": 0.63671875, "learning_rate": 7.328584841823401e-05, "loss": 1.634, "num_input_tokens_seen": 2276720640, "step": 1930 }, { "epoch": 0.03904683260587472, "grad_norm": 0.7109375, "learning_rate": 7.327719334097312e-05, "loss": 1.6776, "num_input_tokens_seen": 2277900288, "step": 1931 }, { "epoch": 0.03916479886752389, "grad_norm": 0.70703125, "learning_rate": 7.326853320053442e-05, "loss": 1.4269, "num_input_tokens_seen": 2279079936, "step": 1932 }, { "epoch": 0.03928276512917306, "grad_norm": 0.66015625, "learning_rate": 7.325986799823555e-05, "loss": 1.4935, "num_input_tokens_seen": 2280259584, "step": 1933 }, { "epoch": 0.03940073139082222, "grad_norm": 0.66015625, "learning_rate": 7.325119773539497e-05, "loss": 1.5445, "num_input_tokens_seen": 2281439232, "step": 1934 }, { "epoch": 0.039518697652471395, "grad_norm": 0.71484375, "learning_rate": 7.324252241333185e-05, "loss": 1.4365, "num_input_tokens_seen": 2282618880, "step": 1935 }, { "epoch": 0.03963666391412056, "grad_norm": 0.6484375, "learning_rate": 7.323384203336615e-05, "loss": 1.5626, "num_input_tokens_seen": 2283798528, "step": 1936 }, { "epoch": 0.03975463017576973, "grad_norm": 0.75390625, "learning_rate": 7.32251565968186e-05, "loss": 1.4364, "num_input_tokens_seen": 2284978176, "step": 1937 }, { "epoch": 0.039872596437418896, "grad_norm": 0.65234375, "learning_rate": 7.321646610501073e-05, "loss": 1.5292, "num_input_tokens_seen": 2286157824, "step": 1938 }, { "epoch": 0.03999056269906807, "grad_norm": 0.71484375, "learning_rate": 7.320777055926478e-05, "loss": 1.3565, "num_input_tokens_seen": 2287337472, "step": 1939 }, { "epoch": 0.04010852896071723, "grad_norm": 0.69921875, "learning_rate": 7.319906996090383e-05, "loss": 1.5397, "num_input_tokens_seen": 2288517120, "step": 1940 }, { "epoch": 0.040226495222366404, "grad_norm": 0.6640625, "learning_rate": 7.319036431125165e-05, "loss": 1.4882, "num_input_tokens_seen": 2289696768, "step": 1941 }, { "epoch": 0.04034446148401557, "grad_norm": 0.67578125, "learning_rate": 7.318165361163284e-05, "loss": 1.6145, "num_input_tokens_seen": 2290876416, "step": 1942 }, { "epoch": 0.04046242774566474, "grad_norm": 0.6484375, "learning_rate": 7.317293786337273e-05, "loss": 1.5312, "num_input_tokens_seen": 2292056064, "step": 1943 }, { "epoch": 0.040580394007313905, "grad_norm": 0.76171875, "learning_rate": 7.316421706779746e-05, "loss": 1.5138, "num_input_tokens_seen": 2293235712, "step": 1944 }, { "epoch": 0.04069836026896308, "grad_norm": 0.703125, "learning_rate": 7.31554912262339e-05, "loss": 1.4698, "num_input_tokens_seen": 2294415360, "step": 1945 }, { "epoch": 0.04081632653061224, "grad_norm": 0.65234375, "learning_rate": 7.314676034000968e-05, "loss": 1.4988, "num_input_tokens_seen": 2295595008, "step": 1946 }, { "epoch": 0.04093429279226141, "grad_norm": 0.65234375, "learning_rate": 7.313802441045326e-05, "loss": 1.5906, "num_input_tokens_seen": 2296774656, "step": 1947 }, { "epoch": 0.041052259053910585, "grad_norm": 0.67578125, "learning_rate": 7.31292834388938e-05, "loss": 1.7172, "num_input_tokens_seen": 2297954304, "step": 1948 }, { "epoch": 0.04117022531555975, "grad_norm": 0.66796875, "learning_rate": 7.312053742666126e-05, "loss": 1.5577, "num_input_tokens_seen": 2299133952, "step": 1949 }, { "epoch": 0.04128819157720892, "grad_norm": 0.6953125, "learning_rate": 7.311178637508634e-05, "loss": 1.5534, "num_input_tokens_seen": 2300313600, "step": 1950 }, { "epoch": 0.041406157838858086, "grad_norm": 0.62109375, "learning_rate": 7.310303028550056e-05, "loss": 1.4872, "num_input_tokens_seen": 2301493248, "step": 1951 }, { "epoch": 0.04152412410050726, "grad_norm": 0.73046875, "learning_rate": 7.309426915923615e-05, "loss": 1.4101, "num_input_tokens_seen": 2302672896, "step": 1952 }, { "epoch": 0.04164209036215642, "grad_norm": 0.69140625, "learning_rate": 7.308550299762613e-05, "loss": 1.609, "num_input_tokens_seen": 2303852544, "step": 1953 }, { "epoch": 0.041760056623805594, "grad_norm": 0.63671875, "learning_rate": 7.307673180200429e-05, "loss": 1.6246, "num_input_tokens_seen": 2305032192, "step": 1954 }, { "epoch": 0.04187802288545476, "grad_norm": 0.74609375, "learning_rate": 7.306795557370519e-05, "loss": 1.5308, "num_input_tokens_seen": 2306211840, "step": 1955 }, { "epoch": 0.04199598914710393, "grad_norm": 0.68359375, "learning_rate": 7.305917431406413e-05, "loss": 1.5942, "num_input_tokens_seen": 2307391488, "step": 1956 }, { "epoch": 0.042113955408753095, "grad_norm": 0.7734375, "learning_rate": 7.30503880244172e-05, "loss": 1.6072, "num_input_tokens_seen": 2308571136, "step": 1957 }, { "epoch": 0.04223192167040227, "grad_norm": 0.734375, "learning_rate": 7.304159670610126e-05, "loss": 1.3998, "num_input_tokens_seen": 2309750784, "step": 1958 }, { "epoch": 0.04234988793205143, "grad_norm": 0.71875, "learning_rate": 7.30328003604539e-05, "loss": 1.5292, "num_input_tokens_seen": 2310930432, "step": 1959 }, { "epoch": 0.042467854193700603, "grad_norm": 0.6875, "learning_rate": 7.302399898881352e-05, "loss": 1.4782, "num_input_tokens_seen": 2312110080, "step": 1960 }, { "epoch": 0.04258582045534977, "grad_norm": 0.703125, "learning_rate": 7.301519259251925e-05, "loss": 1.3287, "num_input_tokens_seen": 2313289728, "step": 1961 }, { "epoch": 0.04270378671699894, "grad_norm": 0.734375, "learning_rate": 7.3006381172911e-05, "loss": 1.5527, "num_input_tokens_seen": 2314469376, "step": 1962 }, { "epoch": 0.042821752978648105, "grad_norm": 0.734375, "learning_rate": 7.299756473132944e-05, "loss": 1.5148, "num_input_tokens_seen": 2315649024, "step": 1963 }, { "epoch": 0.042939719240297276, "grad_norm": 0.6640625, "learning_rate": 7.2988743269116e-05, "loss": 1.541, "num_input_tokens_seen": 2316828672, "step": 1964 }, { "epoch": 0.04305768550194644, "grad_norm": 0.6875, "learning_rate": 7.297991678761289e-05, "loss": 1.4709, "num_input_tokens_seen": 2318008320, "step": 1965 }, { "epoch": 0.04317565176359561, "grad_norm": 0.7421875, "learning_rate": 7.297108528816308e-05, "loss": 1.4995, "num_input_tokens_seen": 2319187968, "step": 1966 }, { "epoch": 0.04329361802524478, "grad_norm": 0.67578125, "learning_rate": 7.296224877211029e-05, "loss": 1.4244, "num_input_tokens_seen": 2320367616, "step": 1967 }, { "epoch": 0.04341158428689395, "grad_norm": 0.9296875, "learning_rate": 7.295340724079899e-05, "loss": 1.5493, "num_input_tokens_seen": 2321547264, "step": 1968 }, { "epoch": 0.043529550548543114, "grad_norm": 0.74609375, "learning_rate": 7.294456069557445e-05, "loss": 1.5873, "num_input_tokens_seen": 2322726912, "step": 1969 }, { "epoch": 0.043647516810192286, "grad_norm": 0.75, "learning_rate": 7.293570913778268e-05, "loss": 1.3775, "num_input_tokens_seen": 2323906560, "step": 1970 }, { "epoch": 0.04376548307184145, "grad_norm": 0.75, "learning_rate": 7.292685256877049e-05, "loss": 1.443, "num_input_tokens_seen": 2325086208, "step": 1971 }, { "epoch": 0.04388344933349062, "grad_norm": 0.703125, "learning_rate": 7.291799098988539e-05, "loss": 1.495, "num_input_tokens_seen": 2326265856, "step": 1972 }, { "epoch": 0.04400141559513979, "grad_norm": 0.65625, "learning_rate": 7.290912440247567e-05, "loss": 1.5695, "num_input_tokens_seen": 2327445504, "step": 1973 }, { "epoch": 0.04411938185678896, "grad_norm": 0.70703125, "learning_rate": 7.290025280789042e-05, "loss": 1.5184, "num_input_tokens_seen": 2328625152, "step": 1974 }, { "epoch": 0.04423734811843813, "grad_norm": 0.6953125, "learning_rate": 7.289137620747947e-05, "loss": 1.4567, "num_input_tokens_seen": 2329804800, "step": 1975 }, { "epoch": 0.044355314380087295, "grad_norm": 0.625, "learning_rate": 7.288249460259338e-05, "loss": 1.5241, "num_input_tokens_seen": 2330984448, "step": 1976 }, { "epoch": 0.044473280641736467, "grad_norm": 0.73828125, "learning_rate": 7.287360799458354e-05, "loss": 1.4828, "num_input_tokens_seen": 2332164096, "step": 1977 }, { "epoch": 0.04459124690338563, "grad_norm": 0.6640625, "learning_rate": 7.286471638480204e-05, "loss": 1.6184, "num_input_tokens_seen": 2333343744, "step": 1978 }, { "epoch": 0.0447092131650348, "grad_norm": 0.73828125, "learning_rate": 7.285581977460174e-05, "loss": 1.3618, "num_input_tokens_seen": 2334523392, "step": 1979 }, { "epoch": 0.04482717942668397, "grad_norm": 0.73046875, "learning_rate": 7.28469181653363e-05, "loss": 1.3807, "num_input_tokens_seen": 2335703040, "step": 1980 }, { "epoch": 0.04494514568833314, "grad_norm": 0.72265625, "learning_rate": 7.283801155836009e-05, "loss": 1.5257, "num_input_tokens_seen": 2336882688, "step": 1981 }, { "epoch": 0.045063111949982304, "grad_norm": 0.69140625, "learning_rate": 7.282909995502828e-05, "loss": 1.5478, "num_input_tokens_seen": 2338062336, "step": 1982 }, { "epoch": 0.045181078211631476, "grad_norm": 0.703125, "learning_rate": 7.282018335669678e-05, "loss": 1.4212, "num_input_tokens_seen": 2339241984, "step": 1983 }, { "epoch": 0.04529904447328064, "grad_norm": 0.82421875, "learning_rate": 7.281126176472226e-05, "loss": 1.4443, "num_input_tokens_seen": 2340421632, "step": 1984 }, { "epoch": 0.04541701073492981, "grad_norm": 0.640625, "learning_rate": 7.280233518046217e-05, "loss": 1.544, "num_input_tokens_seen": 2341601280, "step": 1985 }, { "epoch": 0.04553497699657898, "grad_norm": 0.796875, "learning_rate": 7.27934036052747e-05, "loss": 1.498, "num_input_tokens_seen": 2342780928, "step": 1986 }, { "epoch": 0.04565294325822815, "grad_norm": 0.69140625, "learning_rate": 7.278446704051878e-05, "loss": 1.6416, "num_input_tokens_seen": 2343960576, "step": 1987 }, { "epoch": 0.04577090951987731, "grad_norm": 0.76171875, "learning_rate": 7.277552548755414e-05, "loss": 1.4801, "num_input_tokens_seen": 2345140224, "step": 1988 }, { "epoch": 0.045888875781526485, "grad_norm": 0.703125, "learning_rate": 7.276657894774126e-05, "loss": 1.4833, "num_input_tokens_seen": 2346319872, "step": 1989 }, { "epoch": 0.04600684204317565, "grad_norm": 0.73046875, "learning_rate": 7.275762742244135e-05, "loss": 1.5809, "num_input_tokens_seen": 2347499520, "step": 1990 }, { "epoch": 0.04612480830482482, "grad_norm": 0.73828125, "learning_rate": 7.274867091301642e-05, "loss": 1.553, "num_input_tokens_seen": 2348679168, "step": 1991 }, { "epoch": 0.046242774566473986, "grad_norm": 0.890625, "learning_rate": 7.27397094208292e-05, "loss": 1.3766, "num_input_tokens_seen": 2349858816, "step": 1992 }, { "epoch": 0.04636074082812316, "grad_norm": 0.75390625, "learning_rate": 7.27307429472432e-05, "loss": 1.5416, "num_input_tokens_seen": 2351038464, "step": 1993 }, { "epoch": 0.04647870708977232, "grad_norm": 0.8828125, "learning_rate": 7.27217714936227e-05, "loss": 1.4611, "num_input_tokens_seen": 2352218112, "step": 1994 }, { "epoch": 0.046596673351421494, "grad_norm": 0.71875, "learning_rate": 7.271279506133269e-05, "loss": 1.4342, "num_input_tokens_seen": 2353397760, "step": 1995 }, { "epoch": 0.04671463961307066, "grad_norm": 0.84765625, "learning_rate": 7.270381365173897e-05, "loss": 1.5395, "num_input_tokens_seen": 2354577408, "step": 1996 }, { "epoch": 0.04683260587471983, "grad_norm": 0.7265625, "learning_rate": 7.269482726620807e-05, "loss": 1.5751, "num_input_tokens_seen": 2355757056, "step": 1997 }, { "epoch": 0.046950572136368995, "grad_norm": 0.75, "learning_rate": 7.268583590610729e-05, "loss": 1.5388, "num_input_tokens_seen": 2356936704, "step": 1998 }, { "epoch": 0.04706853839801817, "grad_norm": 0.7421875, "learning_rate": 7.267683957280466e-05, "loss": 1.4628, "num_input_tokens_seen": 2358116352, "step": 1999 }, { "epoch": 0.04718650465966733, "grad_norm": 0.6875, "learning_rate": 7.266783826766901e-05, "loss": 1.6311, "num_input_tokens_seen": 2359296000, "step": 2000 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.2608883380889893, "eval_wikipedia_runtime": 163.5557, "eval_wikipedia_samples_per_second": 4.292, "eval_wikipedia_steps_per_second": 0.183, "num_input_tokens_seen": 2359296000, "step": 2000 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 4.0160722732543945, "eval_toxicity_runtime": 1.1558, "eval_toxicity_samples_per_second": 1.73, "eval_toxicity_steps_per_second": 0.865, "num_input_tokens_seen": 2359296000, "step": 2000 }, { "epoch": 0.047304470921316503, "grad_norm": 0.74609375, "learning_rate": 7.265883199206989e-05, "loss": 1.4713, "num_input_tokens_seen": 2360475648, "step": 2001 }, { "epoch": 0.047422437182965675, "grad_norm": 0.6953125, "learning_rate": 7.264982074737762e-05, "loss": 1.4955, "num_input_tokens_seen": 2361655296, "step": 2002 }, { "epoch": 0.04754040344461484, "grad_norm": 0.67578125, "learning_rate": 7.264080453496328e-05, "loss": 1.5753, "num_input_tokens_seen": 2362834944, "step": 2003 }, { "epoch": 0.04765836970626401, "grad_norm": 0.6328125, "learning_rate": 7.263178335619868e-05, "loss": 1.6932, "num_input_tokens_seen": 2364014592, "step": 2004 }, { "epoch": 0.047776335967913176, "grad_norm": 0.7109375, "learning_rate": 7.262275721245644e-05, "loss": 1.6407, "num_input_tokens_seen": 2365194240, "step": 2005 }, { "epoch": 0.04789430222956235, "grad_norm": 0.59765625, "learning_rate": 7.26137261051099e-05, "loss": 1.698, "num_input_tokens_seen": 2366373888, "step": 2006 }, { "epoch": 0.04801226849121151, "grad_norm": 0.76953125, "learning_rate": 7.260469003553312e-05, "loss": 1.4249, "num_input_tokens_seen": 2367553536, "step": 2007 }, { "epoch": 0.048130234752860684, "grad_norm": 0.61328125, "learning_rate": 7.259564900510098e-05, "loss": 1.7002, "num_input_tokens_seen": 2368733184, "step": 2008 }, { "epoch": 0.04824820101450985, "grad_norm": 0.6796875, "learning_rate": 7.258660301518909e-05, "loss": 1.7053, "num_input_tokens_seen": 2369912832, "step": 2009 }, { "epoch": 0.04836616727615902, "grad_norm": 0.6875, "learning_rate": 7.257755206717379e-05, "loss": 1.4856, "num_input_tokens_seen": 2371092480, "step": 2010 }, { "epoch": 0.048484133537808186, "grad_norm": 0.72265625, "learning_rate": 7.256849616243223e-05, "loss": 1.4462, "num_input_tokens_seen": 2372272128, "step": 2011 }, { "epoch": 0.04860209979945736, "grad_norm": 0.70703125, "learning_rate": 7.255943530234225e-05, "loss": 1.4224, "num_input_tokens_seen": 2373451776, "step": 2012 }, { "epoch": 0.04872006606110652, "grad_norm": 0.63671875, "learning_rate": 7.255036948828249e-05, "loss": 1.6058, "num_input_tokens_seen": 2374631424, "step": 2013 }, { "epoch": 0.048838032322755694, "grad_norm": 0.69921875, "learning_rate": 7.254129872163232e-05, "loss": 1.4915, "num_input_tokens_seen": 2375811072, "step": 2014 }, { "epoch": 0.04895599858440486, "grad_norm": 0.73828125, "learning_rate": 7.253222300377188e-05, "loss": 1.5527, "num_input_tokens_seen": 2376990720, "step": 2015 }, { "epoch": 0.04907396484605403, "grad_norm": 0.640625, "learning_rate": 7.252314233608204e-05, "loss": 1.5796, "num_input_tokens_seen": 2378170368, "step": 2016 }, { "epoch": 0.049191931107703195, "grad_norm": 0.6875, "learning_rate": 7.251405671994446e-05, "loss": 1.6129, "num_input_tokens_seen": 2379350016, "step": 2017 }, { "epoch": 0.049309897369352367, "grad_norm": 0.6796875, "learning_rate": 7.250496615674152e-05, "loss": 1.5305, "num_input_tokens_seen": 2380529664, "step": 2018 }, { "epoch": 0.04942786363100153, "grad_norm": 0.6328125, "learning_rate": 7.249587064785636e-05, "loss": 1.5511, "num_input_tokens_seen": 2381709312, "step": 2019 }, { "epoch": 0.0495458298926507, "grad_norm": 0.69140625, "learning_rate": 7.248677019467286e-05, "loss": 1.5779, "num_input_tokens_seen": 2382888960, "step": 2020 }, { "epoch": 0.04966379615429987, "grad_norm": 0.65625, "learning_rate": 7.24776647985757e-05, "loss": 1.5857, "num_input_tokens_seen": 2384068608, "step": 2021 }, { "epoch": 0.04978176241594904, "grad_norm": 0.66796875, "learning_rate": 7.246855446095028e-05, "loss": 1.496, "num_input_tokens_seen": 2385248256, "step": 2022 }, { "epoch": 0.049899728677598204, "grad_norm": 0.6171875, "learning_rate": 7.245943918318272e-05, "loss": 1.49, "num_input_tokens_seen": 2386427904, "step": 2023 }, { "epoch": 0.050017694939247376, "grad_norm": 0.65625, "learning_rate": 7.245031896665995e-05, "loss": 1.6214, "num_input_tokens_seen": 2387607552, "step": 2024 }, { "epoch": 0.05013566120089654, "grad_norm": 0.6015625, "learning_rate": 7.244119381276963e-05, "loss": 1.6914, "num_input_tokens_seen": 2388787200, "step": 2025 }, { "epoch": 0.05025362746254571, "grad_norm": 0.66796875, "learning_rate": 7.243206372290014e-05, "loss": 1.5241, "num_input_tokens_seen": 2389966848, "step": 2026 }, { "epoch": 0.05037159372419488, "grad_norm": 0.6796875, "learning_rate": 7.242292869844067e-05, "loss": 1.6441, "num_input_tokens_seen": 2391146496, "step": 2027 }, { "epoch": 0.05048955998584405, "grad_norm": 0.6875, "learning_rate": 7.24137887407811e-05, "loss": 1.6763, "num_input_tokens_seen": 2392326144, "step": 2028 }, { "epoch": 0.05060752624749322, "grad_norm": 0.66796875, "learning_rate": 7.24046438513121e-05, "loss": 1.4772, "num_input_tokens_seen": 2393505792, "step": 2029 }, { "epoch": 0.050725492509142385, "grad_norm": 0.66796875, "learning_rate": 7.239549403142509e-05, "loss": 1.5272, "num_input_tokens_seen": 2394685440, "step": 2030 }, { "epoch": 0.05084345877079156, "grad_norm": 0.7265625, "learning_rate": 7.238633928251221e-05, "loss": 1.3924, "num_input_tokens_seen": 2395865088, "step": 2031 }, { "epoch": 0.05096142503244072, "grad_norm": 0.6015625, "learning_rate": 7.237717960596639e-05, "loss": 1.6087, "num_input_tokens_seen": 2397044736, "step": 2032 }, { "epoch": 0.05107939129408989, "grad_norm": 0.71875, "learning_rate": 7.236801500318127e-05, "loss": 1.5425, "num_input_tokens_seen": 2398224384, "step": 2033 }, { "epoch": 0.05119735755573906, "grad_norm": 0.68359375, "learning_rate": 7.235884547555127e-05, "loss": 1.5686, "num_input_tokens_seen": 2399404032, "step": 2034 }, { "epoch": 0.05131532381738823, "grad_norm": 0.6328125, "learning_rate": 7.234967102447155e-05, "loss": 1.6643, "num_input_tokens_seen": 2400583680, "step": 2035 }, { "epoch": 0.051433290079037394, "grad_norm": 0.6875, "learning_rate": 7.234049165133801e-05, "loss": 1.5434, "num_input_tokens_seen": 2401763328, "step": 2036 }, { "epoch": 0.051551256340686566, "grad_norm": 0.70703125, "learning_rate": 7.23313073575473e-05, "loss": 1.5236, "num_input_tokens_seen": 2402942976, "step": 2037 }, { "epoch": 0.05166922260233573, "grad_norm": 0.625, "learning_rate": 7.232211814449686e-05, "loss": 1.59, "num_input_tokens_seen": 2404122624, "step": 2038 }, { "epoch": 0.0517871888639849, "grad_norm": 0.625, "learning_rate": 7.23129240135848e-05, "loss": 1.5156, "num_input_tokens_seen": 2405302272, "step": 2039 }, { "epoch": 0.05190515512563407, "grad_norm": 0.62109375, "learning_rate": 7.230372496621003e-05, "loss": 1.6894, "num_input_tokens_seen": 2406481920, "step": 2040 }, { "epoch": 0.05202312138728324, "grad_norm": 0.66015625, "learning_rate": 7.229452100377223e-05, "loss": 1.5584, "num_input_tokens_seen": 2407661568, "step": 2041 }, { "epoch": 0.052141087648932403, "grad_norm": 0.70703125, "learning_rate": 7.228531212767178e-05, "loss": 1.5068, "num_input_tokens_seen": 2408841216, "step": 2042 }, { "epoch": 0.052259053910581575, "grad_norm": 0.67578125, "learning_rate": 7.22760983393098e-05, "loss": 1.595, "num_input_tokens_seen": 2410020864, "step": 2043 }, { "epoch": 0.05237702017223074, "grad_norm": 0.64453125, "learning_rate": 7.226687964008822e-05, "loss": 1.4293, "num_input_tokens_seen": 2411200512, "step": 2044 }, { "epoch": 0.05249498643387991, "grad_norm": 0.65625, "learning_rate": 7.225765603140964e-05, "loss": 1.4935, "num_input_tokens_seen": 2412380160, "step": 2045 }, { "epoch": 0.052612952695529076, "grad_norm": 0.640625, "learning_rate": 7.22484275146775e-05, "loss": 1.453, "num_input_tokens_seen": 2413559808, "step": 2046 }, { "epoch": 0.05273091895717825, "grad_norm": 0.625, "learning_rate": 7.223919409129589e-05, "loss": 1.5559, "num_input_tokens_seen": 2414739456, "step": 2047 }, { "epoch": 0.05284888521882741, "grad_norm": 0.640625, "learning_rate": 7.222995576266971e-05, "loss": 1.4826, "num_input_tokens_seen": 2415919104, "step": 2048 }, { "epoch": 0.052966851480476584, "grad_norm": 0.70703125, "learning_rate": 7.222071253020457e-05, "loss": 1.5566, "num_input_tokens_seen": 2417098752, "step": 2049 }, { "epoch": 0.05308481774212575, "grad_norm": 0.609375, "learning_rate": 7.221146439530687e-05, "loss": 1.6199, "num_input_tokens_seen": 2418278400, "step": 2050 }, { "epoch": 0.05320278400377492, "grad_norm": 0.625, "learning_rate": 7.220221135938369e-05, "loss": 1.5922, "num_input_tokens_seen": 2419458048, "step": 2051 }, { "epoch": 0.053320750265424086, "grad_norm": 0.59375, "learning_rate": 7.219295342384293e-05, "loss": 1.5259, "num_input_tokens_seen": 2420637696, "step": 2052 }, { "epoch": 0.05343871652707326, "grad_norm": 0.58984375, "learning_rate": 7.218369059009319e-05, "loss": 1.6072, "num_input_tokens_seen": 2421817344, "step": 2053 }, { "epoch": 0.05355668278872242, "grad_norm": 0.671875, "learning_rate": 7.217442285954381e-05, "loss": 1.6169, "num_input_tokens_seen": 2422996992, "step": 2054 }, { "epoch": 0.053674649050371594, "grad_norm": 0.62109375, "learning_rate": 7.21651502336049e-05, "loss": 1.5768, "num_input_tokens_seen": 2424176640, "step": 2055 }, { "epoch": 0.053792615312020765, "grad_norm": 0.609375, "learning_rate": 7.21558727136873e-05, "loss": 1.5223, "num_input_tokens_seen": 2425356288, "step": 2056 }, { "epoch": 0.05391058157366993, "grad_norm": 0.59765625, "learning_rate": 7.214659030120262e-05, "loss": 1.5783, "num_input_tokens_seen": 2426535936, "step": 2057 }, { "epoch": 0.0540285478353191, "grad_norm": 0.71875, "learning_rate": 7.213730299756317e-05, "loss": 1.4265, "num_input_tokens_seen": 2427715584, "step": 2058 }, { "epoch": 0.054146514096968267, "grad_norm": 0.6484375, "learning_rate": 7.212801080418204e-05, "loss": 1.5677, "num_input_tokens_seen": 2428895232, "step": 2059 }, { "epoch": 0.05426448035861744, "grad_norm": 0.6171875, "learning_rate": 7.211871372247304e-05, "loss": 1.5219, "num_input_tokens_seen": 2430074880, "step": 2060 }, { "epoch": 0.0543824466202666, "grad_norm": 0.66796875, "learning_rate": 7.210941175385075e-05, "loss": 1.5097, "num_input_tokens_seen": 2431254528, "step": 2061 }, { "epoch": 0.054500412881915775, "grad_norm": 0.66015625, "learning_rate": 7.210010489973047e-05, "loss": 1.7001, "num_input_tokens_seen": 2432434176, "step": 2062 }, { "epoch": 0.05461837914356494, "grad_norm": 0.640625, "learning_rate": 7.209079316152826e-05, "loss": 1.5308, "num_input_tokens_seen": 2433613824, "step": 2063 }, { "epoch": 0.05473634540521411, "grad_norm": 0.63671875, "learning_rate": 7.208147654066091e-05, "loss": 1.5989, "num_input_tokens_seen": 2434793472, "step": 2064 }, { "epoch": 0.054854311666863276, "grad_norm": 0.6328125, "learning_rate": 7.207215503854596e-05, "loss": 1.7482, "num_input_tokens_seen": 2435973120, "step": 2065 }, { "epoch": 0.05497227792851245, "grad_norm": 0.6875, "learning_rate": 7.206282865660169e-05, "loss": 1.6143, "num_input_tokens_seen": 2437152768, "step": 2066 }, { "epoch": 0.05509024419016161, "grad_norm": 0.6484375, "learning_rate": 7.205349739624711e-05, "loss": 1.6127, "num_input_tokens_seen": 2438332416, "step": 2067 }, { "epoch": 0.055208210451810784, "grad_norm": 0.64453125, "learning_rate": 7.204416125890203e-05, "loss": 1.4988, "num_input_tokens_seen": 2439512064, "step": 2068 }, { "epoch": 0.05532617671345995, "grad_norm": 0.61328125, "learning_rate": 7.20348202459869e-05, "loss": 1.5739, "num_input_tokens_seen": 2440691712, "step": 2069 }, { "epoch": 0.05544414297510912, "grad_norm": 0.65234375, "learning_rate": 7.202547435892302e-05, "loss": 1.5308, "num_input_tokens_seen": 2441871360, "step": 2070 }, { "epoch": 0.055562109236758285, "grad_norm": 0.625, "learning_rate": 7.201612359913235e-05, "loss": 1.5948, "num_input_tokens_seen": 2443051008, "step": 2071 }, { "epoch": 0.05568007549840746, "grad_norm": 0.67578125, "learning_rate": 7.200676796803765e-05, "loss": 1.5753, "num_input_tokens_seen": 2444230656, "step": 2072 }, { "epoch": 0.05579804176005662, "grad_norm": 0.6484375, "learning_rate": 7.199740746706235e-05, "loss": 1.5194, "num_input_tokens_seen": 2445410304, "step": 2073 }, { "epoch": 0.05591600802170579, "grad_norm": 0.68359375, "learning_rate": 7.198804209763071e-05, "loss": 1.4933, "num_input_tokens_seen": 2446589952, "step": 2074 }, { "epoch": 0.05603397428335496, "grad_norm": 0.65234375, "learning_rate": 7.197867186116767e-05, "loss": 1.5015, "num_input_tokens_seen": 2447769600, "step": 2075 }, { "epoch": 0.05615194054500413, "grad_norm": 0.63671875, "learning_rate": 7.196929675909893e-05, "loss": 1.6335, "num_input_tokens_seen": 2448949248, "step": 2076 }, { "epoch": 0.056269906806653294, "grad_norm": 0.6171875, "learning_rate": 7.195991679285091e-05, "loss": 1.5782, "num_input_tokens_seen": 2450128896, "step": 2077 }, { "epoch": 0.056387873068302466, "grad_norm": 0.703125, "learning_rate": 7.19505319638508e-05, "loss": 1.4924, "num_input_tokens_seen": 2451308544, "step": 2078 }, { "epoch": 0.05650583932995163, "grad_norm": 0.69921875, "learning_rate": 7.194114227352653e-05, "loss": 1.5547, "num_input_tokens_seen": 2452488192, "step": 2079 }, { "epoch": 0.0566238055916008, "grad_norm": 0.61328125, "learning_rate": 7.193174772330673e-05, "loss": 1.4579, "num_input_tokens_seen": 2453667840, "step": 2080 }, { "epoch": 0.05674177185324997, "grad_norm": 0.7421875, "learning_rate": 7.19223483146208e-05, "loss": 1.4006, "num_input_tokens_seen": 2454847488, "step": 2081 }, { "epoch": 0.05685973811489914, "grad_norm": 0.6875, "learning_rate": 7.191294404889891e-05, "loss": 1.4052, "num_input_tokens_seen": 2456027136, "step": 2082 }, { "epoch": 0.05697770437654831, "grad_norm": 0.6640625, "learning_rate": 7.190353492757186e-05, "loss": 1.5845, "num_input_tokens_seen": 2457206784, "step": 2083 }, { "epoch": 0.057095670638197475, "grad_norm": 0.66796875, "learning_rate": 7.189412095207136e-05, "loss": 1.5037, "num_input_tokens_seen": 2458386432, "step": 2084 }, { "epoch": 0.05721363689984665, "grad_norm": 0.76953125, "learning_rate": 7.188470212382968e-05, "loss": 1.5359, "num_input_tokens_seen": 2459566080, "step": 2085 }, { "epoch": 0.05733160316149581, "grad_norm": 0.84375, "learning_rate": 7.187527844427995e-05, "loss": 1.665, "num_input_tokens_seen": 2460745728, "step": 2086 }, { "epoch": 0.05744956942314498, "grad_norm": 0.6875, "learning_rate": 7.186584991485599e-05, "loss": 1.6403, "num_input_tokens_seen": 2461925376, "step": 2087 }, { "epoch": 0.05756753568479415, "grad_norm": 0.7265625, "learning_rate": 7.185641653699234e-05, "loss": 1.3332, "num_input_tokens_seen": 2463105024, "step": 2088 }, { "epoch": 0.05768550194644332, "grad_norm": 0.609375, "learning_rate": 7.184697831212435e-05, "loss": 1.6377, "num_input_tokens_seen": 2464284672, "step": 2089 }, { "epoch": 0.057803468208092484, "grad_norm": 0.73828125, "learning_rate": 7.183753524168803e-05, "loss": 1.4486, "num_input_tokens_seen": 2465464320, "step": 2090 }, { "epoch": 0.057921434469741656, "grad_norm": 0.66015625, "learning_rate": 7.182808732712018e-05, "loss": 1.4627, "num_input_tokens_seen": 2466643968, "step": 2091 }, { "epoch": 0.05803940073139082, "grad_norm": 0.65234375, "learning_rate": 7.181863456985827e-05, "loss": 1.6655, "num_input_tokens_seen": 2467823616, "step": 2092 }, { "epoch": 0.05815736699303999, "grad_norm": 0.63671875, "learning_rate": 7.180917697134061e-05, "loss": 1.499, "num_input_tokens_seen": 2469003264, "step": 2093 }, { "epoch": 0.05827533325468916, "grad_norm": 0.79296875, "learning_rate": 7.179971453300615e-05, "loss": 1.5666, "num_input_tokens_seen": 2470182912, "step": 2094 }, { "epoch": 0.05839329951633833, "grad_norm": 0.62109375, "learning_rate": 7.179024725629464e-05, "loss": 1.754, "num_input_tokens_seen": 2471362560, "step": 2095 }, { "epoch": 0.058511265777987494, "grad_norm": 0.6484375, "learning_rate": 7.178077514264652e-05, "loss": 1.6832, "num_input_tokens_seen": 2472542208, "step": 2096 }, { "epoch": 0.058629232039636665, "grad_norm": 0.7109375, "learning_rate": 7.177129819350299e-05, "loss": 1.4757, "num_input_tokens_seen": 2473721856, "step": 2097 }, { "epoch": 0.05874719830128583, "grad_norm": 0.59765625, "learning_rate": 7.1761816410306e-05, "loss": 1.4754, "num_input_tokens_seen": 2474901504, "step": 2098 }, { "epoch": 0.058865164562935, "grad_norm": 0.59375, "learning_rate": 7.175232979449819e-05, "loss": 1.677, "num_input_tokens_seen": 2476081152, "step": 2099 }, { "epoch": 0.058983130824584167, "grad_norm": 0.63671875, "learning_rate": 7.174283834752299e-05, "loss": 1.588, "num_input_tokens_seen": 2477260800, "step": 2100 }, { "epoch": 0.05910109708623334, "grad_norm": 0.72265625, "learning_rate": 7.173334207082453e-05, "loss": 1.5977, "num_input_tokens_seen": 2478440448, "step": 2101 }, { "epoch": 0.0592190633478825, "grad_norm": 0.68359375, "learning_rate": 7.172384096584769e-05, "loss": 1.4083, "num_input_tokens_seen": 2479620096, "step": 2102 }, { "epoch": 0.059337029609531675, "grad_norm": 0.63671875, "learning_rate": 7.171433503403805e-05, "loss": 1.7229, "num_input_tokens_seen": 2480799744, "step": 2103 }, { "epoch": 0.05945499587118084, "grad_norm": 0.62890625, "learning_rate": 7.170482427684199e-05, "loss": 1.5378, "num_input_tokens_seen": 2481979392, "step": 2104 }, { "epoch": 0.05957296213283001, "grad_norm": 0.6796875, "learning_rate": 7.169530869570655e-05, "loss": 1.6472, "num_input_tokens_seen": 2483159040, "step": 2105 }, { "epoch": 0.059690928394479176, "grad_norm": 0.71484375, "learning_rate": 7.168578829207958e-05, "loss": 1.5256, "num_input_tokens_seen": 2484338688, "step": 2106 }, { "epoch": 0.05980889465612835, "grad_norm": 0.6171875, "learning_rate": 7.167626306740961e-05, "loss": 1.6106, "num_input_tokens_seen": 2485518336, "step": 2107 }, { "epoch": 0.05992686091777751, "grad_norm": 0.61328125, "learning_rate": 7.16667330231459e-05, "loss": 1.6116, "num_input_tokens_seen": 2486697984, "step": 2108 }, { "epoch": 0.060044827179426684, "grad_norm": 0.67578125, "learning_rate": 7.165719816073848e-05, "loss": 1.553, "num_input_tokens_seen": 2487877632, "step": 2109 }, { "epoch": 0.060162793441075856, "grad_norm": 0.7109375, "learning_rate": 7.164765848163809e-05, "loss": 1.5079, "num_input_tokens_seen": 2489057280, "step": 2110 }, { "epoch": 0.06028075970272502, "grad_norm": 0.6171875, "learning_rate": 7.16381139872962e-05, "loss": 1.7166, "num_input_tokens_seen": 2490236928, "step": 2111 }, { "epoch": 0.06039872596437419, "grad_norm": 0.69140625, "learning_rate": 7.162856467916504e-05, "loss": 1.5981, "num_input_tokens_seen": 2491416576, "step": 2112 }, { "epoch": 0.06051669222602336, "grad_norm": 0.67578125, "learning_rate": 7.161901055869752e-05, "loss": 1.4822, "num_input_tokens_seen": 2492596224, "step": 2113 }, { "epoch": 0.06063465848767253, "grad_norm": 0.65234375, "learning_rate": 7.160945162734735e-05, "loss": 1.6089, "num_input_tokens_seen": 2493775872, "step": 2114 }, { "epoch": 0.06075262474932169, "grad_norm": 0.67578125, "learning_rate": 7.159988788656892e-05, "loss": 1.5562, "num_input_tokens_seen": 2494955520, "step": 2115 }, { "epoch": 0.060870591010970865, "grad_norm": 0.65234375, "learning_rate": 7.159031933781736e-05, "loss": 1.5186, "num_input_tokens_seen": 2496135168, "step": 2116 }, { "epoch": 0.06098855727262003, "grad_norm": 0.6953125, "learning_rate": 7.158074598254855e-05, "loss": 1.3422, "num_input_tokens_seen": 2497314816, "step": 2117 }, { "epoch": 0.0611065235342692, "grad_norm": 0.59765625, "learning_rate": 7.157116782221909e-05, "loss": 1.7471, "num_input_tokens_seen": 2498494464, "step": 2118 }, { "epoch": 0.061224489795918366, "grad_norm": 0.6328125, "learning_rate": 7.156158485828631e-05, "loss": 1.6097, "num_input_tokens_seen": 2499674112, "step": 2119 }, { "epoch": 0.06134245605756754, "grad_norm": 0.63671875, "learning_rate": 7.155199709220828e-05, "loss": 1.4928, "num_input_tokens_seen": 2500853760, "step": 2120 }, { "epoch": 0.0614604223192167, "grad_norm": 0.62890625, "learning_rate": 7.154240452544378e-05, "loss": 1.5735, "num_input_tokens_seen": 2502033408, "step": 2121 }, { "epoch": 0.061578388580865874, "grad_norm": 0.72265625, "learning_rate": 7.153280715945235e-05, "loss": 1.5354, "num_input_tokens_seen": 2503213056, "step": 2122 }, { "epoch": 0.06169635484251504, "grad_norm": 0.6953125, "learning_rate": 7.152320499569425e-05, "loss": 1.5084, "num_input_tokens_seen": 2504392704, "step": 2123 }, { "epoch": 0.06181432110416421, "grad_norm": 0.6328125, "learning_rate": 7.151359803563042e-05, "loss": 1.4934, "num_input_tokens_seen": 2505572352, "step": 2124 }, { "epoch": 0.061932287365813375, "grad_norm": 0.671875, "learning_rate": 7.150398628072263e-05, "loss": 1.4994, "num_input_tokens_seen": 2506752000, "step": 2125 }, { "epoch": 0.06205025362746255, "grad_norm": 0.65234375, "learning_rate": 7.14943697324333e-05, "loss": 1.4492, "num_input_tokens_seen": 2507931648, "step": 2126 }, { "epoch": 0.06216821988911171, "grad_norm": 0.6328125, "learning_rate": 7.148474839222559e-05, "loss": 1.7602, "num_input_tokens_seen": 2509111296, "step": 2127 }, { "epoch": 0.06228618615076088, "grad_norm": 0.6484375, "learning_rate": 7.147512226156342e-05, "loss": 1.5096, "num_input_tokens_seen": 2510290944, "step": 2128 }, { "epoch": 0.06240415241241005, "grad_norm": 0.65234375, "learning_rate": 7.146549134191142e-05, "loss": 1.6076, "num_input_tokens_seen": 2511470592, "step": 2129 }, { "epoch": 0.06252211867405921, "grad_norm": 0.64453125, "learning_rate": 7.145585563473495e-05, "loss": 1.449, "num_input_tokens_seen": 2512650240, "step": 2130 }, { "epoch": 0.06264008493570838, "grad_norm": 0.6484375, "learning_rate": 7.14462151415001e-05, "loss": 1.5525, "num_input_tokens_seen": 2513829888, "step": 2131 }, { "epoch": 0.06275805119735756, "grad_norm": 0.62890625, "learning_rate": 7.143656986367368e-05, "loss": 1.4403, "num_input_tokens_seen": 2515009536, "step": 2132 }, { "epoch": 0.06287601745900673, "grad_norm": 0.62890625, "learning_rate": 7.142691980272322e-05, "loss": 1.5139, "num_input_tokens_seen": 2516189184, "step": 2133 }, { "epoch": 0.06299398372065589, "grad_norm": 0.67578125, "learning_rate": 7.141726496011701e-05, "loss": 1.4524, "num_input_tokens_seen": 2517368832, "step": 2134 }, { "epoch": 0.06311194998230506, "grad_norm": 0.68359375, "learning_rate": 7.140760533732405e-05, "loss": 1.4579, "num_input_tokens_seen": 2518548480, "step": 2135 }, { "epoch": 0.06322991624395423, "grad_norm": 0.61328125, "learning_rate": 7.139794093581407e-05, "loss": 1.6199, "num_input_tokens_seen": 2519728128, "step": 2136 }, { "epoch": 0.0633478825056034, "grad_norm": 0.83203125, "learning_rate": 7.13882717570575e-05, "loss": 1.6241, "num_input_tokens_seen": 2520907776, "step": 2137 }, { "epoch": 0.06346584876725257, "grad_norm": 0.72265625, "learning_rate": 7.137859780252555e-05, "loss": 1.6226, "num_input_tokens_seen": 2522087424, "step": 2138 }, { "epoch": 0.06358381502890173, "grad_norm": 0.78515625, "learning_rate": 7.136891907369012e-05, "loss": 1.507, "num_input_tokens_seen": 2523267072, "step": 2139 }, { "epoch": 0.0637017812905509, "grad_norm": 0.734375, "learning_rate": 7.135923557202383e-05, "loss": 1.3917, "num_input_tokens_seen": 2524446720, "step": 2140 }, { "epoch": 0.06381974755220007, "grad_norm": 0.66796875, "learning_rate": 7.134954729900006e-05, "loss": 1.4087, "num_input_tokens_seen": 2525626368, "step": 2141 }, { "epoch": 0.06393771381384925, "grad_norm": 0.7109375, "learning_rate": 7.133985425609288e-05, "loss": 1.5533, "num_input_tokens_seen": 2526806016, "step": 2142 }, { "epoch": 0.0640556800754984, "grad_norm": 0.73046875, "learning_rate": 7.13301564447771e-05, "loss": 1.4244, "num_input_tokens_seen": 2527985664, "step": 2143 }, { "epoch": 0.06417364633714757, "grad_norm": 0.62109375, "learning_rate": 7.132045386652829e-05, "loss": 1.5217, "num_input_tokens_seen": 2529165312, "step": 2144 }, { "epoch": 0.06429161259879675, "grad_norm": 0.7265625, "learning_rate": 7.131074652282268e-05, "loss": 1.4388, "num_input_tokens_seen": 2530344960, "step": 2145 }, { "epoch": 0.06440957886044592, "grad_norm": 0.6328125, "learning_rate": 7.130103441513726e-05, "loss": 1.6643, "num_input_tokens_seen": 2531524608, "step": 2146 }, { "epoch": 0.06452754512209508, "grad_norm": 0.64453125, "learning_rate": 7.129131754494975e-05, "loss": 1.6273, "num_input_tokens_seen": 2532704256, "step": 2147 }, { "epoch": 0.06464551138374425, "grad_norm": 0.6796875, "learning_rate": 7.128159591373859e-05, "loss": 1.5731, "num_input_tokens_seen": 2533883904, "step": 2148 }, { "epoch": 0.06476347764539342, "grad_norm": 0.6640625, "learning_rate": 7.127186952298293e-05, "loss": 1.5956, "num_input_tokens_seen": 2535063552, "step": 2149 }, { "epoch": 0.06488144390704259, "grad_norm": 0.703125, "learning_rate": 7.126213837416267e-05, "loss": 1.5309, "num_input_tokens_seen": 2536243200, "step": 2150 }, { "epoch": 0.06499941016869175, "grad_norm": 0.6328125, "learning_rate": 7.125240246875841e-05, "loss": 1.5006, "num_input_tokens_seen": 2537422848, "step": 2151 }, { "epoch": 0.06511737643034092, "grad_norm": 0.71484375, "learning_rate": 7.124266180825148e-05, "loss": 1.6113, "num_input_tokens_seen": 2538602496, "step": 2152 }, { "epoch": 0.06523534269199009, "grad_norm": 0.62109375, "learning_rate": 7.123291639412395e-05, "loss": 1.4539, "num_input_tokens_seen": 2539782144, "step": 2153 }, { "epoch": 0.06535330895363926, "grad_norm": 0.7109375, "learning_rate": 7.12231662278586e-05, "loss": 1.4511, "num_input_tokens_seen": 2540961792, "step": 2154 }, { "epoch": 0.06547127521528842, "grad_norm": 0.5625, "learning_rate": 7.121341131093892e-05, "loss": 1.5934, "num_input_tokens_seen": 2542141440, "step": 2155 }, { "epoch": 0.0655892414769376, "grad_norm": 0.625, "learning_rate": 7.120365164484915e-05, "loss": 1.5815, "num_input_tokens_seen": 2543321088, "step": 2156 }, { "epoch": 0.06570720773858676, "grad_norm": 0.73828125, "learning_rate": 7.119388723107422e-05, "loss": 1.5041, "num_input_tokens_seen": 2544500736, "step": 2157 }, { "epoch": 0.06582517400023594, "grad_norm": 0.62109375, "learning_rate": 7.118411807109983e-05, "loss": 1.5656, "num_input_tokens_seen": 2545680384, "step": 2158 }, { "epoch": 0.0659431402618851, "grad_norm": 0.68359375, "learning_rate": 7.117434416641234e-05, "loss": 1.5776, "num_input_tokens_seen": 2546860032, "step": 2159 }, { "epoch": 0.06606110652353427, "grad_norm": 0.64453125, "learning_rate": 7.116456551849889e-05, "loss": 1.4988, "num_input_tokens_seen": 2548039680, "step": 2160 }, { "epoch": 0.06617907278518344, "grad_norm": 0.6484375, "learning_rate": 7.11547821288473e-05, "loss": 1.374, "num_input_tokens_seen": 2549219328, "step": 2161 }, { "epoch": 0.06629703904683261, "grad_norm": 0.59375, "learning_rate": 7.114499399894614e-05, "loss": 1.4518, "num_input_tokens_seen": 2550398976, "step": 2162 }, { "epoch": 0.06641500530848178, "grad_norm": 0.6171875, "learning_rate": 7.113520113028468e-05, "loss": 1.6117, "num_input_tokens_seen": 2551578624, "step": 2163 }, { "epoch": 0.06653297157013094, "grad_norm": 0.6171875, "learning_rate": 7.112540352435294e-05, "loss": 1.4237, "num_input_tokens_seen": 2552758272, "step": 2164 }, { "epoch": 0.06665093783178011, "grad_norm": 0.66015625, "learning_rate": 7.111560118264162e-05, "loss": 1.4234, "num_input_tokens_seen": 2553937920, "step": 2165 }, { "epoch": 0.06676890409342928, "grad_norm": 0.60546875, "learning_rate": 7.110579410664217e-05, "loss": 1.5433, "num_input_tokens_seen": 2555117568, "step": 2166 }, { "epoch": 0.06688687035507845, "grad_norm": 0.6328125, "learning_rate": 7.109598229784675e-05, "loss": 1.6265, "num_input_tokens_seen": 2556297216, "step": 2167 }, { "epoch": 0.06700483661672761, "grad_norm": 0.59765625, "learning_rate": 7.108616575774824e-05, "loss": 1.599, "num_input_tokens_seen": 2557476864, "step": 2168 }, { "epoch": 0.06712280287837678, "grad_norm": 0.6796875, "learning_rate": 7.107634448784025e-05, "loss": 1.7717, "num_input_tokens_seen": 2558656512, "step": 2169 }, { "epoch": 0.06724076914002595, "grad_norm": 0.71484375, "learning_rate": 7.106651848961711e-05, "loss": 1.4123, "num_input_tokens_seen": 2559836160, "step": 2170 }, { "epoch": 0.06735873540167513, "grad_norm": 0.734375, "learning_rate": 7.105668776457384e-05, "loss": 1.4984, "num_input_tokens_seen": 2561015808, "step": 2171 }, { "epoch": 0.06747670166332428, "grad_norm": 0.7734375, "learning_rate": 7.10468523142062e-05, "loss": 1.5935, "num_input_tokens_seen": 2562195456, "step": 2172 }, { "epoch": 0.06759466792497346, "grad_norm": 0.76171875, "learning_rate": 7.10370121400107e-05, "loss": 1.4926, "num_input_tokens_seen": 2563375104, "step": 2173 }, { "epoch": 0.06771263418662263, "grad_norm": 0.8671875, "learning_rate": 7.102716724348449e-05, "loss": 1.45, "num_input_tokens_seen": 2564554752, "step": 2174 }, { "epoch": 0.0678306004482718, "grad_norm": 0.6484375, "learning_rate": 7.101731762612554e-05, "loss": 1.6375, "num_input_tokens_seen": 2565734400, "step": 2175 }, { "epoch": 0.06794856670992096, "grad_norm": 0.88671875, "learning_rate": 7.100746328943245e-05, "loss": 1.3712, "num_input_tokens_seen": 2566914048, "step": 2176 }, { "epoch": 0.06806653297157013, "grad_norm": 0.65625, "learning_rate": 7.099760423490457e-05, "loss": 1.5398, "num_input_tokens_seen": 2568093696, "step": 2177 }, { "epoch": 0.0681844992332193, "grad_norm": 0.83203125, "learning_rate": 7.098774046404199e-05, "loss": 1.5061, "num_input_tokens_seen": 2569273344, "step": 2178 }, { "epoch": 0.06830246549486847, "grad_norm": 0.69140625, "learning_rate": 7.09778719783455e-05, "loss": 1.4503, "num_input_tokens_seen": 2570452992, "step": 2179 }, { "epoch": 0.06842043175651763, "grad_norm": 0.8515625, "learning_rate": 7.096799877931659e-05, "loss": 1.6409, "num_input_tokens_seen": 2571632640, "step": 2180 }, { "epoch": 0.0685383980181668, "grad_norm": 0.671875, "learning_rate": 7.095812086845749e-05, "loss": 1.6075, "num_input_tokens_seen": 2572812288, "step": 2181 }, { "epoch": 0.06865636427981597, "grad_norm": 0.78515625, "learning_rate": 7.094823824727114e-05, "loss": 1.4472, "num_input_tokens_seen": 2573991936, "step": 2182 }, { "epoch": 0.06877433054146515, "grad_norm": 0.6796875, "learning_rate": 7.09383509172612e-05, "loss": 1.4211, "num_input_tokens_seen": 2575171584, "step": 2183 }, { "epoch": 0.0688922968031143, "grad_norm": 0.70703125, "learning_rate": 7.092845887993201e-05, "loss": 1.5067, "num_input_tokens_seen": 2576351232, "step": 2184 }, { "epoch": 0.06901026306476347, "grad_norm": 0.68359375, "learning_rate": 7.091856213678872e-05, "loss": 1.6708, "num_input_tokens_seen": 2577530880, "step": 2185 }, { "epoch": 0.06912822932641265, "grad_norm": 0.69921875, "learning_rate": 7.09086606893371e-05, "loss": 1.5261, "num_input_tokens_seen": 2578710528, "step": 2186 }, { "epoch": 0.06924619558806182, "grad_norm": 0.7109375, "learning_rate": 7.089875453908366e-05, "loss": 1.476, "num_input_tokens_seen": 2579890176, "step": 2187 }, { "epoch": 0.06936416184971098, "grad_norm": 0.8046875, "learning_rate": 7.088884368753566e-05, "loss": 1.587, "num_input_tokens_seen": 2581069824, "step": 2188 }, { "epoch": 0.06948212811136015, "grad_norm": 0.68359375, "learning_rate": 7.087892813620104e-05, "loss": 1.52, "num_input_tokens_seen": 2582249472, "step": 2189 }, { "epoch": 0.06960009437300932, "grad_norm": 0.8515625, "learning_rate": 7.086900788658848e-05, "loss": 1.4511, "num_input_tokens_seen": 2583429120, "step": 2190 }, { "epoch": 0.06971806063465849, "grad_norm": 0.61328125, "learning_rate": 7.085908294020734e-05, "loss": 1.5586, "num_input_tokens_seen": 2584608768, "step": 2191 }, { "epoch": 0.06983602689630766, "grad_norm": 0.71484375, "learning_rate": 7.084915329856773e-05, "loss": 1.3686, "num_input_tokens_seen": 2585788416, "step": 2192 }, { "epoch": 0.06995399315795682, "grad_norm": 0.65625, "learning_rate": 7.083921896318045e-05, "loss": 1.5593, "num_input_tokens_seen": 2586968064, "step": 2193 }, { "epoch": 0.07007195941960599, "grad_norm": 0.65234375, "learning_rate": 7.082927993555704e-05, "loss": 1.4251, "num_input_tokens_seen": 2588147712, "step": 2194 }, { "epoch": 0.07018992568125516, "grad_norm": 0.703125, "learning_rate": 7.081933621720973e-05, "loss": 1.5143, "num_input_tokens_seen": 2589327360, "step": 2195 }, { "epoch": 0.07030789194290434, "grad_norm": 0.6953125, "learning_rate": 7.080938780965148e-05, "loss": 1.6101, "num_input_tokens_seen": 2590507008, "step": 2196 }, { "epoch": 0.0704258582045535, "grad_norm": 0.6640625, "learning_rate": 7.079943471439593e-05, "loss": 1.5199, "num_input_tokens_seen": 2591686656, "step": 2197 }, { "epoch": 0.07054382446620266, "grad_norm": 0.67578125, "learning_rate": 7.078947693295751e-05, "loss": 1.6252, "num_input_tokens_seen": 2592866304, "step": 2198 }, { "epoch": 0.07066179072785184, "grad_norm": 0.73046875, "learning_rate": 7.077951446685128e-05, "loss": 1.5631, "num_input_tokens_seen": 2594045952, "step": 2199 }, { "epoch": 0.07077975698950101, "grad_norm": 0.58984375, "learning_rate": 7.076954731759302e-05, "loss": 1.6381, "num_input_tokens_seen": 2595225600, "step": 2200 }, { "epoch": 0.07077975698950101, "eval_wikipedia_loss": 2.2444777488708496, "eval_wikipedia_runtime": 162.8868, "eval_wikipedia_samples_per_second": 4.31, "eval_wikipedia_steps_per_second": 0.184, "num_input_tokens_seen": 2595225600, "step": 2200 }, { "epoch": 0.07077975698950101, "eval_toxicity_loss": 4.000207901000977, "eval_toxicity_runtime": 1.0596, "eval_toxicity_samples_per_second": 1.887, "eval_toxicity_steps_per_second": 0.944, "num_input_tokens_seen": 2595225600, "step": 2200 }, { "epoch": 0.00011796626164916834, "grad_norm": 0.796875, "learning_rate": 7.07595754866993e-05, "loss": 1.279, "num_input_tokens_seen": 2596405248, "step": 2201 }, { "epoch": 0.0002359325232983367, "grad_norm": 0.62890625, "learning_rate": 7.074959897568731e-05, "loss": 1.4499, "num_input_tokens_seen": 2597584896, "step": 2202 }, { "epoch": 0.000353898784947505, "grad_norm": 0.78125, "learning_rate": 7.073961778607501e-05, "loss": 1.3672, "num_input_tokens_seen": 2598764544, "step": 2203 }, { "epoch": 0.0004718650465966734, "grad_norm": 0.734375, "learning_rate": 7.072963191938106e-05, "loss": 1.5299, "num_input_tokens_seen": 2599944192, "step": 2204 }, { "epoch": 0.0005898313082458417, "grad_norm": 0.72265625, "learning_rate": 7.07196413771248e-05, "loss": 1.4092, "num_input_tokens_seen": 2601123840, "step": 2205 }, { "epoch": 0.00070779756989501, "grad_norm": 0.6796875, "learning_rate": 7.070964616082633e-05, "loss": 1.3923, "num_input_tokens_seen": 2602303488, "step": 2206 }, { "epoch": 0.0008257638315441783, "grad_norm": 1.03125, "learning_rate": 7.069964627200643e-05, "loss": 1.4942, "num_input_tokens_seen": 2603483136, "step": 2207 }, { "epoch": 0.0009437300931933467, "grad_norm": 0.80078125, "learning_rate": 7.06896417121866e-05, "loss": 1.3644, "num_input_tokens_seen": 2604662784, "step": 2208 }, { "epoch": 0.001061696354842515, "grad_norm": 0.7109375, "learning_rate": 7.067963248288905e-05, "loss": 1.389, "num_input_tokens_seen": 2605842432, "step": 2209 }, { "epoch": 0.0011796626164916834, "grad_norm": 0.7265625, "learning_rate": 7.066961858563669e-05, "loss": 1.2878, "num_input_tokens_seen": 2607022080, "step": 2210 }, { "epoch": 0.0012976288781408518, "grad_norm": 0.7890625, "learning_rate": 7.065960002195319e-05, "loss": 1.4527, "num_input_tokens_seen": 2608201728, "step": 2211 }, { "epoch": 0.00141559513979002, "grad_norm": 0.72265625, "learning_rate": 7.064957679336284e-05, "loss": 1.3769, "num_input_tokens_seen": 2609381376, "step": 2212 }, { "epoch": 0.0015335614014391884, "grad_norm": 0.67578125, "learning_rate": 7.06395489013907e-05, "loss": 1.3244, "num_input_tokens_seen": 2610561024, "step": 2213 }, { "epoch": 0.0016515276630883566, "grad_norm": 0.69140625, "learning_rate": 7.062951634756256e-05, "loss": 1.2275, "num_input_tokens_seen": 2611740672, "step": 2214 }, { "epoch": 0.001769493924737525, "grad_norm": 0.72265625, "learning_rate": 7.061947913340485e-05, "loss": 1.3976, "num_input_tokens_seen": 2612920320, "step": 2215 }, { "epoch": 0.0018874601863866935, "grad_norm": 0.73828125, "learning_rate": 7.060943726044477e-05, "loss": 1.2778, "num_input_tokens_seen": 2614099968, "step": 2216 }, { "epoch": 0.0020054264480358617, "grad_norm": 0.65234375, "learning_rate": 7.059939073021022e-05, "loss": 1.5112, "num_input_tokens_seen": 2615279616, "step": 2217 }, { "epoch": 0.00212339270968503, "grad_norm": 0.6484375, "learning_rate": 7.058933954422977e-05, "loss": 1.4127, "num_input_tokens_seen": 2616459264, "step": 2218 }, { "epoch": 0.0022413589713341986, "grad_norm": 0.734375, "learning_rate": 7.057928370403272e-05, "loss": 1.4968, "num_input_tokens_seen": 2617638912, "step": 2219 }, { "epoch": 0.0023593252329833668, "grad_norm": 0.71484375, "learning_rate": 7.056922321114912e-05, "loss": 1.5317, "num_input_tokens_seen": 2618818560, "step": 2220 }, { "epoch": 0.002477291494632535, "grad_norm": 0.7109375, "learning_rate": 7.055915806710965e-05, "loss": 1.4512, "num_input_tokens_seen": 2619998208, "step": 2221 }, { "epoch": 0.0025952577562817036, "grad_norm": 0.79296875, "learning_rate": 7.054908827344575e-05, "loss": 1.3763, "num_input_tokens_seen": 2621177856, "step": 2222 }, { "epoch": 0.002713224017930872, "grad_norm": 0.64453125, "learning_rate": 7.053901383168957e-05, "loss": 1.6134, "num_input_tokens_seen": 2622357504, "step": 2223 }, { "epoch": 0.00283119027958004, "grad_norm": 0.70703125, "learning_rate": 7.052893474337394e-05, "loss": 1.4966, "num_input_tokens_seen": 2623537152, "step": 2224 }, { "epoch": 0.0029491565412292082, "grad_norm": 1.3828125, "learning_rate": 7.05188510100324e-05, "loss": 1.5941, "num_input_tokens_seen": 2624716800, "step": 2225 }, { "epoch": 0.003067122802878377, "grad_norm": 1.140625, "learning_rate": 7.050876263319922e-05, "loss": 1.3249, "num_input_tokens_seen": 2625896448, "step": 2226 }, { "epoch": 0.003185089064527545, "grad_norm": 0.9140625, "learning_rate": 7.049866961440936e-05, "loss": 1.4531, "num_input_tokens_seen": 2627076096, "step": 2227 }, { "epoch": 0.0033030553261767133, "grad_norm": 1.0703125, "learning_rate": 7.04885719551985e-05, "loss": 1.4713, "num_input_tokens_seen": 2628255744, "step": 2228 }, { "epoch": 0.003421021587825882, "grad_norm": 0.8984375, "learning_rate": 7.047846965710297e-05, "loss": 1.3711, "num_input_tokens_seen": 2629435392, "step": 2229 }, { "epoch": 0.00353898784947505, "grad_norm": 0.8515625, "learning_rate": 7.046836272165992e-05, "loss": 1.4326, "num_input_tokens_seen": 2630615040, "step": 2230 }, { "epoch": 0.0036569541111242184, "grad_norm": 0.875, "learning_rate": 7.045825115040707e-05, "loss": 1.36, "num_input_tokens_seen": 2631794688, "step": 2231 }, { "epoch": 0.003774920372773387, "grad_norm": 0.87890625, "learning_rate": 7.044813494488296e-05, "loss": 1.4388, "num_input_tokens_seen": 2632974336, "step": 2232 }, { "epoch": 0.003892886634422555, "grad_norm": 0.8515625, "learning_rate": 7.043801410662676e-05, "loss": 1.3882, "num_input_tokens_seen": 2634153984, "step": 2233 }, { "epoch": 0.004010852896071723, "grad_norm": 0.73046875, "learning_rate": 7.042788863717838e-05, "loss": 1.4323, "num_input_tokens_seen": 2635333632, "step": 2234 }, { "epoch": 0.004128819157720892, "grad_norm": 0.80859375, "learning_rate": 7.041775853807842e-05, "loss": 1.4982, "num_input_tokens_seen": 2636513280, "step": 2235 }, { "epoch": 0.00424678541937006, "grad_norm": 0.7265625, "learning_rate": 7.04076238108682e-05, "loss": 1.3422, "num_input_tokens_seen": 2637692928, "step": 2236 }, { "epoch": 0.004364751681019229, "grad_norm": 1.2421875, "learning_rate": 7.039748445708974e-05, "loss": 1.5675, "num_input_tokens_seen": 2638872576, "step": 2237 }, { "epoch": 0.004482717942668397, "grad_norm": 0.73828125, "learning_rate": 7.038734047828573e-05, "loss": 1.6413, "num_input_tokens_seen": 2640052224, "step": 2238 }, { "epoch": 0.004600684204317565, "grad_norm": 0.9375, "learning_rate": 7.037719187599963e-05, "loss": 1.3223, "num_input_tokens_seen": 2641231872, "step": 2239 }, { "epoch": 0.0047186504659667335, "grad_norm": 0.703125, "learning_rate": 7.036703865177555e-05, "loss": 1.5629, "num_input_tokens_seen": 2642411520, "step": 2240 }, { "epoch": 0.004836616727615902, "grad_norm": 0.8515625, "learning_rate": 7.035688080715829e-05, "loss": 1.4278, "num_input_tokens_seen": 2643591168, "step": 2241 }, { "epoch": 0.00495458298926507, "grad_norm": 0.734375, "learning_rate": 7.034671834369343e-05, "loss": 1.4345, "num_input_tokens_seen": 2644770816, "step": 2242 }, { "epoch": 0.005072549250914238, "grad_norm": 0.70703125, "learning_rate": 7.033655126292719e-05, "loss": 1.5669, "num_input_tokens_seen": 2645950464, "step": 2243 }, { "epoch": 0.005190515512563407, "grad_norm": 0.6875, "learning_rate": 7.03263795664065e-05, "loss": 1.7457, "num_input_tokens_seen": 2647130112, "step": 2244 }, { "epoch": 0.0053084817742125754, "grad_norm": 0.67578125, "learning_rate": 7.0316203255679e-05, "loss": 1.3967, "num_input_tokens_seen": 2648309760, "step": 2245 }, { "epoch": 0.005426448035861744, "grad_norm": 0.81640625, "learning_rate": 7.030602233229301e-05, "loss": 1.4547, "num_input_tokens_seen": 2649489408, "step": 2246 }, { "epoch": 0.005544414297510912, "grad_norm": 0.68359375, "learning_rate": 7.029583679779763e-05, "loss": 1.4224, "num_input_tokens_seen": 2650669056, "step": 2247 }, { "epoch": 0.00566238055916008, "grad_norm": 0.7734375, "learning_rate": 7.028564665374255e-05, "loss": 1.388, "num_input_tokens_seen": 2651848704, "step": 2248 }, { "epoch": 0.005780346820809248, "grad_norm": 0.7109375, "learning_rate": 7.027545190167826e-05, "loss": 1.5302, "num_input_tokens_seen": 2653028352, "step": 2249 }, { "epoch": 0.0058983130824584165, "grad_norm": 0.68359375, "learning_rate": 7.026525254315585e-05, "loss": 1.5047, "num_input_tokens_seen": 2654208000, "step": 2250 }, { "epoch": 0.0060162793441075856, "grad_norm": 0.85546875, "learning_rate": 7.025504857972725e-05, "loss": 1.4331, "num_input_tokens_seen": 2655387648, "step": 2251 }, { "epoch": 0.006134245605756754, "grad_norm": 0.65234375, "learning_rate": 7.024484001294493e-05, "loss": 1.3799, "num_input_tokens_seen": 2656567296, "step": 2252 }, { "epoch": 0.006252211867405922, "grad_norm": 0.7265625, "learning_rate": 7.023462684436219e-05, "loss": 1.5903, "num_input_tokens_seen": 2657746944, "step": 2253 }, { "epoch": 0.00637017812905509, "grad_norm": 0.67578125, "learning_rate": 7.022440907553297e-05, "loss": 1.4781, "num_input_tokens_seen": 2658926592, "step": 2254 }, { "epoch": 0.006488144390704258, "grad_norm": 0.6796875, "learning_rate": 7.02141867080119e-05, "loss": 1.3965, "num_input_tokens_seen": 2660106240, "step": 2255 }, { "epoch": 0.006606110652353427, "grad_norm": 0.7421875, "learning_rate": 7.020395974335435e-05, "loss": 1.3427, "num_input_tokens_seen": 2661285888, "step": 2256 }, { "epoch": 0.006724076914002596, "grad_norm": 0.62890625, "learning_rate": 7.019372818311637e-05, "loss": 1.4117, "num_input_tokens_seen": 2662465536, "step": 2257 }, { "epoch": 0.006842043175651764, "grad_norm": 0.63671875, "learning_rate": 7.018349202885469e-05, "loss": 1.6288, "num_input_tokens_seen": 2663645184, "step": 2258 }, { "epoch": 0.006960009437300932, "grad_norm": 0.8984375, "learning_rate": 7.017325128212676e-05, "loss": 1.4989, "num_input_tokens_seen": 2664824832, "step": 2259 }, { "epoch": 0.0070779756989501, "grad_norm": 0.671875, "learning_rate": 7.016300594449075e-05, "loss": 1.493, "num_input_tokens_seen": 2666004480, "step": 2260 }, { "epoch": 0.0071959419605992685, "grad_norm": 0.66015625, "learning_rate": 7.015275601750548e-05, "loss": 1.6693, "num_input_tokens_seen": 2667184128, "step": 2261 }, { "epoch": 0.007313908222248437, "grad_norm": 0.6328125, "learning_rate": 7.01425015027305e-05, "loss": 1.419, "num_input_tokens_seen": 2668363776, "step": 2262 }, { "epoch": 0.007431874483897605, "grad_norm": 0.671875, "learning_rate": 7.013224240172605e-05, "loss": 1.4517, "num_input_tokens_seen": 2669543424, "step": 2263 }, { "epoch": 0.007549840745546774, "grad_norm": 0.65625, "learning_rate": 7.012197871605308e-05, "loss": 1.463, "num_input_tokens_seen": 2670723072, "step": 2264 }, { "epoch": 0.007667807007195942, "grad_norm": 0.71875, "learning_rate": 7.01117104472732e-05, "loss": 1.5614, "num_input_tokens_seen": 2671902720, "step": 2265 }, { "epoch": 0.00778577326884511, "grad_norm": 0.75, "learning_rate": 7.010143759694876e-05, "loss": 1.5027, "num_input_tokens_seen": 2673082368, "step": 2266 }, { "epoch": 0.007903739530494279, "grad_norm": 0.640625, "learning_rate": 7.00911601666428e-05, "loss": 1.271, "num_input_tokens_seen": 2674262016, "step": 2267 }, { "epoch": 0.008021705792143447, "grad_norm": 0.6484375, "learning_rate": 7.008087815791904e-05, "loss": 1.4959, "num_input_tokens_seen": 2675441664, "step": 2268 }, { "epoch": 0.008139672053792615, "grad_norm": 0.65234375, "learning_rate": 7.007059157234189e-05, "loss": 1.4131, "num_input_tokens_seen": 2676621312, "step": 2269 }, { "epoch": 0.008257638315441783, "grad_norm": 0.625, "learning_rate": 7.006030041147649e-05, "loss": 1.605, "num_input_tokens_seen": 2677800960, "step": 2270 }, { "epoch": 0.008375604577090951, "grad_norm": 0.7109375, "learning_rate": 7.005000467688863e-05, "loss": 1.3874, "num_input_tokens_seen": 2678980608, "step": 2271 }, { "epoch": 0.00849357083874012, "grad_norm": 0.67578125, "learning_rate": 7.003970437014485e-05, "loss": 1.4089, "num_input_tokens_seen": 2680160256, "step": 2272 }, { "epoch": 0.008611537100389288, "grad_norm": 0.65234375, "learning_rate": 7.002939949281236e-05, "loss": 1.4299, "num_input_tokens_seen": 2681339904, "step": 2273 }, { "epoch": 0.008729503362038458, "grad_norm": 0.69921875, "learning_rate": 7.001909004645903e-05, "loss": 1.5478, "num_input_tokens_seen": 2682519552, "step": 2274 }, { "epoch": 0.008847469623687626, "grad_norm": 0.71875, "learning_rate": 7.00087760326535e-05, "loss": 1.5255, "num_input_tokens_seen": 2683699200, "step": 2275 }, { "epoch": 0.008965435885336794, "grad_norm": 0.67578125, "learning_rate": 6.999845745296502e-05, "loss": 1.5791, "num_input_tokens_seen": 2684878848, "step": 2276 }, { "epoch": 0.009083402146985962, "grad_norm": 0.671875, "learning_rate": 6.998813430896363e-05, "loss": 1.4338, "num_input_tokens_seen": 2686058496, "step": 2277 }, { "epoch": 0.00920136840863513, "grad_norm": 0.65234375, "learning_rate": 6.997780660221998e-05, "loss": 1.3966, "num_input_tokens_seen": 2687238144, "step": 2278 }, { "epoch": 0.009319334670284299, "grad_norm": 0.65625, "learning_rate": 6.996747433430544e-05, "loss": 1.4853, "num_input_tokens_seen": 2688417792, "step": 2279 }, { "epoch": 0.009437300931933467, "grad_norm": 0.6328125, "learning_rate": 6.99571375067921e-05, "loss": 1.4374, "num_input_tokens_seen": 2689597440, "step": 2280 }, { "epoch": 0.009555267193582635, "grad_norm": 0.625, "learning_rate": 6.994679612125271e-05, "loss": 1.4626, "num_input_tokens_seen": 2690777088, "step": 2281 }, { "epoch": 0.009673233455231803, "grad_norm": 0.63671875, "learning_rate": 6.993645017926073e-05, "loss": 1.4426, "num_input_tokens_seen": 2691956736, "step": 2282 }, { "epoch": 0.009791199716880972, "grad_norm": 0.62890625, "learning_rate": 6.992609968239033e-05, "loss": 1.551, "num_input_tokens_seen": 2693136384, "step": 2283 }, { "epoch": 0.00990916597853014, "grad_norm": 0.6328125, "learning_rate": 6.991574463221634e-05, "loss": 1.4595, "num_input_tokens_seen": 2694316032, "step": 2284 }, { "epoch": 0.010027132240179308, "grad_norm": 0.61328125, "learning_rate": 6.990538503031429e-05, "loss": 1.4067, "num_input_tokens_seen": 2695495680, "step": 2285 }, { "epoch": 0.010145098501828476, "grad_norm": 0.63671875, "learning_rate": 6.989502087826043e-05, "loss": 1.3568, "num_input_tokens_seen": 2696675328, "step": 2286 }, { "epoch": 0.010263064763477646, "grad_norm": 0.66796875, "learning_rate": 6.988465217763168e-05, "loss": 1.61, "num_input_tokens_seen": 2697854976, "step": 2287 }, { "epoch": 0.010381031025126814, "grad_norm": 0.609375, "learning_rate": 6.987427893000563e-05, "loss": 1.3611, "num_input_tokens_seen": 2699034624, "step": 2288 }, { "epoch": 0.010498997286775983, "grad_norm": 0.66015625, "learning_rate": 6.986390113696059e-05, "loss": 1.4669, "num_input_tokens_seen": 2700214272, "step": 2289 }, { "epoch": 0.010616963548425151, "grad_norm": 0.64453125, "learning_rate": 6.985351880007558e-05, "loss": 1.4732, "num_input_tokens_seen": 2701393920, "step": 2290 }, { "epoch": 0.010734929810074319, "grad_norm": 0.671875, "learning_rate": 6.984313192093029e-05, "loss": 1.4818, "num_input_tokens_seen": 2702573568, "step": 2291 }, { "epoch": 0.010852896071723487, "grad_norm": 0.62109375, "learning_rate": 6.983274050110509e-05, "loss": 1.4583, "num_input_tokens_seen": 2703753216, "step": 2292 }, { "epoch": 0.010970862333372655, "grad_norm": 0.625, "learning_rate": 6.982234454218102e-05, "loss": 1.4855, "num_input_tokens_seen": 2704932864, "step": 2293 }, { "epoch": 0.011088828595021824, "grad_norm": 0.58984375, "learning_rate": 6.98119440457399e-05, "loss": 1.5033, "num_input_tokens_seen": 2706112512, "step": 2294 }, { "epoch": 0.011206794856670992, "grad_norm": 0.66796875, "learning_rate": 6.980153901336414e-05, "loss": 1.4211, "num_input_tokens_seen": 2707292160, "step": 2295 }, { "epoch": 0.01132476111832016, "grad_norm": 0.6640625, "learning_rate": 6.97911294466369e-05, "loss": 1.4463, "num_input_tokens_seen": 2708471808, "step": 2296 }, { "epoch": 0.011442727379969328, "grad_norm": 0.69921875, "learning_rate": 6.978071534714201e-05, "loss": 1.3601, "num_input_tokens_seen": 2709651456, "step": 2297 }, { "epoch": 0.011560693641618497, "grad_norm": 0.71875, "learning_rate": 6.977029671646397e-05, "loss": 1.6515, "num_input_tokens_seen": 2710831104, "step": 2298 }, { "epoch": 0.011678659903267665, "grad_norm": 0.6171875, "learning_rate": 6.975987355618802e-05, "loss": 1.4837, "num_input_tokens_seen": 2712010752, "step": 2299 }, { "epoch": 0.011796626164916833, "grad_norm": 0.65625, "learning_rate": 6.974944586790007e-05, "loss": 1.4918, "num_input_tokens_seen": 2713190400, "step": 2300 }, { "epoch": 0.011914592426566003, "grad_norm": 0.62890625, "learning_rate": 6.973901365318667e-05, "loss": 1.6257, "num_input_tokens_seen": 2714370048, "step": 2301 }, { "epoch": 0.012032558688215171, "grad_norm": 0.66796875, "learning_rate": 6.972857691363512e-05, "loss": 1.4187, "num_input_tokens_seen": 2715549696, "step": 2302 }, { "epoch": 0.01215052494986434, "grad_norm": 0.65625, "learning_rate": 6.971813565083339e-05, "loss": 1.3837, "num_input_tokens_seen": 2716729344, "step": 2303 }, { "epoch": 0.012268491211513508, "grad_norm": 0.69921875, "learning_rate": 6.970768986637013e-05, "loss": 1.3876, "num_input_tokens_seen": 2717908992, "step": 2304 }, { "epoch": 0.012386457473162676, "grad_norm": 0.640625, "learning_rate": 6.969723956183468e-05, "loss": 1.4501, "num_input_tokens_seen": 2719088640, "step": 2305 }, { "epoch": 0.012504423734811844, "grad_norm": 0.6484375, "learning_rate": 6.968678473881707e-05, "loss": 1.4169, "num_input_tokens_seen": 2720268288, "step": 2306 }, { "epoch": 0.012622389996461012, "grad_norm": 0.6875, "learning_rate": 6.967632539890803e-05, "loss": 1.3804, "num_input_tokens_seen": 2721447936, "step": 2307 }, { "epoch": 0.01274035625811018, "grad_norm": 0.6171875, "learning_rate": 6.966586154369895e-05, "loss": 1.6041, "num_input_tokens_seen": 2722627584, "step": 2308 }, { "epoch": 0.012858322519759349, "grad_norm": 0.66015625, "learning_rate": 6.965539317478191e-05, "loss": 1.4367, "num_input_tokens_seen": 2723807232, "step": 2309 }, { "epoch": 0.012976288781408517, "grad_norm": 0.6171875, "learning_rate": 6.964492029374973e-05, "loss": 1.5256, "num_input_tokens_seen": 2724986880, "step": 2310 }, { "epoch": 0.013094255043057685, "grad_norm": 0.62890625, "learning_rate": 6.963444290219585e-05, "loss": 1.4456, "num_input_tokens_seen": 2726166528, "step": 2311 }, { "epoch": 0.013212221304706853, "grad_norm": 0.68359375, "learning_rate": 6.962396100171441e-05, "loss": 1.4174, "num_input_tokens_seen": 2727346176, "step": 2312 }, { "epoch": 0.013330187566356021, "grad_norm": 0.66796875, "learning_rate": 6.961347459390027e-05, "loss": 1.5069, "num_input_tokens_seen": 2728525824, "step": 2313 }, { "epoch": 0.013448153828005191, "grad_norm": 0.69140625, "learning_rate": 6.960298368034894e-05, "loss": 1.6307, "num_input_tokens_seen": 2729705472, "step": 2314 }, { "epoch": 0.01356612008965436, "grad_norm": 0.828125, "learning_rate": 6.959248826265665e-05, "loss": 1.5438, "num_input_tokens_seen": 2730885120, "step": 2315 }, { "epoch": 0.013684086351303528, "grad_norm": 0.671875, "learning_rate": 6.958198834242027e-05, "loss": 1.3704, "num_input_tokens_seen": 2732064768, "step": 2316 }, { "epoch": 0.013802052612952696, "grad_norm": 0.71484375, "learning_rate": 6.957148392123738e-05, "loss": 1.5327, "num_input_tokens_seen": 2733244416, "step": 2317 }, { "epoch": 0.013920018874601864, "grad_norm": 0.73828125, "learning_rate": 6.956097500070626e-05, "loss": 1.5696, "num_input_tokens_seen": 2734424064, "step": 2318 }, { "epoch": 0.014037985136251032, "grad_norm": 0.75390625, "learning_rate": 6.955046158242586e-05, "loss": 1.4443, "num_input_tokens_seen": 2735603712, "step": 2319 }, { "epoch": 0.0141559513979002, "grad_norm": 0.66015625, "learning_rate": 6.95399436679958e-05, "loss": 1.4513, "num_input_tokens_seen": 2736783360, "step": 2320 }, { "epoch": 0.014273917659549369, "grad_norm": 0.6171875, "learning_rate": 6.952942125901639e-05, "loss": 1.4938, "num_input_tokens_seen": 2737963008, "step": 2321 }, { "epoch": 0.014391883921198537, "grad_norm": 0.60546875, "learning_rate": 6.951889435708865e-05, "loss": 1.5951, "num_input_tokens_seen": 2739142656, "step": 2322 }, { "epoch": 0.014509850182847705, "grad_norm": 0.65234375, "learning_rate": 6.950836296381425e-05, "loss": 1.4721, "num_input_tokens_seen": 2740322304, "step": 2323 }, { "epoch": 0.014627816444496873, "grad_norm": 0.60546875, "learning_rate": 6.949782708079559e-05, "loss": 1.3621, "num_input_tokens_seen": 2741501952, "step": 2324 }, { "epoch": 0.014745782706146042, "grad_norm": 0.62109375, "learning_rate": 6.948728670963568e-05, "loss": 1.5476, "num_input_tokens_seen": 2742681600, "step": 2325 }, { "epoch": 0.01486374896779521, "grad_norm": 0.6015625, "learning_rate": 6.947674185193827e-05, "loss": 1.3664, "num_input_tokens_seen": 2743861248, "step": 2326 }, { "epoch": 0.014981715229444378, "grad_norm": 0.66796875, "learning_rate": 6.946619250930781e-05, "loss": 1.4769, "num_input_tokens_seen": 2745040896, "step": 2327 }, { "epoch": 0.015099681491093548, "grad_norm": 0.61328125, "learning_rate": 6.945563868334934e-05, "loss": 1.5773, "num_input_tokens_seen": 2746220544, "step": 2328 }, { "epoch": 0.015217647752742716, "grad_norm": 0.671875, "learning_rate": 6.944508037566867e-05, "loss": 1.3092, "num_input_tokens_seen": 2747400192, "step": 2329 }, { "epoch": 0.015335614014391884, "grad_norm": 0.609375, "learning_rate": 6.943451758787227e-05, "loss": 1.4372, "num_input_tokens_seen": 2748579840, "step": 2330 }, { "epoch": 0.015453580276041053, "grad_norm": 0.61328125, "learning_rate": 6.942395032156728e-05, "loss": 1.394, "num_input_tokens_seen": 2749759488, "step": 2331 }, { "epoch": 0.01557154653769022, "grad_norm": 0.65625, "learning_rate": 6.941337857836152e-05, "loss": 1.4001, "num_input_tokens_seen": 2750939136, "step": 2332 }, { "epoch": 0.01568951279933939, "grad_norm": 0.625, "learning_rate": 6.940280235986351e-05, "loss": 1.415, "num_input_tokens_seen": 2752118784, "step": 2333 }, { "epoch": 0.015807479060988557, "grad_norm": 0.5859375, "learning_rate": 6.939222166768242e-05, "loss": 1.4309, "num_input_tokens_seen": 2753298432, "step": 2334 }, { "epoch": 0.015925445322637725, "grad_norm": 0.625, "learning_rate": 6.938163650342812e-05, "loss": 1.4035, "num_input_tokens_seen": 2754478080, "step": 2335 }, { "epoch": 0.016043411584286894, "grad_norm": 0.609375, "learning_rate": 6.937104686871118e-05, "loss": 1.4242, "num_input_tokens_seen": 2755657728, "step": 2336 }, { "epoch": 0.016161377845936062, "grad_norm": 0.62109375, "learning_rate": 6.936045276514283e-05, "loss": 1.5698, "num_input_tokens_seen": 2756837376, "step": 2337 }, { "epoch": 0.01627934410758523, "grad_norm": 0.58203125, "learning_rate": 6.934985419433496e-05, "loss": 1.5925, "num_input_tokens_seen": 2758017024, "step": 2338 }, { "epoch": 0.0163973103692344, "grad_norm": 0.625, "learning_rate": 6.933925115790016e-05, "loss": 1.4618, "num_input_tokens_seen": 2759196672, "step": 2339 }, { "epoch": 0.016515276630883566, "grad_norm": 0.578125, "learning_rate": 6.932864365745172e-05, "loss": 1.6154, "num_input_tokens_seen": 2760376320, "step": 2340 }, { "epoch": 0.016633242892532735, "grad_norm": 0.66015625, "learning_rate": 6.931803169460356e-05, "loss": 1.5247, "num_input_tokens_seen": 2761555968, "step": 2341 }, { "epoch": 0.016751209154181903, "grad_norm": 0.6484375, "learning_rate": 6.930741527097033e-05, "loss": 1.4851, "num_input_tokens_seen": 2762735616, "step": 2342 }, { "epoch": 0.01686917541583107, "grad_norm": 0.66015625, "learning_rate": 6.929679438816733e-05, "loss": 1.4153, "num_input_tokens_seen": 2763915264, "step": 2343 }, { "epoch": 0.01698714167748024, "grad_norm": 0.6171875, "learning_rate": 6.928616904781053e-05, "loss": 1.4394, "num_input_tokens_seen": 2765094912, "step": 2344 }, { "epoch": 0.017105107939129408, "grad_norm": 0.59765625, "learning_rate": 6.927553925151661e-05, "loss": 1.4926, "num_input_tokens_seen": 2766274560, "step": 2345 }, { "epoch": 0.017223074200778576, "grad_norm": 0.61328125, "learning_rate": 6.92649050009029e-05, "loss": 1.5191, "num_input_tokens_seen": 2767454208, "step": 2346 }, { "epoch": 0.017341040462427744, "grad_norm": 0.671875, "learning_rate": 6.925426629758743e-05, "loss": 1.4289, "num_input_tokens_seen": 2768633856, "step": 2347 }, { "epoch": 0.017459006724076916, "grad_norm": 0.58984375, "learning_rate": 6.92436231431889e-05, "loss": 1.5127, "num_input_tokens_seen": 2769813504, "step": 2348 }, { "epoch": 0.017576972985726084, "grad_norm": 0.62890625, "learning_rate": 6.923297553932665e-05, "loss": 1.3559, "num_input_tokens_seen": 2770993152, "step": 2349 }, { "epoch": 0.017694939247375252, "grad_norm": 0.56640625, "learning_rate": 6.922232348762076e-05, "loss": 1.4668, "num_input_tokens_seen": 2772172800, "step": 2350 }, { "epoch": 0.01781290550902442, "grad_norm": 0.6484375, "learning_rate": 6.921166698969194e-05, "loss": 1.5413, "num_input_tokens_seen": 2773352448, "step": 2351 }, { "epoch": 0.01793087177067359, "grad_norm": 0.59765625, "learning_rate": 6.92010060471616e-05, "loss": 1.5214, "num_input_tokens_seen": 2774532096, "step": 2352 }, { "epoch": 0.018048838032322757, "grad_norm": 0.6484375, "learning_rate": 6.919034066165183e-05, "loss": 1.3498, "num_input_tokens_seen": 2775711744, "step": 2353 }, { "epoch": 0.018166804293971925, "grad_norm": 0.6171875, "learning_rate": 6.917967083478538e-05, "loss": 1.6508, "num_input_tokens_seen": 2776891392, "step": 2354 }, { "epoch": 0.018284770555621093, "grad_norm": 0.609375, "learning_rate": 6.916899656818565e-05, "loss": 1.4732, "num_input_tokens_seen": 2778071040, "step": 2355 }, { "epoch": 0.01840273681727026, "grad_norm": 0.60546875, "learning_rate": 6.91583178634768e-05, "loss": 1.4493, "num_input_tokens_seen": 2779250688, "step": 2356 }, { "epoch": 0.01852070307891943, "grad_norm": 0.6484375, "learning_rate": 6.914763472228357e-05, "loss": 1.4775, "num_input_tokens_seen": 2780430336, "step": 2357 }, { "epoch": 0.018638669340568598, "grad_norm": 0.640625, "learning_rate": 6.913694714623144e-05, "loss": 1.4312, "num_input_tokens_seen": 2781609984, "step": 2358 }, { "epoch": 0.018756635602217766, "grad_norm": 0.6328125, "learning_rate": 6.912625513694652e-05, "loss": 1.4589, "num_input_tokens_seen": 2782789632, "step": 2359 }, { "epoch": 0.018874601863866934, "grad_norm": 0.609375, "learning_rate": 6.911555869605564e-05, "loss": 1.5448, "num_input_tokens_seen": 2783969280, "step": 2360 }, { "epoch": 0.018992568125516102, "grad_norm": 0.578125, "learning_rate": 6.910485782518627e-05, "loss": 1.6133, "num_input_tokens_seen": 2785148928, "step": 2361 }, { "epoch": 0.01911053438716527, "grad_norm": 0.65234375, "learning_rate": 6.909415252596655e-05, "loss": 1.6983, "num_input_tokens_seen": 2786328576, "step": 2362 }, { "epoch": 0.01922850064881444, "grad_norm": 0.58984375, "learning_rate": 6.908344280002534e-05, "loss": 1.3997, "num_input_tokens_seen": 2787508224, "step": 2363 }, { "epoch": 0.019346466910463607, "grad_norm": 0.61328125, "learning_rate": 6.907272864899213e-05, "loss": 1.4385, "num_input_tokens_seen": 2788687872, "step": 2364 }, { "epoch": 0.019464433172112775, "grad_norm": 0.625, "learning_rate": 6.906201007449706e-05, "loss": 1.5578, "num_input_tokens_seen": 2789867520, "step": 2365 }, { "epoch": 0.019582399433761943, "grad_norm": 0.68359375, "learning_rate": 6.905128707817102e-05, "loss": 1.4887, "num_input_tokens_seen": 2791047168, "step": 2366 }, { "epoch": 0.01970036569541111, "grad_norm": 0.7265625, "learning_rate": 6.904055966164554e-05, "loss": 1.5436, "num_input_tokens_seen": 2792226816, "step": 2367 }, { "epoch": 0.01981833195706028, "grad_norm": 0.6328125, "learning_rate": 6.902982782655278e-05, "loss": 1.5248, "num_input_tokens_seen": 2793406464, "step": 2368 }, { "epoch": 0.019936298218709448, "grad_norm": 0.64453125, "learning_rate": 6.90190915745256e-05, "loss": 1.4808, "num_input_tokens_seen": 2794586112, "step": 2369 }, { "epoch": 0.020054264480358616, "grad_norm": 0.78515625, "learning_rate": 6.900835090719759e-05, "loss": 1.5845, "num_input_tokens_seen": 2795765760, "step": 2370 }, { "epoch": 0.020172230742007784, "grad_norm": 0.64453125, "learning_rate": 6.899760582620289e-05, "loss": 1.572, "num_input_tokens_seen": 2796945408, "step": 2371 }, { "epoch": 0.020290197003656953, "grad_norm": 0.75390625, "learning_rate": 6.898685633317643e-05, "loss": 1.4013, "num_input_tokens_seen": 2798125056, "step": 2372 }, { "epoch": 0.02040816326530612, "grad_norm": 0.609375, "learning_rate": 6.897610242975376e-05, "loss": 1.4491, "num_input_tokens_seen": 2799304704, "step": 2373 }, { "epoch": 0.020526129526955292, "grad_norm": 0.84375, "learning_rate": 6.896534411757108e-05, "loss": 1.4883, "num_input_tokens_seen": 2800484352, "step": 2374 }, { "epoch": 0.02064409578860446, "grad_norm": 0.60546875, "learning_rate": 6.89545813982653e-05, "loss": 1.3609, "num_input_tokens_seen": 2801664000, "step": 2375 }, { "epoch": 0.02076206205025363, "grad_norm": 0.7734375, "learning_rate": 6.894381427347397e-05, "loss": 1.5091, "num_input_tokens_seen": 2802843648, "step": 2376 }, { "epoch": 0.020880028311902797, "grad_norm": 0.62890625, "learning_rate": 6.893304274483535e-05, "loss": 1.5084, "num_input_tokens_seen": 2804023296, "step": 2377 }, { "epoch": 0.020997994573551965, "grad_norm": 0.83984375, "learning_rate": 6.892226681398832e-05, "loss": 1.3664, "num_input_tokens_seen": 2805202944, "step": 2378 }, { "epoch": 0.021115960835201134, "grad_norm": 0.625, "learning_rate": 6.891148648257249e-05, "loss": 1.5294, "num_input_tokens_seen": 2806382592, "step": 2379 }, { "epoch": 0.021233927096850302, "grad_norm": 0.703125, "learning_rate": 6.890070175222806e-05, "loss": 1.382, "num_input_tokens_seen": 2807562240, "step": 2380 }, { "epoch": 0.02135189335849947, "grad_norm": 0.6328125, "learning_rate": 6.888991262459597e-05, "loss": 1.4804, "num_input_tokens_seen": 2808741888, "step": 2381 }, { "epoch": 0.021469859620148638, "grad_norm": 0.640625, "learning_rate": 6.887911910131779e-05, "loss": 1.4278, "num_input_tokens_seen": 2809921536, "step": 2382 }, { "epoch": 0.021587825881797806, "grad_norm": 0.82421875, "learning_rate": 6.88683211840358e-05, "loss": 1.5575, "num_input_tokens_seen": 2811101184, "step": 2383 }, { "epoch": 0.021705792143446975, "grad_norm": 0.6015625, "learning_rate": 6.885751887439288e-05, "loss": 1.5781, "num_input_tokens_seen": 2812280832, "step": 2384 }, { "epoch": 0.021823758405096143, "grad_norm": 0.65234375, "learning_rate": 6.884671217403265e-05, "loss": 1.4838, "num_input_tokens_seen": 2813460480, "step": 2385 }, { "epoch": 0.02194172466674531, "grad_norm": 0.640625, "learning_rate": 6.883590108459935e-05, "loss": 1.418, "num_input_tokens_seen": 2814640128, "step": 2386 }, { "epoch": 0.02205969092839448, "grad_norm": 0.61328125, "learning_rate": 6.88250856077379e-05, "loss": 1.4708, "num_input_tokens_seen": 2815819776, "step": 2387 }, { "epoch": 0.022177657190043647, "grad_norm": 0.65625, "learning_rate": 6.881426574509394e-05, "loss": 1.4804, "num_input_tokens_seen": 2816999424, "step": 2388 }, { "epoch": 0.022295623451692816, "grad_norm": 0.6875, "learning_rate": 6.880344149831364e-05, "loss": 1.4976, "num_input_tokens_seen": 2818179072, "step": 2389 }, { "epoch": 0.022413589713341984, "grad_norm": 0.640625, "learning_rate": 6.879261286904401e-05, "loss": 1.5934, "num_input_tokens_seen": 2819358720, "step": 2390 }, { "epoch": 0.022531555974991152, "grad_norm": 0.64453125, "learning_rate": 6.87817798589326e-05, "loss": 1.381, "num_input_tokens_seen": 2820538368, "step": 2391 }, { "epoch": 0.02264952223664032, "grad_norm": 0.63671875, "learning_rate": 6.877094246962767e-05, "loss": 1.4783, "num_input_tokens_seen": 2821718016, "step": 2392 }, { "epoch": 0.02276748849828949, "grad_norm": 0.640625, "learning_rate": 6.876010070277817e-05, "loss": 1.242, "num_input_tokens_seen": 2822897664, "step": 2393 }, { "epoch": 0.022885454759938657, "grad_norm": 0.58203125, "learning_rate": 6.874925456003369e-05, "loss": 1.5498, "num_input_tokens_seen": 2824077312, "step": 2394 }, { "epoch": 0.023003421021587825, "grad_norm": 0.6328125, "learning_rate": 6.873840404304446e-05, "loss": 1.3926, "num_input_tokens_seen": 2825256960, "step": 2395 }, { "epoch": 0.023121387283236993, "grad_norm": 0.59375, "learning_rate": 6.872754915346141e-05, "loss": 1.5161, "num_input_tokens_seen": 2826436608, "step": 2396 }, { "epoch": 0.02323935354488616, "grad_norm": 0.59765625, "learning_rate": 6.871668989293614e-05, "loss": 1.4212, "num_input_tokens_seen": 2827616256, "step": 2397 }, { "epoch": 0.02335731980653533, "grad_norm": 0.6796875, "learning_rate": 6.870582626312091e-05, "loss": 1.4399, "num_input_tokens_seen": 2828795904, "step": 2398 }, { "epoch": 0.023475286068184498, "grad_norm": 0.62890625, "learning_rate": 6.869495826566864e-05, "loss": 1.4074, "num_input_tokens_seen": 2829975552, "step": 2399 }, { "epoch": 0.023593252329833666, "grad_norm": 0.59765625, "learning_rate": 6.868408590223289e-05, "loss": 1.5593, "num_input_tokens_seen": 2831155200, "step": 2400 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.2714240550994873, "eval_wikipedia_runtime": 160.8949, "eval_wikipedia_samples_per_second": 4.363, "eval_wikipedia_steps_per_second": 0.186, "num_input_tokens_seen": 2831155200, "step": 2400 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.004484176635742, "eval_toxicity_runtime": 1.0541, "eval_toxicity_samples_per_second": 1.897, "eval_toxicity_steps_per_second": 0.949, "num_input_tokens_seen": 2831155200, "step": 2400 }, { "epoch": 0.023711218591482838, "grad_norm": 0.66796875, "learning_rate": 6.867320917446792e-05, "loss": 1.3818, "num_input_tokens_seen": 2832334848, "step": 2401 }, { "epoch": 0.023829184853132006, "grad_norm": 0.62109375, "learning_rate": 6.866232808402865e-05, "loss": 1.4095, "num_input_tokens_seen": 2833514496, "step": 2402 }, { "epoch": 0.023947151114781174, "grad_norm": 0.62890625, "learning_rate": 6.865144263257064e-05, "loss": 1.6509, "num_input_tokens_seen": 2834694144, "step": 2403 }, { "epoch": 0.024065117376430342, "grad_norm": 0.61328125, "learning_rate": 6.864055282175014e-05, "loss": 1.5531, "num_input_tokens_seen": 2835873792, "step": 2404 }, { "epoch": 0.02418308363807951, "grad_norm": 0.59765625, "learning_rate": 6.862965865322404e-05, "loss": 1.3999, "num_input_tokens_seen": 2837053440, "step": 2405 }, { "epoch": 0.02430104989972868, "grad_norm": 0.6484375, "learning_rate": 6.861876012864992e-05, "loss": 1.4736, "num_input_tokens_seen": 2838233088, "step": 2406 }, { "epoch": 0.024419016161377847, "grad_norm": 0.59375, "learning_rate": 6.860785724968601e-05, "loss": 1.4672, "num_input_tokens_seen": 2839412736, "step": 2407 }, { "epoch": 0.024536982423027015, "grad_norm": 0.63671875, "learning_rate": 6.859695001799117e-05, "loss": 1.6059, "num_input_tokens_seen": 2840592384, "step": 2408 }, { "epoch": 0.024654948684676183, "grad_norm": 0.6328125, "learning_rate": 6.8586038435225e-05, "loss": 1.4083, "num_input_tokens_seen": 2841772032, "step": 2409 }, { "epoch": 0.02477291494632535, "grad_norm": 0.65625, "learning_rate": 6.857512250304766e-05, "loss": 1.2803, "num_input_tokens_seen": 2842951680, "step": 2410 }, { "epoch": 0.02489088120797452, "grad_norm": 0.65234375, "learning_rate": 6.856420222312007e-05, "loss": 1.3801, "num_input_tokens_seen": 2844131328, "step": 2411 }, { "epoch": 0.025008847469623688, "grad_norm": 0.640625, "learning_rate": 6.855327759710376e-05, "loss": 1.3017, "num_input_tokens_seen": 2845310976, "step": 2412 }, { "epoch": 0.025126813731272856, "grad_norm": 0.6328125, "learning_rate": 6.85423486266609e-05, "loss": 1.3986, "num_input_tokens_seen": 2846490624, "step": 2413 }, { "epoch": 0.025244779992922024, "grad_norm": 0.58203125, "learning_rate": 6.853141531345439e-05, "loss": 1.592, "num_input_tokens_seen": 2847670272, "step": 2414 }, { "epoch": 0.025362746254571193, "grad_norm": 0.64453125, "learning_rate": 6.852047765914772e-05, "loss": 1.5204, "num_input_tokens_seen": 2848849920, "step": 2415 }, { "epoch": 0.02548071251622036, "grad_norm": 0.59765625, "learning_rate": 6.85095356654051e-05, "loss": 1.4745, "num_input_tokens_seen": 2850029568, "step": 2416 }, { "epoch": 0.02559867877786953, "grad_norm": 0.58984375, "learning_rate": 6.849858933389135e-05, "loss": 1.5989, "num_input_tokens_seen": 2851209216, "step": 2417 }, { "epoch": 0.025716645039518697, "grad_norm": 0.5859375, "learning_rate": 6.848763866627198e-05, "loss": 1.5378, "num_input_tokens_seen": 2852388864, "step": 2418 }, { "epoch": 0.025834611301167865, "grad_norm": 0.62890625, "learning_rate": 6.847668366421315e-05, "loss": 1.3857, "num_input_tokens_seen": 2853568512, "step": 2419 }, { "epoch": 0.025952577562817034, "grad_norm": 0.609375, "learning_rate": 6.846572432938166e-05, "loss": 1.5616, "num_input_tokens_seen": 2854748160, "step": 2420 }, { "epoch": 0.026070543824466202, "grad_norm": 0.69140625, "learning_rate": 6.845476066344504e-05, "loss": 1.4183, "num_input_tokens_seen": 2855927808, "step": 2421 }, { "epoch": 0.02618851008611537, "grad_norm": 0.6171875, "learning_rate": 6.844379266807138e-05, "loss": 1.2672, "num_input_tokens_seen": 2857107456, "step": 2422 }, { "epoch": 0.026306476347764538, "grad_norm": 0.6875, "learning_rate": 6.84328203449295e-05, "loss": 1.5404, "num_input_tokens_seen": 2858287104, "step": 2423 }, { "epoch": 0.026424442609413706, "grad_norm": 0.65625, "learning_rate": 6.842184369568886e-05, "loss": 1.5096, "num_input_tokens_seen": 2859466752, "step": 2424 }, { "epoch": 0.026542408871062875, "grad_norm": 0.63671875, "learning_rate": 6.841086272201956e-05, "loss": 1.5436, "num_input_tokens_seen": 2860646400, "step": 2425 }, { "epoch": 0.026660375132712043, "grad_norm": 0.62890625, "learning_rate": 6.839987742559241e-05, "loss": 1.494, "num_input_tokens_seen": 2861826048, "step": 2426 }, { "epoch": 0.02677834139436121, "grad_norm": 0.65625, "learning_rate": 6.838888780807879e-05, "loss": 1.5509, "num_input_tokens_seen": 2863005696, "step": 2427 }, { "epoch": 0.026896307656010383, "grad_norm": 0.69140625, "learning_rate": 6.83778938711508e-05, "loss": 1.4803, "num_input_tokens_seen": 2864185344, "step": 2428 }, { "epoch": 0.02701427391765955, "grad_norm": 0.62890625, "learning_rate": 6.836689561648121e-05, "loss": 1.4344, "num_input_tokens_seen": 2865364992, "step": 2429 }, { "epoch": 0.02713224017930872, "grad_norm": 0.6328125, "learning_rate": 6.83558930457434e-05, "loss": 1.3625, "num_input_tokens_seen": 2866544640, "step": 2430 }, { "epoch": 0.027250206440957887, "grad_norm": 0.6015625, "learning_rate": 6.834488616061142e-05, "loss": 1.4183, "num_input_tokens_seen": 2867724288, "step": 2431 }, { "epoch": 0.027368172702607056, "grad_norm": 0.65625, "learning_rate": 6.833387496276003e-05, "loss": 1.3796, "num_input_tokens_seen": 2868903936, "step": 2432 }, { "epoch": 0.027486138964256224, "grad_norm": 0.62890625, "learning_rate": 6.832285945386455e-05, "loss": 1.5422, "num_input_tokens_seen": 2870083584, "step": 2433 }, { "epoch": 0.027604105225905392, "grad_norm": 0.67578125, "learning_rate": 6.831183963560103e-05, "loss": 1.4594, "num_input_tokens_seen": 2871263232, "step": 2434 }, { "epoch": 0.02772207148755456, "grad_norm": 0.70703125, "learning_rate": 6.830081550964616e-05, "loss": 1.5606, "num_input_tokens_seen": 2872442880, "step": 2435 }, { "epoch": 0.02784003774920373, "grad_norm": 0.609375, "learning_rate": 6.828978707767727e-05, "loss": 1.4426, "num_input_tokens_seen": 2873622528, "step": 2436 }, { "epoch": 0.027958004010852897, "grad_norm": 0.6328125, "learning_rate": 6.827875434137234e-05, "loss": 1.4133, "num_input_tokens_seen": 2874802176, "step": 2437 }, { "epoch": 0.028075970272502065, "grad_norm": 0.61328125, "learning_rate": 6.826771730241004e-05, "loss": 1.5358, "num_input_tokens_seen": 2875981824, "step": 2438 }, { "epoch": 0.028193936534151233, "grad_norm": 0.66015625, "learning_rate": 6.825667596246967e-05, "loss": 1.3897, "num_input_tokens_seen": 2877161472, "step": 2439 }, { "epoch": 0.0283119027958004, "grad_norm": 0.6328125, "learning_rate": 6.824563032323117e-05, "loss": 1.5979, "num_input_tokens_seen": 2878341120, "step": 2440 }, { "epoch": 0.02842986905744957, "grad_norm": 0.73046875, "learning_rate": 6.823458038637517e-05, "loss": 1.6035, "num_input_tokens_seen": 2879520768, "step": 2441 }, { "epoch": 0.028547835319098738, "grad_norm": 0.65234375, "learning_rate": 6.822352615358293e-05, "loss": 1.4794, "num_input_tokens_seen": 2880700416, "step": 2442 }, { "epoch": 0.028665801580747906, "grad_norm": 0.62890625, "learning_rate": 6.821246762653636e-05, "loss": 1.4634, "num_input_tokens_seen": 2881880064, "step": 2443 }, { "epoch": 0.028783767842397074, "grad_norm": 0.62109375, "learning_rate": 6.820140480691804e-05, "loss": 1.4982, "num_input_tokens_seen": 2883059712, "step": 2444 }, { "epoch": 0.028901734104046242, "grad_norm": 0.59375, "learning_rate": 6.819033769641121e-05, "loss": 1.4973, "num_input_tokens_seen": 2884239360, "step": 2445 }, { "epoch": 0.02901970036569541, "grad_norm": 0.6015625, "learning_rate": 6.817926629669972e-05, "loss": 1.4905, "num_input_tokens_seen": 2885419008, "step": 2446 }, { "epoch": 0.02913766662734458, "grad_norm": 0.62109375, "learning_rate": 6.816819060946813e-05, "loss": 1.3498, "num_input_tokens_seen": 2886598656, "step": 2447 }, { "epoch": 0.029255632888993747, "grad_norm": 0.5703125, "learning_rate": 6.815711063640159e-05, "loss": 1.5305, "num_input_tokens_seen": 2887778304, "step": 2448 }, { "epoch": 0.029373599150642915, "grad_norm": 0.609375, "learning_rate": 6.814602637918597e-05, "loss": 1.4629, "num_input_tokens_seen": 2888957952, "step": 2449 }, { "epoch": 0.029491565412292083, "grad_norm": 0.62890625, "learning_rate": 6.813493783950772e-05, "loss": 1.3772, "num_input_tokens_seen": 2890137600, "step": 2450 }, { "epoch": 0.02960953167394125, "grad_norm": 0.59375, "learning_rate": 6.812384501905402e-05, "loss": 1.4661, "num_input_tokens_seen": 2891317248, "step": 2451 }, { "epoch": 0.02972749793559042, "grad_norm": 0.65234375, "learning_rate": 6.811274791951263e-05, "loss": 1.6128, "num_input_tokens_seen": 2892496896, "step": 2452 }, { "epoch": 0.029845464197239588, "grad_norm": 0.60546875, "learning_rate": 6.810164654257201e-05, "loss": 1.6284, "num_input_tokens_seen": 2893676544, "step": 2453 }, { "epoch": 0.029963430458888756, "grad_norm": 0.6484375, "learning_rate": 6.809054088992124e-05, "loss": 1.2364, "num_input_tokens_seen": 2894856192, "step": 2454 }, { "epoch": 0.030081396720537928, "grad_norm": 0.62890625, "learning_rate": 6.807943096325007e-05, "loss": 1.4442, "num_input_tokens_seen": 2896035840, "step": 2455 }, { "epoch": 0.030199362982187096, "grad_norm": 0.65234375, "learning_rate": 6.806831676424889e-05, "loss": 1.5318, "num_input_tokens_seen": 2897215488, "step": 2456 }, { "epoch": 0.030317329243836264, "grad_norm": 0.58984375, "learning_rate": 6.805719829460874e-05, "loss": 1.5749, "num_input_tokens_seen": 2898395136, "step": 2457 }, { "epoch": 0.030435295505485432, "grad_norm": 0.640625, "learning_rate": 6.804607555602131e-05, "loss": 1.4685, "num_input_tokens_seen": 2899574784, "step": 2458 }, { "epoch": 0.0305532617671346, "grad_norm": 0.609375, "learning_rate": 6.803494855017896e-05, "loss": 1.53, "num_input_tokens_seen": 2900754432, "step": 2459 }, { "epoch": 0.03067122802878377, "grad_norm": 0.64453125, "learning_rate": 6.802381727877465e-05, "loss": 1.4206, "num_input_tokens_seen": 2901934080, "step": 2460 }, { "epoch": 0.030789194290432937, "grad_norm": 0.62109375, "learning_rate": 6.801268174350206e-05, "loss": 1.6105, "num_input_tokens_seen": 2903113728, "step": 2461 }, { "epoch": 0.030907160552082105, "grad_norm": 0.6328125, "learning_rate": 6.800154194605546e-05, "loss": 1.468, "num_input_tokens_seen": 2904293376, "step": 2462 }, { "epoch": 0.031025126813731273, "grad_norm": 0.64453125, "learning_rate": 6.799039788812978e-05, "loss": 1.3604, "num_input_tokens_seen": 2905473024, "step": 2463 }, { "epoch": 0.03114309307538044, "grad_norm": 0.58984375, "learning_rate": 6.79792495714206e-05, "loss": 1.4354, "num_input_tokens_seen": 2906652672, "step": 2464 }, { "epoch": 0.031261059337029606, "grad_norm": 0.59375, "learning_rate": 6.796809699762419e-05, "loss": 1.4907, "num_input_tokens_seen": 2907832320, "step": 2465 }, { "epoch": 0.03137902559867878, "grad_norm": 0.625, "learning_rate": 6.79569401684374e-05, "loss": 1.4576, "num_input_tokens_seen": 2909011968, "step": 2466 }, { "epoch": 0.03149699186032794, "grad_norm": 0.6015625, "learning_rate": 6.794577908555777e-05, "loss": 1.7254, "num_input_tokens_seen": 2910191616, "step": 2467 }, { "epoch": 0.031614958121977114, "grad_norm": 0.63671875, "learning_rate": 6.793461375068347e-05, "loss": 1.4307, "num_input_tokens_seen": 2911371264, "step": 2468 }, { "epoch": 0.031732924383626286, "grad_norm": 0.6484375, "learning_rate": 6.792344416551335e-05, "loss": 1.5432, "num_input_tokens_seen": 2912550912, "step": 2469 }, { "epoch": 0.03185089064527545, "grad_norm": 0.6015625, "learning_rate": 6.791227033174683e-05, "loss": 1.3251, "num_input_tokens_seen": 2913730560, "step": 2470 }, { "epoch": 0.03196885690692462, "grad_norm": 0.6015625, "learning_rate": 6.790109225108406e-05, "loss": 1.5154, "num_input_tokens_seen": 2914910208, "step": 2471 }, { "epoch": 0.03208682316857379, "grad_norm": 0.64453125, "learning_rate": 6.788990992522581e-05, "loss": 1.5991, "num_input_tokens_seen": 2916089856, "step": 2472 }, { "epoch": 0.03220478943022296, "grad_norm": 0.7109375, "learning_rate": 6.787872335587347e-05, "loss": 1.5224, "num_input_tokens_seen": 2917269504, "step": 2473 }, { "epoch": 0.032322755691872124, "grad_norm": 0.55859375, "learning_rate": 6.78675325447291e-05, "loss": 1.5096, "num_input_tokens_seen": 2918449152, "step": 2474 }, { "epoch": 0.032440721953521295, "grad_norm": 0.70703125, "learning_rate": 6.785633749349541e-05, "loss": 1.3008, "num_input_tokens_seen": 2919628800, "step": 2475 }, { "epoch": 0.03255868821517046, "grad_norm": 0.59375, "learning_rate": 6.784513820387574e-05, "loss": 1.4127, "num_input_tokens_seen": 2920808448, "step": 2476 }, { "epoch": 0.03267665447681963, "grad_norm": 0.625, "learning_rate": 6.783393467757405e-05, "loss": 1.5803, "num_input_tokens_seen": 2921988096, "step": 2477 }, { "epoch": 0.0327946207384688, "grad_norm": 0.66796875, "learning_rate": 6.782272691629502e-05, "loss": 1.3688, "num_input_tokens_seen": 2923167744, "step": 2478 }, { "epoch": 0.03291258700011797, "grad_norm": 0.65234375, "learning_rate": 6.781151492174391e-05, "loss": 1.5045, "num_input_tokens_seen": 2924347392, "step": 2479 }, { "epoch": 0.03303055326176713, "grad_norm": 0.6640625, "learning_rate": 6.780029869562662e-05, "loss": 1.3259, "num_input_tokens_seen": 2925527040, "step": 2480 }, { "epoch": 0.033148519523416305, "grad_norm": 0.65625, "learning_rate": 6.778907823964976e-05, "loss": 1.4663, "num_input_tokens_seen": 2926706688, "step": 2481 }, { "epoch": 0.03326648578506547, "grad_norm": 0.82421875, "learning_rate": 6.777785355552049e-05, "loss": 1.5435, "num_input_tokens_seen": 2927886336, "step": 2482 }, { "epoch": 0.03338445204671464, "grad_norm": 0.6171875, "learning_rate": 6.77666246449467e-05, "loss": 1.5047, "num_input_tokens_seen": 2929065984, "step": 2483 }, { "epoch": 0.033502418308363806, "grad_norm": 0.67578125, "learning_rate": 6.775539150963689e-05, "loss": 1.4834, "num_input_tokens_seen": 2930245632, "step": 2484 }, { "epoch": 0.03362038457001298, "grad_norm": 0.671875, "learning_rate": 6.77441541513002e-05, "loss": 1.3929, "num_input_tokens_seen": 2931425280, "step": 2485 }, { "epoch": 0.03373835083166214, "grad_norm": 1.1796875, "learning_rate": 6.773291257164638e-05, "loss": 1.5317, "num_input_tokens_seen": 2932604928, "step": 2486 }, { "epoch": 0.033856317093311314, "grad_norm": 0.6875, "learning_rate": 6.772166677238586e-05, "loss": 1.4426, "num_input_tokens_seen": 2933784576, "step": 2487 }, { "epoch": 0.03397428335496048, "grad_norm": 0.75390625, "learning_rate": 6.771041675522975e-05, "loss": 1.6853, "num_input_tokens_seen": 2934964224, "step": 2488 }, { "epoch": 0.03409224961660965, "grad_norm": 0.66015625, "learning_rate": 6.769916252188971e-05, "loss": 1.5797, "num_input_tokens_seen": 2936143872, "step": 2489 }, { "epoch": 0.034210215878258815, "grad_norm": 0.79296875, "learning_rate": 6.76879040740781e-05, "loss": 1.4338, "num_input_tokens_seen": 2937323520, "step": 2490 }, { "epoch": 0.03432818213990799, "grad_norm": 0.89453125, "learning_rate": 6.767664141350793e-05, "loss": 1.3769, "num_input_tokens_seen": 2938503168, "step": 2491 }, { "epoch": 0.03444614840155715, "grad_norm": 0.734375, "learning_rate": 6.766537454189282e-05, "loss": 1.4008, "num_input_tokens_seen": 2939682816, "step": 2492 }, { "epoch": 0.03456411466320632, "grad_norm": 0.79296875, "learning_rate": 6.765410346094703e-05, "loss": 1.6917, "num_input_tokens_seen": 2940862464, "step": 2493 }, { "epoch": 0.03468208092485549, "grad_norm": 0.74609375, "learning_rate": 6.76428281723855e-05, "loss": 1.424, "num_input_tokens_seen": 2942042112, "step": 2494 }, { "epoch": 0.03480004718650466, "grad_norm": 0.7890625, "learning_rate": 6.763154867792375e-05, "loss": 1.3273, "num_input_tokens_seen": 2943221760, "step": 2495 }, { "epoch": 0.03491801344815383, "grad_norm": 0.7109375, "learning_rate": 6.762026497927801e-05, "loss": 1.4786, "num_input_tokens_seen": 2944401408, "step": 2496 }, { "epoch": 0.035035979709802996, "grad_norm": 0.73828125, "learning_rate": 6.760897707816509e-05, "loss": 1.3188, "num_input_tokens_seen": 2945581056, "step": 2497 }, { "epoch": 0.03515394597145217, "grad_norm": 0.78515625, "learning_rate": 6.759768497630245e-05, "loss": 1.3535, "num_input_tokens_seen": 2946760704, "step": 2498 }, { "epoch": 0.03527191223310133, "grad_norm": 0.6796875, "learning_rate": 6.758638867540824e-05, "loss": 1.3905, "num_input_tokens_seen": 2947940352, "step": 2499 }, { "epoch": 0.035389878494750504, "grad_norm": 0.703125, "learning_rate": 6.757508817720115e-05, "loss": 1.4335, "num_input_tokens_seen": 2949120000, "step": 2500 }, { "epoch": 0.03550784475639967, "grad_norm": 0.671875, "learning_rate": 6.756378348340065e-05, "loss": 1.3988, "num_input_tokens_seen": 2950299648, "step": 2501 }, { "epoch": 0.03562581101804884, "grad_norm": 0.65234375, "learning_rate": 6.75524745957267e-05, "loss": 1.4672, "num_input_tokens_seen": 2951479296, "step": 2502 }, { "epoch": 0.035743777279698005, "grad_norm": 0.68359375, "learning_rate": 6.754116151589998e-05, "loss": 1.3077, "num_input_tokens_seen": 2952658944, "step": 2503 }, { "epoch": 0.03586174354134718, "grad_norm": 0.6328125, "learning_rate": 6.752984424564182e-05, "loss": 1.4232, "num_input_tokens_seen": 2953838592, "step": 2504 }, { "epoch": 0.03597970980299634, "grad_norm": 0.6484375, "learning_rate": 6.751852278667413e-05, "loss": 1.3803, "num_input_tokens_seen": 2955018240, "step": 2505 }, { "epoch": 0.03609767606464551, "grad_norm": 0.65234375, "learning_rate": 6.75071971407195e-05, "loss": 1.4027, "num_input_tokens_seen": 2956197888, "step": 2506 }, { "epoch": 0.03621564232629468, "grad_norm": 0.62109375, "learning_rate": 6.749586730950114e-05, "loss": 1.498, "num_input_tokens_seen": 2957377536, "step": 2507 }, { "epoch": 0.03633360858794385, "grad_norm": 0.61328125, "learning_rate": 6.748453329474291e-05, "loss": 1.4218, "num_input_tokens_seen": 2958557184, "step": 2508 }, { "epoch": 0.036451574849593014, "grad_norm": 0.6328125, "learning_rate": 6.747319509816929e-05, "loss": 1.4672, "num_input_tokens_seen": 2959736832, "step": 2509 }, { "epoch": 0.036569541111242186, "grad_norm": 0.66796875, "learning_rate": 6.746185272150541e-05, "loss": 1.3774, "num_input_tokens_seen": 2960916480, "step": 2510 }, { "epoch": 0.03668750737289135, "grad_norm": 0.63671875, "learning_rate": 6.745050616647705e-05, "loss": 1.5326, "num_input_tokens_seen": 2962096128, "step": 2511 }, { "epoch": 0.03680547363454052, "grad_norm": 0.609375, "learning_rate": 6.743915543481057e-05, "loss": 1.4675, "num_input_tokens_seen": 2963275776, "step": 2512 }, { "epoch": 0.03692343989618969, "grad_norm": 0.6328125, "learning_rate": 6.742780052823302e-05, "loss": 1.4793, "num_input_tokens_seen": 2964455424, "step": 2513 }, { "epoch": 0.03704140615783886, "grad_norm": 0.6015625, "learning_rate": 6.741644144847206e-05, "loss": 1.4618, "num_input_tokens_seen": 2965635072, "step": 2514 }, { "epoch": 0.037159372419488024, "grad_norm": 0.64453125, "learning_rate": 6.740507819725601e-05, "loss": 1.4407, "num_input_tokens_seen": 2966814720, "step": 2515 }, { "epoch": 0.037277338681137195, "grad_norm": 0.6484375, "learning_rate": 6.739371077631379e-05, "loss": 1.3813, "num_input_tokens_seen": 2967994368, "step": 2516 }, { "epoch": 0.03739530494278636, "grad_norm": 0.62890625, "learning_rate": 6.738233918737496e-05, "loss": 1.4495, "num_input_tokens_seen": 2969174016, "step": 2517 }, { "epoch": 0.03751327120443553, "grad_norm": 0.640625, "learning_rate": 6.737096343216975e-05, "loss": 1.4633, "num_input_tokens_seen": 2970353664, "step": 2518 }, { "epoch": 0.0376312374660847, "grad_norm": 0.625, "learning_rate": 6.7359583512429e-05, "loss": 1.6253, "num_input_tokens_seen": 2971533312, "step": 2519 }, { "epoch": 0.03774920372773387, "grad_norm": 0.59375, "learning_rate": 6.734819942988417e-05, "loss": 1.4672, "num_input_tokens_seen": 2972712960, "step": 2520 }, { "epoch": 0.03786716998938304, "grad_norm": 0.61328125, "learning_rate": 6.733681118626735e-05, "loss": 1.3775, "num_input_tokens_seen": 2973892608, "step": 2521 }, { "epoch": 0.037985136251032205, "grad_norm": 0.5859375, "learning_rate": 6.732541878331132e-05, "loss": 1.5464, "num_input_tokens_seen": 2975072256, "step": 2522 }, { "epoch": 0.038103102512681376, "grad_norm": 0.59765625, "learning_rate": 6.731402222274941e-05, "loss": 1.2845, "num_input_tokens_seen": 2976251904, "step": 2523 }, { "epoch": 0.03822106877433054, "grad_norm": 0.6484375, "learning_rate": 6.730262150631566e-05, "loss": 1.4293, "num_input_tokens_seen": 2977431552, "step": 2524 }, { "epoch": 0.03833903503597971, "grad_norm": 0.65625, "learning_rate": 6.729121663574468e-05, "loss": 1.3535, "num_input_tokens_seen": 2978611200, "step": 2525 }, { "epoch": 0.03845700129762888, "grad_norm": 0.62890625, "learning_rate": 6.727980761277177e-05, "loss": 1.4109, "num_input_tokens_seen": 2979790848, "step": 2526 }, { "epoch": 0.03857496755927805, "grad_norm": 0.6484375, "learning_rate": 6.72683944391328e-05, "loss": 1.5062, "num_input_tokens_seen": 2980970496, "step": 2527 }, { "epoch": 0.038692933820927214, "grad_norm": 0.6015625, "learning_rate": 6.725697711656432e-05, "loss": 1.4584, "num_input_tokens_seen": 2982150144, "step": 2528 }, { "epoch": 0.038810900082576386, "grad_norm": 0.64453125, "learning_rate": 6.724555564680348e-05, "loss": 1.3609, "num_input_tokens_seen": 2983329792, "step": 2529 }, { "epoch": 0.03892886634422555, "grad_norm": 0.6640625, "learning_rate": 6.723413003158809e-05, "loss": 1.3514, "num_input_tokens_seen": 2984509440, "step": 2530 }, { "epoch": 0.03904683260587472, "grad_norm": 0.64453125, "learning_rate": 6.722270027265657e-05, "loss": 1.4508, "num_input_tokens_seen": 2985689088, "step": 2531 }, { "epoch": 0.03916479886752389, "grad_norm": 0.62109375, "learning_rate": 6.721126637174797e-05, "loss": 1.4692, "num_input_tokens_seen": 2986868736, "step": 2532 }, { "epoch": 0.03928276512917306, "grad_norm": 0.59765625, "learning_rate": 6.719982833060198e-05, "loss": 1.3881, "num_input_tokens_seen": 2988048384, "step": 2533 }, { "epoch": 0.03940073139082222, "grad_norm": 0.7265625, "learning_rate": 6.718838615095891e-05, "loss": 1.3993, "num_input_tokens_seen": 2989228032, "step": 2534 }, { "epoch": 0.039518697652471395, "grad_norm": 0.67578125, "learning_rate": 6.717693983455974e-05, "loss": 1.5893, "num_input_tokens_seen": 2990407680, "step": 2535 }, { "epoch": 0.03963666391412056, "grad_norm": 0.64453125, "learning_rate": 6.7165489383146e-05, "loss": 1.3197, "num_input_tokens_seen": 2991587328, "step": 2536 }, { "epoch": 0.03975463017576973, "grad_norm": 0.625, "learning_rate": 6.71540347984599e-05, "loss": 1.4952, "num_input_tokens_seen": 2992766976, "step": 2537 }, { "epoch": 0.039872596437418896, "grad_norm": 0.66796875, "learning_rate": 6.71425760822443e-05, "loss": 1.4684, "num_input_tokens_seen": 2993946624, "step": 2538 }, { "epoch": 0.03999056269906807, "grad_norm": 0.61328125, "learning_rate": 6.713111323624265e-05, "loss": 1.4798, "num_input_tokens_seen": 2995126272, "step": 2539 }, { "epoch": 0.04010852896071723, "grad_norm": 0.6484375, "learning_rate": 6.711964626219903e-05, "loss": 1.4208, "num_input_tokens_seen": 2996305920, "step": 2540 }, { "epoch": 0.040226495222366404, "grad_norm": 0.6640625, "learning_rate": 6.710817516185817e-05, "loss": 1.3006, "num_input_tokens_seen": 2997485568, "step": 2541 }, { "epoch": 0.04034446148401557, "grad_norm": 0.625, "learning_rate": 6.709669993696543e-05, "loss": 1.4182, "num_input_tokens_seen": 2998665216, "step": 2542 }, { "epoch": 0.04046242774566474, "grad_norm": 0.60546875, "learning_rate": 6.708522058926673e-05, "loss": 1.4264, "num_input_tokens_seen": 2999844864, "step": 2543 }, { "epoch": 0.040580394007313905, "grad_norm": 0.5859375, "learning_rate": 6.707373712050873e-05, "loss": 1.5005, "num_input_tokens_seen": 3001024512, "step": 2544 }, { "epoch": 0.04069836026896308, "grad_norm": 0.578125, "learning_rate": 6.706224953243863e-05, "loss": 1.4911, "num_input_tokens_seen": 3002204160, "step": 2545 }, { "epoch": 0.04081632653061224, "grad_norm": 0.6328125, "learning_rate": 6.705075782680428e-05, "loss": 1.3447, "num_input_tokens_seen": 3003383808, "step": 2546 }, { "epoch": 0.04093429279226141, "grad_norm": 0.5703125, "learning_rate": 6.703926200535419e-05, "loss": 1.5497, "num_input_tokens_seen": 3004563456, "step": 2547 }, { "epoch": 0.041052259053910585, "grad_norm": 0.60546875, "learning_rate": 6.702776206983744e-05, "loss": 1.4596, "num_input_tokens_seen": 3005743104, "step": 2548 }, { "epoch": 0.04117022531555975, "grad_norm": 0.59375, "learning_rate": 6.701625802200378e-05, "loss": 1.4756, "num_input_tokens_seen": 3006922752, "step": 2549 }, { "epoch": 0.04128819157720892, "grad_norm": 0.58984375, "learning_rate": 6.700474986360354e-05, "loss": 1.4192, "num_input_tokens_seen": 3008102400, "step": 2550 }, { "epoch": 0.041406157838858086, "grad_norm": 0.578125, "learning_rate": 6.699323759638773e-05, "loss": 1.4665, "num_input_tokens_seen": 3009282048, "step": 2551 }, { "epoch": 0.04152412410050726, "grad_norm": 0.59765625, "learning_rate": 6.698172122210797e-05, "loss": 1.5898, "num_input_tokens_seen": 3010461696, "step": 2552 }, { "epoch": 0.04164209036215642, "grad_norm": 0.609375, "learning_rate": 6.697020074251647e-05, "loss": 1.4587, "num_input_tokens_seen": 3011641344, "step": 2553 }, { "epoch": 0.041760056623805594, "grad_norm": 0.5859375, "learning_rate": 6.69586761593661e-05, "loss": 1.4906, "num_input_tokens_seen": 3012820992, "step": 2554 }, { "epoch": 0.04187802288545476, "grad_norm": 0.6015625, "learning_rate": 6.694714747441035e-05, "loss": 1.3911, "num_input_tokens_seen": 3014000640, "step": 2555 }, { "epoch": 0.04199598914710393, "grad_norm": 0.58984375, "learning_rate": 6.693561468940331e-05, "loss": 1.4116, "num_input_tokens_seen": 3015180288, "step": 2556 }, { "epoch": 0.042113955408753095, "grad_norm": 0.64453125, "learning_rate": 6.692407780609972e-05, "loss": 1.3856, "num_input_tokens_seen": 3016359936, "step": 2557 }, { "epoch": 0.04223192167040227, "grad_norm": 0.6015625, "learning_rate": 6.691253682625496e-05, "loss": 1.4585, "num_input_tokens_seen": 3017539584, "step": 2558 }, { "epoch": 0.04234988793205143, "grad_norm": 0.5859375, "learning_rate": 6.690099175162497e-05, "loss": 1.5573, "num_input_tokens_seen": 3018719232, "step": 2559 }, { "epoch": 0.042467854193700603, "grad_norm": 0.640625, "learning_rate": 6.688944258396637e-05, "loss": 1.4737, "num_input_tokens_seen": 3019898880, "step": 2560 }, { "epoch": 0.04258582045534977, "grad_norm": 0.6484375, "learning_rate": 6.687788932503637e-05, "loss": 1.4925, "num_input_tokens_seen": 3021078528, "step": 2561 }, { "epoch": 0.04270378671699894, "grad_norm": 0.80078125, "learning_rate": 6.686633197659284e-05, "loss": 1.5562, "num_input_tokens_seen": 3022258176, "step": 2562 }, { "epoch": 0.042821752978648105, "grad_norm": 0.68359375, "learning_rate": 6.685477054039423e-05, "loss": 1.4028, "num_input_tokens_seen": 3023437824, "step": 2563 }, { "epoch": 0.042939719240297276, "grad_norm": 0.61328125, "learning_rate": 6.684320501819965e-05, "loss": 1.445, "num_input_tokens_seen": 3024617472, "step": 2564 }, { "epoch": 0.04305768550194644, "grad_norm": 0.640625, "learning_rate": 6.683163541176879e-05, "loss": 1.3349, "num_input_tokens_seen": 3025797120, "step": 2565 }, { "epoch": 0.04317565176359561, "grad_norm": 0.6171875, "learning_rate": 6.6820061722862e-05, "loss": 1.3495, "num_input_tokens_seen": 3026976768, "step": 2566 }, { "epoch": 0.04329361802524478, "grad_norm": 0.6015625, "learning_rate": 6.680848395324022e-05, "loss": 1.4801, "num_input_tokens_seen": 3028156416, "step": 2567 }, { "epoch": 0.04341158428689395, "grad_norm": 0.60546875, "learning_rate": 6.679690210466505e-05, "loss": 1.4375, "num_input_tokens_seen": 3029336064, "step": 2568 }, { "epoch": 0.043529550548543114, "grad_norm": 1.6953125, "learning_rate": 6.678531617889866e-05, "loss": 1.5589, "num_input_tokens_seen": 3030515712, "step": 2569 }, { "epoch": 0.043647516810192286, "grad_norm": 0.6796875, "learning_rate": 6.677372617770389e-05, "loss": 1.5393, "num_input_tokens_seen": 3031695360, "step": 2570 }, { "epoch": 0.04376548307184145, "grad_norm": 0.6484375, "learning_rate": 6.676213210284415e-05, "loss": 1.3024, "num_input_tokens_seen": 3032875008, "step": 2571 }, { "epoch": 0.04388344933349062, "grad_norm": 0.640625, "learning_rate": 6.675053395608353e-05, "loss": 1.4264, "num_input_tokens_seen": 3034054656, "step": 2572 }, { "epoch": 0.04400141559513979, "grad_norm": 0.62890625, "learning_rate": 6.673893173918667e-05, "loss": 1.2767, "num_input_tokens_seen": 3035234304, "step": 2573 }, { "epoch": 0.04411938185678896, "grad_norm": 0.69140625, "learning_rate": 6.67273254539189e-05, "loss": 1.4428, "num_input_tokens_seen": 3036413952, "step": 2574 }, { "epoch": 0.04423734811843813, "grad_norm": 0.67578125, "learning_rate": 6.671571510204611e-05, "loss": 1.4224, "num_input_tokens_seen": 3037593600, "step": 2575 }, { "epoch": 0.044355314380087295, "grad_norm": 0.640625, "learning_rate": 6.670410068533486e-05, "loss": 1.2808, "num_input_tokens_seen": 3038773248, "step": 2576 }, { "epoch": 0.044473280641736467, "grad_norm": 0.73828125, "learning_rate": 6.669248220555227e-05, "loss": 1.3767, "num_input_tokens_seen": 3039952896, "step": 2577 }, { "epoch": 0.04459124690338563, "grad_norm": 0.6015625, "learning_rate": 6.668085966446614e-05, "loss": 1.6047, "num_input_tokens_seen": 3041132544, "step": 2578 }, { "epoch": 0.0447092131650348, "grad_norm": 0.69140625, "learning_rate": 6.666923306384483e-05, "loss": 1.3309, "num_input_tokens_seen": 3042312192, "step": 2579 }, { "epoch": 0.04482717942668397, "grad_norm": 0.63671875, "learning_rate": 6.665760240545737e-05, "loss": 1.3701, "num_input_tokens_seen": 3043491840, "step": 2580 }, { "epoch": 0.04494514568833314, "grad_norm": 0.69921875, "learning_rate": 6.664596769107336e-05, "loss": 1.298, "num_input_tokens_seen": 3044671488, "step": 2581 }, { "epoch": 0.045063111949982304, "grad_norm": 0.60546875, "learning_rate": 6.663432892246306e-05, "loss": 1.6694, "num_input_tokens_seen": 3045851136, "step": 2582 }, { "epoch": 0.045181078211631476, "grad_norm": 0.640625, "learning_rate": 6.662268610139732e-05, "loss": 1.4567, "num_input_tokens_seen": 3047030784, "step": 2583 }, { "epoch": 0.04529904447328064, "grad_norm": 0.71875, "learning_rate": 6.661103922964761e-05, "loss": 1.5915, "num_input_tokens_seen": 3048210432, "step": 2584 }, { "epoch": 0.04541701073492981, "grad_norm": 0.69140625, "learning_rate": 6.659938830898603e-05, "loss": 1.4121, "num_input_tokens_seen": 3049390080, "step": 2585 }, { "epoch": 0.04553497699657898, "grad_norm": 0.69140625, "learning_rate": 6.658773334118527e-05, "loss": 1.6211, "num_input_tokens_seen": 3050569728, "step": 2586 }, { "epoch": 0.04565294325822815, "grad_norm": 0.74609375, "learning_rate": 6.657607432801868e-05, "loss": 1.557, "num_input_tokens_seen": 3051749376, "step": 2587 }, { "epoch": 0.04577090951987731, "grad_norm": 0.64453125, "learning_rate": 6.656441127126017e-05, "loss": 1.4462, "num_input_tokens_seen": 3052929024, "step": 2588 }, { "epoch": 0.045888875781526485, "grad_norm": 0.63671875, "learning_rate": 6.65527441726843e-05, "loss": 1.4628, "num_input_tokens_seen": 3054108672, "step": 2589 }, { "epoch": 0.04600684204317565, "grad_norm": 0.625, "learning_rate": 6.654107303406623e-05, "loss": 1.3959, "num_input_tokens_seen": 3055288320, "step": 2590 }, { "epoch": 0.04612480830482482, "grad_norm": 0.63671875, "learning_rate": 6.652939785718177e-05, "loss": 1.5412, "num_input_tokens_seen": 3056467968, "step": 2591 }, { "epoch": 0.046242774566473986, "grad_norm": 0.65625, "learning_rate": 6.651771864380729e-05, "loss": 1.3125, "num_input_tokens_seen": 3057647616, "step": 2592 }, { "epoch": 0.04636074082812316, "grad_norm": 0.6484375, "learning_rate": 6.65060353957198e-05, "loss": 1.4411, "num_input_tokens_seen": 3058827264, "step": 2593 }, { "epoch": 0.04647870708977232, "grad_norm": 0.63671875, "learning_rate": 6.649434811469694e-05, "loss": 1.463, "num_input_tokens_seen": 3060006912, "step": 2594 }, { "epoch": 0.046596673351421494, "grad_norm": 0.6171875, "learning_rate": 6.648265680251695e-05, "loss": 1.3815, "num_input_tokens_seen": 3061186560, "step": 2595 }, { "epoch": 0.04671463961307066, "grad_norm": 0.62109375, "learning_rate": 6.647096146095866e-05, "loss": 1.4478, "num_input_tokens_seen": 3062366208, "step": 2596 }, { "epoch": 0.04683260587471983, "grad_norm": 0.69140625, "learning_rate": 6.645926209180156e-05, "loss": 1.3286, "num_input_tokens_seen": 3063545856, "step": 2597 }, { "epoch": 0.046950572136368995, "grad_norm": 0.63671875, "learning_rate": 6.644755869682572e-05, "loss": 1.3154, "num_input_tokens_seen": 3064725504, "step": 2598 }, { "epoch": 0.04706853839801817, "grad_norm": 0.61328125, "learning_rate": 6.643585127781182e-05, "loss": 1.2973, "num_input_tokens_seen": 3065905152, "step": 2599 }, { "epoch": 0.04718650465966733, "grad_norm": 0.62890625, "learning_rate": 6.642413983654116e-05, "loss": 1.4398, "num_input_tokens_seen": 3067084800, "step": 2600 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.269557476043701, "eval_wikipedia_runtime": 162.0914, "eval_wikipedia_samples_per_second": 4.331, "eval_wikipedia_steps_per_second": 0.185, "num_input_tokens_seen": 3067084800, "step": 2600 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 4.000088691711426, "eval_toxicity_runtime": 1.1002, "eval_toxicity_samples_per_second": 1.818, "eval_toxicity_steps_per_second": 0.909, "num_input_tokens_seen": 3067084800, "step": 2600 } ], "logging_steps": 1, "max_steps": 8477, "num_input_tokens_seen": 3067084800, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.725620912209265e+19, "train_batch_size": 6, "trial_name": null, "trial_params": null }