{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04718650465966733, "eval_steps": 200, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011796626164916834, "grad_norm": 93.0, "learning_rate": 1.8912529550827425e-07, "loss": 7.9641, "num_input_tokens_seen": 1179648, "step": 1 }, { "epoch": 0.0002359325232983367, "grad_norm": 95.0, "learning_rate": 3.782505910165485e-07, "loss": 7.9866, "num_input_tokens_seen": 2359296, "step": 2 }, { "epoch": 0.000353898784947505, "grad_norm": 97.5, "learning_rate": 5.673758865248227e-07, "loss": 7.983, "num_input_tokens_seen": 3538944, "step": 3 }, { "epoch": 0.0004718650465966734, "grad_norm": 92.0, "learning_rate": 7.56501182033097e-07, "loss": 7.9018, "num_input_tokens_seen": 4718592, "step": 4 }, { "epoch": 0.0005898313082458417, "grad_norm": 102.5, "learning_rate": 9.456264775413712e-07, "loss": 8.0784, "num_input_tokens_seen": 5898240, "step": 5 }, { "epoch": 0.00070779756989501, "grad_norm": 95.0, "learning_rate": 1.1347517730496454e-06, "loss": 7.9578, "num_input_tokens_seen": 7077888, "step": 6 }, { "epoch": 0.0008257638315441783, "grad_norm": 93.5, "learning_rate": 1.3238770685579196e-06, "loss": 7.975, "num_input_tokens_seen": 8257536, "step": 7 }, { "epoch": 0.0009437300931933467, "grad_norm": 85.0, "learning_rate": 1.513002364066194e-06, "loss": 7.8405, "num_input_tokens_seen": 9437184, "step": 8 }, { "epoch": 0.001061696354842515, "grad_norm": 83.0, "learning_rate": 1.7021276595744682e-06, "loss": 7.841, "num_input_tokens_seen": 10616832, "step": 9 }, { "epoch": 0.0011796626164916834, "grad_norm": 82.5, "learning_rate": 1.8912529550827423e-06, "loss": 7.9491, "num_input_tokens_seen": 11796480, "step": 10 }, { "epoch": 0.0012976288781408518, "grad_norm": 68.0, "learning_rate": 2.0803782505910165e-06, "loss": 7.5612, "num_input_tokens_seen": 12976128, "step": 11 }, { "epoch": 0.00141559513979002, "grad_norm": 66.5, "learning_rate": 2.269503546099291e-06, "loss": 7.5808, "num_input_tokens_seen": 14155776, "step": 12 }, { "epoch": 0.0015335614014391884, "grad_norm": 57.5, "learning_rate": 2.4586288416075653e-06, "loss": 7.4494, "num_input_tokens_seen": 15335424, "step": 13 }, { "epoch": 0.0016515276630883566, "grad_norm": 51.25, "learning_rate": 2.6477541371158392e-06, "loss": 7.4606, "num_input_tokens_seen": 16515072, "step": 14 }, { "epoch": 0.001769493924737525, "grad_norm": 42.75, "learning_rate": 2.836879432624114e-06, "loss": 7.2804, "num_input_tokens_seen": 17694720, "step": 15 }, { "epoch": 0.0018874601863866935, "grad_norm": 38.0, "learning_rate": 3.026004728132388e-06, "loss": 7.2173, "num_input_tokens_seen": 18874368, "step": 16 }, { "epoch": 0.0020054264480358617, "grad_norm": 33.75, "learning_rate": 3.2151300236406624e-06, "loss": 7.0934, "num_input_tokens_seen": 20054016, "step": 17 }, { "epoch": 0.00212339270968503, "grad_norm": 30.5, "learning_rate": 3.4042553191489363e-06, "loss": 7.1284, "num_input_tokens_seen": 21233664, "step": 18 }, { "epoch": 0.0022413589713341986, "grad_norm": 26.5, "learning_rate": 3.5933806146572107e-06, "loss": 6.932, "num_input_tokens_seen": 22413312, "step": 19 }, { "epoch": 0.0023593252329833668, "grad_norm": 23.875, "learning_rate": 3.7825059101654847e-06, "loss": 6.9163, "num_input_tokens_seen": 23592960, "step": 20 }, { "epoch": 0.002477291494632535, "grad_norm": 21.625, "learning_rate": 3.9716312056737595e-06, "loss": 6.8335, "num_input_tokens_seen": 24772608, "step": 21 }, { "epoch": 0.0025952577562817036, "grad_norm": 18.625, "learning_rate": 4.160756501182033e-06, "loss": 6.6761, "num_input_tokens_seen": 25952256, "step": 22 }, { "epoch": 0.002713224017930872, "grad_norm": 16.0, "learning_rate": 4.349881796690308e-06, "loss": 6.6474, "num_input_tokens_seen": 27131904, "step": 23 }, { "epoch": 0.00283119027958004, "grad_norm": 18.0, "learning_rate": 4.539007092198582e-06, "loss": 6.6062, "num_input_tokens_seen": 28311552, "step": 24 }, { "epoch": 0.0029491565412292082, "grad_norm": 17.5, "learning_rate": 4.728132387706856e-06, "loss": 6.6435, "num_input_tokens_seen": 29491200, "step": 25 }, { "epoch": 0.003067122802878377, "grad_norm": 14.25, "learning_rate": 4.9172576832151305e-06, "loss": 6.57, "num_input_tokens_seen": 30670848, "step": 26 }, { "epoch": 0.003185089064527545, "grad_norm": 13.3125, "learning_rate": 5.106382978723404e-06, "loss": 6.4037, "num_input_tokens_seen": 31850496, "step": 27 }, { "epoch": 0.0033030553261767133, "grad_norm": 10.8125, "learning_rate": 5.2955082742316784e-06, "loss": 6.2927, "num_input_tokens_seen": 33030144, "step": 28 }, { "epoch": 0.003421021587825882, "grad_norm": 9.75, "learning_rate": 5.484633569739954e-06, "loss": 6.2084, "num_input_tokens_seen": 34209792, "step": 29 }, { "epoch": 0.00353898784947505, "grad_norm": 10.25, "learning_rate": 5.673758865248228e-06, "loss": 6.2349, "num_input_tokens_seen": 35389440, "step": 30 }, { "epoch": 0.0036569541111242184, "grad_norm": 10.8125, "learning_rate": 5.862884160756502e-06, "loss": 6.155, "num_input_tokens_seen": 36569088, "step": 31 }, { "epoch": 0.003774920372773387, "grad_norm": 10.0, "learning_rate": 6.052009456264776e-06, "loss": 6.1846, "num_input_tokens_seen": 37748736, "step": 32 }, { "epoch": 0.003892886634422555, "grad_norm": 8.0, "learning_rate": 6.24113475177305e-06, "loss": 6.0656, "num_input_tokens_seen": 38928384, "step": 33 }, { "epoch": 0.004010852896071723, "grad_norm": 8.0625, "learning_rate": 6.430260047281325e-06, "loss": 5.9873, "num_input_tokens_seen": 40108032, "step": 34 }, { "epoch": 0.004128819157720892, "grad_norm": 8.875, "learning_rate": 6.619385342789598e-06, "loss": 6.0008, "num_input_tokens_seen": 41287680, "step": 35 }, { "epoch": 0.00424678541937006, "grad_norm": 7.34375, "learning_rate": 6.808510638297873e-06, "loss": 6.0852, "num_input_tokens_seen": 42467328, "step": 36 }, { "epoch": 0.004364751681019229, "grad_norm": 6.28125, "learning_rate": 6.997635933806147e-06, "loss": 5.8264, "num_input_tokens_seen": 43646976, "step": 37 }, { "epoch": 0.004482717942668397, "grad_norm": 6.75, "learning_rate": 7.186761229314421e-06, "loss": 5.9511, "num_input_tokens_seen": 44826624, "step": 38 }, { "epoch": 0.004600684204317565, "grad_norm": 6.96875, "learning_rate": 7.375886524822695e-06, "loss": 5.8042, "num_input_tokens_seen": 46006272, "step": 39 }, { "epoch": 0.0047186504659667335, "grad_norm": 5.59375, "learning_rate": 7.565011820330969e-06, "loss": 5.8955, "num_input_tokens_seen": 47185920, "step": 40 }, { "epoch": 0.004836616727615902, "grad_norm": 5.3125, "learning_rate": 7.754137115839244e-06, "loss": 5.7764, "num_input_tokens_seen": 48365568, "step": 41 }, { "epoch": 0.00495458298926507, "grad_norm": 5.40625, "learning_rate": 7.943262411347519e-06, "loss": 5.7784, "num_input_tokens_seen": 49545216, "step": 42 }, { "epoch": 0.005072549250914238, "grad_norm": 5.03125, "learning_rate": 8.132387706855792e-06, "loss": 5.7388, "num_input_tokens_seen": 50724864, "step": 43 }, { "epoch": 0.005190515512563407, "grad_norm": 4.84375, "learning_rate": 8.321513002364066e-06, "loss": 5.7821, "num_input_tokens_seen": 51904512, "step": 44 }, { "epoch": 0.0053084817742125754, "grad_norm": 4.78125, "learning_rate": 8.510638297872341e-06, "loss": 5.6466, "num_input_tokens_seen": 53084160, "step": 45 }, { "epoch": 0.005426448035861744, "grad_norm": 4.125, "learning_rate": 8.699763593380616e-06, "loss": 5.6224, "num_input_tokens_seen": 54263808, "step": 46 }, { "epoch": 0.005544414297510912, "grad_norm": 5.0, "learning_rate": 8.888888888888888e-06, "loss": 5.9019, "num_input_tokens_seen": 55443456, "step": 47 }, { "epoch": 0.00566238055916008, "grad_norm": 4.0, "learning_rate": 9.078014184397164e-06, "loss": 5.5812, "num_input_tokens_seen": 56623104, "step": 48 }, { "epoch": 0.005780346820809248, "grad_norm": 3.796875, "learning_rate": 9.267139479905439e-06, "loss": 5.633, "num_input_tokens_seen": 57802752, "step": 49 }, { "epoch": 0.0058983130824584165, "grad_norm": 3.84375, "learning_rate": 9.456264775413712e-06, "loss": 5.5031, "num_input_tokens_seen": 58982400, "step": 50 }, { "epoch": 0.0060162793441075856, "grad_norm": 3.4375, "learning_rate": 9.645390070921986e-06, "loss": 5.5146, "num_input_tokens_seen": 60162048, "step": 51 }, { "epoch": 0.006134245605756754, "grad_norm": 3.609375, "learning_rate": 9.834515366430261e-06, "loss": 5.3805, "num_input_tokens_seen": 61341696, "step": 52 }, { "epoch": 0.006252211867405922, "grad_norm": 3.234375, "learning_rate": 1.0023640661938535e-05, "loss": 5.4098, "num_input_tokens_seen": 62521344, "step": 53 }, { "epoch": 0.00637017812905509, "grad_norm": 3.40625, "learning_rate": 1.0212765957446808e-05, "loss": 5.3773, "num_input_tokens_seen": 63700992, "step": 54 }, { "epoch": 0.006488144390704258, "grad_norm": 3.1875, "learning_rate": 1.0401891252955083e-05, "loss": 5.3287, "num_input_tokens_seen": 64880640, "step": 55 }, { "epoch": 0.006606110652353427, "grad_norm": 3.046875, "learning_rate": 1.0591016548463357e-05, "loss": 5.2282, "num_input_tokens_seen": 66060288, "step": 56 }, { "epoch": 0.006724076914002596, "grad_norm": 3.0625, "learning_rate": 1.0780141843971632e-05, "loss": 5.2967, "num_input_tokens_seen": 67239936, "step": 57 }, { "epoch": 0.006842043175651764, "grad_norm": 2.859375, "learning_rate": 1.0969267139479907e-05, "loss": 5.1126, "num_input_tokens_seen": 68419584, "step": 58 }, { "epoch": 0.006960009437300932, "grad_norm": 3.0625, "learning_rate": 1.1158392434988181e-05, "loss": 5.3309, "num_input_tokens_seen": 69599232, "step": 59 }, { "epoch": 0.0070779756989501, "grad_norm": 3.0625, "learning_rate": 1.1347517730496456e-05, "loss": 5.2134, "num_input_tokens_seen": 70778880, "step": 60 }, { "epoch": 0.0071959419605992685, "grad_norm": 3.546875, "learning_rate": 1.153664302600473e-05, "loss": 5.1972, "num_input_tokens_seen": 71958528, "step": 61 }, { "epoch": 0.007313908222248437, "grad_norm": 2.828125, "learning_rate": 1.1725768321513003e-05, "loss": 5.0123, "num_input_tokens_seen": 73138176, "step": 62 }, { "epoch": 0.007431874483897605, "grad_norm": 3.203125, "learning_rate": 1.1914893617021277e-05, "loss": 5.1108, "num_input_tokens_seen": 74317824, "step": 63 }, { "epoch": 0.007549840745546774, "grad_norm": 3.609375, "learning_rate": 1.2104018912529552e-05, "loss": 5.2287, "num_input_tokens_seen": 75497472, "step": 64 }, { "epoch": 0.007667807007195942, "grad_norm": 2.703125, "learning_rate": 1.2293144208037825e-05, "loss": 5.0902, "num_input_tokens_seen": 76677120, "step": 65 }, { "epoch": 0.00778577326884511, "grad_norm": 3.1875, "learning_rate": 1.24822695035461e-05, "loss": 5.1526, "num_input_tokens_seen": 77856768, "step": 66 }, { "epoch": 0.007903739530494279, "grad_norm": 3.359375, "learning_rate": 1.2671394799054376e-05, "loss": 4.8855, "num_input_tokens_seen": 79036416, "step": 67 }, { "epoch": 0.008021705792143447, "grad_norm": 2.765625, "learning_rate": 1.286052009456265e-05, "loss": 5.0226, "num_input_tokens_seen": 80216064, "step": 68 }, { "epoch": 0.008139672053792615, "grad_norm": 3.078125, "learning_rate": 1.3049645390070925e-05, "loss": 5.1366, "num_input_tokens_seen": 81395712, "step": 69 }, { "epoch": 0.008257638315441783, "grad_norm": 3.71875, "learning_rate": 1.3238770685579197e-05, "loss": 5.0749, "num_input_tokens_seen": 82575360, "step": 70 }, { "epoch": 0.008375604577090951, "grad_norm": 2.765625, "learning_rate": 1.3427895981087472e-05, "loss": 4.8899, "num_input_tokens_seen": 83755008, "step": 71 }, { "epoch": 0.00849357083874012, "grad_norm": 3.140625, "learning_rate": 1.3617021276595745e-05, "loss": 4.8809, "num_input_tokens_seen": 84934656, "step": 72 }, { "epoch": 0.008611537100389288, "grad_norm": 3.296875, "learning_rate": 1.380614657210402e-05, "loss": 4.8839, "num_input_tokens_seen": 86114304, "step": 73 }, { "epoch": 0.008729503362038458, "grad_norm": 2.953125, "learning_rate": 1.3995271867612294e-05, "loss": 4.9931, "num_input_tokens_seen": 87293952, "step": 74 }, { "epoch": 0.008847469623687626, "grad_norm": 2.9375, "learning_rate": 1.418439716312057e-05, "loss": 4.8041, "num_input_tokens_seen": 88473600, "step": 75 }, { "epoch": 0.008965435885336794, "grad_norm": 3.5625, "learning_rate": 1.4373522458628843e-05, "loss": 4.8611, "num_input_tokens_seen": 89653248, "step": 76 }, { "epoch": 0.009083402146985962, "grad_norm": 3.734375, "learning_rate": 1.4562647754137118e-05, "loss": 4.7747, "num_input_tokens_seen": 90832896, "step": 77 }, { "epoch": 0.00920136840863513, "grad_norm": 2.984375, "learning_rate": 1.475177304964539e-05, "loss": 4.6884, "num_input_tokens_seen": 92012544, "step": 78 }, { "epoch": 0.009319334670284299, "grad_norm": 2.671875, "learning_rate": 1.4940898345153665e-05, "loss": 4.6617, "num_input_tokens_seen": 93192192, "step": 79 }, { "epoch": 0.009437300931933467, "grad_norm": 2.84375, "learning_rate": 1.5130023640661939e-05, "loss": 4.6174, "num_input_tokens_seen": 94371840, "step": 80 }, { "epoch": 0.009555267193582635, "grad_norm": 3.71875, "learning_rate": 1.5319148936170214e-05, "loss": 4.5483, "num_input_tokens_seen": 95551488, "step": 81 }, { "epoch": 0.009673233455231803, "grad_norm": 4.34375, "learning_rate": 1.5508274231678487e-05, "loss": 4.604, "num_input_tokens_seen": 96731136, "step": 82 }, { "epoch": 0.009791199716880972, "grad_norm": 3.09375, "learning_rate": 1.5697399527186764e-05, "loss": 4.5806, "num_input_tokens_seen": 97910784, "step": 83 }, { "epoch": 0.00990916597853014, "grad_norm": 3.203125, "learning_rate": 1.5886524822695038e-05, "loss": 4.4723, "num_input_tokens_seen": 99090432, "step": 84 }, { "epoch": 0.010027132240179308, "grad_norm": 3.8125, "learning_rate": 1.607565011820331e-05, "loss": 4.4723, "num_input_tokens_seen": 100270080, "step": 85 }, { "epoch": 0.010145098501828476, "grad_norm": 3.03125, "learning_rate": 1.6264775413711585e-05, "loss": 4.4519, "num_input_tokens_seen": 101449728, "step": 86 }, { "epoch": 0.010263064763477646, "grad_norm": 5.53125, "learning_rate": 1.645390070921986e-05, "loss": 4.4077, "num_input_tokens_seen": 102629376, "step": 87 }, { "epoch": 0.010381031025126814, "grad_norm": 4.03125, "learning_rate": 1.6643026004728132e-05, "loss": 4.5295, "num_input_tokens_seen": 103809024, "step": 88 }, { "epoch": 0.010498997286775983, "grad_norm": 3.484375, "learning_rate": 1.683215130023641e-05, "loss": 4.3235, "num_input_tokens_seen": 104988672, "step": 89 }, { "epoch": 0.010616963548425151, "grad_norm": 5.78125, "learning_rate": 1.7021276595744682e-05, "loss": 4.3245, "num_input_tokens_seen": 106168320, "step": 90 }, { "epoch": 0.010734929810074319, "grad_norm": 3.0625, "learning_rate": 1.7210401891252956e-05, "loss": 4.2554, "num_input_tokens_seen": 107347968, "step": 91 }, { "epoch": 0.010852896071723487, "grad_norm": 3.671875, "learning_rate": 1.7399527186761233e-05, "loss": 4.2572, "num_input_tokens_seen": 108527616, "step": 92 }, { "epoch": 0.010970862333372655, "grad_norm": 5.0625, "learning_rate": 1.7588652482269506e-05, "loss": 4.2442, "num_input_tokens_seen": 109707264, "step": 93 }, { "epoch": 0.011088828595021824, "grad_norm": 2.6875, "learning_rate": 1.7777777777777777e-05, "loss": 4.3006, "num_input_tokens_seen": 110886912, "step": 94 }, { "epoch": 0.011206794856670992, "grad_norm": 3.171875, "learning_rate": 1.7966903073286054e-05, "loss": 4.2813, "num_input_tokens_seen": 112066560, "step": 95 }, { "epoch": 0.01132476111832016, "grad_norm": 2.765625, "learning_rate": 1.8156028368794327e-05, "loss": 4.1186, "num_input_tokens_seen": 113246208, "step": 96 }, { "epoch": 0.011442727379969328, "grad_norm": 3.1875, "learning_rate": 1.83451536643026e-05, "loss": 4.1652, "num_input_tokens_seen": 114425856, "step": 97 }, { "epoch": 0.011560693641618497, "grad_norm": 4.71875, "learning_rate": 1.8534278959810878e-05, "loss": 4.1923, "num_input_tokens_seen": 115605504, "step": 98 }, { "epoch": 0.011678659903267665, "grad_norm": 3.359375, "learning_rate": 1.872340425531915e-05, "loss": 4.0728, "num_input_tokens_seen": 116785152, "step": 99 }, { "epoch": 0.011796626164916833, "grad_norm": 2.484375, "learning_rate": 1.8912529550827425e-05, "loss": 4.2189, "num_input_tokens_seen": 117964800, "step": 100 }, { "epoch": 0.011914592426566003, "grad_norm": 3.109375, "learning_rate": 1.91016548463357e-05, "loss": 4.0632, "num_input_tokens_seen": 119144448, "step": 101 }, { "epoch": 0.012032558688215171, "grad_norm": 4.40625, "learning_rate": 1.929078014184397e-05, "loss": 3.8775, "num_input_tokens_seen": 120324096, "step": 102 }, { "epoch": 0.01215052494986434, "grad_norm": 2.5625, "learning_rate": 1.9479905437352245e-05, "loss": 3.8658, "num_input_tokens_seen": 121503744, "step": 103 }, { "epoch": 0.012268491211513508, "grad_norm": 3.703125, "learning_rate": 1.9669030732860522e-05, "loss": 3.9888, "num_input_tokens_seen": 122683392, "step": 104 }, { "epoch": 0.012386457473162676, "grad_norm": 2.78125, "learning_rate": 1.9858156028368796e-05, "loss": 3.9439, "num_input_tokens_seen": 123863040, "step": 105 }, { "epoch": 0.012504423734811844, "grad_norm": 3.4375, "learning_rate": 2.004728132387707e-05, "loss": 4.0098, "num_input_tokens_seen": 125042688, "step": 106 }, { "epoch": 0.012622389996461012, "grad_norm": 2.71875, "learning_rate": 2.0236406619385343e-05, "loss": 3.9454, "num_input_tokens_seen": 126222336, "step": 107 }, { "epoch": 0.01274035625811018, "grad_norm": 3.03125, "learning_rate": 2.0425531914893616e-05, "loss": 3.8491, "num_input_tokens_seen": 127401984, "step": 108 }, { "epoch": 0.012858322519759349, "grad_norm": 3.59375, "learning_rate": 2.0614657210401893e-05, "loss": 3.9602, "num_input_tokens_seen": 128581632, "step": 109 }, { "epoch": 0.012976288781408517, "grad_norm": 5.25, "learning_rate": 2.0803782505910167e-05, "loss": 3.9542, "num_input_tokens_seen": 129761280, "step": 110 }, { "epoch": 0.013094255043057685, "grad_norm": 2.34375, "learning_rate": 2.099290780141844e-05, "loss": 3.7846, "num_input_tokens_seen": 130940928, "step": 111 }, { "epoch": 0.013212221304706853, "grad_norm": 3.484375, "learning_rate": 2.1182033096926714e-05, "loss": 3.9128, "num_input_tokens_seen": 132120576, "step": 112 }, { "epoch": 0.013330187566356021, "grad_norm": 7.4375, "learning_rate": 2.137115839243499e-05, "loss": 3.7365, "num_input_tokens_seen": 133300224, "step": 113 }, { "epoch": 0.013448153828005191, "grad_norm": 3.6875, "learning_rate": 2.1560283687943264e-05, "loss": 3.8298, "num_input_tokens_seen": 134479872, "step": 114 }, { "epoch": 0.01356612008965436, "grad_norm": 11.25, "learning_rate": 2.1749408983451538e-05, "loss": 3.9192, "num_input_tokens_seen": 135659520, "step": 115 }, { "epoch": 0.013684086351303528, "grad_norm": 9.875, "learning_rate": 2.1938534278959815e-05, "loss": 3.7628, "num_input_tokens_seen": 136839168, "step": 116 }, { "epoch": 0.013802052612952696, "grad_norm": 4.90625, "learning_rate": 2.2127659574468088e-05, "loss": 3.7212, "num_input_tokens_seen": 138018816, "step": 117 }, { "epoch": 0.013920018874601864, "grad_norm": 6.875, "learning_rate": 2.2316784869976362e-05, "loss": 3.8291, "num_input_tokens_seen": 139198464, "step": 118 }, { "epoch": 0.014037985136251032, "grad_norm": 3.984375, "learning_rate": 2.2505910165484635e-05, "loss": 3.7104, "num_input_tokens_seen": 140378112, "step": 119 }, { "epoch": 0.0141559513979002, "grad_norm": 5.0625, "learning_rate": 2.2695035460992912e-05, "loss": 3.6898, "num_input_tokens_seen": 141557760, "step": 120 }, { "epoch": 0.014273917659549369, "grad_norm": 3.828125, "learning_rate": 2.2884160756501186e-05, "loss": 3.61, "num_input_tokens_seen": 142737408, "step": 121 }, { "epoch": 0.014391883921198537, "grad_norm": 4.875, "learning_rate": 2.307328605200946e-05, "loss": 3.6886, "num_input_tokens_seen": 143917056, "step": 122 }, { "epoch": 0.014509850182847705, "grad_norm": 4.75, "learning_rate": 2.326241134751773e-05, "loss": 3.6435, "num_input_tokens_seen": 145096704, "step": 123 }, { "epoch": 0.014627816444496873, "grad_norm": 3.75, "learning_rate": 2.3451536643026006e-05, "loss": 3.6619, "num_input_tokens_seen": 146276352, "step": 124 }, { "epoch": 0.014745782706146042, "grad_norm": 5.375, "learning_rate": 2.364066193853428e-05, "loss": 3.631, "num_input_tokens_seen": 147456000, "step": 125 }, { "epoch": 0.01486374896779521, "grad_norm": 4.125, "learning_rate": 2.3829787234042553e-05, "loss": 3.6482, "num_input_tokens_seen": 148635648, "step": 126 }, { "epoch": 0.014981715229444378, "grad_norm": 7.78125, "learning_rate": 2.4018912529550827e-05, "loss": 3.6785, "num_input_tokens_seen": 149815296, "step": 127 }, { "epoch": 0.015099681491093548, "grad_norm": 7.0, "learning_rate": 2.4208037825059104e-05, "loss": 3.6113, "num_input_tokens_seen": 150994944, "step": 128 }, { "epoch": 0.015217647752742716, "grad_norm": 4.46875, "learning_rate": 2.4397163120567377e-05, "loss": 3.6015, "num_input_tokens_seen": 152174592, "step": 129 }, { "epoch": 0.015335614014391884, "grad_norm": 4.1875, "learning_rate": 2.458628841607565e-05, "loss": 3.5241, "num_input_tokens_seen": 153354240, "step": 130 }, { "epoch": 0.015453580276041053, "grad_norm": 4.0625, "learning_rate": 2.4775413711583928e-05, "loss": 3.6007, "num_input_tokens_seen": 154533888, "step": 131 }, { "epoch": 0.01557154653769022, "grad_norm": 3.046875, "learning_rate": 2.49645390070922e-05, "loss": 3.5949, "num_input_tokens_seen": 155713536, "step": 132 }, { "epoch": 0.01568951279933939, "grad_norm": 3.265625, "learning_rate": 2.5153664302600475e-05, "loss": 3.5403, "num_input_tokens_seen": 156893184, "step": 133 }, { "epoch": 0.015807479060988557, "grad_norm": 2.859375, "learning_rate": 2.5342789598108752e-05, "loss": 3.5399, "num_input_tokens_seen": 158072832, "step": 134 }, { "epoch": 0.015925445322637725, "grad_norm": 2.6875, "learning_rate": 2.5531914893617025e-05, "loss": 3.5408, "num_input_tokens_seen": 159252480, "step": 135 }, { "epoch": 0.016043411584286894, "grad_norm": 2.03125, "learning_rate": 2.57210401891253e-05, "loss": 3.4678, "num_input_tokens_seen": 160432128, "step": 136 }, { "epoch": 0.016161377845936062, "grad_norm": 2.796875, "learning_rate": 2.5910165484633572e-05, "loss": 3.6019, "num_input_tokens_seen": 161611776, "step": 137 }, { "epoch": 0.01627934410758523, "grad_norm": 1.96875, "learning_rate": 2.609929078014185e-05, "loss": 3.4525, "num_input_tokens_seen": 162791424, "step": 138 }, { "epoch": 0.0163973103692344, "grad_norm": 2.390625, "learning_rate": 2.628841607565012e-05, "loss": 3.557, "num_input_tokens_seen": 163971072, "step": 139 }, { "epoch": 0.016515276630883566, "grad_norm": 2.734375, "learning_rate": 2.6477541371158393e-05, "loss": 3.5324, "num_input_tokens_seen": 165150720, "step": 140 }, { "epoch": 0.016633242892532735, "grad_norm": 3.703125, "learning_rate": 2.6666666666666667e-05, "loss": 3.4707, "num_input_tokens_seen": 166330368, "step": 141 }, { "epoch": 0.016751209154181903, "grad_norm": 3.53125, "learning_rate": 2.6855791962174944e-05, "loss": 3.3545, "num_input_tokens_seen": 167510016, "step": 142 }, { "epoch": 0.01686917541583107, "grad_norm": 1.7890625, "learning_rate": 2.7044917257683217e-05, "loss": 3.4051, "num_input_tokens_seen": 168689664, "step": 143 }, { "epoch": 0.01698714167748024, "grad_norm": 4.78125, "learning_rate": 2.723404255319149e-05, "loss": 3.4119, "num_input_tokens_seen": 169869312, "step": 144 }, { "epoch": 0.017105107939129408, "grad_norm": 3.0625, "learning_rate": 2.7423167848699764e-05, "loss": 3.5149, "num_input_tokens_seen": 171048960, "step": 145 }, { "epoch": 0.017223074200778576, "grad_norm": 3.671875, "learning_rate": 2.761229314420804e-05, "loss": 3.3442, "num_input_tokens_seen": 172228608, "step": 146 }, { "epoch": 0.017341040462427744, "grad_norm": 3.453125, "learning_rate": 2.7801418439716315e-05, "loss": 3.3277, "num_input_tokens_seen": 173408256, "step": 147 }, { "epoch": 0.017459006724076916, "grad_norm": 3.4375, "learning_rate": 2.7990543735224588e-05, "loss": 3.3905, "num_input_tokens_seen": 174587904, "step": 148 }, { "epoch": 0.017576972985726084, "grad_norm": 2.0, "learning_rate": 2.8179669030732865e-05, "loss": 3.276, "num_input_tokens_seen": 175767552, "step": 149 }, { "epoch": 0.017694939247375252, "grad_norm": 4.0, "learning_rate": 2.836879432624114e-05, "loss": 3.3211, "num_input_tokens_seen": 176947200, "step": 150 }, { "epoch": 0.01781290550902442, "grad_norm": 4.03125, "learning_rate": 2.8557919621749412e-05, "loss": 3.3483, "num_input_tokens_seen": 178126848, "step": 151 }, { "epoch": 0.01793087177067359, "grad_norm": 2.53125, "learning_rate": 2.8747044917257686e-05, "loss": 3.3391, "num_input_tokens_seen": 179306496, "step": 152 }, { "epoch": 0.018048838032322757, "grad_norm": 5.5625, "learning_rate": 2.8936170212765963e-05, "loss": 3.4719, "num_input_tokens_seen": 180486144, "step": 153 }, { "epoch": 0.018166804293971925, "grad_norm": 3.171875, "learning_rate": 2.9125295508274236e-05, "loss": 3.2727, "num_input_tokens_seen": 181665792, "step": 154 }, { "epoch": 0.018284770555621093, "grad_norm": 5.09375, "learning_rate": 2.9314420803782506e-05, "loss": 3.284, "num_input_tokens_seen": 182845440, "step": 155 }, { "epoch": 0.01840273681727026, "grad_norm": 3.25, "learning_rate": 2.950354609929078e-05, "loss": 3.2279, "num_input_tokens_seen": 184025088, "step": 156 }, { "epoch": 0.01852070307891943, "grad_norm": 3.71875, "learning_rate": 2.9692671394799057e-05, "loss": 3.2438, "num_input_tokens_seen": 185204736, "step": 157 }, { "epoch": 0.018638669340568598, "grad_norm": 3.875, "learning_rate": 2.988179669030733e-05, "loss": 3.3257, "num_input_tokens_seen": 186384384, "step": 158 }, { "epoch": 0.018756635602217766, "grad_norm": 2.21875, "learning_rate": 3.0070921985815604e-05, "loss": 3.2727, "num_input_tokens_seen": 187564032, "step": 159 }, { "epoch": 0.018874601863866934, "grad_norm": 4.125, "learning_rate": 3.0260047281323877e-05, "loss": 3.245, "num_input_tokens_seen": 188743680, "step": 160 }, { "epoch": 0.018992568125516102, "grad_norm": 3.640625, "learning_rate": 3.0449172576832154e-05, "loss": 3.1904, "num_input_tokens_seen": 189923328, "step": 161 }, { "epoch": 0.01911053438716527, "grad_norm": 2.625, "learning_rate": 3.063829787234043e-05, "loss": 3.2754, "num_input_tokens_seen": 191102976, "step": 162 }, { "epoch": 0.01922850064881444, "grad_norm": 3.578125, "learning_rate": 3.0827423167848705e-05, "loss": 3.1889, "num_input_tokens_seen": 192282624, "step": 163 }, { "epoch": 0.019346466910463607, "grad_norm": 2.953125, "learning_rate": 3.1016548463356975e-05, "loss": 3.2809, "num_input_tokens_seen": 193462272, "step": 164 }, { "epoch": 0.019464433172112775, "grad_norm": 2.234375, "learning_rate": 3.120567375886525e-05, "loss": 3.1929, "num_input_tokens_seen": 194641920, "step": 165 }, { "epoch": 0.019582399433761943, "grad_norm": 2.515625, "learning_rate": 3.139479905437353e-05, "loss": 3.1474, "num_input_tokens_seen": 195821568, "step": 166 }, { "epoch": 0.01970036569541111, "grad_norm": 4.09375, "learning_rate": 3.15839243498818e-05, "loss": 3.0433, "num_input_tokens_seen": 197001216, "step": 167 }, { "epoch": 0.01981833195706028, "grad_norm": 2.6875, "learning_rate": 3.1773049645390076e-05, "loss": 3.1527, "num_input_tokens_seen": 198180864, "step": 168 }, { "epoch": 0.019936298218709448, "grad_norm": 3.5, "learning_rate": 3.196217494089835e-05, "loss": 3.2145, "num_input_tokens_seen": 199360512, "step": 169 }, { "epoch": 0.020054264480358616, "grad_norm": 4.9375, "learning_rate": 3.215130023640662e-05, "loss": 3.2642, "num_input_tokens_seen": 200540160, "step": 170 }, { "epoch": 0.020172230742007784, "grad_norm": 2.484375, "learning_rate": 3.234042553191489e-05, "loss": 3.1286, "num_input_tokens_seen": 201719808, "step": 171 }, { "epoch": 0.020290197003656953, "grad_norm": 8.1875, "learning_rate": 3.252955082742317e-05, "loss": 3.1274, "num_input_tokens_seen": 202899456, "step": 172 }, { "epoch": 0.02040816326530612, "grad_norm": 6.375, "learning_rate": 3.271867612293144e-05, "loss": 3.096, "num_input_tokens_seen": 204079104, "step": 173 }, { "epoch": 0.020526129526955292, "grad_norm": 5.8125, "learning_rate": 3.290780141843972e-05, "loss": 3.1647, "num_input_tokens_seen": 205258752, "step": 174 }, { "epoch": 0.02064409578860446, "grad_norm": 5.6875, "learning_rate": 3.3096926713947994e-05, "loss": 3.269, "num_input_tokens_seen": 206438400, "step": 175 }, { "epoch": 0.02076206205025363, "grad_norm": 4.4375, "learning_rate": 3.3286052009456264e-05, "loss": 3.2106, "num_input_tokens_seen": 207618048, "step": 176 }, { "epoch": 0.020880028311902797, "grad_norm": 3.109375, "learning_rate": 3.347517730496454e-05, "loss": 3.1545, "num_input_tokens_seen": 208797696, "step": 177 }, { "epoch": 0.020997994573551965, "grad_norm": 6.6875, "learning_rate": 3.366430260047282e-05, "loss": 3.1045, "num_input_tokens_seen": 209977344, "step": 178 }, { "epoch": 0.021115960835201134, "grad_norm": 6.0625, "learning_rate": 3.385342789598109e-05, "loss": 3.0496, "num_input_tokens_seen": 211156992, "step": 179 }, { "epoch": 0.021233927096850302, "grad_norm": 4.03125, "learning_rate": 3.4042553191489365e-05, "loss": 3.081, "num_input_tokens_seen": 212336640, "step": 180 }, { "epoch": 0.02135189335849947, "grad_norm": 3.921875, "learning_rate": 3.423167848699764e-05, "loss": 3.0552, "num_input_tokens_seen": 213516288, "step": 181 }, { "epoch": 0.021469859620148638, "grad_norm": 4.5, "learning_rate": 3.442080378250591e-05, "loss": 3.0172, "num_input_tokens_seen": 214695936, "step": 182 }, { "epoch": 0.021587825881797806, "grad_norm": 4.0625, "learning_rate": 3.460992907801419e-05, "loss": 3.1379, "num_input_tokens_seen": 215875584, "step": 183 }, { "epoch": 0.021705792143446975, "grad_norm": 4.9375, "learning_rate": 3.4799054373522466e-05, "loss": 3.1235, "num_input_tokens_seen": 217055232, "step": 184 }, { "epoch": 0.021823758405096143, "grad_norm": 3.78125, "learning_rate": 3.4988179669030736e-05, "loss": 3.1189, "num_input_tokens_seen": 218234880, "step": 185 }, { "epoch": 0.02194172466674531, "grad_norm": 5.0625, "learning_rate": 3.517730496453901e-05, "loss": 3.0035, "num_input_tokens_seen": 219414528, "step": 186 }, { "epoch": 0.02205969092839448, "grad_norm": 4.28125, "learning_rate": 3.536643026004728e-05, "loss": 3.0478, "num_input_tokens_seen": 220594176, "step": 187 }, { "epoch": 0.022177657190043647, "grad_norm": 5.8125, "learning_rate": 3.555555555555555e-05, "loss": 3.0777, "num_input_tokens_seen": 221773824, "step": 188 }, { "epoch": 0.022295623451692816, "grad_norm": 5.0625, "learning_rate": 3.574468085106383e-05, "loss": 3.0665, "num_input_tokens_seen": 222953472, "step": 189 }, { "epoch": 0.022413589713341984, "grad_norm": 4.25, "learning_rate": 3.593380614657211e-05, "loss": 3.0271, "num_input_tokens_seen": 224133120, "step": 190 }, { "epoch": 0.022531555974991152, "grad_norm": 4.03125, "learning_rate": 3.612293144208038e-05, "loss": 3.033, "num_input_tokens_seen": 225312768, "step": 191 }, { "epoch": 0.02264952223664032, "grad_norm": 5.09375, "learning_rate": 3.6312056737588654e-05, "loss": 3.143, "num_input_tokens_seen": 226492416, "step": 192 }, { "epoch": 0.02276748849828949, "grad_norm": 4.09375, "learning_rate": 3.650118203309693e-05, "loss": 3.0347, "num_input_tokens_seen": 227672064, "step": 193 }, { "epoch": 0.022885454759938657, "grad_norm": 4.65625, "learning_rate": 3.66903073286052e-05, "loss": 3.07, "num_input_tokens_seen": 228851712, "step": 194 }, { "epoch": 0.023003421021587825, "grad_norm": 3.90625, "learning_rate": 3.687943262411348e-05, "loss": 3.0225, "num_input_tokens_seen": 230031360, "step": 195 }, { "epoch": 0.023121387283236993, "grad_norm": 4.96875, "learning_rate": 3.7068557919621755e-05, "loss": 3.0222, "num_input_tokens_seen": 231211008, "step": 196 }, { "epoch": 0.02323935354488616, "grad_norm": 3.84375, "learning_rate": 3.7257683215130025e-05, "loss": 2.9755, "num_input_tokens_seen": 232390656, "step": 197 }, { "epoch": 0.02335731980653533, "grad_norm": 5.0, "learning_rate": 3.74468085106383e-05, "loss": 3.0163, "num_input_tokens_seen": 233570304, "step": 198 }, { "epoch": 0.023475286068184498, "grad_norm": 4.53125, "learning_rate": 3.763593380614658e-05, "loss": 2.9553, "num_input_tokens_seen": 234749952, "step": 199 }, { "epoch": 0.023593252329833666, "grad_norm": 4.6875, "learning_rate": 3.782505910165485e-05, "loss": 3.015, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023593252329833666, "eval_wikipedia_loss": 2.9615590572357178, "eval_wikipedia_runtime": 172.3085, "eval_wikipedia_samples_per_second": 4.074, "eval_wikipedia_steps_per_second": 0.174, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023593252329833666, "eval_toxicity_loss": 4.73836088180542, "eval_toxicity_runtime": 0.999, "eval_toxicity_samples_per_second": 2.002, "eval_toxicity_steps_per_second": 1.001, "num_input_tokens_seen": 235929600, "step": 200 }, { "epoch": 0.023711218591482838, "grad_norm": 4.0, "learning_rate": 3.8014184397163126e-05, "loss": 2.9448, "num_input_tokens_seen": 237109248, "step": 201 }, { "epoch": 0.023829184853132006, "grad_norm": 4.78125, "learning_rate": 3.82033096926714e-05, "loss": 3.1131, "num_input_tokens_seen": 238288896, "step": 202 }, { "epoch": 0.023947151114781174, "grad_norm": 3.828125, "learning_rate": 3.839243498817967e-05, "loss": 2.9217, "num_input_tokens_seen": 239468544, "step": 203 }, { "epoch": 0.024065117376430342, "grad_norm": 4.46875, "learning_rate": 3.858156028368794e-05, "loss": 2.9658, "num_input_tokens_seen": 240648192, "step": 204 }, { "epoch": 0.02418308363807951, "grad_norm": 3.703125, "learning_rate": 3.877068557919622e-05, "loss": 2.9857, "num_input_tokens_seen": 241827840, "step": 205 }, { "epoch": 0.02430104989972868, "grad_norm": 4.9375, "learning_rate": 3.895981087470449e-05, "loss": 2.9761, "num_input_tokens_seen": 243007488, "step": 206 }, { "epoch": 0.024419016161377847, "grad_norm": 4.28125, "learning_rate": 3.914893617021277e-05, "loss": 2.981, "num_input_tokens_seen": 244187136, "step": 207 }, { "epoch": 0.024536982423027015, "grad_norm": 4.3125, "learning_rate": 3.9338061465721044e-05, "loss": 3.0308, "num_input_tokens_seen": 245366784, "step": 208 }, { "epoch": 0.024654948684676183, "grad_norm": 4.0, "learning_rate": 3.9527186761229314e-05, "loss": 2.9529, "num_input_tokens_seen": 246546432, "step": 209 }, { "epoch": 0.02477291494632535, "grad_norm": 4.28125, "learning_rate": 3.971631205673759e-05, "loss": 2.9297, "num_input_tokens_seen": 247726080, "step": 210 }, { "epoch": 0.02489088120797452, "grad_norm": 3.328125, "learning_rate": 3.990543735224587e-05, "loss": 2.8328, "num_input_tokens_seen": 248905728, "step": 211 }, { "epoch": 0.025008847469623688, "grad_norm": 4.6875, "learning_rate": 4.009456264775414e-05, "loss": 2.9565, "num_input_tokens_seen": 250085376, "step": 212 }, { "epoch": 0.025126813731272856, "grad_norm": 3.734375, "learning_rate": 4.028368794326241e-05, "loss": 2.9276, "num_input_tokens_seen": 251265024, "step": 213 }, { "epoch": 0.025244779992922024, "grad_norm": 4.90625, "learning_rate": 4.0472813238770685e-05, "loss": 2.8872, "num_input_tokens_seen": 252444672, "step": 214 }, { "epoch": 0.025362746254571193, "grad_norm": 3.984375, "learning_rate": 4.066193853427896e-05, "loss": 2.9423, "num_input_tokens_seen": 253624320, "step": 215 }, { "epoch": 0.02548071251622036, "grad_norm": 4.28125, "learning_rate": 4.085106382978723e-05, "loss": 2.9417, "num_input_tokens_seen": 254803968, "step": 216 }, { "epoch": 0.02559867877786953, "grad_norm": 3.53125, "learning_rate": 4.104018912529551e-05, "loss": 2.878, "num_input_tokens_seen": 255983616, "step": 217 }, { "epoch": 0.025716645039518697, "grad_norm": 3.71875, "learning_rate": 4.1229314420803786e-05, "loss": 2.8925, "num_input_tokens_seen": 257163264, "step": 218 }, { "epoch": 0.025834611301167865, "grad_norm": 3.109375, "learning_rate": 4.1418439716312056e-05, "loss": 2.9432, "num_input_tokens_seen": 258342912, "step": 219 }, { "epoch": 0.025952577562817034, "grad_norm": 4.875, "learning_rate": 4.1607565011820333e-05, "loss": 2.8552, "num_input_tokens_seen": 259522560, "step": 220 }, { "epoch": 0.026070543824466202, "grad_norm": 3.09375, "learning_rate": 4.1796690307328604e-05, "loss": 2.8685, "num_input_tokens_seen": 260702208, "step": 221 }, { "epoch": 0.02618851008611537, "grad_norm": 4.6875, "learning_rate": 4.198581560283688e-05, "loss": 2.9284, "num_input_tokens_seen": 261881856, "step": 222 }, { "epoch": 0.026306476347764538, "grad_norm": 3.265625, "learning_rate": 4.217494089834516e-05, "loss": 2.8862, "num_input_tokens_seen": 263061504, "step": 223 }, { "epoch": 0.026424442609413706, "grad_norm": 4.5625, "learning_rate": 4.236406619385343e-05, "loss": 2.8105, "num_input_tokens_seen": 264241152, "step": 224 }, { "epoch": 0.026542408871062875, "grad_norm": 3.546875, "learning_rate": 4.2553191489361704e-05, "loss": 2.9406, "num_input_tokens_seen": 265420800, "step": 225 }, { "epoch": 0.026660375132712043, "grad_norm": 3.640625, "learning_rate": 4.274231678486998e-05, "loss": 2.907, "num_input_tokens_seen": 266600448, "step": 226 }, { "epoch": 0.02677834139436121, "grad_norm": 3.015625, "learning_rate": 4.293144208037825e-05, "loss": 3.0143, "num_input_tokens_seen": 267780096, "step": 227 }, { "epoch": 0.026896307656010383, "grad_norm": 4.25, "learning_rate": 4.312056737588653e-05, "loss": 2.9126, "num_input_tokens_seen": 268959744, "step": 228 }, { "epoch": 0.02701427391765955, "grad_norm": 2.5, "learning_rate": 4.3309692671394805e-05, "loss": 2.8879, "num_input_tokens_seen": 270139392, "step": 229 }, { "epoch": 0.02713224017930872, "grad_norm": 3.34375, "learning_rate": 4.3498817966903076e-05, "loss": 2.8174, "num_input_tokens_seen": 271319040, "step": 230 }, { "epoch": 0.027250206440957887, "grad_norm": 4.40625, "learning_rate": 4.368794326241135e-05, "loss": 2.895, "num_input_tokens_seen": 272498688, "step": 231 }, { "epoch": 0.027368172702607056, "grad_norm": 2.140625, "learning_rate": 4.387706855791963e-05, "loss": 2.8573, "num_input_tokens_seen": 273678336, "step": 232 }, { "epoch": 0.027486138964256224, "grad_norm": 3.609375, "learning_rate": 4.40661938534279e-05, "loss": 2.7662, "num_input_tokens_seen": 274857984, "step": 233 }, { "epoch": 0.027604105225905392, "grad_norm": 3.484375, "learning_rate": 4.4255319148936176e-05, "loss": 2.8103, "num_input_tokens_seen": 276037632, "step": 234 }, { "epoch": 0.02772207148755456, "grad_norm": 1.9765625, "learning_rate": 4.444444444444445e-05, "loss": 2.8271, "num_input_tokens_seen": 277217280, "step": 235 }, { "epoch": 0.02784003774920373, "grad_norm": 3.328125, "learning_rate": 4.4633569739952723e-05, "loss": 2.8881, "num_input_tokens_seen": 278396928, "step": 236 }, { "epoch": 0.027958004010852897, "grad_norm": 3.9375, "learning_rate": 4.4822695035461e-05, "loss": 2.9018, "num_input_tokens_seen": 279576576, "step": 237 }, { "epoch": 0.028075970272502065, "grad_norm": 5.25, "learning_rate": 4.501182033096927e-05, "loss": 2.8584, "num_input_tokens_seen": 280756224, "step": 238 }, { "epoch": 0.028193936534151233, "grad_norm": 2.328125, "learning_rate": 4.520094562647755e-05, "loss": 2.8435, "num_input_tokens_seen": 281935872, "step": 239 }, { "epoch": 0.0283119027958004, "grad_norm": 3.375, "learning_rate": 4.5390070921985824e-05, "loss": 2.8827, "num_input_tokens_seen": 283115520, "step": 240 }, { "epoch": 0.02842986905744957, "grad_norm": 2.3125, "learning_rate": 4.5579196217494095e-05, "loss": 2.8415, "num_input_tokens_seen": 284295168, "step": 241 }, { "epoch": 0.028547835319098738, "grad_norm": 3.828125, "learning_rate": 4.576832151300237e-05, "loss": 2.8305, "num_input_tokens_seen": 285474816, "step": 242 }, { "epoch": 0.028665801580747906, "grad_norm": 2.5625, "learning_rate": 4.595744680851065e-05, "loss": 2.7947, "num_input_tokens_seen": 286654464, "step": 243 }, { "epoch": 0.028783767842397074, "grad_norm": 2.375, "learning_rate": 4.614657210401892e-05, "loss": 2.812, "num_input_tokens_seen": 287834112, "step": 244 }, { "epoch": 0.028901734104046242, "grad_norm": 2.578125, "learning_rate": 4.633569739952719e-05, "loss": 2.8157, "num_input_tokens_seen": 289013760, "step": 245 }, { "epoch": 0.02901970036569541, "grad_norm": 3.625, "learning_rate": 4.652482269503546e-05, "loss": 2.8768, "num_input_tokens_seen": 290193408, "step": 246 }, { "epoch": 0.02913766662734458, "grad_norm": 2.328125, "learning_rate": 4.6713947990543736e-05, "loss": 2.834, "num_input_tokens_seen": 291373056, "step": 247 }, { "epoch": 0.029255632888993747, "grad_norm": 2.1875, "learning_rate": 4.690307328605201e-05, "loss": 2.8125, "num_input_tokens_seen": 292552704, "step": 248 }, { "epoch": 0.029373599150642915, "grad_norm": 2.921875, "learning_rate": 4.709219858156028e-05, "loss": 2.7906, "num_input_tokens_seen": 293732352, "step": 249 }, { "epoch": 0.029491565412292083, "grad_norm": 1.921875, "learning_rate": 4.728132387706856e-05, "loss": 2.8163, "num_input_tokens_seen": 294912000, "step": 250 }, { "epoch": 0.02960953167394125, "grad_norm": 2.75, "learning_rate": 4.747044917257684e-05, "loss": 2.7826, "num_input_tokens_seen": 296091648, "step": 251 }, { "epoch": 0.02972749793559042, "grad_norm": 2.671875, "learning_rate": 4.765957446808511e-05, "loss": 2.8368, "num_input_tokens_seen": 297271296, "step": 252 }, { "epoch": 0.029845464197239588, "grad_norm": 2.796875, "learning_rate": 4.7848699763593384e-05, "loss": 2.7188, "num_input_tokens_seen": 298450944, "step": 253 }, { "epoch": 0.029963430458888756, "grad_norm": 1.984375, "learning_rate": 4.8037825059101654e-05, "loss": 2.7609, "num_input_tokens_seen": 299630592, "step": 254 }, { "epoch": 0.030081396720537928, "grad_norm": 2.484375, "learning_rate": 4.822695035460993e-05, "loss": 2.742, "num_input_tokens_seen": 300810240, "step": 255 }, { "epoch": 0.030199362982187096, "grad_norm": 1.84375, "learning_rate": 4.841607565011821e-05, "loss": 2.7751, "num_input_tokens_seen": 301989888, "step": 256 }, { "epoch": 0.030317329243836264, "grad_norm": 3.078125, "learning_rate": 4.860520094562648e-05, "loss": 2.6949, "num_input_tokens_seen": 303169536, "step": 257 }, { "epoch": 0.030435295505485432, "grad_norm": 3.25, "learning_rate": 4.8794326241134755e-05, "loss": 2.8174, "num_input_tokens_seen": 304349184, "step": 258 }, { "epoch": 0.0305532617671346, "grad_norm": 1.4921875, "learning_rate": 4.898345153664303e-05, "loss": 2.6962, "num_input_tokens_seen": 305528832, "step": 259 }, { "epoch": 0.03067122802878377, "grad_norm": 3.046875, "learning_rate": 4.91725768321513e-05, "loss": 2.8484, "num_input_tokens_seen": 306708480, "step": 260 }, { "epoch": 0.030789194290432937, "grad_norm": 2.78125, "learning_rate": 4.936170212765958e-05, "loss": 2.632, "num_input_tokens_seen": 307888128, "step": 261 }, { "epoch": 0.030907160552082105, "grad_norm": 2.546875, "learning_rate": 4.9550827423167856e-05, "loss": 2.8744, "num_input_tokens_seen": 309067776, "step": 262 }, { "epoch": 0.031025126813731273, "grad_norm": 2.359375, "learning_rate": 4.9739952718676126e-05, "loss": 2.7714, "num_input_tokens_seen": 310247424, "step": 263 }, { "epoch": 0.03114309307538044, "grad_norm": 1.8515625, "learning_rate": 4.99290780141844e-05, "loss": 2.8042, "num_input_tokens_seen": 311427072, "step": 264 }, { "epoch": 0.031261059337029606, "grad_norm": 2.265625, "learning_rate": 5.011820330969268e-05, "loss": 2.7448, "num_input_tokens_seen": 312606720, "step": 265 }, { "epoch": 0.03137902559867878, "grad_norm": 1.78125, "learning_rate": 5.030732860520095e-05, "loss": 2.678, "num_input_tokens_seen": 313786368, "step": 266 }, { "epoch": 0.03149699186032794, "grad_norm": 2.203125, "learning_rate": 5.049645390070923e-05, "loss": 2.676, "num_input_tokens_seen": 314966016, "step": 267 }, { "epoch": 0.031614958121977114, "grad_norm": 1.90625, "learning_rate": 5.0685579196217504e-05, "loss": 2.6778, "num_input_tokens_seen": 316145664, "step": 268 }, { "epoch": 0.031732924383626286, "grad_norm": 2.203125, "learning_rate": 5.0874704491725774e-05, "loss": 2.7624, "num_input_tokens_seen": 317325312, "step": 269 }, { "epoch": 0.03185089064527545, "grad_norm": 3.53125, "learning_rate": 5.106382978723405e-05, "loss": 2.6462, "num_input_tokens_seen": 318504960, "step": 270 }, { "epoch": 0.03196885690692462, "grad_norm": 2.1875, "learning_rate": 5.125295508274232e-05, "loss": 2.6475, "num_input_tokens_seen": 319684608, "step": 271 }, { "epoch": 0.03208682316857379, "grad_norm": 1.703125, "learning_rate": 5.14420803782506e-05, "loss": 2.7214, "num_input_tokens_seen": 320864256, "step": 272 }, { "epoch": 0.03220478943022296, "grad_norm": 2.25, "learning_rate": 5.1631205673758875e-05, "loss": 2.6597, "num_input_tokens_seen": 322043904, "step": 273 }, { "epoch": 0.032322755691872124, "grad_norm": 2.171875, "learning_rate": 5.1820330969267145e-05, "loss": 2.6412, "num_input_tokens_seen": 323223552, "step": 274 }, { "epoch": 0.032440721953521295, "grad_norm": 2.25, "learning_rate": 5.200945626477542e-05, "loss": 2.7664, "num_input_tokens_seen": 324403200, "step": 275 }, { "epoch": 0.03255868821517046, "grad_norm": 3.296875, "learning_rate": 5.21985815602837e-05, "loss": 2.7168, "num_input_tokens_seen": 325582848, "step": 276 }, { "epoch": 0.03267665447681963, "grad_norm": 1.515625, "learning_rate": 5.238770685579196e-05, "loss": 2.698, "num_input_tokens_seen": 326762496, "step": 277 }, { "epoch": 0.0327946207384688, "grad_norm": 1.75, "learning_rate": 5.257683215130024e-05, "loss": 2.7068, "num_input_tokens_seen": 327942144, "step": 278 }, { "epoch": 0.03291258700011797, "grad_norm": 2.859375, "learning_rate": 5.276595744680851e-05, "loss": 2.6487, "num_input_tokens_seen": 329121792, "step": 279 }, { "epoch": 0.03303055326176713, "grad_norm": 2.515625, "learning_rate": 5.2955082742316786e-05, "loss": 2.5925, "num_input_tokens_seen": 330301440, "step": 280 }, { "epoch": 0.033148519523416305, "grad_norm": 2.40625, "learning_rate": 5.314420803782506e-05, "loss": 2.6826, "num_input_tokens_seen": 331481088, "step": 281 }, { "epoch": 0.03326648578506547, "grad_norm": 2.015625, "learning_rate": 5.333333333333333e-05, "loss": 2.6997, "num_input_tokens_seen": 332660736, "step": 282 }, { "epoch": 0.03338445204671464, "grad_norm": 2.234375, "learning_rate": 5.352245862884161e-05, "loss": 2.6378, "num_input_tokens_seen": 333840384, "step": 283 }, { "epoch": 0.033502418308363806, "grad_norm": 1.984375, "learning_rate": 5.371158392434989e-05, "loss": 2.588, "num_input_tokens_seen": 335020032, "step": 284 }, { "epoch": 0.03362038457001298, "grad_norm": 2.546875, "learning_rate": 5.390070921985816e-05, "loss": 2.6051, "num_input_tokens_seen": 336199680, "step": 285 }, { "epoch": 0.03373835083166214, "grad_norm": 1.7421875, "learning_rate": 5.4089834515366434e-05, "loss": 2.6874, "num_input_tokens_seen": 337379328, "step": 286 }, { "epoch": 0.033856317093311314, "grad_norm": 2.09375, "learning_rate": 5.4278959810874704e-05, "loss": 2.7186, "num_input_tokens_seen": 338558976, "step": 287 }, { "epoch": 0.03397428335496048, "grad_norm": 1.796875, "learning_rate": 5.446808510638298e-05, "loss": 2.6698, "num_input_tokens_seen": 339738624, "step": 288 }, { "epoch": 0.03409224961660965, "grad_norm": 2.3125, "learning_rate": 5.465721040189126e-05, "loss": 2.603, "num_input_tokens_seen": 340918272, "step": 289 }, { "epoch": 0.034210215878258815, "grad_norm": 2.140625, "learning_rate": 5.484633569739953e-05, "loss": 2.6479, "num_input_tokens_seen": 342097920, "step": 290 }, { "epoch": 0.03432818213990799, "grad_norm": 2.140625, "learning_rate": 5.5035460992907805e-05, "loss": 2.7037, "num_input_tokens_seen": 343277568, "step": 291 }, { "epoch": 0.03444614840155715, "grad_norm": 1.90625, "learning_rate": 5.522458628841608e-05, "loss": 2.681, "num_input_tokens_seen": 344457216, "step": 292 }, { "epoch": 0.03456411466320632, "grad_norm": 2.046875, "learning_rate": 5.541371158392435e-05, "loss": 2.5947, "num_input_tokens_seen": 345636864, "step": 293 }, { "epoch": 0.03468208092485549, "grad_norm": 1.734375, "learning_rate": 5.560283687943263e-05, "loss": 2.6439, "num_input_tokens_seen": 346816512, "step": 294 }, { "epoch": 0.03480004718650466, "grad_norm": 2.28125, "learning_rate": 5.5791962174940906e-05, "loss": 2.6404, "num_input_tokens_seen": 347996160, "step": 295 }, { "epoch": 0.03491801344815383, "grad_norm": 2.03125, "learning_rate": 5.5981087470449176e-05, "loss": 2.6686, "num_input_tokens_seen": 349175808, "step": 296 }, { "epoch": 0.035035979709802996, "grad_norm": 1.7265625, "learning_rate": 5.617021276595745e-05, "loss": 2.666, "num_input_tokens_seen": 350355456, "step": 297 }, { "epoch": 0.03515394597145217, "grad_norm": 2.109375, "learning_rate": 5.635933806146573e-05, "loss": 2.7266, "num_input_tokens_seen": 351535104, "step": 298 }, { "epoch": 0.03527191223310133, "grad_norm": 2.03125, "learning_rate": 5.6548463356974e-05, "loss": 2.6655, "num_input_tokens_seen": 352714752, "step": 299 }, { "epoch": 0.035389878494750504, "grad_norm": 1.9296875, "learning_rate": 5.673758865248228e-05, "loss": 2.6975, "num_input_tokens_seen": 353894400, "step": 300 }, { "epoch": 0.03550784475639967, "grad_norm": 2.15625, "learning_rate": 5.692671394799055e-05, "loss": 2.6232, "num_input_tokens_seen": 355074048, "step": 301 }, { "epoch": 0.03562581101804884, "grad_norm": 2.140625, "learning_rate": 5.7115839243498824e-05, "loss": 2.6344, "num_input_tokens_seen": 356253696, "step": 302 }, { "epoch": 0.035743777279698005, "grad_norm": 1.7109375, "learning_rate": 5.73049645390071e-05, "loss": 2.5703, "num_input_tokens_seen": 357433344, "step": 303 }, { "epoch": 0.03586174354134718, "grad_norm": 2.125, "learning_rate": 5.749408983451537e-05, "loss": 2.5405, "num_input_tokens_seen": 358612992, "step": 304 }, { "epoch": 0.03597970980299634, "grad_norm": 2.546875, "learning_rate": 5.768321513002365e-05, "loss": 2.6079, "num_input_tokens_seen": 359792640, "step": 305 }, { "epoch": 0.03609767606464551, "grad_norm": 2.015625, "learning_rate": 5.7872340425531925e-05, "loss": 2.6958, "num_input_tokens_seen": 360972288, "step": 306 }, { "epoch": 0.03621564232629468, "grad_norm": 1.46875, "learning_rate": 5.8061465721040195e-05, "loss": 2.5541, "num_input_tokens_seen": 362151936, "step": 307 }, { "epoch": 0.03633360858794385, "grad_norm": 1.796875, "learning_rate": 5.825059101654847e-05, "loss": 2.6427, "num_input_tokens_seen": 363331584, "step": 308 }, { "epoch": 0.036451574849593014, "grad_norm": 2.171875, "learning_rate": 5.843971631205675e-05, "loss": 2.5853, "num_input_tokens_seen": 364511232, "step": 309 }, { "epoch": 0.036569541111242186, "grad_norm": 1.8046875, "learning_rate": 5.862884160756501e-05, "loss": 2.5426, "num_input_tokens_seen": 365690880, "step": 310 }, { "epoch": 0.03668750737289135, "grad_norm": 1.625, "learning_rate": 5.881796690307329e-05, "loss": 2.6835, "num_input_tokens_seen": 366870528, "step": 311 }, { "epoch": 0.03680547363454052, "grad_norm": 2.640625, "learning_rate": 5.900709219858156e-05, "loss": 2.626, "num_input_tokens_seen": 368050176, "step": 312 }, { "epoch": 0.03692343989618969, "grad_norm": 1.4765625, "learning_rate": 5.9196217494089836e-05, "loss": 2.5361, "num_input_tokens_seen": 369229824, "step": 313 }, { "epoch": 0.03704140615783886, "grad_norm": 3.03125, "learning_rate": 5.938534278959811e-05, "loss": 2.5762, "num_input_tokens_seen": 370409472, "step": 314 }, { "epoch": 0.037159372419488024, "grad_norm": 1.890625, "learning_rate": 5.9574468085106384e-05, "loss": 2.5874, "num_input_tokens_seen": 371589120, "step": 315 }, { "epoch": 0.037277338681137195, "grad_norm": 2.6875, "learning_rate": 5.976359338061466e-05, "loss": 2.5874, "num_input_tokens_seen": 372768768, "step": 316 }, { "epoch": 0.03739530494278636, "grad_norm": 1.8359375, "learning_rate": 5.995271867612294e-05, "loss": 2.4888, "num_input_tokens_seen": 373948416, "step": 317 }, { "epoch": 0.03751327120443553, "grad_norm": 2.25, "learning_rate": 6.014184397163121e-05, "loss": 2.5391, "num_input_tokens_seen": 375128064, "step": 318 }, { "epoch": 0.0376312374660847, "grad_norm": 1.890625, "learning_rate": 6.0330969267139484e-05, "loss": 2.6462, "num_input_tokens_seen": 376307712, "step": 319 }, { "epoch": 0.03774920372773387, "grad_norm": 1.875, "learning_rate": 6.0520094562647755e-05, "loss": 2.4726, "num_input_tokens_seen": 377487360, "step": 320 }, { "epoch": 0.03786716998938304, "grad_norm": 2.296875, "learning_rate": 6.070921985815603e-05, "loss": 2.5759, "num_input_tokens_seen": 378667008, "step": 321 }, { "epoch": 0.037985136251032205, "grad_norm": 2.703125, "learning_rate": 6.089834515366431e-05, "loss": 2.5199, "num_input_tokens_seen": 379846656, "step": 322 }, { "epoch": 0.038103102512681376, "grad_norm": 2.390625, "learning_rate": 6.108747044917259e-05, "loss": 2.6243, "num_input_tokens_seen": 381026304, "step": 323 }, { "epoch": 0.03822106877433054, "grad_norm": 1.65625, "learning_rate": 6.127659574468086e-05, "loss": 2.5922, "num_input_tokens_seen": 382205952, "step": 324 }, { "epoch": 0.03833903503597971, "grad_norm": 1.640625, "learning_rate": 6.146572104018913e-05, "loss": 2.5376, "num_input_tokens_seen": 383385600, "step": 325 }, { "epoch": 0.03845700129762888, "grad_norm": 2.03125, "learning_rate": 6.165484633569741e-05, "loss": 2.547, "num_input_tokens_seen": 384565248, "step": 326 }, { "epoch": 0.03857496755927805, "grad_norm": 2.765625, "learning_rate": 6.184397163120568e-05, "loss": 2.5095, "num_input_tokens_seen": 385744896, "step": 327 }, { "epoch": 0.038692933820927214, "grad_norm": 1.40625, "learning_rate": 6.203309692671395e-05, "loss": 2.5431, "num_input_tokens_seen": 386924544, "step": 328 }, { "epoch": 0.038810900082576386, "grad_norm": 2.203125, "learning_rate": 6.222222222222223e-05, "loss": 2.5744, "num_input_tokens_seen": 388104192, "step": 329 }, { "epoch": 0.03892886634422555, "grad_norm": 1.9921875, "learning_rate": 6.24113475177305e-05, "loss": 2.6511, "num_input_tokens_seen": 389283840, "step": 330 }, { "epoch": 0.03904683260587472, "grad_norm": 3.15625, "learning_rate": 6.260047281323877e-05, "loss": 2.5556, "num_input_tokens_seen": 390463488, "step": 331 }, { "epoch": 0.03916479886752389, "grad_norm": 1.953125, "learning_rate": 6.278959810874706e-05, "loss": 2.5616, "num_input_tokens_seen": 391643136, "step": 332 }, { "epoch": 0.03928276512917306, "grad_norm": 1.6484375, "learning_rate": 6.297872340425533e-05, "loss": 2.5747, "num_input_tokens_seen": 392822784, "step": 333 }, { "epoch": 0.03940073139082222, "grad_norm": 3.15625, "learning_rate": 6.31678486997636e-05, "loss": 2.4747, "num_input_tokens_seen": 394002432, "step": 334 }, { "epoch": 0.039518697652471395, "grad_norm": 1.765625, "learning_rate": 6.335697399527188e-05, "loss": 2.5124, "num_input_tokens_seen": 395182080, "step": 335 }, { "epoch": 0.03963666391412056, "grad_norm": 2.15625, "learning_rate": 6.354609929078015e-05, "loss": 2.552, "num_input_tokens_seen": 396361728, "step": 336 }, { "epoch": 0.03975463017576973, "grad_norm": 1.8125, "learning_rate": 6.373522458628842e-05, "loss": 2.4541, "num_input_tokens_seen": 397541376, "step": 337 }, { "epoch": 0.039872596437418896, "grad_norm": 3.15625, "learning_rate": 6.39243498817967e-05, "loss": 2.4496, "num_input_tokens_seen": 398721024, "step": 338 }, { "epoch": 0.03999056269906807, "grad_norm": 2.015625, "learning_rate": 6.411347517730498e-05, "loss": 2.5182, "num_input_tokens_seen": 399900672, "step": 339 }, { "epoch": 0.04010852896071723, "grad_norm": 1.8671875, "learning_rate": 6.430260047281325e-05, "loss": 2.5118, "num_input_tokens_seen": 401080320, "step": 340 }, { "epoch": 0.040226495222366404, "grad_norm": 4.46875, "learning_rate": 6.449172576832153e-05, "loss": 2.5161, "num_input_tokens_seen": 402259968, "step": 341 }, { "epoch": 0.04034446148401557, "grad_norm": 1.8828125, "learning_rate": 6.468085106382979e-05, "loss": 2.5411, "num_input_tokens_seen": 403439616, "step": 342 }, { "epoch": 0.04046242774566474, "grad_norm": 6.53125, "learning_rate": 6.486997635933806e-05, "loss": 2.5341, "num_input_tokens_seen": 404619264, "step": 343 }, { "epoch": 0.040580394007313905, "grad_norm": 5.0, "learning_rate": 6.505910165484634e-05, "loss": 2.5139, "num_input_tokens_seen": 405798912, "step": 344 }, { "epoch": 0.04069836026896308, "grad_norm": 6.9375, "learning_rate": 6.524822695035461e-05, "loss": 2.542, "num_input_tokens_seen": 406978560, "step": 345 }, { "epoch": 0.04081632653061224, "grad_norm": 6.625, "learning_rate": 6.543735224586288e-05, "loss": 2.6481, "num_input_tokens_seen": 408158208, "step": 346 }, { "epoch": 0.04093429279226141, "grad_norm": 1.9375, "learning_rate": 6.562647754137116e-05, "loss": 2.5413, "num_input_tokens_seen": 409337856, "step": 347 }, { "epoch": 0.041052259053910585, "grad_norm": 3.25, "learning_rate": 6.581560283687943e-05, "loss": 2.5869, "num_input_tokens_seen": 410517504, "step": 348 }, { "epoch": 0.04117022531555975, "grad_norm": 2.171875, "learning_rate": 6.60047281323877e-05, "loss": 2.5717, "num_input_tokens_seen": 411697152, "step": 349 }, { "epoch": 0.04128819157720892, "grad_norm": 2.296875, "learning_rate": 6.619385342789599e-05, "loss": 2.569, "num_input_tokens_seen": 412876800, "step": 350 }, { "epoch": 0.041406157838858086, "grad_norm": 2.53125, "learning_rate": 6.638297872340426e-05, "loss": 2.5656, "num_input_tokens_seen": 414056448, "step": 351 }, { "epoch": 0.04152412410050726, "grad_norm": 1.5625, "learning_rate": 6.657210401891253e-05, "loss": 2.4558, "num_input_tokens_seen": 415236096, "step": 352 }, { "epoch": 0.04164209036215642, "grad_norm": 3.046875, "learning_rate": 6.676122931442081e-05, "loss": 2.5107, "num_input_tokens_seen": 416415744, "step": 353 }, { "epoch": 0.041760056623805594, "grad_norm": 2.03125, "learning_rate": 6.695035460992908e-05, "loss": 2.5576, "num_input_tokens_seen": 417595392, "step": 354 }, { "epoch": 0.04187802288545476, "grad_norm": 2.015625, "learning_rate": 6.713947990543735e-05, "loss": 2.5029, "num_input_tokens_seen": 418775040, "step": 355 }, { "epoch": 0.04199598914710393, "grad_norm": 1.84375, "learning_rate": 6.732860520094564e-05, "loss": 2.547, "num_input_tokens_seen": 419954688, "step": 356 }, { "epoch": 0.042113955408753095, "grad_norm": 1.9453125, "learning_rate": 6.75177304964539e-05, "loss": 2.4942, "num_input_tokens_seen": 421134336, "step": 357 }, { "epoch": 0.04223192167040227, "grad_norm": 2.109375, "learning_rate": 6.770685579196218e-05, "loss": 2.4546, "num_input_tokens_seen": 422313984, "step": 358 }, { "epoch": 0.04234988793205143, "grad_norm": 1.875, "learning_rate": 6.789598108747046e-05, "loss": 2.6115, "num_input_tokens_seen": 423493632, "step": 359 }, { "epoch": 0.042467854193700603, "grad_norm": 1.5, "learning_rate": 6.808510638297873e-05, "loss": 2.4977, "num_input_tokens_seen": 424673280, "step": 360 }, { "epoch": 0.04258582045534977, "grad_norm": 1.84375, "learning_rate": 6.8274231678487e-05, "loss": 2.4208, "num_input_tokens_seen": 425852928, "step": 361 }, { "epoch": 0.04270378671699894, "grad_norm": 1.421875, "learning_rate": 6.846335697399528e-05, "loss": 2.5416, "num_input_tokens_seen": 427032576, "step": 362 }, { "epoch": 0.042821752978648105, "grad_norm": 1.78125, "learning_rate": 6.865248226950355e-05, "loss": 2.4938, "num_input_tokens_seen": 428212224, "step": 363 }, { "epoch": 0.042939719240297276, "grad_norm": 1.5859375, "learning_rate": 6.884160756501182e-05, "loss": 2.5203, "num_input_tokens_seen": 429391872, "step": 364 }, { "epoch": 0.04305768550194644, "grad_norm": 1.9296875, "learning_rate": 6.903073286052011e-05, "loss": 2.515, "num_input_tokens_seen": 430571520, "step": 365 }, { "epoch": 0.04317565176359561, "grad_norm": 2.25, "learning_rate": 6.921985815602838e-05, "loss": 2.5211, "num_input_tokens_seen": 431751168, "step": 366 }, { "epoch": 0.04329361802524478, "grad_norm": 1.796875, "learning_rate": 6.940898345153665e-05, "loss": 2.4887, "num_input_tokens_seen": 432930816, "step": 367 }, { "epoch": 0.04341158428689395, "grad_norm": 1.5234375, "learning_rate": 6.959810874704493e-05, "loss": 2.4896, "num_input_tokens_seen": 434110464, "step": 368 }, { "epoch": 0.043529550548543114, "grad_norm": 1.6171875, "learning_rate": 6.97872340425532e-05, "loss": 2.4172, "num_input_tokens_seen": 435290112, "step": 369 }, { "epoch": 0.043647516810192286, "grad_norm": 2.1875, "learning_rate": 6.997635933806147e-05, "loss": 2.4538, "num_input_tokens_seen": 436469760, "step": 370 }, { "epoch": 0.04376548307184145, "grad_norm": 2.015625, "learning_rate": 7.016548463356976e-05, "loss": 2.4439, "num_input_tokens_seen": 437649408, "step": 371 }, { "epoch": 0.04388344933349062, "grad_norm": 1.828125, "learning_rate": 7.035460992907803e-05, "loss": 2.4898, "num_input_tokens_seen": 438829056, "step": 372 }, { "epoch": 0.04400141559513979, "grad_norm": 1.6171875, "learning_rate": 7.05437352245863e-05, "loss": 2.4337, "num_input_tokens_seen": 440008704, "step": 373 }, { "epoch": 0.04411938185678896, "grad_norm": 1.53125, "learning_rate": 7.073286052009457e-05, "loss": 2.4729, "num_input_tokens_seen": 441188352, "step": 374 }, { "epoch": 0.04423734811843813, "grad_norm": 1.9765625, "learning_rate": 7.092198581560284e-05, "loss": 2.4801, "num_input_tokens_seen": 442368000, "step": 375 }, { "epoch": 0.044355314380087295, "grad_norm": 2.0, "learning_rate": 7.11111111111111e-05, "loss": 2.5573, "num_input_tokens_seen": 443547648, "step": 376 }, { "epoch": 0.044473280641736467, "grad_norm": 1.640625, "learning_rate": 7.130023640661939e-05, "loss": 2.4675, "num_input_tokens_seen": 444727296, "step": 377 }, { "epoch": 0.04459124690338563, "grad_norm": 1.8984375, "learning_rate": 7.148936170212766e-05, "loss": 2.4965, "num_input_tokens_seen": 445906944, "step": 378 }, { "epoch": 0.0447092131650348, "grad_norm": 1.2578125, "learning_rate": 7.167848699763593e-05, "loss": 2.4161, "num_input_tokens_seen": 447086592, "step": 379 }, { "epoch": 0.04482717942668397, "grad_norm": 1.8359375, "learning_rate": 7.186761229314421e-05, "loss": 2.4437, "num_input_tokens_seen": 448266240, "step": 380 }, { "epoch": 0.04494514568833314, "grad_norm": 1.8046875, "learning_rate": 7.205673758865248e-05, "loss": 2.4978, "num_input_tokens_seen": 449445888, "step": 381 }, { "epoch": 0.045063111949982304, "grad_norm": 1.53125, "learning_rate": 7.224586288416075e-05, "loss": 2.4316, "num_input_tokens_seen": 450625536, "step": 382 }, { "epoch": 0.045181078211631476, "grad_norm": 2.0625, "learning_rate": 7.243498817966904e-05, "loss": 2.5152, "num_input_tokens_seen": 451805184, "step": 383 }, { "epoch": 0.04529904447328064, "grad_norm": 1.65625, "learning_rate": 7.262411347517731e-05, "loss": 2.4161, "num_input_tokens_seen": 452984832, "step": 384 }, { "epoch": 0.04541701073492981, "grad_norm": 1.8515625, "learning_rate": 7.281323877068558e-05, "loss": 2.4274, "num_input_tokens_seen": 454164480, "step": 385 }, { "epoch": 0.04553497699657898, "grad_norm": 2.0625, "learning_rate": 7.300236406619386e-05, "loss": 2.4693, "num_input_tokens_seen": 455344128, "step": 386 }, { "epoch": 0.04565294325822815, "grad_norm": 1.765625, "learning_rate": 7.319148936170213e-05, "loss": 2.4907, "num_input_tokens_seen": 456523776, "step": 387 }, { "epoch": 0.04577090951987731, "grad_norm": 1.71875, "learning_rate": 7.33806146572104e-05, "loss": 2.4603, "num_input_tokens_seen": 457703424, "step": 388 }, { "epoch": 0.045888875781526485, "grad_norm": 1.7734375, "learning_rate": 7.356973995271869e-05, "loss": 2.4651, "num_input_tokens_seen": 458883072, "step": 389 }, { "epoch": 0.04600684204317565, "grad_norm": 1.9765625, "learning_rate": 7.375886524822696e-05, "loss": 2.4041, "num_input_tokens_seen": 460062720, "step": 390 }, { "epoch": 0.04612480830482482, "grad_norm": 2.90625, "learning_rate": 7.394799054373523e-05, "loss": 2.3896, "num_input_tokens_seen": 461242368, "step": 391 }, { "epoch": 0.046242774566473986, "grad_norm": 1.515625, "learning_rate": 7.413711583924351e-05, "loss": 2.3966, "num_input_tokens_seen": 462422016, "step": 392 }, { "epoch": 0.04636074082812316, "grad_norm": 1.984375, "learning_rate": 7.432624113475178e-05, "loss": 2.5009, "num_input_tokens_seen": 463601664, "step": 393 }, { "epoch": 0.04647870708977232, "grad_norm": 1.8984375, "learning_rate": 7.451536643026005e-05, "loss": 2.4524, "num_input_tokens_seen": 464781312, "step": 394 }, { "epoch": 0.046596673351421494, "grad_norm": 2.46875, "learning_rate": 7.470449172576833e-05, "loss": 2.4169, "num_input_tokens_seen": 465960960, "step": 395 }, { "epoch": 0.04671463961307066, "grad_norm": 1.609375, "learning_rate": 7.48936170212766e-05, "loss": 2.5313, "num_input_tokens_seen": 467140608, "step": 396 }, { "epoch": 0.04683260587471983, "grad_norm": 1.7890625, "learning_rate": 7.508274231678487e-05, "loss": 2.5893, "num_input_tokens_seen": 468320256, "step": 397 }, { "epoch": 0.046950572136368995, "grad_norm": 1.9453125, "learning_rate": 7.527186761229316e-05, "loss": 2.453, "num_input_tokens_seen": 469499904, "step": 398 }, { "epoch": 0.04706853839801817, "grad_norm": 1.78125, "learning_rate": 7.546099290780143e-05, "loss": 2.4462, "num_input_tokens_seen": 470679552, "step": 399 }, { "epoch": 0.04718650465966733, "grad_norm": 2.140625, "learning_rate": 7.56501182033097e-05, "loss": 2.5393, "num_input_tokens_seen": 471859200, "step": 400 }, { "epoch": 0.04718650465966733, "eval_wikipedia_loss": 2.4841043949127197, "eval_wikipedia_runtime": 173.8643, "eval_wikipedia_samples_per_second": 4.038, "eval_wikipedia_steps_per_second": 0.173, "num_input_tokens_seen": 471859200, "step": 400 }, { "epoch": 0.04718650465966733, "eval_toxicity_loss": 4.2398834228515625, "eval_toxicity_runtime": 0.9943, "eval_toxicity_samples_per_second": 2.011, "eval_toxicity_steps_per_second": 1.006, "num_input_tokens_seen": 471859200, "step": 400 } ], "logging_steps": 1, "max_steps": 8477, "num_input_tokens_seen": 471859200, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.731724480321946e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }