|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.023593252329833666, |
|
"eval_steps": 200, |
|
"global_step": 800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00011796626164916834, |
|
"grad_norm": 93.0, |
|
"learning_rate": 1.8912529550827425e-07, |
|
"loss": 7.9641, |
|
"num_input_tokens_seen": 1179648, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0002359325232983367, |
|
"grad_norm": 95.0, |
|
"learning_rate": 3.782505910165485e-07, |
|
"loss": 7.9866, |
|
"num_input_tokens_seen": 2359296, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.000353898784947505, |
|
"grad_norm": 97.5, |
|
"learning_rate": 5.673758865248227e-07, |
|
"loss": 7.983, |
|
"num_input_tokens_seen": 3538944, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0004718650465966734, |
|
"grad_norm": 92.0, |
|
"learning_rate": 7.56501182033097e-07, |
|
"loss": 7.9018, |
|
"num_input_tokens_seen": 4718592, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0005898313082458417, |
|
"grad_norm": 102.5, |
|
"learning_rate": 9.456264775413712e-07, |
|
"loss": 8.0784, |
|
"num_input_tokens_seen": 5898240, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00070779756989501, |
|
"grad_norm": 95.0, |
|
"learning_rate": 1.1347517730496454e-06, |
|
"loss": 7.9578, |
|
"num_input_tokens_seen": 7077888, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0008257638315441783, |
|
"grad_norm": 93.5, |
|
"learning_rate": 1.3238770685579196e-06, |
|
"loss": 7.975, |
|
"num_input_tokens_seen": 8257536, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0009437300931933467, |
|
"grad_norm": 85.0, |
|
"learning_rate": 1.513002364066194e-06, |
|
"loss": 7.8405, |
|
"num_input_tokens_seen": 9437184, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.001061696354842515, |
|
"grad_norm": 83.0, |
|
"learning_rate": 1.7021276595744682e-06, |
|
"loss": 7.841, |
|
"num_input_tokens_seen": 10616832, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0011796626164916834, |
|
"grad_norm": 82.5, |
|
"learning_rate": 1.8912529550827423e-06, |
|
"loss": 7.9491, |
|
"num_input_tokens_seen": 11796480, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0012976288781408518, |
|
"grad_norm": 68.0, |
|
"learning_rate": 2.0803782505910165e-06, |
|
"loss": 7.5612, |
|
"num_input_tokens_seen": 12976128, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00141559513979002, |
|
"grad_norm": 66.5, |
|
"learning_rate": 2.269503546099291e-06, |
|
"loss": 7.5808, |
|
"num_input_tokens_seen": 14155776, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0015335614014391884, |
|
"grad_norm": 57.5, |
|
"learning_rate": 2.4586288416075653e-06, |
|
"loss": 7.4494, |
|
"num_input_tokens_seen": 15335424, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0016515276630883566, |
|
"grad_norm": 51.25, |
|
"learning_rate": 2.6477541371158392e-06, |
|
"loss": 7.4606, |
|
"num_input_tokens_seen": 16515072, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.001769493924737525, |
|
"grad_norm": 42.75, |
|
"learning_rate": 2.836879432624114e-06, |
|
"loss": 7.2804, |
|
"num_input_tokens_seen": 17694720, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0018874601863866935, |
|
"grad_norm": 38.0, |
|
"learning_rate": 3.026004728132388e-06, |
|
"loss": 7.2173, |
|
"num_input_tokens_seen": 18874368, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0020054264480358617, |
|
"grad_norm": 33.75, |
|
"learning_rate": 3.2151300236406624e-06, |
|
"loss": 7.0934, |
|
"num_input_tokens_seen": 20054016, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.00212339270968503, |
|
"grad_norm": 30.5, |
|
"learning_rate": 3.4042553191489363e-06, |
|
"loss": 7.1284, |
|
"num_input_tokens_seen": 21233664, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0022413589713341986, |
|
"grad_norm": 26.5, |
|
"learning_rate": 3.5933806146572107e-06, |
|
"loss": 6.932, |
|
"num_input_tokens_seen": 22413312, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0023593252329833668, |
|
"grad_norm": 23.875, |
|
"learning_rate": 3.7825059101654847e-06, |
|
"loss": 6.9163, |
|
"num_input_tokens_seen": 23592960, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002477291494632535, |
|
"grad_norm": 21.625, |
|
"learning_rate": 3.9716312056737595e-06, |
|
"loss": 6.8335, |
|
"num_input_tokens_seen": 24772608, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0025952577562817036, |
|
"grad_norm": 18.625, |
|
"learning_rate": 4.160756501182033e-06, |
|
"loss": 6.6761, |
|
"num_input_tokens_seen": 25952256, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.002713224017930872, |
|
"grad_norm": 16.0, |
|
"learning_rate": 4.349881796690308e-06, |
|
"loss": 6.6474, |
|
"num_input_tokens_seen": 27131904, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.00283119027958004, |
|
"grad_norm": 18.0, |
|
"learning_rate": 4.539007092198582e-06, |
|
"loss": 6.6062, |
|
"num_input_tokens_seen": 28311552, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0029491565412292082, |
|
"grad_norm": 17.5, |
|
"learning_rate": 4.728132387706856e-06, |
|
"loss": 6.6435, |
|
"num_input_tokens_seen": 29491200, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.003067122802878377, |
|
"grad_norm": 14.25, |
|
"learning_rate": 4.9172576832151305e-06, |
|
"loss": 6.57, |
|
"num_input_tokens_seen": 30670848, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.003185089064527545, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 5.106382978723404e-06, |
|
"loss": 6.4037, |
|
"num_input_tokens_seen": 31850496, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0033030553261767133, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 5.2955082742316784e-06, |
|
"loss": 6.2927, |
|
"num_input_tokens_seen": 33030144, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.003421021587825882, |
|
"grad_norm": 9.75, |
|
"learning_rate": 5.484633569739954e-06, |
|
"loss": 6.2084, |
|
"num_input_tokens_seen": 34209792, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.00353898784947505, |
|
"grad_norm": 10.25, |
|
"learning_rate": 5.673758865248228e-06, |
|
"loss": 6.2349, |
|
"num_input_tokens_seen": 35389440, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0036569541111242184, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 5.862884160756502e-06, |
|
"loss": 6.155, |
|
"num_input_tokens_seen": 36569088, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.003774920372773387, |
|
"grad_norm": 10.0, |
|
"learning_rate": 6.052009456264776e-06, |
|
"loss": 6.1846, |
|
"num_input_tokens_seen": 37748736, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.003892886634422555, |
|
"grad_norm": 8.0, |
|
"learning_rate": 6.24113475177305e-06, |
|
"loss": 6.0656, |
|
"num_input_tokens_seen": 38928384, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.004010852896071723, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 6.430260047281325e-06, |
|
"loss": 5.9873, |
|
"num_input_tokens_seen": 40108032, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.004128819157720892, |
|
"grad_norm": 8.875, |
|
"learning_rate": 6.619385342789598e-06, |
|
"loss": 6.0008, |
|
"num_input_tokens_seen": 41287680, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.00424678541937006, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 6.808510638297873e-06, |
|
"loss": 6.0852, |
|
"num_input_tokens_seen": 42467328, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.004364751681019229, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 6.997635933806147e-06, |
|
"loss": 5.8264, |
|
"num_input_tokens_seen": 43646976, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.004482717942668397, |
|
"grad_norm": 6.75, |
|
"learning_rate": 7.186761229314421e-06, |
|
"loss": 5.9511, |
|
"num_input_tokens_seen": 44826624, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.004600684204317565, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 7.375886524822695e-06, |
|
"loss": 5.8042, |
|
"num_input_tokens_seen": 46006272, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0047186504659667335, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 7.565011820330969e-06, |
|
"loss": 5.8955, |
|
"num_input_tokens_seen": 47185920, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004836616727615902, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 7.754137115839244e-06, |
|
"loss": 5.7764, |
|
"num_input_tokens_seen": 48365568, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.00495458298926507, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 7.943262411347519e-06, |
|
"loss": 5.7784, |
|
"num_input_tokens_seen": 49545216, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.005072549250914238, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 8.132387706855792e-06, |
|
"loss": 5.7388, |
|
"num_input_tokens_seen": 50724864, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.005190515512563407, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 8.321513002364066e-06, |
|
"loss": 5.7821, |
|
"num_input_tokens_seen": 51904512, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0053084817742125754, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 5.6466, |
|
"num_input_tokens_seen": 53084160, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.005426448035861744, |
|
"grad_norm": 4.125, |
|
"learning_rate": 8.699763593380616e-06, |
|
"loss": 5.6224, |
|
"num_input_tokens_seen": 54263808, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.005544414297510912, |
|
"grad_norm": 5.0, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 5.9019, |
|
"num_input_tokens_seen": 55443456, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.00566238055916008, |
|
"grad_norm": 4.0, |
|
"learning_rate": 9.078014184397164e-06, |
|
"loss": 5.5812, |
|
"num_input_tokens_seen": 56623104, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.005780346820809248, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 9.267139479905439e-06, |
|
"loss": 5.633, |
|
"num_input_tokens_seen": 57802752, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0058983130824584165, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 9.456264775413712e-06, |
|
"loss": 5.5031, |
|
"num_input_tokens_seen": 58982400, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0060162793441075856, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.645390070921986e-06, |
|
"loss": 5.5146, |
|
"num_input_tokens_seen": 60162048, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.006134245605756754, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 9.834515366430261e-06, |
|
"loss": 5.3805, |
|
"num_input_tokens_seen": 61341696, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.006252211867405922, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.0023640661938535e-05, |
|
"loss": 5.4098, |
|
"num_input_tokens_seen": 62521344, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.00637017812905509, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.0212765957446808e-05, |
|
"loss": 5.3773, |
|
"num_input_tokens_seen": 63700992, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.006488144390704258, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.0401891252955083e-05, |
|
"loss": 5.3287, |
|
"num_input_tokens_seen": 64880640, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.006606110652353427, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 1.0591016548463357e-05, |
|
"loss": 5.2282, |
|
"num_input_tokens_seen": 66060288, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.006724076914002596, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.0780141843971632e-05, |
|
"loss": 5.2967, |
|
"num_input_tokens_seen": 67239936, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.006842043175651764, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.0969267139479907e-05, |
|
"loss": 5.1126, |
|
"num_input_tokens_seen": 68419584, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.006960009437300932, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.1158392434988181e-05, |
|
"loss": 5.3309, |
|
"num_input_tokens_seen": 69599232, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0070779756989501, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.1347517730496456e-05, |
|
"loss": 5.2134, |
|
"num_input_tokens_seen": 70778880, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0071959419605992685, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.153664302600473e-05, |
|
"loss": 5.1972, |
|
"num_input_tokens_seen": 71958528, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.007313908222248437, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.1725768321513003e-05, |
|
"loss": 5.0123, |
|
"num_input_tokens_seen": 73138176, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.007431874483897605, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.1914893617021277e-05, |
|
"loss": 5.1108, |
|
"num_input_tokens_seen": 74317824, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.007549840745546774, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.2104018912529552e-05, |
|
"loss": 5.2287, |
|
"num_input_tokens_seen": 75497472, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.007667807007195942, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.2293144208037825e-05, |
|
"loss": 5.0902, |
|
"num_input_tokens_seen": 76677120, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00778577326884511, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.24822695035461e-05, |
|
"loss": 5.1526, |
|
"num_input_tokens_seen": 77856768, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.007903739530494279, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.2671394799054376e-05, |
|
"loss": 4.8855, |
|
"num_input_tokens_seen": 79036416, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.008021705792143447, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.286052009456265e-05, |
|
"loss": 5.0226, |
|
"num_input_tokens_seen": 80216064, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.008139672053792615, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.3049645390070925e-05, |
|
"loss": 5.1366, |
|
"num_input_tokens_seen": 81395712, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.008257638315441783, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.3238770685579197e-05, |
|
"loss": 5.0749, |
|
"num_input_tokens_seen": 82575360, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.008375604577090951, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.3427895981087472e-05, |
|
"loss": 4.8899, |
|
"num_input_tokens_seen": 83755008, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.00849357083874012, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.3617021276595745e-05, |
|
"loss": 4.8809, |
|
"num_input_tokens_seen": 84934656, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.008611537100389288, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.380614657210402e-05, |
|
"loss": 4.8839, |
|
"num_input_tokens_seen": 86114304, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.008729503362038458, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.3995271867612294e-05, |
|
"loss": 4.9931, |
|
"num_input_tokens_seen": 87293952, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.008847469623687626, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.418439716312057e-05, |
|
"loss": 4.8041, |
|
"num_input_tokens_seen": 88473600, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.008965435885336794, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.4373522458628843e-05, |
|
"loss": 4.8611, |
|
"num_input_tokens_seen": 89653248, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.009083402146985962, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.4562647754137118e-05, |
|
"loss": 4.7747, |
|
"num_input_tokens_seen": 90832896, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.00920136840863513, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.475177304964539e-05, |
|
"loss": 4.6884, |
|
"num_input_tokens_seen": 92012544, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.009319334670284299, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4940898345153665e-05, |
|
"loss": 4.6617, |
|
"num_input_tokens_seen": 93192192, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.009437300931933467, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.5130023640661939e-05, |
|
"loss": 4.6174, |
|
"num_input_tokens_seen": 94371840, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.009555267193582635, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.5319148936170214e-05, |
|
"loss": 4.5483, |
|
"num_input_tokens_seen": 95551488, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.009673233455231803, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.5508274231678487e-05, |
|
"loss": 4.604, |
|
"num_input_tokens_seen": 96731136, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.009791199716880972, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.5697399527186764e-05, |
|
"loss": 4.5806, |
|
"num_input_tokens_seen": 97910784, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.00990916597853014, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.5886524822695038e-05, |
|
"loss": 4.4723, |
|
"num_input_tokens_seen": 99090432, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.010027132240179308, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.607565011820331e-05, |
|
"loss": 4.4723, |
|
"num_input_tokens_seen": 100270080, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.010145098501828476, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.6264775413711585e-05, |
|
"loss": 4.4519, |
|
"num_input_tokens_seen": 101449728, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.010263064763477646, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.645390070921986e-05, |
|
"loss": 4.4077, |
|
"num_input_tokens_seen": 102629376, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.010381031025126814, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.6643026004728132e-05, |
|
"loss": 4.5295, |
|
"num_input_tokens_seen": 103809024, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.010498997286775983, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.683215130023641e-05, |
|
"loss": 4.3235, |
|
"num_input_tokens_seen": 104988672, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.010616963548425151, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 4.3245, |
|
"num_input_tokens_seen": 106168320, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.010734929810074319, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.7210401891252956e-05, |
|
"loss": 4.2554, |
|
"num_input_tokens_seen": 107347968, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.010852896071723487, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.7399527186761233e-05, |
|
"loss": 4.2572, |
|
"num_input_tokens_seen": 108527616, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.010970862333372655, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.7588652482269506e-05, |
|
"loss": 4.2442, |
|
"num_input_tokens_seen": 109707264, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.011088828595021824, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 4.3006, |
|
"num_input_tokens_seen": 110886912, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.011206794856670992, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.7966903073286054e-05, |
|
"loss": 4.2813, |
|
"num_input_tokens_seen": 112066560, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.01132476111832016, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.8156028368794327e-05, |
|
"loss": 4.1186, |
|
"num_input_tokens_seen": 113246208, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.011442727379969328, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.83451536643026e-05, |
|
"loss": 4.1652, |
|
"num_input_tokens_seen": 114425856, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.011560693641618497, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.8534278959810878e-05, |
|
"loss": 4.1923, |
|
"num_input_tokens_seen": 115605504, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.011678659903267665, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.872340425531915e-05, |
|
"loss": 4.0728, |
|
"num_input_tokens_seen": 116785152, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.011796626164916833, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.8912529550827425e-05, |
|
"loss": 4.2189, |
|
"num_input_tokens_seen": 117964800, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011914592426566003, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.91016548463357e-05, |
|
"loss": 4.0632, |
|
"num_input_tokens_seen": 119144448, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.012032558688215171, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.929078014184397e-05, |
|
"loss": 3.8775, |
|
"num_input_tokens_seen": 120324096, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.01215052494986434, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.9479905437352245e-05, |
|
"loss": 3.8658, |
|
"num_input_tokens_seen": 121503744, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.012268491211513508, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9669030732860522e-05, |
|
"loss": 3.9888, |
|
"num_input_tokens_seen": 122683392, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.012386457473162676, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.9858156028368796e-05, |
|
"loss": 3.9439, |
|
"num_input_tokens_seen": 123863040, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.012504423734811844, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 2.004728132387707e-05, |
|
"loss": 4.0098, |
|
"num_input_tokens_seen": 125042688, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.012622389996461012, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 2.0236406619385343e-05, |
|
"loss": 3.9454, |
|
"num_input_tokens_seen": 126222336, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.01274035625811018, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.0425531914893616e-05, |
|
"loss": 3.8491, |
|
"num_input_tokens_seen": 127401984, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.012858322519759349, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 2.0614657210401893e-05, |
|
"loss": 3.9602, |
|
"num_input_tokens_seen": 128581632, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.012976288781408517, |
|
"grad_norm": 5.25, |
|
"learning_rate": 2.0803782505910167e-05, |
|
"loss": 3.9542, |
|
"num_input_tokens_seen": 129761280, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.013094255043057685, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.099290780141844e-05, |
|
"loss": 3.7846, |
|
"num_input_tokens_seen": 130940928, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.013212221304706853, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 2.1182033096926714e-05, |
|
"loss": 3.9128, |
|
"num_input_tokens_seen": 132120576, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.013330187566356021, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 2.137115839243499e-05, |
|
"loss": 3.7365, |
|
"num_input_tokens_seen": 133300224, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.013448153828005191, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 2.1560283687943264e-05, |
|
"loss": 3.8298, |
|
"num_input_tokens_seen": 134479872, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.01356612008965436, |
|
"grad_norm": 11.25, |
|
"learning_rate": 2.1749408983451538e-05, |
|
"loss": 3.9192, |
|
"num_input_tokens_seen": 135659520, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.013684086351303528, |
|
"grad_norm": 9.875, |
|
"learning_rate": 2.1938534278959815e-05, |
|
"loss": 3.7628, |
|
"num_input_tokens_seen": 136839168, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.013802052612952696, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 2.2127659574468088e-05, |
|
"loss": 3.7212, |
|
"num_input_tokens_seen": 138018816, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.013920018874601864, |
|
"grad_norm": 6.875, |
|
"learning_rate": 2.2316784869976362e-05, |
|
"loss": 3.8291, |
|
"num_input_tokens_seen": 139198464, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.014037985136251032, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 2.2505910165484635e-05, |
|
"loss": 3.7104, |
|
"num_input_tokens_seen": 140378112, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0141559513979002, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 2.2695035460992912e-05, |
|
"loss": 3.6898, |
|
"num_input_tokens_seen": 141557760, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.014273917659549369, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 2.2884160756501186e-05, |
|
"loss": 3.61, |
|
"num_input_tokens_seen": 142737408, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.014391883921198537, |
|
"grad_norm": 4.875, |
|
"learning_rate": 2.307328605200946e-05, |
|
"loss": 3.6886, |
|
"num_input_tokens_seen": 143917056, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.014509850182847705, |
|
"grad_norm": 4.75, |
|
"learning_rate": 2.326241134751773e-05, |
|
"loss": 3.6435, |
|
"num_input_tokens_seen": 145096704, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.014627816444496873, |
|
"grad_norm": 3.75, |
|
"learning_rate": 2.3451536643026006e-05, |
|
"loss": 3.6619, |
|
"num_input_tokens_seen": 146276352, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.014745782706146042, |
|
"grad_norm": 5.375, |
|
"learning_rate": 2.364066193853428e-05, |
|
"loss": 3.631, |
|
"num_input_tokens_seen": 147456000, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.01486374896779521, |
|
"grad_norm": 4.125, |
|
"learning_rate": 2.3829787234042553e-05, |
|
"loss": 3.6482, |
|
"num_input_tokens_seen": 148635648, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.014981715229444378, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 2.4018912529550827e-05, |
|
"loss": 3.6785, |
|
"num_input_tokens_seen": 149815296, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.015099681491093548, |
|
"grad_norm": 7.0, |
|
"learning_rate": 2.4208037825059104e-05, |
|
"loss": 3.6113, |
|
"num_input_tokens_seen": 150994944, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.015217647752742716, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 2.4397163120567377e-05, |
|
"loss": 3.6015, |
|
"num_input_tokens_seen": 152174592, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.015335614014391884, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 2.458628841607565e-05, |
|
"loss": 3.5241, |
|
"num_input_tokens_seen": 153354240, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.015453580276041053, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 2.4775413711583928e-05, |
|
"loss": 3.6007, |
|
"num_input_tokens_seen": 154533888, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.01557154653769022, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 2.49645390070922e-05, |
|
"loss": 3.5949, |
|
"num_input_tokens_seen": 155713536, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.01568951279933939, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.5153664302600475e-05, |
|
"loss": 3.5403, |
|
"num_input_tokens_seen": 156893184, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.015807479060988557, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.5342789598108752e-05, |
|
"loss": 3.5399, |
|
"num_input_tokens_seen": 158072832, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.015925445322637725, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 2.5531914893617025e-05, |
|
"loss": 3.5408, |
|
"num_input_tokens_seen": 159252480, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.016043411584286894, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.57210401891253e-05, |
|
"loss": 3.4678, |
|
"num_input_tokens_seen": 160432128, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.016161377845936062, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 2.5910165484633572e-05, |
|
"loss": 3.6019, |
|
"num_input_tokens_seen": 161611776, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.01627934410758523, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 2.609929078014185e-05, |
|
"loss": 3.4525, |
|
"num_input_tokens_seen": 162791424, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0163973103692344, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.628841607565012e-05, |
|
"loss": 3.557, |
|
"num_input_tokens_seen": 163971072, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.016515276630883566, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 2.6477541371158393e-05, |
|
"loss": 3.5324, |
|
"num_input_tokens_seen": 165150720, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.016633242892532735, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 3.4707, |
|
"num_input_tokens_seen": 166330368, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.016751209154181903, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 2.6855791962174944e-05, |
|
"loss": 3.3545, |
|
"num_input_tokens_seen": 167510016, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.01686917541583107, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.7044917257683217e-05, |
|
"loss": 3.4051, |
|
"num_input_tokens_seen": 168689664, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.01698714167748024, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 2.723404255319149e-05, |
|
"loss": 3.4119, |
|
"num_input_tokens_seen": 169869312, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.017105107939129408, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 2.7423167848699764e-05, |
|
"loss": 3.5149, |
|
"num_input_tokens_seen": 171048960, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.017223074200778576, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 2.761229314420804e-05, |
|
"loss": 3.3442, |
|
"num_input_tokens_seen": 172228608, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.017341040462427744, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 2.7801418439716315e-05, |
|
"loss": 3.3277, |
|
"num_input_tokens_seen": 173408256, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.017459006724076916, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 2.7990543735224588e-05, |
|
"loss": 3.3905, |
|
"num_input_tokens_seen": 174587904, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.017576972985726084, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.8179669030732865e-05, |
|
"loss": 3.276, |
|
"num_input_tokens_seen": 175767552, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.017694939247375252, |
|
"grad_norm": 4.0, |
|
"learning_rate": 2.836879432624114e-05, |
|
"loss": 3.3211, |
|
"num_input_tokens_seen": 176947200, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01781290550902442, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 2.8557919621749412e-05, |
|
"loss": 3.3483, |
|
"num_input_tokens_seen": 178126848, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.01793087177067359, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.8747044917257686e-05, |
|
"loss": 3.3391, |
|
"num_input_tokens_seen": 179306496, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.018048838032322757, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 2.8936170212765963e-05, |
|
"loss": 3.4719, |
|
"num_input_tokens_seen": 180486144, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.018166804293971925, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 2.9125295508274236e-05, |
|
"loss": 3.2727, |
|
"num_input_tokens_seen": 181665792, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.018284770555621093, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 2.9314420803782506e-05, |
|
"loss": 3.284, |
|
"num_input_tokens_seen": 182845440, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.01840273681727026, |
|
"grad_norm": 3.25, |
|
"learning_rate": 2.950354609929078e-05, |
|
"loss": 3.2279, |
|
"num_input_tokens_seen": 184025088, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.01852070307891943, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 2.9692671394799057e-05, |
|
"loss": 3.2438, |
|
"num_input_tokens_seen": 185204736, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.018638669340568598, |
|
"grad_norm": 3.875, |
|
"learning_rate": 2.988179669030733e-05, |
|
"loss": 3.3257, |
|
"num_input_tokens_seen": 186384384, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.018756635602217766, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.0070921985815604e-05, |
|
"loss": 3.2727, |
|
"num_input_tokens_seen": 187564032, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.018874601863866934, |
|
"grad_norm": 4.125, |
|
"learning_rate": 3.0260047281323877e-05, |
|
"loss": 3.245, |
|
"num_input_tokens_seen": 188743680, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.018992568125516102, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 3.0449172576832154e-05, |
|
"loss": 3.1904, |
|
"num_input_tokens_seen": 189923328, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.01911053438716527, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.063829787234043e-05, |
|
"loss": 3.2754, |
|
"num_input_tokens_seen": 191102976, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.01922850064881444, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.0827423167848705e-05, |
|
"loss": 3.1889, |
|
"num_input_tokens_seen": 192282624, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.019346466910463607, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 3.1016548463356975e-05, |
|
"loss": 3.2809, |
|
"num_input_tokens_seen": 193462272, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.019464433172112775, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.120567375886525e-05, |
|
"loss": 3.1929, |
|
"num_input_tokens_seen": 194641920, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.019582399433761943, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.139479905437353e-05, |
|
"loss": 3.1474, |
|
"num_input_tokens_seen": 195821568, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.01970036569541111, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.15839243498818e-05, |
|
"loss": 3.0433, |
|
"num_input_tokens_seen": 197001216, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.01981833195706028, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.1773049645390076e-05, |
|
"loss": 3.1527, |
|
"num_input_tokens_seen": 198180864, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.019936298218709448, |
|
"grad_norm": 3.5, |
|
"learning_rate": 3.196217494089835e-05, |
|
"loss": 3.2145, |
|
"num_input_tokens_seen": 199360512, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.020054264480358616, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.215130023640662e-05, |
|
"loss": 3.2642, |
|
"num_input_tokens_seen": 200540160, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.020172230742007784, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.234042553191489e-05, |
|
"loss": 3.1286, |
|
"num_input_tokens_seen": 201719808, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.020290197003656953, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.252955082742317e-05, |
|
"loss": 3.1274, |
|
"num_input_tokens_seen": 202899456, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.02040816326530612, |
|
"grad_norm": 6.375, |
|
"learning_rate": 3.271867612293144e-05, |
|
"loss": 3.096, |
|
"num_input_tokens_seen": 204079104, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.020526129526955292, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 3.290780141843972e-05, |
|
"loss": 3.1647, |
|
"num_input_tokens_seen": 205258752, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.02064409578860446, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 3.3096926713947994e-05, |
|
"loss": 3.269, |
|
"num_input_tokens_seen": 206438400, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02076206205025363, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 3.3286052009456264e-05, |
|
"loss": 3.2106, |
|
"num_input_tokens_seen": 207618048, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.020880028311902797, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 3.347517730496454e-05, |
|
"loss": 3.1545, |
|
"num_input_tokens_seen": 208797696, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.020997994573551965, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 3.366430260047282e-05, |
|
"loss": 3.1045, |
|
"num_input_tokens_seen": 209977344, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.021115960835201134, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 3.385342789598109e-05, |
|
"loss": 3.0496, |
|
"num_input_tokens_seen": 211156992, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.021233927096850302, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 3.081, |
|
"num_input_tokens_seen": 212336640, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02135189335849947, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 3.423167848699764e-05, |
|
"loss": 3.0552, |
|
"num_input_tokens_seen": 213516288, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.021469859620148638, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.442080378250591e-05, |
|
"loss": 3.0172, |
|
"num_input_tokens_seen": 214695936, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.021587825881797806, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 3.460992907801419e-05, |
|
"loss": 3.1379, |
|
"num_input_tokens_seen": 215875584, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.021705792143446975, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.4799054373522466e-05, |
|
"loss": 3.1235, |
|
"num_input_tokens_seen": 217055232, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.021823758405096143, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 3.4988179669030736e-05, |
|
"loss": 3.1189, |
|
"num_input_tokens_seen": 218234880, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.02194172466674531, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.517730496453901e-05, |
|
"loss": 3.0035, |
|
"num_input_tokens_seen": 219414528, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.02205969092839448, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.536643026004728e-05, |
|
"loss": 3.0478, |
|
"num_input_tokens_seen": 220594176, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.022177657190043647, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 3.555555555555555e-05, |
|
"loss": 3.0777, |
|
"num_input_tokens_seen": 221773824, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.022295623451692816, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.574468085106383e-05, |
|
"loss": 3.0665, |
|
"num_input_tokens_seen": 222953472, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.022413589713341984, |
|
"grad_norm": 4.25, |
|
"learning_rate": 3.593380614657211e-05, |
|
"loss": 3.0271, |
|
"num_input_tokens_seen": 224133120, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.022531555974991152, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 3.612293144208038e-05, |
|
"loss": 3.033, |
|
"num_input_tokens_seen": 225312768, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.02264952223664032, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.6312056737588654e-05, |
|
"loss": 3.143, |
|
"num_input_tokens_seen": 226492416, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.02276748849828949, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.650118203309693e-05, |
|
"loss": 3.0347, |
|
"num_input_tokens_seen": 227672064, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.022885454759938657, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 3.66903073286052e-05, |
|
"loss": 3.07, |
|
"num_input_tokens_seen": 228851712, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.023003421021587825, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 3.687943262411348e-05, |
|
"loss": 3.0225, |
|
"num_input_tokens_seen": 230031360, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.023121387283236993, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.7068557919621755e-05, |
|
"loss": 3.0222, |
|
"num_input_tokens_seen": 231211008, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.02323935354488616, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 3.7257683215130025e-05, |
|
"loss": 2.9755, |
|
"num_input_tokens_seen": 232390656, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.02335731980653533, |
|
"grad_norm": 5.0, |
|
"learning_rate": 3.74468085106383e-05, |
|
"loss": 3.0163, |
|
"num_input_tokens_seen": 233570304, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.023475286068184498, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.763593380614658e-05, |
|
"loss": 2.9553, |
|
"num_input_tokens_seen": 234749952, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.782505910165485e-05, |
|
"loss": 3.015, |
|
"num_input_tokens_seen": 235929600, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"eval_wikipedia_loss": 2.9615590572357178, |
|
"eval_wikipedia_runtime": 172.3085, |
|
"eval_wikipedia_samples_per_second": 4.074, |
|
"eval_wikipedia_steps_per_second": 0.174, |
|
"num_input_tokens_seen": 235929600, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"eval_toxicity_loss": 4.73836088180542, |
|
"eval_toxicity_runtime": 0.999, |
|
"eval_toxicity_samples_per_second": 2.002, |
|
"eval_toxicity_steps_per_second": 1.001, |
|
"num_input_tokens_seen": 235929600, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.023711218591482838, |
|
"grad_norm": 4.0, |
|
"learning_rate": 3.8014184397163126e-05, |
|
"loss": 2.9448, |
|
"num_input_tokens_seen": 237109248, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.023829184853132006, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 3.82033096926714e-05, |
|
"loss": 3.1131, |
|
"num_input_tokens_seen": 238288896, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.023947151114781174, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 3.839243498817967e-05, |
|
"loss": 2.9217, |
|
"num_input_tokens_seen": 239468544, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.024065117376430342, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.858156028368794e-05, |
|
"loss": 2.9658, |
|
"num_input_tokens_seen": 240648192, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.02418308363807951, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 3.877068557919622e-05, |
|
"loss": 2.9857, |
|
"num_input_tokens_seen": 241827840, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.02430104989972868, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.895981087470449e-05, |
|
"loss": 2.9761, |
|
"num_input_tokens_seen": 243007488, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.024419016161377847, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.914893617021277e-05, |
|
"loss": 2.981, |
|
"num_input_tokens_seen": 244187136, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.024536982423027015, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.9338061465721044e-05, |
|
"loss": 3.0308, |
|
"num_input_tokens_seen": 245366784, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.024654948684676183, |
|
"grad_norm": 4.0, |
|
"learning_rate": 3.9527186761229314e-05, |
|
"loss": 2.9529, |
|
"num_input_tokens_seen": 246546432, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.02477291494632535, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.971631205673759e-05, |
|
"loss": 2.9297, |
|
"num_input_tokens_seen": 247726080, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.02489088120797452, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 3.990543735224587e-05, |
|
"loss": 2.8328, |
|
"num_input_tokens_seen": 248905728, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.025008847469623688, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.009456264775414e-05, |
|
"loss": 2.9565, |
|
"num_input_tokens_seen": 250085376, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.025126813731272856, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 4.028368794326241e-05, |
|
"loss": 2.9276, |
|
"num_input_tokens_seen": 251265024, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.025244779992922024, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 4.0472813238770685e-05, |
|
"loss": 2.8872, |
|
"num_input_tokens_seen": 252444672, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.025362746254571193, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 4.066193853427896e-05, |
|
"loss": 2.9423, |
|
"num_input_tokens_seen": 253624320, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.02548071251622036, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 4.085106382978723e-05, |
|
"loss": 2.9417, |
|
"num_input_tokens_seen": 254803968, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.02559867877786953, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 4.104018912529551e-05, |
|
"loss": 2.878, |
|
"num_input_tokens_seen": 255983616, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.025716645039518697, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 4.1229314420803786e-05, |
|
"loss": 2.8925, |
|
"num_input_tokens_seen": 257163264, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.025834611301167865, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 4.1418439716312056e-05, |
|
"loss": 2.9432, |
|
"num_input_tokens_seen": 258342912, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.025952577562817034, |
|
"grad_norm": 4.875, |
|
"learning_rate": 4.1607565011820333e-05, |
|
"loss": 2.8552, |
|
"num_input_tokens_seen": 259522560, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.026070543824466202, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 4.1796690307328604e-05, |
|
"loss": 2.8685, |
|
"num_input_tokens_seen": 260702208, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.02618851008611537, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.198581560283688e-05, |
|
"loss": 2.9284, |
|
"num_input_tokens_seen": 261881856, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.026306476347764538, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 4.217494089834516e-05, |
|
"loss": 2.8862, |
|
"num_input_tokens_seen": 263061504, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.026424442609413706, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.236406619385343e-05, |
|
"loss": 2.8105, |
|
"num_input_tokens_seen": 264241152, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.026542408871062875, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 2.9406, |
|
"num_input_tokens_seen": 265420800, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.026660375132712043, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 4.274231678486998e-05, |
|
"loss": 2.907, |
|
"num_input_tokens_seen": 266600448, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.02677834139436121, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.293144208037825e-05, |
|
"loss": 3.0143, |
|
"num_input_tokens_seen": 267780096, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.026896307656010383, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.312056737588653e-05, |
|
"loss": 2.9126, |
|
"num_input_tokens_seen": 268959744, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.02701427391765955, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.3309692671394805e-05, |
|
"loss": 2.8879, |
|
"num_input_tokens_seen": 270139392, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.02713224017930872, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 4.3498817966903076e-05, |
|
"loss": 2.8174, |
|
"num_input_tokens_seen": 271319040, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.027250206440957887, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 4.368794326241135e-05, |
|
"loss": 2.895, |
|
"num_input_tokens_seen": 272498688, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.027368172702607056, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.387706855791963e-05, |
|
"loss": 2.8573, |
|
"num_input_tokens_seen": 273678336, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.027486138964256224, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 4.40661938534279e-05, |
|
"loss": 2.7662, |
|
"num_input_tokens_seen": 274857984, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.027604105225905392, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 4.4255319148936176e-05, |
|
"loss": 2.8103, |
|
"num_input_tokens_seen": 276037632, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.02772207148755456, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.444444444444445e-05, |
|
"loss": 2.8271, |
|
"num_input_tokens_seen": 277217280, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.02784003774920373, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.4633569739952723e-05, |
|
"loss": 2.8881, |
|
"num_input_tokens_seen": 278396928, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.027958004010852897, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 4.4822695035461e-05, |
|
"loss": 2.9018, |
|
"num_input_tokens_seen": 279576576, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.028075970272502065, |
|
"grad_norm": 5.25, |
|
"learning_rate": 4.501182033096927e-05, |
|
"loss": 2.8584, |
|
"num_input_tokens_seen": 280756224, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.028193936534151233, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.520094562647755e-05, |
|
"loss": 2.8435, |
|
"num_input_tokens_seen": 281935872, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.0283119027958004, |
|
"grad_norm": 3.375, |
|
"learning_rate": 4.5390070921985824e-05, |
|
"loss": 2.8827, |
|
"num_input_tokens_seen": 283115520, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02842986905744957, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.5579196217494095e-05, |
|
"loss": 2.8415, |
|
"num_input_tokens_seen": 284295168, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.028547835319098738, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 4.576832151300237e-05, |
|
"loss": 2.8305, |
|
"num_input_tokens_seen": 285474816, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.028665801580747906, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.595744680851065e-05, |
|
"loss": 2.7947, |
|
"num_input_tokens_seen": 286654464, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.028783767842397074, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.614657210401892e-05, |
|
"loss": 2.812, |
|
"num_input_tokens_seen": 287834112, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.028901734104046242, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.633569739952719e-05, |
|
"loss": 2.8157, |
|
"num_input_tokens_seen": 289013760, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.02901970036569541, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.652482269503546e-05, |
|
"loss": 2.8768, |
|
"num_input_tokens_seen": 290193408, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.02913766662734458, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.6713947990543736e-05, |
|
"loss": 2.834, |
|
"num_input_tokens_seen": 291373056, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.029255632888993747, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.690307328605201e-05, |
|
"loss": 2.8125, |
|
"num_input_tokens_seen": 292552704, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.029373599150642915, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 4.709219858156028e-05, |
|
"loss": 2.7906, |
|
"num_input_tokens_seen": 293732352, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.029491565412292083, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.728132387706856e-05, |
|
"loss": 2.8163, |
|
"num_input_tokens_seen": 294912000, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02960953167394125, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.747044917257684e-05, |
|
"loss": 2.7826, |
|
"num_input_tokens_seen": 296091648, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.02972749793559042, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 4.765957446808511e-05, |
|
"loss": 2.8368, |
|
"num_input_tokens_seen": 297271296, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.029845464197239588, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.7848699763593384e-05, |
|
"loss": 2.7188, |
|
"num_input_tokens_seen": 298450944, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.029963430458888756, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.8037825059101654e-05, |
|
"loss": 2.7609, |
|
"num_input_tokens_seen": 299630592, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.030081396720537928, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.822695035460993e-05, |
|
"loss": 2.742, |
|
"num_input_tokens_seen": 300810240, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.030199362982187096, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.841607565011821e-05, |
|
"loss": 2.7751, |
|
"num_input_tokens_seen": 301989888, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.030317329243836264, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 4.860520094562648e-05, |
|
"loss": 2.6949, |
|
"num_input_tokens_seen": 303169536, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.030435295505485432, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.8794326241134755e-05, |
|
"loss": 2.8174, |
|
"num_input_tokens_seen": 304349184, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.0305532617671346, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.898345153664303e-05, |
|
"loss": 2.6962, |
|
"num_input_tokens_seen": 305528832, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.03067122802878377, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 4.91725768321513e-05, |
|
"loss": 2.8484, |
|
"num_input_tokens_seen": 306708480, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.030789194290432937, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.936170212765958e-05, |
|
"loss": 2.632, |
|
"num_input_tokens_seen": 307888128, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.030907160552082105, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.9550827423167856e-05, |
|
"loss": 2.8744, |
|
"num_input_tokens_seen": 309067776, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.031025126813731273, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.9739952718676126e-05, |
|
"loss": 2.7714, |
|
"num_input_tokens_seen": 310247424, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.03114309307538044, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.99290780141844e-05, |
|
"loss": 2.8042, |
|
"num_input_tokens_seen": 311427072, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.031261059337029606, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.011820330969268e-05, |
|
"loss": 2.7448, |
|
"num_input_tokens_seen": 312606720, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03137902559867878, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.030732860520095e-05, |
|
"loss": 2.678, |
|
"num_input_tokens_seen": 313786368, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.03149699186032794, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.049645390070923e-05, |
|
"loss": 2.676, |
|
"num_input_tokens_seen": 314966016, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.031614958121977114, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.0685579196217504e-05, |
|
"loss": 2.6778, |
|
"num_input_tokens_seen": 316145664, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.031732924383626286, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.0874704491725774e-05, |
|
"loss": 2.7624, |
|
"num_input_tokens_seen": 317325312, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.03185089064527545, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 5.106382978723405e-05, |
|
"loss": 2.6462, |
|
"num_input_tokens_seen": 318504960, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03196885690692462, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.125295508274232e-05, |
|
"loss": 2.6475, |
|
"num_input_tokens_seen": 319684608, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.03208682316857379, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.14420803782506e-05, |
|
"loss": 2.7214, |
|
"num_input_tokens_seen": 320864256, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.03220478943022296, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.1631205673758875e-05, |
|
"loss": 2.6597, |
|
"num_input_tokens_seen": 322043904, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.032322755691872124, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.1820330969267145e-05, |
|
"loss": 2.6412, |
|
"num_input_tokens_seen": 323223552, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.032440721953521295, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.200945626477542e-05, |
|
"loss": 2.7664, |
|
"num_input_tokens_seen": 324403200, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.03255868821517046, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 5.21985815602837e-05, |
|
"loss": 2.7168, |
|
"num_input_tokens_seen": 325582848, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.03267665447681963, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.238770685579196e-05, |
|
"loss": 2.698, |
|
"num_input_tokens_seen": 326762496, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.0327946207384688, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.257683215130024e-05, |
|
"loss": 2.7068, |
|
"num_input_tokens_seen": 327942144, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.03291258700011797, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 5.276595744680851e-05, |
|
"loss": 2.6487, |
|
"num_input_tokens_seen": 329121792, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.03303055326176713, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.2955082742316786e-05, |
|
"loss": 2.5925, |
|
"num_input_tokens_seen": 330301440, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.033148519523416305, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.314420803782506e-05, |
|
"loss": 2.6826, |
|
"num_input_tokens_seen": 331481088, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.03326648578506547, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 2.6997, |
|
"num_input_tokens_seen": 332660736, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.03338445204671464, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5.352245862884161e-05, |
|
"loss": 2.6378, |
|
"num_input_tokens_seen": 333840384, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.033502418308363806, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.371158392434989e-05, |
|
"loss": 2.588, |
|
"num_input_tokens_seen": 335020032, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.03362038457001298, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.390070921985816e-05, |
|
"loss": 2.6051, |
|
"num_input_tokens_seen": 336199680, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03373835083166214, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.4089834515366434e-05, |
|
"loss": 2.6874, |
|
"num_input_tokens_seen": 337379328, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.033856317093311314, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.4278959810874704e-05, |
|
"loss": 2.7186, |
|
"num_input_tokens_seen": 338558976, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.03397428335496048, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 5.446808510638298e-05, |
|
"loss": 2.6698, |
|
"num_input_tokens_seen": 339738624, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.03409224961660965, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.465721040189126e-05, |
|
"loss": 2.603, |
|
"num_input_tokens_seen": 340918272, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.034210215878258815, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.484633569739953e-05, |
|
"loss": 2.6479, |
|
"num_input_tokens_seen": 342097920, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03432818213990799, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.5035460992907805e-05, |
|
"loss": 2.7037, |
|
"num_input_tokens_seen": 343277568, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.03444614840155715, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.522458628841608e-05, |
|
"loss": 2.681, |
|
"num_input_tokens_seen": 344457216, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.03456411466320632, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.541371158392435e-05, |
|
"loss": 2.5947, |
|
"num_input_tokens_seen": 345636864, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.03468208092485549, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.560283687943263e-05, |
|
"loss": 2.6439, |
|
"num_input_tokens_seen": 346816512, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.03480004718650466, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.5791962174940906e-05, |
|
"loss": 2.6404, |
|
"num_input_tokens_seen": 347996160, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.03491801344815383, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.5981087470449176e-05, |
|
"loss": 2.6686, |
|
"num_input_tokens_seen": 349175808, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.035035979709802996, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.617021276595745e-05, |
|
"loss": 2.666, |
|
"num_input_tokens_seen": 350355456, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.03515394597145217, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.635933806146573e-05, |
|
"loss": 2.7266, |
|
"num_input_tokens_seen": 351535104, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.03527191223310133, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.6548463356974e-05, |
|
"loss": 2.6655, |
|
"num_input_tokens_seen": 352714752, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.035389878494750504, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 5.673758865248228e-05, |
|
"loss": 2.6975, |
|
"num_input_tokens_seen": 353894400, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03550784475639967, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.692671394799055e-05, |
|
"loss": 2.6232, |
|
"num_input_tokens_seen": 355074048, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.03562581101804884, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.7115839243498824e-05, |
|
"loss": 2.6344, |
|
"num_input_tokens_seen": 356253696, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.035743777279698005, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.73049645390071e-05, |
|
"loss": 2.5703, |
|
"num_input_tokens_seen": 357433344, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.03586174354134718, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.749408983451537e-05, |
|
"loss": 2.5405, |
|
"num_input_tokens_seen": 358612992, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.03597970980299634, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.768321513002365e-05, |
|
"loss": 2.6079, |
|
"num_input_tokens_seen": 359792640, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.03609767606464551, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.7872340425531925e-05, |
|
"loss": 2.6958, |
|
"num_input_tokens_seen": 360972288, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.03621564232629468, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 5.8061465721040195e-05, |
|
"loss": 2.5541, |
|
"num_input_tokens_seen": 362151936, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.03633360858794385, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 5.825059101654847e-05, |
|
"loss": 2.6427, |
|
"num_input_tokens_seen": 363331584, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.036451574849593014, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.843971631205675e-05, |
|
"loss": 2.5853, |
|
"num_input_tokens_seen": 364511232, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.036569541111242186, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 5.862884160756501e-05, |
|
"loss": 2.5426, |
|
"num_input_tokens_seen": 365690880, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03668750737289135, |
|
"grad_norm": 1.625, |
|
"learning_rate": 5.881796690307329e-05, |
|
"loss": 2.6835, |
|
"num_input_tokens_seen": 366870528, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.03680547363454052, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.900709219858156e-05, |
|
"loss": 2.626, |
|
"num_input_tokens_seen": 368050176, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.03692343989618969, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 5.9196217494089836e-05, |
|
"loss": 2.5361, |
|
"num_input_tokens_seen": 369229824, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.03704140615783886, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.938534278959811e-05, |
|
"loss": 2.5762, |
|
"num_input_tokens_seen": 370409472, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.037159372419488024, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 5.9574468085106384e-05, |
|
"loss": 2.5874, |
|
"num_input_tokens_seen": 371589120, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.037277338681137195, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.976359338061466e-05, |
|
"loss": 2.5874, |
|
"num_input_tokens_seen": 372768768, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.03739530494278636, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 5.995271867612294e-05, |
|
"loss": 2.4888, |
|
"num_input_tokens_seen": 373948416, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.03751327120443553, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.014184397163121e-05, |
|
"loss": 2.5391, |
|
"num_input_tokens_seen": 375128064, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.0376312374660847, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 6.0330969267139484e-05, |
|
"loss": 2.6462, |
|
"num_input_tokens_seen": 376307712, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.03774920372773387, |
|
"grad_norm": 1.875, |
|
"learning_rate": 6.0520094562647755e-05, |
|
"loss": 2.4726, |
|
"num_input_tokens_seen": 377487360, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03786716998938304, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.070921985815603e-05, |
|
"loss": 2.5759, |
|
"num_input_tokens_seen": 378667008, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.037985136251032205, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 6.089834515366431e-05, |
|
"loss": 2.5199, |
|
"num_input_tokens_seen": 379846656, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.038103102512681376, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 6.108747044917259e-05, |
|
"loss": 2.6243, |
|
"num_input_tokens_seen": 381026304, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.03822106877433054, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.127659574468086e-05, |
|
"loss": 2.5922, |
|
"num_input_tokens_seen": 382205952, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.03833903503597971, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 6.146572104018913e-05, |
|
"loss": 2.5376, |
|
"num_input_tokens_seen": 383385600, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.03845700129762888, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.165484633569741e-05, |
|
"loss": 2.547, |
|
"num_input_tokens_seen": 384565248, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.03857496755927805, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 6.184397163120568e-05, |
|
"loss": 2.5095, |
|
"num_input_tokens_seen": 385744896, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.038692933820927214, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 6.203309692671395e-05, |
|
"loss": 2.5431, |
|
"num_input_tokens_seen": 386924544, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.038810900082576386, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.222222222222223e-05, |
|
"loss": 2.5744, |
|
"num_input_tokens_seen": 388104192, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.03892886634422555, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.24113475177305e-05, |
|
"loss": 2.6511, |
|
"num_input_tokens_seen": 389283840, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03904683260587472, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 6.260047281323877e-05, |
|
"loss": 2.5556, |
|
"num_input_tokens_seen": 390463488, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.03916479886752389, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 6.278959810874706e-05, |
|
"loss": 2.5616, |
|
"num_input_tokens_seen": 391643136, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.03928276512917306, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 6.297872340425533e-05, |
|
"loss": 2.5747, |
|
"num_input_tokens_seen": 392822784, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.03940073139082222, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 6.31678486997636e-05, |
|
"loss": 2.4747, |
|
"num_input_tokens_seen": 394002432, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.039518697652471395, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.335697399527188e-05, |
|
"loss": 2.5124, |
|
"num_input_tokens_seen": 395182080, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.03963666391412056, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.354609929078015e-05, |
|
"loss": 2.552, |
|
"num_input_tokens_seen": 396361728, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.03975463017576973, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 6.373522458628842e-05, |
|
"loss": 2.4541, |
|
"num_input_tokens_seen": 397541376, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.039872596437418896, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 6.39243498817967e-05, |
|
"loss": 2.4496, |
|
"num_input_tokens_seen": 398721024, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.03999056269906807, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.411347517730498e-05, |
|
"loss": 2.5182, |
|
"num_input_tokens_seen": 399900672, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.04010852896071723, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 6.430260047281325e-05, |
|
"loss": 2.5118, |
|
"num_input_tokens_seen": 401080320, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.040226495222366404, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 6.449172576832153e-05, |
|
"loss": 2.5161, |
|
"num_input_tokens_seen": 402259968, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.04034446148401557, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 6.468085106382979e-05, |
|
"loss": 2.5411, |
|
"num_input_tokens_seen": 403439616, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.04046242774566474, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 6.486997635933806e-05, |
|
"loss": 2.5341, |
|
"num_input_tokens_seen": 404619264, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.040580394007313905, |
|
"grad_norm": 5.0, |
|
"learning_rate": 6.505910165484634e-05, |
|
"loss": 2.5139, |
|
"num_input_tokens_seen": 405798912, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.04069836026896308, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 6.524822695035461e-05, |
|
"loss": 2.542, |
|
"num_input_tokens_seen": 406978560, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.04081632653061224, |
|
"grad_norm": 6.625, |
|
"learning_rate": 6.543735224586288e-05, |
|
"loss": 2.6481, |
|
"num_input_tokens_seen": 408158208, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.04093429279226141, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 6.562647754137116e-05, |
|
"loss": 2.5413, |
|
"num_input_tokens_seen": 409337856, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.041052259053910585, |
|
"grad_norm": 3.25, |
|
"learning_rate": 6.581560283687943e-05, |
|
"loss": 2.5869, |
|
"num_input_tokens_seen": 410517504, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.04117022531555975, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.60047281323877e-05, |
|
"loss": 2.5717, |
|
"num_input_tokens_seen": 411697152, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.04128819157720892, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.619385342789599e-05, |
|
"loss": 2.569, |
|
"num_input_tokens_seen": 412876800, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.041406157838858086, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.638297872340426e-05, |
|
"loss": 2.5656, |
|
"num_input_tokens_seen": 414056448, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.04152412410050726, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 6.657210401891253e-05, |
|
"loss": 2.4558, |
|
"num_input_tokens_seen": 415236096, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.04164209036215642, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 6.676122931442081e-05, |
|
"loss": 2.5107, |
|
"num_input_tokens_seen": 416415744, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.041760056623805594, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.695035460992908e-05, |
|
"loss": 2.5576, |
|
"num_input_tokens_seen": 417595392, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.04187802288545476, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.713947990543735e-05, |
|
"loss": 2.5029, |
|
"num_input_tokens_seen": 418775040, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.04199598914710393, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 6.732860520094564e-05, |
|
"loss": 2.547, |
|
"num_input_tokens_seen": 419954688, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.042113955408753095, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.75177304964539e-05, |
|
"loss": 2.4942, |
|
"num_input_tokens_seen": 421134336, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.04223192167040227, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.770685579196218e-05, |
|
"loss": 2.4546, |
|
"num_input_tokens_seen": 422313984, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.04234988793205143, |
|
"grad_norm": 1.875, |
|
"learning_rate": 6.789598108747046e-05, |
|
"loss": 2.6115, |
|
"num_input_tokens_seen": 423493632, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.042467854193700603, |
|
"grad_norm": 1.5, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 2.4977, |
|
"num_input_tokens_seen": 424673280, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04258582045534977, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 6.8274231678487e-05, |
|
"loss": 2.4208, |
|
"num_input_tokens_seen": 425852928, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.04270378671699894, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 6.846335697399528e-05, |
|
"loss": 2.5416, |
|
"num_input_tokens_seen": 427032576, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.042821752978648105, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.865248226950355e-05, |
|
"loss": 2.4938, |
|
"num_input_tokens_seen": 428212224, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.042939719240297276, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 6.884160756501182e-05, |
|
"loss": 2.5203, |
|
"num_input_tokens_seen": 429391872, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.04305768550194644, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 6.903073286052011e-05, |
|
"loss": 2.515, |
|
"num_input_tokens_seen": 430571520, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.04317565176359561, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.921985815602838e-05, |
|
"loss": 2.5211, |
|
"num_input_tokens_seen": 431751168, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.04329361802524478, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 6.940898345153665e-05, |
|
"loss": 2.4887, |
|
"num_input_tokens_seen": 432930816, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.04341158428689395, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 6.959810874704493e-05, |
|
"loss": 2.4896, |
|
"num_input_tokens_seen": 434110464, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.043529550548543114, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 6.97872340425532e-05, |
|
"loss": 2.4172, |
|
"num_input_tokens_seen": 435290112, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.043647516810192286, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.997635933806147e-05, |
|
"loss": 2.4538, |
|
"num_input_tokens_seen": 436469760, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04376548307184145, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.016548463356976e-05, |
|
"loss": 2.4439, |
|
"num_input_tokens_seen": 437649408, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.04388344933349062, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 7.035460992907803e-05, |
|
"loss": 2.4898, |
|
"num_input_tokens_seen": 438829056, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.04400141559513979, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 7.05437352245863e-05, |
|
"loss": 2.4337, |
|
"num_input_tokens_seen": 440008704, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.04411938185678896, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 7.073286052009457e-05, |
|
"loss": 2.4729, |
|
"num_input_tokens_seen": 441188352, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.04423734811843813, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.092198581560284e-05, |
|
"loss": 2.4801, |
|
"num_input_tokens_seen": 442368000, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.044355314380087295, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.11111111111111e-05, |
|
"loss": 2.5573, |
|
"num_input_tokens_seen": 443547648, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.044473280641736467, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 7.130023640661939e-05, |
|
"loss": 2.4675, |
|
"num_input_tokens_seen": 444727296, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.04459124690338563, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.148936170212766e-05, |
|
"loss": 2.4965, |
|
"num_input_tokens_seen": 445906944, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.0447092131650348, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 7.167848699763593e-05, |
|
"loss": 2.4161, |
|
"num_input_tokens_seen": 447086592, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.04482717942668397, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 7.186761229314421e-05, |
|
"loss": 2.4437, |
|
"num_input_tokens_seen": 448266240, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04494514568833314, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.205673758865248e-05, |
|
"loss": 2.4978, |
|
"num_input_tokens_seen": 449445888, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.045063111949982304, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 7.224586288416075e-05, |
|
"loss": 2.4316, |
|
"num_input_tokens_seen": 450625536, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.045181078211631476, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.243498817966904e-05, |
|
"loss": 2.5152, |
|
"num_input_tokens_seen": 451805184, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.04529904447328064, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 7.262411347517731e-05, |
|
"loss": 2.4161, |
|
"num_input_tokens_seen": 452984832, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.04541701073492981, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 7.281323877068558e-05, |
|
"loss": 2.4274, |
|
"num_input_tokens_seen": 454164480, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.04553497699657898, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.300236406619386e-05, |
|
"loss": 2.4693, |
|
"num_input_tokens_seen": 455344128, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.04565294325822815, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.319148936170213e-05, |
|
"loss": 2.4907, |
|
"num_input_tokens_seen": 456523776, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.04577090951987731, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.33806146572104e-05, |
|
"loss": 2.4603, |
|
"num_input_tokens_seen": 457703424, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.045888875781526485, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 7.356973995271869e-05, |
|
"loss": 2.4651, |
|
"num_input_tokens_seen": 458883072, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.04600684204317565, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.375886524822696e-05, |
|
"loss": 2.4041, |
|
"num_input_tokens_seen": 460062720, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.04612480830482482, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 7.394799054373523e-05, |
|
"loss": 2.3896, |
|
"num_input_tokens_seen": 461242368, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.046242774566473986, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.413711583924351e-05, |
|
"loss": 2.3966, |
|
"num_input_tokens_seen": 462422016, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.04636074082812316, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 7.432624113475178e-05, |
|
"loss": 2.5009, |
|
"num_input_tokens_seen": 463601664, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.04647870708977232, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.451536643026005e-05, |
|
"loss": 2.4524, |
|
"num_input_tokens_seen": 464781312, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.046596673351421494, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.470449172576833e-05, |
|
"loss": 2.4169, |
|
"num_input_tokens_seen": 465960960, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.04671463961307066, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 7.48936170212766e-05, |
|
"loss": 2.5313, |
|
"num_input_tokens_seen": 467140608, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.04683260587471983, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 7.508274231678487e-05, |
|
"loss": 2.5893, |
|
"num_input_tokens_seen": 468320256, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.046950572136368995, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 7.527186761229316e-05, |
|
"loss": 2.453, |
|
"num_input_tokens_seen": 469499904, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.04706853839801817, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 7.546099290780143e-05, |
|
"loss": 2.4462, |
|
"num_input_tokens_seen": 470679552, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.04718650465966733, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.56501182033097e-05, |
|
"loss": 2.5393, |
|
"num_input_tokens_seen": 471859200, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04718650465966733, |
|
"eval_wikipedia_loss": 2.4841043949127197, |
|
"eval_wikipedia_runtime": 173.8643, |
|
"eval_wikipedia_samples_per_second": 4.038, |
|
"eval_wikipedia_steps_per_second": 0.173, |
|
"num_input_tokens_seen": 471859200, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04718650465966733, |
|
"eval_toxicity_loss": 4.2398834228515625, |
|
"eval_toxicity_runtime": 0.9943, |
|
"eval_toxicity_samples_per_second": 2.011, |
|
"eval_toxicity_steps_per_second": 1.006, |
|
"num_input_tokens_seen": 471859200, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.047304470921316503, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 7.583924349881798e-05, |
|
"loss": 2.4697, |
|
"num_input_tokens_seen": 473038848, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.047422437182965675, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.602836879432625e-05, |
|
"loss": 2.4191, |
|
"num_input_tokens_seen": 474218496, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.04754040344461484, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.621749408983452e-05, |
|
"loss": 2.3892, |
|
"num_input_tokens_seen": 475398144, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.04765836970626401, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.64066193853428e-05, |
|
"loss": 2.4324, |
|
"num_input_tokens_seen": 476577792, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.047776335967913176, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.659574468085108e-05, |
|
"loss": 2.4291, |
|
"num_input_tokens_seen": 477757440, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.04789430222956235, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 7.678486997635935e-05, |
|
"loss": 2.4577, |
|
"num_input_tokens_seen": 478937088, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.04801226849121151, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 7.697399527186762e-05, |
|
"loss": 2.4554, |
|
"num_input_tokens_seen": 480116736, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.048130234752860684, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 7.716312056737589e-05, |
|
"loss": 2.3692, |
|
"num_input_tokens_seen": 481296384, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.04824820101450985, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 7.735224586288416e-05, |
|
"loss": 2.518, |
|
"num_input_tokens_seen": 482476032, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.04836616727615902, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.754137115839244e-05, |
|
"loss": 2.495, |
|
"num_input_tokens_seen": 483655680, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.048484133537808186, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.773049645390071e-05, |
|
"loss": 2.4344, |
|
"num_input_tokens_seen": 484835328, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.04860209979945736, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 7.791962174940898e-05, |
|
"loss": 2.4095, |
|
"num_input_tokens_seen": 486014976, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.04872006606110652, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.810874704491726e-05, |
|
"loss": 2.4562, |
|
"num_input_tokens_seen": 487194624, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.048838032322755694, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 7.829787234042553e-05, |
|
"loss": 2.3863, |
|
"num_input_tokens_seen": 488374272, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.04895599858440486, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 7.84869976359338e-05, |
|
"loss": 2.4577, |
|
"num_input_tokens_seen": 489553920, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.04907396484605403, |
|
"grad_norm": 1.375, |
|
"learning_rate": 7.867612293144209e-05, |
|
"loss": 2.4166, |
|
"num_input_tokens_seen": 490733568, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.049191931107703195, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.886524822695036e-05, |
|
"loss": 2.4524, |
|
"num_input_tokens_seen": 491913216, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.049309897369352367, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 7.905437352245863e-05, |
|
"loss": 2.396, |
|
"num_input_tokens_seen": 493092864, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.04942786363100153, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 7.924349881796691e-05, |
|
"loss": 2.3969, |
|
"num_input_tokens_seen": 494272512, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.0495458298926507, |
|
"grad_norm": 1.625, |
|
"learning_rate": 7.943262411347518e-05, |
|
"loss": 2.4247, |
|
"num_input_tokens_seen": 495452160, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04966379615429987, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.962174940898345e-05, |
|
"loss": 2.5668, |
|
"num_input_tokens_seen": 496631808, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.04978176241594904, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.981087470449174e-05, |
|
"loss": 2.3802, |
|
"num_input_tokens_seen": 497811456, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.049899728677598204, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 8e-05, |
|
"loss": 2.4226, |
|
"num_input_tokens_seen": 498991104, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.050017694939247376, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.999999695696824e-05, |
|
"loss": 2.4269, |
|
"num_input_tokens_seen": 500170752, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.05013566120089654, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 7.999998782787343e-05, |
|
"loss": 2.4702, |
|
"num_input_tokens_seen": 501350400, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05025362746254571, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 7.999997261271695e-05, |
|
"loss": 2.3265, |
|
"num_input_tokens_seen": 502530048, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.05037159372419488, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.999995131150112e-05, |
|
"loss": 2.4, |
|
"num_input_tokens_seen": 503709696, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.05048955998584405, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.999992392422917e-05, |
|
"loss": 2.4076, |
|
"num_input_tokens_seen": 504889344, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.05060752624749322, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.999989045090529e-05, |
|
"loss": 2.3864, |
|
"num_input_tokens_seen": 506068992, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.050725492509142385, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.999985089153455e-05, |
|
"loss": 2.4194, |
|
"num_input_tokens_seen": 507248640, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05084345877079156, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.999980524612299e-05, |
|
"loss": 2.4248, |
|
"num_input_tokens_seen": 508428288, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.05096142503244072, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 7.999975351467752e-05, |
|
"loss": 2.3898, |
|
"num_input_tokens_seen": 509607936, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.05107939129408989, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 7.999969569720605e-05, |
|
"loss": 2.3805, |
|
"num_input_tokens_seen": 510787584, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.05119735755573906, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 7.999963179371735e-05, |
|
"loss": 2.4378, |
|
"num_input_tokens_seen": 511967232, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.05131532381738823, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 7.999956180422118e-05, |
|
"loss": 2.4174, |
|
"num_input_tokens_seen": 513146880, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.051433290079037394, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 7.999948572872813e-05, |
|
"loss": 2.3132, |
|
"num_input_tokens_seen": 514326528, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.051551256340686566, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.999940356724983e-05, |
|
"loss": 2.3784, |
|
"num_input_tokens_seen": 515506176, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.05166922260233573, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.999931531979876e-05, |
|
"loss": 2.3361, |
|
"num_input_tokens_seen": 516685824, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.0517871888639849, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.999922098638835e-05, |
|
"loss": 2.4646, |
|
"num_input_tokens_seen": 517865472, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.05190515512563407, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.999912056703294e-05, |
|
"loss": 2.3374, |
|
"num_input_tokens_seen": 519045120, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05202312138728324, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 7.999901406174781e-05, |
|
"loss": 2.3778, |
|
"num_input_tokens_seen": 520224768, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.052141087648932403, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 7.999890147054918e-05, |
|
"loss": 2.328, |
|
"num_input_tokens_seen": 521404416, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.052259053910581575, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.999878279345418e-05, |
|
"loss": 2.3145, |
|
"num_input_tokens_seen": 522584064, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.05237702017223074, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.999865803048087e-05, |
|
"loss": 2.3785, |
|
"num_input_tokens_seen": 523763712, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.05249498643387991, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.99985271816482e-05, |
|
"loss": 2.3462, |
|
"num_input_tokens_seen": 524943360, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.052612952695529076, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.999839024697612e-05, |
|
"loss": 2.5261, |
|
"num_input_tokens_seen": 526123008, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.05273091895717825, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 7.999824722648545e-05, |
|
"loss": 2.3022, |
|
"num_input_tokens_seen": 527302656, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.05284888521882741, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.999809812019794e-05, |
|
"loss": 2.4073, |
|
"num_input_tokens_seen": 528482304, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.052966851480476584, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.999794292813627e-05, |
|
"loss": 2.3876, |
|
"num_input_tokens_seen": 529661952, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.05308481774212575, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 7.99977816503241e-05, |
|
"loss": 2.3443, |
|
"num_input_tokens_seen": 530841600, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05320278400377492, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.999761428678591e-05, |
|
"loss": 2.4333, |
|
"num_input_tokens_seen": 532021248, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.053320750265424086, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 7.999744083754721e-05, |
|
"loss": 2.3346, |
|
"num_input_tokens_seen": 533200896, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.05343871652707326, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.999726130263437e-05, |
|
"loss": 2.356, |
|
"num_input_tokens_seen": 534380544, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.05355668278872242, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.99970756820747e-05, |
|
"loss": 2.2924, |
|
"num_input_tokens_seen": 535560192, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.053674649050371594, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 7.999688397589647e-05, |
|
"loss": 2.3714, |
|
"num_input_tokens_seen": 536739840, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.053792615312020765, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 7.99966861841288e-05, |
|
"loss": 2.3435, |
|
"num_input_tokens_seen": 537919488, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.05391058157366993, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.999648230680182e-05, |
|
"loss": 2.4296, |
|
"num_input_tokens_seen": 539099136, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.0540285478353191, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.999627234394656e-05, |
|
"loss": 2.4, |
|
"num_input_tokens_seen": 540278784, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.054146514096968267, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 7.999605629559493e-05, |
|
"loss": 2.3882, |
|
"num_input_tokens_seen": 541458432, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.05426448035861744, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 7.999583416177985e-05, |
|
"loss": 2.4006, |
|
"num_input_tokens_seen": 542638080, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0543824466202666, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 7.999560594253506e-05, |
|
"loss": 2.3288, |
|
"num_input_tokens_seen": 543817728, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.054500412881915775, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.999537163789534e-05, |
|
"loss": 2.3453, |
|
"num_input_tokens_seen": 544997376, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.05461837914356494, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.999513124789631e-05, |
|
"loss": 2.3976, |
|
"num_input_tokens_seen": 546177024, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.05473634540521411, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.999488477257453e-05, |
|
"loss": 2.3466, |
|
"num_input_tokens_seen": 547356672, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.054854311666863276, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.999463221196753e-05, |
|
"loss": 2.3788, |
|
"num_input_tokens_seen": 548536320, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.05497227792851245, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.999437356611375e-05, |
|
"loss": 2.2683, |
|
"num_input_tokens_seen": 549715968, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.05509024419016161, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.99941088350525e-05, |
|
"loss": 2.2787, |
|
"num_input_tokens_seen": 550895616, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.055208210451810784, |
|
"grad_norm": 1.5, |
|
"learning_rate": 7.999383801882408e-05, |
|
"loss": 2.3686, |
|
"num_input_tokens_seen": 552075264, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.05532617671345995, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 7.99935611174697e-05, |
|
"loss": 2.283, |
|
"num_input_tokens_seen": 553254912, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.05544414297510912, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.999327813103149e-05, |
|
"loss": 2.3282, |
|
"num_input_tokens_seen": 554434560, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.055562109236758285, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 7.999298905955251e-05, |
|
"loss": 2.5573, |
|
"num_input_tokens_seen": 555614208, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.05568007549840746, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.999269390307672e-05, |
|
"loss": 2.3396, |
|
"num_input_tokens_seen": 556793856, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.05579804176005662, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 7.999239266164906e-05, |
|
"loss": 2.424, |
|
"num_input_tokens_seen": 557973504, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.05591600802170579, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 7.999208533531534e-05, |
|
"loss": 2.2834, |
|
"num_input_tokens_seen": 559153152, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.05603397428335496, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 7.999177192412233e-05, |
|
"loss": 2.3158, |
|
"num_input_tokens_seen": 560332800, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.05615194054500413, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 7.999145242811773e-05, |
|
"loss": 2.3454, |
|
"num_input_tokens_seen": 561512448, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.056269906806653294, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 7.999112684735013e-05, |
|
"loss": 2.3211, |
|
"num_input_tokens_seen": 562692096, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.056387873068302466, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.999079518186908e-05, |
|
"loss": 2.2571, |
|
"num_input_tokens_seen": 563871744, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.05650583932995163, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 7.999045743172504e-05, |
|
"loss": 2.2938, |
|
"num_input_tokens_seen": 565051392, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.0566238055916008, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 7.99901135969694e-05, |
|
"loss": 2.4742, |
|
"num_input_tokens_seen": 566231040, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.05674177185324997, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 7.998976367765446e-05, |
|
"loss": 2.3188, |
|
"num_input_tokens_seen": 567410688, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.05685973811489914, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.998940767383348e-05, |
|
"loss": 2.2454, |
|
"num_input_tokens_seen": 568590336, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.05697770437654831, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 7.998904558556062e-05, |
|
"loss": 2.3923, |
|
"num_input_tokens_seen": 569769984, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.057095670638197475, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 7.998867741289097e-05, |
|
"loss": 2.3465, |
|
"num_input_tokens_seen": 570949632, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.05721363689984665, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.998830315588056e-05, |
|
"loss": 2.3016, |
|
"num_input_tokens_seen": 572129280, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.05733160316149581, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.998792281458631e-05, |
|
"loss": 2.2886, |
|
"num_input_tokens_seen": 573308928, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.05744956942314498, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.998753638906613e-05, |
|
"loss": 2.3398, |
|
"num_input_tokens_seen": 574488576, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.05756753568479415, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.998714387937875e-05, |
|
"loss": 2.3558, |
|
"num_input_tokens_seen": 575668224, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.05768550194644332, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 7.998674528558397e-05, |
|
"loss": 2.3123, |
|
"num_input_tokens_seen": 576847872, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.057803468208092484, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.998634060774239e-05, |
|
"loss": 2.4113, |
|
"num_input_tokens_seen": 578027520, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.057921434469741656, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.998592984591557e-05, |
|
"loss": 2.3259, |
|
"num_input_tokens_seen": 579207168, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.05803940073139082, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 7.998551300016603e-05, |
|
"loss": 2.39, |
|
"num_input_tokens_seen": 580386816, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.05815736699303999, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.99850900705572e-05, |
|
"loss": 2.3108, |
|
"num_input_tokens_seen": 581566464, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.05827533325468916, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.998466105715342e-05, |
|
"loss": 2.2948, |
|
"num_input_tokens_seen": 582746112, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.05839329951633833, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.998422596001997e-05, |
|
"loss": 2.3144, |
|
"num_input_tokens_seen": 583925760, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.058511265777987494, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.998378477922303e-05, |
|
"loss": 2.3439, |
|
"num_input_tokens_seen": 585105408, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.058629232039636665, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.998333751482976e-05, |
|
"loss": 2.3061, |
|
"num_input_tokens_seen": 586285056, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.05874719830128583, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.99828841669082e-05, |
|
"loss": 2.2375, |
|
"num_input_tokens_seen": 587464704, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.058865164562935, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.998242473552732e-05, |
|
"loss": 2.2959, |
|
"num_input_tokens_seen": 588644352, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.058983130824584167, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 7.9981959220757e-05, |
|
"loss": 2.3324, |
|
"num_input_tokens_seen": 589824000, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05910109708623334, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 7.998148762266812e-05, |
|
"loss": 2.267, |
|
"num_input_tokens_seen": 591003648, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.0592190633478825, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 7.99810099413324e-05, |
|
"loss": 2.376, |
|
"num_input_tokens_seen": 592183296, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.059337029609531675, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 7.998052617682253e-05, |
|
"loss": 2.2746, |
|
"num_input_tokens_seen": 593362944, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.05945499587118084, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.998003632921212e-05, |
|
"loss": 2.3335, |
|
"num_input_tokens_seen": 594542592, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.05957296213283001, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 7.997954039857569e-05, |
|
"loss": 2.2334, |
|
"num_input_tokens_seen": 595722240, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.059690928394479176, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.99790383849887e-05, |
|
"loss": 2.3015, |
|
"num_input_tokens_seen": 596901888, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.05980889465612835, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 7.997853028852755e-05, |
|
"loss": 2.2773, |
|
"num_input_tokens_seen": 598081536, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.05992686091777751, |
|
"grad_norm": 1.0, |
|
"learning_rate": 7.997801610926952e-05, |
|
"loss": 2.2407, |
|
"num_input_tokens_seen": 599261184, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.060044827179426684, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.997749584729286e-05, |
|
"loss": 2.4037, |
|
"num_input_tokens_seen": 600440832, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.060162793441075856, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 7.997696950267672e-05, |
|
"loss": 2.3178, |
|
"num_input_tokens_seen": 601620480, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.06028075970272502, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 7.997643707550117e-05, |
|
"loss": 2.2891, |
|
"num_input_tokens_seen": 602800128, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.06039872596437419, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.997589856584725e-05, |
|
"loss": 2.3047, |
|
"num_input_tokens_seen": 603979776, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.06051669222602336, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.99753539737969e-05, |
|
"loss": 2.2871, |
|
"num_input_tokens_seen": 605159424, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.06063465848767253, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 7.997480329943294e-05, |
|
"loss": 2.2966, |
|
"num_input_tokens_seen": 606339072, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.06075262474932169, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 7.99742465428392e-05, |
|
"loss": 2.1938, |
|
"num_input_tokens_seen": 607518720, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.060870591010970865, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.997368370410035e-05, |
|
"loss": 2.2878, |
|
"num_input_tokens_seen": 608698368, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.06098855727262003, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.997311478330206e-05, |
|
"loss": 2.2645, |
|
"num_input_tokens_seen": 609878016, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.0611065235342692, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.997253978053087e-05, |
|
"loss": 2.2939, |
|
"num_input_tokens_seen": 611057664, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.061224489795918366, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.997195869587428e-05, |
|
"loss": 2.2763, |
|
"num_input_tokens_seen": 612237312, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.06134245605756754, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 7.997137152942071e-05, |
|
"loss": 2.3172, |
|
"num_input_tokens_seen": 613416960, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.0614604223192167, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 7.997077828125948e-05, |
|
"loss": 2.2249, |
|
"num_input_tokens_seen": 614596608, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.061578388580865874, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.997017895148088e-05, |
|
"loss": 2.3344, |
|
"num_input_tokens_seen": 615776256, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.06169635484251504, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 7.996957354017605e-05, |
|
"loss": 2.2967, |
|
"num_input_tokens_seen": 616955904, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.06181432110416421, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.996896204743716e-05, |
|
"loss": 2.2287, |
|
"num_input_tokens_seen": 618135552, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.061932287365813375, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 7.996834447335722e-05, |
|
"loss": 2.2897, |
|
"num_input_tokens_seen": 619315200, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.06205025362746255, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 7.99677208180302e-05, |
|
"loss": 2.3208, |
|
"num_input_tokens_seen": 620494848, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.06216821988911171, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.996709108155098e-05, |
|
"loss": 2.2778, |
|
"num_input_tokens_seen": 621674496, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.06228618615076088, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 7.99664552640154e-05, |
|
"loss": 2.1945, |
|
"num_input_tokens_seen": 622854144, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.06240415241241005, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.996581336552018e-05, |
|
"loss": 2.3752, |
|
"num_input_tokens_seen": 624033792, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.06252211867405921, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.996516538616299e-05, |
|
"loss": 2.3217, |
|
"num_input_tokens_seen": 625213440, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.06264008493570838, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.996451132604242e-05, |
|
"loss": 2.3374, |
|
"num_input_tokens_seen": 626393088, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.06275805119735756, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.9963851185258e-05, |
|
"loss": 2.2926, |
|
"num_input_tokens_seen": 627572736, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.06287601745900673, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 7.996318496391015e-05, |
|
"loss": 2.2946, |
|
"num_input_tokens_seen": 628752384, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.06299398372065589, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.996251266210025e-05, |
|
"loss": 2.2977, |
|
"num_input_tokens_seen": 629932032, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.06311194998230506, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 7.996183427993058e-05, |
|
"loss": 2.2898, |
|
"num_input_tokens_seen": 631111680, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.06322991624395423, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.996114981750438e-05, |
|
"loss": 2.2557, |
|
"num_input_tokens_seen": 632291328, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.0633478825056034, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.996045927492578e-05, |
|
"loss": 2.3759, |
|
"num_input_tokens_seen": 633470976, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.06346584876725257, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.995976265229984e-05, |
|
"loss": 2.2283, |
|
"num_input_tokens_seen": 634650624, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.06358381502890173, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 7.995905994973257e-05, |
|
"loss": 2.4155, |
|
"num_input_tokens_seen": 635830272, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.0637017812905509, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.995835116733086e-05, |
|
"loss": 2.3187, |
|
"num_input_tokens_seen": 637009920, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06381974755220007, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 7.995763630520257e-05, |
|
"loss": 2.2263, |
|
"num_input_tokens_seen": 638189568, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.06393771381384925, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.995691536345647e-05, |
|
"loss": 2.2788, |
|
"num_input_tokens_seen": 639369216, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.0640556800754984, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.995618834220223e-05, |
|
"loss": 2.2173, |
|
"num_input_tokens_seen": 640548864, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.06417364633714757, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 7.99554552415505e-05, |
|
"loss": 2.2742, |
|
"num_input_tokens_seen": 641728512, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.06429161259879675, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.99547160616128e-05, |
|
"loss": 2.2516, |
|
"num_input_tokens_seen": 642908160, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.06440957886044592, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.995397080250162e-05, |
|
"loss": 2.33, |
|
"num_input_tokens_seen": 644087808, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.06452754512209508, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.995321946433033e-05, |
|
"loss": 2.2447, |
|
"num_input_tokens_seen": 645267456, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.06464551138374425, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 7.995246204721324e-05, |
|
"loss": 2.369, |
|
"num_input_tokens_seen": 646447104, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.06476347764539342, |
|
"grad_norm": 1.25, |
|
"learning_rate": 7.995169855126561e-05, |
|
"loss": 2.2753, |
|
"num_input_tokens_seen": 647626752, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.06488144390704259, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 7.995092897660363e-05, |
|
"loss": 2.2828, |
|
"num_input_tokens_seen": 648806400, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.06499941016869175, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.995015332334433e-05, |
|
"loss": 2.244, |
|
"num_input_tokens_seen": 649986048, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.06511737643034092, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 7.994937159160578e-05, |
|
"loss": 2.2447, |
|
"num_input_tokens_seen": 651165696, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.06523534269199009, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.99485837815069e-05, |
|
"loss": 2.2295, |
|
"num_input_tokens_seen": 652345344, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.06535330895363926, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 7.994778989316757e-05, |
|
"loss": 2.2049, |
|
"num_input_tokens_seen": 653524992, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.06547127521528842, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 7.994698992670855e-05, |
|
"loss": 2.257, |
|
"num_input_tokens_seen": 654704640, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.0655892414769376, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.99461838822516e-05, |
|
"loss": 2.1423, |
|
"num_input_tokens_seen": 655884288, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.06570720773858676, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.994537175991934e-05, |
|
"loss": 2.2261, |
|
"num_input_tokens_seen": 657063936, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.06582517400023594, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 7.994455355983532e-05, |
|
"loss": 2.2693, |
|
"num_input_tokens_seen": 658243584, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.0659431402618851, |
|
"grad_norm": 1.125, |
|
"learning_rate": 7.994372928212406e-05, |
|
"loss": 2.3283, |
|
"num_input_tokens_seen": 659423232, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.06606110652353427, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 7.994289892691094e-05, |
|
"loss": 2.2631, |
|
"num_input_tokens_seen": 660602880, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.06617907278518344, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 7.994206249432234e-05, |
|
"loss": 2.1609, |
|
"num_input_tokens_seen": 661782528, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.06629703904683261, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.994121998448549e-05, |
|
"loss": 2.1385, |
|
"num_input_tokens_seen": 662962176, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.06641500530848178, |
|
"grad_norm": 1.625, |
|
"learning_rate": 7.99403713975286e-05, |
|
"loss": 2.22, |
|
"num_input_tokens_seen": 664141824, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.06653297157013094, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 7.993951673358078e-05, |
|
"loss": 2.2546, |
|
"num_input_tokens_seen": 665321472, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.06665093783178011, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.993865599277206e-05, |
|
"loss": 2.217, |
|
"num_input_tokens_seen": 666501120, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.06676890409342928, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 7.993778917523343e-05, |
|
"loss": 2.1732, |
|
"num_input_tokens_seen": 667680768, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.06688687035507845, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.993691628109673e-05, |
|
"loss": 2.2929, |
|
"num_input_tokens_seen": 668860416, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.06700483661672761, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.993603731049481e-05, |
|
"loss": 2.3019, |
|
"num_input_tokens_seen": 670040064, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.06712280287837678, |
|
"grad_norm": 1.25, |
|
"learning_rate": 7.99351522635614e-05, |
|
"loss": 2.2712, |
|
"num_input_tokens_seen": 671219712, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.06724076914002595, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.993426114043115e-05, |
|
"loss": 2.1995, |
|
"num_input_tokens_seen": 672399360, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.06735873540167513, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.993336394123965e-05, |
|
"loss": 2.189, |
|
"num_input_tokens_seen": 673579008, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.06747670166332428, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.993246066612343e-05, |
|
"loss": 2.2997, |
|
"num_input_tokens_seen": 674758656, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.06759466792497346, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 7.993155131521991e-05, |
|
"loss": 2.2692, |
|
"num_input_tokens_seen": 675938304, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.06771263418662263, |
|
"grad_norm": 1.125, |
|
"learning_rate": 7.993063588866742e-05, |
|
"loss": 2.243, |
|
"num_input_tokens_seen": 677117952, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.0678306004482718, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.992971438660529e-05, |
|
"loss": 2.184, |
|
"num_input_tokens_seen": 678297600, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.06794856670992096, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 7.99287868091737e-05, |
|
"loss": 2.1793, |
|
"num_input_tokens_seen": 679477248, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.06806653297157013, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 7.99278531565138e-05, |
|
"loss": 2.3607, |
|
"num_input_tokens_seen": 680656896, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.0681844992332193, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.992691342876765e-05, |
|
"loss": 2.2439, |
|
"num_input_tokens_seen": 681836544, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.06830246549486847, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.99259676260782e-05, |
|
"loss": 2.2214, |
|
"num_input_tokens_seen": 683016192, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.06842043175651763, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.992501574858937e-05, |
|
"loss": 2.1635, |
|
"num_input_tokens_seen": 684195840, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0685383980181668, |
|
"grad_norm": 1.25, |
|
"learning_rate": 7.9924057796446e-05, |
|
"loss": 2.1955, |
|
"num_input_tokens_seen": 685375488, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.06865636427981597, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.992309376979385e-05, |
|
"loss": 2.1876, |
|
"num_input_tokens_seen": 686555136, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.06877433054146515, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 7.992212366877959e-05, |
|
"loss": 2.1243, |
|
"num_input_tokens_seen": 687734784, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.0688922968031143, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 7.992114749355079e-05, |
|
"loss": 2.2922, |
|
"num_input_tokens_seen": 688914432, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.06901026306476347, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 7.992016524425603e-05, |
|
"loss": 2.1665, |
|
"num_input_tokens_seen": 690094080, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.06912822932641265, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.991917692104473e-05, |
|
"loss": 2.207, |
|
"num_input_tokens_seen": 691273728, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.06924619558806182, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 7.991818252406726e-05, |
|
"loss": 2.1341, |
|
"num_input_tokens_seen": 692453376, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.06936416184971098, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.991718205347494e-05, |
|
"loss": 2.1968, |
|
"num_input_tokens_seen": 693633024, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.06948212811136015, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.991617550941998e-05, |
|
"loss": 2.1095, |
|
"num_input_tokens_seen": 694812672, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.06960009437300932, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.991516289205554e-05, |
|
"loss": 2.2366, |
|
"num_input_tokens_seen": 695992320, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.06971806063465849, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.991414420153569e-05, |
|
"loss": 2.2333, |
|
"num_input_tokens_seen": 697171968, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.06983602689630766, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 7.991311943801539e-05, |
|
"loss": 2.1065, |
|
"num_input_tokens_seen": 698351616, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.06995399315795682, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.99120886016506e-05, |
|
"loss": 2.1005, |
|
"num_input_tokens_seen": 699531264, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.07007195941960599, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.991105169259815e-05, |
|
"loss": 2.2168, |
|
"num_input_tokens_seen": 700710912, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.07018992568125516, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.991000871101581e-05, |
|
"loss": 2.3617, |
|
"num_input_tokens_seen": 701890560, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.07030789194290434, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.990895965706227e-05, |
|
"loss": 2.2199, |
|
"num_input_tokens_seen": 703070208, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.0704258582045535, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.990790453089714e-05, |
|
"loss": 2.1908, |
|
"num_input_tokens_seen": 704249856, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.07054382446620266, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.990684333268097e-05, |
|
"loss": 2.1718, |
|
"num_input_tokens_seen": 705429504, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.07066179072785184, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.99057760625752e-05, |
|
"loss": 2.1296, |
|
"num_input_tokens_seen": 706609152, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.07077975698950101, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.990470272074225e-05, |
|
"loss": 2.1698, |
|
"num_input_tokens_seen": 707788800, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07077975698950101, |
|
"eval_wikipedia_loss": 2.343088150024414, |
|
"eval_wikipedia_runtime": 167.6638, |
|
"eval_wikipedia_samples_per_second": 4.187, |
|
"eval_wikipedia_steps_per_second": 0.179, |
|
"num_input_tokens_seen": 707788800, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07077975698950101, |
|
"eval_toxicity_loss": 4.090839862823486, |
|
"eval_toxicity_runtime": 1.2034, |
|
"eval_toxicity_samples_per_second": 1.662, |
|
"eval_toxicity_steps_per_second": 0.831, |
|
"num_input_tokens_seen": 707788800, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07089772325115018, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.99036233073454e-05, |
|
"loss": 2.1019, |
|
"num_input_tokens_seen": 708968448, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.07101568951279935, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.99025378225489e-05, |
|
"loss": 2.1446, |
|
"num_input_tokens_seen": 710148096, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.07113365577444852, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.990144626651791e-05, |
|
"loss": 2.1271, |
|
"num_input_tokens_seen": 711327744, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.07125162203609768, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.990034863941851e-05, |
|
"loss": 2.1123, |
|
"num_input_tokens_seen": 712507392, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.07136958829774685, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.989924494141771e-05, |
|
"loss": 2.1567, |
|
"num_input_tokens_seen": 713687040, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.07148755455939602, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 7.989813517268343e-05, |
|
"loss": 2.1786, |
|
"num_input_tokens_seen": 714866688, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.07160552082104519, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.989701933338453e-05, |
|
"loss": 2.2267, |
|
"num_input_tokens_seen": 716046336, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.07172348708269436, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.989589742369077e-05, |
|
"loss": 2.1507, |
|
"num_input_tokens_seen": 717225984, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.07184145334434353, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.989476944377286e-05, |
|
"loss": 2.1274, |
|
"num_input_tokens_seen": 718405632, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.07195941960599269, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 7.989363539380245e-05, |
|
"loss": 2.3054, |
|
"num_input_tokens_seen": 719585280, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.07207738586764186, |
|
"grad_norm": 1.0, |
|
"learning_rate": 7.989249527395205e-05, |
|
"loss": 2.1765, |
|
"num_input_tokens_seen": 720764928, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.07219535212929103, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.989134908439515e-05, |
|
"loss": 2.1669, |
|
"num_input_tokens_seen": 721944576, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.0723133183909402, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 7.989019682530614e-05, |
|
"loss": 2.1091, |
|
"num_input_tokens_seen": 723124224, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.07243128465258937, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 7.988903849686033e-05, |
|
"loss": 2.0888, |
|
"num_input_tokens_seen": 724303872, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.07254925091423854, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 7.988787409923398e-05, |
|
"loss": 2.1604, |
|
"num_input_tokens_seen": 725483520, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.0726672171758877, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.988670363260425e-05, |
|
"loss": 2.1344, |
|
"num_input_tokens_seen": 726663168, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.07278518343753687, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 7.988552709714921e-05, |
|
"loss": 2.2063, |
|
"num_input_tokens_seen": 727842816, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.07290314969918604, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 7.98843444930479e-05, |
|
"loss": 2.2359, |
|
"num_input_tokens_seen": 729022464, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.07302111596083521, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 7.988315582048022e-05, |
|
"loss": 2.1383, |
|
"num_input_tokens_seen": 730202112, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.07313908222248438, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.988196107962707e-05, |
|
"loss": 2.2032, |
|
"num_input_tokens_seen": 731381760, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.07325704848413355, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 7.98807602706702e-05, |
|
"loss": 2.2634, |
|
"num_input_tokens_seen": 732561408, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.07337501474578271, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.987955339379234e-05, |
|
"loss": 2.177, |
|
"num_input_tokens_seen": 733741056, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.07349298100743188, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.987834044917709e-05, |
|
"loss": 2.1918, |
|
"num_input_tokens_seen": 734920704, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.07361094726908105, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 7.9877121437009e-05, |
|
"loss": 2.2417, |
|
"num_input_tokens_seen": 736100352, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.07372891353073022, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.987589635747359e-05, |
|
"loss": 2.1719, |
|
"num_input_tokens_seen": 737280000, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.07384687979237939, |
|
"grad_norm": 1.0, |
|
"learning_rate": 7.987466521075722e-05, |
|
"loss": 2.2465, |
|
"num_input_tokens_seen": 738459648, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.07396484605402856, |
|
"grad_norm": 1.0, |
|
"learning_rate": 7.987342799704721e-05, |
|
"loss": 2.1183, |
|
"num_input_tokens_seen": 739639296, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.07408281231567772, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 7.987218471653181e-05, |
|
"loss": 2.0837, |
|
"num_input_tokens_seen": 740818944, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.07420077857732689, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.987093536940019e-05, |
|
"loss": 2.1209, |
|
"num_input_tokens_seen": 741998592, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.07431874483897606, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 7.986967995584245e-05, |
|
"loss": 2.137, |
|
"num_input_tokens_seen": 743178240, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.07443671110062523, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 7.986841847604958e-05, |
|
"loss": 2.1377, |
|
"num_input_tokens_seen": 744357888, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.0745546773622744, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 7.986715093021353e-05, |
|
"loss": 2.2592, |
|
"num_input_tokens_seen": 745537536, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.07467264362392357, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.986587731852717e-05, |
|
"loss": 2.1295, |
|
"num_input_tokens_seen": 746717184, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.07479060988557273, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 7.986459764118427e-05, |
|
"loss": 2.1378, |
|
"num_input_tokens_seen": 747896832, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.0749085761472219, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 7.986331189837952e-05, |
|
"loss": 2.1956, |
|
"num_input_tokens_seen": 749076480, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.07502654240887107, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 7.986202009030858e-05, |
|
"loss": 2.2073, |
|
"num_input_tokens_seen": 750256128, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.07514450867052024, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.986072221716798e-05, |
|
"loss": 2.1139, |
|
"num_input_tokens_seen": 751435776, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.07526247493216941, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.985941827915519e-05, |
|
"loss": 2.2407, |
|
"num_input_tokens_seen": 752615424, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.07538044119381858, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 7.985810827646862e-05, |
|
"loss": 2.1899, |
|
"num_input_tokens_seen": 753795072, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.07549840745546774, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.985679220930758e-05, |
|
"loss": 2.2486, |
|
"num_input_tokens_seen": 754974720, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.07561637371711691, |
|
"grad_norm": 0.875, |
|
"learning_rate": 7.985547007787231e-05, |
|
"loss": 2.1191, |
|
"num_input_tokens_seen": 756154368, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.07573433997876608, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.985414188236398e-05, |
|
"loss": 2.0786, |
|
"num_input_tokens_seen": 757334016, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.07585230624041525, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.985280762298468e-05, |
|
"loss": 2.2075, |
|
"num_input_tokens_seen": 758513664, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.07597027250206442, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.985146729993741e-05, |
|
"loss": 2.1767, |
|
"num_input_tokens_seen": 759693312, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.07608823876371359, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 7.985012091342611e-05, |
|
"loss": 2.1301, |
|
"num_input_tokens_seen": 760872960, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.07620620502536275, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.984876846365564e-05, |
|
"loss": 2.1197, |
|
"num_input_tokens_seen": 762052608, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.07632417128701192, |
|
"grad_norm": 1.25, |
|
"learning_rate": 7.984740995083175e-05, |
|
"loss": 2.4474, |
|
"num_input_tokens_seen": 763232256, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.07644213754866109, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 7.984604537516118e-05, |
|
"loss": 2.1303, |
|
"num_input_tokens_seen": 764411904, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.07656010381031026, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.984467473685153e-05, |
|
"loss": 2.2679, |
|
"num_input_tokens_seen": 765591552, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.07667807007195943, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.984329803611133e-05, |
|
"loss": 2.1687, |
|
"num_input_tokens_seen": 766771200, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.0767960363336086, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.984191527315009e-05, |
|
"loss": 2.1592, |
|
"num_input_tokens_seen": 767950848, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.07691400259525776, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.984052644817815e-05, |
|
"loss": 2.1808, |
|
"num_input_tokens_seen": 769130496, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.006252211867405922, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.983913156140685e-05, |
|
"loss": 2.209, |
|
"num_input_tokens_seen": 770310144, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.00637017812905509, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.983773061304843e-05, |
|
"loss": 2.1247, |
|
"num_input_tokens_seen": 771489792, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.006488144390704258, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.983632360331603e-05, |
|
"loss": 2.1624, |
|
"num_input_tokens_seen": 772669440, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.006606110652353427, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.983491053242373e-05, |
|
"loss": 2.1374, |
|
"num_input_tokens_seen": 773849088, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.006724076914002596, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.983349140058654e-05, |
|
"loss": 2.1465, |
|
"num_input_tokens_seen": 775028736, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.006842043175651764, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.983206620802038e-05, |
|
"loss": 2.1039, |
|
"num_input_tokens_seen": 776208384, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.006960009437300932, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.98306349549421e-05, |
|
"loss": 2.2616, |
|
"num_input_tokens_seen": 777388032, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.0070779756989501, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 7.982919764156945e-05, |
|
"loss": 2.1215, |
|
"num_input_tokens_seen": 778567680, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.0071959419605992685, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.982775426812114e-05, |
|
"loss": 2.1266, |
|
"num_input_tokens_seen": 779747328, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.007313908222248437, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.982630483481678e-05, |
|
"loss": 2.1429, |
|
"num_input_tokens_seen": 780926976, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.007431874483897605, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.982484934187687e-05, |
|
"loss": 2.0725, |
|
"num_input_tokens_seen": 782106624, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.007549840745546774, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.982338778952292e-05, |
|
"loss": 2.2623, |
|
"num_input_tokens_seen": 783286272, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.007667807007195942, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.982192017797727e-05, |
|
"loss": 2.1474, |
|
"num_input_tokens_seen": 784465920, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.00778577326884511, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.982044650746321e-05, |
|
"loss": 2.213, |
|
"num_input_tokens_seen": 785645568, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.007903739530494279, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 7.9818966778205e-05, |
|
"loss": 2.1183, |
|
"num_input_tokens_seen": 786825216, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.008021705792143447, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 7.981748099042777e-05, |
|
"loss": 2.203, |
|
"num_input_tokens_seen": 788004864, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.008139672053792615, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.981598914435756e-05, |
|
"loss": 2.247, |
|
"num_input_tokens_seen": 789184512, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.008257638315441783, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.98144912402214e-05, |
|
"loss": 2.1766, |
|
"num_input_tokens_seen": 790364160, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.008375604577090951, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 7.981298727824715e-05, |
|
"loss": 2.1032, |
|
"num_input_tokens_seen": 791543808, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.00849357083874012, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.981147725866367e-05, |
|
"loss": 2.0328, |
|
"num_input_tokens_seen": 792723456, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.008611537100389288, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 7.980996118170071e-05, |
|
"loss": 2.156, |
|
"num_input_tokens_seen": 793903104, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.008729503362038458, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.980843904758894e-05, |
|
"loss": 2.2632, |
|
"num_input_tokens_seen": 795082752, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.008847469623687626, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 7.980691085655995e-05, |
|
"loss": 2.1247, |
|
"num_input_tokens_seen": 796262400, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.008965435885336794, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 7.980537660884625e-05, |
|
"loss": 2.1808, |
|
"num_input_tokens_seen": 797442048, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.009083402146985962, |
|
"grad_norm": 1.375, |
|
"learning_rate": 7.980383630468132e-05, |
|
"loss": 2.1755, |
|
"num_input_tokens_seen": 798621696, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.00920136840863513, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.980228994429947e-05, |
|
"loss": 2.1093, |
|
"num_input_tokens_seen": 799801344, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.009319334670284299, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.9800737527936e-05, |
|
"loss": 2.0614, |
|
"num_input_tokens_seen": 800980992, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.009437300931933467, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.979917905582712e-05, |
|
"loss": 2.0862, |
|
"num_input_tokens_seen": 802160640, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.009555267193582635, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.979761452820993e-05, |
|
"loss": 2.1008, |
|
"num_input_tokens_seen": 803340288, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.009673233455231803, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 7.979604394532251e-05, |
|
"loss": 2.1894, |
|
"num_input_tokens_seen": 804519936, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.009791199716880972, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.979446730740381e-05, |
|
"loss": 2.1344, |
|
"num_input_tokens_seen": 805699584, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.00990916597853014, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.979288461469371e-05, |
|
"loss": 2.1419, |
|
"num_input_tokens_seen": 806879232, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.010027132240179308, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.9791295867433e-05, |
|
"loss": 2.1314, |
|
"num_input_tokens_seen": 808058880, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.010145098501828476, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.978970106586347e-05, |
|
"loss": 2.1138, |
|
"num_input_tokens_seen": 809238528, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.010263064763477646, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.978810021022773e-05, |
|
"loss": 2.1573, |
|
"num_input_tokens_seen": 810418176, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.010381031025126814, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.978649330076936e-05, |
|
"loss": 2.239, |
|
"num_input_tokens_seen": 811597824, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.010498997286775983, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.978488033773285e-05, |
|
"loss": 2.0574, |
|
"num_input_tokens_seen": 812777472, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.010616963548425151, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.978326132136364e-05, |
|
"loss": 2.0513, |
|
"num_input_tokens_seen": 813957120, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.010734929810074319, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.978163625190803e-05, |
|
"loss": 2.1201, |
|
"num_input_tokens_seen": 815136768, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.010852896071723487, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 7.978000512961329e-05, |
|
"loss": 2.0681, |
|
"num_input_tokens_seen": 816316416, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.010970862333372655, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.977836795472761e-05, |
|
"loss": 2.0907, |
|
"num_input_tokens_seen": 817496064, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.011088828595021824, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 7.977672472750006e-05, |
|
"loss": 2.1739, |
|
"num_input_tokens_seen": 818675712, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.011206794856670992, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 7.977507544818069e-05, |
|
"loss": 2.1536, |
|
"num_input_tokens_seen": 819855360, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.01132476111832016, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 7.977342011702043e-05, |
|
"loss": 2.0564, |
|
"num_input_tokens_seen": 821035008, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.011442727379969328, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.977175873427114e-05, |
|
"loss": 2.0971, |
|
"num_input_tokens_seen": 822214656, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.011560693641618497, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 7.977009130018561e-05, |
|
"loss": 2.1439, |
|
"num_input_tokens_seen": 823394304, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.011678659903267665, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 7.976841781501751e-05, |
|
"loss": 2.0159, |
|
"num_input_tokens_seen": 824573952, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.011796626164916833, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.97667382790215e-05, |
|
"loss": 2.2684, |
|
"num_input_tokens_seen": 825753600, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.011914592426566003, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.976505269245314e-05, |
|
"loss": 2.0875, |
|
"num_input_tokens_seen": 826933248, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.012032558688215171, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.976336105556884e-05, |
|
"loss": 2.0368, |
|
"num_input_tokens_seen": 828112896, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.01215052494986434, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 7.976166336862602e-05, |
|
"loss": 2.0681, |
|
"num_input_tokens_seen": 829292544, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.012268491211513508, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 7.975995963188297e-05, |
|
"loss": 2.1092, |
|
"num_input_tokens_seen": 830472192, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.012386457473162676, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.975824984559893e-05, |
|
"loss": 2.1025, |
|
"num_input_tokens_seen": 831651840, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.012504423734811844, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.975653401003404e-05, |
|
"loss": 2.1325, |
|
"num_input_tokens_seen": 832831488, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.012622389996461012, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 7.975481212544938e-05, |
|
"loss": 2.1961, |
|
"num_input_tokens_seen": 834011136, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.01274035625811018, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 7.97530841921069e-05, |
|
"loss": 2.0702, |
|
"num_input_tokens_seen": 835190784, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.012858322519759349, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.975135021026956e-05, |
|
"loss": 2.165, |
|
"num_input_tokens_seen": 836370432, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.012976288781408517, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 7.974961018020115e-05, |
|
"loss": 2.1599, |
|
"num_input_tokens_seen": 837550080, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.013094255043057685, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.974786410216643e-05, |
|
"loss": 2.0392, |
|
"num_input_tokens_seen": 838729728, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.013212221304706853, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.974611197643108e-05, |
|
"loss": 2.2548, |
|
"num_input_tokens_seen": 839909376, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.013330187566356021, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.974435380326166e-05, |
|
"loss": 2.0817, |
|
"num_input_tokens_seen": 841089024, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.013448153828005191, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.97425895829257e-05, |
|
"loss": 2.1358, |
|
"num_input_tokens_seen": 842268672, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.01356612008965436, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.974081931569163e-05, |
|
"loss": 2.207, |
|
"num_input_tokens_seen": 843448320, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.013684086351303528, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.97390430018288e-05, |
|
"loss": 2.106, |
|
"num_input_tokens_seen": 844627968, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.013802052612952696, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 7.973726064160746e-05, |
|
"loss": 2.0721, |
|
"num_input_tokens_seen": 845807616, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.013920018874601864, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 7.973547223529882e-05, |
|
"loss": 2.1795, |
|
"num_input_tokens_seen": 846987264, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.014037985136251032, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.973367778317497e-05, |
|
"loss": 2.1144, |
|
"num_input_tokens_seen": 848166912, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.0141559513979002, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.973187728550897e-05, |
|
"loss": 2.0617, |
|
"num_input_tokens_seen": 849346560, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.014273917659549369, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.973007074257472e-05, |
|
"loss": 1.9967, |
|
"num_input_tokens_seen": 850526208, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.014391883921198537, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 7.972825815464713e-05, |
|
"loss": 2.0598, |
|
"num_input_tokens_seen": 851705856, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.014509850182847705, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.972643952200198e-05, |
|
"loss": 2.0309, |
|
"num_input_tokens_seen": 852885504, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.014627816444496873, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 7.972461484491597e-05, |
|
"loss": 2.0394, |
|
"num_input_tokens_seen": 854065152, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.014745782706146042, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.972278412366672e-05, |
|
"loss": 2.0426, |
|
"num_input_tokens_seen": 855244800, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.01486374896779521, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 7.97209473585328e-05, |
|
"loss": 2.0396, |
|
"num_input_tokens_seen": 856424448, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.014981715229444378, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.971910454979367e-05, |
|
"loss": 2.0892, |
|
"num_input_tokens_seen": 857604096, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.015099681491093548, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 7.971725569772968e-05, |
|
"loss": 2.1033, |
|
"num_input_tokens_seen": 858783744, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.015217647752742716, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 7.97154008026222e-05, |
|
"loss": 1.975, |
|
"num_input_tokens_seen": 859963392, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.015335614014391884, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.97135398647534e-05, |
|
"loss": 2.0152, |
|
"num_input_tokens_seen": 861143040, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.015453580276041053, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 7.971167288440646e-05, |
|
"loss": 2.0447, |
|
"num_input_tokens_seen": 862322688, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.01557154653769022, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 7.970979986186541e-05, |
|
"loss": 2.0918, |
|
"num_input_tokens_seen": 863502336, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.01568951279933939, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.970792079741527e-05, |
|
"loss": 2.0034, |
|
"num_input_tokens_seen": 864681984, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.015807479060988557, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.970603569134192e-05, |
|
"loss": 2.0668, |
|
"num_input_tokens_seen": 865861632, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.015925445322637725, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 7.970414454393218e-05, |
|
"loss": 1.9964, |
|
"num_input_tokens_seen": 867041280, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.016043411584286894, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.970224735547382e-05, |
|
"loss": 1.9982, |
|
"num_input_tokens_seen": 868220928, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.016161377845936062, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 7.970034412625547e-05, |
|
"loss": 2.1421, |
|
"num_input_tokens_seen": 869400576, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.01627934410758523, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 7.96984348565667e-05, |
|
"loss": 2.0866, |
|
"num_input_tokens_seen": 870580224, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.0163973103692344, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 7.969651954669805e-05, |
|
"loss": 2.1172, |
|
"num_input_tokens_seen": 871759872, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.016515276630883566, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.96945981969409e-05, |
|
"loss": 2.1618, |
|
"num_input_tokens_seen": 872939520, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.016633242892532735, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.96926708075876e-05, |
|
"loss": 2.0966, |
|
"num_input_tokens_seen": 874119168, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.016751209154181903, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 7.969073737893142e-05, |
|
"loss": 1.9877, |
|
"num_input_tokens_seen": 875298816, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.01686917541583107, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.968879791126652e-05, |
|
"loss": 2.09, |
|
"num_input_tokens_seen": 876478464, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.01698714167748024, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 7.968685240488798e-05, |
|
"loss": 2.0698, |
|
"num_input_tokens_seen": 877658112, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.017105107939129408, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 7.968490086009184e-05, |
|
"loss": 2.1101, |
|
"num_input_tokens_seen": 878837760, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.017223074200778576, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.9682943277175e-05, |
|
"loss": 1.96, |
|
"num_input_tokens_seen": 880017408, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.017341040462427744, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.968097965643533e-05, |
|
"loss": 1.9999, |
|
"num_input_tokens_seen": 881197056, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.017459006724076916, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.96790099981716e-05, |
|
"loss": 2.0833, |
|
"num_input_tokens_seen": 882376704, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.017576972985726084, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.967703430268349e-05, |
|
"loss": 1.9634, |
|
"num_input_tokens_seen": 883556352, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.017694939247375252, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 7.967505257027158e-05, |
|
"loss": 2.1086, |
|
"num_input_tokens_seen": 884736000, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.01781290550902442, |
|
"grad_norm": 0.875, |
|
"learning_rate": 7.967306480123745e-05, |
|
"loss": 2.0866, |
|
"num_input_tokens_seen": 885915648, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.01793087177067359, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.967107099588349e-05, |
|
"loss": 2.1024, |
|
"num_input_tokens_seen": 887095296, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.018048838032322757, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 7.966907115451311e-05, |
|
"loss": 2.1854, |
|
"num_input_tokens_seen": 888274944, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.018166804293971925, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 7.966706527743052e-05, |
|
"loss": 2.0433, |
|
"num_input_tokens_seen": 889454592, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.018284770555621093, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.966505336494098e-05, |
|
"loss": 2.0656, |
|
"num_input_tokens_seen": 890634240, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.01840273681727026, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 7.96630354173506e-05, |
|
"loss": 1.9556, |
|
"num_input_tokens_seen": 891813888, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.01852070307891943, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 7.966101143496637e-05, |
|
"loss": 2.0212, |
|
"num_input_tokens_seen": 892993536, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.018638669340568598, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 7.965898141809629e-05, |
|
"loss": 2.1419, |
|
"num_input_tokens_seen": 894173184, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.018756635602217766, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 7.96569453670492e-05, |
|
"loss": 2.0694, |
|
"num_input_tokens_seen": 895352832, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.018874601863866934, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.96549032821349e-05, |
|
"loss": 2.0899, |
|
"num_input_tokens_seen": 896532480, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.018992568125516102, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.96528551636641e-05, |
|
"loss": 2.0442, |
|
"num_input_tokens_seen": 897712128, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.01911053438716527, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 7.96508010119484e-05, |
|
"loss": 2.0397, |
|
"num_input_tokens_seen": 898891776, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.01922850064881444, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 7.964874082730039e-05, |
|
"loss": 2.0424, |
|
"num_input_tokens_seen": 900071424, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.019346466910463607, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.964667461003347e-05, |
|
"loss": 2.1146, |
|
"num_input_tokens_seen": 901251072, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.019464433172112775, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.964460236046209e-05, |
|
"loss": 2.0401, |
|
"num_input_tokens_seen": 902430720, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.019582399433761943, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.964252407890147e-05, |
|
"loss": 1.9694, |
|
"num_input_tokens_seen": 903610368, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.01970036569541111, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.964043976566787e-05, |
|
"loss": 1.9582, |
|
"num_input_tokens_seen": 904790016, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.01981833195706028, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 7.963834942107843e-05, |
|
"loss": 2.0266, |
|
"num_input_tokens_seen": 905969664, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.019936298218709448, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.963625304545115e-05, |
|
"loss": 2.1604, |
|
"num_input_tokens_seen": 907149312, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.020054264480358616, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.963415063910505e-05, |
|
"loss": 2.1845, |
|
"num_input_tokens_seen": 908328960, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.020172230742007784, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 7.963204220236e-05, |
|
"loss": 2.0049, |
|
"num_input_tokens_seen": 909508608, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.020290197003656953, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.962992773553678e-05, |
|
"loss": 2.0349, |
|
"num_input_tokens_seen": 910688256, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.02040816326530612, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.962780723895712e-05, |
|
"loss": 1.9531, |
|
"num_input_tokens_seen": 911867904, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.020526129526955292, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 7.962568071294368e-05, |
|
"loss": 2.0456, |
|
"num_input_tokens_seen": 913047552, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.02064409578860446, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 7.962354815781999e-05, |
|
"loss": 2.1398, |
|
"num_input_tokens_seen": 914227200, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.02076206205025363, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.96214095739105e-05, |
|
"loss": 2.1565, |
|
"num_input_tokens_seen": 915406848, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.020880028311902797, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 7.961926496154066e-05, |
|
"loss": 2.0268, |
|
"num_input_tokens_seen": 916586496, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.020997994573551965, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.961711432103672e-05, |
|
"loss": 2.0407, |
|
"num_input_tokens_seen": 917766144, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.021115960835201134, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 7.961495765272593e-05, |
|
"loss": 1.9779, |
|
"num_input_tokens_seen": 918945792, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.021233927096850302, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.961279495693644e-05, |
|
"loss": 2.0524, |
|
"num_input_tokens_seen": 920125440, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.02135189335849947, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.961062623399728e-05, |
|
"loss": 2.012, |
|
"num_input_tokens_seen": 921305088, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.021469859620148638, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 7.960845148423844e-05, |
|
"loss": 1.9574, |
|
"num_input_tokens_seen": 922484736, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.021587825881797806, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.960627070799081e-05, |
|
"loss": 2.0983, |
|
"num_input_tokens_seen": 923664384, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.021705792143446975, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.96040839055862e-05, |
|
"loss": 2.0531, |
|
"num_input_tokens_seen": 924844032, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.021823758405096143, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.960189107735734e-05, |
|
"loss": 2.0866, |
|
"num_input_tokens_seen": 926023680, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.02194172466674531, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 7.959969222363786e-05, |
|
"loss": 2.012, |
|
"num_input_tokens_seen": 927203328, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.02205969092839448, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.959748734476231e-05, |
|
"loss": 2.0116, |
|
"num_input_tokens_seen": 928382976, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.022177657190043647, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.95952764410662e-05, |
|
"loss": 2.0609, |
|
"num_input_tokens_seen": 929562624, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.022295623451692816, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 7.95930595128859e-05, |
|
"loss": 2.0261, |
|
"num_input_tokens_seen": 930742272, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.022413589713341984, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 7.959083656055872e-05, |
|
"loss": 1.9831, |
|
"num_input_tokens_seen": 931921920, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.022531555974991152, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 7.958860758442289e-05, |
|
"loss": 2.0684, |
|
"num_input_tokens_seen": 933101568, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.02264952223664032, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 7.958637258481755e-05, |
|
"loss": 2.114, |
|
"num_input_tokens_seen": 934281216, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.02276748849828949, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.958413156208275e-05, |
|
"loss": 2.0545, |
|
"num_input_tokens_seen": 935460864, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.022885454759938657, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.958188451655949e-05, |
|
"loss": 2.1056, |
|
"num_input_tokens_seen": 936640512, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.023003421021587825, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 7.957963144858964e-05, |
|
"loss": 1.9934, |
|
"num_input_tokens_seen": 937820160, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.023121387283236993, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.957737235851602e-05, |
|
"loss": 2.0787, |
|
"num_input_tokens_seen": 938999808, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.02323935354488616, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.957510724668234e-05, |
|
"loss": 2.0125, |
|
"num_input_tokens_seen": 940179456, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.02335731980653533, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.957283611343325e-05, |
|
"loss": 1.9983, |
|
"num_input_tokens_seen": 941359104, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.023475286068184498, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.95705589591143e-05, |
|
"loss": 2.0102, |
|
"num_input_tokens_seen": 942538752, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.956827578407198e-05, |
|
"loss": 2.0244, |
|
"num_input_tokens_seen": 943718400, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"eval_wikipedia_loss": 2.2881267070770264, |
|
"eval_wikipedia_runtime": 163.2839, |
|
"eval_wikipedia_samples_per_second": 4.299, |
|
"eval_wikipedia_steps_per_second": 0.184, |
|
"num_input_tokens_seen": 943718400, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.023593252329833666, |
|
"eval_toxicity_loss": 4.0411834716796875, |
|
"eval_toxicity_runtime": 0.9512, |
|
"eval_toxicity_samples_per_second": 2.103, |
|
"eval_toxicity_steps_per_second": 1.051, |
|
"num_input_tokens_seen": 943718400, |
|
"step": 800 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 8477, |
|
"num_input_tokens_seen": 943718400, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1463448960643891e+19, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|