{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.023593252329833666,
  "eval_steps": 200,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00011796626164916834,
      "grad_norm": 93.0,
      "learning_rate": 1.8912529550827425e-07,
      "loss": 7.9641,
      "num_input_tokens_seen": 1179648,
      "step": 1
    },
    {
      "epoch": 0.0002359325232983367,
      "grad_norm": 95.0,
      "learning_rate": 3.782505910165485e-07,
      "loss": 7.9866,
      "num_input_tokens_seen": 2359296,
      "step": 2
    },
    {
      "epoch": 0.000353898784947505,
      "grad_norm": 97.5,
      "learning_rate": 5.673758865248227e-07,
      "loss": 7.983,
      "num_input_tokens_seen": 3538944,
      "step": 3
    },
    {
      "epoch": 0.0004718650465966734,
      "grad_norm": 92.0,
      "learning_rate": 7.56501182033097e-07,
      "loss": 7.9018,
      "num_input_tokens_seen": 4718592,
      "step": 4
    },
    {
      "epoch": 0.0005898313082458417,
      "grad_norm": 102.5,
      "learning_rate": 9.456264775413712e-07,
      "loss": 8.0784,
      "num_input_tokens_seen": 5898240,
      "step": 5
    },
    {
      "epoch": 0.00070779756989501,
      "grad_norm": 95.0,
      "learning_rate": 1.1347517730496454e-06,
      "loss": 7.9578,
      "num_input_tokens_seen": 7077888,
      "step": 6
    },
    {
      "epoch": 0.0008257638315441783,
      "grad_norm": 93.5,
      "learning_rate": 1.3238770685579196e-06,
      "loss": 7.975,
      "num_input_tokens_seen": 8257536,
      "step": 7
    },
    {
      "epoch": 0.0009437300931933467,
      "grad_norm": 85.0,
      "learning_rate": 1.513002364066194e-06,
      "loss": 7.8405,
      "num_input_tokens_seen": 9437184,
      "step": 8
    },
    {
      "epoch": 0.001061696354842515,
      "grad_norm": 83.0,
      "learning_rate": 1.7021276595744682e-06,
      "loss": 7.841,
      "num_input_tokens_seen": 10616832,
      "step": 9
    },
    {
      "epoch": 0.0011796626164916834,
      "grad_norm": 82.5,
      "learning_rate": 1.8912529550827423e-06,
      "loss": 7.9491,
      "num_input_tokens_seen": 11796480,
      "step": 10
    },
    {
      "epoch": 0.0012976288781408518,
      "grad_norm": 68.0,
      "learning_rate": 2.0803782505910165e-06,
      "loss": 7.5612,
      "num_input_tokens_seen": 12976128,
      "step": 11
    },
    {
      "epoch": 0.00141559513979002,
      "grad_norm": 66.5,
      "learning_rate": 2.269503546099291e-06,
      "loss": 7.5808,
      "num_input_tokens_seen": 14155776,
      "step": 12
    },
    {
      "epoch": 0.0015335614014391884,
      "grad_norm": 57.5,
      "learning_rate": 2.4586288416075653e-06,
      "loss": 7.4494,
      "num_input_tokens_seen": 15335424,
      "step": 13
    },
    {
      "epoch": 0.0016515276630883566,
      "grad_norm": 51.25,
      "learning_rate": 2.6477541371158392e-06,
      "loss": 7.4606,
      "num_input_tokens_seen": 16515072,
      "step": 14
    },
    {
      "epoch": 0.001769493924737525,
      "grad_norm": 42.75,
      "learning_rate": 2.836879432624114e-06,
      "loss": 7.2804,
      "num_input_tokens_seen": 17694720,
      "step": 15
    },
    {
      "epoch": 0.0018874601863866935,
      "grad_norm": 38.0,
      "learning_rate": 3.026004728132388e-06,
      "loss": 7.2173,
      "num_input_tokens_seen": 18874368,
      "step": 16
    },
    {
      "epoch": 0.0020054264480358617,
      "grad_norm": 33.75,
      "learning_rate": 3.2151300236406624e-06,
      "loss": 7.0934,
      "num_input_tokens_seen": 20054016,
      "step": 17
    },
    {
      "epoch": 0.00212339270968503,
      "grad_norm": 30.5,
      "learning_rate": 3.4042553191489363e-06,
      "loss": 7.1284,
      "num_input_tokens_seen": 21233664,
      "step": 18
    },
    {
      "epoch": 0.0022413589713341986,
      "grad_norm": 26.5,
      "learning_rate": 3.5933806146572107e-06,
      "loss": 6.932,
      "num_input_tokens_seen": 22413312,
      "step": 19
    },
    {
      "epoch": 0.0023593252329833668,
      "grad_norm": 23.875,
      "learning_rate": 3.7825059101654847e-06,
      "loss": 6.9163,
      "num_input_tokens_seen": 23592960,
      "step": 20
    },
    {
      "epoch": 0.002477291494632535,
      "grad_norm": 21.625,
      "learning_rate": 3.9716312056737595e-06,
      "loss": 6.8335,
      "num_input_tokens_seen": 24772608,
      "step": 21
    },
    {
      "epoch": 0.0025952577562817036,
      "grad_norm": 18.625,
      "learning_rate": 4.160756501182033e-06,
      "loss": 6.6761,
      "num_input_tokens_seen": 25952256,
      "step": 22
    },
    {
      "epoch": 0.002713224017930872,
      "grad_norm": 16.0,
      "learning_rate": 4.349881796690308e-06,
      "loss": 6.6474,
      "num_input_tokens_seen": 27131904,
      "step": 23
    },
    {
      "epoch": 0.00283119027958004,
      "grad_norm": 18.0,
      "learning_rate": 4.539007092198582e-06,
      "loss": 6.6062,
      "num_input_tokens_seen": 28311552,
      "step": 24
    },
    {
      "epoch": 0.0029491565412292082,
      "grad_norm": 17.5,
      "learning_rate": 4.728132387706856e-06,
      "loss": 6.6435,
      "num_input_tokens_seen": 29491200,
      "step": 25
    },
    {
      "epoch": 0.003067122802878377,
      "grad_norm": 14.25,
      "learning_rate": 4.9172576832151305e-06,
      "loss": 6.57,
      "num_input_tokens_seen": 30670848,
      "step": 26
    },
    {
      "epoch": 0.003185089064527545,
      "grad_norm": 13.3125,
      "learning_rate": 5.106382978723404e-06,
      "loss": 6.4037,
      "num_input_tokens_seen": 31850496,
      "step": 27
    },
    {
      "epoch": 0.0033030553261767133,
      "grad_norm": 10.8125,
      "learning_rate": 5.2955082742316784e-06,
      "loss": 6.2927,
      "num_input_tokens_seen": 33030144,
      "step": 28
    },
    {
      "epoch": 0.003421021587825882,
      "grad_norm": 9.75,
      "learning_rate": 5.484633569739954e-06,
      "loss": 6.2084,
      "num_input_tokens_seen": 34209792,
      "step": 29
    },
    {
      "epoch": 0.00353898784947505,
      "grad_norm": 10.25,
      "learning_rate": 5.673758865248228e-06,
      "loss": 6.2349,
      "num_input_tokens_seen": 35389440,
      "step": 30
    },
    {
      "epoch": 0.0036569541111242184,
      "grad_norm": 10.8125,
      "learning_rate": 5.862884160756502e-06,
      "loss": 6.155,
      "num_input_tokens_seen": 36569088,
      "step": 31
    },
    {
      "epoch": 0.003774920372773387,
      "grad_norm": 10.0,
      "learning_rate": 6.052009456264776e-06,
      "loss": 6.1846,
      "num_input_tokens_seen": 37748736,
      "step": 32
    },
    {
      "epoch": 0.003892886634422555,
      "grad_norm": 8.0,
      "learning_rate": 6.24113475177305e-06,
      "loss": 6.0656,
      "num_input_tokens_seen": 38928384,
      "step": 33
    },
    {
      "epoch": 0.004010852896071723,
      "grad_norm": 8.0625,
      "learning_rate": 6.430260047281325e-06,
      "loss": 5.9873,
      "num_input_tokens_seen": 40108032,
      "step": 34
    },
    {
      "epoch": 0.004128819157720892,
      "grad_norm": 8.875,
      "learning_rate": 6.619385342789598e-06,
      "loss": 6.0008,
      "num_input_tokens_seen": 41287680,
      "step": 35
    },
    {
      "epoch": 0.00424678541937006,
      "grad_norm": 7.34375,
      "learning_rate": 6.808510638297873e-06,
      "loss": 6.0852,
      "num_input_tokens_seen": 42467328,
      "step": 36
    },
    {
      "epoch": 0.004364751681019229,
      "grad_norm": 6.28125,
      "learning_rate": 6.997635933806147e-06,
      "loss": 5.8264,
      "num_input_tokens_seen": 43646976,
      "step": 37
    },
    {
      "epoch": 0.004482717942668397,
      "grad_norm": 6.75,
      "learning_rate": 7.186761229314421e-06,
      "loss": 5.9511,
      "num_input_tokens_seen": 44826624,
      "step": 38
    },
    {
      "epoch": 0.004600684204317565,
      "grad_norm": 6.96875,
      "learning_rate": 7.375886524822695e-06,
      "loss": 5.8042,
      "num_input_tokens_seen": 46006272,
      "step": 39
    },
    {
      "epoch": 0.0047186504659667335,
      "grad_norm": 5.59375,
      "learning_rate": 7.565011820330969e-06,
      "loss": 5.8955,
      "num_input_tokens_seen": 47185920,
      "step": 40
    },
    {
      "epoch": 0.004836616727615902,
      "grad_norm": 5.3125,
      "learning_rate": 7.754137115839244e-06,
      "loss": 5.7764,
      "num_input_tokens_seen": 48365568,
      "step": 41
    },
    {
      "epoch": 0.00495458298926507,
      "grad_norm": 5.40625,
      "learning_rate": 7.943262411347519e-06,
      "loss": 5.7784,
      "num_input_tokens_seen": 49545216,
      "step": 42
    },
    {
      "epoch": 0.005072549250914238,
      "grad_norm": 5.03125,
      "learning_rate": 8.132387706855792e-06,
      "loss": 5.7388,
      "num_input_tokens_seen": 50724864,
      "step": 43
    },
    {
      "epoch": 0.005190515512563407,
      "grad_norm": 4.84375,
      "learning_rate": 8.321513002364066e-06,
      "loss": 5.7821,
      "num_input_tokens_seen": 51904512,
      "step": 44
    },
    {
      "epoch": 0.0053084817742125754,
      "grad_norm": 4.78125,
      "learning_rate": 8.510638297872341e-06,
      "loss": 5.6466,
      "num_input_tokens_seen": 53084160,
      "step": 45
    },
    {
      "epoch": 0.005426448035861744,
      "grad_norm": 4.125,
      "learning_rate": 8.699763593380616e-06,
      "loss": 5.6224,
      "num_input_tokens_seen": 54263808,
      "step": 46
    },
    {
      "epoch": 0.005544414297510912,
      "grad_norm": 5.0,
      "learning_rate": 8.888888888888888e-06,
      "loss": 5.9019,
      "num_input_tokens_seen": 55443456,
      "step": 47
    },
    {
      "epoch": 0.00566238055916008,
      "grad_norm": 4.0,
      "learning_rate": 9.078014184397164e-06,
      "loss": 5.5812,
      "num_input_tokens_seen": 56623104,
      "step": 48
    },
    {
      "epoch": 0.005780346820809248,
      "grad_norm": 3.796875,
      "learning_rate": 9.267139479905439e-06,
      "loss": 5.633,
      "num_input_tokens_seen": 57802752,
      "step": 49
    },
    {
      "epoch": 0.0058983130824584165,
      "grad_norm": 3.84375,
      "learning_rate": 9.456264775413712e-06,
      "loss": 5.5031,
      "num_input_tokens_seen": 58982400,
      "step": 50
    },
    {
      "epoch": 0.0060162793441075856,
      "grad_norm": 3.4375,
      "learning_rate": 9.645390070921986e-06,
      "loss": 5.5146,
      "num_input_tokens_seen": 60162048,
      "step": 51
    },
    {
      "epoch": 0.006134245605756754,
      "grad_norm": 3.609375,
      "learning_rate": 9.834515366430261e-06,
      "loss": 5.3805,
      "num_input_tokens_seen": 61341696,
      "step": 52
    },
    {
      "epoch": 0.006252211867405922,
      "grad_norm": 3.234375,
      "learning_rate": 1.0023640661938535e-05,
      "loss": 5.4098,
      "num_input_tokens_seen": 62521344,
      "step": 53
    },
    {
      "epoch": 0.00637017812905509,
      "grad_norm": 3.40625,
      "learning_rate": 1.0212765957446808e-05,
      "loss": 5.3773,
      "num_input_tokens_seen": 63700992,
      "step": 54
    },
    {
      "epoch": 0.006488144390704258,
      "grad_norm": 3.1875,
      "learning_rate": 1.0401891252955083e-05,
      "loss": 5.3287,
      "num_input_tokens_seen": 64880640,
      "step": 55
    },
    {
      "epoch": 0.006606110652353427,
      "grad_norm": 3.046875,
      "learning_rate": 1.0591016548463357e-05,
      "loss": 5.2282,
      "num_input_tokens_seen": 66060288,
      "step": 56
    },
    {
      "epoch": 0.006724076914002596,
      "grad_norm": 3.0625,
      "learning_rate": 1.0780141843971632e-05,
      "loss": 5.2967,
      "num_input_tokens_seen": 67239936,
      "step": 57
    },
    {
      "epoch": 0.006842043175651764,
      "grad_norm": 2.859375,
      "learning_rate": 1.0969267139479907e-05,
      "loss": 5.1126,
      "num_input_tokens_seen": 68419584,
      "step": 58
    },
    {
      "epoch": 0.006960009437300932,
      "grad_norm": 3.0625,
      "learning_rate": 1.1158392434988181e-05,
      "loss": 5.3309,
      "num_input_tokens_seen": 69599232,
      "step": 59
    },
    {
      "epoch": 0.0070779756989501,
      "grad_norm": 3.0625,
      "learning_rate": 1.1347517730496456e-05,
      "loss": 5.2134,
      "num_input_tokens_seen": 70778880,
      "step": 60
    },
    {
      "epoch": 0.0071959419605992685,
      "grad_norm": 3.546875,
      "learning_rate": 1.153664302600473e-05,
      "loss": 5.1972,
      "num_input_tokens_seen": 71958528,
      "step": 61
    },
    {
      "epoch": 0.007313908222248437,
      "grad_norm": 2.828125,
      "learning_rate": 1.1725768321513003e-05,
      "loss": 5.0123,
      "num_input_tokens_seen": 73138176,
      "step": 62
    },
    {
      "epoch": 0.007431874483897605,
      "grad_norm": 3.203125,
      "learning_rate": 1.1914893617021277e-05,
      "loss": 5.1108,
      "num_input_tokens_seen": 74317824,
      "step": 63
    },
    {
      "epoch": 0.007549840745546774,
      "grad_norm": 3.609375,
      "learning_rate": 1.2104018912529552e-05,
      "loss": 5.2287,
      "num_input_tokens_seen": 75497472,
      "step": 64
    },
    {
      "epoch": 0.007667807007195942,
      "grad_norm": 2.703125,
      "learning_rate": 1.2293144208037825e-05,
      "loss": 5.0902,
      "num_input_tokens_seen": 76677120,
      "step": 65
    },
    {
      "epoch": 0.00778577326884511,
      "grad_norm": 3.1875,
      "learning_rate": 1.24822695035461e-05,
      "loss": 5.1526,
      "num_input_tokens_seen": 77856768,
      "step": 66
    },
    {
      "epoch": 0.007903739530494279,
      "grad_norm": 3.359375,
      "learning_rate": 1.2671394799054376e-05,
      "loss": 4.8855,
      "num_input_tokens_seen": 79036416,
      "step": 67
    },
    {
      "epoch": 0.008021705792143447,
      "grad_norm": 2.765625,
      "learning_rate": 1.286052009456265e-05,
      "loss": 5.0226,
      "num_input_tokens_seen": 80216064,
      "step": 68
    },
    {
      "epoch": 0.008139672053792615,
      "grad_norm": 3.078125,
      "learning_rate": 1.3049645390070925e-05,
      "loss": 5.1366,
      "num_input_tokens_seen": 81395712,
      "step": 69
    },
    {
      "epoch": 0.008257638315441783,
      "grad_norm": 3.71875,
      "learning_rate": 1.3238770685579197e-05,
      "loss": 5.0749,
      "num_input_tokens_seen": 82575360,
      "step": 70
    },
    {
      "epoch": 0.008375604577090951,
      "grad_norm": 2.765625,
      "learning_rate": 1.3427895981087472e-05,
      "loss": 4.8899,
      "num_input_tokens_seen": 83755008,
      "step": 71
    },
    {
      "epoch": 0.00849357083874012,
      "grad_norm": 3.140625,
      "learning_rate": 1.3617021276595745e-05,
      "loss": 4.8809,
      "num_input_tokens_seen": 84934656,
      "step": 72
    },
    {
      "epoch": 0.008611537100389288,
      "grad_norm": 3.296875,
      "learning_rate": 1.380614657210402e-05,
      "loss": 4.8839,
      "num_input_tokens_seen": 86114304,
      "step": 73
    },
    {
      "epoch": 0.008729503362038458,
      "grad_norm": 2.953125,
      "learning_rate": 1.3995271867612294e-05,
      "loss": 4.9931,
      "num_input_tokens_seen": 87293952,
      "step": 74
    },
    {
      "epoch": 0.008847469623687626,
      "grad_norm": 2.9375,
      "learning_rate": 1.418439716312057e-05,
      "loss": 4.8041,
      "num_input_tokens_seen": 88473600,
      "step": 75
    },
    {
      "epoch": 0.008965435885336794,
      "grad_norm": 3.5625,
      "learning_rate": 1.4373522458628843e-05,
      "loss": 4.8611,
      "num_input_tokens_seen": 89653248,
      "step": 76
    },
    {
      "epoch": 0.009083402146985962,
      "grad_norm": 3.734375,
      "learning_rate": 1.4562647754137118e-05,
      "loss": 4.7747,
      "num_input_tokens_seen": 90832896,
      "step": 77
    },
    {
      "epoch": 0.00920136840863513,
      "grad_norm": 2.984375,
      "learning_rate": 1.475177304964539e-05,
      "loss": 4.6884,
      "num_input_tokens_seen": 92012544,
      "step": 78
    },
    {
      "epoch": 0.009319334670284299,
      "grad_norm": 2.671875,
      "learning_rate": 1.4940898345153665e-05,
      "loss": 4.6617,
      "num_input_tokens_seen": 93192192,
      "step": 79
    },
    {
      "epoch": 0.009437300931933467,
      "grad_norm": 2.84375,
      "learning_rate": 1.5130023640661939e-05,
      "loss": 4.6174,
      "num_input_tokens_seen": 94371840,
      "step": 80
    },
    {
      "epoch": 0.009555267193582635,
      "grad_norm": 3.71875,
      "learning_rate": 1.5319148936170214e-05,
      "loss": 4.5483,
      "num_input_tokens_seen": 95551488,
      "step": 81
    },
    {
      "epoch": 0.009673233455231803,
      "grad_norm": 4.34375,
      "learning_rate": 1.5508274231678487e-05,
      "loss": 4.604,
      "num_input_tokens_seen": 96731136,
      "step": 82
    },
    {
      "epoch": 0.009791199716880972,
      "grad_norm": 3.09375,
      "learning_rate": 1.5697399527186764e-05,
      "loss": 4.5806,
      "num_input_tokens_seen": 97910784,
      "step": 83
    },
    {
      "epoch": 0.00990916597853014,
      "grad_norm": 3.203125,
      "learning_rate": 1.5886524822695038e-05,
      "loss": 4.4723,
      "num_input_tokens_seen": 99090432,
      "step": 84
    },
    {
      "epoch": 0.010027132240179308,
      "grad_norm": 3.8125,
      "learning_rate": 1.607565011820331e-05,
      "loss": 4.4723,
      "num_input_tokens_seen": 100270080,
      "step": 85
    },
    {
      "epoch": 0.010145098501828476,
      "grad_norm": 3.03125,
      "learning_rate": 1.6264775413711585e-05,
      "loss": 4.4519,
      "num_input_tokens_seen": 101449728,
      "step": 86
    },
    {
      "epoch": 0.010263064763477646,
      "grad_norm": 5.53125,
      "learning_rate": 1.645390070921986e-05,
      "loss": 4.4077,
      "num_input_tokens_seen": 102629376,
      "step": 87
    },
    {
      "epoch": 0.010381031025126814,
      "grad_norm": 4.03125,
      "learning_rate": 1.6643026004728132e-05,
      "loss": 4.5295,
      "num_input_tokens_seen": 103809024,
      "step": 88
    },
    {
      "epoch": 0.010498997286775983,
      "grad_norm": 3.484375,
      "learning_rate": 1.683215130023641e-05,
      "loss": 4.3235,
      "num_input_tokens_seen": 104988672,
      "step": 89
    },
    {
      "epoch": 0.010616963548425151,
      "grad_norm": 5.78125,
      "learning_rate": 1.7021276595744682e-05,
      "loss": 4.3245,
      "num_input_tokens_seen": 106168320,
      "step": 90
    },
    {
      "epoch": 0.010734929810074319,
      "grad_norm": 3.0625,
      "learning_rate": 1.7210401891252956e-05,
      "loss": 4.2554,
      "num_input_tokens_seen": 107347968,
      "step": 91
    },
    {
      "epoch": 0.010852896071723487,
      "grad_norm": 3.671875,
      "learning_rate": 1.7399527186761233e-05,
      "loss": 4.2572,
      "num_input_tokens_seen": 108527616,
      "step": 92
    },
    {
      "epoch": 0.010970862333372655,
      "grad_norm": 5.0625,
      "learning_rate": 1.7588652482269506e-05,
      "loss": 4.2442,
      "num_input_tokens_seen": 109707264,
      "step": 93
    },
    {
      "epoch": 0.011088828595021824,
      "grad_norm": 2.6875,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 4.3006,
      "num_input_tokens_seen": 110886912,
      "step": 94
    },
    {
      "epoch": 0.011206794856670992,
      "grad_norm": 3.171875,
      "learning_rate": 1.7966903073286054e-05,
      "loss": 4.2813,
      "num_input_tokens_seen": 112066560,
      "step": 95
    },
    {
      "epoch": 0.01132476111832016,
      "grad_norm": 2.765625,
      "learning_rate": 1.8156028368794327e-05,
      "loss": 4.1186,
      "num_input_tokens_seen": 113246208,
      "step": 96
    },
    {
      "epoch": 0.011442727379969328,
      "grad_norm": 3.1875,
      "learning_rate": 1.83451536643026e-05,
      "loss": 4.1652,
      "num_input_tokens_seen": 114425856,
      "step": 97
    },
    {
      "epoch": 0.011560693641618497,
      "grad_norm": 4.71875,
      "learning_rate": 1.8534278959810878e-05,
      "loss": 4.1923,
      "num_input_tokens_seen": 115605504,
      "step": 98
    },
    {
      "epoch": 0.011678659903267665,
      "grad_norm": 3.359375,
      "learning_rate": 1.872340425531915e-05,
      "loss": 4.0728,
      "num_input_tokens_seen": 116785152,
      "step": 99
    },
    {
      "epoch": 0.011796626164916833,
      "grad_norm": 2.484375,
      "learning_rate": 1.8912529550827425e-05,
      "loss": 4.2189,
      "num_input_tokens_seen": 117964800,
      "step": 100
    },
    {
      "epoch": 0.011914592426566003,
      "grad_norm": 3.109375,
      "learning_rate": 1.91016548463357e-05,
      "loss": 4.0632,
      "num_input_tokens_seen": 119144448,
      "step": 101
    },
    {
      "epoch": 0.012032558688215171,
      "grad_norm": 4.40625,
      "learning_rate": 1.929078014184397e-05,
      "loss": 3.8775,
      "num_input_tokens_seen": 120324096,
      "step": 102
    },
    {
      "epoch": 0.01215052494986434,
      "grad_norm": 2.5625,
      "learning_rate": 1.9479905437352245e-05,
      "loss": 3.8658,
      "num_input_tokens_seen": 121503744,
      "step": 103
    },
    {
      "epoch": 0.012268491211513508,
      "grad_norm": 3.703125,
      "learning_rate": 1.9669030732860522e-05,
      "loss": 3.9888,
      "num_input_tokens_seen": 122683392,
      "step": 104
    },
    {
      "epoch": 0.012386457473162676,
      "grad_norm": 2.78125,
      "learning_rate": 1.9858156028368796e-05,
      "loss": 3.9439,
      "num_input_tokens_seen": 123863040,
      "step": 105
    },
    {
      "epoch": 0.012504423734811844,
      "grad_norm": 3.4375,
      "learning_rate": 2.004728132387707e-05,
      "loss": 4.0098,
      "num_input_tokens_seen": 125042688,
      "step": 106
    },
    {
      "epoch": 0.012622389996461012,
      "grad_norm": 2.71875,
      "learning_rate": 2.0236406619385343e-05,
      "loss": 3.9454,
      "num_input_tokens_seen": 126222336,
      "step": 107
    },
    {
      "epoch": 0.01274035625811018,
      "grad_norm": 3.03125,
      "learning_rate": 2.0425531914893616e-05,
      "loss": 3.8491,
      "num_input_tokens_seen": 127401984,
      "step": 108
    },
    {
      "epoch": 0.012858322519759349,
      "grad_norm": 3.59375,
      "learning_rate": 2.0614657210401893e-05,
      "loss": 3.9602,
      "num_input_tokens_seen": 128581632,
      "step": 109
    },
    {
      "epoch": 0.012976288781408517,
      "grad_norm": 5.25,
      "learning_rate": 2.0803782505910167e-05,
      "loss": 3.9542,
      "num_input_tokens_seen": 129761280,
      "step": 110
    },
    {
      "epoch": 0.013094255043057685,
      "grad_norm": 2.34375,
      "learning_rate": 2.099290780141844e-05,
      "loss": 3.7846,
      "num_input_tokens_seen": 130940928,
      "step": 111
    },
    {
      "epoch": 0.013212221304706853,
      "grad_norm": 3.484375,
      "learning_rate": 2.1182033096926714e-05,
      "loss": 3.9128,
      "num_input_tokens_seen": 132120576,
      "step": 112
    },
    {
      "epoch": 0.013330187566356021,
      "grad_norm": 7.4375,
      "learning_rate": 2.137115839243499e-05,
      "loss": 3.7365,
      "num_input_tokens_seen": 133300224,
      "step": 113
    },
    {
      "epoch": 0.013448153828005191,
      "grad_norm": 3.6875,
      "learning_rate": 2.1560283687943264e-05,
      "loss": 3.8298,
      "num_input_tokens_seen": 134479872,
      "step": 114
    },
    {
      "epoch": 0.01356612008965436,
      "grad_norm": 11.25,
      "learning_rate": 2.1749408983451538e-05,
      "loss": 3.9192,
      "num_input_tokens_seen": 135659520,
      "step": 115
    },
    {
      "epoch": 0.013684086351303528,
      "grad_norm": 9.875,
      "learning_rate": 2.1938534278959815e-05,
      "loss": 3.7628,
      "num_input_tokens_seen": 136839168,
      "step": 116
    },
    {
      "epoch": 0.013802052612952696,
      "grad_norm": 4.90625,
      "learning_rate": 2.2127659574468088e-05,
      "loss": 3.7212,
      "num_input_tokens_seen": 138018816,
      "step": 117
    },
    {
      "epoch": 0.013920018874601864,
      "grad_norm": 6.875,
      "learning_rate": 2.2316784869976362e-05,
      "loss": 3.8291,
      "num_input_tokens_seen": 139198464,
      "step": 118
    },
    {
      "epoch": 0.014037985136251032,
      "grad_norm": 3.984375,
      "learning_rate": 2.2505910165484635e-05,
      "loss": 3.7104,
      "num_input_tokens_seen": 140378112,
      "step": 119
    },
    {
      "epoch": 0.0141559513979002,
      "grad_norm": 5.0625,
      "learning_rate": 2.2695035460992912e-05,
      "loss": 3.6898,
      "num_input_tokens_seen": 141557760,
      "step": 120
    },
    {
      "epoch": 0.014273917659549369,
      "grad_norm": 3.828125,
      "learning_rate": 2.2884160756501186e-05,
      "loss": 3.61,
      "num_input_tokens_seen": 142737408,
      "step": 121
    },
    {
      "epoch": 0.014391883921198537,
      "grad_norm": 4.875,
      "learning_rate": 2.307328605200946e-05,
      "loss": 3.6886,
      "num_input_tokens_seen": 143917056,
      "step": 122
    },
    {
      "epoch": 0.014509850182847705,
      "grad_norm": 4.75,
      "learning_rate": 2.326241134751773e-05,
      "loss": 3.6435,
      "num_input_tokens_seen": 145096704,
      "step": 123
    },
    {
      "epoch": 0.014627816444496873,
      "grad_norm": 3.75,
      "learning_rate": 2.3451536643026006e-05,
      "loss": 3.6619,
      "num_input_tokens_seen": 146276352,
      "step": 124
    },
    {
      "epoch": 0.014745782706146042,
      "grad_norm": 5.375,
      "learning_rate": 2.364066193853428e-05,
      "loss": 3.631,
      "num_input_tokens_seen": 147456000,
      "step": 125
    },
    {
      "epoch": 0.01486374896779521,
      "grad_norm": 4.125,
      "learning_rate": 2.3829787234042553e-05,
      "loss": 3.6482,
      "num_input_tokens_seen": 148635648,
      "step": 126
    },
    {
      "epoch": 0.014981715229444378,
      "grad_norm": 7.78125,
      "learning_rate": 2.4018912529550827e-05,
      "loss": 3.6785,
      "num_input_tokens_seen": 149815296,
      "step": 127
    },
    {
      "epoch": 0.015099681491093548,
      "grad_norm": 7.0,
      "learning_rate": 2.4208037825059104e-05,
      "loss": 3.6113,
      "num_input_tokens_seen": 150994944,
      "step": 128
    },
    {
      "epoch": 0.015217647752742716,
      "grad_norm": 4.46875,
      "learning_rate": 2.4397163120567377e-05,
      "loss": 3.6015,
      "num_input_tokens_seen": 152174592,
      "step": 129
    },
    {
      "epoch": 0.015335614014391884,
      "grad_norm": 4.1875,
      "learning_rate": 2.458628841607565e-05,
      "loss": 3.5241,
      "num_input_tokens_seen": 153354240,
      "step": 130
    },
    {
      "epoch": 0.015453580276041053,
      "grad_norm": 4.0625,
      "learning_rate": 2.4775413711583928e-05,
      "loss": 3.6007,
      "num_input_tokens_seen": 154533888,
      "step": 131
    },
    {
      "epoch": 0.01557154653769022,
      "grad_norm": 3.046875,
      "learning_rate": 2.49645390070922e-05,
      "loss": 3.5949,
      "num_input_tokens_seen": 155713536,
      "step": 132
    },
    {
      "epoch": 0.01568951279933939,
      "grad_norm": 3.265625,
      "learning_rate": 2.5153664302600475e-05,
      "loss": 3.5403,
      "num_input_tokens_seen": 156893184,
      "step": 133
    },
    {
      "epoch": 0.015807479060988557,
      "grad_norm": 2.859375,
      "learning_rate": 2.5342789598108752e-05,
      "loss": 3.5399,
      "num_input_tokens_seen": 158072832,
      "step": 134
    },
    {
      "epoch": 0.015925445322637725,
      "grad_norm": 2.6875,
      "learning_rate": 2.5531914893617025e-05,
      "loss": 3.5408,
      "num_input_tokens_seen": 159252480,
      "step": 135
    },
    {
      "epoch": 0.016043411584286894,
      "grad_norm": 2.03125,
      "learning_rate": 2.57210401891253e-05,
      "loss": 3.4678,
      "num_input_tokens_seen": 160432128,
      "step": 136
    },
    {
      "epoch": 0.016161377845936062,
      "grad_norm": 2.796875,
      "learning_rate": 2.5910165484633572e-05,
      "loss": 3.6019,
      "num_input_tokens_seen": 161611776,
      "step": 137
    },
    {
      "epoch": 0.01627934410758523,
      "grad_norm": 1.96875,
      "learning_rate": 2.609929078014185e-05,
      "loss": 3.4525,
      "num_input_tokens_seen": 162791424,
      "step": 138
    },
    {
      "epoch": 0.0163973103692344,
      "grad_norm": 2.390625,
      "learning_rate": 2.628841607565012e-05,
      "loss": 3.557,
      "num_input_tokens_seen": 163971072,
      "step": 139
    },
    {
      "epoch": 0.016515276630883566,
      "grad_norm": 2.734375,
      "learning_rate": 2.6477541371158393e-05,
      "loss": 3.5324,
      "num_input_tokens_seen": 165150720,
      "step": 140
    },
    {
      "epoch": 0.016633242892532735,
      "grad_norm": 3.703125,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 3.4707,
      "num_input_tokens_seen": 166330368,
      "step": 141
    },
    {
      "epoch": 0.016751209154181903,
      "grad_norm": 3.53125,
      "learning_rate": 2.6855791962174944e-05,
      "loss": 3.3545,
      "num_input_tokens_seen": 167510016,
      "step": 142
    },
    {
      "epoch": 0.01686917541583107,
      "grad_norm": 1.7890625,
      "learning_rate": 2.7044917257683217e-05,
      "loss": 3.4051,
      "num_input_tokens_seen": 168689664,
      "step": 143
    },
    {
      "epoch": 0.01698714167748024,
      "grad_norm": 4.78125,
      "learning_rate": 2.723404255319149e-05,
      "loss": 3.4119,
      "num_input_tokens_seen": 169869312,
      "step": 144
    },
    {
      "epoch": 0.017105107939129408,
      "grad_norm": 3.0625,
      "learning_rate": 2.7423167848699764e-05,
      "loss": 3.5149,
      "num_input_tokens_seen": 171048960,
      "step": 145
    },
    {
      "epoch": 0.017223074200778576,
      "grad_norm": 3.671875,
      "learning_rate": 2.761229314420804e-05,
      "loss": 3.3442,
      "num_input_tokens_seen": 172228608,
      "step": 146
    },
    {
      "epoch": 0.017341040462427744,
      "grad_norm": 3.453125,
      "learning_rate": 2.7801418439716315e-05,
      "loss": 3.3277,
      "num_input_tokens_seen": 173408256,
      "step": 147
    },
    {
      "epoch": 0.017459006724076916,
      "grad_norm": 3.4375,
      "learning_rate": 2.7990543735224588e-05,
      "loss": 3.3905,
      "num_input_tokens_seen": 174587904,
      "step": 148
    },
    {
      "epoch": 0.017576972985726084,
      "grad_norm": 2.0,
      "learning_rate": 2.8179669030732865e-05,
      "loss": 3.276,
      "num_input_tokens_seen": 175767552,
      "step": 149
    },
    {
      "epoch": 0.017694939247375252,
      "grad_norm": 4.0,
      "learning_rate": 2.836879432624114e-05,
      "loss": 3.3211,
      "num_input_tokens_seen": 176947200,
      "step": 150
    },
    {
      "epoch": 0.01781290550902442,
      "grad_norm": 4.03125,
      "learning_rate": 2.8557919621749412e-05,
      "loss": 3.3483,
      "num_input_tokens_seen": 178126848,
      "step": 151
    },
    {
      "epoch": 0.01793087177067359,
      "grad_norm": 2.53125,
      "learning_rate": 2.8747044917257686e-05,
      "loss": 3.3391,
      "num_input_tokens_seen": 179306496,
      "step": 152
    },
    {
      "epoch": 0.018048838032322757,
      "grad_norm": 5.5625,
      "learning_rate": 2.8936170212765963e-05,
      "loss": 3.4719,
      "num_input_tokens_seen": 180486144,
      "step": 153
    },
    {
      "epoch": 0.018166804293971925,
      "grad_norm": 3.171875,
      "learning_rate": 2.9125295508274236e-05,
      "loss": 3.2727,
      "num_input_tokens_seen": 181665792,
      "step": 154
    },
    {
      "epoch": 0.018284770555621093,
      "grad_norm": 5.09375,
      "learning_rate": 2.9314420803782506e-05,
      "loss": 3.284,
      "num_input_tokens_seen": 182845440,
      "step": 155
    },
    {
      "epoch": 0.01840273681727026,
      "grad_norm": 3.25,
      "learning_rate": 2.950354609929078e-05,
      "loss": 3.2279,
      "num_input_tokens_seen": 184025088,
      "step": 156
    },
    {
      "epoch": 0.01852070307891943,
      "grad_norm": 3.71875,
      "learning_rate": 2.9692671394799057e-05,
      "loss": 3.2438,
      "num_input_tokens_seen": 185204736,
      "step": 157
    },
    {
      "epoch": 0.018638669340568598,
      "grad_norm": 3.875,
      "learning_rate": 2.988179669030733e-05,
      "loss": 3.3257,
      "num_input_tokens_seen": 186384384,
      "step": 158
    },
    {
      "epoch": 0.018756635602217766,
      "grad_norm": 2.21875,
      "learning_rate": 3.0070921985815604e-05,
      "loss": 3.2727,
      "num_input_tokens_seen": 187564032,
      "step": 159
    },
    {
      "epoch": 0.018874601863866934,
      "grad_norm": 4.125,
      "learning_rate": 3.0260047281323877e-05,
      "loss": 3.245,
      "num_input_tokens_seen": 188743680,
      "step": 160
    },
    {
      "epoch": 0.018992568125516102,
      "grad_norm": 3.640625,
      "learning_rate": 3.0449172576832154e-05,
      "loss": 3.1904,
      "num_input_tokens_seen": 189923328,
      "step": 161
    },
    {
      "epoch": 0.01911053438716527,
      "grad_norm": 2.625,
      "learning_rate": 3.063829787234043e-05,
      "loss": 3.2754,
      "num_input_tokens_seen": 191102976,
      "step": 162
    },
    {
      "epoch": 0.01922850064881444,
      "grad_norm": 3.578125,
      "learning_rate": 3.0827423167848705e-05,
      "loss": 3.1889,
      "num_input_tokens_seen": 192282624,
      "step": 163
    },
    {
      "epoch": 0.019346466910463607,
      "grad_norm": 2.953125,
      "learning_rate": 3.1016548463356975e-05,
      "loss": 3.2809,
      "num_input_tokens_seen": 193462272,
      "step": 164
    },
    {
      "epoch": 0.019464433172112775,
      "grad_norm": 2.234375,
      "learning_rate": 3.120567375886525e-05,
      "loss": 3.1929,
      "num_input_tokens_seen": 194641920,
      "step": 165
    },
    {
      "epoch": 0.019582399433761943,
      "grad_norm": 2.515625,
      "learning_rate": 3.139479905437353e-05,
      "loss": 3.1474,
      "num_input_tokens_seen": 195821568,
      "step": 166
    },
    {
      "epoch": 0.01970036569541111,
      "grad_norm": 4.09375,
      "learning_rate": 3.15839243498818e-05,
      "loss": 3.0433,
      "num_input_tokens_seen": 197001216,
      "step": 167
    },
    {
      "epoch": 0.01981833195706028,
      "grad_norm": 2.6875,
      "learning_rate": 3.1773049645390076e-05,
      "loss": 3.1527,
      "num_input_tokens_seen": 198180864,
      "step": 168
    },
    {
      "epoch": 0.019936298218709448,
      "grad_norm": 3.5,
      "learning_rate": 3.196217494089835e-05,
      "loss": 3.2145,
      "num_input_tokens_seen": 199360512,
      "step": 169
    },
    {
      "epoch": 0.020054264480358616,
      "grad_norm": 4.9375,
      "learning_rate": 3.215130023640662e-05,
      "loss": 3.2642,
      "num_input_tokens_seen": 200540160,
      "step": 170
    },
    {
      "epoch": 0.020172230742007784,
      "grad_norm": 2.484375,
      "learning_rate": 3.234042553191489e-05,
      "loss": 3.1286,
      "num_input_tokens_seen": 201719808,
      "step": 171
    },
    {
      "epoch": 0.020290197003656953,
      "grad_norm": 8.1875,
      "learning_rate": 3.252955082742317e-05,
      "loss": 3.1274,
      "num_input_tokens_seen": 202899456,
      "step": 172
    },
    {
      "epoch": 0.02040816326530612,
      "grad_norm": 6.375,
      "learning_rate": 3.271867612293144e-05,
      "loss": 3.096,
      "num_input_tokens_seen": 204079104,
      "step": 173
    },
    {
      "epoch": 0.020526129526955292,
      "grad_norm": 5.8125,
      "learning_rate": 3.290780141843972e-05,
      "loss": 3.1647,
      "num_input_tokens_seen": 205258752,
      "step": 174
    },
    {
      "epoch": 0.02064409578860446,
      "grad_norm": 5.6875,
      "learning_rate": 3.3096926713947994e-05,
      "loss": 3.269,
      "num_input_tokens_seen": 206438400,
      "step": 175
    },
    {
      "epoch": 0.02076206205025363,
      "grad_norm": 4.4375,
      "learning_rate": 3.3286052009456264e-05,
      "loss": 3.2106,
      "num_input_tokens_seen": 207618048,
      "step": 176
    },
    {
      "epoch": 0.020880028311902797,
      "grad_norm": 3.109375,
      "learning_rate": 3.347517730496454e-05,
      "loss": 3.1545,
      "num_input_tokens_seen": 208797696,
      "step": 177
    },
    {
      "epoch": 0.020997994573551965,
      "grad_norm": 6.6875,
      "learning_rate": 3.366430260047282e-05,
      "loss": 3.1045,
      "num_input_tokens_seen": 209977344,
      "step": 178
    },
    {
      "epoch": 0.021115960835201134,
      "grad_norm": 6.0625,
      "learning_rate": 3.385342789598109e-05,
      "loss": 3.0496,
      "num_input_tokens_seen": 211156992,
      "step": 179
    },
    {
      "epoch": 0.021233927096850302,
      "grad_norm": 4.03125,
      "learning_rate": 3.4042553191489365e-05,
      "loss": 3.081,
      "num_input_tokens_seen": 212336640,
      "step": 180
    },
    {
      "epoch": 0.02135189335849947,
      "grad_norm": 3.921875,
      "learning_rate": 3.423167848699764e-05,
      "loss": 3.0552,
      "num_input_tokens_seen": 213516288,
      "step": 181
    },
    {
      "epoch": 0.021469859620148638,
      "grad_norm": 4.5,
      "learning_rate": 3.442080378250591e-05,
      "loss": 3.0172,
      "num_input_tokens_seen": 214695936,
      "step": 182
    },
    {
      "epoch": 0.021587825881797806,
      "grad_norm": 4.0625,
      "learning_rate": 3.460992907801419e-05,
      "loss": 3.1379,
      "num_input_tokens_seen": 215875584,
      "step": 183
    },
    {
      "epoch": 0.021705792143446975,
      "grad_norm": 4.9375,
      "learning_rate": 3.4799054373522466e-05,
      "loss": 3.1235,
      "num_input_tokens_seen": 217055232,
      "step": 184
    },
    {
      "epoch": 0.021823758405096143,
      "grad_norm": 3.78125,
      "learning_rate": 3.4988179669030736e-05,
      "loss": 3.1189,
      "num_input_tokens_seen": 218234880,
      "step": 185
    },
    {
      "epoch": 0.02194172466674531,
      "grad_norm": 5.0625,
      "learning_rate": 3.517730496453901e-05,
      "loss": 3.0035,
      "num_input_tokens_seen": 219414528,
      "step": 186
    },
    {
      "epoch": 0.02205969092839448,
      "grad_norm": 4.28125,
      "learning_rate": 3.536643026004728e-05,
      "loss": 3.0478,
      "num_input_tokens_seen": 220594176,
      "step": 187
    },
    {
      "epoch": 0.022177657190043647,
      "grad_norm": 5.8125,
      "learning_rate": 3.555555555555555e-05,
      "loss": 3.0777,
      "num_input_tokens_seen": 221773824,
      "step": 188
    },
    {
      "epoch": 0.022295623451692816,
      "grad_norm": 5.0625,
      "learning_rate": 3.574468085106383e-05,
      "loss": 3.0665,
      "num_input_tokens_seen": 222953472,
      "step": 189
    },
    {
      "epoch": 0.022413589713341984,
      "grad_norm": 4.25,
      "learning_rate": 3.593380614657211e-05,
      "loss": 3.0271,
      "num_input_tokens_seen": 224133120,
      "step": 190
    },
    {
      "epoch": 0.022531555974991152,
      "grad_norm": 4.03125,
      "learning_rate": 3.612293144208038e-05,
      "loss": 3.033,
      "num_input_tokens_seen": 225312768,
      "step": 191
    },
    {
      "epoch": 0.02264952223664032,
      "grad_norm": 5.09375,
      "learning_rate": 3.6312056737588654e-05,
      "loss": 3.143,
      "num_input_tokens_seen": 226492416,
      "step": 192
    },
    {
      "epoch": 0.02276748849828949,
      "grad_norm": 4.09375,
      "learning_rate": 3.650118203309693e-05,
      "loss": 3.0347,
      "num_input_tokens_seen": 227672064,
      "step": 193
    },
    {
      "epoch": 0.022885454759938657,
      "grad_norm": 4.65625,
      "learning_rate": 3.66903073286052e-05,
      "loss": 3.07,
      "num_input_tokens_seen": 228851712,
      "step": 194
    },
    {
      "epoch": 0.023003421021587825,
      "grad_norm": 3.90625,
      "learning_rate": 3.687943262411348e-05,
      "loss": 3.0225,
      "num_input_tokens_seen": 230031360,
      "step": 195
    },
    {
      "epoch": 0.023121387283236993,
      "grad_norm": 4.96875,
      "learning_rate": 3.7068557919621755e-05,
      "loss": 3.0222,
      "num_input_tokens_seen": 231211008,
      "step": 196
    },
    {
      "epoch": 0.02323935354488616,
      "grad_norm": 3.84375,
      "learning_rate": 3.7257683215130025e-05,
      "loss": 2.9755,
      "num_input_tokens_seen": 232390656,
      "step": 197
    },
    {
      "epoch": 0.02335731980653533,
      "grad_norm": 5.0,
      "learning_rate": 3.74468085106383e-05,
      "loss": 3.0163,
      "num_input_tokens_seen": 233570304,
      "step": 198
    },
    {
      "epoch": 0.023475286068184498,
      "grad_norm": 4.53125,
      "learning_rate": 3.763593380614658e-05,
      "loss": 2.9553,
      "num_input_tokens_seen": 234749952,
      "step": 199
    },
    {
      "epoch": 0.023593252329833666,
      "grad_norm": 4.6875,
      "learning_rate": 3.782505910165485e-05,
      "loss": 3.015,
      "num_input_tokens_seen": 235929600,
      "step": 200
    },
    {
      "epoch": 0.023593252329833666,
      "eval_wikipedia_loss": 2.9615590572357178,
      "eval_wikipedia_runtime": 172.3085,
      "eval_wikipedia_samples_per_second": 4.074,
      "eval_wikipedia_steps_per_second": 0.174,
      "num_input_tokens_seen": 235929600,
      "step": 200
    },
    {
      "epoch": 0.023593252329833666,
      "eval_toxicity_loss": 4.73836088180542,
      "eval_toxicity_runtime": 0.999,
      "eval_toxicity_samples_per_second": 2.002,
      "eval_toxicity_steps_per_second": 1.001,
      "num_input_tokens_seen": 235929600,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 8477,
  "num_input_tokens_seen": 235929600,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.865862240160973e+18,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}