|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9998993609419817,
  "eval_steps": 150,
  "global_step": 2484,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0040255623207366776,
      "grad_norm": 38.58929443359375,
      "learning_rate": 4.000000000000001e-06,
      "loss": 9.0093,
      "num_input_tokens_seen": 327680,
      "step": 5
    },
    {
      "epoch": 0.008051124641473355,
      "grad_norm": 16.55693817138672,
      "learning_rate": 8.000000000000001e-06,
      "loss": 8.4074,
      "num_input_tokens_seen": 655360,
      "step": 10
    },
    {
      "epoch": 0.012076686962210034,
      "grad_norm": 10.222895622253418,
      "learning_rate": 1.2e-05,
      "loss": 7.9014,
      "num_input_tokens_seen": 983040,
      "step": 15
    },
    {
      "epoch": 0.01610224928294671,
      "grad_norm": 8.999139785766602,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 7.6741,
      "num_input_tokens_seen": 1310720,
      "step": 20
    },
    {
      "epoch": 0.02012781160368339,
      "grad_norm": 8.544699668884277,
      "learning_rate": 2e-05,
      "loss": 7.3022,
      "num_input_tokens_seen": 1638400,
      "step": 25
    },
    {
      "epoch": 0.024153373924420067,
      "grad_norm": 7.598087310791016,
      "learning_rate": 2.4e-05,
      "loss": 6.7376,
      "num_input_tokens_seen": 1966080,
      "step": 30
    },
    {
      "epoch": 0.028178936245156744,
      "grad_norm": 5.1022233963012695,
      "learning_rate": 2.8e-05,
      "loss": 6.2707,
      "num_input_tokens_seen": 2293760,
      "step": 35
    },
    {
      "epoch": 0.03220449856589342,
      "grad_norm": 3.5978548526763916,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 5.9112,
      "num_input_tokens_seen": 2621440,
      "step": 40
    },
    {
      "epoch": 0.036230060886630104,
      "grad_norm": 2.6466448307037354,
      "learning_rate": 3.6e-05,
      "loss": 5.6431,
      "num_input_tokens_seen": 2949120,
      "step": 45
    },
    {
      "epoch": 0.04025562320736678,
      "grad_norm": 1.9538609981536865,
      "learning_rate": 4e-05,
      "loss": 5.4321,
      "num_input_tokens_seen": 3276800,
      "step": 50
    },
    {
      "epoch": 0.04428118552810346,
      "grad_norm": 1.7290977239608765,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 5.2876,
      "num_input_tokens_seen": 3604480,
      "step": 55
    },
    {
      "epoch": 0.048306747848840134,
      "grad_norm": 1.647463321685791,
      "learning_rate": 4.8e-05,
      "loss": 5.1749,
      "num_input_tokens_seen": 3932160,
      "step": 60
    },
    {
      "epoch": 0.05233231016957681,
      "grad_norm": 1.367295742034912,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 5.0826,
      "num_input_tokens_seen": 4259840,
      "step": 65
    },
    {
      "epoch": 0.05635787249031349,
      "grad_norm": 1.4710743427276611,
      "learning_rate": 5.6e-05,
      "loss": 5.0053,
      "num_input_tokens_seen": 4587520,
      "step": 70
    },
    {
      "epoch": 0.06038343481105017,
      "grad_norm": 1.501865029335022,
      "learning_rate": 6.000000000000001e-05,
      "loss": 4.8943,
      "num_input_tokens_seen": 4915200,
      "step": 75
    },
    {
      "epoch": 0.06440899713178684,
      "grad_norm": 1.3167399168014526,
      "learning_rate": 6.400000000000001e-05,
      "loss": 4.9062,
      "num_input_tokens_seen": 5242880,
      "step": 80
    },
    {
      "epoch": 0.06843455945252352,
      "grad_norm": 1.6458464860916138,
      "learning_rate": 6.8e-05,
      "loss": 4.8432,
      "num_input_tokens_seen": 5570560,
      "step": 85
    },
    {
      "epoch": 0.07246012177326021,
      "grad_norm": 1.4131243228912354,
      "learning_rate": 7.2e-05,
      "loss": 4.8507,
      "num_input_tokens_seen": 5898240,
      "step": 90
    },
    {
      "epoch": 0.07648568409399688,
      "grad_norm": 1.6352394819259644,
      "learning_rate": 7.6e-05,
      "loss": 4.8214,
      "num_input_tokens_seen": 6225920,
      "step": 95
    },
    {
      "epoch": 0.08051124641473356,
      "grad_norm": 1.3868008852005005,
      "learning_rate": 8e-05,
      "loss": 4.7863,
      "num_input_tokens_seen": 6553600,
      "step": 100
    },
    {
      "epoch": 0.08453680873547023,
      "grad_norm": 1.5053304433822632,
      "learning_rate": 7.807200583588265e-05,
      "loss": 4.7478,
      "num_input_tokens_seen": 6881280,
      "step": 105
    },
    {
      "epoch": 0.08856237105620691,
      "grad_norm": 1.6993811130523682,
      "learning_rate": 7.627700713964738e-05,
      "loss": 4.7303,
      "num_input_tokens_seen": 7208960,
      "step": 110
    },
    {
      "epoch": 0.0925879333769436,
      "grad_norm": 1.4282785654067993,
      "learning_rate": 7.460038465922511e-05,
      "loss": 4.7332,
      "num_input_tokens_seen": 7536640,
      "step": 115
    },
    {
      "epoch": 0.09661349569768027,
      "grad_norm": 1.6601049900054932,
      "learning_rate": 7.302967433402216e-05,
      "loss": 4.7639,
      "num_input_tokens_seen": 7864320,
      "step": 120
    },
    {
      "epoch": 0.10063905801841695,
      "grad_norm": 1.4113882780075073,
      "learning_rate": 7.155417527999328e-05,
      "loss": 4.7138,
      "num_input_tokens_seen": 8192000,
      "step": 125
    },
    {
      "epoch": 0.10466462033915362,
      "grad_norm": 1.5578097105026245,
      "learning_rate": 7.016464154456235e-05,
      "loss": 4.7051,
      "num_input_tokens_seen": 8519680,
      "step": 130
    },
    {
      "epoch": 0.1086901826598903,
      "grad_norm": 1.4274314641952515,
      "learning_rate": 6.885303726590964e-05,
      "loss": 4.736,
      "num_input_tokens_seen": 8847360,
      "step": 135
    },
    {
      "epoch": 0.11271574498062698,
      "grad_norm": 1.354647159576416,
      "learning_rate": 6.761234037828134e-05,
      "loss": 4.6927,
      "num_input_tokens_seen": 9175040,
      "step": 140
    },
    {
      "epoch": 0.11674130730136366,
      "grad_norm": 1.402042031288147,
      "learning_rate": 6.643638388299198e-05,
      "loss": 4.703,
      "num_input_tokens_seen": 9502720,
      "step": 145
    },
    {
      "epoch": 0.12076686962210034,
      "grad_norm": 1.4121787548065186,
      "learning_rate": 6.531972647421809e-05,
      "loss": 4.6583,
      "num_input_tokens_seen": 9830400,
      "step": 150
    },
    {
      "epoch": 0.12076686962210034,
      "eval_accuracy": 0.3405695459290188,
      "eval_loss": 4.505166053771973,
      "eval_runtime": 6.5101,
      "eval_samples_per_second": 46.082,
      "eval_steps_per_second": 5.837,
      "num_input_tokens_seen": 9830400,
      "step": 150
    },
    {
      "epoch": 0.12479243194283701,
      "grad_norm": 1.5172020196914673,
      "learning_rate": 6.425754631219992e-05,
      "loss": 4.6715,
      "num_input_tokens_seen": 10158080,
      "step": 155
    },
    {
      "epoch": 0.12881799426357368,
      "grad_norm": 1.4244070053100586,
      "learning_rate": 6.324555320336759e-05,
      "loss": 4.6351,
      "num_input_tokens_seen": 10485760,
      "step": 160
    },
    {
      "epoch": 0.13284355658431038,
      "grad_norm": 1.352230191230774,
      "learning_rate": 6.227991553292185e-05,
      "loss": 4.6386,
      "num_input_tokens_seen": 10813440,
      "step": 165
    },
    {
      "epoch": 0.13686911890504705,
      "grad_norm": 1.3570430278778076,
      "learning_rate": 6.135719910778964e-05,
      "loss": 4.6563,
      "num_input_tokens_seen": 11141120,
      "step": 170
    },
    {
      "epoch": 0.14089468122578372,
      "grad_norm": 1.3318084478378296,
      "learning_rate": 6.047431568147636e-05,
      "loss": 4.6472,
      "num_input_tokens_seen": 11468800,
      "step": 175
    },
    {
      "epoch": 0.14492024354652042,
      "grad_norm": 1.4813034534454346,
      "learning_rate": 5.96284793999944e-05,
      "loss": 4.6493,
      "num_input_tokens_seen": 11796480,
      "step": 180
    },
    {
      "epoch": 0.1489458058672571,
      "grad_norm": 1.367981195449829,
      "learning_rate": 5.881716976750463e-05,
      "loss": 4.5858,
      "num_input_tokens_seen": 12124160,
      "step": 185
    },
    {
      "epoch": 0.15297136818799376,
      "grad_norm": 1.3320348262786865,
      "learning_rate": 5.803810000880094e-05,
      "loss": 4.5985,
      "num_input_tokens_seen": 12451840,
      "step": 190
    },
    {
      "epoch": 0.15699693050873043,
      "grad_norm": 1.233577847480774,
      "learning_rate": 5.7289189923154636e-05,
      "loss": 4.6574,
      "num_input_tokens_seen": 12779520,
      "step": 195
    },
    {
      "epoch": 0.16102249282946712,
      "grad_norm": 1.2315726280212402,
      "learning_rate": 5.6568542494923805e-05,
      "loss": 4.6412,
      "num_input_tokens_seen": 13107200,
      "step": 200
    },
    {
      "epoch": 0.1650480551502038,
      "grad_norm": 1.3596357107162476,
      "learning_rate": 5.5874423661566265e-05,
      "loss": 4.6032,
      "num_input_tokens_seen": 13434880,
      "step": 205
    },
    {
      "epoch": 0.16907361747094046,
      "grad_norm": 1.3829398155212402,
      "learning_rate": 5.5205244747388335e-05,
      "loss": 4.5954,
      "num_input_tokens_seen": 13762560,
      "step": 210
    },
    {
      "epoch": 0.17309917979167716,
      "grad_norm": 1.4083398580551147,
      "learning_rate": 5.455954715763789e-05,
      "loss": 4.5888,
      "num_input_tokens_seen": 14090240,
      "step": 215
    },
    {
      "epoch": 0.17712474211241383,
      "grad_norm": 1.467612862586975,
      "learning_rate": 5.393598899705937e-05,
      "loss": 4.6193,
      "num_input_tokens_seen": 14417920,
      "step": 220
    },
    {
      "epoch": 0.1811503044331505,
      "grad_norm": 1.2575774192810059,
      "learning_rate": 5.333333333333333e-05,
      "loss": 4.5579,
      "num_input_tokens_seen": 14745600,
      "step": 225
    },
    {
      "epoch": 0.1851758667538872,
      "grad_norm": 1.4239579439163208,
      "learning_rate": 5.2750437871662975e-05,
      "loss": 4.5607,
      "num_input_tokens_seen": 15073280,
      "step": 230
    },
    {
      "epoch": 0.18920142907462387,
      "grad_norm": 1.2363418340682983,
      "learning_rate": 5.218624584427538e-05,
      "loss": 4.5617,
      "num_input_tokens_seen": 15400960,
      "step": 235
    },
    {
      "epoch": 0.19322699139536054,
      "grad_norm": 1.242336392402649,
      "learning_rate": 5.163977794943223e-05,
      "loss": 4.5878,
      "num_input_tokens_seen": 15728640,
      "step": 240
    },
    {
      "epoch": 0.1972525537160972,
      "grad_norm": 1.2200363874435425,
      "learning_rate": 5.11101251999952e-05,
      "loss": 4.5518,
      "num_input_tokens_seen": 16056320,
      "step": 245
    },
    {
      "epoch": 0.2012781160368339,
      "grad_norm": 1.1616241931915283,
      "learning_rate": 5.0596442562694074e-05,
      "loss": 4.5944,
      "num_input_tokens_seen": 16384000,
      "step": 250
    },
    {
      "epoch": 0.20530367835757057,
      "grad_norm": 1.3502942323684692,
      "learning_rate": 5.009794328681197e-05,
      "loss": 4.5356,
      "num_input_tokens_seen": 16711680,
      "step": 255
    },
    {
      "epoch": 0.20932924067830724,
      "grad_norm": 1.1606285572052002,
      "learning_rate": 4.961389383568338e-05,
      "loss": 4.5341,
      "num_input_tokens_seen": 17039360,
      "step": 260
    },
    {
      "epoch": 0.21335480299904394,
      "grad_norm": 1.4353660345077515,
      "learning_rate": 4.9143609346716104e-05,
      "loss": 4.5766,
      "num_input_tokens_seen": 17367040,
      "step": 265
    },
    {
      "epoch": 0.2173803653197806,
      "grad_norm": 1.2260538339614868,
      "learning_rate": 4.8686449556014764e-05,
      "loss": 4.5123,
      "num_input_tokens_seen": 17694720,
      "step": 270
    },
    {
      "epoch": 0.22140592764051728,
      "grad_norm": 1.2337218523025513,
      "learning_rate": 4.8241815132442184e-05,
      "loss": 4.4989,
      "num_input_tokens_seen": 18022400,
      "step": 275
    },
    {
      "epoch": 0.22543148996125395,
      "grad_norm": 1.2576812505722046,
      "learning_rate": 4.7809144373375745e-05,
      "loss": 4.5362,
      "num_input_tokens_seen": 18350080,
      "step": 280
    },
    {
      "epoch": 0.22945705228199065,
      "grad_norm": 1.3166711330413818,
      "learning_rate": 4.738791022072739e-05,
      "loss": 4.5318,
      "num_input_tokens_seen": 18677760,
      "step": 285
    },
    {
      "epoch": 0.23348261460272732,
      "grad_norm": 1.353780746459961,
      "learning_rate": 4.6977617561176284e-05,
      "loss": 4.5475,
      "num_input_tokens_seen": 19005440,
      "step": 290
    },
    {
      "epoch": 0.237508176923464,
      "grad_norm": 1.2797174453735352,
      "learning_rate": 4.657780077916657e-05,
      "loss": 4.4992,
      "num_input_tokens_seen": 19333120,
      "step": 295
    },
    {
      "epoch": 0.24153373924420068,
      "grad_norm": 1.433968424797058,
      "learning_rate": 4.618802153517007e-05,
      "loss": 4.5365,
      "num_input_tokens_seen": 19660800,
      "step": 300
    },
    {
      "epoch": 0.24153373924420068,
      "eval_accuracy": 0.35246274229682784,
      "eval_loss": 4.371213436126709,
      "eval_runtime": 6.282,
      "eval_samples_per_second": 47.755,
      "eval_steps_per_second": 6.049,
      "num_input_tokens_seen": 19660800,
      "step": 300
    },
    {
      "epoch": 0.24555930156493735,
      "grad_norm": 1.3408712148666382,
      "learning_rate": 4.580786674510946e-05,
      "loss": 4.4919,
      "num_input_tokens_seen": 19988480,
      "step": 305
    },
    {
      "epoch": 0.24958486388567402,
      "grad_norm": 1.3724451065063477,
      "learning_rate": 4.543694673976519e-05,
      "loss": 4.498,
      "num_input_tokens_seen": 20316160,
      "step": 310
    },
    {
      "epoch": 0.2536104262064107,
      "grad_norm": 1.3110082149505615,
      "learning_rate": 4.507489358552088e-05,
      "loss": 4.4623,
      "num_input_tokens_seen": 20643840,
      "step": 315
    },
    {
      "epoch": 0.25763598852714736,
      "grad_norm": 1.2995046377182007,
      "learning_rate": 4.47213595499958e-05,
      "loss": 4.5249,
      "num_input_tokens_seen": 20971520,
      "step": 320
    },
    {
      "epoch": 0.26166155084788406,
      "grad_norm": 1.2765010595321655,
      "learning_rate": 4.4376015698018335e-05,
      "loss": 4.5525,
      "num_input_tokens_seen": 21299200,
      "step": 325
    },
    {
      "epoch": 0.26568711316862076,
      "grad_norm": 1.268876552581787,
      "learning_rate": 4.403855060505443e-05,
      "loss": 4.4876,
      "num_input_tokens_seen": 21626880,
      "step": 330
    },
    {
      "epoch": 0.2697126754893574,
      "grad_norm": 1.4470906257629395,
      "learning_rate": 4.3708669176652244e-05,
      "loss": 4.4901,
      "num_input_tokens_seen": 21954560,
      "step": 335
    },
    {
      "epoch": 0.2737382378100941,
      "grad_norm": 1.3335119485855103,
      "learning_rate": 4.3386091563731235e-05,
      "loss": 4.4488,
      "num_input_tokens_seen": 22282240,
      "step": 340
    },
    {
      "epoch": 0.2777638001308308,
      "grad_norm": 1.4562543630599976,
      "learning_rate": 4.3070552164653247e-05,
      "loss": 4.4705,
      "num_input_tokens_seen": 22609920,
      "step": 345
    },
    {
      "epoch": 0.28178936245156744,
      "grad_norm": 1.4459863901138306,
      "learning_rate": 4.276179870598791e-05,
      "loss": 4.4942,
      "num_input_tokens_seen": 22937600,
      "step": 350
    },
    {
      "epoch": 0.28581492477230414,
      "grad_norm": 1.193866491317749,
      "learning_rate": 4.2459591394742024e-05,
      "loss": 4.5002,
      "num_input_tokens_seen": 23265280,
      "step": 355
    },
    {
      "epoch": 0.28984048709304083,
      "grad_norm": 1.4731342792510986,
      "learning_rate": 4.2163702135578394e-05,
      "loss": 4.4794,
      "num_input_tokens_seen": 23592960,
      "step": 360
    },
    {
      "epoch": 0.2938660494137775,
      "grad_norm": 1.1683562994003296,
      "learning_rate": 4.1873913807217094e-05,
      "loss": 4.4578,
      "num_input_tokens_seen": 23920640,
      "step": 365
    },
    {
      "epoch": 0.2978916117345142,
      "grad_norm": 1.2372169494628906,
      "learning_rate": 4.159001959280291e-05,
      "loss": 4.5118,
      "num_input_tokens_seen": 24248320,
      "step": 370
    },
    {
      "epoch": 0.30191717405525087,
      "grad_norm": 1.4430092573165894,
      "learning_rate": 4.131182235954578e-05,
      "loss": 4.4861,
      "num_input_tokens_seen": 24576000,
      "step": 375
    },
    {
      "epoch": 0.3059427363759875,
      "grad_norm": 1.3162908554077148,
      "learning_rate": 4.103913408340617e-05,
      "loss": 4.4968,
      "num_input_tokens_seen": 24903680,
      "step": 380
    },
    {
      "epoch": 0.3099682986967242,
      "grad_norm": 1.238364815711975,
      "learning_rate": 4.077177531501004e-05,
      "loss": 4.4465,
      "num_input_tokens_seen": 25231360,
      "step": 385
    },
    {
      "epoch": 0.31399386101746085,
      "grad_norm": 1.2819691896438599,
      "learning_rate": 4.050957468334667e-05,
      "loss": 4.4656,
      "num_input_tokens_seen": 25559040,
      "step": 390
    },
    {
      "epoch": 0.31801942333819755,
      "grad_norm": 1.2802215814590454,
      "learning_rate": 4.025236843413021e-05,
      "loss": 4.4934,
      "num_input_tokens_seen": 25886720,
      "step": 395
    },
    {
      "epoch": 0.32204498565893425,
      "grad_norm": 1.232289433479309,
      "learning_rate": 4e-05,
      "loss": 4.4965,
      "num_input_tokens_seen": 26214400,
      "step": 400
    },
    {
      "epoch": 0.3260705479796709,
      "grad_norm": 1.3022078275680542,
      "learning_rate": 3.975231959999627e-05,
      "loss": 4.4332,
      "num_input_tokens_seen": 26542080,
      "step": 405
    },
    {
      "epoch": 0.3300961103004076,
      "grad_norm": 1.1387150287628174,
      "learning_rate": 3.950918386598359e-05,
      "loss": 4.4345,
      "num_input_tokens_seen": 26869760,
      "step": 410
    },
    {
      "epoch": 0.3341216726211443,
      "grad_norm": 1.1623753309249878,
      "learning_rate": 3.9270455493905284e-05,
      "loss": 4.4572,
      "num_input_tokens_seen": 27197440,
      "step": 415
    },
    {
      "epoch": 0.3381472349418809,
      "grad_norm": 1.1756770610809326,
      "learning_rate": 3.9036002917941324e-05,
      "loss": 4.4603,
      "num_input_tokens_seen": 27525120,
      "step": 420
    },
    {
      "epoch": 0.3421727972626176,
      "grad_norm": 1.243285059928894,
      "learning_rate": 3.8805700005813275e-05,
      "loss": 4.4576,
      "num_input_tokens_seen": 27852800,
      "step": 425
    },
    {
      "epoch": 0.3461983595833543,
      "grad_norm": 1.4163405895233154,
      "learning_rate": 3.8579425773632976e-05,
      "loss": 4.4693,
      "num_input_tokens_seen": 28180480,
      "step": 430
    },
    {
      "epoch": 0.35022392190409096,
      "grad_norm": 1.2970571517944336,
      "learning_rate": 3.835706411883074e-05,
      "loss": 4.4731,
      "num_input_tokens_seen": 28508160,
      "step": 435
    },
    {
      "epoch": 0.35424948422482766,
      "grad_norm": 1.2691751718521118,
      "learning_rate": 3.813850356982369e-05,
      "loss": 4.4548,
      "num_input_tokens_seen": 28835840,
      "step": 440
    },
    {
      "epoch": 0.35827504654556436,
      "grad_norm": 1.2060904502868652,
      "learning_rate": 3.792363705119819e-05,
      "loss": 4.5089,
      "num_input_tokens_seen": 29163520,
      "step": 445
    },
    {
      "epoch": 0.362300608866301,
      "grad_norm": 1.1918907165527344,
      "learning_rate": 3.771236166328254e-05,
      "loss": 4.4621,
      "num_input_tokens_seen": 29491200,
      "step": 450
    },
    {
      "epoch": 0.362300608866301,
      "eval_accuracy": 0.3575167675445771,
      "eval_loss": 4.2810378074646,
      "eval_runtime": 6.4142,
      "eval_samples_per_second": 46.771,
      "eval_steps_per_second": 5.924,
      "num_input_tokens_seen": 29491200,
      "step": 450
    },
    {
      "epoch": 0.3663261711870377,
      "grad_norm": 1.257439136505127,
      "learning_rate": 3.750457847507964e-05,
      "loss": 4.4083,
      "num_input_tokens_seen": 29818880,
      "step": 455
    },
    {
      "epoch": 0.3703517335077744,
      "grad_norm": 1.2655524015426636,
      "learning_rate": 3.7300192329612555e-05,
      "loss": 4.428,
      "num_input_tokens_seen": 30146560,
      "step": 460
    },
    {
      "epoch": 0.37437729582851104,
      "grad_norm": 1.3658162355422974,
      "learning_rate": 3.709911166081346e-05,
      "loss": 4.4843,
      "num_input_tokens_seen": 30474240,
      "step": 465
    },
    {
      "epoch": 0.37840285814924773,
      "grad_norm": 1.32870352268219,
      "learning_rate": 3.6901248321155405e-05,
      "loss": 4.449,
      "num_input_tokens_seen": 30801920,
      "step": 470
    },
    {
      "epoch": 0.3824284204699844,
      "grad_norm": 1.3358460664749146,
      "learning_rate": 3.670651741928988e-05,
      "loss": 4.435,
      "num_input_tokens_seen": 31129600,
      "step": 475
    },
    {
      "epoch": 0.3864539827907211,
      "grad_norm": 1.1790028810501099,
      "learning_rate": 3.651483716701108e-05,
      "loss": 4.426,
      "num_input_tokens_seen": 31457280,
      "step": 480
    },
    {
      "epoch": 0.39047954511145777,
      "grad_norm": 1.2213348150253296,
      "learning_rate": 3.6326128734919986e-05,
      "loss": 4.4451,
      "num_input_tokens_seen": 31784960,
      "step": 485
    },
    {
      "epoch": 0.3945051074321944,
      "grad_norm": 1.4075285196304321,
      "learning_rate": 3.6140316116210055e-05,
      "loss": 4.4582,
      "num_input_tokens_seen": 32112640,
      "step": 490
    },
    {
      "epoch": 0.3985306697529311,
      "grad_norm": 1.3073724508285522,
      "learning_rate": 3.595732599803958e-05,
      "loss": 4.4541,
      "num_input_tokens_seen": 32440320,
      "step": 495
    },
    {
      "epoch": 0.4025562320736678,
      "grad_norm": 1.3178701400756836,
      "learning_rate": 3.577708763999664e-05,
      "loss": 4.4466,
      "num_input_tokens_seen": 32768000,
      "step": 500
    },
    {
      "epoch": 0.40658179439440445,
      "grad_norm": 1.2765254974365234,
      "learning_rate": 3.5599532759198786e-05,
      "loss": 4.4159,
      "num_input_tokens_seen": 33095680,
      "step": 505
    },
    {
      "epoch": 0.41060735671514115,
      "grad_norm": 1.2380224466323853,
      "learning_rate": 3.542459542160382e-05,
      "loss": 4.4387,
      "num_input_tokens_seen": 33423360,
      "step": 510
    },
    {
      "epoch": 0.41463291903587784,
      "grad_norm": 1.4157094955444336,
      "learning_rate": 3.525221193913854e-05,
      "loss": 4.4529,
      "num_input_tokens_seen": 33751040,
      "step": 515
    },
    {
      "epoch": 0.4186584813566145,
      "grad_norm": 1.3274729251861572,
      "learning_rate": 3.5082320772281174e-05,
      "loss": 4.3776,
      "num_input_tokens_seen": 34078720,
      "step": 520
    },
    {
      "epoch": 0.4226840436773512,
      "grad_norm": 1.2400481700897217,
      "learning_rate": 3.4914862437758787e-05,
      "loss": 4.4103,
      "num_input_tokens_seen": 34406400,
      "step": 525
    },
    {
      "epoch": 0.4267096059980879,
      "grad_norm": 1.2797058820724487,
      "learning_rate": 3.4749779421045556e-05,
      "loss": 4.4005,
      "num_input_tokens_seen": 34734080,
      "step": 530
    },
    {
      "epoch": 0.4307351683188245,
      "grad_norm": 1.4183690547943115,
      "learning_rate": 3.458701609336936e-05,
      "loss": 4.4181,
      "num_input_tokens_seen": 35061760,
      "step": 535
    },
    {
      "epoch": 0.4347607306395612,
      "grad_norm": 1.2432236671447754,
      "learning_rate": 3.442651863295482e-05,
      "loss": 4.4037,
      "num_input_tokens_seen": 35389440,
      "step": 540
    },
    {
      "epoch": 0.4387862929602979,
      "grad_norm": 1.19386887550354,
      "learning_rate": 3.426823495024955e-05,
      "loss": 4.4211,
      "num_input_tokens_seen": 35717120,
      "step": 545
    },
    {
      "epoch": 0.44281185528103456,
      "grad_norm": 1.24849534034729,
      "learning_rate": 3.4112114616897666e-05,
      "loss": 4.3953,
      "num_input_tokens_seen": 36044800,
      "step": 550
    },
    {
      "epoch": 0.44683741760177126,
      "grad_norm": 1.230302333831787,
      "learning_rate": 3.395810879824072e-05,
      "loss": 4.4187,
      "num_input_tokens_seen": 36372480,
      "step": 555
    },
    {
      "epoch": 0.4508629799225079,
      "grad_norm": 1.293820858001709,
      "learning_rate": 3.380617018914067e-05,
      "loss": 4.3944,
      "num_input_tokens_seen": 36700160,
      "step": 560
    },
    {
      "epoch": 0.4548885422432446,
      "grad_norm": 1.3907512426376343,
      "learning_rate": 3.36562529529337e-05,
      "loss": 4.4285,
      "num_input_tokens_seen": 37027840,
      "step": 565
    },
    {
      "epoch": 0.4589141045639813,
      "grad_norm": 1.4436863660812378,
      "learning_rate": 3.350831266333564e-05,
      "loss": 4.4322,
      "num_input_tokens_seen": 37355520,
      "step": 570
    },
    {
      "epoch": 0.46293966688471794,
      "grad_norm": 1.3175897598266602,
      "learning_rate": 3.336230624913197e-05,
      "loss": 4.4196,
      "num_input_tokens_seen": 37683200,
      "step": 575
    },
    {
      "epoch": 0.46696522920545463,
      "grad_norm": 1.1326998472213745,
      "learning_rate": 3.321819194149599e-05,
      "loss": 4.4478,
      "num_input_tokens_seen": 38010880,
      "step": 580
    },
    {
      "epoch": 0.47099079152619133,
      "grad_norm": 1.3267349004745483,
      "learning_rate": 3.307592922378893e-05,
      "loss": 4.4068,
      "num_input_tokens_seen": 38338560,
      "step": 585
    },
    {
      "epoch": 0.475016353846928,
      "grad_norm": 1.2397940158843994,
      "learning_rate": 3.2935478783704735e-05,
      "loss": 4.3565,
      "num_input_tokens_seen": 38666240,
      "step": 590
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 1.290412187576294,
      "learning_rate": 3.279680246763151e-05,
      "loss": 4.3957,
      "num_input_tokens_seen": 38993920,
      "step": 595
    },
    {
      "epoch": 0.48306747848840137,
      "grad_norm": 1.3627963066101074,
      "learning_rate": 3.2659863237109046e-05,
      "loss": 4.4116,
      "num_input_tokens_seen": 39321600,
      "step": 600
    },
    {
      "epoch": 0.48306747848840137,
      "eval_accuracy": 0.36151503242287286,
      "eval_loss": 4.246631622314453,
      "eval_runtime": 6.3292,
      "eval_samples_per_second": 47.4,
      "eval_steps_per_second": 6.004,
      "num_input_tokens_seen": 39321600,
      "step": 600
    },
    {
      "epoch": 0.487093040809138,
      "grad_norm": 1.2708163261413574,
      "learning_rate": 3.2524625127269674e-05,
      "loss": 4.3632,
      "num_input_tokens_seen": 39649280,
      "step": 605
    },
    {
      "epoch": 0.4911186031298747,
      "grad_norm": 1.1186234951019287,
      "learning_rate": 3.2391053207156646e-05,
      "loss": 4.3974,
      "num_input_tokens_seen": 39976960,
      "step": 610
    },
    {
      "epoch": 0.4951441654506114,
      "grad_norm": 1.2380205392837524,
      "learning_rate": 3.2259113541820474e-05,
      "loss": 4.3891,
      "num_input_tokens_seen": 40304640,
      "step": 615
    },
    {
      "epoch": 0.49916972777134805,
      "grad_norm": 1.220374584197998,
      "learning_rate": 3.212877315609996e-05,
      "loss": 4.3911,
      "num_input_tokens_seen": 40632320,
      "step": 620
    },
    {
      "epoch": 0.5031952900920847,
      "grad_norm": 1.273556113243103,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 4.3715,
      "num_input_tokens_seen": 40960000,
      "step": 625
    },
    {
      "epoch": 0.5072208524128214,
      "grad_norm": 1.2880346775054932,
      "learning_rate": 3.187276291558383e-05,
      "loss": 4.4068,
      "num_input_tokens_seen": 41287680,
      "step": 630
    },
    {
      "epoch": 0.5112464147335581,
      "grad_norm": 1.3127074241638184,
      "learning_rate": 3.1747031605301803e-05,
      "loss": 4.3636,
      "num_input_tokens_seen": 41615360,
      "step": 635
    },
    {
      "epoch": 0.5152719770542947,
      "grad_norm": 1.2499847412109375,
      "learning_rate": 3.1622776601683795e-05,
      "loss": 4.384,
      "num_input_tokens_seen": 41943040,
      "step": 640
    },
    {
      "epoch": 0.5192975393750314,
      "grad_norm": 1.28691828250885,
      "learning_rate": 3.1499969238326315e-05,
      "loss": 4.3927,
      "num_input_tokens_seen": 42270720,
      "step": 645
    },
    {
      "epoch": 0.5233231016957681,
      "grad_norm": 1.221285104751587,
      "learning_rate": 3.137858162210945e-05,
      "loss": 4.3894,
      "num_input_tokens_seen": 42598400,
      "step": 650
    },
    {
      "epoch": 0.5273486640165048,
      "grad_norm": 1.2766237258911133,
      "learning_rate": 3.125858660658254e-05,
      "loss": 4.3689,
      "num_input_tokens_seen": 42926080,
      "step": 655
    },
    {
      "epoch": 0.5313742263372415,
      "grad_norm": 1.3662595748901367,
      "learning_rate": 3.113995776646092e-05,
      "loss": 4.3842,
      "num_input_tokens_seen": 43253760,
      "step": 660
    },
    {
      "epoch": 0.5353997886579782,
      "grad_norm": 1.264195203781128,
      "learning_rate": 3.102266937317925e-05,
      "loss": 4.3665,
      "num_input_tokens_seen": 43581440,
      "step": 665
    },
    {
      "epoch": 0.5394253509787148,
      "grad_norm": 1.4342658519744873,
      "learning_rate": 3.0906696371450235e-05,
      "loss": 4.3987,
      "num_input_tokens_seen": 43909120,
      "step": 670
    },
    {
      "epoch": 0.5434509132994515,
      "grad_norm": 1.2556202411651611,
      "learning_rate": 3.0792014356780046e-05,
      "loss": 4.3837,
      "num_input_tokens_seen": 44236800,
      "step": 675
    },
    {
      "epoch": 0.5474764756201882,
      "grad_norm": 1.3614763021469116,
      "learning_rate": 3.067859955389482e-05,
      "loss": 4.339,
      "num_input_tokens_seen": 44564480,
      "step": 680
    },
    {
      "epoch": 0.5515020379409249,
      "grad_norm": 1.3664342164993286,
      "learning_rate": 3.0566428796034804e-05,
      "loss": 4.3543,
      "num_input_tokens_seen": 44892160,
      "step": 685
    },
    {
      "epoch": 0.5555276002616616,
      "grad_norm": 1.2612766027450562,
      "learning_rate": 3.045547950507524e-05,
      "loss": 4.3698,
      "num_input_tokens_seen": 45219840,
      "step": 690
    },
    {
      "epoch": 0.5595531625823982,
      "grad_norm": 1.3563127517700195,
      "learning_rate": 3.0345729672435268e-05,
      "loss": 4.3561,
      "num_input_tokens_seen": 45547520,
      "step": 695
    },
    {
      "epoch": 0.5635787249031349,
      "grad_norm": 1.15888249874115,
      "learning_rate": 3.023715784073818e-05,
      "loss": 4.4154,
      "num_input_tokens_seen": 45875200,
      "step": 700
    },
    {
      "epoch": 0.5676042872238716,
      "grad_norm": 1.2462992668151855,
      "learning_rate": 3.0129743086188382e-05,
      "loss": 4.4011,
      "num_input_tokens_seen": 46202880,
      "step": 705
    },
    {
      "epoch": 0.5716298495446083,
      "grad_norm": 1.2652802467346191,
      "learning_rate": 3.0023465001632065e-05,
      "loss": 4.4053,
      "num_input_tokens_seen": 46530560,
      "step": 710
    },
    {
      "epoch": 0.575655411865345,
      "grad_norm": 1.4011398553848267,
      "learning_rate": 2.991830368027063e-05,
      "loss": 4.3539,
      "num_input_tokens_seen": 46858240,
      "step": 715
    },
    {
      "epoch": 0.5796809741860817,
      "grad_norm": 1.248789668083191,
      "learning_rate": 2.98142396999972e-05,
      "loss": 4.3758,
      "num_input_tokens_seen": 47185920,
      "step": 720
    },
    {
      "epoch": 0.5837065365068183,
      "grad_norm": 1.218556523323059,
      "learning_rate": 2.97112541083283e-05,
      "loss": 4.3695,
      "num_input_tokens_seen": 47513600,
      "step": 725
    },
    {
      "epoch": 0.587732098827555,
      "grad_norm": 1.28030526638031,
      "learning_rate": 2.960932840790421e-05,
      "loss": 4.3676,
      "num_input_tokens_seen": 47841280,
      "step": 730
    },
    {
      "epoch": 0.5917576611482916,
      "grad_norm": 1.1834255456924438,
      "learning_rate": 2.95084445425327e-05,
      "loss": 4.3445,
      "num_input_tokens_seen": 48168960,
      "step": 735
    },
    {
      "epoch": 0.5957832234690283,
      "grad_norm": 1.324885606765747,
      "learning_rate": 2.9408584883752315e-05,
      "loss": 4.3495,
      "num_input_tokens_seen": 48496640,
      "step": 740
    },
    {
      "epoch": 0.599808785789765,
      "grad_norm": 1.3806536197662354,
      "learning_rate": 2.930973221789247e-05,
      "loss": 4.3369,
      "num_input_tokens_seen": 48824320,
      "step": 745
    },
    {
      "epoch": 0.6038343481105017,
      "grad_norm": 1.239935278892517,
      "learning_rate": 2.9211869733608858e-05,
      "loss": 4.3487,
      "num_input_tokens_seen": 49152000,
      "step": 750
    },
    {
      "epoch": 0.6038343481105017,
      "eval_accuracy": 0.3661367484834038,
      "eval_loss": 4.179458141326904,
      "eval_runtime": 6.3307,
      "eval_samples_per_second": 47.388,
      "eval_steps_per_second": 6.003,
      "num_input_tokens_seen": 49152000,
      "step": 750
    },
    {
      "epoch": 0.6078599104312383,
      "grad_norm": 1.294979214668274,
      "learning_rate": 2.911498100987356e-05,
      "loss": 4.3559,
      "num_input_tokens_seen": 49479680,
      "step": 755
    },
    {
      "epoch": 0.611885472751975,
      "grad_norm": 1.2415525913238525,
      "learning_rate": 2.901905000440047e-05,
      "loss": 4.3862,
      "num_input_tokens_seen": 49807360,
      "step": 760
    },
    {
      "epoch": 0.6159110350727117,
      "grad_norm": 1.2934507131576538,
      "learning_rate": 2.8924061042487487e-05,
      "loss": 4.3778,
      "num_input_tokens_seen": 50135040,
      "step": 765
    },
    {
      "epoch": 0.6199365973934484,
      "grad_norm": 1.3969470262527466,
      "learning_rate": 2.8829998806257887e-05,
      "loss": 4.3623,
      "num_input_tokens_seen": 50462720,
      "step": 770
    },
    {
      "epoch": 0.6239621597141851,
      "grad_norm": 1.3450722694396973,
      "learning_rate": 2.873684832428399e-05,
      "loss": 4.3819,
      "num_input_tokens_seen": 50790400,
      "step": 775
    },
    {
      "epoch": 0.6279877220349217,
      "grad_norm": 1.4024951457977295,
      "learning_rate": 2.8644594961577318e-05,
      "loss": 4.3577,
      "num_input_tokens_seen": 51118080,
      "step": 780
    },
    {
      "epoch": 0.6320132843556584,
      "grad_norm": 1.4493414163589478,
      "learning_rate": 2.855322440992999e-05,
      "loss": 4.3402,
      "num_input_tokens_seen": 51445760,
      "step": 785
    },
    {
      "epoch": 0.6360388466763951,
      "grad_norm": 1.12810480594635,
      "learning_rate": 2.8462722678592803e-05,
      "loss": 4.3095,
      "num_input_tokens_seen": 51773440,
      "step": 790
    },
    {
      "epoch": 0.6400644089971318,
      "grad_norm": 1.2812013626098633,
      "learning_rate": 2.837307608527635e-05,
      "loss": 4.3664,
      "num_input_tokens_seen": 52101120,
      "step": 795
    },
    {
      "epoch": 0.6440899713178685,
      "grad_norm": 1.269395112991333,
      "learning_rate": 2.8284271247461902e-05,
      "loss": 4.3699,
      "num_input_tokens_seen": 52428800,
      "step": 800
    },
    {
      "epoch": 0.6481155336386052,
      "grad_norm": 1.2164851427078247,
      "learning_rate": 2.819629507400966e-05,
      "loss": 4.3331,
      "num_input_tokens_seen": 52756480,
      "step": 805
    },
    {
      "epoch": 0.6521410959593418,
      "grad_norm": 1.249258279800415,
      "learning_rate": 2.8109134757052262e-05,
      "loss": 4.3384,
      "num_input_tokens_seen": 53084160,
      "step": 810
    },
    {
      "epoch": 0.6561666582800785,
      "grad_norm": 1.2432383298873901,
      "learning_rate": 2.8022777764162353e-05,
      "loss": 4.3628,
      "num_input_tokens_seen": 53411840,
      "step": 815
    },
    {
      "epoch": 0.6601922206008152,
      "grad_norm": 1.2008548974990845,
      "learning_rate": 2.7937211830783133e-05,
      "loss": 4.3751,
      "num_input_tokens_seen": 53739520,
      "step": 820
    },
    {
      "epoch": 0.6642177829215519,
      "grad_norm": 1.3261739015579224,
      "learning_rate": 2.785242495291166e-05,
      "loss": 4.3405,
      "num_input_tokens_seen": 54067200,
      "step": 825
    },
    {
      "epoch": 0.6682433452422886,
      "grad_norm": 1.3359473943710327,
      "learning_rate": 2.7768405380024934e-05,
      "loss": 4.3384,
      "num_input_tokens_seen": 54394880,
      "step": 830
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 1.3322173357009888,
      "learning_rate": 2.7685141608239315e-05,
      "loss": 4.3532,
      "num_input_tokens_seen": 54722560,
      "step": 835
    },
    {
      "epoch": 0.6762944698837619,
      "grad_norm": 1.2270052433013916,
      "learning_rate": 2.7602622373694168e-05,
      "loss": 4.3314,
      "num_input_tokens_seen": 55050240,
      "step": 840
    },
    {
      "epoch": 0.6803200322044985,
      "grad_norm": 1.2989898920059204,
      "learning_rate": 2.752083664615126e-05,
      "loss": 4.3486,
      "num_input_tokens_seen": 55377920,
      "step": 845
    },
    {
      "epoch": 0.6843455945252352,
      "grad_norm": 1.2281562089920044,
      "learning_rate": 2.7439773622801412e-05,
      "loss": 4.3269,
      "num_input_tokens_seen": 55705600,
      "step": 850
    },
    {
      "epoch": 0.6883711568459719,
      "grad_norm": 1.3172951936721802,
      "learning_rate": 2.7359422722270782e-05,
      "loss": 4.3333,
      "num_input_tokens_seen": 56033280,
      "step": 855
    },
    {
      "epoch": 0.6923967191667086,
      "grad_norm": 1.3179869651794434,
      "learning_rate": 2.7279773578818943e-05,
      "loss": 4.3657,
      "num_input_tokens_seen": 56360960,
      "step": 860
    },
    {
      "epoch": 0.6964222814874452,
      "grad_norm": 1.277402400970459,
      "learning_rate": 2.720081603672184e-05,
      "loss": 4.3581,
      "num_input_tokens_seen": 56688640,
      "step": 865
    },
    {
      "epoch": 0.7004478438081819,
      "grad_norm": 1.2060281038284302,
      "learning_rate": 2.712254014483242e-05,
      "loss": 4.372,
      "num_input_tokens_seen": 57016320,
      "step": 870
    },
    {
      "epoch": 0.7044734061289186,
      "grad_norm": 1.3300620317459106,
      "learning_rate": 2.704493615131253e-05,
      "loss": 4.3463,
      "num_input_tokens_seen": 57344000,
      "step": 875
    },
    {
      "epoch": 0.7084989684496553,
      "grad_norm": 1.2079811096191406,
      "learning_rate": 2.6967994498529685e-05,
      "loss": 4.3365,
      "num_input_tokens_seen": 57671680,
      "step": 880
    },
    {
      "epoch": 0.712524530770392,
      "grad_norm": 1.2995448112487793,
      "learning_rate": 2.6891705818112575e-05,
      "loss": 4.3474,
      "num_input_tokens_seen": 57999360,
      "step": 885
    },
    {
      "epoch": 0.7165500930911287,
      "grad_norm": 1.3099631071090698,
      "learning_rate": 2.681606092615964e-05,
      "loss": 4.3505,
      "num_input_tokens_seen": 58327040,
      "step": 890
    },
    {
      "epoch": 0.7205756554118653,
      "grad_norm": 1.1766512393951416,
      "learning_rate": 2.6741050818595026e-05,
      "loss": 4.3314,
      "num_input_tokens_seen": 58654720,
      "step": 895
    },
    {
      "epoch": 0.724601217732602,
      "grad_norm": 1.2225860357284546,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 4.338,
      "num_input_tokens_seen": 58982400,
      "step": 900
    },
    {
      "epoch": 0.724601217732602,
      "eval_accuracy": 0.3663363114127162,
      "eval_loss": 4.187371730804443,
      "eval_runtime": 6.3051,
      "eval_samples_per_second": 47.581,
      "eval_steps_per_second": 6.027,
      "num_input_tokens_seen": 58982400,
      "step": 900
    },
    {
      "epoch": 0.7286267800533387,
      "grad_norm": 1.1610972881317139,
      "learning_rate": 2.6592899812581347e-05,
      "loss": 4.3445,
      "num_input_tokens_seen": 59310080,
      "step": 905
    },
    {
      "epoch": 0.7326523423740754,
      "grad_norm": 1.3581031560897827,
      "learning_rate": 2.6519741765271837e-05,
      "loss": 4.3591,
      "num_input_tokens_seen": 59637760,
      "step": 910
    },
    {
      "epoch": 0.7366779046948121,
      "grad_norm": 1.255743145942688,
      "learning_rate": 2.6447184196291452e-05,
      "loss": 4.342,
      "num_input_tokens_seen": 59965440,
      "step": 915
    },
    {
      "epoch": 0.7407034670155488,
      "grad_norm": 1.4238628149032593,
      "learning_rate": 2.6375218935831487e-05,
      "loss": 4.3019,
      "num_input_tokens_seen": 60293120,
      "step": 920
    },
    {
      "epoch": 0.7447290293362854,
      "grad_norm": 1.2544608116149902,
      "learning_rate": 2.6303837968857168e-05,
      "loss": 4.3089,
      "num_input_tokens_seen": 60620800,
      "step": 925
    },
    {
      "epoch": 0.7487545916570221,
      "grad_norm": 1.1759463548660278,
      "learning_rate": 2.623303343135812e-05,
      "loss": 4.3121,
      "num_input_tokens_seen": 60948480,
      "step": 930
    },
    {
      "epoch": 0.7527801539777588,
      "grad_norm": 1.3340330123901367,
      "learning_rate": 2.616279760670915e-05,
      "loss": 4.3074,
      "num_input_tokens_seen": 61276160,
      "step": 935
    },
    {
      "epoch": 0.7568057162984955,
      "grad_norm": 1.2744321823120117,
      "learning_rate": 2.609312292213769e-05,
      "loss": 4.3436,
      "num_input_tokens_seen": 61603840,
      "step": 940
    },
    {
      "epoch": 0.7608312786192322,
      "grad_norm": 1.1901994943618774,
      "learning_rate": 2.602400194529422e-05,
      "loss": 4.2908,
      "num_input_tokens_seen": 61931520,
      "step": 945
    },
    {
      "epoch": 0.7648568409399688,
      "grad_norm": 1.202046513557434,
      "learning_rate": 2.595542738092201e-05,
      "loss": 4.3347,
      "num_input_tokens_seen": 62259200,
      "step": 950
    },
    {
      "epoch": 0.7688824032607054,
      "grad_norm": 1.2437232732772827,
      "learning_rate": 2.5887392067622943e-05,
      "loss": 4.3434,
      "num_input_tokens_seen": 62586880,
      "step": 955
    },
    {
      "epoch": 0.7729079655814421,
      "grad_norm": 1.2383480072021484,
      "learning_rate": 2.5819888974716114e-05,
      "loss": 4.3144,
      "num_input_tokens_seen": 62914560,
      "step": 960
    },
    {
      "epoch": 0.7769335279021788,
      "grad_norm": 1.2294889688491821,
      "learning_rate": 2.575291119918602e-05,
      "loss": 4.2971,
      "num_input_tokens_seen": 63242240,
      "step": 965
    },
    {
      "epoch": 0.7809590902229155,
      "grad_norm": 1.235829472541809,
      "learning_rate": 2.5686451962717426e-05,
      "loss": 4.3006,
      "num_input_tokens_seen": 63569920,
      "step": 970
    },
    {
      "epoch": 0.7849846525436522,
      "grad_norm": 1.2483339309692383,
      "learning_rate": 2.5620504608813944e-05,
      "loss": 4.2846,
      "num_input_tokens_seen": 63897600,
      "step": 975
    },
    {
      "epoch": 0.7890102148643888,
      "grad_norm": 1.2668383121490479,
      "learning_rate": 2.55550625999976e-05,
      "loss": 4.3256,
      "num_input_tokens_seen": 64225280,
      "step": 980
    },
    {
      "epoch": 0.7930357771851255,
      "grad_norm": 1.2814356088638306,
      "learning_rate": 2.549011951508665e-05,
      "loss": 4.2869,
      "num_input_tokens_seen": 64552960,
      "step": 985
    },
    {
      "epoch": 0.7970613395058622,
      "grad_norm": 1.3956332206726074,
      "learning_rate": 2.5425669046549127e-05,
      "loss": 4.3438,
      "num_input_tokens_seen": 64880640,
      "step": 990
    },
    {
      "epoch": 0.8010869018265989,
      "grad_norm": 1.3514597415924072,
      "learning_rate": 2.536170499792966e-05,
      "loss": 4.3132,
      "num_input_tokens_seen": 65208320,
      "step": 995
    },
    {
      "epoch": 0.8051124641473356,
      "grad_norm": 1.198144793510437,
      "learning_rate": 2.5298221281347037e-05,
      "loss": 4.291,
      "num_input_tokens_seen": 65536000,
      "step": 1000
    },
    {
      "epoch": 0.8091380264680723,
      "grad_norm": 1.2150394916534424,
      "learning_rate": 2.523521191506047e-05,
      "loss": 4.3226,
      "num_input_tokens_seen": 65863680,
      "step": 1005
    },
    {
      "epoch": 0.8131635887888089,
      "grad_norm": 1.3794989585876465,
      "learning_rate": 2.5172671021102106e-05,
      "loss": 4.331,
      "num_input_tokens_seen": 66191360,
      "step": 1010
    },
    {
      "epoch": 0.8171891511095456,
      "grad_norm": 1.2272028923034668,
      "learning_rate": 2.511059282297378e-05,
      "loss": 4.3232,
      "num_input_tokens_seen": 66519040,
      "step": 1015
    },
    {
      "epoch": 0.8212147134302823,
      "grad_norm": 1.275801420211792,
      "learning_rate": 2.5048971643405984e-05,
      "loss": 4.3274,
      "num_input_tokens_seen": 66846720,
      "step": 1020
    },
    {
      "epoch": 0.825240275751019,
      "grad_norm": 1.3816701173782349,
      "learning_rate": 2.4987801902176975e-05,
      "loss": 4.3331,
      "num_input_tokens_seen": 67174400,
      "step": 1025
    },
    {
      "epoch": 0.8292658380717557,
      "grad_norm": 1.2978571653366089,
      "learning_rate": 2.4927078113990235e-05,
      "loss": 4.2978,
      "num_input_tokens_seen": 67502080,
      "step": 1030
    },
    {
      "epoch": 0.8332914003924923,
      "grad_norm": 1.3196457624435425,
      "learning_rate": 2.486679488640837e-05,
      "loss": 4.2904,
      "num_input_tokens_seen": 67829760,
      "step": 1035
    },
    {
      "epoch": 0.837316962713229,
      "grad_norm": 1.3695876598358154,
      "learning_rate": 2.480694691784169e-05,
      "loss": 4.3261,
      "num_input_tokens_seen": 68157440,
      "step": 1040
    },
    {
      "epoch": 0.8413425250339657,
      "grad_norm": 1.3481625318527222,
      "learning_rate": 2.474752899558984e-05,
      "loss": 4.2861,
      "num_input_tokens_seen": 68485120,
      "step": 1045
    },
    {
      "epoch": 0.8453680873547024,
      "grad_norm": 1.1765189170837402,
      "learning_rate": 2.4688535993934706e-05,
      "loss": 4.342,
      "num_input_tokens_seen": 68812800,
      "step": 1050
    },
    {
      "epoch": 0.8453680873547024,
      "eval_accuracy": 0.3695067117398011,
      "eval_loss": 4.1475090980529785,
      "eval_runtime": 6.6416,
      "eval_samples_per_second": 45.17,
      "eval_steps_per_second": 5.722,
      "num_input_tokens_seen": 68812800,
      "step": 1050
    },
    {
      "epoch": 0.8493936496754391,
      "grad_norm": 1.3343596458435059,
      "learning_rate": 2.4629962872283168e-05,
      "loss": 4.2779,
      "num_input_tokens_seen": 69140480,
      "step": 1055
    },
    {
      "epoch": 0.8534192119961758,
      "grad_norm": 1.31355619430542,
      "learning_rate": 2.4571804673358052e-05,
      "loss": 4.2672,
      "num_input_tokens_seen": 69468160,
      "step": 1060
    },
    {
      "epoch": 0.8574447743169124,
      "grad_norm": 1.2236064672470093,
      "learning_rate": 2.4514056521435823e-05,
      "loss": 4.3232,
      "num_input_tokens_seen": 69795840,
      "step": 1065
    },
    {
      "epoch": 0.861470336637649,
      "grad_norm": 1.2435745000839233,
      "learning_rate": 2.4456713620629726e-05,
      "loss": 4.3067,
      "num_input_tokens_seen": 70123520,
      "step": 1070
    },
    {
      "epoch": 0.8654958989583857,
      "grad_norm": 1.2577111721038818,
      "learning_rate": 2.439977125321675e-05,
      "loss": 4.3227,
      "num_input_tokens_seen": 70451200,
      "step": 1075
    },
    {
      "epoch": 0.8695214612791224,
      "grad_norm": 1.1730388402938843,
      "learning_rate": 2.4343224778007382e-05,
      "loss": 4.2894,
      "num_input_tokens_seen": 70778880,
      "step": 1080
    },
    {
      "epoch": 0.8735470235998591,
      "grad_norm": 1.2375553846359253,
      "learning_rate": 2.4287069628756655e-05,
      "loss": 4.3004,
      "num_input_tokens_seen": 71106560,
      "step": 1085
    },
    {
      "epoch": 0.8775725859205958,
      "grad_norm": 1.3142080307006836,
      "learning_rate": 2.4231301312615306e-05,
      "loss": 4.3036,
      "num_input_tokens_seen": 71434240,
      "step": 1090
    },
    {
      "epoch": 0.8815981482413324,
      "grad_norm": 1.218998670578003,
      "learning_rate": 2.4175915408619977e-05,
      "loss": 4.3097,
      "num_input_tokens_seen": 71761920,
      "step": 1095
    },
    {
      "epoch": 0.8856237105620691,
      "grad_norm": 1.312283992767334,
      "learning_rate": 2.4120907566221092e-05,
      "loss": 4.2724,
      "num_input_tokens_seen": 72089600,
      "step": 1100
    },
    {
      "epoch": 0.8896492728828058,
      "grad_norm": 1.273376703262329,
      "learning_rate": 2.4066273503847476e-05,
      "loss": 4.2896,
      "num_input_tokens_seen": 72417280,
      "step": 1105
    },
    {
      "epoch": 0.8936748352035425,
      "grad_norm": 1.2696881294250488,
      "learning_rate": 2.4012009007506573e-05,
      "loss": 4.3627,
      "num_input_tokens_seen": 72744960,
      "step": 1110
    },
    {
      "epoch": 0.8977003975242792,
      "grad_norm": 1.3189187049865723,
      "learning_rate": 2.3958109929419195e-05,
      "loss": 4.2899,
      "num_input_tokens_seen": 73072640,
      "step": 1115
    },
    {
      "epoch": 0.9017259598450158,
      "grad_norm": 1.1770976781845093,
      "learning_rate": 2.3904572186687872e-05,
      "loss": 4.2978,
      "num_input_tokens_seen": 73400320,
      "step": 1120
    },
    {
      "epoch": 0.9057515221657525,
      "grad_norm": 1.1813838481903076,
      "learning_rate": 2.385139175999776e-05,
      "loss": 4.2919,
      "num_input_tokens_seen": 73728000,
      "step": 1125
    },
    {
      "epoch": 0.9097770844864892,
      "grad_norm": 1.2608633041381836,
      "learning_rate": 2.3798564692349184e-05,
      "loss": 4.3118,
      "num_input_tokens_seen": 74055680,
      "step": 1130
    },
    {
      "epoch": 0.9138026468072259,
      "grad_norm": 1.3048661947250366,
      "learning_rate": 2.3746087087820993e-05,
      "loss": 4.286,
      "num_input_tokens_seen": 74383360,
      "step": 1135
    },
    {
      "epoch": 0.9178282091279626,
      "grad_norm": 1.1029573678970337,
      "learning_rate": 2.3693955110363694e-05,
      "loss": 4.2935,
      "num_input_tokens_seen": 74711040,
      "step": 1140
    },
    {
      "epoch": 0.9218537714486993,
      "grad_norm": 1.2217323780059814,
      "learning_rate": 2.3642164982621672e-05,
      "loss": 4.3049,
      "num_input_tokens_seen": 75038720,
      "step": 1145
    },
    {
      "epoch": 0.9258793337694359,
      "grad_norm": 1.2029818296432495,
      "learning_rate": 2.3590712984783544e-05,
      "loss": 4.3105,
      "num_input_tokens_seen": 75366400,
      "step": 1150
    },
    {
      "epoch": 0.9299048960901726,
      "grad_norm": 1.3691645860671997,
      "learning_rate": 2.353959545345999e-05,
      "loss": 4.2643,
      "num_input_tokens_seen": 75694080,
      "step": 1155
    },
    {
      "epoch": 0.9339304584109093,
      "grad_norm": 1.441178321838379,
      "learning_rate": 2.3488808780588142e-05,
      "loss": 4.2654,
      "num_input_tokens_seen": 76021760,
      "step": 1160
    },
    {
      "epoch": 0.937956020731646,
      "grad_norm": 1.2421964406967163,
      "learning_rate": 2.3438349412361906e-05,
      "loss": 4.3056,
      "num_input_tokens_seen": 76349440,
      "step": 1165
    },
    {
      "epoch": 0.9419815830523827,
      "grad_norm": 1.349487543106079,
      "learning_rate": 2.338821384818745e-05,
      "loss": 4.281,
      "num_input_tokens_seen": 76677120,
      "step": 1170
    },
    {
      "epoch": 0.9460071453731194,
      "grad_norm": 1.3027498722076416,
      "learning_rate": 2.3338398639663132e-05,
      "loss": 4.2827,
      "num_input_tokens_seen": 77004800,
      "step": 1175
    },
    {
      "epoch": 0.950032707693856,
      "grad_norm": 1.313684344291687,
      "learning_rate": 2.3288900389583283e-05,
      "loss": 4.3015,
      "num_input_tokens_seen": 77332480,
      "step": 1180
    },
    {
      "epoch": 0.9540582700145926,
      "grad_norm": 1.1965482234954834,
      "learning_rate": 2.3239715750965074e-05,
      "loss": 4.2841,
      "num_input_tokens_seen": 77660160,
      "step": 1185
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 1.2446861267089844,
      "learning_rate": 2.3190841426097943e-05,
      "loss": 4.3269,
      "num_input_tokens_seen": 77987840,
      "step": 1190
    },
    {
      "epoch": 0.962109394656066,
      "grad_norm": 1.218593955039978,
      "learning_rate": 2.314227416561487e-05,
      "loss": 4.3048,
      "num_input_tokens_seen": 78315520,
      "step": 1195
    },
    {
      "epoch": 0.9661349569768027,
      "grad_norm": 1.1979824304580688,
      "learning_rate": 2.3094010767585035e-05,
      "loss": 4.268,
      "num_input_tokens_seen": 78643200,
      "step": 1200
    },
    {
      "epoch": 0.9661349569768027,
      "eval_accuracy": 0.37141276761892367,
      "eval_loss": 4.121544361114502,
      "eval_runtime": 6.4083,
      "eval_samples_per_second": 46.814,
      "eval_steps_per_second": 5.93,
      "num_input_tokens_seen": 78643200,
      "step": 1200
    },
    {
      "epoch": 0.9701605192975393,
      "grad_norm": 1.2462401390075684,
      "learning_rate": 2.3046048076627097e-05,
      "loss": 4.2736,
      "num_input_tokens_seen": 78970880,
      "step": 1205
    },
    {
      "epoch": 0.974186081618276,
      "grad_norm": 1.3557130098342896,
      "learning_rate": 2.2998382983042763e-05,
      "loss": 4.2592,
      "num_input_tokens_seen": 79298560,
      "step": 1210
    },
    {
      "epoch": 0.9782116439390127,
      "grad_norm": 1.342148780822754,
      "learning_rate": 2.2951012421969877e-05,
      "loss": 4.2947,
      "num_input_tokens_seen": 79626240,
      "step": 1215
    },
    {
      "epoch": 0.9822372062597494,
      "grad_norm": 1.269107699394226,
      "learning_rate": 2.290393337255473e-05,
      "loss": 4.2789,
      "num_input_tokens_seen": 79953920,
      "step": 1220
    },
    {
      "epoch": 0.9862627685804861,
      "grad_norm": 1.189437747001648,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 4.2793,
      "num_input_tokens_seen": 80281600,
      "step": 1225
    },
    {
      "epoch": 0.9902883309012228,
      "grad_norm": 1.217842698097229,
      "learning_rate": 2.2810637940488043e-05,
      "loss": 4.2606,
      "num_input_tokens_seen": 80609280,
      "step": 1230
    },
    {
      "epoch": 0.9943138932219594,
      "grad_norm": 1.1911648511886597,
      "learning_rate": 2.2764415728978892e-05,
      "loss": 4.2786,
      "num_input_tokens_seen": 80936960,
      "step": 1235
    },
    {
      "epoch": 0.9983394555426961,
      "grad_norm": 1.2993587255477905,
      "learning_rate": 2.2718473369882594e-05,
      "loss": 4.2791,
      "num_input_tokens_seen": 81264640,
      "step": 1240
    },
    {
      "epoch": 1.002365017863433,
      "grad_norm": 1.2911006212234497,
      "learning_rate": 2.2672808050605434e-05,
      "loss": 4.2619,
      "num_input_tokens_seen": 81591296,
      "step": 1245
    },
    {
      "epoch": 1.0063905801841695,
      "grad_norm": 1.4278241395950317,
      "learning_rate": 2.2627416997969522e-05,
      "loss": 4.2724,
      "num_input_tokens_seen": 81918976,
      "step": 1250
    },
    {
      "epoch": 1.010416142504906,
      "grad_norm": 1.3427342176437378,
      "learning_rate": 2.2582297477505467e-05,
      "loss": 4.2481,
      "num_input_tokens_seen": 82246656,
      "step": 1255
    },
    {
      "epoch": 1.0144417048256429,
      "grad_norm": 1.337169885635376,
      "learning_rate": 2.253744679276044e-05,
      "loss": 4.291,
      "num_input_tokens_seen": 82574336,
      "step": 1260
    },
    {
      "epoch": 1.0184672671463795,
      "grad_norm": 1.2553439140319824,
      "learning_rate": 2.249286228462133e-05,
      "loss": 4.2706,
      "num_input_tokens_seen": 82902016,
      "step": 1265
    },
    {
      "epoch": 1.0224928294671163,
      "grad_norm": 1.2947965860366821,
      "learning_rate": 2.244854133065255e-05,
      "loss": 4.21,
      "num_input_tokens_seen": 83229696,
      "step": 1270
    },
    {
      "epoch": 1.0265183917878529,
      "grad_norm": 1.327486515045166,
      "learning_rate": 2.2404481344448157e-05,
      "loss": 4.2847,
      "num_input_tokens_seen": 83557376,
      "step": 1275
    },
    {
      "epoch": 1.0305439541085895,
      "grad_norm": 1.2696667909622192,
      "learning_rate": 2.23606797749979e-05,
      "loss": 4.2864,
      "num_input_tokens_seen": 83885056,
      "step": 1280
    },
    {
      "epoch": 1.0345695164293263,
      "grad_norm": 1.3726356029510498,
      "learning_rate": 2.2317134106066828e-05,
      "loss": 4.265,
      "num_input_tokens_seen": 84212736,
      "step": 1285
    },
    {
      "epoch": 1.0385950787500629,
      "grad_norm": 1.250604271888733,
      "learning_rate": 2.2273841855588186e-05,
      "loss": 4.2894,
      "num_input_tokens_seen": 84540416,
      "step": 1290
    },
    {
      "epoch": 1.0426206410707997,
      "grad_norm": 1.3346271514892578,
      "learning_rate": 2.223080057506914e-05,
      "loss": 4.2751,
      "num_input_tokens_seen": 84868096,
      "step": 1295
    },
    {
      "epoch": 1.0466462033915362,
      "grad_norm": 1.272316575050354,
      "learning_rate": 2.2188007849009167e-05,
      "loss": 4.2783,
      "num_input_tokens_seen": 85195776,
      "step": 1300
    },
    {
      "epoch": 1.0506717657122728,
      "grad_norm": 1.37924063205719,
      "learning_rate": 2.214546129433066e-05,
      "loss": 4.2534,
      "num_input_tokens_seen": 85523456,
      "step": 1305
    },
    {
      "epoch": 1.0546973280330096,
      "grad_norm": 1.4074842929840088,
      "learning_rate": 2.2103158559821507e-05,
      "loss": 4.2809,
      "num_input_tokens_seen": 85851136,
      "step": 1310
    },
    {
      "epoch": 1.0587228903537462,
      "grad_norm": 1.3106921911239624,
      "learning_rate": 2.206109732558935e-05,
      "loss": 4.2831,
      "num_input_tokens_seen": 86178816,
      "step": 1315
    },
    {
      "epoch": 1.062748452674483,
      "grad_norm": 1.1723382472991943,
      "learning_rate": 2.2019275302527215e-05,
      "loss": 4.2617,
      "num_input_tokens_seen": 86506496,
      "step": 1320
    },
    {
      "epoch": 1.0667740149952196,
      "grad_norm": 1.2937371730804443,
      "learning_rate": 2.1977690231790248e-05,
      "loss": 4.2897,
      "num_input_tokens_seen": 86834176,
      "step": 1325
    },
    {
      "epoch": 1.0707995773159564,
      "grad_norm": 1.2420538663864136,
      "learning_rate": 2.193633988428327e-05,
      "loss": 4.2304,
      "num_input_tokens_seen": 87161856,
      "step": 1330
    },
    {
      "epoch": 1.074825139636693,
      "grad_norm": 1.2572234869003296,
      "learning_rate": 2.1895222060158936e-05,
      "loss": 4.2599,
      "num_input_tokens_seen": 87489536,
      "step": 1335
    },
    {
      "epoch": 1.0788507019574296,
      "grad_norm": 1.302157998085022,
      "learning_rate": 2.1854334588326122e-05,
      "loss": 4.2394,
      "num_input_tokens_seen": 87817216,
      "step": 1340
    },
    {
      "epoch": 1.0828762642781664,
      "grad_norm": 1.2681870460510254,
      "learning_rate": 2.1813675325968476e-05,
      "loss": 4.3021,
      "num_input_tokens_seen": 88144896,
      "step": 1345
    },
    {
      "epoch": 1.086901826598903,
      "grad_norm": 1.1405740976333618,
      "learning_rate": 2.1773242158072697e-05,
      "loss": 4.2185,
      "num_input_tokens_seen": 88472576,
      "step": 1350
    },
    {
      "epoch": 1.086901826598903,
      "eval_accuracy": 0.37249442300490127,
      "eval_loss": 4.103240489959717,
      "eval_runtime": 6.6226,
      "eval_samples_per_second": 45.3,
      "eval_steps_per_second": 5.738,
      "num_input_tokens_seen": 88472576,
      "step": 1350
    },
{ |
|
"epoch": 1.0909273889196398, |
|
"grad_norm": 1.144243836402893, |
|
"learning_rate": 2.1733032996966454e-05, |
|
"loss": 4.2665, |
|
"num_input_tokens_seen": 88800256, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.0949529512403764, |
|
"grad_norm": 1.158898115158081, |
|
"learning_rate": 2.1693045781865617e-05, |
|
"loss": 4.2608, |
|
"num_input_tokens_seen": 89127936, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.098978513561113, |
|
"grad_norm": 1.2325701713562012, |
|
"learning_rate": 2.165327847843067e-05, |
|
"loss": 4.2529, |
|
"num_input_tokens_seen": 89455616, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.1030040758818498, |
|
"grad_norm": 1.3270164728164673, |
|
"learning_rate": 2.161372907833197e-05, |
|
"loss": 4.2576, |
|
"num_input_tokens_seen": 89783296, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.1070296382025864, |
|
"grad_norm": 1.2680999040603638, |
|
"learning_rate": 2.157439559882375e-05, |
|
"loss": 4.2411, |
|
"num_input_tokens_seen": 90110976, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.1110552005233232, |
|
"grad_norm": 1.37815260887146, |
|
"learning_rate": 2.1535276082326623e-05, |
|
"loss": 4.3014, |
|
"num_input_tokens_seen": 90438656, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.1150807628440598, |
|
"grad_norm": 1.165544867515564, |
|
"learning_rate": 2.149636859601836e-05, |
|
"loss": 4.2405, |
|
"num_input_tokens_seen": 90766336, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.1191063251647964, |
|
"grad_norm": 1.184525966644287, |
|
"learning_rate": 2.1457671231432803e-05, |
|
"loss": 4.2646, |
|
"num_input_tokens_seen": 91094016, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.1231318874855332, |
|
"grad_norm": 1.216493010520935, |
|
"learning_rate": 2.1419182104066638e-05, |
|
"loss": 4.2586, |
|
"num_input_tokens_seen": 91421696, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.1271574498062698, |
|
"grad_norm": 1.4095522165298462, |
|
"learning_rate": 2.1380899352993955e-05, |
|
"loss": 4.2576, |
|
"num_input_tokens_seen": 91749376, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1311830121270066, |
|
"grad_norm": 1.1573916673660278, |
|
"learning_rate": 2.134282114048833e-05, |
|
"loss": 4.2463, |
|
"num_input_tokens_seen": 92077056, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.1352085744477431, |
|
"grad_norm": 1.213179111480713, |
|
"learning_rate": 2.13049456516523e-05, |
|
"loss": 4.2402, |
|
"num_input_tokens_seen": 92404736, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.1392341367684797, |
|
"grad_norm": 1.197209119796753, |
|
"learning_rate": 2.1267271094054026e-05, |
|
"loss": 4.237, |
|
"num_input_tokens_seen": 92732416, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.1432596990892165, |
|
"grad_norm": 1.2352854013442993, |
|
"learning_rate": 2.1229795697371012e-05, |
|
"loss": 4.248, |
|
"num_input_tokens_seen": 93060096, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.1472852614099531, |
|
"grad_norm": 1.196413516998291, |
|
"learning_rate": 2.1192517713040704e-05, |
|
"loss": 4.272, |
|
"num_input_tokens_seen": 93387776, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.15131082373069, |
|
"grad_norm": 1.2881050109863281, |
|
"learning_rate": 2.11554354139178e-05, |
|
"loss": 4.258, |
|
"num_input_tokens_seen": 93715456, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.1553363860514265, |
|
"grad_norm": 1.2045120000839233, |
|
"learning_rate": 2.1118547093938186e-05, |
|
"loss": 4.2621, |
|
"num_input_tokens_seen": 94043136, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.1593619483721633, |
|
"grad_norm": 1.1793156862258911, |
|
"learning_rate": 2.1081851067789197e-05, |
|
"loss": 4.2386, |
|
"num_input_tokens_seen": 94370816, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.1633875106929, |
|
"grad_norm": 1.3340944051742554, |
|
"learning_rate": 2.1045345670586257e-05, |
|
"loss": 4.2837, |
|
"num_input_tokens_seen": 94698496, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.1674130730136365, |
|
"grad_norm": 1.2929213047027588, |
|
"learning_rate": 2.100902925755561e-05, |
|
"loss": 4.2334, |
|
"num_input_tokens_seen": 95026176, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.1714386353343733, |
|
"grad_norm": 1.3204522132873535, |
|
"learning_rate": 2.0972900203723056e-05, |
|
"loss": 4.2522, |
|
"num_input_tokens_seen": 95353856, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.17546419765511, |
|
"grad_norm": 1.305235743522644, |
|
"learning_rate": 2.0936956903608547e-05, |
|
"loss": 4.2534, |
|
"num_input_tokens_seen": 95681536, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.1794897599758467, |
|
"grad_norm": 1.2224024534225464, |
|
"learning_rate": 2.0901197770926567e-05, |
|
"loss": 4.2439, |
|
"num_input_tokens_seen": 96009216, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.1835153222965833, |
|
"grad_norm": 1.249813199043274, |
|
"learning_rate": 2.0865621238292045e-05, |
|
"loss": 4.2009, |
|
"num_input_tokens_seen": 96336896, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.18754088461732, |
|
"grad_norm": 1.2258191108703613, |
|
"learning_rate": 2.0830225756931824e-05, |
|
"loss": 4.2517, |
|
"num_input_tokens_seen": 96664576, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.1915664469380567, |
|
"grad_norm": 1.2485374212265015, |
|
"learning_rate": 2.0795009796401456e-05, |
|
"loss": 4.2254, |
|
"num_input_tokens_seen": 96992256, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.1955920092587933, |
|
"grad_norm": 1.2434406280517578, |
|
"learning_rate": 2.0759971844307282e-05, |
|
"loss": 4.208, |
|
"num_input_tokens_seen": 97319936, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.19961757157953, |
|
"grad_norm": 1.212769865989685, |
|
"learning_rate": 2.072511040603359e-05, |
|
"loss": 4.2253, |
|
"num_input_tokens_seen": 97647616, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.2036431339002667, |
|
"grad_norm": 1.1651945114135742, |
|
"learning_rate": 2.0690424004474868e-05, |
|
"loss": 4.2694, |
|
"num_input_tokens_seen": 97975296, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.2076686962210035, |
|
"grad_norm": 1.2737419605255127, |
|
"learning_rate": 2.065591117977289e-05, |
|
"loss": 4.2645, |
|
"num_input_tokens_seen": 98302976, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2076686962210035, |
|
"eval_accuracy": 0.37566890863567187, |
|
"eval_loss": 4.08589506149292, |
|
"eval_runtime": 6.6303, |
|
"eval_samples_per_second": 45.247, |
|
"eval_steps_per_second": 5.731, |
|
"num_input_tokens_seen": 98302976, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.21169425854174, |
|
"grad_norm": 1.2633168697357178, |
|
"learning_rate": 2.0621570489058684e-05, |
|
"loss": 4.2746, |
|
"num_input_tokens_seen": 98630656, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.2157198208624767, |
|
"grad_norm": 1.2094192504882812, |
|
"learning_rate": 2.0587400506199153e-05, |
|
"loss": 4.2431, |
|
"num_input_tokens_seen": 98958336, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.2197453831832135, |
|
"grad_norm": 1.2098276615142822, |
|
"learning_rate": 2.0553399821548317e-05, |
|
"loss": 4.2635, |
|
"num_input_tokens_seen": 99286016, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.22377094550395, |
|
"grad_norm": 1.1902521848678589, |
|
"learning_rate": 2.0519567041703087e-05, |
|
"loss": 4.283, |
|
"num_input_tokens_seen": 99613696, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.2277965078246869, |
|
"grad_norm": 1.2712699174880981, |
|
"learning_rate": 2.048590078926336e-05, |
|
"loss": 4.2737, |
|
"num_input_tokens_seen": 99941376, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.2318220701454234, |
|
"grad_norm": 1.2726564407348633, |
|
"learning_rate": 2.0452399702596545e-05, |
|
"loss": 4.2715, |
|
"num_input_tokens_seen": 100269056, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.23584763246616, |
|
"grad_norm": 1.2632936239242554, |
|
"learning_rate": 2.0419062435606238e-05, |
|
"loss": 4.2266, |
|
"num_input_tokens_seen": 100596736, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.2398731947868968, |
|
"grad_norm": 1.470897912979126, |
|
"learning_rate": 2.038588765750502e-05, |
|
"loss": 4.235, |
|
"num_input_tokens_seen": 100924416, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.2438987571076334, |
|
"grad_norm": 1.3083152770996094, |
|
"learning_rate": 2.035287405259138e-05, |
|
"loss": 4.233, |
|
"num_input_tokens_seen": 101252096, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.2479243194283702, |
|
"grad_norm": 1.2998907566070557, |
|
"learning_rate": 2.032002032003048e-05, |
|
"loss": 4.22, |
|
"num_input_tokens_seen": 101579776, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.2519498817491068, |
|
"grad_norm": 1.3198977708816528, |
|
"learning_rate": 2.0287325173638952e-05, |
|
"loss": 4.236, |
|
"num_input_tokens_seen": 101907456, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.2559754440698434, |
|
"grad_norm": 1.3605294227600098, |
|
"learning_rate": 2.0254787341673334e-05, |
|
"loss": 4.2318, |
|
"num_input_tokens_seen": 102235136, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.2600010063905802, |
|
"grad_norm": 1.2605648040771484, |
|
"learning_rate": 2.0222405566622346e-05, |
|
"loss": 4.232, |
|
"num_input_tokens_seen": 102562816, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.2640265687113168, |
|
"grad_norm": 1.2441262006759644, |
|
"learning_rate": 2.019017860500275e-05, |
|
"loss": 4.1917, |
|
"num_input_tokens_seen": 102890496, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.2680521310320536, |
|
"grad_norm": 1.3246742486953735, |
|
"learning_rate": 2.0158105227158783e-05, |
|
"loss": 4.2272, |
|
"num_input_tokens_seen": 103218176, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.2720776933527902, |
|
"grad_norm": 1.2322101593017578, |
|
"learning_rate": 2.0126184217065105e-05, |
|
"loss": 4.1989, |
|
"num_input_tokens_seen": 103545856, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.2761032556735268, |
|
"grad_norm": 1.2290526628494263, |
|
"learning_rate": 2.0094414372133134e-05, |
|
"loss": 4.2389, |
|
"num_input_tokens_seen": 103873536, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.2801288179942636, |
|
"grad_norm": 1.3795405626296997, |
|
"learning_rate": 2.006279450302077e-05, |
|
"loss": 4.1988, |
|
"num_input_tokens_seen": 104201216, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.2841543803150002, |
|
"grad_norm": 1.3456906080245972, |
|
"learning_rate": 2.003132343344538e-05, |
|
"loss": 4.2586, |
|
"num_input_tokens_seen": 104528896, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.288179942635737, |
|
"grad_norm": 1.2667040824890137, |
|
"learning_rate": 2e-05, |
|
"loss": 4.2505, |
|
"num_input_tokens_seen": 104856576, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.2922055049564736, |
|
"grad_norm": 1.2646143436431885, |
|
"learning_rate": 1.9968823051972716e-05, |
|
"loss": 4.2537, |
|
"num_input_tokens_seen": 105184256, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.2962310672772102, |
|
"grad_norm": 1.2952028512954712, |
|
"learning_rate": 1.9937791451169073e-05, |
|
"loss": 4.2439, |
|
"num_input_tokens_seen": 105511936, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.300256629597947, |
|
"grad_norm": 1.346574068069458, |
|
"learning_rate": 1.9906904071737584e-05, |
|
"loss": 4.24, |
|
"num_input_tokens_seen": 105839616, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.3042821919186838, |
|
"grad_norm": 1.2765649557113647, |
|
"learning_rate": 1.9876159799998135e-05, |
|
"loss": 4.2229, |
|
"num_input_tokens_seen": 106167296, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.3083077542394204, |
|
"grad_norm": 1.3729643821716309, |
|
"learning_rate": 1.9845557534273358e-05, |
|
"loss": 4.201, |
|
"num_input_tokens_seen": 106494976, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.312333316560157, |
|
"grad_norm": 1.2387956380844116, |
|
"learning_rate": 1.98150961847228e-05, |
|
"loss": 4.2235, |
|
"num_input_tokens_seen": 106822656, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.3163588788808938, |
|
"grad_norm": 1.283973217010498, |
|
"learning_rate": 1.978477467317992e-05, |
|
"loss": 4.2616, |
|
"num_input_tokens_seen": 107150336, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.3203844412016303, |
|
"grad_norm": 1.3440768718719482, |
|
"learning_rate": 1.9754591932991796e-05, |
|
"loss": 4.2215, |
|
"num_input_tokens_seen": 107478016, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.3244100035223672, |
|
"grad_norm": 1.292891263961792, |
|
"learning_rate": 1.9724546908861517e-05, |
|
"loss": 4.1927, |
|
"num_input_tokens_seen": 107805696, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.3284355658431037, |
|
"grad_norm": 1.3360971212387085, |
|
"learning_rate": 1.9694638556693238e-05, |
|
"loss": 4.2542, |
|
"num_input_tokens_seen": 108133376, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.3284355658431037, |
|
"eval_accuracy": 0.37502241786907964, |
|
"eval_loss": 4.072964668273926, |
|
"eval_runtime": 6.6283, |
|
"eval_samples_per_second": 45.26, |
|
"eval_steps_per_second": 5.733, |
|
"num_input_tokens_seen": 108133376, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.3324611281638403, |
|
"grad_norm": 1.2728638648986816, |
|
"learning_rate": 1.9664865843439752e-05, |
|
"loss": 4.2287, |
|
"num_input_tokens_seen": 108461056, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.3364866904845771, |
|
"grad_norm": 1.242002248764038, |
|
"learning_rate": 1.9635227746952642e-05, |
|
"loss": 4.2355, |
|
"num_input_tokens_seen": 108788736, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.3405122528053137, |
|
"grad_norm": 1.2821120023727417, |
|
"learning_rate": 1.9605723255834874e-05, |
|
"loss": 4.2256, |
|
"num_input_tokens_seen": 109116416, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.3445378151260505, |
|
"grad_norm": 1.380469560623169, |
|
"learning_rate": 1.9576351369295855e-05, |
|
"loss": 4.2284, |
|
"num_input_tokens_seen": 109444096, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.3485633774467871, |
|
"grad_norm": 1.241410493850708, |
|
"learning_rate": 1.9547111097008838e-05, |
|
"loss": 4.2122, |
|
"num_input_tokens_seen": 109771776, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.3525889397675237, |
|
"grad_norm": 1.2036563158035278, |
|
"learning_rate": 1.9518001458970662e-05, |
|
"loss": 4.2312, |
|
"num_input_tokens_seen": 110099456, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.3566145020882605, |
|
"grad_norm": 1.3382368087768555, |
|
"learning_rate": 1.9489021485363838e-05, |
|
"loss": 4.2168, |
|
"num_input_tokens_seen": 110427136, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.360640064408997, |
|
"grad_norm": 1.2150115966796875, |
|
"learning_rate": 1.9460170216420796e-05, |
|
"loss": 4.2322, |
|
"num_input_tokens_seen": 110754816, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.364665626729734, |
|
"grad_norm": 1.2400802373886108, |
|
"learning_rate": 1.9431446702290413e-05, |
|
"loss": 4.2018, |
|
"num_input_tokens_seen": 111082496, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.3686911890504705, |
|
"grad_norm": 1.3871190547943115, |
|
"learning_rate": 1.9402850002906638e-05, |
|
"loss": 4.2346, |
|
"num_input_tokens_seen": 111410176, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.372716751371207, |
|
"grad_norm": 1.2957957983016968, |
|
"learning_rate": 1.9374379187859312e-05, |
|
"loss": 4.2078, |
|
"num_input_tokens_seen": 111737856, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.3767423136919439, |
|
"grad_norm": 1.2237967252731323, |
|
"learning_rate": 1.934603333626698e-05, |
|
"loss": 4.2472, |
|
"num_input_tokens_seen": 112065536, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.3807678760126805, |
|
"grad_norm": 1.2709431648254395, |
|
"learning_rate": 1.9317811536651808e-05, |
|
"loss": 4.2679, |
|
"num_input_tokens_seen": 112393216, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.3847934383334173, |
|
"grad_norm": 1.3072236776351929, |
|
"learning_rate": 1.9289712886816488e-05, |
|
"loss": 4.2672, |
|
"num_input_tokens_seen": 112720896, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.3888190006541539, |
|
"grad_norm": 1.2216880321502686, |
|
"learning_rate": 1.9261736493723075e-05, |
|
"loss": 4.2124, |
|
"num_input_tokens_seen": 113048576, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.3928445629748905, |
|
"grad_norm": 1.2216253280639648, |
|
"learning_rate": 1.9233881473373802e-05, |
|
"loss": 4.2168, |
|
"num_input_tokens_seen": 113376256, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.3968701252956273, |
|
"grad_norm": 1.15086030960083, |
|
"learning_rate": 1.9206146950693745e-05, |
|
"loss": 4.2097, |
|
"num_input_tokens_seen": 113703936, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.4008956876163638, |
|
"grad_norm": 1.2962745428085327, |
|
"learning_rate": 1.917853205941537e-05, |
|
"loss": 4.2328, |
|
"num_input_tokens_seen": 114031616, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.4049212499371007, |
|
"grad_norm": 1.3710294961929321, |
|
"learning_rate": 1.915103594196486e-05, |
|
"loss": 4.2205, |
|
"num_input_tokens_seen": 114359296, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.4089468122578372, |
|
"grad_norm": 1.290256142616272, |
|
"learning_rate": 1.91236577493503e-05, |
|
"loss": 4.2239, |
|
"num_input_tokens_seen": 114686976, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.4129723745785738, |
|
"grad_norm": 1.1622158288955688, |
|
"learning_rate": 1.9096396641051548e-05, |
|
"loss": 4.2077, |
|
"num_input_tokens_seen": 115014656, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.4169979368993106, |
|
"grad_norm": 1.2251081466674805, |
|
"learning_rate": 1.9069251784911845e-05, |
|
"loss": 4.2479, |
|
"num_input_tokens_seen": 115342336, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.4210234992200472, |
|
"grad_norm": 1.2181967496871948, |
|
"learning_rate": 1.9042222357031166e-05, |
|
"loss": 4.2252, |
|
"num_input_tokens_seen": 115670016, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.425049061540784, |
|
"grad_norm": 1.3037039041519165, |
|
"learning_rate": 1.9015307541661134e-05, |
|
"loss": 4.22, |
|
"num_input_tokens_seen": 115997696, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.4290746238615206, |
|
"grad_norm": 1.2290278673171997, |
|
"learning_rate": 1.8988506531101655e-05, |
|
"loss": 4.22, |
|
"num_input_tokens_seen": 116325376, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.4331001861822572, |
|
"grad_norm": 1.287455677986145, |
|
"learning_rate": 1.8961818525599093e-05, |
|
"loss": 4.2533, |
|
"num_input_tokens_seen": 116653056, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.437125748502994, |
|
"grad_norm": 1.1899845600128174, |
|
"learning_rate": 1.8935242733246034e-05, |
|
"loss": 4.2289, |
|
"num_input_tokens_seen": 116980736, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.4411513108237308, |
|
"grad_norm": 1.2166035175323486, |
|
"learning_rate": 1.8908778369882623e-05, |
|
"loss": 4.218, |
|
"num_input_tokens_seen": 117308416, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.4451768731444674, |
|
"grad_norm": 1.2712666988372803, |
|
"learning_rate": 1.888242465899932e-05, |
|
"loss": 4.1944, |
|
"num_input_tokens_seen": 117636096, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.449202435465204, |
|
"grad_norm": 1.1924896240234375, |
|
"learning_rate": 1.885618083164127e-05, |
|
"loss": 4.2614, |
|
"num_input_tokens_seen": 117963776, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.449202435465204, |
|
"eval_accuracy": 0.37485385814497274, |
|
"eval_loss": 4.068163871765137, |
|
"eval_runtime": 7.4672, |
|
"eval_samples_per_second": 40.176, |
|
"eval_steps_per_second": 5.089, |
|
"num_input_tokens_seen": 117963776, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.4532279977859408, |
|
"grad_norm": 1.2685041427612305, |
|
"learning_rate": 1.883004612631402e-05, |
|
"loss": 4.236, |
|
"num_input_tokens_seen": 118291456, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.4572535601066774, |
|
"grad_norm": 1.1977604627609253, |
|
"learning_rate": 1.880401978889074e-05, |
|
"loss": 4.258, |
|
"num_input_tokens_seen": 118619136, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.4612791224274142, |
|
"grad_norm": 1.337408423423767, |
|
"learning_rate": 1.877810107252081e-05, |
|
"loss": 4.2549, |
|
"num_input_tokens_seen": 118946816, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.4653046847481508, |
|
"grad_norm": 1.348046898841858, |
|
"learning_rate": 1.875228923753982e-05, |
|
"loss": 4.2513, |
|
"num_input_tokens_seen": 119274496, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.4693302470688874, |
|
"grad_norm": 1.2587336301803589, |
|
"learning_rate": 1.8726583551380893e-05, |
|
"loss": 4.2251, |
|
"num_input_tokens_seen": 119602176, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.4733558093896242, |
|
"grad_norm": 1.3538405895233154, |
|
"learning_rate": 1.8700983288487377e-05, |
|
"loss": 4.2219, |
|
"num_input_tokens_seen": 119929856, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.4773813717103608, |
|
"grad_norm": 1.2679195404052734, |
|
"learning_rate": 1.8675487730226835e-05, |
|
"loss": 4.2026, |
|
"num_input_tokens_seen": 120257536, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.4814069340310976, |
|
"grad_norm": 1.2712976932525635, |
|
"learning_rate": 1.8650096164806278e-05, |
|
"loss": 4.2277, |
|
"num_input_tokens_seen": 120585216, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.4854324963518342, |
|
"grad_norm": 1.2575690746307373, |
|
"learning_rate": 1.862480788718875e-05, |
|
"loss": 4.2329, |
|
"num_input_tokens_seen": 120912896, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.4894580586725708, |
|
"grad_norm": 1.2137782573699951, |
|
"learning_rate": 1.8599622199011086e-05, |
|
"loss": 4.2076, |
|
"num_input_tokens_seen": 121240576, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.4934836209933076, |
|
"grad_norm": 1.2346251010894775, |
|
"learning_rate": 1.8574538408502883e-05, |
|
"loss": 4.2357, |
|
"num_input_tokens_seen": 121568256, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.4975091833140441, |
|
"grad_norm": 1.2532144784927368, |
|
"learning_rate": 1.854955583040673e-05, |
|
"loss": 4.2616, |
|
"num_input_tokens_seen": 121895936, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.501534745634781, |
|
"grad_norm": 1.1912769079208374, |
|
"learning_rate": 1.8524673785899573e-05, |
|
"loss": 4.1788, |
|
"num_input_tokens_seen": 122223616, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.5055603079555175, |
|
"grad_norm": 1.2742645740509033, |
|
"learning_rate": 1.849989160251521e-05, |
|
"loss": 4.2119, |
|
"num_input_tokens_seen": 122551296, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.5095858702762541, |
|
"grad_norm": 1.264236569404602, |
|
"learning_rate": 1.847520861406802e-05, |
|
"loss": 4.2219, |
|
"num_input_tokens_seen": 122878976, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.513611432596991, |
|
"grad_norm": 1.2975112199783325, |
|
"learning_rate": 1.8450624160577702e-05, |
|
"loss": 4.2227, |
|
"num_input_tokens_seen": 123206656, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.5176369949177275, |
|
"grad_norm": 1.231823205947876, |
|
"learning_rate": 1.842613758819515e-05, |
|
"loss": 4.2185, |
|
"num_input_tokens_seen": 123534336, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.5216625572384643, |
|
"grad_norm": 1.3271822929382324, |
|
"learning_rate": 1.8401748249129447e-05, |
|
"loss": 4.2174, |
|
"num_input_tokens_seen": 123862016, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.525688119559201, |
|
"grad_norm": 1.2804478406906128, |
|
"learning_rate": 1.8377455501575864e-05, |
|
"loss": 4.1906, |
|
"num_input_tokens_seen": 124189696, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.5297136818799375, |
|
"grad_norm": 1.2299883365631104, |
|
"learning_rate": 1.835325870964494e-05, |
|
"loss": 4.1957, |
|
"num_input_tokens_seen": 124517376, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.5337392442006743, |
|
"grad_norm": 1.2697046995162964, |
|
"learning_rate": 1.8329157243292555e-05, |
|
"loss": 4.2115, |
|
"num_input_tokens_seen": 124845056, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.5377648065214111, |
|
"grad_norm": 1.20834219455719, |
|
"learning_rate": 1.8305150478251023e-05, |
|
"loss": 4.2224, |
|
"num_input_tokens_seen": 125172736, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.5417903688421477, |
|
"grad_norm": 1.2465049028396606, |
|
"learning_rate": 1.8281237795961206e-05, |
|
"loss": 4.1757, |
|
"num_input_tokens_seen": 125500416, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.5458159311628843, |
|
"grad_norm": 1.2020412683486938, |
|
"learning_rate": 1.825741858350554e-05, |
|
"loss": 4.2047, |
|
"num_input_tokens_seen": 125828096, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.5498414934836209, |
|
"grad_norm": 1.384621024131775, |
|
"learning_rate": 1.8233692233542074e-05, |
|
"loss": 4.2553, |
|
"num_input_tokens_seen": 126155776, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.5538670558043577, |
|
"grad_norm": 1.212430477142334, |
|
"learning_rate": 1.8210058144239417e-05, |
|
"loss": 4.1952, |
|
"num_input_tokens_seen": 126483456, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.5578926181250945, |
|
"grad_norm": 1.2624080181121826, |
|
"learning_rate": 1.8186515719212627e-05, |
|
"loss": 4.2208, |
|
"num_input_tokens_seen": 126811136, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.561918180445831, |
|
"grad_norm": 1.2795370817184448, |
|
"learning_rate": 1.8163064367459993e-05, |
|
"loss": 4.1835, |
|
"num_input_tokens_seen": 127138816, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.5659437427665677, |
|
"grad_norm": 1.3003804683685303, |
|
"learning_rate": 1.813970350330073e-05, |
|
"loss": 4.2235, |
|
"num_input_tokens_seen": 127466496, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.5699693050873043, |
|
"grad_norm": 1.328814148902893, |
|
"learning_rate": 1.8116432546313533e-05, |
|
"loss": 4.1928, |
|
"num_input_tokens_seen": 127794176, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.5699693050873043, |
|
"eval_accuracy": 0.37578634243079373, |
|
"eval_loss": 4.0595831871032715, |
|
"eval_runtime": 7.0878, |
|
"eval_samples_per_second": 42.326, |
|
"eval_steps_per_second": 5.361, |
|
"num_input_tokens_seen": 127794176, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.573994867408041, |
|
"grad_norm": 1.258589506149292, |
|
"learning_rate": 1.8093250921276027e-05, |
|
"loss": 4.2336, |
|
"num_input_tokens_seen": 128121856, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.5780204297287779, |
|
"grad_norm": 1.226361632347107, |
|
"learning_rate": 1.8070158058105027e-05, |
|
"loss": 4.1945, |
|
"num_input_tokens_seen": 128449536, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.5820459920495145, |
|
"grad_norm": 1.2249149084091187, |
|
"learning_rate": 1.8047153391797662e-05, |
|
"loss": 4.1916, |
|
"num_input_tokens_seen": 128777216, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.586071554370251, |
|
"grad_norm": 1.3121646642684937, |
|
"learning_rate": 1.8024236362373317e-05, |
|
"loss": 4.1833, |
|
"num_input_tokens_seen": 129104896, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.5900971166909876, |
|
"grad_norm": 1.2635945081710815, |
|
"learning_rate": 1.8001406414816385e-05, |
|
"loss": 4.208, |
|
"num_input_tokens_seen": 129432576, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.5941226790117244, |
|
"grad_norm": 1.2306846380233765, |
|
"learning_rate": 1.797866299901979e-05, |
|
"loss": 4.1832, |
|
"num_input_tokens_seen": 129760256, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.5981482413324613, |
|
"grad_norm": 1.2278416156768799, |
|
"learning_rate": 1.7956005569729342e-05, |
|
"loss": 4.1765, |
|
"num_input_tokens_seen": 130087936, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.6021738036531978, |
|
"grad_norm": 1.2951785326004028, |
|
"learning_rate": 1.7933433586488815e-05, |
|
"loss": 4.2123, |
|
"num_input_tokens_seen": 130415616, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.6061993659739344, |
|
"grad_norm": 1.341288447380066, |
|
"learning_rate": 1.79109465135858e-05, |
|
"loss": 4.1865, |
|
"num_input_tokens_seen": 130743296, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.610224928294671, |
|
"grad_norm": 1.269637942314148, |
|
"learning_rate": 1.788854381999832e-05, |
|
"loss": 4.2004, |
|
"num_input_tokens_seen": 131070976, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.6142504906154078, |
|
"grad_norm": 1.3138514757156372, |
|
"learning_rate": 1.7866224979342134e-05, |
|
"loss": 4.2172, |
|
"num_input_tokens_seen": 131398656, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.6182760529361446, |
|
"grad_norm": 1.1714235544204712, |
|
"learning_rate": 1.7843989469818822e-05, |
|
"loss": 4.1903, |
|
"num_input_tokens_seen": 131726336, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.6223016152568812, |
|
"grad_norm": 1.3786474466323853, |
|
"learning_rate": 1.7821836774164523e-05, |
|
"loss": 4.2, |
|
"num_input_tokens_seen": 132054016, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.6263271775776178, |
|
"grad_norm": 1.3901410102844238, |
|
"learning_rate": 1.7799766379599393e-05, |
|
"loss": 4.1928, |
|
"num_input_tokens_seen": 132381696, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.6303527398983544, |
|
"grad_norm": 1.3432561159133911, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 4.1983, |
|
"num_input_tokens_seen": 132709376, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.6343783022190912, |
|
"grad_norm": 1.2629408836364746, |
|
"learning_rate": 1.7755870464739014e-05, |
|
"loss": 4.2136, |
|
"num_input_tokens_seen": 133037056, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.638403864539828, |
|
"grad_norm": 1.2436974048614502, |
|
"learning_rate": 1.7734043940858906e-05, |
|
"loss": 4.1695, |
|
"num_input_tokens_seen": 133364736, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.6424294268605646, |
|
"grad_norm": 1.2254369258880615, |
|
"learning_rate": 1.771229771080191e-05, |
|
"loss": 4.1636, |
|
"num_input_tokens_seen": 133692416, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.6464549891813012, |
|
"grad_norm": 1.3272947072982788, |
|
"learning_rate": 1.769063128347386e-05, |
|
"loss": 4.1979, |
|
"num_input_tokens_seen": 134020096, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.650480551502038, |
|
"grad_norm": 1.2953760623931885, |
|
"learning_rate": 1.7669044171975447e-05, |
|
"loss": 4.2026, |
|
"num_input_tokens_seen": 134347776, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.6545061138227746, |
|
"grad_norm": 1.2235223054885864, |
|
"learning_rate": 1.764753589355622e-05, |
|
"loss": 4.1674, |
|
"num_input_tokens_seen": 134675456, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.6585316761435114, |
|
"grad_norm": 1.1912940740585327, |
|
"learning_rate": 1.762610596956927e-05, |
|
"loss": 4.1811, |
|
"num_input_tokens_seen": 135003136, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.662557238464248, |
|
"grad_norm": 1.3255517482757568, |
|
"learning_rate": 1.7604753925426462e-05, |
|
"loss": 4.2094, |
|
"num_input_tokens_seen": 135330816, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.6665828007849846, |
|
"grad_norm": 1.2693239450454712, |
|
"learning_rate": 1.758347929055432e-05, |
|
"loss": 4.2113, |
|
"num_input_tokens_seen": 135658496, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.6706083631057214, |
|
"grad_norm": 1.2487210035324097, |
|
"learning_rate": 1.7562281598350458e-05, |
|
"loss": 4.2341, |
|
"num_input_tokens_seen": 135986176, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.6746339254264582, |
|
"grad_norm": 1.3119174242019653, |
|
"learning_rate": 1.7541160386140587e-05, |
|
"loss": 4.1846, |
|
"num_input_tokens_seen": 136313856, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.6786594877471948, |
|
"grad_norm": 1.2278319597244263, |
|
"learning_rate": 1.7520115195136116e-05, |
|
"loss": 4.1637, |
|
"num_input_tokens_seen": 136641536, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.6826850500679313, |
|
"grad_norm": 1.2323991060256958, |
|
"learning_rate": 1.7499145570392288e-05, |
|
"loss": 4.1943, |
|
"num_input_tokens_seen": 136969216, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.686710612388668, |
|
"grad_norm": 1.3337785005569458, |
|
"learning_rate": 1.747825106076687e-05, |
|
"loss": 4.2297, |
|
"num_input_tokens_seen": 137296896, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.6907361747094047, |
|
"grad_norm": 1.1806713342666626, |
|
"learning_rate": 1.7457431218879393e-05, |
|
"loss": 4.1971, |
|
"num_input_tokens_seen": 137624576, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.6907361747094047, |
|
"eval_accuracy": 0.3776899715899814, |
|
"eval_loss": 4.050466537475586, |
|
"eval_runtime": 6.6452, |
|
"eval_samples_per_second": 45.145, |
|
"eval_steps_per_second": 5.718, |
|
"num_input_tokens_seen": 137624576, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.6947617370301415, |
|
"grad_norm": 1.1816240549087524, |
|
"learning_rate": 1.7436685601070914e-05, |
|
"loss": 4.1944, |
|
"num_input_tokens_seen": 137952256, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.6987872993508781, |
|
"grad_norm": 1.258262276649475, |
|
"learning_rate": 1.7416013767364324e-05, |
|
"loss": 4.1668, |
|
"num_input_tokens_seen": 138279936, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.7028128616716147, |
|
"grad_norm": 1.2995336055755615, |
|
"learning_rate": 1.7395415281425125e-05, |
|
"loss": 4.2091, |
|
"num_input_tokens_seen": 138607616, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.7068384239923513, |
|
"grad_norm": 1.341255784034729, |
|
"learning_rate": 1.7374889710522778e-05, |
|
"loss": 4.1956, |
|
"num_input_tokens_seen": 138935296, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.7108639863130881, |
|
"grad_norm": 1.3196258544921875, |
|
"learning_rate": 1.7354436625492496e-05, |
|
"loss": 4.1887, |
|
"num_input_tokens_seen": 139262976, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.714889548633825, |
|
"grad_norm": 1.3612850904464722, |
|
"learning_rate": 1.7334055600697583e-05, |
|
"loss": 4.1805, |
|
"num_input_tokens_seen": 139590656, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.7189151109545615, |
|
"grad_norm": 1.259021282196045, |
|
"learning_rate": 1.73137462139922e-05, |
|
"loss": 4.2013, |
|
"num_input_tokens_seen": 139918336, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.722940673275298, |
|
"grad_norm": 1.226407766342163, |
|
"learning_rate": 1.729350804668468e-05, |
|
"loss": 4.1963, |
|
"num_input_tokens_seen": 140246016, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.7269662355960347, |
|
"grad_norm": 1.278818130493164, |
|
"learning_rate": 1.727334068350122e-05, |
|
"loss": 4.1719, |
|
"num_input_tokens_seen": 140573696, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.7309917979167715, |
|
"grad_norm": 1.2456446886062622, |
|
"learning_rate": 1.7253243712550147e-05, |
|
"loss": 4.2157, |
|
"num_input_tokens_seen": 140901376, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.7350173602375083, |
|
"grad_norm": 1.249311089515686, |
|
"learning_rate": 1.723321672528655e-05, |
|
"loss": 4.1854, |
|
"num_input_tokens_seen": 141229056, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.7390429225582449, |
|
"grad_norm": 1.3078937530517578, |
|
"learning_rate": 1.721325931647741e-05, |
|
"loss": 4.1911, |
|
"num_input_tokens_seen": 141556736, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.7430684848789815, |
|
"grad_norm": 1.2260226011276245, |
|
"learning_rate": 1.7193371084167158e-05, |
|
"loss": 4.1945, |
|
"num_input_tokens_seen": 141884416, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.747094047199718, |
|
"grad_norm": 1.2438477277755737, |
|
"learning_rate": 1.7173551629643676e-05, |
|
"loss": 4.1877, |
|
"num_input_tokens_seen": 142212096, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.7511196095204549, |
|
"grad_norm": 1.3562901020050049, |
|
"learning_rate": 1.7153800557404717e-05, |
|
"loss": 4.1662, |
|
"num_input_tokens_seen": 142539776, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.7551451718411917, |
|
"grad_norm": 1.2909834384918213, |
|
"learning_rate": 1.7134117475124774e-05, |
|
"loss": 4.1718, |
|
"num_input_tokens_seen": 142867456, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.7591707341619283, |
|
"grad_norm": 1.3816372156143188, |
|
"learning_rate": 1.7114501993622322e-05, |
|
"loss": 4.187, |
|
"num_input_tokens_seen": 143195136, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.7631962964826648, |
|
"grad_norm": 1.1970911026000977, |
|
"learning_rate": 1.7094953726827533e-05, |
|
"loss": 4.2293, |
|
"num_input_tokens_seen": 143522816, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.7672218588034014, |
|
"grad_norm": 1.300299882888794, |
|
"learning_rate": 1.707547229175031e-05, |
|
"loss": 4.1583, |
|
"num_input_tokens_seen": 143850496, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.7712474211241382, |
|
"grad_norm": 1.3158056735992432, |
|
"learning_rate": 1.7056057308448833e-05, |
|
"loss": 4.1986, |
|
"num_input_tokens_seen": 144178176, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.775272983444875, |
|
"grad_norm": 1.3262073993682861, |
|
"learning_rate": 1.7036708399998398e-05, |
|
"loss": 4.1911, |
|
"num_input_tokens_seen": 144505856, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.7792985457656116, |
|
"grad_norm": 1.2565948963165283, |
|
"learning_rate": 1.7017425192460684e-05, |
|
"loss": 4.1982, |
|
"num_input_tokens_seen": 144833536, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.7833241080863482, |
|
"grad_norm": 1.3026916980743408, |
|
"learning_rate": 1.699820731485341e-05, |
|
"loss": 4.1539, |
|
"num_input_tokens_seen": 145161216, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.787349670407085, |
|
"grad_norm": 1.1916109323501587, |
|
"learning_rate": 1.697905439912036e-05, |
|
"loss": 4.1723, |
|
"num_input_tokens_seen": 145488896, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.7913752327278216, |
|
"grad_norm": 1.2661770582199097, |
|
"learning_rate": 1.6959966080101762e-05, |
|
"loss": 4.1575, |
|
"num_input_tokens_seen": 145816576, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.7954007950485584, |
|
"grad_norm": 1.1987600326538086, |
|
"learning_rate": 1.694094199550507e-05, |
|
"loss": 4.1784, |
|
"num_input_tokens_seen": 146144256, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.799426357369295, |
|
"grad_norm": 1.2095016241073608, |
|
"learning_rate": 1.6921981785876064e-05, |
|
"loss": 4.1995, |
|
"num_input_tokens_seen": 146471936, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.8034519196900316, |
|
"grad_norm": 1.220253586769104, |
|
"learning_rate": 1.6903085094570334e-05, |
|
"loss": 4.1876, |
|
"num_input_tokens_seen": 146799616, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.8074774820107684, |
|
"grad_norm": 1.3598766326904297, |
|
"learning_rate": 1.6884251567725093e-05, |
|
"loss": 4.1952, |
|
"num_input_tokens_seen": 147127296, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.8115030443315052, |
|
"grad_norm": 1.2614084482192993, |
|
"learning_rate": 1.6865480854231357e-05, |
|
"loss": 4.1966, |
|
"num_input_tokens_seen": 147454976, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.8115030443315052, |
|
"eval_accuracy": 0.3787434709282334, |
|
"eval_loss": 4.016251564025879, |
|
"eval_runtime": 6.8672, |
|
"eval_samples_per_second": 43.686, |
|
"eval_steps_per_second": 5.534, |
|
"num_input_tokens_seen": 147454976, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.8155286066522418, |
|
"grad_norm": 1.1849647760391235, |
|
"learning_rate": 1.6846772605706453e-05, |
|
"loss": 4.1795, |
|
"num_input_tokens_seen": 147782656, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.8195541689729784, |
|
"grad_norm": 1.217572569847107, |
|
"learning_rate": 1.682812647646685e-05, |
|
"loss": 4.2173, |
|
"num_input_tokens_seen": 148110336, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.823579731293715, |
|
"grad_norm": 1.3434370756149292, |
|
"learning_rate": 1.6809542123501345e-05, |
|
"loss": 4.1826, |
|
"num_input_tokens_seen": 148438016, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.8276052936144518, |
|
"grad_norm": 1.2767517566680908, |
|
"learning_rate": 1.6791019206444543e-05, |
|
"loss": 4.1785, |
|
"num_input_tokens_seen": 148765696, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.8316308559351886, |
|
"grad_norm": 1.1992509365081787, |
|
"learning_rate": 1.6772557387550694e-05, |
|
"loss": 4.1884, |
|
"num_input_tokens_seen": 149093376, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.8356564182559252, |
|
"grad_norm": 1.1988239288330078, |
|
"learning_rate": 1.675415633166782e-05, |
|
"loss": 4.1726, |
|
"num_input_tokens_seen": 149421056, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.8396819805766618, |
|
"grad_norm": 1.3448524475097656, |
|
"learning_rate": 1.673581570621216e-05, |
|
"loss": 4.1467, |
|
"num_input_tokens_seen": 149748736, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.8437075428973984, |
|
"grad_norm": 1.2856345176696777, |
|
"learning_rate": 1.6717535181142915e-05, |
|
"loss": 4.1613, |
|
"num_input_tokens_seen": 150076416, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.8477331052181352, |
|
"grad_norm": 1.357901692390442, |
|
"learning_rate": 1.669931442893732e-05, |
|
"loss": 4.1587, |
|
"num_input_tokens_seen": 150404096, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.851758667538872, |
|
"grad_norm": 1.295551061630249, |
|
"learning_rate": 1.6681153124565983e-05, |
|
"loss": 4.1588, |
|
"num_input_tokens_seen": 150731776, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.8557842298596086, |
|
"grad_norm": 1.2811824083328247, |
|
"learning_rate": 1.6663050945468536e-05, |
|
"loss": 4.1839, |
|
"num_input_tokens_seen": 151059456, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.8598097921803451, |
|
"grad_norm": 1.3775153160095215, |
|
"learning_rate": 1.6645007571529582e-05, |
|
"loss": 4.2016, |
|
"num_input_tokens_seen": 151387136, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.8638353545010817, |
|
"grad_norm": 1.3014804124832153, |
|
"learning_rate": 1.6627022685054907e-05, |
|
"loss": 4.2082, |
|
"num_input_tokens_seen": 151714816, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.8678609168218185, |
|
"grad_norm": 1.2765071392059326, |
|
"learning_rate": 1.6609095970747994e-05, |
|
"loss": 4.1453, |
|
"num_input_tokens_seen": 152042496, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.8718864791425553, |
|
"grad_norm": 1.4745502471923828, |
|
"learning_rate": 1.6591227115686806e-05, |
|
"loss": 4.1856, |
|
"num_input_tokens_seen": 152370176, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.875912041463292, |
|
"grad_norm": 1.2726967334747314, |
|
"learning_rate": 1.6573415809300835e-05, |
|
"loss": 4.1977, |
|
"num_input_tokens_seen": 152697856, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.8799376037840285, |
|
"grad_norm": 1.286971092224121, |
|
"learning_rate": 1.6555661743348445e-05, |
|
"loss": 4.1962, |
|
"num_input_tokens_seen": 153025536, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.883963166104765, |
|
"grad_norm": 1.2794855833053589, |
|
"learning_rate": 1.6537964611894465e-05, |
|
"loss": 4.1606, |
|
"num_input_tokens_seen": 153353216, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.887988728425502, |
|
"grad_norm": 1.2264660596847534, |
|
"learning_rate": 1.652032411128802e-05, |
|
"loss": 4.1724, |
|
"num_input_tokens_seen": 153680896, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.8920142907462387, |
|
"grad_norm": 1.3459347486495972, |
|
"learning_rate": 1.6502739940140696e-05, |
|
"loss": 4.1663, |
|
"num_input_tokens_seen": 154008576, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.8960398530669753, |
|
"grad_norm": 1.284248948097229, |
|
"learning_rate": 1.6485211799304873e-05, |
|
"loss": 4.1927, |
|
"num_input_tokens_seen": 154336256, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.900065415387712, |
|
"grad_norm": 1.273288369178772, |
|
"learning_rate": 1.6467739391852368e-05, |
|
"loss": 4.1442, |
|
"num_input_tokens_seen": 154663936, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.9040909777084485, |
|
"grad_norm": 1.3548243045806885, |
|
"learning_rate": 1.6450322423053303e-05, |
|
"loss": 4.153, |
|
"num_input_tokens_seen": 154991616, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.9081165400291853, |
|
"grad_norm": 1.2603306770324707, |
|
"learning_rate": 1.6432960600355222e-05, |
|
"loss": 4.1805, |
|
"num_input_tokens_seen": 155319296, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.912142102349922, |
|
"grad_norm": 1.3440006971359253, |
|
"learning_rate": 1.6415653633362467e-05, |
|
"loss": 4.1954, |
|
"num_input_tokens_seen": 155646976, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.9161676646706587, |
|
"grad_norm": 1.385733723640442, |
|
"learning_rate": 1.6398401233815756e-05, |
|
"loss": 4.1677, |
|
"num_input_tokens_seen": 155974656, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.9201932269913953, |
|
"grad_norm": 1.408675193786621, |
|
"learning_rate": 1.638120311557203e-05, |
|
"loss": 4.1724, |
|
"num_input_tokens_seen": 156302336, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.924218789312132, |
|
"grad_norm": 1.3172813653945923, |
|
"learning_rate": 1.6364058994584528e-05, |
|
"loss": 4.1658, |
|
"num_input_tokens_seen": 156630016, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.9282443516328687, |
|
"grad_norm": 1.4357519149780273, |
|
"learning_rate": 1.6346968588883048e-05, |
|
"loss": 4.1538, |
|
"num_input_tokens_seen": 156957696, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.9322699139536055, |
|
"grad_norm": 1.3051438331604004, |
|
"learning_rate": 1.6329931618554523e-05, |
|
"loss": 4.16, |
|
"num_input_tokens_seen": 157285376, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9322699139536055, |
|
"eval_accuracy": 0.3773649363114314, |
|
"eval_loss": 4.035182952880859, |
|
"eval_runtime": 7.0048, |
|
"eval_samples_per_second": 42.828, |
|
"eval_steps_per_second": 5.425, |
|
"num_input_tokens_seen": 157285376, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.936295476274342, |
|
"grad_norm": 1.229457139968872, |
|
"learning_rate": 1.6312947805723712e-05, |
|
"loss": 4.1306, |
|
"num_input_tokens_seen": 157613056, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.9403210385950787, |
|
"grad_norm": 1.3241009712219238, |
|
"learning_rate": 1.629601687453421e-05, |
|
"loss": 4.1803, |
|
"num_input_tokens_seen": 157940736, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.9443466009158155, |
|
"grad_norm": 1.462347149848938, |
|
"learning_rate": 1.6279138551129596e-05, |
|
"loss": 4.1708, |
|
"num_input_tokens_seen": 158268416, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.9483721632365523, |
|
"grad_norm": 1.261893630027771, |
|
"learning_rate": 1.6262312563634837e-05, |
|
"loss": 4.1739, |
|
"num_input_tokens_seen": 158596096, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.9523977255572889, |
|
"grad_norm": 1.2686680555343628, |
|
"learning_rate": 1.624553864213791e-05, |
|
"loss": 4.1601, |
|
"num_input_tokens_seen": 158923776, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.9564232878780254, |
|
"grad_norm": 1.291801929473877, |
|
"learning_rate": 1.622881651867159e-05, |
|
"loss": 4.1971, |
|
"num_input_tokens_seen": 159251456, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.960448850198762, |
|
"grad_norm": 1.2599908113479614, |
|
"learning_rate": 1.6212145927195505e-05, |
|
"loss": 4.1876, |
|
"num_input_tokens_seen": 159579136, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.9644744125194988, |
|
"grad_norm": 1.2517198324203491, |
|
"learning_rate": 1.6195526603578323e-05, |
|
"loss": 4.2083, |
|
"num_input_tokens_seen": 159906816, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.9684999748402356, |
|
"grad_norm": 1.407639741897583, |
|
"learning_rate": 1.6178958285580194e-05, |
|
"loss": 4.1763, |
|
"num_input_tokens_seen": 160234496, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.9725255371609722, |
|
"grad_norm": 1.3766093254089355, |
|
"learning_rate": 1.6162440712835372e-05, |
|
"loss": 4.1656, |
|
"num_input_tokens_seen": 160562176, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.9765510994817088, |
|
"grad_norm": 1.201467514038086, |
|
"learning_rate": 1.6145973626835027e-05, |
|
"loss": 4.1317, |
|
"num_input_tokens_seen": 160889856, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.9805766618024454, |
|
"grad_norm": 1.245469093322754, |
|
"learning_rate": 1.6129556770910237e-05, |
|
"loss": 4.188, |
|
"num_input_tokens_seen": 161217536, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.9846022241231822, |
|
"grad_norm": 1.360424518585205, |
|
"learning_rate": 1.611318989021522e-05, |
|
"loss": 4.1817, |
|
"num_input_tokens_seen": 161545216, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.988627786443919, |
|
"grad_norm": 1.2917958498001099, |
|
"learning_rate": 1.6096872731710677e-05, |
|
"loss": 4.1912, |
|
"num_input_tokens_seen": 161872896, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.9926533487646556, |
|
"grad_norm": 1.311262607574463, |
|
"learning_rate": 1.6080605044147395e-05, |
|
"loss": 4.1897, |
|
"num_input_tokens_seen": 162200576, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.9966789110853922, |
|
"grad_norm": 1.233081340789795, |
|
"learning_rate": 1.606438657804998e-05, |
|
"loss": 4.1536, |
|
"num_input_tokens_seen": 162528256, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.9998993609419817, |
|
"num_input_tokens_seen": 162790400, |
|
"step": 2484, |
|
"total_flos": 1.059416318592e+16, |
|
"train_loss": 4.377665276304727, |
|
"train_runtime": 2972.3443, |
|
"train_samples_per_second": 106.974, |
|
"train_steps_per_second": 0.836, |
|
"train_tokens_per_second": 54770.764 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2484, |
|
"num_input_tokens_seen": 162790400, |
|
"num_train_epochs": 2, |
|
"save_steps": 200, |
|
"total_flos": 1.059416318592e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|