|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 23445, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06397952655150352, |
|
"grad_norm": 1.3757662773132324, |
|
"learning_rate": 0.0002987204094689699, |
|
"loss": 2.6581, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12795905310300704, |
|
"grad_norm": 1.3149511814117432, |
|
"learning_rate": 0.0002974408189379398, |
|
"loss": 2.2938, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19193857965451055, |
|
"grad_norm": 1.3127187490463257, |
|
"learning_rate": 0.00029617402431222006, |
|
"loss": 2.2174, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2559181062060141, |
|
"grad_norm": 1.1597858667373657, |
|
"learning_rate": 0.00029489443378119, |
|
"loss": 2.1893, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3198976327575176, |
|
"grad_norm": 1.3044805526733398, |
|
"learning_rate": 0.00029361484325015995, |
|
"loss": 2.173, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3838771593090211, |
|
"grad_norm": 1.2641485929489136, |
|
"learning_rate": 0.00029233525271912987, |
|
"loss": 2.1609, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.44785668586052463, |
|
"grad_norm": 1.2622572183609009, |
|
"learning_rate": 0.0002910556621880998, |
|
"loss": 2.1425, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5118362124120281, |
|
"grad_norm": 1.181531548500061, |
|
"learning_rate": 0.0002897760716570697, |
|
"loss": 2.1016, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5758157389635317, |
|
"grad_norm": 1.1225402355194092, |
|
"learning_rate": 0.0002884964811260397, |
|
"loss": 2.087, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6397952655150352, |
|
"grad_norm": 1.2168927192687988, |
|
"learning_rate": 0.00028721689059500955, |
|
"loss": 2.1028, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7037747920665387, |
|
"grad_norm": 1.1682486534118652, |
|
"learning_rate": 0.0002859373000639795, |
|
"loss": 2.0909, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7677543186180422, |
|
"grad_norm": 1.1344854831695557, |
|
"learning_rate": 0.00028465770953294944, |
|
"loss": 2.0896, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8317338451695457, |
|
"grad_norm": 1.1923776865005493, |
|
"learning_rate": 0.00028337811900191936, |
|
"loss": 2.0488, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8957133717210493, |
|
"grad_norm": 1.3097106218338013, |
|
"learning_rate": 0.0002820985284708893, |
|
"loss": 2.0488, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"grad_norm": 1.1138262748718262, |
|
"learning_rate": 0.0002808189379398592, |
|
"loss": 2.0354, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0236724248240563, |
|
"grad_norm": 1.2939698696136475, |
|
"learning_rate": 0.00027953934740882917, |
|
"loss": 1.9789, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0876519513755598, |
|
"grad_norm": 1.2986066341400146, |
|
"learning_rate": 0.0002782597568777991, |
|
"loss": 1.9117, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.1516314779270633, |
|
"grad_norm": 1.36199152469635, |
|
"learning_rate": 0.000276980166346769, |
|
"loss": 1.9463, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2156110044785668, |
|
"grad_norm": 1.1844524145126343, |
|
"learning_rate": 0.00027570057581573893, |
|
"loss": 1.9163, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.2795905310300704, |
|
"grad_norm": 1.3748372793197632, |
|
"learning_rate": 0.0002744209852847089, |
|
"loss": 1.9157, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3435700575815739, |
|
"grad_norm": 1.3857694864273071, |
|
"learning_rate": 0.00027314139475367877, |
|
"loss": 1.9234, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.4075495841330774, |
|
"grad_norm": 1.244759202003479, |
|
"learning_rate": 0.00027186180422264874, |
|
"loss": 1.9202, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.471529110684581, |
|
"grad_norm": 1.3650200366973877, |
|
"learning_rate": 0.00027058221369161866, |
|
"loss": 1.9262, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.5355086372360844, |
|
"grad_norm": 1.3304725885391235, |
|
"learning_rate": 0.0002693026231605886, |
|
"loss": 1.9225, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.599488163787588, |
|
"grad_norm": 1.2908028364181519, |
|
"learning_rate": 0.0002680230326295585, |
|
"loss": 1.9448, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.6634676903390915, |
|
"grad_norm": 1.4424597024917603, |
|
"learning_rate": 0.00026674344209852847, |
|
"loss": 1.9327, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.727447216890595, |
|
"grad_norm": 1.3038992881774902, |
|
"learning_rate": 0.0002654638515674984, |
|
"loss": 1.9156, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.7914267434420985, |
|
"grad_norm": 1.2507870197296143, |
|
"learning_rate": 0.0002641842610364683, |
|
"loss": 1.9111, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.855406269993602, |
|
"grad_norm": 1.3444691896438599, |
|
"learning_rate": 0.00026290467050543823, |
|
"loss": 1.9101, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.9193857965451055, |
|
"grad_norm": 1.3593779802322388, |
|
"learning_rate": 0.00026162507997440815, |
|
"loss": 1.9322, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.983365323096609, |
|
"grad_norm": 1.3340197801589966, |
|
"learning_rate": 0.0002603454894433781, |
|
"loss": 1.9361, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.0473448496481126, |
|
"grad_norm": 1.5682820081710815, |
|
"learning_rate": 0.000259065898912348, |
|
"loss": 1.8007, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.111324376199616, |
|
"grad_norm": 1.5191752910614014, |
|
"learning_rate": 0.00025778630838131796, |
|
"loss": 1.7626, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.1753039027511196, |
|
"grad_norm": 1.614561915397644, |
|
"learning_rate": 0.0002565067178502879, |
|
"loss": 1.7621, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.239283429302623, |
|
"grad_norm": 1.4934717416763306, |
|
"learning_rate": 0.00025522712731925785, |
|
"loss": 1.7753, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.3032629558541267, |
|
"grad_norm": 1.6563340425491333, |
|
"learning_rate": 0.0002539475367882277, |
|
"loss": 1.771, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.36724248240563, |
|
"grad_norm": 1.502406120300293, |
|
"learning_rate": 0.0002526679462571977, |
|
"loss": 1.7823, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.4312220089571337, |
|
"grad_norm": 1.6300321817398071, |
|
"learning_rate": 0.0002513883557261676, |
|
"loss": 1.7792, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.495201535508637, |
|
"grad_norm": 1.5266133546829224, |
|
"learning_rate": 0.00025010876519513753, |
|
"loss": 1.7994, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.5591810620601407, |
|
"grad_norm": 1.5854878425598145, |
|
"learning_rate": 0.00024882917466410745, |
|
"loss": 1.7947, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.6231605886116443, |
|
"grad_norm": 1.4971704483032227, |
|
"learning_rate": 0.00024754958413307737, |
|
"loss": 1.8, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.6871401151631478, |
|
"grad_norm": 1.4385017156600952, |
|
"learning_rate": 0.00024626999360204734, |
|
"loss": 1.8041, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.7511196417146513, |
|
"grad_norm": 1.526516079902649, |
|
"learning_rate": 0.00024499040307101726, |
|
"loss": 1.8172, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.815099168266155, |
|
"grad_norm": 1.5419269800186157, |
|
"learning_rate": 0.00024371081253998718, |
|
"loss": 1.7861, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.8790786948176583, |
|
"grad_norm": 1.6025625467300415, |
|
"learning_rate": 0.00024243122200895713, |
|
"loss": 1.8099, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.943058221369162, |
|
"grad_norm": 1.6059303283691406, |
|
"learning_rate": 0.00024115163147792705, |
|
"loss": 1.8057, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.0070377479206654, |
|
"grad_norm": 1.4824891090393066, |
|
"learning_rate": 0.00023987204094689697, |
|
"loss": 1.7666, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.071017274472169, |
|
"grad_norm": 1.7200456857681274, |
|
"learning_rate": 0.00023859245041586688, |
|
"loss": 1.5827, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.1349968010236724, |
|
"grad_norm": 1.7883553504943848, |
|
"learning_rate": 0.00023731285988483683, |
|
"loss": 1.6154, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.198976327575176, |
|
"grad_norm": 1.7612642049789429, |
|
"learning_rate": 0.00023604606525911705, |
|
"loss": 1.6331, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.2629558541266794, |
|
"grad_norm": 1.8273234367370605, |
|
"learning_rate": 0.000234766474728087, |
|
"loss": 1.6489, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.326935380678183, |
|
"grad_norm": 1.9144798517227173, |
|
"learning_rate": 0.00023348688419705691, |
|
"loss": 1.6451, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.3909149072296865, |
|
"grad_norm": 1.7729228734970093, |
|
"learning_rate": 0.00023220729366602686, |
|
"loss": 1.6504, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.45489443378119, |
|
"grad_norm": 1.7989096641540527, |
|
"learning_rate": 0.00023092770313499675, |
|
"loss": 1.6774, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.5188739603326935, |
|
"grad_norm": 1.9125975370407104, |
|
"learning_rate": 0.0002296481126039667, |
|
"loss": 1.6734, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.582853486884197, |
|
"grad_norm": 1.764624834060669, |
|
"learning_rate": 0.00022836852207293665, |
|
"loss": 1.6829, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.6468330134357005, |
|
"grad_norm": 1.8827048540115356, |
|
"learning_rate": 0.0002270889315419066, |
|
"loss": 1.6917, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.710812539987204, |
|
"grad_norm": 1.7637380361557007, |
|
"learning_rate": 0.00022580934101087648, |
|
"loss": 1.6957, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.7747920665387076, |
|
"grad_norm": 1.9357694387435913, |
|
"learning_rate": 0.00022452975047984643, |
|
"loss": 1.7071, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.838771593090211, |
|
"grad_norm": 1.8760075569152832, |
|
"learning_rate": 0.00022325015994881638, |
|
"loss": 1.689, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.9027511196417146, |
|
"grad_norm": 1.83319890499115, |
|
"learning_rate": 0.0002219705694177863, |
|
"loss": 1.7135, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 3.966730646193218, |
|
"grad_norm": 1.8324424028396606, |
|
"learning_rate": 0.00022069097888675622, |
|
"loss": 1.7052, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.030710172744722, |
|
"grad_norm": 1.772605538368225, |
|
"learning_rate": 0.00021941138835572613, |
|
"loss": 1.6098, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.094689699296225, |
|
"grad_norm": 1.936546802520752, |
|
"learning_rate": 0.00021813179782469608, |
|
"loss": 1.4814, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 4.158669225847729, |
|
"grad_norm": 2.055730104446411, |
|
"learning_rate": 0.000216852207293666, |
|
"loss": 1.5106, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.222648752399232, |
|
"grad_norm": 2.3238749504089355, |
|
"learning_rate": 0.00021557261676263592, |
|
"loss": 1.5136, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 4.286628278950736, |
|
"grad_norm": 1.9317381381988525, |
|
"learning_rate": 0.00021429302623160587, |
|
"loss": 1.5307, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.350607805502239, |
|
"grad_norm": 2.057237386703491, |
|
"learning_rate": 0.0002130134357005758, |
|
"loss": 1.5524, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.414587332053743, |
|
"grad_norm": 2.2331132888793945, |
|
"learning_rate": 0.0002117338451695457, |
|
"loss": 1.5674, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 4.478566858605246, |
|
"grad_norm": 2.0981638431549072, |
|
"learning_rate": 0.00021046705054382595, |
|
"loss": 1.5627, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.54254638515675, |
|
"grad_norm": 2.1681783199310303, |
|
"learning_rate": 0.0002091874600127959, |
|
"loss": 1.5615, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 4.606525911708253, |
|
"grad_norm": 2.0720126628875732, |
|
"learning_rate": 0.00020790786948176584, |
|
"loss": 1.5598, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 4.670505438259757, |
|
"grad_norm": 1.9248210191726685, |
|
"learning_rate": 0.00020662827895073573, |
|
"loss": 1.5712, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 4.73448496481126, |
|
"grad_norm": 1.9172708988189697, |
|
"learning_rate": 0.00020534868841970568, |
|
"loss": 1.5629, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 4.798464491362764, |
|
"grad_norm": 2.141303539276123, |
|
"learning_rate": 0.00020406909788867563, |
|
"loss": 1.5871, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.862444017914267, |
|
"grad_norm": 1.9447873830795288, |
|
"learning_rate": 0.00020278950735764552, |
|
"loss": 1.6006, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 4.926423544465771, |
|
"grad_norm": 2.089735984802246, |
|
"learning_rate": 0.00020150991682661547, |
|
"loss": 1.5995, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 4.990403071017274, |
|
"grad_norm": 2.056344985961914, |
|
"learning_rate": 0.00020023032629558539, |
|
"loss": 1.6122, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 5.054382597568778, |
|
"grad_norm": 2.0974326133728027, |
|
"learning_rate": 0.00019895073576455533, |
|
"loss": 1.4254, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 5.1183621241202815, |
|
"grad_norm": 2.250195264816284, |
|
"learning_rate": 0.00019767114523352525, |
|
"loss": 1.3966, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 5.182341650671785, |
|
"grad_norm": 2.278472900390625, |
|
"learning_rate": 0.00019639155470249517, |
|
"loss": 1.4051, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 5.2463211772232885, |
|
"grad_norm": 2.3049135208129883, |
|
"learning_rate": 0.00019511196417146512, |
|
"loss": 1.4147, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 5.310300703774792, |
|
"grad_norm": 2.423823833465576, |
|
"learning_rate": 0.00019383237364043506, |
|
"loss": 1.4283, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 5.3742802303262955, |
|
"grad_norm": 2.4221420288085938, |
|
"learning_rate": 0.00019255278310940495, |
|
"loss": 1.4538, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 5.438259756877799, |
|
"grad_norm": 2.3886525630950928, |
|
"learning_rate": 0.0001912731925783749, |
|
"loss": 1.4435, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 5.502239283429303, |
|
"grad_norm": 2.4025745391845703, |
|
"learning_rate": 0.00018999360204734485, |
|
"loss": 1.4512, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 5.566218809980806, |
|
"grad_norm": 2.312255382537842, |
|
"learning_rate": 0.00018871401151631477, |
|
"loss": 1.4644, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 5.63019833653231, |
|
"grad_norm": 2.2402162551879883, |
|
"learning_rate": 0.00018743442098528469, |
|
"loss": 1.4844, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 5.694177863083813, |
|
"grad_norm": 2.3729898929595947, |
|
"learning_rate": 0.00018615483045425463, |
|
"loss": 1.4749, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 5.758157389635317, |
|
"grad_norm": 2.137364149093628, |
|
"learning_rate": 0.00018488803582853488, |
|
"loss": 1.4875, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 5.82213691618682, |
|
"grad_norm": 2.404008626937866, |
|
"learning_rate": 0.00018360844529750477, |
|
"loss": 1.4954, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 5.886116442738324, |
|
"grad_norm": 2.167051076889038, |
|
"learning_rate": 0.00018232885476647472, |
|
"loss": 1.4923, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 5.950095969289827, |
|
"grad_norm": 2.2124693393707275, |
|
"learning_rate": 0.00018104926423544464, |
|
"loss": 1.4972, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 6.014075495841331, |
|
"grad_norm": 2.4574244022369385, |
|
"learning_rate": 0.00017976967370441458, |
|
"loss": 1.4535, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 6.078055022392834, |
|
"grad_norm": 2.2989892959594727, |
|
"learning_rate": 0.0001784900831733845, |
|
"loss": 1.272, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 6.142034548944338, |
|
"grad_norm": 2.8099708557128906, |
|
"learning_rate": 0.00017721049264235442, |
|
"loss": 1.3033, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 6.206014075495841, |
|
"grad_norm": 2.516444206237793, |
|
"learning_rate": 0.00017593090211132437, |
|
"loss": 1.3066, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 6.269993602047345, |
|
"grad_norm": 2.617293357849121, |
|
"learning_rate": 0.0001746513115802943, |
|
"loss": 1.3263, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 6.333973128598848, |
|
"grad_norm": 2.6873817443847656, |
|
"learning_rate": 0.0001733717210492642, |
|
"loss": 1.3415, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 6.397952655150352, |
|
"grad_norm": 2.558847427368164, |
|
"learning_rate": 0.00017209213051823415, |
|
"loss": 1.3599, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 6.461932181701855, |
|
"grad_norm": 2.6037933826446533, |
|
"learning_rate": 0.0001708125399872041, |
|
"loss": 1.3486, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 6.525911708253359, |
|
"grad_norm": 2.470381259918213, |
|
"learning_rate": 0.000169532949456174, |
|
"loss": 1.3639, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 6.589891234804862, |
|
"grad_norm": 2.6497058868408203, |
|
"learning_rate": 0.00016825335892514394, |
|
"loss": 1.365, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 6.653870761356366, |
|
"grad_norm": 2.8465592861175537, |
|
"learning_rate": 0.00016697376839411388, |
|
"loss": 1.3754, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 6.717850287907869, |
|
"grad_norm": 2.4625210762023926, |
|
"learning_rate": 0.0001656941778630838, |
|
"loss": 1.3743, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 6.781829814459373, |
|
"grad_norm": 2.633486747741699, |
|
"learning_rate": 0.00016441458733205372, |
|
"loss": 1.3786, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 6.8458093410108765, |
|
"grad_norm": 2.799623727798462, |
|
"learning_rate": 0.00016313499680102367, |
|
"loss": 1.3837, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 6.90978886756238, |
|
"grad_norm": 2.4427671432495117, |
|
"learning_rate": 0.0001618554062699936, |
|
"loss": 1.4229, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 6.9737683941138835, |
|
"grad_norm": 2.680490016937256, |
|
"learning_rate": 0.00016057581573896353, |
|
"loss": 1.3956, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 7.037747920665387, |
|
"grad_norm": 2.6439030170440674, |
|
"learning_rate": 0.00015929622520793343, |
|
"loss": 1.2917, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 7.1017274472168905, |
|
"grad_norm": 2.618804454803467, |
|
"learning_rate": 0.00015802943058221367, |
|
"loss": 1.2111, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 7.165706973768394, |
|
"grad_norm": 2.7387425899505615, |
|
"learning_rate": 0.00015674984005118362, |
|
"loss": 1.2036, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 7.229686500319898, |
|
"grad_norm": 2.7440712451934814, |
|
"learning_rate": 0.00015547024952015354, |
|
"loss": 1.2316, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 7.293666026871401, |
|
"grad_norm": 2.687152147293091, |
|
"learning_rate": 0.00015419065898912346, |
|
"loss": 1.2214, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 7.357645553422905, |
|
"grad_norm": 2.7659342288970947, |
|
"learning_rate": 0.0001529110684580934, |
|
"loss": 1.2316, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 7.421625079974408, |
|
"grad_norm": 2.877668857574463, |
|
"learning_rate": 0.00015163147792706335, |
|
"loss": 1.2567, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 7.485604606525912, |
|
"grad_norm": 2.554941415786743, |
|
"learning_rate": 0.00015035188739603324, |
|
"loss": 1.2661, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 7.549584133077415, |
|
"grad_norm": 2.9956531524658203, |
|
"learning_rate": 0.0001490722968650032, |
|
"loss": 1.2855, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 7.613563659628919, |
|
"grad_norm": 2.6620912551879883, |
|
"learning_rate": 0.00014779270633397313, |
|
"loss": 1.288, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 7.677543186180422, |
|
"grad_norm": 2.637007713317871, |
|
"learning_rate": 0.00014651311580294305, |
|
"loss": 1.2665, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 7.741522712731926, |
|
"grad_norm": 2.8605268001556396, |
|
"learning_rate": 0.00014523352527191297, |
|
"loss": 1.2897, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 7.805502239283429, |
|
"grad_norm": 2.8997604846954346, |
|
"learning_rate": 0.0001439539347408829, |
|
"loss": 1.3052, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 7.869481765834933, |
|
"grad_norm": 2.889934778213501, |
|
"learning_rate": 0.00014267434420985284, |
|
"loss": 1.2934, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 7.933461292386436, |
|
"grad_norm": 2.7243242263793945, |
|
"learning_rate": 0.00014139475367882276, |
|
"loss": 1.3035, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 7.99744081893794, |
|
"grad_norm": 2.7266464233398438, |
|
"learning_rate": 0.00014011516314779268, |
|
"loss": 1.3032, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 8.061420345489443, |
|
"grad_norm": 2.5178606510162354, |
|
"learning_rate": 0.00013883557261676262, |
|
"loss": 1.125, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 8.125399872040948, |
|
"grad_norm": 2.5766046047210693, |
|
"learning_rate": 0.00013755598208573254, |
|
"loss": 1.1352, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 8.18937939859245, |
|
"grad_norm": 2.6406075954437256, |
|
"learning_rate": 0.0001362763915547025, |
|
"loss": 1.1377, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 8.253358925143955, |
|
"grad_norm": 2.6050360202789307, |
|
"learning_rate": 0.0001349968010236724, |
|
"loss": 1.1539, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 8.317338451695457, |
|
"grad_norm": 3.0384466648101807, |
|
"learning_rate": 0.00013371721049264235, |
|
"loss": 1.1446, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 8.381317978246962, |
|
"grad_norm": 2.728938102722168, |
|
"learning_rate": 0.00013245041586692257, |
|
"loss": 1.154, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 8.445297504798464, |
|
"grad_norm": 2.723478078842163, |
|
"learning_rate": 0.00013117082533589252, |
|
"loss": 1.1583, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 8.509277031349969, |
|
"grad_norm": 2.9142537117004395, |
|
"learning_rate": 0.00012989123480486244, |
|
"loss": 1.1697, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 8.573256557901471, |
|
"grad_norm": 3.0678508281707764, |
|
"learning_rate": 0.00012861164427383238, |
|
"loss": 1.1759, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 8.637236084452976, |
|
"grad_norm": 2.6915736198425293, |
|
"learning_rate": 0.0001273320537428023, |
|
"loss": 1.18, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 8.701215611004478, |
|
"grad_norm": 2.8362019062042236, |
|
"learning_rate": 0.00012605246321177222, |
|
"loss": 1.1975, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 8.765195137555983, |
|
"grad_norm": 2.8301174640655518, |
|
"learning_rate": 0.00012477287268074214, |
|
"loss": 1.2049, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 8.829174664107486, |
|
"grad_norm": 2.866494655609131, |
|
"learning_rate": 0.00012350607805502239, |
|
"loss": 1.1916, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 8.89315419065899, |
|
"grad_norm": 3.0079123973846436, |
|
"learning_rate": 0.0001222264875239923, |
|
"loss": 1.1894, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 8.957133717210493, |
|
"grad_norm": 2.8487589359283447, |
|
"learning_rate": 0.00012094689699296224, |
|
"loss": 1.2077, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 9.021113243761997, |
|
"grad_norm": 2.8493270874023438, |
|
"learning_rate": 0.00011966730646193218, |
|
"loss": 1.1668, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 9.0850927703135, |
|
"grad_norm": 2.9474496841430664, |
|
"learning_rate": 0.0001183877159309021, |
|
"loss": 1.035, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 9.149072296865004, |
|
"grad_norm": 2.907160758972168, |
|
"learning_rate": 0.00011710812539987204, |
|
"loss": 1.0505, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 9.213051823416507, |
|
"grad_norm": 2.9124813079833984, |
|
"learning_rate": 0.00011582853486884196, |
|
"loss": 1.0573, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 9.277031349968011, |
|
"grad_norm": 2.925597906112671, |
|
"learning_rate": 0.00011454894433781189, |
|
"loss": 1.0654, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 9.341010876519514, |
|
"grad_norm": 2.844844102859497, |
|
"learning_rate": 0.00011326935380678182, |
|
"loss": 1.0703, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 9.404990403071018, |
|
"grad_norm": 3.0153064727783203, |
|
"learning_rate": 0.00011198976327575174, |
|
"loss": 1.0727, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 9.46896992962252, |
|
"grad_norm": 3.0144588947296143, |
|
"learning_rate": 0.00011071017274472169, |
|
"loss": 1.0791, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 9.532949456174025, |
|
"grad_norm": 3.219855308532715, |
|
"learning_rate": 0.0001094305822136916, |
|
"loss": 1.0921, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 9.596928982725528, |
|
"grad_norm": 3.0687131881713867, |
|
"learning_rate": 0.00010815099168266154, |
|
"loss": 1.0968, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 9.660908509277032, |
|
"grad_norm": 2.7260611057281494, |
|
"learning_rate": 0.00010687140115163146, |
|
"loss": 1.0924, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 9.724888035828535, |
|
"grad_norm": 3.266075372695923, |
|
"learning_rate": 0.0001055918106206014, |
|
"loss": 1.0871, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 9.78886756238004, |
|
"grad_norm": 2.816058874130249, |
|
"learning_rate": 0.00010431222008957132, |
|
"loss": 1.1183, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 9.852847088931542, |
|
"grad_norm": 2.7959651947021484, |
|
"learning_rate": 0.00010303262955854126, |
|
"loss": 1.1188, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 9.916826615483046, |
|
"grad_norm": 2.996344566345215, |
|
"learning_rate": 0.00010175303902751119, |
|
"loss": 1.1112, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 9.980806142034549, |
|
"grad_norm": 3.110300302505493, |
|
"learning_rate": 0.00010047344849648112, |
|
"loss": 1.13, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 10.044785668586053, |
|
"grad_norm": 2.8949873447418213, |
|
"learning_rate": 9.919385796545104e-05, |
|
"loss": 1.0038, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 10.108765195137556, |
|
"grad_norm": 2.7868170738220215, |
|
"learning_rate": 9.791426743442099e-05, |
|
"loss": 0.9668, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 10.17274472168906, |
|
"grad_norm": 3.0076348781585693, |
|
"learning_rate": 9.663467690339091e-05, |
|
"loss": 0.9592, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 10.236724248240563, |
|
"grad_norm": 3.313863515853882, |
|
"learning_rate": 9.535508637236083e-05, |
|
"loss": 0.9923, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 10.300703774792067, |
|
"grad_norm": 3.0162906646728516, |
|
"learning_rate": 9.407549584133077e-05, |
|
"loss": 0.9932, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 10.36468330134357, |
|
"grad_norm": 3.1555402278900146, |
|
"learning_rate": 9.279590531030069e-05, |
|
"loss": 0.9909, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 10.428662827895074, |
|
"grad_norm": 2.6180832386016846, |
|
"learning_rate": 9.151631477927062e-05, |
|
"loss": 1.0052, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 10.492642354446577, |
|
"grad_norm": 3.2279481887817383, |
|
"learning_rate": 9.023672424824054e-05, |
|
"loss": 1.0014, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 10.556621880998081, |
|
"grad_norm": 3.1148877143859863, |
|
"learning_rate": 8.895713371721049e-05, |
|
"loss": 1.0179, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 10.620601407549584, |
|
"grad_norm": 2.617116928100586, |
|
"learning_rate": 8.767754318618041e-05, |
|
"loss": 1.0234, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 10.684580934101088, |
|
"grad_norm": 3.179914951324463, |
|
"learning_rate": 8.639795265515034e-05, |
|
"loss": 1.018, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 10.748560460652591, |
|
"grad_norm": 3.002013921737671, |
|
"learning_rate": 8.511836212412028e-05, |
|
"loss": 1.03, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 10.812539987204095, |
|
"grad_norm": 3.1604723930358887, |
|
"learning_rate": 8.383877159309021e-05, |
|
"loss": 1.0289, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 10.876519513755598, |
|
"grad_norm": 3.0055463314056396, |
|
"learning_rate": 8.255918106206013e-05, |
|
"loss": 1.0244, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 10.940499040307103, |
|
"grad_norm": 3.2063984870910645, |
|
"learning_rate": 8.127959053103007e-05, |
|
"loss": 1.0246, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 11.004478566858605, |
|
"grad_norm": 2.9076807498931885, |
|
"learning_rate": 7.999999999999999e-05, |
|
"loss": 1.0284, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 11.06845809341011, |
|
"grad_norm": 2.873387098312378, |
|
"learning_rate": 7.872040946896993e-05, |
|
"loss": 0.8981, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 11.132437619961612, |
|
"grad_norm": 3.000307083129883, |
|
"learning_rate": 7.744081893793986e-05, |
|
"loss": 0.8994, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 11.196417146513117, |
|
"grad_norm": 2.965081214904785, |
|
"learning_rate": 7.616122840690978e-05, |
|
"loss": 0.9159, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 11.26039667306462, |
|
"grad_norm": 2.9889376163482666, |
|
"learning_rate": 7.488163787587971e-05, |
|
"loss": 0.9196, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 11.324376199616124, |
|
"grad_norm": 3.2027907371520996, |
|
"learning_rate": 7.360204734484964e-05, |
|
"loss": 0.9258, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 11.388355726167626, |
|
"grad_norm": 2.8633768558502197, |
|
"learning_rate": 7.233525271912987e-05, |
|
"loss": 0.9217, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 11.45233525271913, |
|
"grad_norm": 3.171734571456909, |
|
"learning_rate": 7.10556621880998e-05, |
|
"loss": 0.92, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 11.516314779270633, |
|
"grad_norm": 3.0337626934051514, |
|
"learning_rate": 6.977607165706973e-05, |
|
"loss": 0.9361, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 11.580294305822138, |
|
"grad_norm": 3.2068841457366943, |
|
"learning_rate": 6.849648112603966e-05, |
|
"loss": 0.9255, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 11.64427383237364, |
|
"grad_norm": 3.1199960708618164, |
|
"learning_rate": 6.721689059500959e-05, |
|
"loss": 0.9362, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 11.708253358925145, |
|
"grad_norm": 3.16876220703125, |
|
"learning_rate": 6.593730006397953e-05, |
|
"loss": 0.9509, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 11.772232885476647, |
|
"grad_norm": 2.9640047550201416, |
|
"learning_rate": 6.465770953294944e-05, |
|
"loss": 0.9338, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 11.836212412028152, |
|
"grad_norm": 3.112344980239868, |
|
"learning_rate": 6.337811900191938e-05, |
|
"loss": 0.9417, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 11.900191938579654, |
|
"grad_norm": 3.0994222164154053, |
|
"learning_rate": 6.211132437619961e-05, |
|
"loss": 0.9573, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 11.964171465131159, |
|
"grad_norm": 3.336512327194214, |
|
"learning_rate": 6.0831733845169535e-05, |
|
"loss": 0.9523, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 12.028150991682661, |
|
"grad_norm": 2.5776188373565674, |
|
"learning_rate": 5.955214331413947e-05, |
|
"loss": 0.9043, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 12.092130518234166, |
|
"grad_norm": 2.4695932865142822, |
|
"learning_rate": 5.8272552783109394e-05, |
|
"loss": 0.8316, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 12.156110044785668, |
|
"grad_norm": 2.912343740463257, |
|
"learning_rate": 5.699296225207933e-05, |
|
"loss": 0.8499, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 12.220089571337173, |
|
"grad_norm": 2.9618356227874756, |
|
"learning_rate": 5.571337172104926e-05, |
|
"loss": 0.8406, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 12.284069097888676, |
|
"grad_norm": 2.899482011795044, |
|
"learning_rate": 5.4433781190019186e-05, |
|
"loss": 0.8472, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 12.34804862444018, |
|
"grad_norm": 3.3630785942077637, |
|
"learning_rate": 5.315419065898912e-05, |
|
"loss": 0.8585, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 12.412028150991683, |
|
"grad_norm": 3.245290517807007, |
|
"learning_rate": 5.187460012795905e-05, |
|
"loss": 0.8593, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 12.476007677543187, |
|
"grad_norm": 3.1242306232452393, |
|
"learning_rate": 5.059500959692898e-05, |
|
"loss": 0.8659, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 12.53998720409469, |
|
"grad_norm": 3.265775442123413, |
|
"learning_rate": 4.931541906589891e-05, |
|
"loss": 0.879, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 12.603966730646194, |
|
"grad_norm": 2.995530843734741, |
|
"learning_rate": 4.8035828534868836e-05, |
|
"loss": 0.8688, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 12.667946257197697, |
|
"grad_norm": 3.1246113777160645, |
|
"learning_rate": 4.675623800383877e-05, |
|
"loss": 0.8759, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 12.731925783749201, |
|
"grad_norm": 3.249753713607788, |
|
"learning_rate": 4.54766474728087e-05, |
|
"loss": 0.8663, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 12.795905310300704, |
|
"grad_norm": 3.2970869541168213, |
|
"learning_rate": 4.419705694177863e-05, |
|
"loss": 0.8618, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 12.859884836852208, |
|
"grad_norm": 3.212738275527954, |
|
"learning_rate": 4.291746641074856e-05, |
|
"loss": 0.8709, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 12.92386436340371, |
|
"grad_norm": 3.064932107925415, |
|
"learning_rate": 4.163787587971848e-05, |
|
"loss": 0.8756, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 12.987843889955215, |
|
"grad_norm": 3.0357518196105957, |
|
"learning_rate": 4.035828534868841e-05, |
|
"loss": 0.8792, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 13.051823416506718, |
|
"grad_norm": 2.5418007373809814, |
|
"learning_rate": 3.9078694817658345e-05, |
|
"loss": 0.8018, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 13.115802943058222, |
|
"grad_norm": 3.088637590408325, |
|
"learning_rate": 3.779910428662827e-05, |
|
"loss": 0.7951, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 13.179782469609725, |
|
"grad_norm": 3.0088999271392822, |
|
"learning_rate": 3.6519513755598204e-05, |
|
"loss": 0.7759, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 13.24376199616123, |
|
"grad_norm": 2.929150104522705, |
|
"learning_rate": 3.523992322456814e-05, |
|
"loss": 0.7967, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 13.307741522712732, |
|
"grad_norm": 2.795482873916626, |
|
"learning_rate": 3.396033269353806e-05, |
|
"loss": 0.8023, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 13.371721049264236, |
|
"grad_norm": 2.998296022415161, |
|
"learning_rate": 3.2680742162507996e-05, |
|
"loss": 0.7929, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 13.435700575815739, |
|
"grad_norm": 3.09736967086792, |
|
"learning_rate": 3.140115163147792e-05, |
|
"loss": 0.801, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 13.499680102367243, |
|
"grad_norm": 3.1578280925750732, |
|
"learning_rate": 3.0134357005758153e-05, |
|
"loss": 0.8051, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 13.563659628918746, |
|
"grad_norm": 3.167719841003418, |
|
"learning_rate": 2.8854766474728086e-05, |
|
"loss": 0.8035, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 13.62763915547025, |
|
"grad_norm": 3.2616937160491943, |
|
"learning_rate": 2.7575175943698016e-05, |
|
"loss": 0.8152, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 13.691618682021753, |
|
"grad_norm": 3.2310030460357666, |
|
"learning_rate": 2.6295585412667945e-05, |
|
"loss": 0.8136, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 13.755598208573257, |
|
"grad_norm": 3.1108453273773193, |
|
"learning_rate": 2.5015994881637874e-05, |
|
"loss": 0.8158, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 13.81957773512476, |
|
"grad_norm": 2.875555992126465, |
|
"learning_rate": 2.37364043506078e-05, |
|
"loss": 0.8203, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 13.883557261676264, |
|
"grad_norm": 2.9165468215942383, |
|
"learning_rate": 2.2456813819577733e-05, |
|
"loss": 0.8142, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 13.947536788227767, |
|
"grad_norm": 2.839167356491089, |
|
"learning_rate": 2.1177223288547663e-05, |
|
"loss": 0.8075, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 14.011516314779271, |
|
"grad_norm": 2.6591968536376953, |
|
"learning_rate": 1.9897632757517592e-05, |
|
"loss": 0.7897, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 14.075495841330774, |
|
"grad_norm": 2.895771026611328, |
|
"learning_rate": 1.861804222648752e-05, |
|
"loss": 0.7396, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 14.139475367882278, |
|
"grad_norm": 3.234828472137451, |
|
"learning_rate": 1.733845169545745e-05, |
|
"loss": 0.7564, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 14.203454894433781, |
|
"grad_norm": 3.1565563678741455, |
|
"learning_rate": 1.605886116442738e-05, |
|
"loss": 0.7555, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 14.267434420985285, |
|
"grad_norm": 2.6395761966705322, |
|
"learning_rate": 1.4779270633397312e-05, |
|
"loss": 0.752, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 14.331413947536788, |
|
"grad_norm": 2.860470771789551, |
|
"learning_rate": 1.3499680102367243e-05, |
|
"loss": 0.7592, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 14.395393474088293, |
|
"grad_norm": 3.0977256298065186, |
|
"learning_rate": 1.222008957133717e-05, |
|
"loss": 0.7538, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 14.459373000639795, |
|
"grad_norm": 3.199491024017334, |
|
"learning_rate": 1.09404990403071e-05, |
|
"loss": 0.7554, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 14.5233525271913, |
|
"grad_norm": 3.1228437423706055, |
|
"learning_rate": 9.660908509277031e-06, |
|
"loss": 0.7536, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 14.587332053742802, |
|
"grad_norm": 3.0725913047790527, |
|
"learning_rate": 8.38131797824696e-06, |
|
"loss": 0.745, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 14.651311580294307, |
|
"grad_norm": 2.7372710704803467, |
|
"learning_rate": 7.101727447216891e-06, |
|
"loss": 0.7761, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 14.71529110684581, |
|
"grad_norm": 2.8115973472595215, |
|
"learning_rate": 5.822136916186819e-06, |
|
"loss": 0.7549, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 14.779270633397314, |
|
"grad_norm": 3.0110971927642822, |
|
"learning_rate": 4.5425463851567495e-06, |
|
"loss": 0.7585, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 14.843250159948816, |
|
"grad_norm": 3.1187989711761475, |
|
"learning_rate": 3.2629558541266794e-06, |
|
"loss": 0.7612, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 14.90722968650032, |
|
"grad_norm": 3.022102117538452, |
|
"learning_rate": 1.983365323096609e-06, |
|
"loss": 0.7555, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 14.971209213051823, |
|
"grad_norm": 3.1614649295806885, |
|
"learning_rate": 7.037747920665386e-07, |
|
"loss": 0.7546, |
|
"step": 23400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 23445, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 2500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.649122201906708e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|