{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.996383363471971,
  "eval_steps": 500,
  "global_step": 966,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020666494445879618,
      "grad_norm": 4.312192519671265,
      "learning_rate": 5.1546391752577325e-08,
      "loss": 3.0168,
      "step": 1
    },
    {
      "epoch": 0.01033324722293981,
      "grad_norm": 4.263469092495139,
      "learning_rate": 2.577319587628866e-07,
      "loss": 2.9984,
      "step": 5
    },
    {
      "epoch": 0.02066649444587962,
      "grad_norm": 4.090835140938876,
      "learning_rate": 5.154639175257732e-07,
      "loss": 2.9949,
      "step": 10
    },
    {
      "epoch": 0.030999741668819428,
      "grad_norm": 3.0467308036273013,
      "learning_rate": 7.731958762886599e-07,
      "loss": 2.9694,
      "step": 15
    },
    {
      "epoch": 0.04133298889175924,
      "grad_norm": 2.673851155124283,
      "learning_rate": 1.0309278350515464e-06,
      "loss": 2.9361,
      "step": 20
    },
    {
      "epoch": 0.051666236114699046,
      "grad_norm": 2.4196900473976966,
      "learning_rate": 1.288659793814433e-06,
      "loss": 2.8922,
      "step": 25
    },
    {
      "epoch": 0.061999483337638855,
      "grad_norm": 2.194039815147728,
      "learning_rate": 1.5463917525773197e-06,
      "loss": 2.8477,
      "step": 30
    },
    {
      "epoch": 0.07233273056057866,
      "grad_norm": 1.816626011260457,
      "learning_rate": 1.8041237113402063e-06,
      "loss": 2.7875,
      "step": 35
    },
    {
      "epoch": 0.08266597778351847,
      "grad_norm": 1.8318006255995982,
      "learning_rate": 2.061855670103093e-06,
      "loss": 2.7757,
      "step": 40
    },
    {
      "epoch": 0.09299922500645828,
      "grad_norm": 2.114157643853681,
      "learning_rate": 2.3195876288659796e-06,
      "loss": 2.7489,
      "step": 45
    },
    {
      "epoch": 0.10333247222939809,
      "grad_norm": 2.023968470692679,
      "learning_rate": 2.577319587628866e-06,
      "loss": 2.7448,
      "step": 50
    },
    {
      "epoch": 0.1136657194523379,
      "grad_norm": 1.9532625510646646,
      "learning_rate": 2.8350515463917527e-06,
      "loss": 2.6986,
      "step": 55
    },
    {
      "epoch": 0.12399896667527771,
      "grad_norm": 1.9398256152557316,
      "learning_rate": 3.0927835051546395e-06,
      "loss": 2.7067,
      "step": 60
    },
    {
      "epoch": 0.1343322138982175,
      "grad_norm": 2.0411858929451157,
      "learning_rate": 3.350515463917526e-06,
      "loss": 2.682,
      "step": 65
    },
    {
      "epoch": 0.14466546112115733,
      "grad_norm": 1.9182095651663165,
      "learning_rate": 3.6082474226804126e-06,
      "loss": 2.6537,
      "step": 70
    },
    {
      "epoch": 0.15499870834409712,
      "grad_norm": 1.8939218115979524,
      "learning_rate": 3.865979381443299e-06,
      "loss": 2.689,
      "step": 75
    },
    {
      "epoch": 0.16533195556703695,
      "grad_norm": 1.8800045882838237,
      "learning_rate": 4.123711340206186e-06,
      "loss": 2.6106,
      "step": 80
    },
    {
      "epoch": 0.17566520278997674,
      "grad_norm": 1.903666598213381,
      "learning_rate": 4.381443298969073e-06,
      "loss": 2.6385,
      "step": 85
    },
    {
      "epoch": 0.18599845001291657,
      "grad_norm": 1.9506327396922627,
      "learning_rate": 4.639175257731959e-06,
      "loss": 2.644,
      "step": 90
    },
    {
      "epoch": 0.19633169723585636,
      "grad_norm": 1.8969467447381583,
      "learning_rate": 4.8969072164948455e-06,
      "loss": 2.6167,
      "step": 95
    },
    {
      "epoch": 0.20666494445879618,
      "grad_norm": 2.063483326690761,
      "learning_rate": 4.9998529691533944e-06,
      "loss": 2.6128,
      "step": 100
    },
    {
      "epoch": 0.21699819168173598,
      "grad_norm": 1.9617956961919498,
      "learning_rate": 4.998954509942391e-06,
      "loss": 2.5775,
      "step": 105
    },
    {
      "epoch": 0.2273314389046758,
      "grad_norm": 1.6301435863496945,
      "learning_rate": 4.997239568521342e-06,
      "loss": 2.5848,
      "step": 110
    },
    {
      "epoch": 0.2376646861276156,
      "grad_norm": 2.0118752460927594,
      "learning_rate": 4.994708705212238e-06,
      "loss": 2.56,
      "step": 115
    },
    {
      "epoch": 0.24799793335055542,
      "grad_norm": 1.792769014427482,
      "learning_rate": 4.991362746922835e-06,
      "loss": 2.583,
      "step": 120
    },
    {
      "epoch": 0.25833118057349524,
      "grad_norm": 2.0389735927794557,
      "learning_rate": 4.987202786876483e-06,
      "loss": 2.5912,
      "step": 125
    },
    {
      "epoch": 0.268664427796435,
      "grad_norm": 1.828850605237693,
      "learning_rate": 4.982230184254934e-06,
      "loss": 2.5457,
      "step": 130
    },
    {
      "epoch": 0.27899767501937484,
      "grad_norm": 1.8015379458146892,
      "learning_rate": 4.976446563754262e-06,
      "loss": 2.5546,
      "step": 135
    },
    {
      "epoch": 0.28933092224231466,
      "grad_norm": 1.7199552953613728,
      "learning_rate": 4.969853815054022e-06,
      "loss": 2.5483,
      "step": 140
    },
    {
      "epoch": 0.2996641694652545,
      "grad_norm": 1.5772784625180778,
      "learning_rate": 4.962454092199839e-06,
      "loss": 2.5046,
      "step": 145
    },
    {
      "epoch": 0.30999741668819425,
      "grad_norm": 1.5677450227507446,
      "learning_rate": 4.954249812899617e-06,
      "loss": 2.5291,
      "step": 150
    },
    {
      "epoch": 0.3203306639111341,
      "grad_norm": 1.456660409079126,
      "learning_rate": 4.9452436577336025e-06,
      "loss": 2.514,
      "step": 155
    },
    {
      "epoch": 0.3306639111340739,
      "grad_norm": 1.5150592197514972,
      "learning_rate": 4.935438569278558e-06,
      "loss": 2.483,
      "step": 160
    },
    {
      "epoch": 0.3409971583570137,
      "grad_norm": 1.3377769703773317,
      "learning_rate": 4.924837751146339e-06,
      "loss": 2.4891,
      "step": 165
    },
    {
      "epoch": 0.3513304055799535,
      "grad_norm": 1.2629854028734164,
      "learning_rate": 4.913444666937174e-06,
      "loss": 2.4873,
      "step": 170
    },
    {
      "epoch": 0.3616636528028933,
      "grad_norm": 1.2369841480907526,
      "learning_rate": 4.9012630391080105e-06,
      "loss": 2.4808,
      "step": 175
    },
    {
      "epoch": 0.37199690002583313,
      "grad_norm": 1.2030471624360595,
      "learning_rate": 4.8882968477562754e-06,
      "loss": 2.452,
      "step": 180
    },
    {
      "epoch": 0.3823301472487729,
      "grad_norm": 1.2297488483872987,
      "learning_rate": 4.874550329319457e-06,
      "loss": 2.4817,
      "step": 185
    },
    {
      "epoch": 0.3926633944717127,
      "grad_norm": 1.1865278901389154,
      "learning_rate": 4.8600279751909394e-06,
      "loss": 2.4779,
      "step": 190
    },
    {
      "epoch": 0.40299664169465255,
      "grad_norm": 1.135041309414857,
      "learning_rate": 4.844734530252532e-06,
      "loss": 2.4839,
      "step": 195
    },
    {
      "epoch": 0.41332988891759237,
      "grad_norm": 1.1971240713531093,
      "learning_rate": 4.8286749913241735e-06,
      "loss": 2.4727,
      "step": 200
    },
    {
      "epoch": 0.42366313614053214,
      "grad_norm": 1.184700416763506,
      "learning_rate": 4.811854605531325e-06,
      "loss": 2.4893,
      "step": 205
    },
    {
      "epoch": 0.43399638336347196,
      "grad_norm": 1.3114264862654226,
      "learning_rate": 4.79427886859058e-06,
      "loss": 2.4651,
      "step": 210
    },
    {
      "epoch": 0.4443296305864118,
      "grad_norm": 1.2651626968475456,
      "learning_rate": 4.775953523014051e-06,
      "loss": 2.477,
      "step": 215
    },
    {
      "epoch": 0.4546628778093516,
      "grad_norm": 1.1651412485565396,
      "learning_rate": 4.75688455623312e-06,
      "loss": 2.4875,
      "step": 220
    },
    {
      "epoch": 0.4649961250322914,
      "grad_norm": 1.183328978373714,
      "learning_rate": 4.7370781986421685e-06,
      "loss": 2.452,
      "step": 225
    },
    {
      "epoch": 0.4753293722552312,
      "grad_norm": 1.2332127650589972,
      "learning_rate": 4.716540921562927e-06,
      "loss": 2.4488,
      "step": 230
    },
    {
      "epoch": 0.485662619478171,
      "grad_norm": 1.1475089285046411,
      "learning_rate": 4.6952794351300964e-06,
      "loss": 2.463,
      "step": 235
    },
    {
      "epoch": 0.49599586670111084,
      "grad_norm": 1.1862533875864938,
      "learning_rate": 4.673300686098957e-06,
      "loss": 2.4806,
      "step": 240
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 1.117191578266376,
      "learning_rate": 4.65061185557565e-06,
      "loss": 2.4535,
      "step": 245
    },
    {
      "epoch": 0.5166623611469905,
      "grad_norm": 1.1639210815368257,
      "learning_rate": 4.627220356670904e-06,
      "loss": 2.4536,
      "step": 250
    },
    {
      "epoch": 0.5269956083699302,
      "grad_norm": 1.1719032781427676,
      "learning_rate": 4.603133832077953e-06,
      "loss": 2.4366,
      "step": 255
    },
    {
      "epoch": 0.53732885559287,
      "grad_norm": 1.1377513004294313,
      "learning_rate": 4.57836015157544e-06,
      "loss": 2.4524,
      "step": 260
    },
    {
      "epoch": 0.5476621028158098,
      "grad_norm": 1.1211864633428335,
      "learning_rate": 4.552907409456129e-06,
      "loss": 2.4202,
      "step": 265
    },
    {
      "epoch": 0.5579953500387497,
      "grad_norm": 1.1616507123396151,
      "learning_rate": 4.526783921882261e-06,
      "loss": 2.4273,
      "step": 270
    },
    {
      "epoch": 0.5683285972616895,
      "grad_norm": 1.1848691721187,
      "learning_rate": 4.499998224168417e-06,
      "loss": 2.4401,
      "step": 275
    },
    {
      "epoch": 0.5786618444846293,
      "grad_norm": 1.0582776568433816,
      "learning_rate": 4.47255906799278e-06,
      "loss": 2.4096,
      "step": 280
    },
    {
      "epoch": 0.5889950917075691,
      "grad_norm": 1.1638859190983761,
      "learning_rate": 4.444475418537707e-06,
      "loss": 2.4208,
      "step": 285
    },
    {
      "epoch": 0.599328338930509,
      "grad_norm": 1.167542140637952,
      "learning_rate": 4.4157564515605315e-06,
      "loss": 2.4244,
      "step": 290
    },
    {
      "epoch": 0.6096615861534487,
      "grad_norm": 1.1838366384063863,
      "learning_rate": 4.386411550395576e-06,
      "loss": 2.435,
      "step": 295
    },
    {
      "epoch": 0.6199948333763885,
      "grad_norm": 1.081261572955448,
      "learning_rate": 4.356450302888343e-06,
      "loss": 2.4268,
      "step": 300
    },
    {
      "epoch": 0.6303280805993283,
      "grad_norm": 1.1052945392912799,
      "learning_rate": 4.32588249826288e-06,
      "loss": 2.388,
      "step": 305
    },
    {
      "epoch": 0.6406613278222681,
      "grad_norm": 1.08835668369336,
      "learning_rate": 4.294718123923349e-06,
      "loss": 2.3976,
      "step": 310
    },
    {
      "epoch": 0.650994575045208,
      "grad_norm": 1.0478651783340451,
      "learning_rate": 4.262967362190851e-06,
      "loss": 2.4288,
      "step": 315
    },
    {
      "epoch": 0.6613278222681478,
      "grad_norm": 1.1501225106762079,
      "learning_rate": 4.23064058697656e-06,
      "loss": 2.4222,
      "step": 320
    },
    {
      "epoch": 0.6716610694910876,
      "grad_norm": 1.0831191994433091,
      "learning_rate": 4.197748360392256e-06,
      "loss": 2.4308,
      "step": 325
    },
    {
      "epoch": 0.6819943167140274,
      "grad_norm": 1.059007370674017,
      "learning_rate": 4.164301429299374e-06,
      "loss": 2.3975,
      "step": 330
    },
    {
      "epoch": 0.6923275639369671,
      "grad_norm": 1.092943166086201,
      "learning_rate": 4.130310721797691e-06,
      "loss": 2.4365,
      "step": 335
    },
    {
      "epoch": 0.702660811159907,
      "grad_norm": 1.1348461717715252,
      "learning_rate": 4.095787343654787e-06,
      "loss": 2.4326,
      "step": 340
    },
    {
      "epoch": 0.7129940583828468,
      "grad_norm": 1.0225410363434102,
      "learning_rate": 4.060742574677467e-06,
      "loss": 2.4066,
      "step": 345
    },
    {
      "epoch": 0.7233273056057866,
      "grad_norm": 1.0792570056042612,
      "learning_rate": 4.025187865026311e-06,
      "loss": 2.4179,
      "step": 350
    },
    {
      "epoch": 0.7336605528287264,
      "grad_norm": 1.0656972631008976,
      "learning_rate": 3.989134831474575e-06,
      "loss": 2.3798,
      "step": 355
    },
    {
      "epoch": 0.7439938000516663,
      "grad_norm": 1.0796066558341972,
      "learning_rate": 3.952595253612639e-06,
      "loss": 2.4057,
      "step": 360
    },
    {
      "epoch": 0.7543270472746061,
      "grad_norm": 1.0659792080472803,
      "learning_rate": 3.915581069999279e-06,
      "loss": 2.4168,
      "step": 365
    },
    {
      "epoch": 0.7646602944975458,
      "grad_norm": 1.0221090674367013,
      "learning_rate": 3.878104374260974e-06,
      "loss": 2.4168,
      "step": 370
    },
    {
      "epoch": 0.7749935417204856,
      "grad_norm": 1.041831551248707,
      "learning_rate": 3.840177411140574e-06,
      "loss": 2.4064,
      "step": 375
    },
    {
      "epoch": 0.7853267889434254,
      "grad_norm": 1.0939898104463206,
      "learning_rate": 3.8018125724965727e-06,
      "loss": 2.4241,
      "step": 380
    },
    {
      "epoch": 0.7956600361663653,
      "grad_norm": 1.041903325778914,
      "learning_rate": 3.763022393254321e-06,
      "loss": 2.4209,
      "step": 385
    },
    {
      "epoch": 0.8059932833893051,
      "grad_norm": 1.0113446521141738,
      "learning_rate": 3.723819547310504e-06,
      "loss": 2.3965,
      "step": 390
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 1.0264749571767495,
      "learning_rate": 3.6842168433921987e-06,
      "loss": 2.3796,
      "step": 395
    },
    {
      "epoch": 0.8266597778351847,
      "grad_norm": 1.1575314733187445,
      "learning_rate": 3.6442272208718873e-06,
      "loss": 2.4162,
      "step": 400
    },
    {
      "epoch": 0.8369930250581246,
      "grad_norm": 1.0237058117990445,
      "learning_rate": 3.6038637455397802e-06,
      "loss": 2.3887,
      "step": 405
    },
    {
      "epoch": 0.8473262722810643,
      "grad_norm": 1.0197966280482622,
      "learning_rate": 3.5631396053348387e-06,
      "loss": 2.3858,
      "step": 410
    },
    {
      "epoch": 0.8576595195040041,
      "grad_norm": 1.1566498283302307,
      "learning_rate": 3.5220681060358865e-06,
      "loss": 2.3637,
      "step": 415
    },
    {
      "epoch": 0.8679927667269439,
      "grad_norm": 1.0095584831720887,
      "learning_rate": 3.4806626669142152e-06,
      "loss": 2.4122,
      "step": 420
    },
    {
      "epoch": 0.8783260139498837,
      "grad_norm": 1.0666694477391483,
      "learning_rate": 3.4389368163491164e-06,
      "loss": 2.3899,
      "step": 425
    },
    {
      "epoch": 0.8886592611728236,
      "grad_norm": 1.0506847225250229,
      "learning_rate": 3.396904187407759e-06,
      "loss": 2.3859,
      "step": 430
    },
    {
      "epoch": 0.8989925083957634,
      "grad_norm": 1.1509707991212996,
      "learning_rate": 3.3545785133908637e-06,
      "loss": 2.3521,
      "step": 435
    },
    {
      "epoch": 0.9093257556187032,
      "grad_norm": 1.1044049179908932,
      "learning_rate": 3.3119736233456266e-06,
      "loss": 2.3635,
      "step": 440
    },
    {
      "epoch": 0.919659002841643,
      "grad_norm": 1.046550942981578,
      "learning_rate": 3.269103437547367e-06,
      "loss": 2.3529,
      "step": 445
    },
    {
      "epoch": 0.9299922500645827,
      "grad_norm": 1.1618469819333466,
      "learning_rate": 3.225981962951354e-06,
      "loss": 2.3838,
      "step": 450
    },
    {
      "epoch": 0.9403254972875226,
      "grad_norm": 1.040879291891829,
      "learning_rate": 3.182623288616328e-06,
      "loss": 2.3778,
      "step": 455
    },
    {
      "epoch": 0.9506587445104624,
      "grad_norm": 1.0092535371331217,
      "learning_rate": 3.139041581101187e-06,
      "loss": 2.4014,
      "step": 460
    },
    {
      "epoch": 0.9609919917334022,
      "grad_norm": 0.9581154923540901,
      "learning_rate": 3.0952510798363505e-06,
      "loss": 2.3724,
      "step": 465
    },
    {
      "epoch": 0.971325238956342,
      "grad_norm": 1.019011267377941,
      "learning_rate": 3.0512660924713227e-06,
      "loss": 2.3851,
      "step": 470
    },
    {
      "epoch": 0.9816584861792819,
      "grad_norm": 1.1025515727643884,
      "learning_rate": 3.0071009901999552e-06,
      "loss": 2.3533,
      "step": 475
    },
    {
      "epoch": 0.9919917334022217,
      "grad_norm": 0.9955871899662452,
      "learning_rate": 2.9627702030649596e-06,
      "loss": 2.4072,
      "step": 480
    },
    {
      "epoch": 0.9981916817359855,
      "eval_loss": 2.4291670322418213,
      "eval_runtime": 98.2505,
      "eval_samples_per_second": 106.32,
      "eval_steps_per_second": 4.438,
      "step": 483
    },
    {
      "epoch": 1.0023249806251615,
      "grad_norm": 1.043119029342129,
      "learning_rate": 2.9182882152431813e-06,
      "loss": 2.3632,
      "step": 485
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 0.9917731803771155,
      "learning_rate": 2.8736695603131953e-06,
      "loss": 2.3459,
      "step": 490
    },
    {
      "epoch": 1.0229914750710412,
      "grad_norm": 1.059190202939381,
      "learning_rate": 2.8289288165067536e-06,
      "loss": 2.3504,
      "step": 495
    },
    {
      "epoch": 1.033324722293981,
      "grad_norm": 1.047197875332299,
      "learning_rate": 2.784080601945651e-06,
      "loss": 2.3336,
      "step": 500
    },
    {
      "epoch": 1.0436579695169206,
      "grad_norm": 1.0722704531569385,
      "learning_rate": 2.739139569865547e-06,
      "loss": 2.3424,
      "step": 505
    },
    {
      "epoch": 1.0539912167398604,
      "grad_norm": 1.0566564902419044,
      "learning_rate": 2.6941204038283285e-06,
      "loss": 2.3158,
      "step": 510
    },
    {
      "epoch": 1.0643244639628002,
      "grad_norm": 0.968455085420159,
      "learning_rate": 2.64903781292455e-06,
      "loss": 2.3381,
      "step": 515
    },
    {
      "epoch": 1.07465771118574,
      "grad_norm": 1.0149829246524174,
      "learning_rate": 2.6039065269675428e-06,
      "loss": 2.3765,
      "step": 520
    },
    {
      "epoch": 1.0849909584086799,
      "grad_norm": 1.003506265130154,
      "learning_rate": 2.5587412916807508e-06,
      "loss": 2.3619,
      "step": 525
    },
    {
      "epoch": 1.0953242056316197,
      "grad_norm": 1.0360443331960967,
      "learning_rate": 2.5135568638798647e-06,
      "loss": 2.3509,
      "step": 530
    },
    {
      "epoch": 1.1056574528545595,
      "grad_norm": 1.03015456481267,
      "learning_rate": 2.4683680066513367e-06,
      "loss": 2.3532,
      "step": 535
    },
    {
      "epoch": 1.1159907000774993,
      "grad_norm": 0.9947690934378423,
      "learning_rate": 2.423189484528844e-06,
      "loss": 2.3387,
      "step": 540
    },
    {
      "epoch": 1.1263239473004392,
      "grad_norm": 0.9935757048613172,
      "learning_rate": 2.378036058669279e-06,
      "loss": 2.3449,
      "step": 545
    },
    {
      "epoch": 1.136657194523379,
      "grad_norm": 1.0218876059810185,
      "learning_rate": 2.3329224820298454e-06,
      "loss": 2.3454,
      "step": 550
    },
    {
      "epoch": 1.1469904417463188,
      "grad_norm": 1.013721944420629,
      "learning_rate": 2.287863494547828e-06,
      "loss": 2.3202,
      "step": 555
    },
    {
      "epoch": 1.1573236889692586,
      "grad_norm": 0.99845936769635,
      "learning_rate": 2.242873818324625e-06,
      "loss": 2.3581,
      "step": 560
    },
    {
      "epoch": 1.1676569361921985,
      "grad_norm": 1.024887085816387,
      "learning_rate": 2.1979681528155983e-06,
      "loss": 2.3335,
      "step": 565
    },
    {
      "epoch": 1.1779901834151383,
      "grad_norm": 0.9636864445536957,
      "learning_rate": 2.15316117002733e-06,
      "loss": 2.3502,
      "step": 570
    },
    {
      "epoch": 1.188323430638078,
      "grad_norm": 0.9607999961987078,
      "learning_rate": 2.1084675097238443e-06,
      "loss": 2.3295,
      "step": 575
    },
    {
      "epoch": 1.1986566778610177,
      "grad_norm": 1.0062266520251049,
      "learning_rate": 2.0639017746433636e-06,
      "loss": 2.3619,
      "step": 580
    },
    {
      "epoch": 1.2089899250839578,
      "grad_norm": 0.9874080934116608,
      "learning_rate": 2.0194785257271653e-06,
      "loss": 2.316,
      "step": 585
    },
    {
      "epoch": 1.2193231723068974,
      "grad_norm": 0.9729847296931,
      "learning_rate": 1.9752122773620877e-06,
      "loss": 2.3287,
      "step": 590
    },
    {
      "epoch": 1.2296564195298372,
      "grad_norm": 1.0507641768918863,
      "learning_rate": 1.931117492638257e-06,
      "loss": 2.3458,
      "step": 595
    },
    {
      "epoch": 1.239989666752777,
      "grad_norm": 0.9964380928331144,
      "learning_rate": 1.8872085786235635e-06,
      "loss": 2.3299,
      "step": 600
    },
    {
      "epoch": 1.2503229139757168,
      "grad_norm": 1.0014215477144741,
      "learning_rate": 1.8434998816564525e-06,
      "loss": 2.3473,
      "step": 605
    },
    {
      "epoch": 1.2606561611986566,
      "grad_norm": 0.9875666662605547,
      "learning_rate": 1.800005682658547e-06,
      "loss": 2.3187,
      "step": 610
    },
    {
      "epoch": 1.2709894084215965,
      "grad_norm": 0.9891384254217107,
      "learning_rate": 1.7567401924686512e-06,
      "loss": 2.3352,
      "step": 615
    },
    {
      "epoch": 1.2813226556445363,
      "grad_norm": 0.9500870275985281,
      "learning_rate": 1.7137175471996525e-06,
      "loss": 2.3546,
      "step": 620
    },
    {
      "epoch": 1.2916559028674761,
      "grad_norm": 0.9864694881362654,
      "learning_rate": 1.6709518036198307e-06,
      "loss": 2.3573,
      "step": 625
    },
    {
      "epoch": 1.301989150090416,
      "grad_norm": 0.9724324214668948,
      "learning_rate": 1.628456934560102e-06,
      "loss": 2.3226,
      "step": 630
    },
    {
      "epoch": 1.3123223973133558,
      "grad_norm": 1.0011381925537814,
      "learning_rate": 1.5862468243486783e-06,
      "loss": 2.358,
      "step": 635
    },
    {
      "epoch": 1.3226556445362956,
      "grad_norm": 1.0283092775519347,
      "learning_rate": 1.5443352642746517e-06,
      "loss": 2.3421,
      "step": 640
    },
    {
      "epoch": 1.3329888917592354,
      "grad_norm": 1.0189898038836651,
      "learning_rate": 1.5027359480819688e-06,
      "loss": 2.3148,
      "step": 645
    },
    {
      "epoch": 1.3433221389821752,
      "grad_norm": 0.9802099195113333,
      "learning_rate": 1.4614624674952843e-06,
      "loss": 2.3327,
      "step": 650
    },
    {
      "epoch": 1.3536553862051148,
      "grad_norm": 0.9516385215813331,
      "learning_rate": 1.4205283077791393e-06,
      "loss": 2.3454,
      "step": 655
    },
    {
      "epoch": 1.3639886334280549,
      "grad_norm": 1.0000404439141914,
      "learning_rate": 1.3799468433319314e-06,
      "loss": 2.3542,
      "step": 660
    },
    {
      "epoch": 1.3743218806509945,
      "grad_norm": 1.12899420231106,
      "learning_rate": 1.3397313333161007e-06,
      "loss": 2.3109,
      "step": 665
    },
    {
      "epoch": 1.3846551278739345,
      "grad_norm": 0.9852088893899086,
      "learning_rate": 1.2998949173259712e-06,
      "loss": 2.364,
      "step": 670
    },
    {
      "epoch": 1.3949883750968741,
      "grad_norm": 0.997466272202667,
      "learning_rate": 1.2604506110946599e-06,
      "loss": 2.3176,
      "step": 675
    },
    {
      "epoch": 1.405321622319814,
      "grad_norm": 1.0065697210998936,
      "learning_rate": 1.2214113022414448e-06,
      "loss": 2.3405,
      "step": 680
    },
    {
      "epoch": 1.4156548695427538,
      "grad_norm": 1.0113850437912417,
      "learning_rate": 1.1827897460610105e-06,
      "loss": 2.3524,
      "step": 685
    },
    {
      "epoch": 1.4259881167656936,
      "grad_norm": 1.0142447327303221,
      "learning_rate": 1.1445985613559065e-06,
      "loss": 2.3289,
      "step": 690
    },
    {
      "epoch": 1.4363213639886334,
      "grad_norm": 1.0377572762505685,
      "learning_rate": 1.1068502263136207e-06,
      "loss": 2.3502,
      "step": 695
    },
    {
      "epoch": 1.4466546112115732,
      "grad_norm": 0.9765601945365431,
      "learning_rate": 1.0695570744295768e-06,
      "loss": 2.3203,
      "step": 700
    },
    {
      "epoch": 1.456987858434513,
      "grad_norm": 1.003335350024317,
      "learning_rate": 1.032731290477429e-06,
      "loss": 2.3425,
      "step": 705
    },
    {
      "epoch": 1.4673211056574529,
      "grad_norm": 1.1117543605228124,
      "learning_rate": 9.96384906527927e-07,
      "loss": 2.3282,
      "step": 710
    },
    {
      "epoch": 1.4776543528803927,
      "grad_norm": 1.0082116359749547,
      "learning_rate": 9.605297980176905e-07,
      "loss": 2.3291,
      "step": 715
    },
    {
      "epoch": 1.4879876001033325,
      "grad_norm": 1.0148823844175179,
      "learning_rate": 9.251776798691486e-07,
      "loss": 2.3322,
      "step": 720
    },
    {
      "epoch": 1.4983208473262724,
      "grad_norm": 1.0009464672805306,
      "learning_rate": 8.903401026629283e-07,
      "loss": 2.3285,
      "step": 725
    },
    {
      "epoch": 1.508654094549212,
      "grad_norm": 1.0232374800844972,
      "learning_rate": 8.560284488639448e-07,
      "loss": 2.33,
      "step": 730
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 1.00937657376153,
      "learning_rate": 8.222539291024079e-07,
      "loss": 2.3302,
      "step": 735
    },
    {
      "epoch": 1.5293205889950916,
      "grad_norm": 0.9911043964041184,
      "learning_rate": 7.890275785109833e-07,
      "loss": 2.33,
      "step": 740
    },
    {
      "epoch": 1.5396538362180316,
      "grad_norm": 0.9894490010184659,
      "learning_rate": 7.563602531192815e-07,
      "loss": 2.3083,
      "step": 745
    },
    {
      "epoch": 1.5499870834409712,
      "grad_norm": 0.9765968142359513,
      "learning_rate": 7.242626263068825e-07,
      "loss": 2.3302,
      "step": 750
    },
    {
      "epoch": 1.5603203306639113,
      "grad_norm": 0.9785368053784121,
      "learning_rate": 6.927451853160186e-07,
      "loss": 2.3488,
      "step": 755
    },
    {
      "epoch": 1.570653577886851,
      "grad_norm": 0.9978746662987256,
      "learning_rate": 6.618182278250904e-07,
      "loss": 2.3268,
      "step": 760
    },
    {
      "epoch": 1.5809868251097907,
      "grad_norm": 0.9671800637222838,
      "learning_rate": 6.314918585841026e-07,
      "loss": 2.3388,
      "step": 765
    },
    {
      "epoch": 1.5913200723327305,
      "grad_norm": 1.0258803683412943,
      "learning_rate": 6.017759861131511e-07,
      "loss": 2.331,
      "step": 770
    },
    {
      "epoch": 1.6016533195556704,
      "grad_norm": 1.003633352349549,
      "learning_rate": 5.72680319465016e-07,
      "loss": 2.348,
      "step": 775
    },
    {
      "epoch": 1.6119865667786102,
      "grad_norm": 1.0073802630842454,
      "learning_rate": 5.442143650529233e-07,
      "loss": 2.33,
      "step": 780
    },
    {
      "epoch": 1.62231981400155,
      "grad_norm": 0.982451589918844,
      "learning_rate": 5.163874235445248e-07,
      "loss": 2.337,
      "step": 785
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 1.0215920984091988,
      "learning_rate": 4.892085868230881e-07,
      "loss": 2.3129,
      "step": 790
    },
    {
      "epoch": 1.6429863084474297,
      "grad_norm": 0.9661027532299886,
      "learning_rate": 4.626867350169184e-07,
      "loss": 2.3305,
      "step": 795
    },
    {
      "epoch": 1.6533195556703695,
      "grad_norm": 1.0107949913173304,
      "learning_rate": 4.368305335979514e-07,
      "loss": 2.3442,
      "step": 800
    },
    {
      "epoch": 1.663652802893309,
      "grad_norm": 1.0338291893124987,
      "learning_rate": 4.1164843055049363e-07,
      "loss": 2.3306,
      "step": 805
    },
    {
      "epoch": 1.6739860501162491,
      "grad_norm": 0.9842155309730392,
      "learning_rate": 3.871486536110089e-07,
      "loss": 2.3353,
      "step": 810
    },
    {
      "epoch": 1.6843192973391887,
      "grad_norm": 1.0061784109162568,
      "learning_rate": 3.633392075798833e-07,
      "loss": 2.3475,
      "step": 815
    },
    {
      "epoch": 1.6946525445621288,
      "grad_norm": 0.9944727072730667,
      "learning_rate": 3.402278717060134e-07,
      "loss": 2.3519,
      "step": 820
    },
    {
      "epoch": 1.7049857917850684,
      "grad_norm": 1.0111005553437697,
      "learning_rate": 3.1782219714509844e-07,
      "loss": 2.3457,
      "step": 825
    },
    {
      "epoch": 1.7153190390080084,
      "grad_norm": 0.9903500138490017,
      "learning_rate": 2.9612950449245727e-07,
      "loss": 2.3059,
      "step": 830
    },
    {
      "epoch": 1.725652286230948,
      "grad_norm": 1.0201392949034442,
      "learning_rate": 2.75156881391167e-07,
      "loss": 2.3683,
      "step": 835
    },
    {
      "epoch": 1.7359855334538878,
      "grad_norm": 1.050898965300813,
      "learning_rate": 2.549111802163298e-07,
      "loss": 2.3081,
      "step": 840
    },
    {
      "epoch": 1.7463187806768277,
      "grad_norm": 1.0078158269898172,
      "learning_rate": 2.3539901583619186e-07,
      "loss": 2.3309,
      "step": 845
    },
    {
      "epoch": 1.7566520278997675,
      "grad_norm": 0.9924247873358294,
      "learning_rate": 2.1662676345087591e-07,
      "loss": 2.3229,
      "step": 850
    },
    {
      "epoch": 1.7669852751227073,
      "grad_norm": 0.9670839207903285,
      "learning_rate": 1.986005565094104e-07,
      "loss": 2.337,
      "step": 855
    },
    {
      "epoch": 1.7773185223456471,
      "grad_norm": 0.9841412613184868,
      "learning_rate": 1.813262847057562e-07,
      "loss": 2.3188,
      "step": 860
    },
    {
      "epoch": 1.787651769568587,
      "grad_norm": 0.9711799938673707,
      "learning_rate": 1.6480959205446483e-07,
      "loss": 2.3296,
      "step": 865
    },
    {
      "epoch": 1.7979850167915268,
      "grad_norm": 0.9602656706710844,
      "learning_rate": 1.490558750466145e-07,
      "loss": 2.3302,
      "step": 870
    },
    {
      "epoch": 1.8083182640144666,
      "grad_norm": 1.0216431924094667,
      "learning_rate": 1.3407028088661818e-07,
      "loss": 2.3393,
      "step": 875
    },
    {
      "epoch": 1.8186515112374062,
      "grad_norm": 0.976140544478714,
      "learning_rate": 1.1985770581047662e-07,
      "loss": 2.3178,
      "step": 880
    },
    {
      "epoch": 1.8289847584603462,
      "grad_norm": 0.9640709438478974,
      "learning_rate": 1.0642279348603762e-07,
      "loss": 2.332,
      "step": 885
    },
    {
      "epoch": 1.8393180056832859,
      "grad_norm": 0.9965422182757674,
      "learning_rate": 9.376993349576896e-08,
      "loss": 2.3361,
      "step": 890
    },
    {
      "epoch": 1.849651252906226,
      "grad_norm": 0.9956732940976856,
      "learning_rate": 8.421350989479299e-08,
      "loss": 2.3355,
      "step": 895
    },
    {
      "epoch": 1.8599845001291655,
      "grad_norm": 0.9514205830424876,
      "learning_rate": 7.297858937089786e-08,
      "loss": 2.3176,
      "step": 900
    },
    {
      "epoch": 1.8703177473521055,
      "grad_norm": 0.9798917287324781,
      "learning_rate": 6.253664838996393e-08,
      "loss": 2.3078,
      "step": 905
    },
    {
      "epoch": 1.8806509945750451,
      "grad_norm": 0.9708517797491476,
      "learning_rate": 5.289109864236669e-08,
      "loss": 2.3019,
      "step": 910
    },
    {
      "epoch": 1.890984241797985,
      "grad_norm": 0.9943948904413487,
      "learning_rate": 4.404509161395454e-08,
      "loss": 2.3227,
      "step": 915
    },
    {
      "epoch": 1.9013174890209248,
      "grad_norm": 0.9560070616668404,
      "learning_rate": 3.600151755636744e-08,
      "loss": 2.3065,
      "step": 920
    },
    {
      "epoch": 1.9116507362438646,
      "grad_norm": 0.9882401832339076,
      "learning_rate": 2.8763004542704255e-08,
      "loss": 2.349,
      "step": 925
    },
    {
      "epoch": 1.9219839834668044,
      "grad_norm": 0.9652740208624518,
      "learning_rate": 2.233191760885406e-08,
      "loss": 2.3353,
      "step": 930
    },
    {
      "epoch": 1.9323172306897443,
      "grad_norm": 0.9855071890028472,
      "learning_rate": 1.671035798077092e-08,
      "loss": 2.3222,
      "step": 935
    },
    {
      "epoch": 1.942650477912684,
      "grad_norm": 0.9849452175341697,
      "learning_rate": 1.1900162387938896e-08,
      "loss": 2.3175,
      "step": 940
    },
    {
      "epoch": 1.952983725135624,
      "grad_norm": 1.0077538293794988,
      "learning_rate": 7.90290246326042e-09,
      "loss": 2.3174,
      "step": 945
    },
    {
      "epoch": 1.9633169723585637,
      "grad_norm": 0.9764187721877715,
      "learning_rate": 4.719884229555938e-09,
      "loss": 2.3102,
      "step": 950
    },
    {
      "epoch": 1.9736502195815033,
      "grad_norm": 1.0135556832497523,
      "learning_rate": 2.3521476728480264e-09,
      "loss": 2.3504,
      "step": 955
    },
    {
      "epoch": 1.9839834668044434,
      "grad_norm": 1.0016466106794726,
      "learning_rate": 8.004664025679232e-10,
      "loss": 2.3299,
      "step": 960
    },
    {
      "epoch": 1.994316714027383,
      "grad_norm": 0.9845609766229912,
      "learning_rate": 6.534739879382468e-11,
      "loss": 2.3391,
      "step": 965
    },
    {
      "epoch": 1.996383363471971,
      "eval_loss": 2.4173293113708496,
      "eval_runtime": 96.9594,
      "eval_samples_per_second": 107.736,
      "eval_steps_per_second": 4.497,
      "step": 966
    },
    {
      "epoch": 1.996383363471971,
      "step": 966,
      "total_flos": 6353086637408256.0,
      "train_loss": 2.421678250374014,
      "train_runtime": 6320.1233,
      "train_samples_per_second": 29.398,
      "train_steps_per_second": 0.153
    }
  ],
  "logging_steps": 5,
  "max_steps": 966,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6353086637408256.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}