| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 4000, | |
| "global_step": 94764, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012663237039968342, | |
| "grad_norm": 5.712349891662598, | |
| "learning_rate": 0.00019916841672031155, | |
| "loss": 28.7584, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.025326474079936684, | |
| "grad_norm": 6.924032688140869, | |
| "learning_rate": 0.00019832416973585622, | |
| "loss": 26.4219, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.037989711119905026, | |
| "grad_norm": 7.283262252807617, | |
| "learning_rate": 0.00019747992275140092, | |
| "loss": 24.0051, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.05065294815987337, | |
| "grad_norm": 7.671304702758789, | |
| "learning_rate": 0.00019663567576694565, | |
| "loss": 23.2237, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.06331618519984171, | |
| "grad_norm": 8.817034721374512, | |
| "learning_rate": 0.00019579142878249032, | |
| "loss": 22.7684, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.07597942223981005, | |
| "grad_norm": 5.911375999450684, | |
| "learning_rate": 0.00019494718179803502, | |
| "loss": 22.7665, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0886426592797784, | |
| "grad_norm": 8.014992713928223, | |
| "learning_rate": 0.00019410293481357974, | |
| "loss": 22.4848, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.10130589631974674, | |
| "grad_norm": 5.90252685546875, | |
| "learning_rate": 0.0001932586878291244, | |
| "loss": 22.4803, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.11396913335971508, | |
| "grad_norm": 7.182605743408203, | |
| "learning_rate": 0.0001924144408446691, | |
| "loss": 22.2229, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.12663237039968342, | |
| "grad_norm": 7.886034965515137, | |
| "learning_rate": 0.0001915701938602138, | |
| "loss": 22.1914, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.12663237039968342, | |
| "eval_loss": 5.568932056427002, | |
| "eval_runtime": 828.8574, | |
| "eval_samples_per_second": 8.023, | |
| "eval_steps_per_second": 8.023, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.13929560743965175, | |
| "grad_norm": 6.553421974182129, | |
| "learning_rate": 0.0001907259468757585, | |
| "loss": 22.0885, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.1519588444796201, | |
| "grad_norm": 6.866927146911621, | |
| "learning_rate": 0.0001898816998913032, | |
| "loss": 22.0398, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.16462208151958843, | |
| "grad_norm": 4.441892147064209, | |
| "learning_rate": 0.0001890374529068479, | |
| "loss": 21.9852, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.1772853185595568, | |
| "grad_norm": 8.22354507446289, | |
| "learning_rate": 0.0001881932059223926, | |
| "loss": 22.0736, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.18994855559952512, | |
| "grad_norm": 5.67435884475708, | |
| "learning_rate": 0.00018734895893793733, | |
| "loss": 21.9875, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.20261179263949347, | |
| "grad_norm": 8.888544082641602, | |
| "learning_rate": 0.000186504711953482, | |
| "loss": 21.9139, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.2152750296794618, | |
| "grad_norm": 5.154353141784668, | |
| "learning_rate": 0.0001856604649690267, | |
| "loss": 21.8646, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.22793826671943015, | |
| "grad_norm": 8.800929069519043, | |
| "learning_rate": 0.0001848162179845714, | |
| "loss": 21.8461, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.24060150375939848, | |
| "grad_norm": 8.013294219970703, | |
| "learning_rate": 0.0001839719710001161, | |
| "loss": 21.8447, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.25326474079936684, | |
| "grad_norm": 5.175795078277588, | |
| "learning_rate": 0.0001831277240156608, | |
| "loss": 21.8885, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.25326474079936684, | |
| "eval_loss": 5.471480846405029, | |
| "eval_runtime": 827.0752, | |
| "eval_samples_per_second": 8.04, | |
| "eval_steps_per_second": 8.04, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.2659279778393352, | |
| "grad_norm": 9.457793235778809, | |
| "learning_rate": 0.0001822834770312055, | |
| "loss": 21.8034, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.2785912148793035, | |
| "grad_norm": 8.597488403320312, | |
| "learning_rate": 0.00018143923004675019, | |
| "loss": 21.7742, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.29125445191927185, | |
| "grad_norm": 6.178262233734131, | |
| "learning_rate": 0.00018059498306229488, | |
| "loss": 21.6497, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.3039176889592402, | |
| "grad_norm": 4.998334884643555, | |
| "learning_rate": 0.00017975073607783958, | |
| "loss": 21.6025, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.31658092599920856, | |
| "grad_norm": 3.7733397483825684, | |
| "learning_rate": 0.00017890648909338428, | |
| "loss": 21.7662, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.32924416303917686, | |
| "grad_norm": 4.3571248054504395, | |
| "learning_rate": 0.00017806224210892898, | |
| "loss": 21.724, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.3419074000791452, | |
| "grad_norm": 4.92628288269043, | |
| "learning_rate": 0.00017721799512447368, | |
| "loss": 21.6431, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.3545706371191136, | |
| "grad_norm": 6.349125385284424, | |
| "learning_rate": 0.00017637374814001837, | |
| "loss": 21.7532, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.36723387415908193, | |
| "grad_norm": 5.166254043579102, | |
| "learning_rate": 0.00017552950115556305, | |
| "loss": 21.7536, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.37989711119905023, | |
| "grad_norm": 5.412994384765625, | |
| "learning_rate": 0.00017468525417110777, | |
| "loss": 21.6709, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.37989711119905023, | |
| "eval_loss": 5.415313720703125, | |
| "eval_runtime": 827.4502, | |
| "eval_samples_per_second": 8.037, | |
| "eval_steps_per_second": 8.037, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.3925603482390186, | |
| "grad_norm": 5.78619384765625, | |
| "learning_rate": 0.00017384100718665247, | |
| "loss": 21.6338, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.40522358527898694, | |
| "grad_norm": 5.001625061035156, | |
| "learning_rate": 0.00017299676020219714, | |
| "loss": 21.6746, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.4178868223189553, | |
| "grad_norm": 6.57304048538208, | |
| "learning_rate": 0.00017215251321774186, | |
| "loss": 21.5606, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.4305500593589236, | |
| "grad_norm": 4.932160377502441, | |
| "learning_rate": 0.00017130826623328656, | |
| "loss": 21.6432, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.44321329639889195, | |
| "grad_norm": 5.347546577453613, | |
| "learning_rate": 0.00017046401924883126, | |
| "loss": 21.473, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.4558765334388603, | |
| "grad_norm": 4.49334716796875, | |
| "learning_rate": 0.00016961977226437596, | |
| "loss": 21.5754, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.46853977047882867, | |
| "grad_norm": 6.437751770019531, | |
| "learning_rate": 0.00016877552527992063, | |
| "loss": 21.4601, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.48120300751879697, | |
| "grad_norm": 5.687736988067627, | |
| "learning_rate": 0.00016793127829546536, | |
| "loss": 21.6067, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.4938662445587653, | |
| "grad_norm": 4.615053653717041, | |
| "learning_rate": 0.00016708703131101005, | |
| "loss": 21.5149, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.5065294815987337, | |
| "grad_norm": 6.088917255401611, | |
| "learning_rate": 0.00016624278432655472, | |
| "loss": 21.4344, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.5065294815987337, | |
| "eval_loss": 5.379353046417236, | |
| "eval_runtime": 827.937, | |
| "eval_samples_per_second": 8.032, | |
| "eval_steps_per_second": 8.032, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.519192718638702, | |
| "grad_norm": 8.198923110961914, | |
| "learning_rate": 0.00016539853734209945, | |
| "loss": 21.6445, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.5318559556786704, | |
| "grad_norm": 4.853950500488281, | |
| "learning_rate": 0.00016455429035764415, | |
| "loss": 21.5521, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.5445191927186387, | |
| "grad_norm": 5.882175922393799, | |
| "learning_rate": 0.00016371004337318882, | |
| "loss": 21.6128, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.557182429758607, | |
| "grad_norm": 5.045036792755127, | |
| "learning_rate": 0.00016286579638873354, | |
| "loss": 21.3947, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.5698456667985754, | |
| "grad_norm": 7.406811714172363, | |
| "learning_rate": 0.00016202154940427822, | |
| "loss": 21.4895, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.5825089038385437, | |
| "grad_norm": 4.246167182922363, | |
| "learning_rate": 0.0001611773024198229, | |
| "loss": 21.4127, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.5951721408785121, | |
| "grad_norm": 6.033562183380127, | |
| "learning_rate": 0.00016033305543536764, | |
| "loss": 21.4598, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.6078353779184804, | |
| "grad_norm": 6.69214391708374, | |
| "learning_rate": 0.0001594888084509123, | |
| "loss": 21.3653, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.6204986149584487, | |
| "grad_norm": 4.1962199211120605, | |
| "learning_rate": 0.000158644561466457, | |
| "loss": 21.4043, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.6331618519984171, | |
| "grad_norm": 5.602635860443115, | |
| "learning_rate": 0.00015780031448200173, | |
| "loss": 21.3265, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.6331618519984171, | |
| "eval_loss": 5.360119342803955, | |
| "eval_runtime": 827.4824, | |
| "eval_samples_per_second": 8.036, | |
| "eval_steps_per_second": 8.036, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.6458250890383854, | |
| "grad_norm": 5.863671779632568, | |
| "learning_rate": 0.0001569560674975464, | |
| "loss": 21.4703, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.6584883260783537, | |
| "grad_norm": 4.729866027832031, | |
| "learning_rate": 0.00015611182051309113, | |
| "loss": 21.3102, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.6711515631183221, | |
| "grad_norm": 5.219459056854248, | |
| "learning_rate": 0.0001552675735286358, | |
| "loss": 21.2844, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.6838148001582904, | |
| "grad_norm": 8.284063339233398, | |
| "learning_rate": 0.0001544233265441805, | |
| "loss": 21.3088, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.6964780371982588, | |
| "grad_norm": 5.863561153411865, | |
| "learning_rate": 0.00015357907955972522, | |
| "loss": 21.4019, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.7091412742382271, | |
| "grad_norm": 4.672849655151367, | |
| "learning_rate": 0.0001527348325752699, | |
| "loss": 21.2354, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.7218045112781954, | |
| "grad_norm": 5.514325141906738, | |
| "learning_rate": 0.0001518905855908146, | |
| "loss": 21.2799, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.7344677483181639, | |
| "grad_norm": 4.2964186668396, | |
| "learning_rate": 0.00015104633860635932, | |
| "loss": 21.2922, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.7471309853581322, | |
| "grad_norm": 6.241255283355713, | |
| "learning_rate": 0.000150202091621904, | |
| "loss": 21.3237, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.7597942223981005, | |
| "grad_norm": 5.528208255767822, | |
| "learning_rate": 0.0001493578446374487, | |
| "loss": 21.3914, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.7597942223981005, | |
| "eval_loss": 5.337831974029541, | |
| "eval_runtime": 827.9109, | |
| "eval_samples_per_second": 8.032, | |
| "eval_steps_per_second": 8.032, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.7724574594380689, | |
| "grad_norm": 6.572064399719238, | |
| "learning_rate": 0.00014851359765299339, | |
| "loss": 21.2885, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.7851206964780372, | |
| "grad_norm": 6.282362937927246, | |
| "learning_rate": 0.00014766935066853808, | |
| "loss": 21.2727, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.7977839335180056, | |
| "grad_norm": 6.260461807250977, | |
| "learning_rate": 0.00014682510368408278, | |
| "loss": 21.2631, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.8104471705579739, | |
| "grad_norm": 5.576934814453125, | |
| "learning_rate": 0.00014598085669962748, | |
| "loss": 21.2312, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.8231104075979422, | |
| "grad_norm": 7.043119430541992, | |
| "learning_rate": 0.00014513660971517218, | |
| "loss": 21.2092, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.8357736446379106, | |
| "grad_norm": 4.880777835845947, | |
| "learning_rate": 0.00014429236273071688, | |
| "loss": 21.2742, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.8484368816778789, | |
| "grad_norm": 4.317328929901123, | |
| "learning_rate": 0.00014344811574626157, | |
| "loss": 21.3076, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.8611001187178472, | |
| "grad_norm": 6.527237415313721, | |
| "learning_rate": 0.00014260386876180627, | |
| "loss": 21.1922, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.8737633557578156, | |
| "grad_norm": 5.896966457366943, | |
| "learning_rate": 0.00014175962177735097, | |
| "loss": 21.296, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.8864265927977839, | |
| "grad_norm": 4.883022785186768, | |
| "learning_rate": 0.00014091537479289567, | |
| "loss": 21.3216, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.8864265927977839, | |
| "eval_loss": 5.316357135772705, | |
| "eval_runtime": 827.6596, | |
| "eval_samples_per_second": 8.035, | |
| "eval_steps_per_second": 8.035, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.8990898298377523, | |
| "grad_norm": 4.470438480377197, | |
| "learning_rate": 0.00014007112780844037, | |
| "loss": 21.1219, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.9117530668777206, | |
| "grad_norm": 4.960896015167236, | |
| "learning_rate": 0.00013922688082398506, | |
| "loss": 21.2995, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.9244163039176889, | |
| "grad_norm": 4.033212184906006, | |
| "learning_rate": 0.00013838263383952976, | |
| "loss": 21.2332, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.9370795409576573, | |
| "grad_norm": 5.585933685302734, | |
| "learning_rate": 0.00013753838685507446, | |
| "loss": 21.1877, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.9497427779976256, | |
| "grad_norm": 5.022076606750488, | |
| "learning_rate": 0.00013669413987061916, | |
| "loss": 21.2318, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.9624060150375939, | |
| "grad_norm": 6.21948766708374, | |
| "learning_rate": 0.00013584989288616386, | |
| "loss": 21.2636, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.9750692520775623, | |
| "grad_norm": 4.889180660247803, | |
| "learning_rate": 0.00013500564590170855, | |
| "loss": 21.2175, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.9877324891175306, | |
| "grad_norm": 4.395177364349365, | |
| "learning_rate": 0.00013416139891725325, | |
| "loss": 21.0757, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.000379897111199, | |
| "grad_norm": 4.519713878631592, | |
| "learning_rate": 0.00013331715193279795, | |
| "loss": 21.2397, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.0130431341511674, | |
| "grad_norm": 7.355111598968506, | |
| "learning_rate": 0.00013247290494834262, | |
| "loss": 20.7296, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.0130431341511674, | |
| "eval_loss": 5.2997307777404785, | |
| "eval_runtime": 827.4677, | |
| "eval_samples_per_second": 8.037, | |
| "eval_steps_per_second": 8.037, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.0257063711911358, | |
| "grad_norm": 5.514514446258545, | |
| "learning_rate": 0.00013162865796388735, | |
| "loss": 20.8263, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.038369608231104, | |
| "grad_norm": 5.889220714569092, | |
| "learning_rate": 0.00013078441097943205, | |
| "loss": 20.7564, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.0510328452710724, | |
| "grad_norm": 4.103857517242432, | |
| "learning_rate": 0.00012994016399497672, | |
| "loss": 20.6965, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.0636960823110408, | |
| "grad_norm": 5.945840358734131, | |
| "learning_rate": 0.00012909591701052144, | |
| "loss": 20.7703, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.076359319351009, | |
| "grad_norm": 5.632152557373047, | |
| "learning_rate": 0.00012825167002606614, | |
| "loss": 20.7298, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.0890225563909774, | |
| "grad_norm": 4.992783069610596, | |
| "learning_rate": 0.0001274074230416108, | |
| "loss": 20.7289, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.1016857934309459, | |
| "grad_norm": 5.6489338874816895, | |
| "learning_rate": 0.00012656317605715554, | |
| "loss": 20.8587, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.114349030470914, | |
| "grad_norm": 5.924511432647705, | |
| "learning_rate": 0.0001257189290727002, | |
| "loss": 20.6762, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.1270122675108825, | |
| "grad_norm": 7.25151252746582, | |
| "learning_rate": 0.00012487468208824493, | |
| "loss": 20.7874, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.1396755045508509, | |
| "grad_norm": 7.074513912200928, | |
| "learning_rate": 0.00012403043510378963, | |
| "loss": 20.7852, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.1396755045508509, | |
| "eval_loss": 5.293835639953613, | |
| "eval_runtime": 827.451, | |
| "eval_samples_per_second": 8.037, | |
| "eval_steps_per_second": 8.037, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.152338741590819, | |
| "grad_norm": 5.452646732330322, | |
| "learning_rate": 0.0001231861881193343, | |
| "loss": 20.7661, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.1650019786307875, | |
| "grad_norm": 9.404861450195312, | |
| "learning_rate": 0.00012234194113487903, | |
| "loss": 20.7723, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.1776652156707559, | |
| "grad_norm": 8.0953950881958, | |
| "learning_rate": 0.00012149769415042371, | |
| "loss": 20.7162, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.190328452710724, | |
| "grad_norm": 8.188663482666016, | |
| "learning_rate": 0.00012065344716596841, | |
| "loss": 20.7402, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.2029916897506925, | |
| "grad_norm": 6.381914138793945, | |
| "learning_rate": 0.00011980920018151312, | |
| "loss": 20.6441, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.215654926790661, | |
| "grad_norm": 6.439055442810059, | |
| "learning_rate": 0.0001189649531970578, | |
| "loss": 20.7694, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.2283181638306293, | |
| "grad_norm": 5.406436443328857, | |
| "learning_rate": 0.00011812070621260249, | |
| "loss": 20.8294, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.2409814008705975, | |
| "grad_norm": 5.663506507873535, | |
| "learning_rate": 0.0001172764592281472, | |
| "loss": 20.6368, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.253644637910566, | |
| "grad_norm": 7.003817558288574, | |
| "learning_rate": 0.0001164322122436919, | |
| "loss": 20.7311, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.266307874950534, | |
| "grad_norm": 5.4320783615112305, | |
| "learning_rate": 0.00011558796525923658, | |
| "loss": 20.6077, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.266307874950534, | |
| "eval_loss": 5.290070533752441, | |
| "eval_runtime": 828.016, | |
| "eval_samples_per_second": 8.031, | |
| "eval_steps_per_second": 8.031, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.2789711119905025, | |
| "grad_norm": 5.977424144744873, | |
| "learning_rate": 0.0001147437182747813, | |
| "loss": 20.807, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 1.291634349030471, | |
| "grad_norm": 4.140133857727051, | |
| "learning_rate": 0.000113899471290326, | |
| "loss": 20.692, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 1.3042975860704393, | |
| "grad_norm": 8.311481475830078, | |
| "learning_rate": 0.00011305522430587068, | |
| "loss": 20.8168, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 1.3169608231104075, | |
| "grad_norm": 5.127328872680664, | |
| "learning_rate": 0.00011221097732141539, | |
| "loss": 20.7075, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 1.329624060150376, | |
| "grad_norm": 5.829504013061523, | |
| "learning_rate": 0.00011136673033696008, | |
| "loss": 20.6994, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.3422872971903443, | |
| "grad_norm": 10.939740180969238, | |
| "learning_rate": 0.00011052248335250479, | |
| "loss": 20.6369, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 1.3549505342303125, | |
| "grad_norm": 7.285661697387695, | |
| "learning_rate": 0.00010967823636804948, | |
| "loss": 20.765, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 1.367613771270281, | |
| "grad_norm": 6.9708781242370605, | |
| "learning_rate": 0.00010883398938359417, | |
| "loss": 20.6949, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 1.3802770083102494, | |
| "grad_norm": 3.8849053382873535, | |
| "learning_rate": 0.00010798974239913888, | |
| "loss": 20.775, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 1.3929402453502178, | |
| "grad_norm": 5.512059211730957, | |
| "learning_rate": 0.00010714549541468357, | |
| "loss": 20.6644, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.3929402453502178, | |
| "eval_loss": 5.2762451171875, | |
| "eval_runtime": 828.4019, | |
| "eval_samples_per_second": 8.028, | |
| "eval_steps_per_second": 8.028, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.405603482390186, | |
| "grad_norm": 5.7364583015441895, | |
| "learning_rate": 0.00010630124843022826, | |
| "loss": 20.7325, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 1.4182667194301544, | |
| "grad_norm": 6.881465911865234, | |
| "learning_rate": 0.00010545700144577298, | |
| "loss": 20.7941, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 1.4309299564701226, | |
| "grad_norm": 4.779888153076172, | |
| "learning_rate": 0.00010461275446131766, | |
| "loss": 20.7267, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 1.443593193510091, | |
| "grad_norm": 8.492117881774902, | |
| "learning_rate": 0.00010376850747686236, | |
| "loss": 20.7508, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 1.4562564305500594, | |
| "grad_norm": 4.9288458824157715, | |
| "learning_rate": 0.00010292426049240707, | |
| "loss": 20.6607, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.4689196675900278, | |
| "grad_norm": 5.2882819175720215, | |
| "learning_rate": 0.00010208001350795175, | |
| "loss": 20.7009, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 1.481582904629996, | |
| "grad_norm": 6.698110103607178, | |
| "learning_rate": 0.00010123576652349644, | |
| "loss": 20.6642, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 1.4942461416699644, | |
| "grad_norm": 6.440258026123047, | |
| "learning_rate": 0.00010039151953904115, | |
| "loss": 20.6448, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 1.5069093787099326, | |
| "grad_norm": 5.870398998260498, | |
| "learning_rate": 9.954727255458585e-05, | |
| "loss": 20.7483, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 1.519572615749901, | |
| "grad_norm": 5.033831596374512, | |
| "learning_rate": 9.870302557013055e-05, | |
| "loss": 20.7524, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.519572615749901, | |
| "eval_loss": 5.262022972106934, | |
| "eval_runtime": 825.2873, | |
| "eval_samples_per_second": 8.058, | |
| "eval_steps_per_second": 8.058, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.5322358527898694, | |
| "grad_norm": 6.174431324005127, | |
| "learning_rate": 9.785877858567524e-05, | |
| "loss": 20.5863, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 1.5448990898298378, | |
| "grad_norm": 7.077877521514893, | |
| "learning_rate": 9.701453160121994e-05, | |
| "loss": 20.5485, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 1.5575623268698062, | |
| "grad_norm": 5.598957538604736, | |
| "learning_rate": 9.617028461676464e-05, | |
| "loss": 20.714, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 1.5702255639097744, | |
| "grad_norm": 6.939237594604492, | |
| "learning_rate": 9.532603763230934e-05, | |
| "loss": 20.7646, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 1.5828888009497428, | |
| "grad_norm": 5.701028823852539, | |
| "learning_rate": 9.448179064785402e-05, | |
| "loss": 20.756, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.595552037989711, | |
| "grad_norm": 8.093730926513672, | |
| "learning_rate": 9.363754366339874e-05, | |
| "loss": 20.5889, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 1.6082152750296794, | |
| "grad_norm": 5.6987738609313965, | |
| "learning_rate": 9.279329667894343e-05, | |
| "loss": 20.6295, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 1.6208785120696478, | |
| "grad_norm": 6.735232830047607, | |
| "learning_rate": 9.194904969448813e-05, | |
| "loss": 20.6727, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 1.6335417491096162, | |
| "grad_norm": 6.842558860778809, | |
| "learning_rate": 9.110480271003282e-05, | |
| "loss": 20.6825, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 1.6462049861495844, | |
| "grad_norm": 7.107422351837158, | |
| "learning_rate": 9.026055572557753e-05, | |
| "loss": 20.5772, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.6462049861495844, | |
| "eval_loss": 5.254425525665283, | |
| "eval_runtime": 824.9539, | |
| "eval_samples_per_second": 8.061, | |
| "eval_steps_per_second": 8.061, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.6588682231895528, | |
| "grad_norm": 4.229997634887695, | |
| "learning_rate": 8.941630874112223e-05, | |
| "loss": 20.6556, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 1.671531460229521, | |
| "grad_norm": 7.964229583740234, | |
| "learning_rate": 8.857206175666691e-05, | |
| "loss": 20.6154, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 1.6841946972694894, | |
| "grad_norm": 5.777505397796631, | |
| "learning_rate": 8.772781477221161e-05, | |
| "loss": 20.6021, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 1.6968579343094579, | |
| "grad_norm": 5.017460346221924, | |
| "learning_rate": 8.688356778775632e-05, | |
| "loss": 20.7124, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 1.7095211713494263, | |
| "grad_norm": 6.260605812072754, | |
| "learning_rate": 8.6039320803301e-05, | |
| "loss": 20.5744, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.7221844083893947, | |
| "grad_norm": 6.316308975219727, | |
| "learning_rate": 8.51950738188457e-05, | |
| "loss": 20.5549, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 1.7348476454293629, | |
| "grad_norm": 5.088778972625732, | |
| "learning_rate": 8.43508268343904e-05, | |
| "loss": 20.6309, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 1.747510882469331, | |
| "grad_norm": 10.32010555267334, | |
| "learning_rate": 8.350657984993511e-05, | |
| "loss": 20.6552, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 1.7601741195092995, | |
| "grad_norm": 6.9819016456604, | |
| "learning_rate": 8.26623328654798e-05, | |
| "loss": 20.6702, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 1.7728373565492679, | |
| "grad_norm": 5.110126972198486, | |
| "learning_rate": 8.18180858810245e-05, | |
| "loss": 20.5999, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.7728373565492679, | |
| "eval_loss": 5.242661952972412, | |
| "eval_runtime": 824.7934, | |
| "eval_samples_per_second": 8.063, | |
| "eval_steps_per_second": 8.063, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.7855005935892363, | |
| "grad_norm": 4.659190654754639, | |
| "learning_rate": 8.09738388965692e-05, | |
| "loss": 20.6725, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 1.7981638306292047, | |
| "grad_norm": 7.482509136199951, | |
| "learning_rate": 8.012959191211389e-05, | |
| "loss": 20.6451, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 1.810827067669173, | |
| "grad_norm": 4.4703803062438965, | |
| "learning_rate": 7.928534492765859e-05, | |
| "loss": 20.6875, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 1.8234903047091413, | |
| "grad_norm": 5.930804252624512, | |
| "learning_rate": 7.844109794320329e-05, | |
| "loss": 20.6306, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 1.8361535417491095, | |
| "grad_norm": 6.960028648376465, | |
| "learning_rate": 7.759685095874799e-05, | |
| "loss": 20.5419, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.848816778789078, | |
| "grad_norm": 5.914958953857422, | |
| "learning_rate": 7.675260397429268e-05, | |
| "loss": 20.5981, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 1.8614800158290463, | |
| "grad_norm": 4.671518802642822, | |
| "learning_rate": 7.590835698983738e-05, | |
| "loss": 20.6456, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 1.8741432528690147, | |
| "grad_norm": 6.006710529327393, | |
| "learning_rate": 7.506411000538208e-05, | |
| "loss": 20.5486, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 1.8868064899089831, | |
| "grad_norm": 5.438353538513184, | |
| "learning_rate": 7.421986302092678e-05, | |
| "loss": 20.6253, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 1.8994697269489513, | |
| "grad_norm": 4.497569561004639, | |
| "learning_rate": 7.337561603647148e-05, | |
| "loss": 20.5851, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.8994697269489513, | |
| "eval_loss": 5.23079776763916, | |
| "eval_runtime": 824.4065, | |
| "eval_samples_per_second": 8.066, | |
| "eval_steps_per_second": 8.066, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.9121329639889195, | |
| "grad_norm": 6.200584888458252, | |
| "learning_rate": 7.253136905201617e-05, | |
| "loss": 20.4825, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 1.924796201028888, | |
| "grad_norm": 7.205463886260986, | |
| "learning_rate": 7.168712206756086e-05, | |
| "loss": 20.4809, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 1.9374594380688563, | |
| "grad_norm": 6.1208367347717285, | |
| "learning_rate": 7.084287508310557e-05, | |
| "loss": 20.604, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 1.9501226751088248, | |
| "grad_norm": 8.871358871459961, | |
| "learning_rate": 6.999862809865027e-05, | |
| "loss": 20.53, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 1.9627859121487932, | |
| "grad_norm": 5.655535697937012, | |
| "learning_rate": 6.915438111419497e-05, | |
| "loss": 20.5554, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.9754491491887614, | |
| "grad_norm": 5.66743278503418, | |
| "learning_rate": 6.831013412973965e-05, | |
| "loss": 20.5893, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 1.9881123862287298, | |
| "grad_norm": 5.920203685760498, | |
| "learning_rate": 6.746588714528435e-05, | |
| "loss": 20.5351, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 2.000759794222398, | |
| "grad_norm": 5.4713592529296875, | |
| "learning_rate": 6.662164016082906e-05, | |
| "loss": 20.4346, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 2.0134230312623664, | |
| "grad_norm": 6.8507866859436035, | |
| "learning_rate": 6.577739317637375e-05, | |
| "loss": 19.9368, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 2.026086268302335, | |
| "grad_norm": 5.996831893920898, | |
| "learning_rate": 6.493314619191844e-05, | |
| "loss": 20.1004, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.026086268302335, | |
| "eval_loss": 5.241866588592529, | |
| "eval_runtime": 824.5405, | |
| "eval_samples_per_second": 8.065, | |
| "eval_steps_per_second": 8.065, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.0387495053423033, | |
| "grad_norm": 8.938416481018066, | |
| "learning_rate": 6.408889920746314e-05, | |
| "loss": 19.9944, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 2.0514127423822717, | |
| "grad_norm": 8.623857498168945, | |
| "learning_rate": 6.324465222300784e-05, | |
| "loss": 19.9731, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 2.0640759794222396, | |
| "grad_norm": 4.697628498077393, | |
| "learning_rate": 6.240040523855254e-05, | |
| "loss": 20.1333, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 2.076739216462208, | |
| "grad_norm": 6.522459506988525, | |
| "learning_rate": 6.155615825409724e-05, | |
| "loss": 19.9999, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 2.0894024535021765, | |
| "grad_norm": 7.3197922706604, | |
| "learning_rate": 6.071191126964194e-05, | |
| "loss": 19.9933, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.102065690542145, | |
| "grad_norm": 5.75483512878418, | |
| "learning_rate": 5.986766428518663e-05, | |
| "loss": 20.0283, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 2.1147289275821133, | |
| "grad_norm": 5.822326183319092, | |
| "learning_rate": 5.902341730073133e-05, | |
| "loss": 20.0077, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 2.1273921646220817, | |
| "grad_norm": 6.054640769958496, | |
| "learning_rate": 5.8179170316276036e-05, | |
| "loss": 19.9779, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 2.1400554016620497, | |
| "grad_norm": 7.926814556121826, | |
| "learning_rate": 5.733492333182072e-05, | |
| "loss": 19.9615, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 2.152718638702018, | |
| "grad_norm": 7.559403419494629, | |
| "learning_rate": 5.6490676347365425e-05, | |
| "loss": 20.0425, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.152718638702018, | |
| "eval_loss": 5.239729881286621, | |
| "eval_runtime": 823.4127, | |
| "eval_samples_per_second": 8.076, | |
| "eval_steps_per_second": 8.076, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.1653818757419865, | |
| "grad_norm": 6.052306652069092, | |
| "learning_rate": 5.5646429362910123e-05, | |
| "loss": 19.9474, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 2.178045112781955, | |
| "grad_norm": 7.51389741897583, | |
| "learning_rate": 5.480218237845482e-05, | |
| "loss": 19.8334, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 2.1907083498219233, | |
| "grad_norm": 6.6172194480896, | |
| "learning_rate": 5.395793539399951e-05, | |
| "loss": 20.0055, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 2.2033715868618917, | |
| "grad_norm": 6.666018486022949, | |
| "learning_rate": 5.311368840954422e-05, | |
| "loss": 19.9767, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 2.21603482390186, | |
| "grad_norm": 6.328158378601074, | |
| "learning_rate": 5.2269441425088916e-05, | |
| "loss": 19.9566, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 2.228698060941828, | |
| "grad_norm": 5.255093574523926, | |
| "learning_rate": 5.142519444063361e-05, | |
| "loss": 19.956, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 2.2413612979817965, | |
| "grad_norm": 7.3458638191223145, | |
| "learning_rate": 5.0580947456178305e-05, | |
| "loss": 19.9541, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 2.254024535021765, | |
| "grad_norm": 7.968871116638184, | |
| "learning_rate": 4.9736700471723003e-05, | |
| "loss": 19.9859, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 2.2666877720617333, | |
| "grad_norm": 5.766326427459717, | |
| "learning_rate": 4.88924534872677e-05, | |
| "loss": 19.9209, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 2.2793510091017017, | |
| "grad_norm": 7.928537368774414, | |
| "learning_rate": 4.80482065028124e-05, | |
| "loss": 19.9624, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 2.2793510091017017, | |
| "eval_loss": 5.233314037322998, | |
| "eval_runtime": 824.278, | |
| "eval_samples_per_second": 8.068, | |
| "eval_steps_per_second": 8.068, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 2.29201424614167, | |
| "grad_norm": 6.768707752227783, | |
| "learning_rate": 4.72039595183571e-05, | |
| "loss": 19.9709, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 2.304677483181638, | |
| "grad_norm": 7.442516326904297, | |
| "learning_rate": 4.6359712533901796e-05, | |
| "loss": 20.0136, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 2.3173407202216065, | |
| "grad_norm": 5.536145210266113, | |
| "learning_rate": 4.5515465549446494e-05, | |
| "loss": 19.9965, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 2.330003957261575, | |
| "grad_norm": 6.3454694747924805, | |
| "learning_rate": 4.467121856499119e-05, | |
| "loss": 19.9978, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 2.3426671943015434, | |
| "grad_norm": 8.60901165008545, | |
| "learning_rate": 4.3826971580535884e-05, | |
| "loss": 19.9545, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 2.3553304313415118, | |
| "grad_norm": 4.566495895385742, | |
| "learning_rate": 4.298272459608059e-05, | |
| "loss": 19.9565, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 2.36799366838148, | |
| "grad_norm": 6.594531059265137, | |
| "learning_rate": 4.213847761162528e-05, | |
| "loss": 19.9141, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 2.380656905421448, | |
| "grad_norm": 7.312906742095947, | |
| "learning_rate": 4.1294230627169985e-05, | |
| "loss": 19.9924, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 2.3933201424614166, | |
| "grad_norm": 6.62802267074585, | |
| "learning_rate": 4.0449983642714676e-05, | |
| "loss": 19.978, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 2.405983379501385, | |
| "grad_norm": 5.666561603546143, | |
| "learning_rate": 3.9605736658259374e-05, | |
| "loss": 19.9937, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 2.405983379501385, | |
| "eval_loss": 5.228879451751709, | |
| "eval_runtime": 824.5564, | |
| "eval_samples_per_second": 8.065, | |
| "eval_steps_per_second": 8.065, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 2.4186466165413534, | |
| "grad_norm": 7.05219841003418, | |
| "learning_rate": 3.876148967380407e-05, | |
| "loss": 19.9902, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 2.431309853581322, | |
| "grad_norm": 7.524454593658447, | |
| "learning_rate": 3.791724268934877e-05, | |
| "loss": 20.0407, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 2.44397309062129, | |
| "grad_norm": 10.032590866088867, | |
| "learning_rate": 3.707299570489347e-05, | |
| "loss": 19.9683, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 2.4566363276612586, | |
| "grad_norm": 7.076017379760742, | |
| "learning_rate": 3.6228748720438167e-05, | |
| "loss": 19.985, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 2.4692995647012266, | |
| "grad_norm": 4.9333014488220215, | |
| "learning_rate": 3.538450173598286e-05, | |
| "loss": 19.9737, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 2.481962801741195, | |
| "grad_norm": 9.004124641418457, | |
| "learning_rate": 3.454025475152756e-05, | |
| "loss": 19.9352, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 2.4946260387811634, | |
| "grad_norm": 6.457568168640137, | |
| "learning_rate": 3.3696007767072254e-05, | |
| "loss": 19.959, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 2.507289275821132, | |
| "grad_norm": 6.369134902954102, | |
| "learning_rate": 3.285176078261696e-05, | |
| "loss": 19.914, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 2.5199525128611002, | |
| "grad_norm": 8.021166801452637, | |
| "learning_rate": 3.200751379816165e-05, | |
| "loss": 20.0264, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 2.532615749901068, | |
| "grad_norm": 6.305584907531738, | |
| "learning_rate": 3.116326681370635e-05, | |
| "loss": 19.8487, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 2.532615749901068, | |
| "eval_loss": 5.220189571380615, | |
| "eval_runtime": 824.5446, | |
| "eval_samples_per_second": 8.065, | |
| "eval_steps_per_second": 8.065, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 2.545278986941037, | |
| "grad_norm": 8.87743854522705, | |
| "learning_rate": 3.031901982925105e-05, | |
| "loss": 20.0282, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 2.557942223981005, | |
| "grad_norm": 7.1361188888549805, | |
| "learning_rate": 2.9474772844795745e-05, | |
| "loss": 19.8977, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 2.5706054610209734, | |
| "grad_norm": 5.905386447906494, | |
| "learning_rate": 2.8630525860340446e-05, | |
| "loss": 19.9515, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 2.583268698060942, | |
| "grad_norm": 4.580873966217041, | |
| "learning_rate": 2.778627887588514e-05, | |
| "loss": 19.9952, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 2.5959319351009102, | |
| "grad_norm": 12.230775833129883, | |
| "learning_rate": 2.6942031891429836e-05, | |
| "loss": 19.9481, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 2.6085951721408787, | |
| "grad_norm": 6.843789577484131, | |
| "learning_rate": 2.6097784906974537e-05, | |
| "loss": 19.9544, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 2.6212584091808466, | |
| "grad_norm": 6.9929280281066895, | |
| "learning_rate": 2.5253537922519232e-05, | |
| "loss": 19.9988, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 2.633921646220815, | |
| "grad_norm": 6.615355968475342, | |
| "learning_rate": 2.440929093806393e-05, | |
| "loss": 19.9229, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 2.6465848832607834, | |
| "grad_norm": 5.5568528175354, | |
| "learning_rate": 2.3565043953608628e-05, | |
| "loss": 19.9999, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 2.659248120300752, | |
| "grad_norm": 5.973674297332764, | |
| "learning_rate": 2.2720796969153326e-05, | |
| "loss": 19.9165, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 2.659248120300752, | |
| "eval_loss": 5.213447093963623, | |
| "eval_runtime": 824.012, | |
| "eval_samples_per_second": 8.07, | |
| "eval_steps_per_second": 8.07, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 2.6719113573407203, | |
| "grad_norm": 6.193792819976807, | |
| "learning_rate": 2.1876549984698024e-05, | |
| "loss": 19.9343, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 2.6845745943806887, | |
| "grad_norm": 9.161760330200195, | |
| "learning_rate": 2.1032303000242722e-05, | |
| "loss": 19.8688, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 2.697237831420657, | |
| "grad_norm": 7.71662712097168, | |
| "learning_rate": 2.0188056015787417e-05, | |
| "loss": 19.862, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 2.709901068460625, | |
| "grad_norm": 7.0517191886901855, | |
| "learning_rate": 1.9343809031332115e-05, | |
| "loss": 19.9579, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 2.7225643055005935, | |
| "grad_norm": 6.024531841278076, | |
| "learning_rate": 1.8499562046876813e-05, | |
| "loss": 19.9337, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 2.735227542540562, | |
| "grad_norm": 6.53899621963501, | |
| "learning_rate": 1.765531506242151e-05, | |
| "loss": 19.8203, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 2.7478907795805303, | |
| "grad_norm": 5.616986274719238, | |
| "learning_rate": 1.681106807796621e-05, | |
| "loss": 19.9971, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 2.7605540166204987, | |
| "grad_norm": 9.12677001953125, | |
| "learning_rate": 1.5966821093510908e-05, | |
| "loss": 19.9302, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 2.773217253660467, | |
| "grad_norm": 5.700610160827637, | |
| "learning_rate": 1.5122574109055604e-05, | |
| "loss": 19.8132, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 2.7858804907004355, | |
| "grad_norm": 14.847786903381348, | |
| "learning_rate": 1.4278327124600302e-05, | |
| "loss": 19.941, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 2.7858804907004355, | |
| "eval_loss": 5.208680152893066, | |
| "eval_runtime": 823.5502, | |
| "eval_samples_per_second": 8.075, | |
| "eval_steps_per_second": 8.075, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 2.7985437277404035, | |
| "grad_norm": 7.600852966308594, | |
| "learning_rate": 1.3434080140145e-05, | |
| "loss": 19.9477, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 2.811206964780372, | |
| "grad_norm": 4.853618621826172, | |
| "learning_rate": 1.2589833155689699e-05, | |
| "loss": 19.8732, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 2.8238702018203403, | |
| "grad_norm": 6.983166694641113, | |
| "learning_rate": 1.1745586171234395e-05, | |
| "loss": 19.891, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 2.8365334388603087, | |
| "grad_norm": 9.106974601745605, | |
| "learning_rate": 1.0901339186779093e-05, | |
| "loss": 19.8636, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 2.849196675900277, | |
| "grad_norm": 8.579305648803711, | |
| "learning_rate": 1.0057092202323791e-05, | |
| "loss": 19.9161, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 2.861859912940245, | |
| "grad_norm": 7.962963581085205, | |
| "learning_rate": 9.212845217868488e-06, | |
| "loss": 19.8788, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 2.874523149980214, | |
| "grad_norm": 7.710476875305176, | |
| "learning_rate": 8.368598233413186e-06, | |
| "loss": 19.8828, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 2.887186387020182, | |
| "grad_norm": 8.663653373718262, | |
| "learning_rate": 7.524351248957882e-06, | |
| "loss": 19.8885, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 2.8998496240601503, | |
| "grad_norm": 5.92315673828125, | |
| "learning_rate": 6.68010426450258e-06, | |
| "loss": 19.8681, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 2.9125128611001188, | |
| "grad_norm": 5.890766143798828, | |
| "learning_rate": 5.8358572800472775e-06, | |
| "loss": 19.8796, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 2.9125128611001188, | |
| "eval_loss": 5.202943325042725, | |
| "eval_runtime": 823.4591, | |
| "eval_samples_per_second": 8.076, | |
| "eval_steps_per_second": 8.076, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 2.925176098140087, | |
| "grad_norm": 5.6968770027160645, | |
| "learning_rate": 4.991610295591976e-06, | |
| "loss": 19.9215, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 2.9378393351800556, | |
| "grad_norm": 8.343860626220703, | |
| "learning_rate": 4.147363311136674e-06, | |
| "loss": 19.8819, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 2.9505025722200235, | |
| "grad_norm": 7.180873870849609, | |
| "learning_rate": 3.3031163266813706e-06, | |
| "loss": 19.9235, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 2.963165809259992, | |
| "grad_norm": 6.29449987411499, | |
| "learning_rate": 2.4588693422260683e-06, | |
| "loss": 19.8343, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 2.9758290462999604, | |
| "grad_norm": 9.834261894226074, | |
| "learning_rate": 1.6146223577707657e-06, | |
| "loss": 19.9012, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 2.988492283339929, | |
| "grad_norm": 7.410830497741699, | |
| "learning_rate": 7.703753733154634e-07, | |
| "loss": 19.8262, | |
| "step": 94400 | |
| } | |
| ], | |
| "logging_steps": 400, | |
| "max_steps": 94764, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 4000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.015885255232636e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |