{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 615,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008146639511201629,
      "grad_norm": 0.4904627755196518,
      "learning_rate": 0.0,
      "loss": 0.0296,
      "num_tokens": 468319.0,
      "step": 1
    },
    {
      "epoch": 0.016293279022403257,
      "grad_norm": 0.44639106202217055,
      "learning_rate": 5.263157894736843e-07,
      "loss": 0.0278,
      "num_tokens": 931744.0,
      "step": 2
    },
    {
      "epoch": 0.024439918533604887,
      "grad_norm": 0.5235906052705608,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 0.0346,
      "num_tokens": 1382492.0,
      "step": 3
    },
    {
      "epoch": 0.032586558044806514,
      "grad_norm": 0.48829378532794426,
      "learning_rate": 1.5789473684210526e-06,
      "loss": 0.0298,
      "num_tokens": 1822837.0,
      "step": 4
    },
    {
      "epoch": 0.04073319755600815,
      "grad_norm": 0.46192310755459265,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.0282,
      "num_tokens": 2324341.0,
      "step": 5
    },
    {
      "epoch": 0.048879837067209775,
      "grad_norm": 0.42590423154372875,
      "learning_rate": 2.631578947368421e-06,
      "loss": 0.0259,
      "num_tokens": 2786402.0,
      "step": 6
    },
    {
      "epoch": 0.05702647657841141,
      "grad_norm": 0.3780878258784539,
      "learning_rate": 3.157894736842105e-06,
      "loss": 0.0257,
      "num_tokens": 3249490.0,
      "step": 7
    },
    {
      "epoch": 0.06517311608961303,
      "grad_norm": 0.3621520375009199,
      "learning_rate": 3.6842105263157896e-06,
      "loss": 0.0244,
      "num_tokens": 3691588.0,
      "step": 8
    },
    {
      "epoch": 0.07331975560081466,
      "grad_norm": 0.34785014992590463,
      "learning_rate": 4.210526315789474e-06,
      "loss": 0.0243,
      "num_tokens": 4145266.0,
      "step": 9
    },
    {
      "epoch": 0.0814663951120163,
      "grad_norm": 0.4866514034362246,
      "learning_rate": 4.736842105263158e-06,
      "loss": 0.0283,
      "num_tokens": 4589804.0,
      "step": 10
    },
    {
      "epoch": 0.08961303462321792,
      "grad_norm": 0.6175459481284201,
      "learning_rate": 5.263157894736842e-06,
      "loss": 0.0314,
      "num_tokens": 5028241.0,
      "step": 11
    },
    {
      "epoch": 0.09775967413441955,
      "grad_norm": 0.6496287922379511,
      "learning_rate": 5.789473684210527e-06,
      "loss": 0.0345,
      "num_tokens": 5485301.0,
      "step": 12
    },
    {
      "epoch": 0.10590631364562118,
      "grad_norm": 0.6359630764639106,
      "learning_rate": 6.31578947368421e-06,
      "loss": 0.0325,
      "num_tokens": 5919605.0,
      "step": 13
    },
    {
      "epoch": 0.11405295315682282,
      "grad_norm": 0.5165849518665486,
      "learning_rate": 6.842105263157896e-06,
      "loss": 0.027,
      "num_tokens": 6397578.0,
      "step": 14
    },
    {
      "epoch": 0.12219959266802444,
      "grad_norm": 0.5201523052382824,
      "learning_rate": 7.368421052631579e-06,
      "loss": 0.034,
      "num_tokens": 6859276.0,
      "step": 15
    },
    {
      "epoch": 0.13034623217922606,
      "grad_norm": 0.46155792727423955,
      "learning_rate": 7.894736842105265e-06,
      "loss": 0.0301,
      "num_tokens": 7339093.0,
      "step": 16
    },
    {
      "epoch": 0.1384928716904277,
      "grad_norm": 0.42614925852564395,
      "learning_rate": 8.421052631578948e-06,
      "loss": 0.03,
      "num_tokens": 7829130.0,
      "step": 17
    },
    {
      "epoch": 0.14663951120162932,
      "grad_norm": 0.4234114672689651,
      "learning_rate": 8.947368421052632e-06,
      "loss": 0.0318,
      "num_tokens": 8284433.0,
      "step": 18
    },
    {
      "epoch": 0.15478615071283094,
      "grad_norm": 0.4251114371201452,
      "learning_rate": 9.473684210526315e-06,
      "loss": 0.0303,
      "num_tokens": 8720891.0,
      "step": 19
    },
    {
      "epoch": 0.1629327902240326,
      "grad_norm": 0.40955870197241373,
      "learning_rate": 1e-05,
      "loss": 0.0297,
      "num_tokens": 9262899.0,
      "step": 20
    },
    {
      "epoch": 0.1710794297352342,
      "grad_norm": 0.4087963920152161,
      "learning_rate": 9.999937484351817e-06,
      "loss": 0.0306,
      "num_tokens": 9716643.0,
      "step": 21
    },
    {
      "epoch": 0.17922606924643583,
      "grad_norm": 0.3693398530680141,
      "learning_rate": 9.999749939144244e-06,
      "loss": 0.0298,
      "num_tokens": 10180183.0,
      "step": 22
    },
    {
      "epoch": 0.18737270875763748,
      "grad_norm": 0.41806848178827605,
      "learning_rate": 9.99943736958818e-06,
      "loss": 0.0321,
      "num_tokens": 10631870.0,
      "step": 23
    },
    {
      "epoch": 0.1955193482688391,
      "grad_norm": 0.3809861049653307,
      "learning_rate": 9.998999784368282e-06,
      "loss": 0.0309,
      "num_tokens": 11100352.0,
      "step": 24
    },
    {
      "epoch": 0.20366598778004075,
      "grad_norm": 0.39880645680339244,
      "learning_rate": 9.99843719564274e-06,
      "loss": 0.0352,
      "num_tokens": 11561399.0,
      "step": 25
    },
    {
      "epoch": 0.21181262729124237,
      "grad_norm": 0.35747673914837313,
      "learning_rate": 9.997749619042932e-06,
      "loss": 0.0302,
      "num_tokens": 12052978.0,
      "step": 26
    },
    {
      "epoch": 0.219959266802444,
      "grad_norm": 0.3646974156800663,
      "learning_rate": 9.996937073672988e-06,
      "loss": 0.0326,
      "num_tokens": 12510505.0,
      "step": 27
    },
    {
      "epoch": 0.22810590631364563,
      "grad_norm": 0.37781095340314147,
      "learning_rate": 9.995999582109266e-06,
      "loss": 0.0329,
      "num_tokens": 12972726.0,
      "step": 28
    },
    {
      "epoch": 0.23625254582484725,
      "grad_norm": 0.32601449306874525,
      "learning_rate": 9.994937170399715e-06,
      "loss": 0.0338,
      "num_tokens": 13415015.0,
      "step": 29
    },
    {
      "epoch": 0.24439918533604887,
      "grad_norm": 0.3124719848123855,
      "learning_rate": 9.993749868063162e-06,
      "loss": 0.0321,
      "num_tokens": 13862924.0,
      "step": 30
    },
    {
      "epoch": 0.2525458248472505,
      "grad_norm": 0.33624412669801873,
      "learning_rate": 9.992437708088487e-06,
      "loss": 0.0343,
      "num_tokens": 14336744.0,
      "step": 31
    },
    {
      "epoch": 0.2606924643584521,
      "grad_norm": 0.2822892406548645,
      "learning_rate": 9.991000726933702e-06,
      "loss": 0.0317,
      "num_tokens": 14787461.0,
      "step": 32
    },
    {
      "epoch": 0.26883910386965376,
      "grad_norm": 0.3141509965138737,
      "learning_rate": 9.989438964524943e-06,
      "loss": 0.0348,
      "num_tokens": 15260166.0,
      "step": 33
    },
    {
      "epoch": 0.2769857433808554,
      "grad_norm": 0.28289872142233236,
      "learning_rate": 9.987752464255365e-06,
      "loss": 0.0328,
      "num_tokens": 15716455.0,
      "step": 34
    },
    {
      "epoch": 0.285132382892057,
      "grad_norm": 0.2988023355786341,
      "learning_rate": 9.98594127298392e-06,
      "loss": 0.0327,
      "num_tokens": 16208303.0,
      "step": 35
    },
    {
      "epoch": 0.29327902240325865,
      "grad_norm": 0.2827377533678326,
      "learning_rate": 9.984005441034079e-06,
      "loss": 0.0316,
      "num_tokens": 16661734.0,
      "step": 36
    },
    {
      "epoch": 0.3014256619144603,
      "grad_norm": 0.3277891610784485,
      "learning_rate": 9.981945022192412e-06,
      "loss": 0.0363,
      "num_tokens": 17117668.0,
      "step": 37
    },
    {
      "epoch": 0.3095723014256619,
      "grad_norm": 0.29741031623668746,
      "learning_rate": 9.979760073707106e-06,
      "loss": 0.0322,
      "num_tokens": 17568922.0,
      "step": 38
    },
    {
      "epoch": 0.31771894093686354,
      "grad_norm": 0.28575184625841077,
      "learning_rate": 9.977450656286371e-06,
      "loss": 0.0317,
      "num_tokens": 18032936.0,
      "step": 39
    },
    {
      "epoch": 0.3258655804480652,
      "grad_norm": 0.28459652313484274,
      "learning_rate": 9.97501683409675e-06,
      "loss": 0.0334,
      "num_tokens": 18462483.0,
      "step": 40
    },
    {
      "epoch": 0.3340122199592668,
      "grad_norm": 0.2853320257498706,
      "learning_rate": 9.972458674761347e-06,
      "loss": 0.0325,
      "num_tokens": 18918154.0,
      "step": 41
    },
    {
      "epoch": 0.3421588594704684,
      "grad_norm": 0.31245376505929573,
      "learning_rate": 9.96977624935793e-06,
      "loss": 0.0356,
      "num_tokens": 19392456.0,
      "step": 42
    },
    {
      "epoch": 0.35030549898167007,
      "grad_norm": 0.29339121491288905,
      "learning_rate": 9.96696963241697e-06,
      "loss": 0.0358,
      "num_tokens": 19864410.0,
      "step": 43
    },
    {
      "epoch": 0.35845213849287166,
      "grad_norm": 0.308001575808903,
      "learning_rate": 9.964038901919573e-06,
      "loss": 0.0344,
      "num_tokens": 20325616.0,
      "step": 44
    },
    {
      "epoch": 0.3665987780040733,
      "grad_norm": 0.29777121599268264,
      "learning_rate": 9.9609841392953e-06,
      "loss": 0.0361,
      "num_tokens": 20754956.0,
      "step": 45
    },
    {
      "epoch": 0.37474541751527496,
      "grad_norm": 0.27446985734348617,
      "learning_rate": 9.95780542941991e-06,
      "loss": 0.0367,
      "num_tokens": 21197697.0,
      "step": 46
    },
    {
      "epoch": 0.38289205702647655,
      "grad_norm": 0.2723208448567585,
      "learning_rate": 9.954502860613011e-06,
      "loss": 0.0355,
      "num_tokens": 21644714.0,
      "step": 47
    },
    {
      "epoch": 0.3910386965376782,
      "grad_norm": 0.34829072831093,
      "learning_rate": 9.951076524635593e-06,
      "loss": 0.0343,
      "num_tokens": 22094029.0,
      "step": 48
    },
    {
      "epoch": 0.39918533604887985,
      "grad_norm": 0.2633667374393046,
      "learning_rate": 9.947526516687484e-06,
      "loss": 0.0342,
      "num_tokens": 22577438.0,
      "step": 49
    },
    {
      "epoch": 0.4073319755600815,
      "grad_norm": 0.2781504189612014,
      "learning_rate": 9.943852935404706e-06,
      "loss": 0.0356,
      "num_tokens": 23046436.0,
      "step": 50
    },
    {
      "epoch": 0.4154786150712831,
      "grad_norm": 0.29581469873784194,
      "learning_rate": 9.940055882856734e-06,
      "loss": 0.038,
      "num_tokens": 23498243.0,
      "step": 51
    },
    {
      "epoch": 0.42362525458248473,
      "grad_norm": 0.2656899667965322,
      "learning_rate": 9.936135464543652e-06,
      "loss": 0.0347,
      "num_tokens": 23972330.0,
      "step": 52
    },
    {
      "epoch": 0.4317718940936864,
      "grad_norm": 0.2543418233162407,
      "learning_rate": 9.93209178939324e-06,
      "loss": 0.0341,
      "num_tokens": 24453685.0,
      "step": 53
    },
    {
      "epoch": 0.439918533604888,
      "grad_norm": 0.25163009959008703,
      "learning_rate": 9.927924969757926e-06,
      "loss": 0.034,
      "num_tokens": 24926242.0,
      "step": 54
    },
    {
      "epoch": 0.4480651731160896,
      "grad_norm": 0.2530048416696052,
      "learning_rate": 9.923635121411683e-06,
      "loss": 0.0341,
      "num_tokens": 25365241.0,
      "step": 55
    },
    {
      "epoch": 0.45621181262729127,
      "grad_norm": 0.2591530319599859,
      "learning_rate": 9.919222363546797e-06,
      "loss": 0.0353,
      "num_tokens": 25833971.0,
      "step": 56
    },
    {
      "epoch": 0.46435845213849286,
      "grad_norm": 0.23005642120058867,
      "learning_rate": 9.914686818770567e-06,
      "loss": 0.0328,
      "num_tokens": 26279628.0,
      "step": 57
    },
    {
      "epoch": 0.4725050916496945,
      "grad_norm": 0.2612401425726277,
      "learning_rate": 9.910028613101888e-06,
      "loss": 0.0343,
      "num_tokens": 26734776.0,
      "step": 58
    },
    {
      "epoch": 0.48065173116089616,
      "grad_norm": 0.25501336518012946,
      "learning_rate": 9.905247875967764e-06,
      "loss": 0.035,
      "num_tokens": 27206001.0,
      "step": 59
    },
    {
      "epoch": 0.48879837067209775,
      "grad_norm": 0.25907516477795234,
      "learning_rate": 9.900344740199691e-06,
      "loss": 0.0342,
      "num_tokens": 27647448.0,
      "step": 60
    },
    {
      "epoch": 0.4969450101832994,
      "grad_norm": 0.2627756492187737,
      "learning_rate": 9.895319342029992e-06,
      "loss": 0.0352,
      "num_tokens": 28116087.0,
      "step": 61
    },
    {
      "epoch": 0.505091649694501,
      "grad_norm": 0.2520744974011735,
      "learning_rate": 9.890171821088006e-06,
      "loss": 0.034,
      "num_tokens": 28556029.0,
      "step": 62
    },
    {
      "epoch": 0.5132382892057027,
      "grad_norm": 0.25566988242695377,
      "learning_rate": 9.884902320396228e-06,
      "loss": 0.0345,
      "num_tokens": 29003546.0,
      "step": 63
    },
    {
      "epoch": 0.5213849287169042,
      "grad_norm": 0.26761657061201327,
      "learning_rate": 9.879510986366321e-06,
      "loss": 0.0386,
      "num_tokens": 29464833.0,
      "step": 64
    },
    {
      "epoch": 0.5295315682281059,
      "grad_norm": 0.25151679573138824,
      "learning_rate": 9.873997968795066e-06,
      "loss": 0.0361,
      "num_tokens": 29908906.0,
      "step": 65
    },
    {
      "epoch": 0.5376782077393075,
      "grad_norm": 0.25192725491977325,
      "learning_rate": 9.868363420860176e-06,
      "loss": 0.0363,
      "num_tokens": 30339618.0,
      "step": 66
    },
    {
      "epoch": 0.5458248472505092,
      "grad_norm": 0.2558097074022343,
      "learning_rate": 9.86260749911606e-06,
      "loss": 0.0359,
      "num_tokens": 30798302.0,
      "step": 67
    },
    {
      "epoch": 0.5539714867617108,
      "grad_norm": 0.23903896250926235,
      "learning_rate": 9.856730363489465e-06,
      "loss": 0.0321,
      "num_tokens": 31270382.0,
      "step": 68
    },
    {
      "epoch": 0.5621181262729125,
      "grad_norm": 0.23678636099022307,
      "learning_rate": 9.85073217727503e-06,
      "loss": 0.0332,
      "num_tokens": 31743990.0,
      "step": 69
    },
    {
      "epoch": 0.570264765784114,
      "grad_norm": 0.2718182538363666,
      "learning_rate": 9.844613107130758e-06,
      "loss": 0.0377,
      "num_tokens": 32188589.0,
      "step": 70
    },
    {
      "epoch": 0.5784114052953157,
      "grad_norm": 0.2447806090135222,
      "learning_rate": 9.838373323073376e-06,
      "loss": 0.0335,
      "num_tokens": 32654341.0,
      "step": 71
    },
    {
      "epoch": 0.5865580448065173,
      "grad_norm": 0.24791886655928558,
      "learning_rate": 9.832012998473612e-06,
      "loss": 0.0357,
      "num_tokens": 33133443.0,
      "step": 72
    },
    {
      "epoch": 0.594704684317719,
      "grad_norm": 0.2602111918495323,
      "learning_rate": 9.825532310051383e-06,
      "loss": 0.0369,
      "num_tokens": 33600590.0,
      "step": 73
    },
    {
      "epoch": 0.6028513238289206,
      "grad_norm": 0.23958876317959238,
      "learning_rate": 9.818931437870888e-06,
      "loss": 0.0347,
      "num_tokens": 34081907.0,
      "step": 74
    },
    {
      "epoch": 0.6109979633401222,
      "grad_norm": 0.24528240723597736,
      "learning_rate": 9.812210565335591e-06,
      "loss": 0.0347,
      "num_tokens": 34528542.0,
      "step": 75
    },
    {
      "epoch": 0.6191446028513238,
      "grad_norm": 0.2511113811601625,
      "learning_rate": 9.805369879183143e-06,
      "loss": 0.0358,
      "num_tokens": 34984490.0,
      "step": 76
    },
    {
      "epoch": 0.6272912423625254,
      "grad_norm": 0.23964291648975655,
      "learning_rate": 9.798409569480171e-06,
      "loss": 0.0368,
      "num_tokens": 35438413.0,
      "step": 77
    },
    {
      "epoch": 0.6354378818737271,
      "grad_norm": 0.22854430928208863,
      "learning_rate": 9.791329829617025e-06,
      "loss": 0.0329,
      "num_tokens": 35861862.0,
      "step": 78
    },
    {
      "epoch": 0.6435845213849287,
      "grad_norm": 0.25934229180134305,
      "learning_rate": 9.784130856302383e-06,
      "loss": 0.0352,
      "num_tokens": 36334726.0,
      "step": 79
    },
    {
      "epoch": 0.6517311608961304,
      "grad_norm": 0.249853867356781,
      "learning_rate": 9.77681284955779e-06,
      "loss": 0.0334,
      "num_tokens": 36806966.0,
      "step": 80
    },
    {
      "epoch": 0.659877800407332,
      "grad_norm": 0.24228111972158922,
      "learning_rate": 9.769376012712107e-06,
      "loss": 0.0355,
      "num_tokens": 37255978.0,
      "step": 81
    },
    {
      "epoch": 0.6680244399185336,
      "grad_norm": 0.24656941383849604,
      "learning_rate": 9.761820552395857e-06,
      "loss": 0.0372,
      "num_tokens": 37695349.0,
      "step": 82
    },
    {
      "epoch": 0.6761710794297352,
      "grad_norm": 0.24557463844035055,
      "learning_rate": 9.754146678535483e-06,
      "loss": 0.0364,
      "num_tokens": 38137196.0,
      "step": 83
    },
    {
      "epoch": 0.6843177189409368,
      "grad_norm": 0.25045832824836683,
      "learning_rate": 9.74635460434752e-06,
      "loss": 0.036,
      "num_tokens": 38601156.0,
      "step": 84
    },
    {
      "epoch": 0.6924643584521385,
      "grad_norm": 0.23961222253413397,
      "learning_rate": 9.738444546332663e-06,
      "loss": 0.0348,
      "num_tokens": 39098917.0,
      "step": 85
    },
    {
      "epoch": 0.7006109979633401,
      "grad_norm": 0.21623543203559747,
      "learning_rate": 9.73041672426976e-06,
      "loss": 0.0313,
      "num_tokens": 39589476.0,
      "step": 86
    },
    {
      "epoch": 0.7087576374745418,
      "grad_norm": 0.2454384444263673,
      "learning_rate": 9.722271361209698e-06,
      "loss": 0.035,
      "num_tokens": 40040757.0,
      "step": 87
    },
    {
      "epoch": 0.7169042769857433,
      "grad_norm": 0.2514790044121715,
      "learning_rate": 9.714008683469212e-06,
      "loss": 0.035,
      "num_tokens": 40503981.0,
      "step": 88
    },
    {
      "epoch": 0.725050916496945,
      "grad_norm": 0.2574428715510541,
      "learning_rate": 9.705628920624592e-06,
      "loss": 0.0365,
      "num_tokens": 40969365.0,
      "step": 89
    },
    {
      "epoch": 0.7331975560081466,
      "grad_norm": 0.25017040048121353,
      "learning_rate": 9.69713230550531e-06,
      "loss": 0.0349,
      "num_tokens": 41427533.0,
      "step": 90
    },
    {
      "epoch": 0.7413441955193483,
      "grad_norm": 0.2526246003424556,
      "learning_rate": 9.68851907418754e-06,
      "loss": 0.0385,
      "num_tokens": 41894302.0,
      "step": 91
    },
    {
      "epoch": 0.7494908350305499,
      "grad_norm": 0.2461082056251613,
      "learning_rate": 9.679789465987614e-06,
      "loss": 0.0357,
      "num_tokens": 42349463.0,
      "step": 92
    },
    {
      "epoch": 0.7576374745417516,
      "grad_norm": 0.2617726018040813,
      "learning_rate": 9.67094372345536e-06,
      "loss": 0.0389,
      "num_tokens": 42774515.0,
      "step": 93
    },
    {
      "epoch": 0.7657841140529531,
      "grad_norm": 0.24705231631404728,
      "learning_rate": 9.661982092367366e-06,
      "loss": 0.036,
      "num_tokens": 43230624.0,
      "step": 94
    },
    {
      "epoch": 0.7739307535641547,
      "grad_norm": 0.235679439367168,
      "learning_rate": 9.652904821720158e-06,
      "loss": 0.0365,
      "num_tokens": 43672523.0,
      "step": 95
    },
    {
      "epoch": 0.7820773930753564,
      "grad_norm": 0.2510768490849978,
      "learning_rate": 9.643712163723271e-06,
      "loss": 0.0377,
      "num_tokens": 44158995.0,
      "step": 96
    },
    {
      "epoch": 0.790224032586558,
      "grad_norm": 0.2533074838565773,
      "learning_rate": 9.63440437379225e-06,
      "loss": 0.0376,
      "num_tokens": 44636347.0,
      "step": 97
    },
    {
      "epoch": 0.7983706720977597,
      "grad_norm": 0.23715260979777855,
      "learning_rate": 9.624981710541548e-06,
      "loss": 0.0356,
      "num_tokens": 45086574.0,
      "step": 98
    },
    {
      "epoch": 0.8065173116089613,
      "grad_norm": 0.23369067636824356,
      "learning_rate": 9.615444435777343e-06,
      "loss": 0.0357,
      "num_tokens": 45541713.0,
      "step": 99
    },
    {
      "epoch": 0.814663951120163,
      "grad_norm": 0.22571635640078413,
      "learning_rate": 9.605792814490263e-06,
      "loss": 0.0348,
      "num_tokens": 46007566.0,
      "step": 100
    },
    {
      "epoch": 0.8228105906313645,
      "grad_norm": 0.23077275204681233,
      "learning_rate": 9.596027114848025e-06,
      "loss": 0.0345,
      "num_tokens": 46477746.0,
      "step": 101
    },
    {
      "epoch": 0.8309572301425662,
      "grad_norm": 0.22566558819394333,
      "learning_rate": 9.58614760818798e-06,
      "loss": 0.0338,
      "num_tokens": 46929999.0,
      "step": 102
    },
    {
      "epoch": 0.8391038696537678,
      "grad_norm": 0.21695625400644095,
      "learning_rate": 9.57615456900958e-06,
      "loss": 0.0347,
      "num_tokens": 47395766.0,
      "step": 103
    },
    {
      "epoch": 0.8472505091649695,
      "grad_norm": 0.2620473147070263,
      "learning_rate": 9.566048274966745e-06,
      "loss": 0.0383,
      "num_tokens": 47845971.0,
      "step": 104
    },
    {
      "epoch": 0.8553971486761711,
      "grad_norm": 0.2410799135804227,
      "learning_rate": 9.55582900686015e-06,
      "loss": 0.0365,
      "num_tokens": 48287919.0,
      "step": 105
    },
    {
      "epoch": 0.8635437881873728,
      "grad_norm": 0.2396885428184001,
      "learning_rate": 9.545497048629427e-06,
      "loss": 0.0348,
      "num_tokens": 48749479.0,
      "step": 106
    },
    {
      "epoch": 0.8716904276985743,
      "grad_norm": 0.24688227687368633,
      "learning_rate": 9.535052687345273e-06,
      "loss": 0.0387,
      "num_tokens": 49192411.0,
      "step": 107
    },
    {
      "epoch": 0.879837067209776,
      "grad_norm": 0.24294158661413467,
      "learning_rate": 9.524496213201473e-06,
      "loss": 0.0378,
      "num_tokens": 49653484.0,
      "step": 108
    },
    {
      "epoch": 0.8879837067209776,
      "grad_norm": 0.2405101629778957,
      "learning_rate": 9.513827919506835e-06,
      "loss": 0.0363,
      "num_tokens": 50112406.0,
      "step": 109
    },
    {
      "epoch": 0.8961303462321792,
      "grad_norm": 0.23181354337095814,
      "learning_rate": 9.503048102677048e-06,
      "loss": 0.0349,
      "num_tokens": 50574830.0,
      "step": 110
    },
    {
      "epoch": 0.9042769857433809,
      "grad_norm": 0.23382747597194983,
      "learning_rate": 9.492157062226438e-06,
      "loss": 0.0341,
      "num_tokens": 51043765.0,
      "step": 111
    },
    {
      "epoch": 0.9124236252545825,
      "grad_norm": 0.22729966362083456,
      "learning_rate": 9.481155100759651e-06,
      "loss": 0.0345,
      "num_tokens": 51491061.0,
      "step": 112
    },
    {
      "epoch": 0.9205702647657841,
      "grad_norm": 0.24513175538240015,
      "learning_rate": 9.470042523963243e-06,
      "loss": 0.039,
      "num_tokens": 51927088.0,
      "step": 113
    },
    {
      "epoch": 0.9287169042769857,
      "grad_norm": 0.24880865741998745,
      "learning_rate": 9.458819640597193e-06,
      "loss": 0.0379,
      "num_tokens": 52372997.0,
      "step": 114
    },
    {
      "epoch": 0.9368635437881874,
      "grad_norm": 0.2220343898509789,
      "learning_rate": 9.447486762486307e-06,
      "loss": 0.034,
      "num_tokens": 52812484.0,
      "step": 115
    },
    {
      "epoch": 0.945010183299389,
      "grad_norm": 0.22431667653715365,
      "learning_rate": 9.436044204511575e-06,
      "loss": 0.0346,
      "num_tokens": 53269746.0,
      "step": 116
    },
    {
      "epoch": 0.9531568228105907,
      "grad_norm": 0.2278604942336719,
      "learning_rate": 9.42449228460141e-06,
      "loss": 0.0364,
      "num_tokens": 53715464.0,
      "step": 117
    },
    {
      "epoch": 0.9613034623217923,
      "grad_norm": 0.2233927678176066,
      "learning_rate": 9.412831323722813e-06,
      "loss": 0.0354,
      "num_tokens": 54163779.0,
      "step": 118
    },
    {
      "epoch": 0.9694501018329938,
      "grad_norm": 0.2258095576824266,
      "learning_rate": 9.401061645872469e-06,
      "loss": 0.0356,
      "num_tokens": 54622927.0,
      "step": 119
    },
    {
      "epoch": 0.9775967413441955,
      "grad_norm": 0.21843742724066828,
      "learning_rate": 9.389183578067725e-06,
      "loss": 0.0332,
      "num_tokens": 55117094.0,
      "step": 120
    },
    {
      "epoch": 0.9857433808553971,
      "grad_norm": 0.23443623658924626,
      "learning_rate": 9.37719745033752e-06,
      "loss": 0.0372,
      "num_tokens": 55571058.0,
      "step": 121
    },
    {
      "epoch": 0.9938900203665988,
      "grad_norm": 0.22544218766750995,
      "learning_rate": 9.365103595713206e-06,
      "loss": 0.0347,
      "num_tokens": 56023909.0,
      "step": 122
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.23694297938614514,
      "learning_rate": 9.352902350219298e-06,
      "loss": 0.0325,
      "num_tokens": 56069607.0,
      "step": 123
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.041923802345991135,
      "eval_num_tokens": 56069607.0,
      "eval_runtime": 59.8807,
      "eval_samples_per_second": 40.731,
      "eval_steps_per_second": 5.093,
      "step": 123
    },
    {
      "epoch": 1.0081466395112015,
      "grad_norm": 0.18235571925323477,
      "learning_rate": 9.34059405286414e-06,
      "loss": 0.0242,
      "num_tokens": 56508815.0,
      "step": 124
    },
    {
      "epoch": 1.0162932790224033,
      "grad_norm": 0.19439769536061022,
      "learning_rate": 9.32817904563048e-06,
      "loss": 0.0234,
      "num_tokens": 56965411.0,
      "step": 125
    },
    {
      "epoch": 1.0244399185336048,
      "grad_norm": 0.1736558218986549,
      "learning_rate": 9.315657673465978e-06,
      "loss": 0.0225,
      "num_tokens": 57414294.0,
      "step": 126
    },
    {
      "epoch": 1.0325865580448066,
      "grad_norm": 0.19113275019426793,
      "learning_rate": 9.303030284273606e-06,
      "loss": 0.0225,
      "num_tokens": 57877954.0,
      "step": 127
    },
    {
      "epoch": 1.0407331975560081,
      "grad_norm": 0.19036081030160895,
      "learning_rate": 9.290297228901994e-06,
      "loss": 0.022,
      "num_tokens": 58325030.0,
      "step": 128
    },
    {
      "epoch": 1.0488798370672097,
      "grad_norm": 0.1984639840701536,
      "learning_rate": 9.277458861135684e-06,
      "loss": 0.0219,
      "num_tokens": 58808552.0,
      "step": 129
    },
    {
      "epoch": 1.0570264765784114,
      "grad_norm": 0.2043532515942055,
      "learning_rate": 9.264515537685289e-06,
      "loss": 0.0217,
      "num_tokens": 59306149.0,
      "step": 130
    },
    {
      "epoch": 1.065173116089613,
      "grad_norm": 0.24055798224631966,
      "learning_rate": 9.251467618177588e-06,
      "loss": 0.0238,
      "num_tokens": 59747428.0,
      "step": 131
    },
    {
      "epoch": 1.0733197556008147,
      "grad_norm": 0.1995629506004207,
      "learning_rate": 9.238315465145536e-06,
      "loss": 0.0204,
      "num_tokens": 60204616.0,
      "step": 132
    },
    {
      "epoch": 1.0814663951120163,
      "grad_norm": 0.2525900691277178,
      "learning_rate": 9.225059444018185e-06,
      "loss": 0.0239,
      "num_tokens": 60656969.0,
      "step": 133
    },
    {
      "epoch": 1.089613034623218,
      "grad_norm": 0.2318763327290573,
      "learning_rate": 9.21169992311054e-06,
      "loss": 0.0218,
      "num_tokens": 61138427.0,
      "step": 134
    },
    {
      "epoch": 1.0977596741344195,
      "grad_norm": 0.24997426536385803,
      "learning_rate": 9.198237273613311e-06,
      "loss": 0.0249,
      "num_tokens": 61577876.0,
      "step": 135
    },
    {
      "epoch": 1.105906313645621,
      "grad_norm": 0.2271197177471986,
      "learning_rate": 9.184671869582617e-06,
      "loss": 0.0229,
      "num_tokens": 62045028.0,
      "step": 136
    },
    {
      "epoch": 1.1140529531568228,
      "grad_norm": 0.22400406233634754,
      "learning_rate": 9.17100408792958e-06,
      "loss": 0.0207,
      "num_tokens": 62486192.0,
      "step": 137
    },
    {
      "epoch": 1.1221995926680244,
      "grad_norm": 0.23845965068678432,
      "learning_rate": 9.157234308409859e-06,
      "loss": 0.0225,
      "num_tokens": 62956027.0,
      "step": 138
    },
    {
      "epoch": 1.1303462321792261,
      "grad_norm": 0.2106619550266949,
      "learning_rate": 9.14336291361309e-06,
      "loss": 0.0213,
      "num_tokens": 63414690.0,
      "step": 139
    },
    {
      "epoch": 1.1384928716904277,
      "grad_norm": 0.2187838784331104,
      "learning_rate": 9.129390288952273e-06,
      "loss": 0.0228,
      "num_tokens": 63863726.0,
      "step": 140
    },
    {
      "epoch": 1.1466395112016294,
      "grad_norm": 0.19858994365963545,
      "learning_rate": 9.115316822653043e-06,
      "loss": 0.0203,
      "num_tokens": 64339457.0,
      "step": 141
    },
    {
      "epoch": 1.154786150712831,
      "grad_norm": 0.2091640165384244,
      "learning_rate": 9.101142905742898e-06,
      "loss": 0.0224,
      "num_tokens": 64797748.0,
      "step": 142
    },
    {
      "epoch": 1.1629327902240325,
      "grad_norm": 0.21848028557367125,
      "learning_rate": 9.086868932040327e-06,
      "loss": 0.0237,
      "num_tokens": 65261816.0,
      "step": 143
    },
    {
      "epoch": 1.1710794297352343,
      "grad_norm": 0.21024580943693202,
      "learning_rate": 9.072495298143876e-06,
      "loss": 0.0222,
      "num_tokens": 65710096.0,
      "step": 144
    },
    {
      "epoch": 1.1792260692464358,
      "grad_norm": 0.21112539693299767,
      "learning_rate": 9.058022403421112e-06,
      "loss": 0.0234,
      "num_tokens": 66180522.0,
      "step": 145
    },
    {
      "epoch": 1.1873727087576376,
      "grad_norm": 0.20089428024021022,
      "learning_rate": 9.043450649997546e-06,
      "loss": 0.0221,
      "num_tokens": 66643220.0,
      "step": 146
    },
    {
      "epoch": 1.195519348268839,
      "grad_norm": 0.21269472349968574,
      "learning_rate": 9.028780442745452e-06,
      "loss": 0.0236,
      "num_tokens": 67103696.0,
      "step": 147
    },
    {
      "epoch": 1.2036659877800409,
      "grad_norm": 0.19919608780198533,
      "learning_rate": 9.014012189272612e-06,
      "loss": 0.0215,
      "num_tokens": 67578752.0,
      "step": 148
    },
    {
      "epoch": 1.2118126272912424,
      "grad_norm": 0.19516667942695456,
      "learning_rate": 8.999146299911001e-06,
      "loss": 0.0226,
      "num_tokens": 68024730.0,
      "step": 149
    },
    {
      "epoch": 1.219959266802444,
      "grad_norm": 0.21431471881204775,
      "learning_rate": 8.984183187705376e-06,
      "loss": 0.0237,
      "num_tokens": 68513599.0,
      "step": 150
    },
    {
      "epoch": 1.2281059063136457,
      "grad_norm": 0.19529826679401555,
      "learning_rate": 8.969123268401807e-06,
      "loss": 0.0207,
      "num_tokens": 68988237.0,
      "step": 151
    },
    {
      "epoch": 1.2362525458248472,
      "grad_norm": 0.2014146714986417,
      "learning_rate": 8.953966960436125e-06,
      "loss": 0.0231,
      "num_tokens": 69430574.0,
      "step": 152
    },
    {
      "epoch": 1.2443991853360488,
      "grad_norm": 0.21239498172005217,
      "learning_rate": 8.938714684922294e-06,
      "loss": 0.0233,
      "num_tokens": 69884264.0,
      "step": 153
    },
    {
      "epoch": 1.2525458248472505,
      "grad_norm": 0.213004792751643,
      "learning_rate": 8.923366865640708e-06,
      "loss": 0.0237,
      "num_tokens": 70361322.0,
      "step": 154
    },
    {
      "epoch": 1.260692464358452,
      "grad_norm": 0.21940926870270266,
      "learning_rate": 8.90792392902642e-06,
      "loss": 0.0224,
      "num_tokens": 70825081.0,
      "step": 155
    },
    {
      "epoch": 1.2688391038696538,
      "grad_norm": 0.21496750501528322,
      "learning_rate": 8.892386304157297e-06,
      "loss": 0.0221,
      "num_tokens": 71283936.0,
      "step": 156
    },
    {
      "epoch": 1.2769857433808554,
      "grad_norm": 0.21230254367904663,
      "learning_rate": 8.876754422742084e-06,
      "loss": 0.0246,
      "num_tokens": 71732305.0,
      "step": 157
    },
    {
      "epoch": 1.2851323828920571,
      "grad_norm": 0.2188832765541447,
      "learning_rate": 8.861028719108431e-06,
      "loss": 0.0224,
      "num_tokens": 72199220.0,
      "step": 158
    },
    {
      "epoch": 1.2932790224032586,
      "grad_norm": 0.215744449219536,
      "learning_rate": 8.845209630190804e-06,
      "loss": 0.0232,
      "num_tokens": 72686777.0,
      "step": 159
    },
    {
      "epoch": 1.3014256619144602,
      "grad_norm": 0.2027419921199597,
      "learning_rate": 8.829297595518357e-06,
      "loss": 0.0217,
      "num_tokens": 73141281.0,
      "step": 160
    },
    {
      "epoch": 1.309572301425662,
      "grad_norm": 0.1999503892860215,
      "learning_rate": 8.81329305720272e-06,
      "loss": 0.0236,
      "num_tokens": 73601661.0,
      "step": 161
    },
    {
      "epoch": 1.3177189409368635,
      "grad_norm": 0.17428881801329021,
      "learning_rate": 8.797196459925707e-06,
      "loss": 0.0205,
      "num_tokens": 74058631.0,
      "step": 162
    },
    {
      "epoch": 1.3258655804480652,
      "grad_norm": 0.18566703149612335,
      "learning_rate": 8.78100825092697e-06,
      "loss": 0.0218,
      "num_tokens": 74506287.0,
      "step": 163
    },
    {
      "epoch": 1.3340122199592668,
      "grad_norm": 0.21095321978269194,
      "learning_rate": 8.764728879991563e-06,
      "loss": 0.0233,
      "num_tokens": 74961649.0,
      "step": 164
    },
    {
      "epoch": 1.3421588594704685,
      "grad_norm": 0.19297708933381486,
      "learning_rate": 8.748358799437454e-06,
      "loss": 0.0218,
      "num_tokens": 75451492.0,
      "step": 165
    },
    {
      "epoch": 1.35030549898167,
      "grad_norm": 0.21567434563638074,
      "learning_rate": 8.731898464102955e-06,
      "loss": 0.0233,
      "num_tokens": 75906898.0,
      "step": 166
    },
    {
      "epoch": 1.3584521384928716,
      "grad_norm": 0.20493170237350014,
      "learning_rate": 8.715348331334079e-06,
      "loss": 0.0225,
      "num_tokens": 76352518.0,
      "step": 167
    },
    {
      "epoch": 1.3665987780040734,
      "grad_norm": 0.20033609244286213,
      "learning_rate": 8.698708860971837e-06,
      "loss": 0.0217,
      "num_tokens": 76833416.0,
      "step": 168
    },
    {
      "epoch": 1.374745417515275,
      "grad_norm": 0.19488013729722037,
      "learning_rate": 8.681980515339464e-06,
      "loss": 0.0228,
      "num_tokens": 77274089.0,
      "step": 169
    },
    {
      "epoch": 1.3828920570264764,
      "grad_norm": 0.23159956916525645,
      "learning_rate": 8.66516375922957e-06,
      "loss": 0.026,
      "num_tokens": 77722945.0,
      "step": 170
    },
    {
      "epoch": 1.3910386965376782,
      "grad_norm": 0.20950731710653633,
      "learning_rate": 8.648259059891222e-06,
      "loss": 0.0238,
      "num_tokens": 78165800.0,
      "step": 171
    },
    {
      "epoch": 1.39918533604888,
      "grad_norm": 0.20384280771126798,
      "learning_rate": 8.631266887016973e-06,
      "loss": 0.0234,
      "num_tokens": 78606210.0,
      "step": 172
    },
    {
      "epoch": 1.4073319755600815,
      "grad_norm": 0.2174107340618266,
      "learning_rate": 8.614187712729801e-06,
      "loss": 0.025,
      "num_tokens": 79049600.0,
      "step": 173
    },
    {
      "epoch": 1.415478615071283,
      "grad_norm": 0.21419711356536544,
      "learning_rate": 8.597022011569993e-06,
      "loss": 0.0236,
      "num_tokens": 79519022.0,
      "step": 174
    },
    {
      "epoch": 1.4236252545824848,
      "grad_norm": 0.19839711735747953,
      "learning_rate": 8.579770260481967e-06,
      "loss": 0.0233,
      "num_tokens": 79988589.0,
      "step": 175
    },
    {
      "epoch": 1.4317718940936863,
      "grad_norm": 0.20080459279151233,
      "learning_rate": 8.56243293880101e-06,
      "loss": 0.0222,
      "num_tokens": 80468185.0,
      "step": 176
    },
    {
      "epoch": 1.4399185336048879,
      "grad_norm": 0.1934313717220866,
      "learning_rate": 8.545010528239969e-06,
      "loss": 0.0236,
      "num_tokens": 80915153.0,
      "step": 177
    },
    {
      "epoch": 1.4480651731160896,
      "grad_norm": 0.20353533737845392,
      "learning_rate": 8.527503512875862e-06,
      "loss": 0.023,
      "num_tokens": 81406072.0,
      "step": 178
    },
    {
      "epoch": 1.4562118126272914,
      "grad_norm": 0.1913760568401795,
      "learning_rate": 8.509912379136429e-06,
      "loss": 0.0213,
      "num_tokens": 81861174.0,
      "step": 179
    },
    {
      "epoch": 1.464358452138493,
      "grad_norm": 0.18760725003184955,
      "learning_rate": 8.492237615786613e-06,
      "loss": 0.0232,
      "num_tokens": 82291515.0,
      "step": 180
    },
    {
      "epoch": 1.4725050916496945,
      "grad_norm": 0.2073497810013695,
      "learning_rate": 8.474479713914985e-06,
      "loss": 0.0241,
      "num_tokens": 82746207.0,
      "step": 181
    },
    {
      "epoch": 1.4806517311608962,
      "grad_norm": 0.20252547578412558,
      "learning_rate": 8.456639166920104e-06,
      "loss": 0.023,
      "num_tokens": 83217896.0,
      "step": 182
    },
    {
      "epoch": 1.4887983706720977,
      "grad_norm": 0.19100843841767107,
      "learning_rate": 8.438716470496793e-06,
      "loss": 0.0234,
      "num_tokens": 83673415.0,
      "step": 183
    },
    {
      "epoch": 1.4969450101832993,
      "grad_norm": 0.18768759409970084,
      "learning_rate": 8.42071212262238e-06,
      "loss": 0.0217,
      "num_tokens": 84165622.0,
      "step": 184
    },
    {
      "epoch": 1.505091649694501,
      "grad_norm": 0.21146877851004245,
      "learning_rate": 8.402626623542853e-06,
      "loss": 0.0236,
      "num_tokens": 84623691.0,
      "step": 185
    },
    {
      "epoch": 1.5132382892057028,
      "grad_norm": 0.209209778931465,
      "learning_rate": 8.384460475758967e-06,
      "loss": 0.0244,
      "num_tokens": 85066604.0,
      "step": 186
    },
    {
      "epoch": 1.5213849287169041,
      "grad_norm": 0.20794230796465518,
      "learning_rate": 8.36621418401228e-06,
      "loss": 0.0245,
      "num_tokens": 85500800.0,
      "step": 187
    },
    {
      "epoch": 1.5295315682281059,
      "grad_norm": 0.19401787928805586,
      "learning_rate": 8.347888255271126e-06,
      "loss": 0.0227,
      "num_tokens": 85950718.0,
      "step": 188
    },
    {
      "epoch": 1.5376782077393076,
      "grad_norm": 0.19358587269712685,
      "learning_rate": 8.329483198716536e-06,
      "loss": 0.0216,
      "num_tokens": 86425214.0,
      "step": 189
    },
    {
      "epoch": 1.5458248472505092,
      "grad_norm": 0.19988901116993596,
      "learning_rate": 8.310999525728083e-06,
      "loss": 0.0237,
      "num_tokens": 86872612.0,
      "step": 190
    },
    {
      "epoch": 1.5539714867617107,
      "grad_norm": 0.21347868715899784,
      "learning_rate": 8.292437749869676e-06,
      "loss": 0.0237,
      "num_tokens": 87321247.0,
      "step": 191
    },
    {
      "epoch": 1.5621181262729125,
      "grad_norm": 0.21370368402938023,
      "learning_rate": 8.273798386875292e-06,
      "loss": 0.0247,
      "num_tokens": 87762936.0,
      "step": 192
    },
    {
      "epoch": 1.570264765784114,
      "grad_norm": 0.20394116229065584,
      "learning_rate": 8.255081954634646e-06,
      "loss": 0.0224,
      "num_tokens": 88233384.0,
      "step": 193
    },
    {
      "epoch": 1.5784114052953155,
      "grad_norm": 0.21271701085924696,
      "learning_rate": 8.236288973178806e-06,
      "loss": 0.024,
      "num_tokens": 88702888.0,
      "step": 194
    },
    {
      "epoch": 1.5865580448065173,
      "grad_norm": 0.20525261813526166,
      "learning_rate": 8.217419964665728e-06,
      "loss": 0.0228,
      "num_tokens": 89157902.0,
      "step": 195
    },
    {
      "epoch": 1.594704684317719,
      "grad_norm": 0.20518578666067122,
      "learning_rate": 8.198475453365772e-06,
      "loss": 0.0239,
      "num_tokens": 89596892.0,
      "step": 196
    },
    {
      "epoch": 1.6028513238289206,
      "grad_norm": 0.20424504177429212,
      "learning_rate": 8.179455965647117e-06,
      "loss": 0.024,
      "num_tokens": 90043689.0,
      "step": 197
    },
    {
      "epoch": 1.6109979633401221,
      "grad_norm": 0.21550795243608867,
      "learning_rate": 8.16036202996114e-06,
      "loss": 0.0255,
      "num_tokens": 90493255.0,
      "step": 198
    },
    {
      "epoch": 1.6191446028513239,
      "grad_norm": 0.21505565048112654,
      "learning_rate": 8.141194176827738e-06,
      "loss": 0.0246,
      "num_tokens": 90933700.0,
      "step": 199
    },
    {
      "epoch": 1.6272912423625254,
      "grad_norm": 0.18057787414765422,
      "learning_rate": 8.12195293882058e-06,
      "loss": 0.0205,
      "num_tokens": 91402906.0,
      "step": 200
    },
    {
      "epoch": 1.635437881873727,
      "grad_norm": 0.2128757560225609,
      "learning_rate": 8.102638850552323e-06,
      "loss": 0.0236,
      "num_tokens": 91854715.0,
      "step": 201
    },
    {
      "epoch": 1.6435845213849287,
      "grad_norm": 0.1977918829414763,
      "learning_rate": 8.083252448659742e-06,
      "loss": 0.022,
      "num_tokens": 92317914.0,
      "step": 202
    },
    {
      "epoch": 1.6517311608961305,
      "grad_norm": 0.22533293918121253,
      "learning_rate": 8.063794271788826e-06,
      "loss": 0.0256,
      "num_tokens": 92775730.0,
      "step": 203
    },
    {
      "epoch": 1.659877800407332,
      "grad_norm": 0.21714220334981602,
      "learning_rate": 8.044264860579816e-06,
      "loss": 0.0255,
      "num_tokens": 93222261.0,
      "step": 204
    },
    {
      "epoch": 1.6680244399185336,
      "grad_norm": 0.2061132581627763,
      "learning_rate": 8.02466475765218e-06,
      "loss": 0.0229,
      "num_tokens": 93713195.0,
      "step": 205
    },
    {
      "epoch": 1.6761710794297353,
      "grad_norm": 0.21189634915409705,
      "learning_rate": 8.004994507589532e-06,
      "loss": 0.0244,
      "num_tokens": 94167787.0,
      "step": 206
    },
    {
      "epoch": 1.6843177189409368,
      "grad_norm": 0.2049250916068622,
      "learning_rate": 7.985254656924512e-06,
      "loss": 0.0227,
      "num_tokens": 94634140.0,
      "step": 207
    },
    {
      "epoch": 1.6924643584521384,
      "grad_norm": 0.21854416704059987,
      "learning_rate": 7.965445754123592e-06,
      "loss": 0.0252,
      "num_tokens": 95093967.0,
      "step": 208
    },
    {
      "epoch": 1.7006109979633401,
      "grad_norm": 0.20098034036974133,
      "learning_rate": 7.945568349571834e-06,
      "loss": 0.0233,
      "num_tokens": 95578447.0,
      "step": 209
    },
    {
      "epoch": 1.708757637474542,
      "grad_norm": 0.19707920391781453,
      "learning_rate": 7.925622995557609e-06,
      "loss": 0.0234,
      "num_tokens": 96028708.0,
      "step": 210
    },
    {
      "epoch": 1.7169042769857432,
      "grad_norm": 0.183646916505621,
      "learning_rate": 7.905610246257243e-06,
      "loss": 0.0219,
      "num_tokens": 96490579.0,
      "step": 211
    },
    {
      "epoch": 1.725050916496945,
      "grad_norm": 0.21800938479643353,
      "learning_rate": 7.885530657719623e-06,
      "loss": 0.0245,
      "num_tokens": 96939215.0,
      "step": 212
    },
    {
      "epoch": 1.7331975560081467,
      "grad_norm": 0.17805921063304794,
      "learning_rate": 7.865384787850742e-06,
      "loss": 0.0207,
      "num_tokens": 97416826.0,
      "step": 213
    },
    {
      "epoch": 1.7413441955193483,
      "grad_norm": 0.20335070394293855,
      "learning_rate": 7.845173196398213e-06,
      "loss": 0.023,
      "num_tokens": 97870409.0,
      "step": 214
    },
    {
      "epoch": 1.7494908350305498,
      "grad_norm": 0.2014363721260783,
      "learning_rate": 7.824896444935692e-06,
      "loss": 0.023,
      "num_tokens": 98303923.0,
      "step": 215
    },
    {
      "epoch": 1.7576374745417516,
      "grad_norm": 0.19767917831916373,
      "learning_rate": 7.804555096847298e-06,
      "loss": 0.0206,
      "num_tokens": 98792735.0,
      "step": 216
    },
    {
      "epoch": 1.765784114052953,
      "grad_norm": 0.18927709030960627,
      "learning_rate": 7.784149717311947e-06,
      "loss": 0.0228,
      "num_tokens": 99283099.0,
      "step": 217
    },
    {
      "epoch": 1.7739307535641546,
      "grad_norm": 0.19540533688345146,
      "learning_rate": 7.763680873287648e-06,
      "loss": 0.0224,
      "num_tokens": 99728623.0,
      "step": 218
    },
    {
      "epoch": 1.7820773930753564,
      "grad_norm": 0.2021434762578394,
      "learning_rate": 7.743149133495763e-06,
      "loss": 0.0226,
      "num_tokens": 100217105.0,
      "step": 219
    },
    {
      "epoch": 1.7902240325865582,
      "grad_norm": 0.20319556075451253,
      "learning_rate": 7.722555068405186e-06,
      "loss": 0.024,
      "num_tokens": 100658986.0,
      "step": 220
    },
    {
      "epoch": 1.7983706720977597,
      "grad_norm": 0.2037408366987311,
      "learning_rate": 7.70189925021651e-06,
      "loss": 0.0243,
      "num_tokens": 101137134.0,
      "step": 221
    },
    {
      "epoch": 1.8065173116089612,
      "grad_norm": 0.21058268386430223,
      "learning_rate": 7.681182252846115e-06,
      "loss": 0.0241,
      "num_tokens": 101594654.0,
      "step": 222
    },
    {
      "epoch": 1.814663951120163,
      "grad_norm": 0.20499883443387898,
      "learning_rate": 7.660404651910236e-06,
      "loss": 0.0263,
      "num_tokens": 102027887.0,
      "step": 223
    },
    {
      "epoch": 1.8228105906313645,
      "grad_norm": 0.2084551925346071,
      "learning_rate": 7.639567024708953e-06,
      "loss": 0.0234,
      "num_tokens": 102479243.0,
      "step": 224
    },
    {
      "epoch": 1.830957230142566,
      "grad_norm": 0.21438521035457928,
      "learning_rate": 7.6186699502101676e-06,
      "loss": 0.0226,
      "num_tokens": 102944020.0,
      "step": 225
    },
    {
      "epoch": 1.8391038696537678,
      "grad_norm": 0.20743883238353383,
      "learning_rate": 7.597714009033505e-06,
      "loss": 0.0243,
      "num_tokens": 103377204.0,
      "step": 226
    },
    {
      "epoch": 1.8472505091649696,
      "grad_norm": 0.19590114337198036,
      "learning_rate": 7.5766997834341836e-06,
      "loss": 0.0229,
      "num_tokens": 103836520.0,
      "step": 227
    },
    {
      "epoch": 1.8553971486761711,
      "grad_norm": 0.2072497473244054,
      "learning_rate": 7.555627857286843e-06,
      "loss": 0.0247,
      "num_tokens": 104285481.0,
      "step": 228
    },
    {
      "epoch": 1.8635437881873727,
      "grad_norm": 0.18899125629327573,
      "learning_rate": 7.534498816069315e-06,
      "loss": 0.0213,
      "num_tokens": 104746152.0,
      "step": 229
    },
    {
      "epoch": 1.8716904276985744,
      "grad_norm": 0.21687392806104466,
      "learning_rate": 7.513313246846357e-06,
      "loss": 0.0232,
      "num_tokens": 105207211.0,
      "step": 230
    },
    {
      "epoch": 1.879837067209776,
      "grad_norm": 0.20114168053955322,
      "learning_rate": 7.492071738253343e-06,
      "loss": 0.0243,
      "num_tokens": 105657445.0,
      "step": 231
    },
    {
      "epoch": 1.8879837067209775,
      "grad_norm": 0.31880562870408674,
      "learning_rate": 7.470774880479909e-06,
      "loss": 0.0216,
      "num_tokens": 106145000.0,
      "step": 232
    },
    {
      "epoch": 1.8961303462321792,
      "grad_norm": 0.17709039062644658,
      "learning_rate": 7.449423265253551e-06,
      "loss": 0.0195,
      "num_tokens": 106619177.0,
      "step": 233
    },
    {
      "epoch": 1.904276985743381,
      "grad_norm": 0.1941234160393901,
      "learning_rate": 7.428017485823189e-06,
      "loss": 0.0221,
      "num_tokens": 107100389.0,
      "step": 234
    },
    {
      "epoch": 1.9124236252545825,
      "grad_norm": 0.21047496416728861,
      "learning_rate": 7.406558136942677e-06,
      "loss": 0.0253,
      "num_tokens": 107531535.0,
      "step": 235
    },
    {
      "epoch": 1.920570264765784,
      "grad_norm": 0.1811130030622756,
      "learning_rate": 7.3850458148542835e-06,
      "loss": 0.0218,
      "num_tokens": 108000369.0,
      "step": 236
    },
    {
      "epoch": 1.9287169042769858,
      "grad_norm": 0.18791035767087905,
      "learning_rate": 7.363481117272125e-06,
      "loss": 0.0217,
      "num_tokens": 108465611.0,
      "step": 237
    },
    {
      "epoch": 1.9368635437881874,
      "grad_norm": 0.174382304685201,
      "learning_rate": 7.341864643365557e-06,
      "loss": 0.0214,
      "num_tokens": 108923767.0,
      "step": 238
    },
    {
      "epoch": 1.945010183299389,
      "grad_norm": 0.1996921946422325,
      "learning_rate": 7.320196993742522e-06,
      "loss": 0.023,
      "num_tokens": 109367680.0,
      "step": 239
    },
    {
      "epoch": 1.9531568228105907,
      "grad_norm": 0.21146568012414002,
      "learning_rate": 7.29847877043287e-06,
      "loss": 0.0231,
      "num_tokens": 109818455.0,
      "step": 240
    },
    {
      "epoch": 1.9613034623217924,
      "grad_norm": 0.20624057045002148,
      "learning_rate": 7.2767105768716295e-06,
      "loss": 0.024,
      "num_tokens": 110268348.0,
      "step": 241
    },
    {
      "epoch": 1.9694501018329937,
      "grad_norm": 0.17960428006685406,
      "learning_rate": 7.254893017882233e-06,
      "loss": 0.0222,
      "num_tokens": 110696800.0,
      "step": 242
    },
    {
      "epoch": 1.9775967413441955,
      "grad_norm": 0.18718314902352962,
      "learning_rate": 7.233026699659723e-06,
      "loss": 0.0226,
      "num_tokens": 111154475.0,
      "step": 243
    },
    {
      "epoch": 1.9857433808553973,
      "grad_norm": 0.18787650373147796,
      "learning_rate": 7.211112229753901e-06,
      "loss": 0.0213,
      "num_tokens": 111620815.0,
      "step": 244
    },
    {
      "epoch": 1.9938900203665988,
      "grad_norm": 0.19693361518983973,
      "learning_rate": 7.189150217052455e-06,
      "loss": 0.0216,
      "num_tokens": 112092986.0,
      "step": 245
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.2093105825813619,
      "learning_rate": 7.1671412717640295e-06,
      "loss": 0.0201,
      "num_tokens": 112131036.0,
      "step": 246
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.04159076511859894,
      "eval_num_tokens": 112131036.0,
      "eval_runtime": 57.7607,
      "eval_samples_per_second": 42.226,
      "eval_steps_per_second": 5.28,
      "step": 246
    },
    {
      "epoch": 2.0081466395112018,
      "grad_norm": 0.12416538079579213,
      "learning_rate": 7.145086005401287e-06,
      "loss": 0.0126,
      "num_tokens": 112602682.0,
      "step": 247
    },
    {
      "epoch": 2.016293279022403,
      "grad_norm": 0.15057303383190754,
      "learning_rate": 7.122985030763901e-06,
      "loss": 0.0145,
      "num_tokens": 113073432.0,
      "step": 248
    },
    {
      "epoch": 2.024439918533605,
      "grad_norm": 0.14759632900226355,
      "learning_rate": 7.10083896192154e-06,
      "loss": 0.0124,
      "num_tokens": 113577827.0,
      "step": 249
    },
    {
      "epoch": 2.0325865580448066,
      "grad_norm": 0.14133368502923574,
      "learning_rate": 7.078648414196805e-06,
      "loss": 0.0128,
      "num_tokens": 114048831.0,
      "step": 250
    },
    {
      "epoch": 2.0407331975560083,
      "grad_norm": 0.15715348160815634,
      "learning_rate": 7.056414004148128e-06,
      "loss": 0.0136,
      "num_tokens": 114548364.0,
      "step": 251
    },
    {
      "epoch": 2.0488798370672097,
      "grad_norm": 0.17716027065421572,
      "learning_rate": 7.034136349552647e-06,
      "loss": 0.016,
      "num_tokens": 114999500.0,
      "step": 252
    },
    {
      "epoch": 2.0570264765784114,
      "grad_norm": 0.1589991111261928,
      "learning_rate": 7.011816069389034e-06,
      "loss": 0.0145,
      "num_tokens": 115456071.0,
      "step": 253
    },
    {
      "epoch": 2.065173116089613,
      "grad_norm": 0.1543372807006171,
      "learning_rate": 6.989453783820304e-06,
      "loss": 0.0134,
      "num_tokens": 115926758.0,
      "step": 254
    },
    {
      "epoch": 2.0733197556008145,
      "grad_norm": 0.1691364992847739,
      "learning_rate": 6.9670501141765825e-06,
      "loss": 0.014,
      "num_tokens": 116385952.0,
      "step": 255
    },
    {
      "epoch": 2.0814663951120163,
      "grad_norm": 0.16602983431455004,
      "learning_rate": 6.944605682937834e-06,
      "loss": 0.0137,
      "num_tokens": 116820035.0,
      "step": 256
    },
    {
      "epoch": 2.089613034623218,
      "grad_norm": 0.18962015294617535,
      "learning_rate": 6.92212111371658e-06,
      "loss": 0.0143,
      "num_tokens": 117297850.0,
      "step": 257
    },
    {
      "epoch": 2.0977596741344193,
      "grad_norm": 0.17122221487492462,
      "learning_rate": 6.8995970312405615e-06,
      "loss": 0.0126,
      "num_tokens": 117759960.0,
      "step": 258
    },
    {
      "epoch": 2.105906313645621,
      "grad_norm": 0.17247701616442646,
      "learning_rate": 6.877034061335384e-06,
      "loss": 0.0139,
      "num_tokens": 118229929.0,
      "step": 259
    },
    {
      "epoch": 2.114052953156823,
      "grad_norm": 0.18706937438179935,
      "learning_rate": 6.854432830907135e-06,
      "loss": 0.0151,
      "num_tokens": 118689637.0,
      "step": 260
    },
    {
      "epoch": 2.1221995926680246,
      "grad_norm": 0.1756410658036281,
      "learning_rate": 6.831793967924953e-06,
      "loss": 0.0136,
      "num_tokens": 119159530.0,
      "step": 261
    },
    {
      "epoch": 2.130346232179226,
      "grad_norm": 0.17325809294266983,
      "learning_rate": 6.8091181014035935e-06,
      "loss": 0.014,
      "num_tokens": 119598302.0,
      "step": 262
    },
    {
      "epoch": 2.1384928716904277,
      "grad_norm": 0.1831164025049776,
      "learning_rate": 6.7864058613859395e-06,
      "loss": 0.0138,
      "num_tokens": 120108425.0,
      "step": 263
    },
    {
      "epoch": 2.1466395112016294,
      "grad_norm": 0.18048260933108903,
      "learning_rate": 6.763657878925508e-06,
      "loss": 0.015,
      "num_tokens": 120578186.0,
      "step": 264
    },
    {
      "epoch": 2.1547861507128308,
      "grad_norm": 0.16327229194519086,
      "learning_rate": 6.740874786068906e-06,
      "loss": 0.0126,
      "num_tokens": 121055989.0,
      "step": 265
    },
    {
      "epoch": 2.1629327902240325,
      "grad_norm": 0.16958620370157418,
      "learning_rate": 6.718057215838274e-06,
      "loss": 0.0144,
      "num_tokens": 121502528.0,
      "step": 266
    },
    {
      "epoch": 2.1710794297352343,
      "grad_norm": 0.1792898292473741,
      "learning_rate": 6.695205802213699e-06,
      "loss": 0.0136,
      "num_tokens": 121956627.0,
      "step": 267
    },
    {
      "epoch": 2.179226069246436,
      "grad_norm": 0.15481947737459167,
      "learning_rate": 6.672321180115595e-06,
      "loss": 0.0125,
      "num_tokens": 122426773.0,
      "step": 268
    },
    {
      "epoch": 2.1873727087576373,
      "grad_norm": 0.16707758315087737,
      "learning_rate": 6.6494039853870676e-06,
      "loss": 0.0132,
      "num_tokens": 122875336.0,
      "step": 269
    },
    {
      "epoch": 2.195519348268839,
      "grad_norm": 0.16476693800658634,
      "learning_rate": 6.6264548547762395e-06,
      "loss": 0.0138,
      "num_tokens": 123320079.0,
      "step": 270
    },
    {
      "epoch": 2.203665987780041,
      "grad_norm": 0.16291392396662507,
      "learning_rate": 6.603474425918573e-06,
      "loss": 0.0136,
      "num_tokens": 123791870.0,
      "step": 271
    },
    {
      "epoch": 2.211812627291242,
      "grad_norm": 0.1703687751088918,
      "learning_rate": 6.580463337319128e-06,
      "loss": 0.0133,
      "num_tokens": 124260736.0,
      "step": 272
    },
    {
      "epoch": 2.219959266802444,
      "grad_norm": 0.17901374374090187,
      "learning_rate": 6.557422228334852e-06,
      "loss": 0.0159,
      "num_tokens": 124681807.0,
      "step": 273
    },
    {
      "epoch": 2.2281059063136457,
      "grad_norm": 0.16798711219930113,
      "learning_rate": 6.534351739156797e-06,
      "loss": 0.0142,
      "num_tokens": 125127429.0,
      "step": 274
    },
    {
      "epoch": 2.2362525458248474,
      "grad_norm": 0.16305596345258705,
      "learning_rate": 6.5112525107923296e-06,
      "loss": 0.0135,
      "num_tokens": 125567336.0,
      "step": 275
    },
    {
      "epoch": 2.2443991853360488,
      "grad_norm": 0.17643316822000632,
      "learning_rate": 6.488125185047334e-06,
      "loss": 0.0147,
      "num_tokens": 126021900.0,
      "step": 276
    },
    {
      "epoch": 2.2525458248472505,
      "grad_norm": 0.167997975045288,
      "learning_rate": 6.464970404508369e-06,
      "loss": 0.0139,
      "num_tokens": 126491133.0,
      "step": 277
    },
    {
      "epoch": 2.2606924643584523,
      "grad_norm": 0.1808990629197575,
      "learning_rate": 6.4417888125248195e-06,
      "loss": 0.0153,
      "num_tokens": 126949660.0,
      "step": 278
    },
    {
      "epoch": 2.2688391038696536,
      "grad_norm": 0.18179273918150798,
      "learning_rate": 6.418581053191017e-06,
      "loss": 0.0155,
      "num_tokens": 127395046.0,
      "step": 279
    },
    {
      "epoch": 2.2769857433808554,
      "grad_norm": 0.16186916571289603,
      "learning_rate": 6.39534777132835e-06,
      "loss": 0.0141,
      "num_tokens": 127879266.0,
      "step": 280
    },
    {
      "epoch": 2.285132382892057,
      "grad_norm": 0.1687611769820901,
      "learning_rate": 6.3720896124673356e-06,
      "loss": 0.0142,
      "num_tokens": 128345971.0,
      "step": 281
    },
    {
      "epoch": 2.293279022403259,
      "grad_norm": 0.18415607421229815,
      "learning_rate": 6.348807222829704e-06,
      "loss": 0.0155,
      "num_tokens": 128804402.0,
      "step": 282
    },
    {
      "epoch": 2.30142566191446,
      "grad_norm": 0.16514691991418554,
      "learning_rate": 6.325501249310416e-06,
      "loss": 0.0146,
      "num_tokens": 129261613.0,
      "step": 283
    },
    {
      "epoch": 2.309572301425662,
      "grad_norm": 0.16769380960540944,
      "learning_rate": 6.302172339459717e-06,
      "loss": 0.0136,
      "num_tokens": 129748258.0,
      "step": 284
    },
    {
      "epoch": 2.3177189409368637,
      "grad_norm": 0.17542238238137692,
      "learning_rate": 6.278821141465126e-06,
      "loss": 0.0147,
      "num_tokens": 130203139.0,
      "step": 285
    },
    {
      "epoch": 2.325865580448065,
      "grad_norm": 0.1703028823912319,
      "learning_rate": 6.255448304133435e-06,
      "loss": 0.0144,
| "num_tokens": 130680052.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.3340122199592668, | |
| "grad_norm": 0.15875518919149162, | |
| "learning_rate": 6.232054476872674e-06, | |
| "loss": 0.013, | |
| "num_tokens": 131145142.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.3421588594704685, | |
| "grad_norm": 0.1559999046320083, | |
| "learning_rate": 6.208640309674081e-06, | |
| "loss": 0.0138, | |
| "num_tokens": 131606714.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.35030549898167, | |
| "grad_norm": 0.16638792870478772, | |
| "learning_rate": 6.185206453094026e-06, | |
| "loss": 0.0133, | |
| "num_tokens": 132070874.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.3584521384928716, | |
| "grad_norm": 0.16556273278032177, | |
| "learning_rate": 6.161753558235945e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 132523899.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.3665987780040734, | |
| "grad_norm": 0.1627153835397699, | |
| "learning_rate": 6.138282276732251e-06, | |
| "loss": 0.0141, | |
| "num_tokens": 132984150.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.374745417515275, | |
| "grad_norm": 0.17420180567604815, | |
| "learning_rate": 6.1147932607262215e-06, | |
| "loss": 0.0153, | |
| "num_tokens": 133423004.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.3828920570264764, | |
| "grad_norm": 0.1715872000392912, | |
| "learning_rate": 6.091287162853883e-06, | |
| "loss": 0.0143, | |
| "num_tokens": 133885515.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.391038696537678, | |
| "grad_norm": 0.15875189010502294, | |
| "learning_rate": 6.067764636225881e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 134334800.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.39918533604888, | |
| "grad_norm": 0.14222500494759646, | |
| "learning_rate": 6.0442263344093224e-06, | |
| "loss": 0.0128, | |
| "num_tokens": 134794203.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.4073319755600817, | |
| "grad_norm": 0.16587875726539164, | |
| "learning_rate": 6.020672911409626e-06, | |
| "loss": 0.014, | |
| "num_tokens": 135246488.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.415478615071283, | |
| "grad_norm": 0.16945883731715217, | |
| "learning_rate": 5.997105021652355e-06, | |
| "loss": 0.0142, | |
| "num_tokens": 135734483.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.423625254582485, | |
| "grad_norm": 0.17856893575225632, | |
| "learning_rate": 5.97352331996502e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 136180989.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.4317718940936865, | |
| "grad_norm": 0.16901738376392064, | |
| "learning_rate": 5.949928461558894e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 136633463.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.439918533604888, | |
| "grad_norm": 0.19440740998217734, | |
| "learning_rate": 5.926321102010808e-06, | |
| "loss": 0.0153, | |
| "num_tokens": 137065466.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.4480651731160896, | |
| "grad_norm": 0.17396280168075312, | |
| "learning_rate": 5.902701897244932e-06, | |
| "loss": 0.014, | |
| "num_tokens": 137519052.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.4562118126272914, | |
| "grad_norm": 0.1962070967726784, | |
| "learning_rate": 5.879071503514555e-06, | |
| "loss": 0.0167, | |
| "num_tokens": 137969737.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.4643584521384927, | |
| "grad_norm": 0.17287895679065615, | |
| "learning_rate": 5.855430577383842e-06, | |
| "loss": 0.0148, | |
| "num_tokens": 138433151.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.4725050916496945, | |
| "grad_norm": 0.19400622443946244, | |
| "learning_rate": 5.831779775709606e-06, | |
| "loss": 0.0148, | |
| "num_tokens": 138875359.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.480651731160896, | |
| "grad_norm": 0.16969104274852342, | |
| "learning_rate": 5.808119755623045e-06, | |
| "loss": 0.0141, | |
| "num_tokens": 139333435.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.4887983706720975, | |
| "grad_norm": 0.17975044746142824, | |
| "learning_rate": 5.784451174511486e-06, | |
| "loss": 0.0155, | |
| "num_tokens": 139787251.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.4969450101832993, | |
| "grad_norm": 0.18637909822915394, | |
| "learning_rate": 5.760774690000128e-06, | |
| "loss": 0.014, | |
| "num_tokens": 140263010.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.505091649694501, | |
| "grad_norm": 0.1755752695664621, | |
| "learning_rate": 5.7370909599337585e-06, | |
| "loss": 0.0143, | |
| "num_tokens": 140730852.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.513238289205703, | |
| "grad_norm": 0.17738520787824683, | |
| "learning_rate": 5.713400642358483e-06, | |
| "loss": 0.015, | |
| "num_tokens": 141160459.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.521384928716904, | |
| "grad_norm": 0.17114933786372763, | |
| "learning_rate": 5.689704395503438e-06, | |
| "loss": 0.0137, | |
| "num_tokens": 141652980.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.529531568228106, | |
| "grad_norm": 0.1702830061303869, | |
| "learning_rate": 5.666002877762506e-06, | |
| "loss": 0.0153, | |
| "num_tokens": 142092423.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.5376782077393076, | |
| "grad_norm": 0.17360071510124675, | |
| "learning_rate": 5.642296747676016e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 142533489.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.5458248472505094, | |
| "grad_norm": 0.1607500590426996, | |
| "learning_rate": 5.618586663912452e-06, | |
| "loss": 0.0133, | |
| "num_tokens": 142991787.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.5539714867617107, | |
| "grad_norm": 0.16048833714516317, | |
| "learning_rate": 5.594873285250151e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 143468508.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.5621181262729125, | |
| "grad_norm": 0.1838591156346174, | |
| "learning_rate": 5.571157270558995e-06, | |
| "loss": 0.0163, | |
| "num_tokens": 143916886.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.5702647657841142, | |
| "grad_norm": 0.162005395980572, | |
| "learning_rate": 5.5474392787821096e-06, | |
| "loss": 0.0135, | |
| "num_tokens": 144388134.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.5784114052953155, | |
| "grad_norm": 0.14863787001529957, | |
| "learning_rate": 5.52371996891755e-06, | |
| "loss": 0.0132, | |
| "num_tokens": 144871370.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.5865580448065173, | |
| "grad_norm": 0.16754180761222826, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.0146, | |
| "num_tokens": 145320563.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.594704684317719, | |
| "grad_norm": 0.18005757817722826, | |
| "learning_rate": 5.476280031082451e-06, | |
| "loss": 0.016, | |
| "num_tokens": 145758817.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.6028513238289204, | |
| "grad_norm": 0.17034049730069928, | |
| "learning_rate": 5.452560721217892e-06, | |
| "loss": 0.0155, | |
| "num_tokens": 146189214.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.610997963340122, | |
| "grad_norm": 0.15413011304140098, | |
| "learning_rate": 5.428842729441008e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 146640888.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.619144602851324, | |
| "grad_norm": 0.17242345415805765, | |
| "learning_rate": 5.405126714749852e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 147089993.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.627291242362525, | |
| "grad_norm": 0.15793761105384327, | |
| "learning_rate": 5.38141333608755e-06, | |
| "loss": 0.0137, | |
| "num_tokens": 147549085.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.635437881873727, | |
| "grad_norm": 0.15260230173501832, | |
| "learning_rate": 5.357703252323985e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 148018238.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.6435845213849287, | |
| "grad_norm": 0.17616115019719872, | |
| "learning_rate": 5.333997122237497e-06, | |
| "loss": 0.0142, | |
| "num_tokens": 148467378.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.6517311608961305, | |
| "grad_norm": 0.16869830739625263, | |
| "learning_rate": 5.310295604496563e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 148924273.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.6598778004073322, | |
| "grad_norm": 0.1516947132562575, | |
| "learning_rate": 5.286599357641519e-06, | |
| "loss": 0.0132, | |
| "num_tokens": 149394678.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.6680244399185336, | |
| "grad_norm": 0.1644528806031863, | |
| "learning_rate": 5.262909040066243e-06, | |
| "loss": 0.0138, | |
| "num_tokens": 149841850.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.6761710794297353, | |
| "grad_norm": 0.1958369178369615, | |
| "learning_rate": 5.239225309999875e-06, | |
| "loss": 0.0156, | |
| "num_tokens": 150282571.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.684317718940937, | |
| "grad_norm": 0.18244134325631398, | |
| "learning_rate": 5.215548825488514e-06, | |
| "loss": 0.0148, | |
| "num_tokens": 150723879.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.6924643584521384, | |
| "grad_norm": 0.16252807203895167, | |
| "learning_rate": 5.191880244376957e-06, | |
| "loss": 0.015, | |
| "num_tokens": 151164471.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.70061099796334, | |
| "grad_norm": 0.16907582749071554, | |
| "learning_rate": 5.168220224290395e-06, | |
| "loss": 0.013, | |
| "num_tokens": 151627236.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.708757637474542, | |
| "grad_norm": 0.1521247341456988, | |
| "learning_rate": 5.144569422616159e-06, | |
| "loss": 0.0128, | |
| "num_tokens": 152112152.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.716904276985743, | |
| "grad_norm": 0.15854466142189536, | |
| "learning_rate": 5.120928496485448e-06, | |
| "loss": 0.013, | |
| "num_tokens": 152585932.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.725050916496945, | |
| "grad_norm": 0.17382518355000084, | |
| "learning_rate": 5.097298102755069e-06, | |
| "loss": 0.0139, | |
| "num_tokens": 153055065.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.7331975560081467, | |
| "grad_norm": 0.17769717946639274, | |
| "learning_rate": 5.073678897989194e-06, | |
| "loss": 0.0146, | |
| "num_tokens": 153518977.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.741344195519348, | |
| "grad_norm": 0.16803081279999066, | |
| "learning_rate": 5.050071538441107e-06, | |
| "loss": 0.0138, | |
| "num_tokens": 153976769.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.74949083503055, | |
| "grad_norm": 0.17280566271506004, | |
| "learning_rate": 5.026476680034983e-06, | |
| "loss": 0.0154, | |
| "num_tokens": 154408635.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.7576374745417516, | |
| "grad_norm": 0.169314934778943, | |
| "learning_rate": 5.002894978347646e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 154856201.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.765784114052953, | |
| "grad_norm": 0.18005778973651862, | |
| "learning_rate": 4.979327088590375e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 155310653.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7739307535641546, | |
| "grad_norm": 0.16841593789310932, | |
| "learning_rate": 4.95577366559068e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 155788563.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.7820773930753564, | |
| "grad_norm": 0.17699144806638442, | |
| "learning_rate": 4.932235363774121e-06, | |
| "loss": 0.015, | |
| "num_tokens": 156228468.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.790224032586558, | |
| "grad_norm": 0.15791462498013234, | |
| "learning_rate": 4.908712837146118e-06, | |
| "loss": 0.014, | |
| "num_tokens": 156664176.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.79837067209776, | |
| "grad_norm": 0.15227035101116576, | |
| "learning_rate": 4.88520673927378e-06, | |
| "loss": 0.0128, | |
| "num_tokens": 157134252.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.8065173116089612, | |
| "grad_norm": 0.1594189404919013, | |
| "learning_rate": 4.861717723267752e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 157599805.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.814663951120163, | |
| "grad_norm": 0.15995395220658057, | |
| "learning_rate": 4.838246441764056e-06, | |
| "loss": 0.0134, | |
| "num_tokens": 158053673.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.8228105906313647, | |
| "grad_norm": 0.18202172640131933, | |
| "learning_rate": 4.814793546905977e-06, | |
| "loss": 0.0157, | |
| "num_tokens": 158485241.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.830957230142566, | |
| "grad_norm": 0.15339369785350124, | |
| "learning_rate": 4.791359690325921e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 158947625.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.839103869653768, | |
| "grad_norm": 0.16788836990713416, | |
| "learning_rate": 4.767945523127327e-06, | |
| "loss": 0.0137, | |
| "num_tokens": 159423146.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.8472505091649696, | |
| "grad_norm": 0.16165924770039114, | |
| "learning_rate": 4.744551695866567e-06, | |
| "loss": 0.0148, | |
| "num_tokens": 159861129.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.855397148676171, | |
| "grad_norm": 0.1774028782674121, | |
| "learning_rate": 4.721178858534876e-06, | |
| "loss": 0.0148, | |
| "num_tokens": 160329576.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.8635437881873727, | |
| "grad_norm": 0.16285854302808034, | |
| "learning_rate": 4.697827660540285e-06, | |
| "loss": 0.014, | |
| "num_tokens": 160797840.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.8716904276985744, | |
| "grad_norm": 0.18877633260447374, | |
| "learning_rate": 4.674498750689585e-06, | |
| "loss": 0.0147, | |
| "num_tokens": 161243065.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.8798370672097757, | |
| "grad_norm": 0.1601336012550065, | |
| "learning_rate": 4.651192777170298e-06, | |
| "loss": 0.0143, | |
| "num_tokens": 161699619.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.8879837067209775, | |
| "grad_norm": 0.16259855324262715, | |
| "learning_rate": 4.627910387532663e-06, | |
| "loss": 0.014, | |
| "num_tokens": 162166184.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.8961303462321792, | |
| "grad_norm": 0.17610385289208558, | |
| "learning_rate": 4.604652228671653e-06, | |
| "loss": 0.0147, | |
| "num_tokens": 162610492.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.904276985743381, | |
| "grad_norm": 0.1838783740248808, | |
| "learning_rate": 4.581418946808983e-06, | |
| "loss": 0.0153, | |
| "num_tokens": 163056383.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.9124236252545828, | |
| "grad_norm": 0.15216837256215965, | |
| "learning_rate": 4.558211187475181e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 163543282.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.920570264765784, | |
| "grad_norm": 0.15127415963746377, | |
| "learning_rate": 4.535029595491632e-06, | |
| "loss": 0.0125, | |
| "num_tokens": 163999105.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.928716904276986, | |
| "grad_norm": 0.17498160266417795, | |
| "learning_rate": 4.511874814952668e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 164458000.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.9368635437881876, | |
| "grad_norm": 0.16423843849992176, | |
| "learning_rate": 4.488747489207672e-06, | |
| "loss": 0.0133, | |
| "num_tokens": 164928642.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.945010183299389, | |
| "grad_norm": 0.15573818980541582, | |
| "learning_rate": 4.4656482608432054e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 165408976.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.9531568228105907, | |
| "grad_norm": 0.17543688765013044, | |
| "learning_rate": 4.442577771665147e-06, | |
| "loss": 0.014, | |
| "num_tokens": 165886616.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.9613034623217924, | |
| "grad_norm": 0.16511980145949437, | |
| "learning_rate": 4.419536662680873e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 166343018.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.9694501018329937, | |
| "grad_norm": 0.1587473778216488, | |
| "learning_rate": 4.39652557408143e-06, | |
| "loss": 0.0124, | |
| "num_tokens": 166801376.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.9775967413441955, | |
| "grad_norm": 0.17009893552558653, | |
| "learning_rate": 4.373545145223761e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 167280403.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.9857433808553973, | |
| "grad_norm": 0.15750436171587542, | |
| "learning_rate": 4.350596014612935e-06, | |
| "loss": 0.0134, | |
| "num_tokens": 167726691.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.9938900203665986, | |
| "grad_norm": 0.17197192413672613, | |
| "learning_rate": 4.327678819884405e-06, | |
| "loss": 0.0143, | |
| "num_tokens": 168173644.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.19850738254699854, | |
| "learning_rate": 4.304794197786304e-06, | |
| "loss": 0.0141, | |
| "num_tokens": 168216390.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.041459400206804276, | |
| "eval_num_tokens": 168216390.0, | |
| "eval_runtime": 58.1871, | |
| "eval_samples_per_second": 41.917, | |
| "eval_steps_per_second": 5.242, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.0081466395112018, | |
| "grad_norm": 0.12585174258784562, | |
| "learning_rate": 4.281942784161728e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 168660422.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.016293279022403, | |
| "grad_norm": 0.11765413035696883, | |
| "learning_rate": 4.2591252139310945e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 169121635.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.024439918533605, | |
| "grad_norm": 0.11700242994990097, | |
| "learning_rate": 4.2363421210744925e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 169588292.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.0325865580448066, | |
| "grad_norm": 0.13410847188727293, | |
| "learning_rate": 4.213594138614062e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 170048576.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.0407331975560083, | |
| "grad_norm": 0.11184500956394558, | |
| "learning_rate": 4.190881898596409e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 170553649.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.0488798370672097, | |
| "grad_norm": 0.12083327220094565, | |
| "learning_rate": 4.168206032075048e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 171011806.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.0570264765784114, | |
| "grad_norm": 0.13145187085930216, | |
| "learning_rate": 4.1455671690928666e-06, | |
| "loss": 0.009, | |
| "num_tokens": 171488462.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.065173116089613, | |
| "grad_norm": 0.13334793710473314, | |
| "learning_rate": 4.122965938664616e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 171943130.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.0733197556008145, | |
| "grad_norm": 0.1332625062123775, | |
| "learning_rate": 4.100402968759441e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 172384061.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.0814663951120163, | |
| "grad_norm": 0.13147800386811567, | |
| "learning_rate": 4.077878886283422e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 172832702.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.089613034623218, | |
| "grad_norm": 0.1411078689570707, | |
| "learning_rate": 4.055394317062168e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 173290817.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.0977596741344193, | |
| "grad_norm": 0.1284905098348191, | |
| "learning_rate": 4.03294988582342e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 173766754.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.105906313645621, | |
| "grad_norm": 0.13291783263584392, | |
| "learning_rate": 4.010546216179697e-06, | |
| "loss": 0.008, | |
| "num_tokens": 174227586.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.114052953156823, | |
| "grad_norm": 0.13439803780962148, | |
| "learning_rate": 3.988183930610967e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 174684443.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.1221995926680246, | |
| "grad_norm": 0.1318097744846226, | |
| "learning_rate": 3.965863650447355e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 175153040.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.130346232179226, | |
| "grad_norm": 0.14505278918262016, | |
| "learning_rate": 3.943585995851872e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 175616900.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.1384928716904277, | |
| "grad_norm": 0.143736668078946, | |
| "learning_rate": 3.9213515858031984e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 176098251.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.1466395112016294, | |
| "grad_norm": 0.13749127082571724, | |
| "learning_rate": 3.8991610380784626e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 176570672.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.1547861507128308, | |
| "grad_norm": 0.15661494242610496, | |
| "learning_rate": 3.877014969236102e-06, | |
| "loss": 0.0101, | |
| "num_tokens": 177008465.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.1629327902240325, | |
| "grad_norm": 0.15062683514898298, | |
| "learning_rate": 3.854913994598715e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 177466175.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.1710794297352343, | |
| "grad_norm": 0.1391922011105707, | |
| "learning_rate": 3.832858728235971e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 177917874.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.179226069246436, | |
| "grad_norm": 0.1552031660404893, | |
| "learning_rate": 3.8108497829475465e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 178367628.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.1873727087576373, | |
| "grad_norm": 0.13811754646428342, | |
| "learning_rate": 3.7888877702460992e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 178825445.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.195519348268839, | |
| "grad_norm": 0.12162345237220032, | |
| "learning_rate": 3.7669733003402775e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 179301109.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.203665987780041, | |
| "grad_norm": 0.13707719742366498, | |
| "learning_rate": 3.7451069821177677e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 179757593.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.211812627291242, | |
| "grad_norm": 0.13095735092161556, | |
| "learning_rate": 3.7232894231283724e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 180213993.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.219959266802444, | |
| "grad_norm": 0.13262472070811615, | |
| "learning_rate": 3.701521229567131e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 180668901.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.2281059063136457, | |
| "grad_norm": 0.13971045948367564, | |
| "learning_rate": 3.6798030062574807e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 181137029.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.2362525458248474, | |
| "grad_norm": 0.15719898296312626, | |
| "learning_rate": 3.6581353566344447e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 181583795.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.2443991853360488, | |
| "grad_norm": 0.13349745981088976, | |
| "learning_rate": 3.6365188827278752e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 182040738.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.2525458248472505, | |
| "grad_norm": 0.1507228385771512, | |
| "learning_rate": 3.6149541851457183e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 182494412.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.2606924643584523, | |
| "grad_norm": 0.13598098409095466, | |
| "learning_rate": 3.593441863057325e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 182943146.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.2688391038696536, | |
| "grad_norm": 0.13606743657097284, | |
| "learning_rate": 3.5719825141768128e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 183393591.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.2769857433808554, | |
| "grad_norm": 0.14156987679154379, | |
| "learning_rate": 3.5505767347464504e-06, | |
| "loss": 0.009, | |
| "num_tokens": 183862449.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.285132382892057, | |
| "grad_norm": 0.13512553050700174, | |
| "learning_rate": 3.5292251195200932e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 184305229.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.293279022403259, | |
| "grad_norm": 0.11472791583197466, | |
| "learning_rate": 3.5079282617466594e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 184802522.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.30142566191446, | |
| "grad_norm": 0.12789474002800086, | |
| "learning_rate": 3.486686753153645e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 185274960.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.309572301425662, | |
| "grad_norm": 0.1275610588019882, | |
| "learning_rate": 3.4655011839306866e-06, | |
| "loss": 0.009, | |
| "num_tokens": 185709382.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.3177189409368637, | |
| "grad_norm": 0.1404980269677411, | |
| "learning_rate": 3.4443721427131593e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 186161144.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.325865580448065, | |
| "grad_norm": 0.13529566839707055, | |
| "learning_rate": 3.423300216565819e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 186619778.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.3340122199592668, | |
| "grad_norm": 0.1387178170918977, | |
| "learning_rate": 3.4022859909664957e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 187041856.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.3421588594704685, | |
| "grad_norm": 0.13789162045155967, | |
| "learning_rate": 3.3813300497898326e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 187505631.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.35030549898167, | |
| "grad_norm": 0.137718313724877, | |
| "learning_rate": 3.3604329752910468e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 187962839.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.3584521384928716, | |
| "grad_norm": 0.13226236747300735, | |
| "learning_rate": 3.339595348089767e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 188406846.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.3665987780040734, | |
| "grad_norm": 0.13283015288873243, | |
| "learning_rate": 3.3188177471538864e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 188859539.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.374745417515275, | |
| "grad_norm": 0.13902664596528255, | |
| "learning_rate": 3.2981007497834922e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 189323101.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.3828920570264764, | |
| "grad_norm": 0.13509517554370873, | |
| "learning_rate": 3.2774449315948147e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 189823493.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.391038696537678, | |
| "grad_norm": 0.1366523338854662, | |
| "learning_rate": 3.2568508665042383e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 190301541.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.39918533604888, | |
| "grad_norm": 0.125577137562613, | |
| "learning_rate": 3.2363191267123517e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 190798114.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.4073319755600817, | |
| "grad_norm": 0.14591111241424826, | |
| "learning_rate": 3.215850282688055e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 191261005.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.415478615071283, | |
| "grad_norm": 0.12604467726858234, | |
| "learning_rate": 3.195444903152703e-06, | |
| "loss": 0.008, | |
| "num_tokens": 191709305.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.423625254582485, | |
| "grad_norm": 0.13382954324399682, | |
| "learning_rate": 3.1751035550643107e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 192209220.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.4317718940936865, | |
| "grad_norm": 0.13698395980312603, | |
| "learning_rate": 3.1548268036017904e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 192639412.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.439918533604888, | |
| "grad_norm": 0.13829425626998468, | |
| "learning_rate": 3.134615212149258e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 193098241.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.4480651731160896, | |
| "grad_norm": 0.11711892810797479, | |
| "learning_rate": 3.114469342280379e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 193574245.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.4562118126272914, | |
| "grad_norm": 0.1309214084812048, | |
| "learning_rate": 3.094389753742758e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 194017166.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.4643584521384927, | |
| "grad_norm": 0.14133229462166405, | |
| "learning_rate": 3.0743770044423936e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 194461022.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.4725050916496945, | |
| "grad_norm": 0.12915594606644895, | |
| "learning_rate": 3.0544316504281677e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 194921886.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.480651731160896, | |
| "grad_norm": 0.13019588847393995, | |
| "learning_rate": 3.03455424587641e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 195394552.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.4887983706720975, | |
| "grad_norm": 0.12493252602627915, | |
| "learning_rate": 3.014745343075488e-06, | |
| "loss": 0.009, | |
| "num_tokens": 195853843.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.4969450101832993, | |
| "grad_norm": 0.13292973796735513, | |
| "learning_rate": 2.995005492410469e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 196316073.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.505091649694501, | |
| "grad_norm": 0.15361936626468706, | |
| "learning_rate": 2.975335242347822e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 196747650.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.513238289205703, | |
| "grad_norm": 0.12126261520512835, | |
| "learning_rate": 2.9557351394201855e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 197222644.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.521384928716904, | |
| "grad_norm": 0.14364063312304898, | |
| "learning_rate": 2.9362057282111754e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 197703977.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.529531568228106, | |
| "grad_norm": 0.1285606277274214, | |
| "learning_rate": 2.9167475513402592e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 198159184.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.5376782077393076, | |
| "grad_norm": 0.12784246623295054, | |
| "learning_rate": 2.897361149447679e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 198611287.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.5458248472505094, | |
| "grad_norm": 0.1297694309800873, | |
| "learning_rate": 2.878047061179422e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 199069757.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.5539714867617107, | |
| "grad_norm": 0.1292114725276358, | |
| "learning_rate": 2.858805823172264e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 199540737.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.5621181262729125, | |
| "grad_norm": 0.12887249746822058, | |
| "learning_rate": 2.839637970038861e-06, | |
| "loss": 0.009, | |
| "num_tokens": 199982367.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.5702647657841142, | |
| "grad_norm": 0.1280981502556342, | |
| "learning_rate": 2.8205440343528856e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 200427445.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.5784114052953155, | |
| "grad_norm": 0.13979378072527007, | |
| "learning_rate": 2.8015245466342287e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 200889454.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.5865580448065173, | |
| "grad_norm": 0.14089395360902868, | |
| "learning_rate": 2.7825800353342734e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 201331340.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.594704684317719, | |
| "grad_norm": 0.1380485690052255, | |
| "learning_rate": 2.763711026821196e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 201788908.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.6028513238289204, | |
| "grad_norm": 0.13663809301177426, | |
| "learning_rate": 2.7449180453653544e-06, | |
| "loss": 0.009, | |
| "num_tokens": 202225257.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.610997963340122, | |
| "grad_norm": 0.1473692732003636, | |
| "learning_rate": 2.72620161312471e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 202692568.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.619144602851324, | |
| "grad_norm": 0.1257774235275037, | |
| "learning_rate": 2.7075622501303255e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 203149741.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.627291242362525, | |
| "grad_norm": 0.13888313118631118, | |
| "learning_rate": 2.689000474271918e-06, | |
| "loss": 0.008, | |
| "num_tokens": 203602311.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.635437881873727, | |
| "grad_norm": 0.15749316142966002, | |
| "learning_rate": 2.670516801283464e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 204036522.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.6435845213849287, | |
| "grad_norm": 0.12907115857092855, | |
| "learning_rate": 2.652111744728876e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 204486691.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.6517311608961305, | |
| "grad_norm": 0.13596062968350994, | |
| "learning_rate": 2.6337858159877226e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 204952023.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.6598778004073322, | |
| "grad_norm": 0.13346166766765533, | |
| "learning_rate": 2.615539524241036e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 205402274.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.6680244399185336, | |
| "grad_norm": 0.12166137102621093, | |
| "learning_rate": 2.5973733764571486e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 205859233.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.6761710794297353, | |
| "grad_norm": 0.13150089757352357, | |
| "learning_rate": 2.5792878773776225e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 206314665.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.684317718940937, | |
| "grad_norm": 0.137357932504932, | |
| "learning_rate": 2.561283529503208e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 206766146.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.6924643584521384, | |
| "grad_norm": 0.13479268397128444, | |
| "learning_rate": 2.5433608330798974e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 207200864.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.70061099796334, | |
| "grad_norm": 0.11930317957334262, | |
| "learning_rate": 2.5255202860850157e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 207685884.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.708757637474542, | |
| "grad_norm": 0.13838840583020326, | |
| "learning_rate": 2.5077623842133895e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 208130253.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.716904276985743, | |
| "grad_norm": 0.11517621045103824, | |
| "learning_rate": 2.490087620863573e-06, | |
| "loss": 0.0069, | |
| "num_tokens": 208602611.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.725050916496945, | |
| "grad_norm": 0.13060665615710568, | |
| "learning_rate": 2.4724964871241387e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 209056174.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.7331975560081467, | |
| "grad_norm": 0.12008096972228131, | |
| "learning_rate": 2.454989471760031e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 209524636.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.741344195519348, | |
| "grad_norm": 0.12402851148890304, | |
| "learning_rate": 2.437567061198991e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 209994196.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.74949083503055, | |
| "grad_norm": 0.1386116097689955, | |
| "learning_rate": 2.4202297395180353e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 210466756.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.7576374745417516, | |
| "grad_norm": 0.13027448435015335, | |
| "learning_rate": 2.4029779884300084e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 210950806.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.765784114052953, | |
| "grad_norm": 0.13406681887661104, | |
| "learning_rate": 2.3858122872702004e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 211404708.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.7739307535641546, | |
| "grad_norm": 0.12564076418855794, | |
| "learning_rate": 2.3687331129830276e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 211866245.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.7820773930753564, | |
| "grad_norm": 0.14390639481706682, | |
| "learning_rate": 2.3517409401087787e-06, | |
| "loss": 0.01, | |
| "num_tokens": 212305458.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.790224032586558, | |
| "grad_norm": 0.12121296971812623, | |
| "learning_rate": 2.3348362407704313e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 212752369.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.79837067209776, | |
| "grad_norm": 0.13138683401901344, | |
| "learning_rate": 2.3180194846605367e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 213222393.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.8065173116089612, | |
| "grad_norm": 0.12175266765217344, | |
| "learning_rate": 2.301291139028164e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 213681819.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.814663951120163, | |
| "grad_norm": 0.1371049624563703, | |
| "learning_rate": 2.284651668665923e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 214137525.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.8228105906313647, | |
| "grad_norm": 0.12055815969853237, | |
| "learning_rate": 2.268101535897046e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 214589391.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.830957230142566, | |
| "grad_norm": 0.14290097179006628, | |
| "learning_rate": 2.2516412005625465e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 215032404.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.839103869653768, | |
| "grad_norm": 0.12498594646510086, | |
| "learning_rate": 2.235271120008439e-06, | |
| "loss": 0.008, | |
| "num_tokens": 215507249.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.8472505091649696, | |
| "grad_norm": 0.12951287934361264, | |
| "learning_rate": 2.218991749073032e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 215963900.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.855397148676171, | |
| "grad_norm": 0.1340360089239412, | |
| "learning_rate": 2.2028035400742946e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 216413372.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.8635437881873727, | |
| "grad_norm": 0.1428815749513004, | |
| "learning_rate": 2.1867069427972814e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 216874891.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.8716904276985744, | |
| "grad_norm": 0.15258894927804814, | |
| "learning_rate": 2.1707024044816433e-06, | |
| "loss": 0.01, | |
| "num_tokens": 217335057.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.8798370672097757, | |
| "grad_norm": 0.11854572238956909, | |
| "learning_rate": 2.1547903698091975e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 217810990.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.8879837067209775, | |
| "grad_norm": 0.12413149337289436, | |
| "learning_rate": 2.13897128089157e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 218271262.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.8961303462321792, | |
| "grad_norm": 0.13054286386457706, | |
| "learning_rate": 2.1232455772579164e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 218733996.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.904276985743381, | |
| "grad_norm": 0.14174809290893123, | |
| "learning_rate": 2.107613695842705e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 219193703.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.9124236252545828, | |
| "grad_norm": 0.13172558951478341, | |
| "learning_rate": 2.09207607097358e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 219637585.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.920570264765784, | |
| "grad_norm": 0.12825966468997463, | |
| "learning_rate": 2.0766331343592935e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 220100782.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.928716904276986, | |
| "grad_norm": 0.11619016881857674, | |
| "learning_rate": 2.0612853150777083e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 220548817.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.9368635437881876, | |
| "grad_norm": 0.12824773954267013, | |
| "learning_rate": 2.0460330395638754e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 220986452.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.945010183299389, | |
| "grad_norm": 0.12829553056162407, | |
| "learning_rate": 2.030876731598194e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 221480796.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.9531568228105907, | |
| "grad_norm": 0.12618210454698364, | |
| "learning_rate": 2.0158168122946254e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 221927605.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.9613034623217924, | |
| "grad_norm": 0.1351329578005386, | |
| "learning_rate": 2.000853700089001e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 222378433.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.9694501018329937, | |
| "grad_norm": 0.12217015553170964, | |
| "learning_rate": 1.9859878107273884e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 222829736.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.9775967413441955, | |
| "grad_norm": 0.132799994524403, | |
| "learning_rate": 1.971219557254548e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 223285731.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.9857433808553973, | |
| "grad_norm": 0.12499217681603624, | |
| "learning_rate": 1.956549350002454e-06, | |
| "loss": 0.0069, | |
| "num_tokens": 223757013.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.9938900203665986, | |
| "grad_norm": 0.1140878012499302, | |
| "learning_rate": 1.9419775965788897e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 224255873.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.16568246569300987, | |
| "learning_rate": 1.9275047018561265e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 224299419.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.04265177622437477, | |
| "eval_num_tokens": 224299419.0, | |
| "eval_runtime": 57.842, | |
| "eval_samples_per_second": 42.167, | |
| "eval_steps_per_second": 5.273, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.008146639511201, | |
| "grad_norm": 0.10953124483033916, | |
| "learning_rate": 1.913131067959673e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 224740301.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.0162932790224035, | |
| "grad_norm": 0.09661175616513212, | |
| "learning_rate": 1.8988570942571039e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 225184668.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.024439918533605, | |
| "grad_norm": 0.09108581236399259, | |
| "learning_rate": 1.8846831773469587e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 225644004.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.032586558044806, | |
| "grad_norm": 0.08286087324083283, | |
| "learning_rate": 1.8706097110477298e-06, | |
| "loss": 0.0049, | |
| "num_tokens": 226150717.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.040733197556008, | |
| "grad_norm": 0.0987575217688521, | |
| "learning_rate": 1.8566370863869122e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 226596638.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.04887983706721, | |
| "grad_norm": 0.10313893410726134, | |
| "learning_rate": 1.8427656915901428e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 227070697.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.057026476578411, | |
| "grad_norm": 0.08409803434100602, | |
| "learning_rate": 1.8289959120704204e-06, | |
| "loss": 0.0045, | |
| "num_tokens": 227563263.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.065173116089613, | |
| "grad_norm": 0.10477823554325051, | |
| "learning_rate": 1.8153281304173842e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 228039640.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.0733197556008145, | |
| "grad_norm": 0.09829199382018614, | |
| "learning_rate": 1.801762726386691e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 228524467.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.081466395112017, | |
| "grad_norm": 0.09331418832378849, | |
| "learning_rate": 1.7883000768894627e-06, | |
| "loss": 0.0047, | |
| "num_tokens": 228994748.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.089613034623218, | |
| "grad_norm": 0.09581281591436303, | |
| "learning_rate": 1.7749405559818162e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 229450908.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.097759674134419, | |
| "grad_norm": 0.10440722069541235, | |
| "learning_rate": 1.7616845348544657e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 229910862.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.1059063136456215, | |
| "grad_norm": 0.09700791936550231, | |
| "learning_rate": 1.7485323818224126e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 230369276.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.114052953156823, | |
| "grad_norm": 0.10857998726786411, | |
| "learning_rate": 1.7354844623147116e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 230827234.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.122199592668024, | |
| "grad_norm": 0.10148060818665218, | |
| "learning_rate": 1.722541138864316e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 231311328.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.130346232179226, | |
| "grad_norm": 0.09790332100802439, | |
| "learning_rate": 1.7097027710980059e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 231774619.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.138492871690428, | |
| "grad_norm": 0.11303318804836798, | |
| "learning_rate": 1.6969697157263968e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 232234778.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 4.146639511201629, | |
| "grad_norm": 0.11928599820659892, | |
| "learning_rate": 1.6843423265340241e-06, | |
| "loss": 0.0066, | |
| "num_tokens": 232685797.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.154786150712831, | |
| "grad_norm": 0.10102982747758138, | |
| "learning_rate": 1.6718209543695198e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 233131575.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 4.1629327902240325, | |
| "grad_norm": 0.10103824755044703, | |
| "learning_rate": 1.6594059471358603e-06, | |
| "loss": 0.005, | |
| "num_tokens": 233592206.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 4.171079429735234, | |
| "grad_norm": 0.09473975634726714, | |
| "learning_rate": 1.6470976497807028e-06, | |
| "loss": 0.0049, | |
| "num_tokens": 234073717.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 4.179226069246436, | |
| "grad_norm": 0.10317754634136525, | |
| "learning_rate": 1.6348964042867963e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 234525493.0, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 4.187372708757637, | |
| "grad_norm": 0.10883171792223603, | |
| "learning_rate": 1.6228025496624816e-06, | |
| "loss": 0.0062, | |
| "num_tokens": 234975032.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.195519348268839, | |
| "grad_norm": 0.10753806313999263, | |
| "learning_rate": 1.6108164219322759e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 235438383.0, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 4.203665987780041, | |
| "grad_norm": 0.1064236411620558, | |
| "learning_rate": 1.598938354127532e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 235879893.0, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 4.211812627291242, | |
| "grad_norm": 0.12022958396721184, | |
| "learning_rate": 1.5871686762771876e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 236349201.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 4.219959266802444, | |
| "grad_norm": 0.11124601452546444, | |
| "learning_rate": 1.5755077153985927e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 236800777.0, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 4.228105906313646, | |
| "grad_norm": 0.09236406495488149, | |
| "learning_rate": 1.5639557954884263e-06, | |
| "loss": 0.0043, | |
| "num_tokens": 237293264.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.236252545824847, | |
| "grad_norm": 0.1273600213134377, | |
| "learning_rate": 1.552513237513694e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 237724964.0, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 4.244399185336049, | |
| "grad_norm": 0.11769756147992531, | |
| "learning_rate": 1.541180359402809e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 238180300.0, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 4.2525458248472505, | |
| "grad_norm": 0.1217965830011373, | |
| "learning_rate": 1.5299574760367564e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 238636931.0, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 4.260692464358452, | |
| "grad_norm": 0.11472619272622367, | |
| "learning_rate": 1.5188448992403504e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 239086905.0, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 4.268839103869654, | |
| "grad_norm": 0.11529863540901476, | |
| "learning_rate": 1.5078429377735626e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 239550473.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.276985743380855, | |
| "grad_norm": 0.1156481007594638, | |
| "learning_rate": 1.4969518973229526e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 239995374.0, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 4.285132382892057, | |
| "grad_norm": 0.10698685435134675, | |
| "learning_rate": 1.4861720804931665e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 240466754.0, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 4.293279022403259, | |
| "grad_norm": 0.11289580051998427, | |
| "learning_rate": 1.4755037867985285e-06, | |
| "loss": 0.006, | |
| "num_tokens": 240906071.0, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 4.30142566191446, | |
| "grad_norm": 0.11536050235837439, | |
| "learning_rate": 1.4649473126547273e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 241355455.0, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 4.3095723014256615, | |
| "grad_norm": 0.11636487088267386, | |
| "learning_rate": 1.4545029513705735e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 241836525.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.317718940936864, | |
| "grad_norm": 0.09846554835421734, | |
| "learning_rate": 1.4441709931398513e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 242307462.0, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 4.325865580448065, | |
| "grad_norm": 0.1120813571543054, | |
| "learning_rate": 1.4339517250332565e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 242741978.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 4.334012219959266, | |
| "grad_norm": 0.1113819187138935, | |
| "learning_rate": 1.4238454309904205e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 243192201.0, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 4.3421588594704685, | |
| "grad_norm": 0.11764577339647353, | |
| "learning_rate": 1.4138523918120201e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 243636087.0, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 4.35030549898167, | |
| "grad_norm": 0.11164487804753273, | |
| "learning_rate": 1.4039728851519764e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 244110581.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.358452138492872, | |
| "grad_norm": 0.09698712924798691, | |
| "learning_rate": 1.3942071855097381e-06, | |
| "loss": 0.0049, | |
| "num_tokens": 244572435.0, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 4.366598778004073, | |
| "grad_norm": 0.1104930978310767, | |
| "learning_rate": 1.3845555642226583e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 245032371.0, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 4.374745417515275, | |
| "grad_norm": 0.11183186111310507, | |
| "learning_rate": 1.375018289458453e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 245488372.0, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 4.382892057026477, | |
| "grad_norm": 0.11660880918067139, | |
| "learning_rate": 1.3655956262077502e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 245947576.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 4.391038696537678, | |
| "grad_norm": 0.11734436147080707, | |
| "learning_rate": 1.3562878362767296e-06, | |
| "loss": 0.006, | |
| "num_tokens": 246410789.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.3991853360488795, | |
| "grad_norm": 0.11187947506861028, | |
| "learning_rate": 1.3470951782798432e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 246885080.0, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 4.407331975560082, | |
| "grad_norm": 0.10682796561668163, | |
| "learning_rate": 1.338017907632635e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 247344383.0, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 4.415478615071283, | |
| "grad_norm": 0.11487602768278418, | |
| "learning_rate": 1.329056276544642e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 247825702.0, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 4.423625254582484, | |
| "grad_norm": 0.10954303849780199, | |
| "learning_rate": 1.320210534012388e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 248301334.0, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 4.4317718940936865, | |
| "grad_norm": 0.1065560110571518, | |
| "learning_rate": 1.311480925812461e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 248770660.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.439918533604888, | |
| "grad_norm": 0.12112306787916738, | |
| "learning_rate": 1.3028676944946916e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 249197698.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 4.44806517311609, | |
| "grad_norm": 0.10503970639083068, | |
| "learning_rate": 1.2943710793754082e-06, | |
| "loss": 0.0049, | |
| "num_tokens": 249659509.0, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 4.456211812627291, | |
| "grad_norm": 0.11924557580218739, | |
| "learning_rate": 1.2859913165307886e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 250110156.0, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 4.464358452138493, | |
| "grad_norm": 0.11442982117714874, | |
| "learning_rate": 1.277728638790303e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 250550111.0, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 4.472505091649695, | |
| "grad_norm": 0.12110395302590302, | |
| "learning_rate": 1.2695832757302412e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 251002357.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.480651731160896, | |
| "grad_norm": 0.11664662464057247, | |
| "learning_rate": 1.2615554536673377e-06, | |
| "loss": 0.0062, | |
| "num_tokens": 251458462.0, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 4.4887983706720975, | |
| "grad_norm": 0.11645513165539287, | |
| "learning_rate": 1.253645395652481e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 251902226.0, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 4.4969450101833, | |
| "grad_norm": 0.11363438791067745, | |
| "learning_rate": 1.2458533214645175e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 252346885.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 4.505091649694501, | |
| "grad_norm": 0.12965647026273558, | |
| "learning_rate": 1.2381794476041447e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 252804103.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 4.513238289205702, | |
| "grad_norm": 0.10419635456766704, | |
| "learning_rate": 1.2306239872878946e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 253273586.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.521384928716905, | |
| "grad_norm": 0.11350584533770305, | |
| "learning_rate": 1.2231871504422117e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 253725593.0, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 4.529531568228106, | |
| "grad_norm": 0.13468868599441702, | |
| "learning_rate": 1.215869143697619e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 254156458.0, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 4.537678207739307, | |
| "grad_norm": 0.10259852383741634, | |
| "learning_rate": 1.2086701703829755e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 254617846.0, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 4.545824847250509, | |
| "grad_norm": 0.11651453346375099, | |
| "learning_rate": 1.2015904305198286e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 255052922.0, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 4.553971486761711, | |
| "grad_norm": 0.10384453182105129, | |
| "learning_rate": 1.1946301208168593e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 255534554.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.562118126272912, | |
| "grad_norm": 0.12828945094057975, | |
| "learning_rate": 1.1877894346644085e-06, | |
| "loss": 0.007, | |
| "num_tokens": 255986625.0, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 4.570264765784114, | |
| "grad_norm": 0.10166841643303247, | |
| "learning_rate": 1.1810685621291135e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 256440817.0, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 4.5784114052953155, | |
| "grad_norm": 0.12163643122042941, | |
| "learning_rate": 1.174467689948618e-06, | |
| "loss": 0.007, | |
| "num_tokens": 256883913.0, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 4.586558044806518, | |
| "grad_norm": 0.11612572338384212, | |
| "learning_rate": 1.1679870015263908e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 257340848.0, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 4.594704684317719, | |
| "grad_norm": 0.09659828775248515, | |
| "learning_rate": 1.1616266769266263e-06, | |
| "loss": 0.0052, | |
| "num_tokens": 257795593.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 4.60285132382892, | |
| "grad_norm": 0.10140831312358678, | |
| "learning_rate": 1.1553868928692422e-06, | |
| "loss": 0.0048, | |
| "num_tokens": 258288534.0, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 4.610997963340123, | |
| "grad_norm": 0.11217052895153468, | |
| "learning_rate": 1.1492678227249695e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 258741097.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 4.619144602851324, | |
| "grad_norm": 0.1126933577651828, | |
| "learning_rate": 1.143269636510536e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 259193501.0, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 4.627291242362525, | |
| "grad_norm": 0.11797745986694334, | |
| "learning_rate": 1.1373925008839403e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 259649197.0, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 4.635437881873727, | |
| "grad_norm": 0.11303980738140469, | |
| "learning_rate": 1.1316365791398251e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 260088831.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.643584521384929, | |
| "grad_norm": 0.10873603489504344, | |
| "learning_rate": 1.1260020312049356e-06, | |
| "loss": 0.006, | |
| "num_tokens": 260555536.0, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 4.65173116089613, | |
| "grad_norm": 0.0920006832828397, | |
| "learning_rate": 1.1204890136336784e-06, | |
| "loss": 0.0052, | |
| "num_tokens": 261048454.0, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 4.659877800407332, | |
| "grad_norm": 0.1255806723199747, | |
| "learning_rate": 1.1150976796037736e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 261480295.0, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 4.6680244399185336, | |
| "grad_norm": 0.11533169004614044, | |
| "learning_rate": 1.1098281789119948e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 261942589.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 4.676171079429735, | |
| "grad_norm": 0.10129996781843084, | |
| "learning_rate": 1.104680657970009e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 262393944.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 4.684317718940937, | |
| "grad_norm": 0.11015833267592207, | |
| "learning_rate": 1.0996552598003088e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 262882312.0, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 4.692464358452138, | |
| "grad_norm": 0.10314595226042249, | |
| "learning_rate": 1.094752124032238e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 263336673.0, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 4.70061099796334, | |
| "grad_norm": 0.11664841890610124, | |
| "learning_rate": 1.0899713868981123e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 263792010.0, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 4.708757637474542, | |
| "grad_norm": 0.09972137290365708, | |
| "learning_rate": 1.0853131812294355e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 264237484.0, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 4.716904276985743, | |
| "grad_norm": 0.10268566206680875, | |
| "learning_rate": 1.0807776364532044e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 264713321.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.725050916496945, | |
| "grad_norm": 0.10619035337589804, | |
| "learning_rate": 1.0763648785883186e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 265183724.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 4.733197556008147, | |
| "grad_norm": 0.10541962557747203, | |
| "learning_rate": 1.0720750302420745e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 265627643.0, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 4.741344195519348, | |
| "grad_norm": 0.11506033498658928, | |
| "learning_rate": 1.0679082106067618e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 266084878.0, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 4.74949083503055, | |
| "grad_norm": 0.11142067796057883, | |
| "learning_rate": 1.0638645354563488e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 266578362.0, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 4.757637474541752, | |
| "grad_norm": 0.12323031771225379, | |
| "learning_rate": 1.0599441171432685e-06, | |
| "loss": 0.0071, | |
| "num_tokens": 267005793.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 4.765784114052953, | |
| "grad_norm": 0.10911498957082988, | |
| "learning_rate": 1.0561470645952939e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 267445983.0, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 4.773930753564155, | |
| "grad_norm": 0.10589151493278187, | |
| "learning_rate": 1.0524734833125155e-06, | |
| "loss": 0.006, | |
| "num_tokens": 267934787.0, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 4.782077393075356, | |
| "grad_norm": 0.0961251286065213, | |
| "learning_rate": 1.0489234753644075e-06, | |
| "loss": 0.0047, | |
| "num_tokens": 268404039.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 4.790224032586558, | |
| "grad_norm": 0.11570808115862555, | |
| "learning_rate": 1.0454971393869895e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 268871776.0, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 4.79837067209776, | |
| "grad_norm": 0.11996049644781787, | |
| "learning_rate": 1.0421945705800913e-06, | |
| "loss": 0.006, | |
| "num_tokens": 269329939.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.806517311608961, | |
| "grad_norm": 0.11015784556640101, | |
| "learning_rate": 1.0390158607047029e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 269796155.0, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 4.814663951120163, | |
| "grad_norm": 0.10516381427732067, | |
| "learning_rate": 1.0359610980804286e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 270260800.0, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 4.822810590631365, | |
| "grad_norm": 0.11057933848917369, | |
| "learning_rate": 1.0330303675830306e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 270718037.0, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 4.830957230142566, | |
| "grad_norm": 0.12034159438309625, | |
| "learning_rate": 1.0302237506420722e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 271163129.0, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 4.839103869653767, | |
| "grad_norm": 0.1298369000893159, | |
| "learning_rate": 1.0275413252386545e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 271586088.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 4.84725050916497, | |
| "grad_norm": 0.11485648605447368, | |
| "learning_rate": 1.0249831659032494e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 272031287.0, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 4.855397148676171, | |
| "grad_norm": 0.11585325382556429, | |
| "learning_rate": 1.0225493437136302e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 272474742.0, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 4.863543788187373, | |
| "grad_norm": 0.1239008691750004, | |
| "learning_rate": 1.020239926292895e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 272932607.0, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 4.871690427698574, | |
| "grad_norm": 0.10254555243859467, | |
| "learning_rate": 1.018054977807589e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 273415530.0, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 4.879837067209776, | |
| "grad_norm": 0.10546701888018833, | |
| "learning_rate": 1.0159945589659223e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 273885366.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.887983706720978, | |
| "grad_norm": 0.12031804835663963, | |
| "learning_rate": 1.0140587270160806e-06, | |
| "loss": 0.0066, | |
| "num_tokens": 274335421.0, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 4.896130346232179, | |
| "grad_norm": 0.11002140545903802, | |
| "learning_rate": 1.0122475357446372e-06, | |
| "loss": 0.0061, | |
| "num_tokens": 274789915.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 4.904276985743381, | |
| "grad_norm": 0.10524124599370216, | |
| "learning_rate": 1.0105610354750566e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 275269107.0, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 4.912423625254583, | |
| "grad_norm": 0.1279839524575316, | |
| "learning_rate": 1.0089992730662983e-06, | |
| "loss": 0.007, | |
| "num_tokens": 275714557.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 4.920570264765784, | |
| "grad_norm": 0.11521764229191792, | |
| "learning_rate": 1.0075622919115133e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 276134943.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 4.928716904276985, | |
| "grad_norm": 0.10797809839128278, | |
| "learning_rate": 1.0062501319368376e-06, | |
| "loss": 0.005, | |
| "num_tokens": 276628333.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 4.936863543788188, | |
| "grad_norm": 0.103832639157195, | |
| "learning_rate": 1.0050628296002864e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 277092549.0, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 4.945010183299389, | |
| "grad_norm": 0.10579829877653406, | |
| "learning_rate": 1.0040004178907364e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 277564414.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 4.953156822810591, | |
| "grad_norm": 0.1129191145686251, | |
| "learning_rate": 1.0030629263270133e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 278043267.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 4.961303462321792, | |
| "grad_norm": 0.11428351556872687, | |
| "learning_rate": 1.0022503809570692e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 278500208.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.969450101832994, | |
| "grad_norm": 0.12454757064462266, | |
| "learning_rate": 1.0015628043572607e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 278958350.0, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 4.977596741344195, | |
| "grad_norm": 0.11985284209865818, | |
| "learning_rate": 1.0010002156317187e-06, | |
| "loss": 0.0055, | |
| "num_tokens": 279426149.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 4.985743380855397, | |
| "grad_norm": 0.11653540817309618, | |
| "learning_rate": 1.0005626304118208e-06, | |
| "loss": 0.0062, | |
| "num_tokens": 279874989.0, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 4.993890020366599, | |
| "grad_norm": 0.12398585004184347, | |
| "learning_rate": 1.0002500608557558e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 280320581.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.12467063210289439, | |
| "learning_rate": 1.0000625156481842e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 280366492.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.04499583691358566, | |
| "eval_num_tokens": 280366492.0, | |
| "eval_runtime": 57.8338, | |
| "eval_samples_per_second": 42.173, | |
| "eval_steps_per_second": 5.274, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 615, | |
| "total_flos": 9.471448716243108e+17, | |
| "train_loss": 0.017112477973285245, | |
| "train_runtime": 9496.6509, | |
| "train_samples_per_second": 8.264, | |
| "train_steps_per_second": 0.065 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 615, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.471448716243108e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |