| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 25.0, |
| "eval_steps": 500, |
| "global_step": 39775, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.06285355122564425, |
| "grad_norm": 608.9674682617188, |
| "learning_rate": 4.844437460716531e-05, |
| "loss": 14.2524, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1257071024512885, |
| "grad_norm": 34.65327453613281, |
| "learning_rate": 4.6873035826524205e-05, |
| "loss": 10.3562, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.18856065367693275, |
| "grad_norm": 21.24808120727539, |
| "learning_rate": 4.5301697045883096e-05, |
| "loss": 7.8551, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.251414204902577, |
| "grad_norm": 17.404918670654297, |
| "learning_rate": 4.373035826524199e-05, |
| "loss": 6.6346, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3142677561282212, |
| "grad_norm": 12.713433265686035, |
| "learning_rate": 4.2159019484600884e-05, |
| "loss": 5.9755, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3771213073538655, |
| "grad_norm": 10.050477981567383, |
| "learning_rate": 4.0587680703959775e-05, |
| "loss": 5.5595, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.43997485857950974, |
| "grad_norm": 13.709216117858887, |
| "learning_rate": 3.9016341923318666e-05, |
| "loss": 5.2853, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.502828409805154, |
| "grad_norm": 9.112940788269043, |
| "learning_rate": 3.744500314267756e-05, |
| "loss": 5.1417, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5656819610307983, |
| "grad_norm": 8.267425537109375, |
| "learning_rate": 3.587366436203646e-05, |
| "loss": 4.9615, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6285355122564424, |
| "grad_norm": 9.709076881408691, |
| "learning_rate": 3.430232558139535e-05, |
| "loss": 4.6907, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6913890634820867, |
| "grad_norm": 845.80859375, |
| "learning_rate": 3.273098680075424e-05, |
| "loss": 4.5456, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.754242614707731, |
| "grad_norm": 5.943735599517822, |
| "learning_rate": 3.115964802011313e-05, |
| "loss": 4.4291, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8170961659333752, |
| "grad_norm": 5.8759989738464355, |
| "learning_rate": 2.9588309239472034e-05, |
| "loss": 4.3252, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8799497171590195, |
| "grad_norm": 14.995753288269043, |
| "learning_rate": 2.8016970458830928e-05, |
| "loss": 4.2586, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9428032683846638, |
| "grad_norm": 23.3351993560791, |
| "learning_rate": 2.644563167818982e-05, |
| "loss": 4.1372, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.215750217437744, |
| "eval_runtime": 19.7611, |
| "eval_samples_per_second": 48.479, |
| "eval_steps_per_second": 6.073, |
| "step": 1591 |
| }, |
| { |
| "epoch": 1.005656819610308, |
| "grad_norm": 8.584565162658691, |
| "learning_rate": 2.4874292897548713e-05, |
| "loss": 4.0272, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.0685103708359522, |
| "grad_norm": 6.45043420791626, |
| "learning_rate": 2.3302954116907607e-05, |
| "loss": 3.9602, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.1313639220615965, |
| "grad_norm": 6.03476095199585, |
| "learning_rate": 2.17316153362665e-05, |
| "loss": 3.9052, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.1942174732872408, |
| "grad_norm": 5.746309280395508, |
| "learning_rate": 2.0160276555625392e-05, |
| "loss": 3.9282, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.2570710245128849, |
| "grad_norm": 8.062549591064453, |
| "learning_rate": 1.858893777498429e-05, |
| "loss": 3.8096, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.3199245757385292, |
| "grad_norm": 8.58310317993164, |
| "learning_rate": 1.701759899434318e-05, |
| "loss": 3.803, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.3827781269641735, |
| "grad_norm": 7.599905490875244, |
| "learning_rate": 1.5446260213702074e-05, |
| "loss": 3.8381, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.4456316781898177, |
| "grad_norm": 22.772512435913086, |
| "learning_rate": 1.3874921433060969e-05, |
| "loss": 3.6456, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.508485229415462, |
| "grad_norm": 6.949570178985596, |
| "learning_rate": 1.2303582652419863e-05, |
| "loss": 3.7442, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.5713387806411063, |
| "grad_norm": 5.7536821365356445, |
| "learning_rate": 1.0732243871778757e-05, |
| "loss": 3.691, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.6341923318667506, |
| "grad_norm": 55.64060974121094, |
| "learning_rate": 9.160905091137651e-06, |
| "loss": 3.7461, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.6970458830923947, |
| "grad_norm": 6.573077201843262, |
| "learning_rate": 7.589566310496543e-06, |
| "loss": 3.6186, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.759899434318039, |
| "grad_norm": 8.615326881408691, |
| "learning_rate": 6.018227529855437e-06, |
| "loss": 3.6546, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.8227529855436833, |
| "grad_norm": 6.359428405761719, |
| "learning_rate": 4.446888749214331e-06, |
| "loss": 3.5724, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.8856065367693273, |
| "grad_norm": 5.5190582275390625, |
| "learning_rate": 2.8755499685732243e-06, |
| "loss": 3.6164, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.9484600879949716, |
| "grad_norm": 5.9382004737854, |
| "learning_rate": 1.3042111879321182e-06, |
| "loss": 3.52, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 2.803544521331787, |
| "eval_runtime": 19.8643, |
| "eval_samples_per_second": 48.227, |
| "eval_steps_per_second": 6.041, |
| "step": 3182 |
| }, |
| { |
| "epoch": 2.011313639220616, |
| "grad_norm": 10.074417114257812, |
| "learning_rate": 3.9946574481458206e-05, |
| "loss": 3.5087, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.07416719044626, |
| "grad_norm": 6.9990434646606445, |
| "learning_rate": 3.963230672532998e-05, |
| "loss": 3.5746, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.1370207416719045, |
| "grad_norm": 6.968172073364258, |
| "learning_rate": 3.931803896920176e-05, |
| "loss": 3.6324, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.1998742928975488, |
| "grad_norm": 179.99803161621094, |
| "learning_rate": 3.9003771213073545e-05, |
| "loss": 3.4072, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.262727844123193, |
| "grad_norm": 59.86805725097656, |
| "learning_rate": 3.868950345694532e-05, |
| "loss": 3.391, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.3255813953488373, |
| "grad_norm": 7.445355415344238, |
| "learning_rate": 3.83752357008171e-05, |
| "loss": 3.2032, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.3884349465744816, |
| "grad_norm": 5.553746700286865, |
| "learning_rate": 3.806096794468888e-05, |
| "loss": 3.3644, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.4512884978001255, |
| "grad_norm": 6.544325351715088, |
| "learning_rate": 3.7746700188560656e-05, |
| "loss": 3.1666, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.5141420490257698, |
| "grad_norm": 7.863962650299072, |
| "learning_rate": 3.7432432432432436e-05, |
| "loss": 3.1982, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.576995600251414, |
| "grad_norm": 10.573624610900879, |
| "learning_rate": 3.7118164676304215e-05, |
| "loss": 3.1336, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.6398491514770583, |
| "grad_norm": 8.506134986877441, |
| "learning_rate": 3.680389692017599e-05, |
| "loss": 3.0191, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.7027027027027026, |
| "grad_norm": 7.1274518966674805, |
| "learning_rate": 3.6489629164047774e-05, |
| "loss": 3.003, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.765556253928347, |
| "grad_norm": 5.121671199798584, |
| "learning_rate": 3.617536140791955e-05, |
| "loss": 3.085, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.828409805153991, |
| "grad_norm": 6.66685152053833, |
| "learning_rate": 3.5861093651791327e-05, |
| "loss": 3.0205, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.8912633563796355, |
| "grad_norm": 8.410430908203125, |
| "learning_rate": 3.5546825895663106e-05, |
| "loss": 2.9611, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.95411690760528, |
| "grad_norm": 6.266846179962158, |
| "learning_rate": 3.5232558139534886e-05, |
| "loss": 2.9299, |
| "step": 4700 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 2.3084471225738525, |
| "eval_runtime": 20.0337, |
| "eval_samples_per_second": 47.819, |
| "eval_steps_per_second": 5.99, |
| "step": 4773 |
| }, |
| { |
| "epoch": 3.016970458830924, |
| "grad_norm": 6.011202335357666, |
| "learning_rate": 3.4918290383406665e-05, |
| "loss": 2.886, |
| "step": 4800 |
| }, |
| { |
| "epoch": 3.0798240100565684, |
| "grad_norm": 7.204225063323975, |
| "learning_rate": 3.4604022627278445e-05, |
| "loss": 2.8579, |
| "step": 4900 |
| }, |
| { |
| "epoch": 3.1426775612822127, |
| "grad_norm": 10.316048622131348, |
| "learning_rate": 3.428975487115022e-05, |
| "loss": 2.8155, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.2055311125078565, |
| "grad_norm": 6.55385684967041, |
| "learning_rate": 3.3975487115022e-05, |
| "loss": 2.8938, |
| "step": 5100 |
| }, |
| { |
| "epoch": 3.268384663733501, |
| "grad_norm": 6.081694602966309, |
| "learning_rate": 3.366121935889378e-05, |
| "loss": 2.7344, |
| "step": 5200 |
| }, |
| { |
| "epoch": 3.331238214959145, |
| "grad_norm": 8.186753273010254, |
| "learning_rate": 3.3346951602765556e-05, |
| "loss": 2.7899, |
| "step": 5300 |
| }, |
| { |
| "epoch": 3.3940917661847894, |
| "grad_norm": 7.425989627838135, |
| "learning_rate": 3.3032683846637335e-05, |
| "loss": 2.7317, |
| "step": 5400 |
| }, |
| { |
| "epoch": 3.4569453174104336, |
| "grad_norm": 5.459439277648926, |
| "learning_rate": 3.2718416090509115e-05, |
| "loss": 2.6456, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.519798868636078, |
| "grad_norm": 5.077919006347656, |
| "learning_rate": 3.2404148334380894e-05, |
| "loss": 2.6816, |
| "step": 5600 |
| }, |
| { |
| "epoch": 3.5826524198617222, |
| "grad_norm": 5.81939172744751, |
| "learning_rate": 3.2089880578252674e-05, |
| "loss": 2.64, |
| "step": 5700 |
| }, |
| { |
| "epoch": 3.6455059710873665, |
| "grad_norm": 39.74727249145508, |
| "learning_rate": 3.177561282212445e-05, |
| "loss": 2.6725, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.708359522313011, |
| "grad_norm": 5.927642345428467, |
| "learning_rate": 3.1461345065996226e-05, |
| "loss": 2.5395, |
| "step": 5900 |
| }, |
| { |
| "epoch": 3.771213073538655, |
| "grad_norm": 5.984442710876465, |
| "learning_rate": 3.114707730986801e-05, |
| "loss": 2.6297, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.834066624764299, |
| "grad_norm": 5.258358478546143, |
| "learning_rate": 3.083280955373979e-05, |
| "loss": 2.6291, |
| "step": 6100 |
| }, |
| { |
| "epoch": 3.8969201759899432, |
| "grad_norm": 5.7379937171936035, |
| "learning_rate": 3.0518541797611565e-05, |
| "loss": 2.6116, |
| "step": 6200 |
| }, |
| { |
| "epoch": 3.9597737272155875, |
| "grad_norm": 5.038835048675537, |
| "learning_rate": 3.0204274041483344e-05, |
| "loss": 2.6695, |
| "step": 6300 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 2.0932769775390625, |
| "eval_runtime": 20.0417, |
| "eval_samples_per_second": 47.8, |
| "eval_steps_per_second": 5.988, |
| "step": 6364 |
| }, |
| { |
| "epoch": 4.022627278441232, |
| "grad_norm": 7.459395885467529, |
| "learning_rate": 2.9890006285355127e-05, |
| "loss": 2.6404, |
| "step": 6400 |
| }, |
| { |
| "epoch": 4.085480829666876, |
| "grad_norm": 6.721461296081543, |
| "learning_rate": 2.9575738529226903e-05, |
| "loss": 2.4614, |
| "step": 6500 |
| }, |
| { |
| "epoch": 4.14833438089252, |
| "grad_norm": 6.69769287109375, |
| "learning_rate": 2.9261470773098683e-05, |
| "loss": 2.457, |
| "step": 6600 |
| }, |
| { |
| "epoch": 4.211187932118165, |
| "grad_norm": 5.306356906890869, |
| "learning_rate": 2.894720301697046e-05, |
| "loss": 2.513, |
| "step": 6700 |
| }, |
| { |
| "epoch": 4.274041483343809, |
| "grad_norm": 5.425265312194824, |
| "learning_rate": 2.8632935260842235e-05, |
| "loss": 2.5467, |
| "step": 6800 |
| }, |
| { |
| "epoch": 4.336895034569453, |
| "grad_norm": 4.722207546234131, |
| "learning_rate": 2.8318667504714018e-05, |
| "loss": 2.3467, |
| "step": 6900 |
| }, |
| { |
| "epoch": 4.3997485857950975, |
| "grad_norm": 4.346086502075195, |
| "learning_rate": 2.8004399748585797e-05, |
| "loss": 2.5098, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.462602137020742, |
| "grad_norm": 7.4684319496154785, |
| "learning_rate": 2.7690131992457573e-05, |
| "loss": 2.4396, |
| "step": 7100 |
| }, |
| { |
| "epoch": 4.525455688246386, |
| "grad_norm": 5.709039688110352, |
| "learning_rate": 2.7375864236329353e-05, |
| "loss": 2.4688, |
| "step": 7200 |
| }, |
| { |
| "epoch": 4.58830923947203, |
| "grad_norm": 4.952858924865723, |
| "learning_rate": 2.7061596480201136e-05, |
| "loss": 2.3643, |
| "step": 7300 |
| }, |
| { |
| "epoch": 4.651162790697675, |
| "grad_norm": 6.68017578125, |
| "learning_rate": 2.6747328724072912e-05, |
| "loss": 2.4242, |
| "step": 7400 |
| }, |
| { |
| "epoch": 4.714016341923319, |
| "grad_norm": 3.584669828414917, |
| "learning_rate": 2.6433060967944688e-05, |
| "loss": 2.4552, |
| "step": 7500 |
| }, |
| { |
| "epoch": 4.776869893148963, |
| "grad_norm": 5.264488220214844, |
| "learning_rate": 2.6118793211816468e-05, |
| "loss": 2.4232, |
| "step": 7600 |
| }, |
| { |
| "epoch": 4.8397234443746076, |
| "grad_norm": 4.609414100646973, |
| "learning_rate": 2.580452545568825e-05, |
| "loss": 2.4418, |
| "step": 7700 |
| }, |
| { |
| "epoch": 4.902576995600251, |
| "grad_norm": 4.986881256103516, |
| "learning_rate": 2.5490257699560027e-05, |
| "loss": 2.4065, |
| "step": 7800 |
| }, |
| { |
| "epoch": 4.965430546825896, |
| "grad_norm": 4.9718098640441895, |
| "learning_rate": 2.5175989943431806e-05, |
| "loss": 2.4589, |
| "step": 7900 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.984979271888733, |
| "eval_runtime": 20.0353, |
| "eval_samples_per_second": 47.816, |
| "eval_steps_per_second": 5.989, |
| "step": 7955 |
| }, |
| { |
| "epoch": 5.0282840980515395, |
| "grad_norm": 5.2526750564575195, |
| "learning_rate": 2.4861722187303586e-05, |
| "loss": 2.2708, |
| "step": 8000 |
| }, |
| { |
| "epoch": 5.091137649277184, |
| "grad_norm": 5.312747001647949, |
| "learning_rate": 2.454745443117536e-05, |
| "loss": 2.3068, |
| "step": 8100 |
| }, |
| { |
| "epoch": 5.153991200502828, |
| "grad_norm": 7.204046726226807, |
| "learning_rate": 2.423318667504714e-05, |
| "loss": 2.3729, |
| "step": 8200 |
| }, |
| { |
| "epoch": 5.216844751728472, |
| "grad_norm": 4.8044753074646, |
| "learning_rate": 2.391891891891892e-05, |
| "loss": 2.3501, |
| "step": 8300 |
| }, |
| { |
| "epoch": 5.279698302954117, |
| "grad_norm": 6.9473185539245605, |
| "learning_rate": 2.3604651162790697e-05, |
| "loss": 2.3398, |
| "step": 8400 |
| }, |
| { |
| "epoch": 5.342551854179761, |
| "grad_norm": 4.014726161956787, |
| "learning_rate": 2.3290383406662476e-05, |
| "loss": 2.2938, |
| "step": 8500 |
| }, |
| { |
| "epoch": 5.405405405405405, |
| "grad_norm": 6.722488880157471, |
| "learning_rate": 2.2976115650534256e-05, |
| "loss": 2.2354, |
| "step": 8600 |
| }, |
| { |
| "epoch": 5.4682589566310495, |
| "grad_norm": 5.856524467468262, |
| "learning_rate": 2.2661847894406035e-05, |
| "loss": 2.2757, |
| "step": 8700 |
| }, |
| { |
| "epoch": 5.531112507856694, |
| "grad_norm": 4.9930644035339355, |
| "learning_rate": 2.234758013827781e-05, |
| "loss": 2.2586, |
| "step": 8800 |
| }, |
| { |
| "epoch": 5.593966059082338, |
| "grad_norm": 5.49005126953125, |
| "learning_rate": 2.2033312382149594e-05, |
| "loss": 2.3155, |
| "step": 8900 |
| }, |
| { |
| "epoch": 5.656819610307982, |
| "grad_norm": 8.850517272949219, |
| "learning_rate": 2.171904462602137e-05, |
| "loss": 2.2841, |
| "step": 9000 |
| }, |
| { |
| "epoch": 5.719673161533627, |
| "grad_norm": 5.094405651092529, |
| "learning_rate": 2.140477686989315e-05, |
| "loss": 2.3147, |
| "step": 9100 |
| }, |
| { |
| "epoch": 5.782526712759271, |
| "grad_norm": 4.709909439086914, |
| "learning_rate": 2.109050911376493e-05, |
| "loss": 2.1584, |
| "step": 9200 |
| }, |
| { |
| "epoch": 5.845380263984915, |
| "grad_norm": 4.1693525314331055, |
| "learning_rate": 2.077624135763671e-05, |
| "loss": 2.2396, |
| "step": 9300 |
| }, |
| { |
| "epoch": 5.90823381521056, |
| "grad_norm": 6.800940036773682, |
| "learning_rate": 2.0461973601508485e-05, |
| "loss": 2.301, |
| "step": 9400 |
| }, |
| { |
| "epoch": 5.971087366436204, |
| "grad_norm": 7.419278144836426, |
| "learning_rate": 2.0147705845380265e-05, |
| "loss": 2.3142, |
| "step": 9500 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 1.905881643295288, |
| "eval_runtime": 20.0332, |
| "eval_samples_per_second": 47.821, |
| "eval_steps_per_second": 5.99, |
| "step": 9546 |
| }, |
| { |
| "epoch": 6.033940917661848, |
| "grad_norm": 4.217894077301025, |
| "learning_rate": 1.9833438089252044e-05, |
| "loss": 2.1013, |
| "step": 9600 |
| }, |
| { |
| "epoch": 6.096794468887492, |
| "grad_norm": 5.345584869384766, |
| "learning_rate": 1.9519170333123824e-05, |
| "loss": 2.2714, |
| "step": 9700 |
| }, |
| { |
| "epoch": 6.159648020113137, |
| "grad_norm": 5.364700794219971, |
| "learning_rate": 1.92049025769956e-05, |
| "loss": 2.2381, |
| "step": 9800 |
| }, |
| { |
| "epoch": 6.222501571338781, |
| "grad_norm": 4.380568504333496, |
| "learning_rate": 1.8890634820867383e-05, |
| "loss": 2.1527, |
| "step": 9900 |
| }, |
| { |
| "epoch": 6.285355122564425, |
| "grad_norm": 6.300790309906006, |
| "learning_rate": 1.857636706473916e-05, |
| "loss": 2.1771, |
| "step": 10000 |
| }, |
| { |
| "epoch": 6.348208673790069, |
| "grad_norm": 5.757110118865967, |
| "learning_rate": 1.8262099308610938e-05, |
| "loss": 2.1695, |
| "step": 10100 |
| }, |
| { |
| "epoch": 6.411062225015713, |
| "grad_norm": 4.908361434936523, |
| "learning_rate": 1.7947831552482718e-05, |
| "loss": 2.1056, |
| "step": 10200 |
| }, |
| { |
| "epoch": 6.473915776241357, |
| "grad_norm": 5.048102378845215, |
| "learning_rate": 1.7633563796354494e-05, |
| "loss": 2.2112, |
| "step": 10300 |
| }, |
| { |
| "epoch": 6.536769327467002, |
| "grad_norm": 8.040143013000488, |
| "learning_rate": 1.7319296040226273e-05, |
| "loss": 2.0298, |
| "step": 10400 |
| }, |
| { |
| "epoch": 6.599622878692646, |
| "grad_norm": 5.15581750869751, |
| "learning_rate": 1.7005028284098053e-05, |
| "loss": 2.1224, |
| "step": 10500 |
| }, |
| { |
| "epoch": 6.66247642991829, |
| "grad_norm": 4.935842514038086, |
| "learning_rate": 1.6690760527969832e-05, |
| "loss": 2.0772, |
| "step": 10600 |
| }, |
| { |
| "epoch": 6.725329981143934, |
| "grad_norm": 5.487718105316162, |
| "learning_rate": 1.637649277184161e-05, |
| "loss": 2.2552, |
| "step": 10700 |
| }, |
| { |
| "epoch": 6.788183532369579, |
| "grad_norm": 5.713748455047607, |
| "learning_rate": 1.6062225015713388e-05, |
| "loss": 2.1358, |
| "step": 10800 |
| }, |
| { |
| "epoch": 6.851037083595223, |
| "grad_norm": 4.882757186889648, |
| "learning_rate": 1.5747957259585168e-05, |
| "loss": 2.1613, |
| "step": 10900 |
| }, |
| { |
| "epoch": 6.913890634820867, |
| "grad_norm": 5.634950637817383, |
| "learning_rate": 1.5433689503456947e-05, |
| "loss": 2.2567, |
| "step": 11000 |
| }, |
| { |
| "epoch": 6.976744186046512, |
| "grad_norm": 5.634829044342041, |
| "learning_rate": 1.5119421747328725e-05, |
| "loss": 2.1283, |
| "step": 11100 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 1.84635591506958, |
| "eval_runtime": 20.0367, |
| "eval_samples_per_second": 47.812, |
| "eval_steps_per_second": 5.989, |
| "step": 11137 |
| }, |
| { |
| "epoch": 7.039597737272156, |
| "grad_norm": 5.635861873626709, |
| "learning_rate": 1.4805153991200504e-05, |
| "loss": 2.0938, |
| "step": 11200 |
| }, |
| { |
| "epoch": 7.1024512884978, |
| "grad_norm": 5.214977741241455, |
| "learning_rate": 1.4490886235072282e-05, |
| "loss": 2.062, |
| "step": 11300 |
| }, |
| { |
| "epoch": 7.1653048397234445, |
| "grad_norm": 7.498839855194092, |
| "learning_rate": 1.4176618478944062e-05, |
| "loss": 2.1292, |
| "step": 11400 |
| }, |
| { |
| "epoch": 7.228158390949089, |
| "grad_norm": 5.83459997177124, |
| "learning_rate": 1.386235072281584e-05, |
| "loss": 2.0796, |
| "step": 11500 |
| }, |
| { |
| "epoch": 7.291011942174733, |
| "grad_norm": 3.8935282230377197, |
| "learning_rate": 1.3548082966687619e-05, |
| "loss": 2.1414, |
| "step": 11600 |
| }, |
| { |
| "epoch": 7.353865493400377, |
| "grad_norm": 5.774020671844482, |
| "learning_rate": 1.3233815210559397e-05, |
| "loss": 2.145, |
| "step": 11700 |
| }, |
| { |
| "epoch": 7.416719044626022, |
| "grad_norm": 128.24192810058594, |
| "learning_rate": 1.2919547454431178e-05, |
| "loss": 2.0242, |
| "step": 11800 |
| }, |
| { |
| "epoch": 7.479572595851666, |
| "grad_norm": 4.4846367835998535, |
| "learning_rate": 1.2605279698302954e-05, |
| "loss": 2.0936, |
| "step": 11900 |
| }, |
| { |
| "epoch": 7.54242614707731, |
| "grad_norm": 5.091222763061523, |
| "learning_rate": 1.2291011942174734e-05, |
| "loss": 2.1988, |
| "step": 12000 |
| }, |
| { |
| "epoch": 7.6052796983029545, |
| "grad_norm": 3.3482093811035156, |
| "learning_rate": 1.1976744186046513e-05, |
| "loss": 2.1323, |
| "step": 12100 |
| }, |
| { |
| "epoch": 7.668133249528598, |
| "grad_norm": 5.329409599304199, |
| "learning_rate": 1.1662476429918291e-05, |
| "loss": 2.0587, |
| "step": 12200 |
| }, |
| { |
| "epoch": 7.730986800754243, |
| "grad_norm": 7.584386348724365, |
| "learning_rate": 1.134820867379007e-05, |
| "loss": 2.1341, |
| "step": 12300 |
| }, |
| { |
| "epoch": 7.7938403519798864, |
| "grad_norm": 5.996345520019531, |
| "learning_rate": 1.1033940917661848e-05, |
| "loss": 2.1108, |
| "step": 12400 |
| }, |
| { |
| "epoch": 7.856693903205531, |
| "grad_norm": 6.1731648445129395, |
| "learning_rate": 1.0719673161533628e-05, |
| "loss": 2.1218, |
| "step": 12500 |
| }, |
| { |
| "epoch": 7.919547454431175, |
| "grad_norm": 5.414481163024902, |
| "learning_rate": 1.0405405405405407e-05, |
| "loss": 2.028, |
| "step": 12600 |
| }, |
| { |
| "epoch": 7.982401005656819, |
| "grad_norm": 7.198294639587402, |
| "learning_rate": 1.0091137649277185e-05, |
| "loss": 2.0489, |
| "step": 12700 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 1.8111430406570435, |
| "eval_runtime": 20.0666, |
| "eval_samples_per_second": 47.741, |
| "eval_steps_per_second": 5.98, |
| "step": 12728 |
| }, |
| { |
| "epoch": 8.045254556882464, |
| "grad_norm": 6.677022933959961, |
| "learning_rate": 9.776869893148963e-06, |
| "loss": 2.0814, |
| "step": 12800 |
| }, |
| { |
| "epoch": 8.108108108108109, |
| "grad_norm": 5.1916728019714355, |
| "learning_rate": 9.46260213702074e-06, |
| "loss": 2.119, |
| "step": 12900 |
| }, |
| { |
| "epoch": 8.170961659333752, |
| "grad_norm": 6.04162073135376, |
| "learning_rate": 9.14833438089252e-06, |
| "loss": 2.0058, |
| "step": 13000 |
| }, |
| { |
| "epoch": 8.233815210559397, |
| "grad_norm": 4.764267444610596, |
| "learning_rate": 8.8340666247643e-06, |
| "loss": 2.0113, |
| "step": 13100 |
| }, |
| { |
| "epoch": 8.29666876178504, |
| "grad_norm": 5.77971887588501, |
| "learning_rate": 8.519798868636078e-06, |
| "loss": 2.0392, |
| "step": 13200 |
| }, |
| { |
| "epoch": 8.359522313010686, |
| "grad_norm": 5.698218822479248, |
| "learning_rate": 8.205531112507857e-06, |
| "loss": 2.107, |
| "step": 13300 |
| }, |
| { |
| "epoch": 8.42237586423633, |
| "grad_norm": 5.236012935638428, |
| "learning_rate": 7.891263356379635e-06, |
| "loss": 2.0829, |
| "step": 13400 |
| }, |
| { |
| "epoch": 8.485229415461973, |
| "grad_norm": 4.379955291748047, |
| "learning_rate": 7.576995600251414e-06, |
| "loss": 1.9321, |
| "step": 13500 |
| }, |
| { |
| "epoch": 8.548082966687618, |
| "grad_norm": 6.034859657287598, |
| "learning_rate": 7.262727844123193e-06, |
| "loss": 2.1013, |
| "step": 13600 |
| }, |
| { |
| "epoch": 8.610936517913261, |
| "grad_norm": 5.320705413818359, |
| "learning_rate": 6.948460087994972e-06, |
| "loss": 2.0543, |
| "step": 13700 |
| }, |
| { |
| "epoch": 8.673790069138906, |
| "grad_norm": 5.735895156860352, |
| "learning_rate": 6.634192331866751e-06, |
| "loss": 2.0594, |
| "step": 13800 |
| }, |
| { |
| "epoch": 8.73664362036455, |
| "grad_norm": 4.845800876617432, |
| "learning_rate": 6.31992457573853e-06, |
| "loss": 1.9402, |
| "step": 13900 |
| }, |
| { |
| "epoch": 8.799497171590195, |
| "grad_norm": 4.628382682800293, |
| "learning_rate": 6.0056568196103085e-06, |
| "loss": 1.9937, |
| "step": 14000 |
| }, |
| { |
| "epoch": 8.862350722815838, |
| "grad_norm": 4.747410774230957, |
| "learning_rate": 5.691389063482086e-06, |
| "loss": 2.0654, |
| "step": 14100 |
| }, |
| { |
| "epoch": 8.925204274041484, |
| "grad_norm": 4.694166660308838, |
| "learning_rate": 5.377121307353866e-06, |
| "loss": 2.0523, |
| "step": 14200 |
| }, |
| { |
| "epoch": 8.988057825267127, |
| "grad_norm": 6.711084365844727, |
| "learning_rate": 5.0628535512256445e-06, |
| "loss": 1.9856, |
| "step": 14300 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 1.7920939922332764, |
| "eval_runtime": 20.0378, |
| "eval_samples_per_second": 47.81, |
| "eval_steps_per_second": 5.989, |
| "step": 14319 |
| }, |
| { |
| "epoch": 9.050911376492772, |
| "grad_norm": 6.053162097930908, |
| "learning_rate": 4.748585795097423e-06, |
| "loss": 2.0392, |
| "step": 14400 |
| }, |
| { |
| "epoch": 9.113764927718416, |
| "grad_norm": 4.806529521942139, |
| "learning_rate": 4.434318038969202e-06, |
| "loss": 2.0308, |
| "step": 14500 |
| }, |
| { |
| "epoch": 9.17661847894406, |
| "grad_norm": 4.725819110870361, |
| "learning_rate": 4.1200502828409805e-06, |
| "loss": 2.0441, |
| "step": 14600 |
| }, |
| { |
| "epoch": 9.239472030169704, |
| "grad_norm": 4.637420177459717, |
| "learning_rate": 3.8057825267127596e-06, |
| "loss": 2.0061, |
| "step": 14700 |
| }, |
| { |
| "epoch": 9.30232558139535, |
| "grad_norm": 6.441665172576904, |
| "learning_rate": 3.4915147705845382e-06, |
| "loss": 2.1299, |
| "step": 14800 |
| }, |
| { |
| "epoch": 9.365179132620993, |
| "grad_norm": 3.506943941116333, |
| "learning_rate": 3.1772470144563173e-06, |
| "loss": 1.9443, |
| "step": 14900 |
| }, |
| { |
| "epoch": 9.428032683846638, |
| "grad_norm": 8.454822540283203, |
| "learning_rate": 2.8629792583280956e-06, |
| "loss": 2.0327, |
| "step": 15000 |
| }, |
| { |
| "epoch": 9.490886235072281, |
| "grad_norm": 5.021187782287598, |
| "learning_rate": 2.5487115021998746e-06, |
| "loss": 1.9839, |
| "step": 15100 |
| }, |
| { |
| "epoch": 9.553739786297927, |
| "grad_norm": 6.3962016105651855, |
| "learning_rate": 2.234443746071653e-06, |
| "loss": 2.0604, |
| "step": 15200 |
| }, |
| { |
| "epoch": 9.61659333752357, |
| "grad_norm": 5.531436443328857, |
| "learning_rate": 1.920175989943432e-06, |
| "loss": 2.0168, |
| "step": 15300 |
| }, |
| { |
| "epoch": 9.679446888749215, |
| "grad_norm": 4.300695896148682, |
| "learning_rate": 1.6059082338152106e-06, |
| "loss": 1.9994, |
| "step": 15400 |
| }, |
| { |
| "epoch": 9.742300439974859, |
| "grad_norm": 3.102018356323242, |
| "learning_rate": 1.2916404776869893e-06, |
| "loss": 2.0441, |
| "step": 15500 |
| }, |
| { |
| "epoch": 9.805153991200502, |
| "grad_norm": 4.91919469833374, |
| "learning_rate": 9.773727215587681e-07, |
| "loss": 1.9584, |
| "step": 15600 |
| }, |
| { |
| "epoch": 9.868007542426147, |
| "grad_norm": 4.21737813949585, |
| "learning_rate": 6.631049654305469e-07, |
| "loss": 2.0019, |
| "step": 15700 |
| }, |
| { |
| "epoch": 9.930861093651792, |
| "grad_norm": 4.098769187927246, |
| "learning_rate": 3.4883720930232557e-07, |
| "loss": 2.0121, |
| "step": 15800 |
| }, |
| { |
| "epoch": 9.993714644877436, |
| "grad_norm": 4.722096920013428, |
| "learning_rate": 3.456945317410434e-08, |
| "loss": 2.0196, |
| "step": 15900 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 1.787421464920044, |
| "eval_runtime": 20.0243, |
| "eval_samples_per_second": 47.842, |
| "eval_steps_per_second": 5.993, |
| "step": 15910 |
| }, |
| { |
| "epoch": 10.056568196103079, |
| "grad_norm": 3.8331987857818604, |
| "learning_rate": 2.4860150848522942e-05, |
| "loss": 2.0388, |
| "step": 16000 |
| }, |
| { |
| "epoch": 10.119421747328724, |
| "grad_norm": 3.9292027950286865, |
| "learning_rate": 2.4703016970458832e-05, |
| "loss": 2.0913, |
| "step": 16100 |
| }, |
| { |
| "epoch": 10.182275298554368, |
| "grad_norm": 5.124855995178223, |
| "learning_rate": 2.454588309239472e-05, |
| "loss": 2.0452, |
| "step": 16200 |
| }, |
| { |
| "epoch": 10.245128849780013, |
| "grad_norm": 5.743933200836182, |
| "learning_rate": 2.438874921433061e-05, |
| "loss": 2.016, |
| "step": 16300 |
| }, |
| { |
| "epoch": 10.307982401005656, |
| "grad_norm": 6.4510931968688965, |
| "learning_rate": 2.42316153362665e-05, |
| "loss": 1.9785, |
| "step": 16400 |
| }, |
| { |
| "epoch": 10.370835952231301, |
| "grad_norm": 6.550465106964111, |
| "learning_rate": 2.4074481458202387e-05, |
| "loss": 1.9912, |
| "step": 16500 |
| }, |
| { |
| "epoch": 10.433689503456945, |
| "grad_norm": 5.37285852432251, |
| "learning_rate": 2.391734758013828e-05, |
| "loss": 2.0549, |
| "step": 16600 |
| }, |
| { |
| "epoch": 10.49654305468259, |
| "grad_norm": 5.4893412590026855, |
| "learning_rate": 2.376021370207417e-05, |
| "loss": 1.9434, |
| "step": 16700 |
| }, |
| { |
| "epoch": 10.559396605908233, |
| "grad_norm": 4.316259384155273, |
| "learning_rate": 2.3603079824010057e-05, |
| "loss": 1.8413, |
| "step": 16800 |
| }, |
| { |
| "epoch": 10.622250157133879, |
| "grad_norm": 3.4342756271362305, |
| "learning_rate": 2.3445945945945946e-05, |
| "loss": 1.9312, |
| "step": 16900 |
| }, |
| { |
| "epoch": 10.685103708359522, |
| "grad_norm": 5.680815696716309, |
| "learning_rate": 2.3288812067881836e-05, |
| "loss": 1.9678, |
| "step": 17000 |
| }, |
| { |
| "epoch": 10.747957259585167, |
| "grad_norm": 6.04569149017334, |
| "learning_rate": 2.3131678189817726e-05, |
| "loss": 2.0329, |
| "step": 17100 |
| }, |
| { |
| "epoch": 10.81081081081081, |
| "grad_norm": 9.336991310119629, |
| "learning_rate": 2.2974544311753616e-05, |
| "loss": 1.9575, |
| "step": 17200 |
| }, |
| { |
| "epoch": 10.873664362036456, |
| "grad_norm": 3.826447010040283, |
| "learning_rate": 2.2817410433689505e-05, |
| "loss": 1.9692, |
| "step": 17300 |
| }, |
| { |
| "epoch": 10.936517913262099, |
| "grad_norm": 4.134801387786865, |
| "learning_rate": 2.2660276555625392e-05, |
| "loss": 2.0406, |
| "step": 17400 |
| }, |
| { |
| "epoch": 10.999371464487744, |
| "grad_norm": 5.291431903839111, |
| "learning_rate": 2.2503142677561285e-05, |
| "loss": 1.9631, |
| "step": 17500 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 1.7517410516738892, |
| "eval_runtime": 21.6572, |
| "eval_samples_per_second": 44.235, |
| "eval_steps_per_second": 5.541, |
| "step": 17501 |
| }, |
| { |
| "epoch": 11.062225015713388, |
| "grad_norm": 4.9575066566467285, |
| "learning_rate": 2.234600879949717e-05, |
| "loss": 1.9381, |
| "step": 17600 |
| }, |
| { |
| "epoch": 11.125078566939033, |
| "grad_norm": 12.871175765991211, |
| "learning_rate": 2.218887492143306e-05, |
| "loss": 1.8867, |
| "step": 17700 |
| }, |
| { |
| "epoch": 11.187932118164676, |
| "grad_norm": 4.3662519454956055, |
| "learning_rate": 2.203174104336895e-05, |
| "loss": 1.9713, |
| "step": 17800 |
| }, |
| { |
| "epoch": 11.250785669390321, |
| "grad_norm": 5.662289619445801, |
| "learning_rate": 2.187460716530484e-05, |
| "loss": 1.9188, |
| "step": 17900 |
| }, |
| { |
| "epoch": 11.313639220615965, |
| "grad_norm": 7.633818626403809, |
| "learning_rate": 2.171747328724073e-05, |
| "loss": 1.9142, |
| "step": 18000 |
| }, |
| { |
| "epoch": 11.376492771841608, |
| "grad_norm": 4.940028667449951, |
| "learning_rate": 2.156033940917662e-05, |
| "loss": 1.8697, |
| "step": 18100 |
| }, |
| { |
| "epoch": 11.439346323067253, |
| "grad_norm": 5.070211410522461, |
| "learning_rate": 2.1403205531112506e-05, |
| "loss": 1.9624, |
| "step": 18200 |
| }, |
| { |
| "epoch": 11.502199874292897, |
| "grad_norm": 7.409548282623291, |
| "learning_rate": 2.12460716530484e-05, |
| "loss": 1.9283, |
| "step": 18300 |
| }, |
| { |
| "epoch": 11.565053425518542, |
| "grad_norm": 6.541192531585693, |
| "learning_rate": 2.108893777498429e-05, |
| "loss": 1.9357, |
| "step": 18400 |
| }, |
| { |
| "epoch": 11.627906976744185, |
| "grad_norm": 5.941864967346191, |
| "learning_rate": 2.0931803896920176e-05, |
| "loss": 1.869, |
| "step": 18500 |
| }, |
| { |
| "epoch": 11.69076052796983, |
| "grad_norm": 9.418646812438965, |
| "learning_rate": 2.0774670018856065e-05, |
| "loss": 1.8518, |
| "step": 18600 |
| }, |
| { |
| "epoch": 11.753614079195474, |
| "grad_norm": 5.367152690887451, |
| "learning_rate": 2.061753614079196e-05, |
| "loss": 1.8945, |
| "step": 18700 |
| }, |
| { |
| "epoch": 11.81646763042112, |
| "grad_norm": 5.896432399749756, |
| "learning_rate": 2.0460402262727845e-05, |
| "loss": 1.8569, |
| "step": 18800 |
| }, |
| { |
| "epoch": 11.879321181646763, |
| "grad_norm": 6.137564182281494, |
| "learning_rate": 2.0303268384663735e-05, |
| "loss": 1.9179, |
| "step": 18900 |
| }, |
| { |
| "epoch": 11.942174732872408, |
| "grad_norm": 4.5933918952941895, |
| "learning_rate": 2.0146134506599625e-05, |
| "loss": 1.8941, |
| "step": 19000 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 1.7062737941741943, |
| "eval_runtime": 21.7167, |
| "eval_samples_per_second": 44.114, |
| "eval_steps_per_second": 5.526, |
| "step": 19092 |
| }, |
| { |
| "epoch": 12.005028284098051, |
| "grad_norm": 5.298050880432129, |
| "learning_rate": 1.998900062853551e-05, |
| "loss": 1.8681, |
| "step": 19100 |
| }, |
| { |
| "epoch": 12.067881835323696, |
| "grad_norm": 7.001854419708252, |
| "learning_rate": 1.9831866750471404e-05, |
| "loss": 1.8377, |
| "step": 19200 |
| }, |
| { |
| "epoch": 12.13073538654934, |
| "grad_norm": 4.692386150360107, |
| "learning_rate": 1.9674732872407294e-05, |
| "loss": 1.8279, |
| "step": 19300 |
| }, |
| { |
| "epoch": 12.193588937774985, |
| "grad_norm": 6.864208221435547, |
| "learning_rate": 1.951759899434318e-05, |
| "loss": 1.8855, |
| "step": 19400 |
| }, |
| { |
| "epoch": 12.256442489000628, |
| "grad_norm": 3.883880853652954, |
| "learning_rate": 1.936046511627907e-05, |
| "loss": 1.84, |
| "step": 19500 |
| }, |
| { |
| "epoch": 12.319296040226273, |
| "grad_norm": 5.302524566650391, |
| "learning_rate": 1.920333123821496e-05, |
| "loss": 1.8791, |
| "step": 19600 |
| }, |
| { |
| "epoch": 12.382149591451917, |
| "grad_norm": 6.854051113128662, |
| "learning_rate": 1.904619736015085e-05, |
| "loss": 1.9189, |
| "step": 19700 |
| }, |
| { |
| "epoch": 12.445003142677562, |
| "grad_norm": 4.728283405303955, |
| "learning_rate": 1.888906348208674e-05, |
| "loss": 1.8903, |
| "step": 19800 |
| }, |
| { |
| "epoch": 12.507856693903205, |
| "grad_norm": 4.314347267150879, |
| "learning_rate": 1.8731929604022626e-05, |
| "loss": 1.8615, |
| "step": 19900 |
| }, |
| { |
| "epoch": 12.57071024512885, |
| "grad_norm": 3.873619318008423, |
| "learning_rate": 1.857479572595852e-05, |
| "loss": 1.8232, |
| "step": 20000 |
| }, |
| { |
| "epoch": 12.633563796354494, |
| "grad_norm": 6.445096969604492, |
| "learning_rate": 1.841766184789441e-05, |
| "loss": 1.7764, |
| "step": 20100 |
| }, |
| { |
| "epoch": 12.696417347580137, |
| "grad_norm": 4.258322715759277, |
| "learning_rate": 1.8260527969830295e-05, |
| "loss": 1.869, |
| "step": 20200 |
| }, |
| { |
| "epoch": 12.759270898805783, |
| "grad_norm": 7.782538414001465, |
| "learning_rate": 1.8103394091766185e-05, |
| "loss": 1.7986, |
| "step": 20300 |
| }, |
| { |
| "epoch": 12.822124450031426, |
| "grad_norm": 7.189488887786865, |
| "learning_rate": 1.7946260213702078e-05, |
| "loss": 1.8448, |
| "step": 20400 |
| }, |
| { |
| "epoch": 12.884978001257071, |
| "grad_norm": 5.59601354598999, |
| "learning_rate": 1.7789126335637964e-05, |
| "loss": 1.7924, |
| "step": 20500 |
| }, |
| { |
| "epoch": 12.947831552482715, |
| "grad_norm": 4.675200939178467, |
| "learning_rate": 1.7631992457573854e-05, |
| "loss": 1.8212, |
| "step": 20600 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 1.6696668863296509, |
| "eval_runtime": 21.645, |
| "eval_samples_per_second": 44.26, |
| "eval_steps_per_second": 5.544, |
| "step": 20683 |
| }, |
| { |
| "epoch": 13.01068510370836, |
| "grad_norm": 3.3650217056274414, |
| "learning_rate": 1.7474858579509744e-05, |
| "loss": 1.6872, |
| "step": 20700 |
| }, |
| { |
| "epoch": 13.073538654934003, |
| "grad_norm": 6.4758219718933105, |
| "learning_rate": 1.731772470144563e-05, |
| "loss": 1.8029, |
| "step": 20800 |
| }, |
| { |
| "epoch": 13.136392206159648, |
| "grad_norm": 4.500367641448975, |
| "learning_rate": 1.7160590823381523e-05, |
| "loss": 1.8655, |
| "step": 20900 |
| }, |
| { |
| "epoch": 13.199245757385292, |
| "grad_norm": 5.369949817657471, |
| "learning_rate": 1.7003456945317413e-05, |
| "loss": 1.821, |
| "step": 21000 |
| }, |
| { |
| "epoch": 13.262099308610937, |
| "grad_norm": 4.84245491027832, |
| "learning_rate": 1.68463230672533e-05, |
| "loss": 1.7454, |
| "step": 21100 |
| }, |
| { |
| "epoch": 13.32495285983658, |
| "grad_norm": 4.510051727294922, |
| "learning_rate": 1.668918918918919e-05, |
| "loss": 1.8378, |
| "step": 21200 |
| }, |
| { |
| "epoch": 13.387806411062225, |
| "grad_norm": 5.163560390472412, |
| "learning_rate": 1.653205531112508e-05, |
| "loss": 1.7985, |
| "step": 21300 |
| }, |
| { |
| "epoch": 13.450659962287869, |
| "grad_norm": 4.454617023468018, |
| "learning_rate": 1.637492143306097e-05, |
| "loss": 1.8177, |
| "step": 21400 |
| }, |
| { |
| "epoch": 13.513513513513514, |
| "grad_norm": 3.672908067703247, |
| "learning_rate": 1.6217787554996858e-05, |
| "loss": 1.6908, |
| "step": 21500 |
| }, |
| { |
| "epoch": 13.576367064739157, |
| "grad_norm": 4.549923419952393, |
| "learning_rate": 1.6060653676932748e-05, |
| "loss": 1.7603, |
| "step": 21600 |
| }, |
| { |
| "epoch": 13.639220615964803, |
| "grad_norm": 5.733989715576172, |
| "learning_rate": 1.5903519798868638e-05, |
| "loss": 1.7689, |
| "step": 21700 |
| }, |
| { |
| "epoch": 13.702074167190446, |
| "grad_norm": 4.507519245147705, |
| "learning_rate": 1.5746385920804527e-05, |
| "loss": 1.7984, |
| "step": 21800 |
| }, |
| { |
| "epoch": 13.764927718416091, |
| "grad_norm": 4.713226795196533, |
| "learning_rate": 1.5589252042740414e-05, |
| "loss": 1.8011, |
| "step": 21900 |
| }, |
| { |
| "epoch": 13.827781269641735, |
| "grad_norm": 4.300686359405518, |
| "learning_rate": 1.5432118164676304e-05, |
| "loss": 1.7743, |
| "step": 22000 |
| }, |
| { |
| "epoch": 13.89063482086738, |
| "grad_norm": 4.702789306640625, |
| "learning_rate": 1.5274984286612197e-05, |
| "loss": 1.6903, |
| "step": 22100 |
| }, |
| { |
| "epoch": 13.953488372093023, |
| "grad_norm": 6.481640815734863, |
| "learning_rate": 1.5117850408548085e-05, |
| "loss": 1.822, |
| "step": 22200 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 1.648952603340149, |
| "eval_runtime": 21.6512, |
| "eval_samples_per_second": 44.247, |
| "eval_steps_per_second": 5.542, |
| "step": 22274 |
| }, |
| { |
| "epoch": 14.016341923318668, |
| "grad_norm": 4.320845127105713, |
| "learning_rate": 2.1968573224387177e-05, |
| "loss": 1.7866, |
| "step": 22300 |
| }, |
| { |
| "epoch": 14.079195474544312, |
| "grad_norm": 5.575278282165527, |
| "learning_rate": 2.184286612193589e-05, |
| "loss": 1.7572, |
| "step": 22400 |
| }, |
| { |
| "epoch": 14.142049025769955, |
| "grad_norm": 5.764155387878418, |
| "learning_rate": 2.17171590194846e-05, |
| "loss": 1.7566, |
| "step": 22500 |
| }, |
| { |
| "epoch": 14.2049025769956, |
| "grad_norm": 4.854477882385254, |
| "learning_rate": 2.1591451917033316e-05, |
| "loss": 1.7517, |
| "step": 22600 |
| }, |
| { |
| "epoch": 14.267756128221244, |
| "grad_norm": 4.7141618728637695, |
| "learning_rate": 2.1465744814582025e-05, |
| "loss": 1.713, |
| "step": 22700 |
| }, |
| { |
| "epoch": 14.330609679446889, |
| "grad_norm": 4.3324785232543945, |
| "learning_rate": 2.1340037712130736e-05, |
| "loss": 1.7511, |
| "step": 22800 |
| }, |
| { |
| "epoch": 14.393463230672532, |
| "grad_norm": 3.4204530715942383, |
| "learning_rate": 2.1214330609679448e-05, |
| "loss": 1.7451, |
| "step": 22900 |
| }, |
| { |
| "epoch": 14.456316781898177, |
| "grad_norm": 4.925296783447266, |
| "learning_rate": 2.108862350722816e-05, |
| "loss": 1.6868, |
| "step": 23000 |
| }, |
| { |
| "epoch": 14.51917033312382, |
| "grad_norm": 4.997200965881348, |
| "learning_rate": 2.0962916404776872e-05, |
| "loss": 1.7259, |
| "step": 23100 |
| }, |
| { |
| "epoch": 14.582023884349466, |
| "grad_norm": 4.816483497619629, |
| "learning_rate": 2.0837209302325584e-05, |
| "loss": 1.7716, |
| "step": 23200 |
| }, |
| { |
| "epoch": 14.64487743557511, |
| "grad_norm": 5.224360466003418, |
| "learning_rate": 2.0711502199874295e-05, |
| "loss": 1.7039, |
| "step": 23300 |
| }, |
| { |
| "epoch": 14.707730986800755, |
| "grad_norm": 7.450541019439697, |
| "learning_rate": 2.0585795097423004e-05, |
| "loss": 1.6634, |
| "step": 23400 |
| }, |
| { |
| "epoch": 14.770584538026398, |
| "grad_norm": 5.811767101287842, |
| "learning_rate": 2.0460087994971716e-05, |
| "loss": 1.7526, |
| "step": 23500 |
| }, |
| { |
| "epoch": 14.833438089252043, |
| "grad_norm": 4.1061272621154785, |
| "learning_rate": 2.0334380892520427e-05, |
| "loss": 1.7612, |
| "step": 23600 |
| }, |
| { |
| "epoch": 14.896291640477687, |
| "grad_norm": 4.599556922912598, |
| "learning_rate": 2.020867379006914e-05, |
| "loss": 1.776, |
| "step": 23700 |
| }, |
| { |
| "epoch": 14.959145191703332, |
| "grad_norm": 4.085700988769531, |
| "learning_rate": 2.008296668761785e-05, |
| "loss": 1.7143, |
| "step": 23800 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 1.6270309686660767, |
| "eval_runtime": 20.346, |
| "eval_samples_per_second": 47.085, |
| "eval_steps_per_second": 5.898, |
| "step": 23865 |
| }, |
| { |
| "epoch": 15.021998742928975, |
| "grad_norm": 8.476902961730957, |
| "learning_rate": 1.9957259585166563e-05, |
| "loss": 1.6504, |
| "step": 23900 |
| }, |
| { |
| "epoch": 15.08485229415462, |
| "grad_norm": 4.84979772567749, |
| "learning_rate": 1.9831552482715275e-05, |
| "loss": 1.7259, |
| "step": 24000 |
| }, |
| { |
| "epoch": 15.147705845380264, |
| "grad_norm": 4.314637184143066, |
| "learning_rate": 1.9705845380263983e-05, |
| "loss": 1.6254, |
| "step": 24100 |
| }, |
| { |
| "epoch": 15.210559396605909, |
| "grad_norm": 4.656597137451172, |
| "learning_rate": 1.9580138277812698e-05, |
| "loss": 1.7493, |
| "step": 24200 |
| }, |
| { |
| "epoch": 15.273412947831552, |
| "grad_norm": 4.276788711547852, |
| "learning_rate": 1.945443117536141e-05, |
| "loss": 1.6797, |
| "step": 24300 |
| }, |
| { |
| "epoch": 15.336266499057198, |
| "grad_norm": 3.9574031829833984, |
| "learning_rate": 1.9328724072910122e-05, |
| "loss": 1.716, |
| "step": 24400 |
| }, |
| { |
| "epoch": 15.399120050282841, |
| "grad_norm": 8.148831367492676, |
| "learning_rate": 1.920301697045883e-05, |
| "loss": 1.6737, |
| "step": 24500 |
| }, |
| { |
| "epoch": 15.461973601508486, |
| "grad_norm": 3.8734018802642822, |
| "learning_rate": 1.9077309868007542e-05, |
| "loss": 1.6452, |
| "step": 24600 |
| }, |
| { |
| "epoch": 15.52482715273413, |
| "grad_norm": 4.928835391998291, |
| "learning_rate": 1.8951602765556257e-05, |
| "loss": 1.7134, |
| "step": 24700 |
| }, |
| { |
| "epoch": 15.587680703959773, |
| "grad_norm": 4.991033554077148, |
| "learning_rate": 1.8825895663104966e-05, |
| "loss": 1.7327, |
| "step": 24800 |
| }, |
| { |
| "epoch": 15.650534255185418, |
| "grad_norm": 4.160732269287109, |
| "learning_rate": 1.8700188560653677e-05, |
| "loss": 1.6678, |
| "step": 24900 |
| }, |
| { |
| "epoch": 15.713387806411061, |
| "grad_norm": 6.523078441619873, |
| "learning_rate": 1.857448145820239e-05, |
| "loss": 1.6856, |
| "step": 25000 |
| }, |
| { |
| "epoch": 15.776241357636707, |
| "grad_norm": 6.306403636932373, |
| "learning_rate": 1.84487743557511e-05, |
| "loss": 1.6699, |
| "step": 25100 |
| }, |
| { |
| "epoch": 15.83909490886235, |
| "grad_norm": 4.479640483856201, |
| "learning_rate": 1.832306725329981e-05, |
| "loss": 1.676, |
| "step": 25200 |
| }, |
| { |
| "epoch": 15.901948460087995, |
| "grad_norm": 4.6891279220581055, |
| "learning_rate": 1.8197360150848525e-05, |
| "loss": 1.667, |
| "step": 25300 |
| }, |
| { |
| "epoch": 15.964802011313639, |
| "grad_norm": 5.908668518066406, |
| "learning_rate": 1.8071653048397236e-05, |
| "loss": 1.6267, |
| "step": 25400 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 1.608726143836975, |
| "eval_runtime": 20.3571, |
| "eval_samples_per_second": 47.06, |
| "eval_steps_per_second": 5.895, |
| "step": 25456 |
| }, |
| { |
| "epoch": 16.027655562539284, |
| "grad_norm": 4.081086158752441, |
| "learning_rate": 1.7945945945945948e-05, |
| "loss": 1.5625, |
| "step": 25500 |
| }, |
| { |
| "epoch": 16.090509113764927, |
| "grad_norm": 3.7648415565490723, |
| "learning_rate": 1.7820238843494657e-05, |
| "loss": 1.6818, |
| "step": 25600 |
| }, |
| { |
| "epoch": 16.15336266499057, |
| "grad_norm": 5.430357456207275, |
| "learning_rate": 1.769453174104337e-05, |
| "loss": 1.6125, |
| "step": 25700 |
| }, |
| { |
| "epoch": 16.216216216216218, |
| "grad_norm": 5.235119819641113, |
| "learning_rate": 1.7568824638592084e-05, |
| "loss": 1.6985, |
| "step": 25800 |
| }, |
| { |
| "epoch": 16.27906976744186, |
| "grad_norm": 5.521476745605469, |
| "learning_rate": 1.7443117536140792e-05, |
| "loss": 1.6291, |
| "step": 25900 |
| }, |
| { |
| "epoch": 16.341923318667504, |
| "grad_norm": 5.7086873054504395, |
| "learning_rate": 1.7317410433689504e-05, |
| "loss": 1.6523, |
| "step": 26000 |
| }, |
| { |
| "epoch": 16.404776869893148, |
| "grad_norm": 5.697257041931152, |
| "learning_rate": 1.7191703331238216e-05, |
| "loss": 1.6518, |
| "step": 26100 |
| }, |
| { |
| "epoch": 16.467630421118795, |
| "grad_norm": 8.258442878723145, |
| "learning_rate": 1.7065996228786928e-05, |
| "loss": 1.6314, |
| "step": 26200 |
| }, |
| { |
| "epoch": 16.530483972344438, |
| "grad_norm": 4.087442874908447, |
| "learning_rate": 1.694028912633564e-05, |
| "loss": 1.7048, |
| "step": 26300 |
| }, |
| { |
| "epoch": 16.59333752357008, |
| "grad_norm": 4.184548377990723, |
| "learning_rate": 1.681458202388435e-05, |
| "loss": 1.6062, |
| "step": 26400 |
| }, |
| { |
| "epoch": 16.656191074795725, |
| "grad_norm": 5.8042707443237305, |
| "learning_rate": 1.6688874921433063e-05, |
| "loss": 1.6239, |
| "step": 26500 |
| }, |
| { |
| "epoch": 16.719044626021372, |
| "grad_norm": 4.104475498199463, |
| "learning_rate": 1.656316781898177e-05, |
| "loss": 1.5742, |
| "step": 26600 |
| }, |
| { |
| "epoch": 16.781898177247015, |
| "grad_norm": 4.2934722900390625, |
| "learning_rate": 1.6437460716530483e-05, |
| "loss": 1.6069, |
| "step": 26700 |
| }, |
| { |
| "epoch": 16.84475172847266, |
| "grad_norm": 4.601330757141113, |
| "learning_rate": 1.6311753614079195e-05, |
| "loss": 1.5827, |
| "step": 26800 |
| }, |
| { |
| "epoch": 16.907605279698302, |
| "grad_norm": 4.304816246032715, |
| "learning_rate": 1.618604651162791e-05, |
| "loss": 1.6461, |
| "step": 26900 |
| }, |
| { |
| "epoch": 16.970458830923945, |
| "grad_norm": 6.80120325088501, |
| "learning_rate": 1.606033940917662e-05, |
| "loss": 1.6143, |
| "step": 27000 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 1.5869935750961304, |
| "eval_runtime": 20.3162, |
| "eval_samples_per_second": 47.154, |
| "eval_steps_per_second": 5.907, |
| "step": 27047 |
| }, |
| { |
| "epoch": 17.033312382149592, |
| "grad_norm": 4.368440628051758, |
| "learning_rate": 1.593463230672533e-05, |
| "loss": 1.6352, |
| "step": 27100 |
| }, |
| { |
| "epoch": 17.096165933375236, |
| "grad_norm": 4.066120624542236, |
| "learning_rate": 1.5808925204274042e-05, |
| "loss": 1.5052, |
| "step": 27200 |
| }, |
| { |
| "epoch": 17.15901948460088, |
| "grad_norm": 6.150811672210693, |
| "learning_rate": 1.5683218101822754e-05, |
| "loss": 1.5449, |
| "step": 27300 |
| }, |
| { |
| "epoch": 17.221873035826523, |
| "grad_norm": 7.994663715362549, |
| "learning_rate": 1.5557510999371466e-05, |
| "loss": 1.7157, |
| "step": 27400 |
| }, |
| { |
| "epoch": 17.28472658705217, |
| "grad_norm": 3.554856061935425, |
| "learning_rate": 1.5431803896920178e-05, |
| "loss": 1.5878, |
| "step": 27500 |
| }, |
| { |
| "epoch": 17.347580138277813, |
| "grad_norm": 4.025883674621582, |
| "learning_rate": 1.530609679446889e-05, |
| "loss": 1.6454, |
| "step": 27600 |
| }, |
| { |
| "epoch": 17.410433689503456, |
| "grad_norm": 2.9825448989868164, |
| "learning_rate": 1.51803896920176e-05, |
| "loss": 1.5605, |
| "step": 27700 |
| }, |
| { |
| "epoch": 17.4732872407291, |
| "grad_norm": 4.528345584869385, |
| "learning_rate": 1.505468258956631e-05, |
| "loss": 1.626, |
| "step": 27800 |
| }, |
| { |
| "epoch": 17.536140791954747, |
| "grad_norm": 4.549004554748535, |
| "learning_rate": 1.4928975487115023e-05, |
| "loss": 1.5508, |
| "step": 27900 |
| }, |
| { |
| "epoch": 17.59899434318039, |
| "grad_norm": 4.830588340759277, |
| "learning_rate": 1.4803268384663735e-05, |
| "loss": 1.5394, |
| "step": 28000 |
| }, |
| { |
| "epoch": 17.661847894406034, |
| "grad_norm": 4.127079486846924, |
| "learning_rate": 1.4677561282212447e-05, |
| "loss": 1.5548, |
| "step": 28100 |
| }, |
| { |
| "epoch": 17.724701445631677, |
| "grad_norm": 3.208592414855957, |
| "learning_rate": 1.4551854179761157e-05, |
| "loss": 1.5595, |
| "step": 28200 |
| }, |
| { |
| "epoch": 17.787554996857324, |
| "grad_norm": 4.784154891967773, |
| "learning_rate": 1.4426147077309869e-05, |
| "loss": 1.6029, |
| "step": 28300 |
| }, |
| { |
| "epoch": 17.850408548082967, |
| "grad_norm": 5.0941481590271, |
| "learning_rate": 1.4300439974858582e-05, |
| "loss": 1.634, |
| "step": 28400 |
| }, |
| { |
| "epoch": 17.91326209930861, |
| "grad_norm": 6.4498982429504395, |
| "learning_rate": 1.4174732872407292e-05, |
| "loss": 1.6685, |
| "step": 28500 |
| }, |
| { |
| "epoch": 17.976115650534254, |
| "grad_norm": 5.136322021484375, |
| "learning_rate": 1.4049025769956004e-05, |
| "loss": 1.5587, |
| "step": 28600 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 1.565408706665039, |
| "eval_runtime": 20.3165, |
| "eval_samples_per_second": 47.154, |
| "eval_steps_per_second": 5.907, |
| "step": 28638 |
| }, |
| { |
| "epoch": 18.0389692017599, |
| "grad_norm": 7.265219211578369, |
| "learning_rate": 1.3923318667504714e-05, |
| "loss": 1.534, |
| "step": 28700 |
| }, |
| { |
| "epoch": 18.101822752985544, |
| "grad_norm": 5.552704334259033, |
| "learning_rate": 1.3797611565053426e-05, |
| "loss": 1.5396, |
| "step": 28800 |
| }, |
| { |
| "epoch": 18.164676304211188, |
| "grad_norm": 7.356419086456299, |
| "learning_rate": 1.3671904462602136e-05, |
| "loss": 1.5851, |
| "step": 28900 |
| }, |
| { |
| "epoch": 18.22752985543683, |
| "grad_norm": 5.519120693206787, |
| "learning_rate": 1.354619736015085e-05, |
| "loss": 1.6331, |
| "step": 29000 |
| }, |
| { |
| "epoch": 18.290383406662478, |
| "grad_norm": 4.4178242683410645, |
| "learning_rate": 1.3420490257699561e-05, |
| "loss": 1.508, |
| "step": 29100 |
| }, |
| { |
| "epoch": 18.35323695788812, |
| "grad_norm": 4.479162216186523, |
| "learning_rate": 1.3294783155248271e-05, |
| "loss": 1.5201, |
| "step": 29200 |
| }, |
| { |
| "epoch": 18.416090509113765, |
| "grad_norm": 4.4193806648254395, |
| "learning_rate": 1.3169076052796983e-05, |
| "loss": 1.5393, |
| "step": 29300 |
| }, |
| { |
| "epoch": 18.47894406033941, |
| "grad_norm": 6.695824146270752, |
| "learning_rate": 1.3043368950345693e-05, |
| "loss": 1.6264, |
| "step": 29400 |
| }, |
| { |
| "epoch": 18.541797611565052, |
| "grad_norm": 4.760421276092529, |
| "learning_rate": 1.2917661847894409e-05, |
| "loss": 1.5465, |
| "step": 29500 |
| }, |
| { |
| "epoch": 18.6046511627907, |
| "grad_norm": 4.158078193664551, |
| "learning_rate": 1.2791954745443119e-05, |
| "loss": 1.5533, |
| "step": 29600 |
| }, |
| { |
| "epoch": 18.667504714016342, |
| "grad_norm": 6.8502092361450195, |
| "learning_rate": 1.266624764299183e-05, |
| "loss": 1.6525, |
| "step": 29700 |
| }, |
| { |
| "epoch": 18.730358265241986, |
| "grad_norm": 4.013594150543213, |
| "learning_rate": 1.254054054054054e-05, |
| "loss": 1.5357, |
| "step": 29800 |
| }, |
| { |
| "epoch": 18.79321181646763, |
| "grad_norm": 6.064908981323242, |
| "learning_rate": 1.2414833438089252e-05, |
| "loss": 1.5659, |
| "step": 29900 |
| }, |
| { |
| "epoch": 18.856065367693276, |
| "grad_norm": 5.281710624694824, |
| "learning_rate": 1.2289126335637964e-05, |
| "loss": 1.4692, |
| "step": 30000 |
| }, |
| { |
| "epoch": 18.91891891891892, |
| "grad_norm": 4.661835193634033, |
| "learning_rate": 1.2163419233186674e-05, |
| "loss": 1.5126, |
| "step": 30100 |
| }, |
| { |
| "epoch": 18.981772470144563, |
| "grad_norm": 3.9490227699279785, |
| "learning_rate": 1.2037712130735388e-05, |
| "loss": 1.5389, |
| "step": 30200 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 1.5563335418701172, |
| "eval_runtime": 20.3631, |
| "eval_samples_per_second": 47.046, |
| "eval_steps_per_second": 5.893, |
| "step": 30229 |
| }, |
| { |
| "epoch": 19.044626021370206, |
| "grad_norm": 4.6667866706848145, |
| "learning_rate": 1.1912005028284098e-05, |
| "loss": 1.5508, |
| "step": 30300 |
| }, |
| { |
| "epoch": 19.107479572595853, |
| "grad_norm": 4.471792697906494, |
| "learning_rate": 1.1786297925832811e-05, |
| "loss": 1.5253, |
| "step": 30400 |
| }, |
| { |
| "epoch": 19.170333123821496, |
| "grad_norm": 4.01970100402832, |
| "learning_rate": 1.1660590823381521e-05, |
| "loss": 1.5047, |
| "step": 30500 |
| }, |
| { |
| "epoch": 19.23318667504714, |
| "grad_norm": 5.021801471710205, |
| "learning_rate": 1.1534883720930233e-05, |
| "loss": 1.5459, |
| "step": 30600 |
| }, |
| { |
| "epoch": 19.296040226272783, |
| "grad_norm": 4.681889533996582, |
| "learning_rate": 1.1409176618478945e-05, |
| "loss": 1.561, |
| "step": 30700 |
| }, |
| { |
| "epoch": 19.35889377749843, |
| "grad_norm": 4.114772319793701, |
| "learning_rate": 1.1283469516027655e-05, |
| "loss": 1.532, |
| "step": 30800 |
| }, |
| { |
| "epoch": 19.421747328724074, |
| "grad_norm": 3.9337844848632812, |
| "learning_rate": 1.1157762413576367e-05, |
| "loss": 1.5512, |
| "step": 30900 |
| }, |
| { |
| "epoch": 19.484600879949717, |
| "grad_norm": 4.935436725616455, |
| "learning_rate": 1.1032055311125079e-05, |
| "loss": 1.5328, |
| "step": 31000 |
| }, |
| { |
| "epoch": 19.54745443117536, |
| "grad_norm": 5.703494071960449, |
| "learning_rate": 1.090634820867379e-05, |
| "loss": 1.5889, |
| "step": 31100 |
| }, |
| { |
| "epoch": 19.610307982401007, |
| "grad_norm": 6.010659217834473, |
| "learning_rate": 1.0780641106222502e-05, |
| "loss": 1.5166, |
| "step": 31200 |
| }, |
| { |
| "epoch": 19.67316153362665, |
| "grad_norm": 5.14444637298584, |
| "learning_rate": 1.0654934003771214e-05, |
| "loss": 1.5096, |
| "step": 31300 |
| }, |
| { |
| "epoch": 19.736015084852294, |
| "grad_norm": 7.321188449859619, |
| "learning_rate": 1.0529226901319924e-05, |
| "loss": 1.4865, |
| "step": 31400 |
| }, |
| { |
| "epoch": 19.798868636077938, |
| "grad_norm": 3.7702994346618652, |
| "learning_rate": 1.0403519798868636e-05, |
| "loss": 1.5122, |
| "step": 31500 |
| }, |
| { |
| "epoch": 19.86172218730358, |
| "grad_norm": 5.493444442749023, |
| "learning_rate": 1.0277812696417348e-05, |
| "loss": 1.4974, |
| "step": 31600 |
| }, |
| { |
| "epoch": 19.924575738529228, |
| "grad_norm": 5.273486137390137, |
| "learning_rate": 1.015210559396606e-05, |
| "loss": 1.5619, |
| "step": 31700 |
| }, |
| { |
| "epoch": 19.98742928975487, |
| "grad_norm": 4.340183734893799, |
| "learning_rate": 1.0026398491514772e-05, |
| "loss": 1.4476, |
| "step": 31800 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 1.5459223985671997, |
| "eval_runtime": 20.3264, |
| "eval_samples_per_second": 47.131, |
| "eval_steps_per_second": 5.904, |
| "step": 31820 |
| }, |
| { |
| "epoch": 20.050282840980515, |
| "grad_norm": 3.8120639324188232, |
| "learning_rate": 9.900691389063482e-06, |
| "loss": 1.4837, |
| "step": 31900 |
| }, |
| { |
| "epoch": 20.113136392206158, |
| "grad_norm": 4.154244899749756, |
| "learning_rate": 9.774984286612195e-06, |
| "loss": 1.4684, |
| "step": 32000 |
| }, |
| { |
| "epoch": 20.175989943431805, |
| "grad_norm": 3.925746202468872, |
| "learning_rate": 9.649277184160905e-06, |
| "loss": 1.4685, |
| "step": 32100 |
| }, |
| { |
| "epoch": 20.23884349465745, |
| "grad_norm": 5.944131374359131, |
| "learning_rate": 9.523570081709617e-06, |
| "loss": 1.5097, |
| "step": 32200 |
| }, |
| { |
| "epoch": 20.301697045883092, |
| "grad_norm": 4.755185127258301, |
| "learning_rate": 9.397862979258329e-06, |
| "loss": 1.4334, |
| "step": 32300 |
| }, |
| { |
| "epoch": 20.364550597108735, |
| "grad_norm": 4.627038478851318, |
| "learning_rate": 9.27215587680704e-06, |
| "loss": 1.503, |
| "step": 32400 |
| }, |
| { |
| "epoch": 20.427404148334382, |
| "grad_norm": 9.863165855407715, |
| "learning_rate": 9.14644877435575e-06, |
| "loss": 1.4607, |
| "step": 32500 |
| }, |
| { |
| "epoch": 20.490257699560026, |
| "grad_norm": 4.401854991912842, |
| "learning_rate": 9.020741671904463e-06, |
| "loss": 1.4653, |
| "step": 32600 |
| }, |
| { |
| "epoch": 20.55311125078567, |
| "grad_norm": 6.041737079620361, |
| "learning_rate": 8.895034569453174e-06, |
| "loss": 1.504, |
| "step": 32700 |
| }, |
| { |
| "epoch": 20.615964802011312, |
| "grad_norm": 6.523427963256836, |
| "learning_rate": 8.769327467001886e-06, |
| "loss": 1.6205, |
| "step": 32800 |
| }, |
| { |
| "epoch": 20.67881835323696, |
| "grad_norm": 5.47548246383667, |
| "learning_rate": 8.643620364550598e-06, |
| "loss": 1.4491, |
| "step": 32900 |
| }, |
| { |
| "epoch": 20.741671904462603, |
| "grad_norm": 5.3726959228515625, |
| "learning_rate": 8.517913262099308e-06, |
| "loss": 1.5817, |
| "step": 33000 |
| }, |
| { |
| "epoch": 20.804525455688246, |
| "grad_norm": 3.872283935546875, |
| "learning_rate": 8.392206159648022e-06, |
| "loss": 1.5482, |
| "step": 33100 |
| }, |
| { |
| "epoch": 20.86737900691389, |
| "grad_norm": 4.935946464538574, |
| "learning_rate": 8.266499057196732e-06, |
| "loss": 1.5006, |
| "step": 33200 |
| }, |
| { |
| "epoch": 20.930232558139537, |
| "grad_norm": 6.805904388427734, |
| "learning_rate": 8.140791954745444e-06, |
| "loss": 1.5314, |
| "step": 33300 |
| }, |
| { |
| "epoch": 20.99308610936518, |
| "grad_norm": 4.420083522796631, |
| "learning_rate": 8.015084852294155e-06, |
| "loss": 1.5417, |
| "step": 33400 |
| }, |
| { |
| "epoch": 21.0, |
| "eval_loss": 1.5356966257095337, |
| "eval_runtime": 20.4137, |
| "eval_samples_per_second": 46.929, |
| "eval_steps_per_second": 5.878, |
| "step": 33411 |
| }, |
| { |
| "epoch": 21.055939660590823, |
| "grad_norm": 3.697171688079834, |
| "learning_rate": 7.889377749842865e-06, |
| "loss": 1.4994, |
| "step": 33500 |
| }, |
| { |
| "epoch": 21.118793211816467, |
| "grad_norm": 5.232399940490723, |
| "learning_rate": 7.763670647391579e-06, |
| "loss": 1.5351, |
| "step": 33600 |
| }, |
| { |
| "epoch": 21.18164676304211, |
| "grad_norm": 4.508577823638916, |
| "learning_rate": 7.637963544940289e-06, |
| "loss": 1.4301, |
| "step": 33700 |
| }, |
| { |
| "epoch": 21.244500314267757, |
| "grad_norm": 5.425107479095459, |
| "learning_rate": 7.512256442489001e-06, |
| "loss": 1.4739, |
| "step": 33800 |
| }, |
| { |
| "epoch": 21.3073538654934, |
| "grad_norm": 6.195432186126709, |
| "learning_rate": 7.386549340037713e-06, |
| "loss": 1.5458, |
| "step": 33900 |
| }, |
| { |
| "epoch": 21.370207416719044, |
| "grad_norm": 5.850045204162598, |
| "learning_rate": 7.260842237586424e-06, |
| "loss": 1.5189, |
| "step": 34000 |
| }, |
| { |
| "epoch": 21.433060967944687, |
| "grad_norm": 7.121579170227051, |
| "learning_rate": 7.135135135135136e-06, |
| "loss": 1.5273, |
| "step": 34100 |
| }, |
| { |
| "epoch": 21.495914519170334, |
| "grad_norm": 4.316208362579346, |
| "learning_rate": 7.009428032683847e-06, |
| "loss": 1.4437, |
| "step": 34200 |
| }, |
| { |
| "epoch": 21.558768070395978, |
| "grad_norm": 4.3052873611450195, |
| "learning_rate": 6.883720930232558e-06, |
| "loss": 1.4266, |
| "step": 34300 |
| }, |
| { |
| "epoch": 21.62162162162162, |
| "grad_norm": 4.691330432891846, |
| "learning_rate": 6.758013827781271e-06, |
| "loss": 1.422, |
| "step": 34400 |
| }, |
| { |
| "epoch": 21.684475172847264, |
| "grad_norm": 4.346444129943848, |
| "learning_rate": 6.632306725329982e-06, |
| "loss": 1.5511, |
| "step": 34500 |
| }, |
| { |
| "epoch": 21.74732872407291, |
| "grad_norm": 5.304843902587891, |
| "learning_rate": 6.506599622878693e-06, |
| "loss": 1.4961, |
| "step": 34600 |
| }, |
| { |
| "epoch": 21.810182275298555, |
| "grad_norm": 4.877419948577881, |
| "learning_rate": 6.3808925204274045e-06, |
| "loss": 1.4837, |
| "step": 34700 |
| }, |
| { |
| "epoch": 21.873035826524198, |
| "grad_norm": 4.086881637573242, |
| "learning_rate": 6.2551854179761155e-06, |
| "loss": 1.5164, |
| "step": 34800 |
| }, |
| { |
| "epoch": 21.93588937774984, |
| "grad_norm": 4.570976734161377, |
| "learning_rate": 6.129478315524827e-06, |
| "loss": 1.4681, |
| "step": 34900 |
| }, |
| { |
| "epoch": 21.99874292897549, |
| "grad_norm": 25.407676696777344, |
| "learning_rate": 6.003771213073539e-06, |
| "loss": 1.4062, |
| "step": 35000 |
| }, |
| { |
| "epoch": 22.0, |
| "eval_loss": 1.5373815298080444, |
| "eval_runtime": 20.3495, |
| "eval_samples_per_second": 47.077, |
| "eval_steps_per_second": 5.897, |
| "step": 35002 |
| }, |
| { |
| "epoch": 22.061596480201132, |
| "grad_norm": 4.965208053588867, |
| "learning_rate": 5.878064110622251e-06, |
| "loss": 1.446, |
| "step": 35100 |
| }, |
| { |
| "epoch": 22.124450031426775, |
| "grad_norm": 5.620969772338867, |
| "learning_rate": 5.752357008170962e-06, |
| "loss": 1.475, |
| "step": 35200 |
| }, |
| { |
| "epoch": 22.18730358265242, |
| "grad_norm": 4.315845489501953, |
| "learning_rate": 5.626649905719674e-06, |
| "loss": 1.4866, |
| "step": 35300 |
| }, |
| { |
| "epoch": 22.250157133878066, |
| "grad_norm": 4.076879501342773, |
| "learning_rate": 5.5009428032683854e-06, |
| "loss": 1.5079, |
| "step": 35400 |
| }, |
| { |
| "epoch": 22.31301068510371, |
| "grad_norm": 9.52351188659668, |
| "learning_rate": 5.375235700817096e-06, |
| "loss": 1.5637, |
| "step": 35500 |
| }, |
| { |
| "epoch": 22.375864236329353, |
| "grad_norm": 5.529058933258057, |
| "learning_rate": 5.249528598365807e-06, |
| "loss": 1.4702, |
| "step": 35600 |
| }, |
| { |
| "epoch": 22.438717787554996, |
| "grad_norm": 4.761877536773682, |
| "learning_rate": 5.123821495914519e-06, |
| "loss": 1.4367, |
| "step": 35700 |
| }, |
| { |
| "epoch": 22.501571338780643, |
| "grad_norm": 6.587429046630859, |
| "learning_rate": 4.998114393463231e-06, |
| "loss": 1.4052, |
| "step": 35800 |
| }, |
| { |
| "epoch": 22.564424890006286, |
| "grad_norm": 5.834304332733154, |
| "learning_rate": 4.872407291011943e-06, |
| "loss": 1.4186, |
| "step": 35900 |
| }, |
| { |
| "epoch": 22.62727844123193, |
| "grad_norm": 3.871225595474243, |
| "learning_rate": 4.746700188560654e-06, |
| "loss": 1.51, |
| "step": 36000 |
| }, |
| { |
| "epoch": 22.690131992457573, |
| "grad_norm": 3.876692771911621, |
| "learning_rate": 4.6209930861093655e-06, |
| "loss": 1.5022, |
| "step": 36100 |
| }, |
| { |
| "epoch": 22.752985543683216, |
| "grad_norm": 4.569952964782715, |
| "learning_rate": 4.495285983658077e-06, |
| "loss": 1.454, |
| "step": 36200 |
| }, |
| { |
| "epoch": 22.815839094908863, |
| "grad_norm": 5.837776184082031, |
| "learning_rate": 4.369578881206788e-06, |
| "loss": 1.4472, |
| "step": 36300 |
| }, |
| { |
| "epoch": 22.878692646134507, |
| "grad_norm": 5.9942426681518555, |
| "learning_rate": 4.243871778755499e-06, |
| "loss": 1.4198, |
| "step": 36400 |
| }, |
| { |
| "epoch": 22.94154619736015, |
| "grad_norm": 4.1033220291137695, |
| "learning_rate": 4.118164676304211e-06, |
| "loss": 1.4658, |
| "step": 36500 |
| }, |
| { |
| "epoch": 23.0, |
| "eval_loss": 1.5307875871658325, |
| "eval_runtime": 20.3299, |
| "eval_samples_per_second": 47.123, |
| "eval_steps_per_second": 5.903, |
| "step": 36593 |
| }, |
| { |
| "epoch": 23.004399748585794, |
| "grad_norm": 4.649007320404053, |
| "learning_rate": 3.992457573852923e-06, |
| "loss": 1.4064, |
| "step": 36600 |
| }, |
| { |
| "epoch": 23.06725329981144, |
| "grad_norm": 4.318711757659912, |
| "learning_rate": 3.866750471401635e-06, |
| "loss": 1.4249, |
| "step": 36700 |
| }, |
| { |
| "epoch": 23.130106851037084, |
| "grad_norm": 6.213062286376953, |
| "learning_rate": 3.7410433689503456e-06, |
| "loss": 1.4317, |
| "step": 36800 |
| }, |
| { |
| "epoch": 23.192960402262727, |
| "grad_norm": 4.529442310333252, |
| "learning_rate": 3.6153362664990574e-06, |
| "loss": 1.5102, |
| "step": 36900 |
| }, |
| { |
| "epoch": 23.25581395348837, |
| "grad_norm": 4.912539005279541, |
| "learning_rate": 3.4896291640477688e-06, |
| "loss": 1.4684, |
| "step": 37000 |
| }, |
| { |
| "epoch": 23.318667504714018, |
| "grad_norm": 4.593921661376953, |
| "learning_rate": 3.3639220615964806e-06, |
| "loss": 1.4181, |
| "step": 37100 |
| }, |
| { |
| "epoch": 23.38152105593966, |
| "grad_norm": 5.35049295425415, |
| "learning_rate": 3.2382149591451915e-06, |
| "loss": 1.4813, |
| "step": 37200 |
| }, |
| { |
| "epoch": 23.444374607165305, |
| "grad_norm": 4.00051212310791, |
| "learning_rate": 3.1125078566939033e-06, |
| "loss": 1.4392, |
| "step": 37300 |
| }, |
| { |
| "epoch": 23.507228158390948, |
| "grad_norm": 5.91484260559082, |
| "learning_rate": 2.9868007542426147e-06, |
| "loss": 1.4386, |
| "step": 37400 |
| }, |
| { |
| "epoch": 23.570081709616595, |
| "grad_norm": 7.114585876464844, |
| "learning_rate": 2.861093651791326e-06, |
| "loss": 1.4115, |
| "step": 37500 |
| }, |
| { |
| "epoch": 23.63293526084224, |
| "grad_norm": 2.977877378463745, |
| "learning_rate": 2.735386549340038e-06, |
| "loss": 1.4211, |
| "step": 37600 |
| }, |
| { |
| "epoch": 23.69578881206788, |
| "grad_norm": 3.83953857421875, |
| "learning_rate": 2.6096794468887493e-06, |
| "loss": 1.4601, |
| "step": 37700 |
| }, |
| { |
| "epoch": 23.758642363293525, |
| "grad_norm": 4.377187728881836, |
| "learning_rate": 2.483972344437461e-06, |
| "loss": 1.4281, |
| "step": 37800 |
| }, |
| { |
| "epoch": 23.821495914519172, |
| "grad_norm": 3.9868085384368896, |
| "learning_rate": 2.358265241986172e-06, |
| "loss": 1.4585, |
| "step": 37900 |
| }, |
| { |
| "epoch": 23.884349465744815, |
| "grad_norm": 3.989767551422119, |
| "learning_rate": 2.232558139534884e-06, |
| "loss": 1.5302, |
| "step": 38000 |
| }, |
| { |
| "epoch": 23.94720301697046, |
| "grad_norm": 4.481296062469482, |
| "learning_rate": 2.1068510370835952e-06, |
| "loss": 1.4366, |
| "step": 38100 |
| }, |
| { |
| "epoch": 24.0, |
| "eval_loss": 1.5289642810821533, |
| "eval_runtime": 20.3269, |
| "eval_samples_per_second": 47.13, |
| "eval_steps_per_second": 5.904, |
| "step": 38184 |
| }, |
| { |
| "epoch": 24.010056568196102, |
| "grad_norm": 4.909224033355713, |
| "learning_rate": 1.981143934632307e-06, |
| "loss": 1.4956, |
| "step": 38200 |
| }, |
| { |
| "epoch": 24.072910119421746, |
| "grad_norm": 4.9214372634887695, |
| "learning_rate": 1.8554368321810182e-06, |
| "loss": 1.4725, |
| "step": 38300 |
| }, |
| { |
| "epoch": 24.135763670647393, |
| "grad_norm": 4.345515251159668, |
| "learning_rate": 1.7297297297297298e-06, |
| "loss": 1.4407, |
| "step": 38400 |
| }, |
| { |
| "epoch": 24.198617221873036, |
| "grad_norm": 4.926340579986572, |
| "learning_rate": 1.6040226272784412e-06, |
| "loss": 1.5008, |
| "step": 38500 |
| }, |
| { |
| "epoch": 24.26147077309868, |
| "grad_norm": 4.5064263343811035, |
| "learning_rate": 1.4783155248271527e-06, |
| "loss": 1.4868, |
| "step": 38600 |
| }, |
| { |
| "epoch": 24.324324324324323, |
| "grad_norm": 5.347716808319092, |
| "learning_rate": 1.3526084223758643e-06, |
| "loss": 1.45, |
| "step": 38700 |
| }, |
| { |
| "epoch": 24.38717787554997, |
| "grad_norm": 5.024169921875, |
| "learning_rate": 1.2269013199245757e-06, |
| "loss": 1.3905, |
| "step": 38800 |
| }, |
| { |
| "epoch": 24.450031426775613, |
| "grad_norm": 4.319692611694336, |
| "learning_rate": 1.1011942174732873e-06, |
| "loss": 1.4671, |
| "step": 38900 |
| }, |
| { |
| "epoch": 24.512884978001257, |
| "grad_norm": 2.880321979522705, |
| "learning_rate": 9.75487115021999e-07, |
| "loss": 1.4211, |
| "step": 39000 |
| }, |
| { |
| "epoch": 24.5757385292269, |
| "grad_norm": 4.416039943695068, |
| "learning_rate": 8.497800125707103e-07, |
| "loss": 1.4176, |
| "step": 39100 |
| }, |
| { |
| "epoch": 24.638592080452547, |
| "grad_norm": 4.598896503448486, |
| "learning_rate": 7.240729101194218e-07, |
| "loss": 1.4194, |
| "step": 39200 |
| }, |
| { |
| "epoch": 24.70144563167819, |
| "grad_norm": 4.256235599517822, |
| "learning_rate": 5.983658076681333e-07, |
| "loss": 1.4331, |
| "step": 39300 |
| }, |
| { |
| "epoch": 24.764299182903834, |
| "grad_norm": 4.7764811515808105, |
| "learning_rate": 4.726587052168448e-07, |
| "loss": 1.4491, |
| "step": 39400 |
| }, |
| { |
| "epoch": 24.827152734129477, |
| "grad_norm": 4.296844005584717, |
| "learning_rate": 3.4695160276555627e-07, |
| "loss": 1.4443, |
| "step": 39500 |
| }, |
| { |
| "epoch": 24.890006285355124, |
| "grad_norm": 3.9589693546295166, |
| "learning_rate": 2.2124450031426776e-07, |
| "loss": 1.4612, |
| "step": 39600 |
| }, |
| { |
| "epoch": 24.952859836580767, |
| "grad_norm": 4.165828227996826, |
| "learning_rate": 9.553739786297926e-08, |
| "loss": 1.48, |
| "step": 39700 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_loss": 1.528791069984436, |
| "eval_runtime": 20.2887, |
| "eval_samples_per_second": 47.218, |
| "eval_steps_per_second": 5.915, |
| "step": 39775 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 39775, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 25, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.44418915549184e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|