| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.954168967421314, |
| "eval_steps": 500, |
| "global_step": 280, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0176697956929873, |
| "grad_norm": 5.875631617147988, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.8112, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0353395913859746, |
| "grad_norm": 5.961953835603671, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.8194, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0530093870789619, |
| "grad_norm": 5.5057251196001635, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 0.796, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0706791827719492, |
| "grad_norm": 2.504760468622455, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 0.7065, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0883489784649365, |
| "grad_norm": 3.925627650334603, |
| "learning_rate": 1.4285714285714287e-05, |
| "loss": 0.7078, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.1060187741579238, |
| "grad_norm": 4.140394334047557, |
| "learning_rate": 1.7142857142857142e-05, |
| "loss": 0.6955, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.1236885698509111, |
| "grad_norm": 4.544574066739854, |
| "learning_rate": 2e-05, |
| "loss": 0.6606, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.1413583655438984, |
| "grad_norm": 2.8858399260590226, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": 0.6356, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.1590281612368857, |
| "grad_norm": 2.810080725674096, |
| "learning_rate": 2.5714285714285718e-05, |
| "loss": 0.6119, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.176697956929873, |
| "grad_norm": 2.3552991906071616, |
| "learning_rate": 2.8571428571428574e-05, |
| "loss": 0.6021, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.1943677526228603, |
| "grad_norm": 1.2910105034140027, |
| "learning_rate": 3.142857142857143e-05, |
| "loss": 0.5694, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.2120375483158476, |
| "grad_norm": 1.6332855505381296, |
| "learning_rate": 3.4285714285714284e-05, |
| "loss": 0.554, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.22970734400883489, |
| "grad_norm": 1.2595348729805071, |
| "learning_rate": 3.714285714285715e-05, |
| "loss": 0.5432, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.2473771397018222, |
| "grad_norm": 1.3472823656550243, |
| "learning_rate": 4e-05, |
| "loss": 0.536, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2650469353948095, |
| "grad_norm": 0.9873839365603844, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": 0.5393, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2827167310877968, |
| "grad_norm": 1.1377505855165275, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": 0.5293, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.3003865267807841, |
| "grad_norm": 1.0613233081748863, |
| "learning_rate": 4.857142857142857e-05, |
| "loss": 0.5129, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3180563224737714, |
| "grad_norm": 1.1184007657565689, |
| "learning_rate": 5.1428571428571436e-05, |
| "loss": 0.5091, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.3357261181667587, |
| "grad_norm": 1.1596525894157543, |
| "learning_rate": 5.4285714285714295e-05, |
| "loss": 0.5141, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.353395913859746, |
| "grad_norm": 1.1056524487969162, |
| "learning_rate": 5.714285714285715e-05, |
| "loss": 0.5065, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.37106570955273327, |
| "grad_norm": 1.2246588779747853, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.5001, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.3887355052457206, |
| "grad_norm": 0.9005160048442259, |
| "learning_rate": 6.285714285714286e-05, |
| "loss": 0.4948, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.4064053009387079, |
| "grad_norm": 1.468175523379714, |
| "learning_rate": 6.571428571428571e-05, |
| "loss": 0.5029, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.4240750966316952, |
| "grad_norm": 0.7100895840287704, |
| "learning_rate": 6.857142857142857e-05, |
| "loss": 0.4901, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.4417448923246825, |
| "grad_norm": 1.4524384718122851, |
| "learning_rate": 7.142857142857143e-05, |
| "loss": 0.5015, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.45941468801766977, |
| "grad_norm": 3.29851781812619, |
| "learning_rate": 7.42857142857143e-05, |
| "loss": 0.4866, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.4770844837106571, |
| "grad_norm": 1.6338876773578732, |
| "learning_rate": 7.714285714285715e-05, |
| "loss": 0.5027, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.4947542794036444, |
| "grad_norm": 1.1095768120544467, |
| "learning_rate": 8e-05, |
| "loss": 0.4881, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.5124240750966317, |
| "grad_norm": 0.9195853861546358, |
| "learning_rate": 7.9996891699239e-05, |
| "loss": 0.4766, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.530093870789619, |
| "grad_norm": 1.893948627853531, |
| "learning_rate": 7.998756728003266e-05, |
| "loss": 0.5003, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5477636664826063, |
| "grad_norm": 1.3966468358301396, |
| "learning_rate": 7.997202819153595e-05, |
| "loss": 0.4748, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.5654334621755936, |
| "grad_norm": 1.173904127551551, |
| "learning_rate": 7.99502768487569e-05, |
| "loss": 0.4728, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.5831032578685809, |
| "grad_norm": 1.6511214276120525, |
| "learning_rate": 7.992231663218129e-05, |
| "loss": 0.4727, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.6007730535615682, |
| "grad_norm": 0.8130185719483476, |
| "learning_rate": 7.988815188724721e-05, |
| "loss": 0.4696, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.6184428492545555, |
| "grad_norm": 1.2456661967993063, |
| "learning_rate": 7.984778792366983e-05, |
| "loss": 0.4711, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.6361126449475428, |
| "grad_norm": 1.0570393538806437, |
| "learning_rate": 7.980123101461606e-05, |
| "loss": 0.463, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.6537824406405301, |
| "grad_norm": 0.9901050412456828, |
| "learning_rate": 7.974848839572971e-05, |
| "loss": 0.4578, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.6714522363335174, |
| "grad_norm": 0.8705213139045455, |
| "learning_rate": 7.96895682640069e-05, |
| "loss": 0.4645, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.6891220320265047, |
| "grad_norm": 0.7854045575641022, |
| "learning_rate": 7.962447977652211e-05, |
| "loss": 0.4647, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.706791827719492, |
| "grad_norm": 0.8451669852541023, |
| "learning_rate": 7.955323304900514e-05, |
| "loss": 0.4609, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7244616234124793, |
| "grad_norm": 1.0240484592422439, |
| "learning_rate": 7.947583915426885e-05, |
| "loss": 0.4583, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.7421314191054665, |
| "grad_norm": 0.9325468461529605, |
| "learning_rate": 7.939231012048833e-05, |
| "loss": 0.4599, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.7598012147984539, |
| "grad_norm": 0.8208173672541784, |
| "learning_rate": 7.930265892933154e-05, |
| "loss": 0.4488, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.7774710104914412, |
| "grad_norm": 0.8066370895813, |
| "learning_rate": 7.920689951394175e-05, |
| "loss": 0.4599, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.7951408061844285, |
| "grad_norm": 0.5967750986819955, |
| "learning_rate": 7.91050467567722e-05, |
| "loss": 0.4531, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8128106018774158, |
| "grad_norm": 0.6644148535772102, |
| "learning_rate": 7.899711648727294e-05, |
| "loss": 0.4503, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.830480397570403, |
| "grad_norm": 0.7344655144114142, |
| "learning_rate": 7.888312547943099e-05, |
| "loss": 0.4509, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.8481501932633904, |
| "grad_norm": 3.462692478412094, |
| "learning_rate": 7.876309144916312e-05, |
| "loss": 0.4933, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.8658199889563777, |
| "grad_norm": 1.0900503309702443, |
| "learning_rate": 7.863703305156273e-05, |
| "loss": 0.4673, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.883489784649365, |
| "grad_norm": 1.372735585333971, |
| "learning_rate": 7.850496987800048e-05, |
| "loss": 0.45, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9011595803423523, |
| "grad_norm": 0.9943280611059013, |
| "learning_rate": 7.836692245307951e-05, |
| "loss": 0.4619, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.9188293760353395, |
| "grad_norm": 1.1230540437524632, |
| "learning_rate": 7.822291223144564e-05, |
| "loss": 0.4602, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.9364991717283269, |
| "grad_norm": 0.9358182024156444, |
| "learning_rate": 7.80729615944529e-05, |
| "loss": 0.4627, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.9541689674213142, |
| "grad_norm": 0.6965038938335383, |
| "learning_rate": 7.791709384668528e-05, |
| "loss": 0.4377, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.9718387631143015, |
| "grad_norm": 0.8794762258340597, |
| "learning_rate": 7.775533321233471e-05, |
| "loss": 0.4416, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.9895085588072888, |
| "grad_norm": 0.5702192945738279, |
| "learning_rate": 7.758770483143634e-05, |
| "loss": 0.4381, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.0088348978464936, |
| "grad_norm": 1.0158837364149667, |
| "learning_rate": 7.741423475596136e-05, |
| "loss": 0.6602, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.026504693539481, |
| "grad_norm": 0.7661664733655509, |
| "learning_rate": 7.723494994576818e-05, |
| "loss": 0.4224, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0441744892324683, |
| "grad_norm": 0.6984964000427009, |
| "learning_rate": 7.704987826441235e-05, |
| "loss": 0.423, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.0618442849254555, |
| "grad_norm": 0.543414847614615, |
| "learning_rate": 7.685904847481631e-05, |
| "loss": 0.4214, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.079514080618443, |
| "grad_norm": 0.6586728820833049, |
| "learning_rate": 7.666249023479905e-05, |
| "loss": 0.4232, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.09718387631143, |
| "grad_norm": 0.48664555730901193, |
| "learning_rate": 7.646023409246694e-05, |
| "loss": 0.4184, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.1148536720044175, |
| "grad_norm": 0.45691517952976585, |
| "learning_rate": 7.625231148146601e-05, |
| "loss": 0.4116, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.1325234676974048, |
| "grad_norm": 0.44580349839113315, |
| "learning_rate": 7.603875471609677e-05, |
| "loss": 0.4148, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.150193263390392, |
| "grad_norm": 0.38551843328636426, |
| "learning_rate": 7.581959698629204e-05, |
| "loss": 0.4081, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.1678630590833794, |
| "grad_norm": 0.4359324964542397, |
| "learning_rate": 7.559487235245875e-05, |
| "loss": 0.4151, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.1855328547763666, |
| "grad_norm": 0.41530730049997977, |
| "learning_rate": 7.536461574018439e-05, |
| "loss": 0.4116, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.203202650469354, |
| "grad_norm": 0.38292190540156723, |
| "learning_rate": 7.512886293480914e-05, |
| "loss": 0.4099, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.2208724461623413, |
| "grad_norm": 0.4050268821480764, |
| "learning_rate": 7.488765057586422e-05, |
| "loss": 0.4059, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.2385422418553285, |
| "grad_norm": 0.573303636137278, |
| "learning_rate": 7.464101615137756e-05, |
| "loss": 0.4137, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.256212037548316, |
| "grad_norm": 0.6806327106102632, |
| "learning_rate": 7.438899799204762e-05, |
| "loss": 0.412, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.273881833241303, |
| "grad_norm": 0.5165313145422143, |
| "learning_rate": 7.413163526528623e-05, |
| "loss": 0.4078, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.2915516289342905, |
| "grad_norm": 0.5532520513811989, |
| "learning_rate": 7.386896796913137e-05, |
| "loss": 0.4026, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.3092214246272778, |
| "grad_norm": 0.6170140080440525, |
| "learning_rate": 7.360103692603087e-05, |
| "loss": 0.4025, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.326891220320265, |
| "grad_norm": 0.42873518437379604, |
| "learning_rate": 7.332788377649796e-05, |
| "loss": 0.4052, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.3445610160132524, |
| "grad_norm": 0.449652816843054, |
| "learning_rate": 7.30495509726398e-05, |
| "loss": 0.4089, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.3622308117062396, |
| "grad_norm": 0.4609411200515682, |
| "learning_rate": 7.276608177155968e-05, |
| "loss": 0.409, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.379900607399227, |
| "grad_norm": 0.37783779994964356, |
| "learning_rate": 7.247752022863428e-05, |
| "loss": 0.411, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.3975704030922143, |
| "grad_norm": 0.47167800258784437, |
| "learning_rate": 7.218391119066674e-05, |
| "loss": 0.4006, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.4152401987852015, |
| "grad_norm": 0.5209290267234993, |
| "learning_rate": 7.188530028891691e-05, |
| "loss": 0.3961, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.432909994478189, |
| "grad_norm": 0.35540053272228744, |
| "learning_rate": 7.158173393200942e-05, |
| "loss": 0.3999, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.450579790171176, |
| "grad_norm": 0.2822473247891192, |
| "learning_rate": 7.12732592987212e-05, |
| "loss": 0.4029, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.4682495858641635, |
| "grad_norm": 0.44251535710813034, |
| "learning_rate": 7.09599243306491e-05, |
| "loss": 0.411, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.4859193815571508, |
| "grad_norm": 0.46257597741002365, |
| "learning_rate": 7.064177772475912e-05, |
| "loss": 0.3997, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.503589177250138, |
| "grad_norm": 0.3804354174312414, |
| "learning_rate": 7.031886892581813e-05, |
| "loss": 0.3984, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.5212589729431254, |
| "grad_norm": 0.2667610882166938, |
| "learning_rate": 6.999124811870938e-05, |
| "loss": 0.3986, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.5389287686361126, |
| "grad_norm": 0.30942594899982945, |
| "learning_rate": 6.965896622063307e-05, |
| "loss": 0.4055, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.5565985643291, |
| "grad_norm": 0.3550745842897038, |
| "learning_rate": 6.932207487319305e-05, |
| "loss": 0.408, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.5742683600220873, |
| "grad_norm": 0.3569377139350627, |
| "learning_rate": 6.898062643437091e-05, |
| "loss": 0.3961, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.5919381557150745, |
| "grad_norm": 0.33698677021351336, |
| "learning_rate": 6.863467397038874e-05, |
| "loss": 0.3927, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.609607951408062, |
| "grad_norm": 0.4283132299621792, |
| "learning_rate": 6.828427124746191e-05, |
| "loss": 0.3962, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.627277747101049, |
| "grad_norm": 0.4800585612046361, |
| "learning_rate": 6.792947272344292e-05, |
| "loss": 0.4024, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.6449475427940365, |
| "grad_norm": 0.4106952334802103, |
| "learning_rate": 6.757033353935788e-05, |
| "loss": 0.3983, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.6626173384870238, |
| "grad_norm": 0.44863543828572716, |
| "learning_rate": 6.720690951083678e-05, |
| "loss": 0.3983, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.680287134180011, |
| "grad_norm": 0.5972683621698959, |
| "learning_rate": 6.68392571194388e-05, |
| "loss": 0.3952, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.6979569298729982, |
| "grad_norm": 0.6864704632849609, |
| "learning_rate": 6.646743350387438e-05, |
| "loss": 0.4052, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.7156267255659856, |
| "grad_norm": 0.5997402895023315, |
| "learning_rate": 6.609149645112485e-05, |
| "loss": 0.3977, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.733296521258973, |
| "grad_norm": 0.42336994036744263, |
| "learning_rate": 6.571150438746157e-05, |
| "loss": 0.3985, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.7509663169519603, |
| "grad_norm": 0.416677515975007, |
| "learning_rate": 6.532751636936561e-05, |
| "loss": 0.4043, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.7686361126449475, |
| "grad_norm": 0.5095146036807348, |
| "learning_rate": 6.493959207434934e-05, |
| "loss": 0.3931, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.7863059083379347, |
| "grad_norm": 0.45946367582524933, |
| "learning_rate": 6.45477917916819e-05, |
| "loss": 0.3972, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.8039757040309221, |
| "grad_norm": 0.3713407116922932, |
| "learning_rate": 6.41521764130191e-05, |
| "loss": 0.4044, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.8216454997239095, |
| "grad_norm": 0.40017896468093417, |
| "learning_rate": 6.375280742294007e-05, |
| "loss": 0.398, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.8393152954168968, |
| "grad_norm": 0.5022523013468211, |
| "learning_rate": 6.334974688939161e-05, |
| "loss": 0.3963, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.856985091109884, |
| "grad_norm": 0.3934847620868829, |
| "learning_rate": 6.294305745404185e-05, |
| "loss": 0.3884, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.8746548868028712, |
| "grad_norm": 0.3453314877586019, |
| "learning_rate": 6.253280232254489e-05, |
| "loss": 0.3899, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.8923246824958586, |
| "grad_norm": 0.4479654044472846, |
| "learning_rate": 6.211904525471758e-05, |
| "loss": 0.3938, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.909994478188846, |
| "grad_norm": 0.39352474590110836, |
| "learning_rate": 6.170185055463039e-05, |
| "loss": 0.3915, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.9276642738818333, |
| "grad_norm": 0.2714956264200266, |
| "learning_rate": 6.128128306061347e-05, |
| "loss": 0.3899, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.9453340695748205, |
| "grad_norm": 0.31091747939470954, |
| "learning_rate": 6.0857408135179926e-05, |
| "loss": 0.3893, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.9630038652678077, |
| "grad_norm": 0.3575276621136432, |
| "learning_rate": 6.0430291654867435e-05, |
| "loss": 0.3913, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.9806736609607951, |
| "grad_norm": 0.2508689498362062, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.3972, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.9983434566537825, |
| "grad_norm": 0.5605963723377234, |
| "learning_rate": 5.9566600044371584e-05, |
| "loss": 0.5878, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.017669795692987, |
| "grad_norm": 0.7102289660217768, |
| "learning_rate": 5.913015914485274e-05, |
| "loss": 0.3661, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.0353395913859744, |
| "grad_norm": 0.3944445954033844, |
| "learning_rate": 5.869074513092249e-05, |
| "loss": 0.373, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.053009387078962, |
| "grad_norm": 0.48382052666433606, |
| "learning_rate": 5.824842629412653e-05, |
| "loss": 0.3739, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.0706791827719493, |
| "grad_norm": 0.5806563133800278, |
| "learning_rate": 5.7803271377463695e-05, |
| "loss": 0.3672, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.0883489784649365, |
| "grad_norm": 0.6502257019855456, |
| "learning_rate": 5.735534956470233e-05, |
| "loss": 0.3644, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.1060187741579237, |
| "grad_norm": 0.8472824581196564, |
| "learning_rate": 5.6904730469627985e-05, |
| "loss": 0.3709, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.123688569850911, |
| "grad_norm": 0.6853246906791319, |
| "learning_rate": 5.645148412522447e-05, |
| "loss": 0.3645, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.1413583655438986, |
| "grad_norm": 0.4107921656466385, |
| "learning_rate": 5.5995680972789634e-05, |
| "loss": 0.3662, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.159028161236886, |
| "grad_norm": 0.37871427512746525, |
| "learning_rate": 5.5537391850987795e-05, |
| "loss": 0.3614, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.176697956929873, |
| "grad_norm": 0.5083596297218169, |
| "learning_rate": 5.507668798484021e-05, |
| "loss": 0.3645, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.19436775262286, |
| "grad_norm": 0.4631791629644736, |
| "learning_rate": 5.461364097465581e-05, |
| "loss": 0.3651, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.2120375483158474, |
| "grad_norm": 0.3234371144780778, |
| "learning_rate": 5.414832278490326e-05, |
| "loss": 0.3604, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.229707344008835, |
| "grad_norm": 0.352276909118395, |
| "learning_rate": 5.368080573302676e-05, |
| "loss": 0.3662, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.2473771397018223, |
| "grad_norm": 0.41914234346119467, |
| "learning_rate": 5.321116247820669e-05, |
| "loss": 0.3603, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.2650469353948095, |
| "grad_norm": 0.28573640352352386, |
| "learning_rate": 5.2739466010067385e-05, |
| "loss": 0.3556, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.2827167310877967, |
| "grad_norm": 0.25492365093010394, |
| "learning_rate": 5.226578963733338e-05, |
| "loss": 0.363, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.300386526780784, |
| "grad_norm": 0.2927182277484276, |
| "learning_rate": 5.179020697643618e-05, |
| "loss": 0.3636, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.3180563224737716, |
| "grad_norm": 0.2580774273705453, |
| "learning_rate": 5.13127919400731e-05, |
| "loss": 0.3632, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.335726118166759, |
| "grad_norm": 0.22474459932551596, |
| "learning_rate": 5.0833618725720214e-05, |
| "loss": 0.3614, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.353395913859746, |
| "grad_norm": 0.25586289471702456, |
| "learning_rate": 5.0352761804100835e-05, |
| "loss": 0.36, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.371065709552733, |
| "grad_norm": 0.24042001746342648, |
| "learning_rate": 4.987029590761174e-05, |
| "loss": 0.3667, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.3887355052457204, |
| "grad_norm": 0.24039897426531276, |
| "learning_rate": 4.9386296018708614e-05, |
| "loss": 0.3673, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.406405300938708, |
| "grad_norm": 0.27314874775426934, |
| "learning_rate": 4.890083735825258e-05, |
| "loss": 0.3619, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.4240750966316953, |
| "grad_norm": 0.23246378924197766, |
| "learning_rate": 4.841399537381984e-05, |
| "loss": 0.3623, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.4417448923246825, |
| "grad_norm": 0.191508019981753, |
| "learning_rate": 4.792584572797591e-05, |
| "loss": 0.3633, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.4594146880176697, |
| "grad_norm": 0.22416894070833618, |
| "learning_rate": 4.743646428651659e-05, |
| "loss": 0.3584, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.477084483710657, |
| "grad_norm": 0.20230146319655629, |
| "learning_rate": 4.694592710667723e-05, |
| "loss": 0.3615, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.4947542794036446, |
| "grad_norm": 0.1822985257854695, |
| "learning_rate": 4.645431042531227e-05, |
| "loss": 0.363, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.512424075096632, |
| "grad_norm": 0.20736102263198034, |
| "learning_rate": 4.5961690647046974e-05, |
| "loss": 0.3586, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.530093870789619, |
| "grad_norm": 0.17577697599602832, |
| "learning_rate": 4.546814433240294e-05, |
| "loss": 0.3598, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.547763666482606, |
| "grad_norm": 0.20267553551988982, |
| "learning_rate": 4.4973748185899416e-05, |
| "loss": 0.3595, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.5654334621755934, |
| "grad_norm": 0.16672091257518545, |
| "learning_rate": 4.4478579044132314e-05, |
| "loss": 0.3591, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.583103257868581, |
| "grad_norm": 0.18713978691301508, |
| "learning_rate": 4.398271386383267e-05, |
| "loss": 0.3588, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.6007730535615683, |
| "grad_norm": 0.1544729610927051, |
| "learning_rate": 4.348622970990634e-05, |
| "loss": 0.3535, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.6184428492545555, |
| "grad_norm": 0.16023976528470063, |
| "learning_rate": 4.298920374345698e-05, |
| "loss": 0.3624, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.6361126449475427, |
| "grad_norm": 0.1952298963833661, |
| "learning_rate": 4.249171320979409e-05, |
| "loss": 0.3592, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.65378244064053, |
| "grad_norm": 0.17353667541371376, |
| "learning_rate": 4.199383542642789e-05, |
| "loss": 0.3655, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.6714522363335176, |
| "grad_norm": 0.194516804675962, |
| "learning_rate": 4.149564777105304e-05, |
| "loss": 0.3565, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.689122032026505, |
| "grad_norm": 0.1758217981986949, |
| "learning_rate": 4.0997227669522924e-05, |
| "loss": 0.3666, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.706791827719492, |
| "grad_norm": 0.16067174619876137, |
| "learning_rate": 4.0498652583816606e-05, |
| "loss": 0.3592, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.724461623412479, |
| "grad_norm": 0.14593999093364027, |
| "learning_rate": 4e-05, |
| "loss": 0.3561, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.7421314191054664, |
| "grad_norm": 0.1603549305082767, |
| "learning_rate": 3.95013474161834e-05, |
| "loss": 0.3588, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.759801214798454, |
| "grad_norm": 0.11543165975506586, |
| "learning_rate": 3.9002772330477096e-05, |
| "loss": 0.3613, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.7774710104914413, |
| "grad_norm": 0.16445493416381396, |
| "learning_rate": 3.850435222894698e-05, |
| "loss": 0.3607, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.7951408061844285, |
| "grad_norm": 0.1414422759760679, |
| "learning_rate": 3.800616457357211e-05, |
| "loss": 0.36, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.8128106018774157, |
| "grad_norm": 0.13046409193282807, |
| "learning_rate": 3.7508286790205916e-05, |
| "loss": 0.35, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.830480397570403, |
| "grad_norm": 0.1596088175716807, |
| "learning_rate": 3.7010796256543034e-05, |
| "loss": 0.3639, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.8481501932633906, |
| "grad_norm": 0.1263702590709236, |
| "learning_rate": 3.6513770290093674e-05, |
| "loss": 0.3592, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.865819988956378, |
| "grad_norm": 0.14761623625344572, |
| "learning_rate": 3.601728613616734e-05, |
| "loss": 0.3609, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.883489784649365, |
| "grad_norm": 0.1457836294562342, |
| "learning_rate": 3.552142095586769e-05, |
| "loss": 0.3515, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.901159580342352, |
| "grad_norm": 0.11727253956988348, |
| "learning_rate": 3.5026251814100604e-05, |
| "loss": 0.3611, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.9188293760353394, |
| "grad_norm": 0.13457982400211851, |
| "learning_rate": 3.453185566759707e-05, |
| "loss": 0.3536, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.936499171728327, |
| "grad_norm": 0.13998361556088473, |
| "learning_rate": 3.403830935295302e-05, |
| "loss": 0.3608, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.9541689674213143, |
| "grad_norm": 0.14067124850466778, |
| "learning_rate": 3.3545689574687734e-05, |
| "loss": 0.3706, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.9718387631143015, |
| "grad_norm": 0.1318011050837946, |
| "learning_rate": 3.305407289332279e-05, |
| "loss": 0.3544, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.9895085588072887, |
| "grad_norm": 0.14546432839266002, |
| "learning_rate": 3.256353571348342e-05, |
| "loss": 0.3709, |
| "step": 169 |
| }, |
| { |
| "epoch": 3.008834897846494, |
| "grad_norm": 0.19583526414298022, |
| "learning_rate": 3.207415427202411e-05, |
| "loss": 0.527, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.026504693539481, |
| "grad_norm": 0.1877498962594719, |
| "learning_rate": 3.1586004626180175e-05, |
| "loss": 0.3322, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.0441744892324683, |
| "grad_norm": 0.18186432632351485, |
| "learning_rate": 3.109916264174743e-05, |
| "loss": 0.3366, |
| "step": 172 |
| }, |
| { |
| "epoch": 3.0618442849254555, |
| "grad_norm": 0.19885800612643872, |
| "learning_rate": 3.0613703981291406e-05, |
| "loss": 0.3324, |
| "step": 173 |
| }, |
| { |
| "epoch": 3.0795140806184427, |
| "grad_norm": 0.2089734413211629, |
| "learning_rate": 3.0129704092388253e-05, |
| "loss": 0.3339, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.0971838763114303, |
| "grad_norm": 0.17173171928937206, |
| "learning_rate": 2.9647238195899168e-05, |
| "loss": 0.3335, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.1148536720044175, |
| "grad_norm": 0.2123048979948462, |
| "learning_rate": 2.9166381274279803e-05, |
| "loss": 0.3333, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.1325234676974048, |
| "grad_norm": 0.16806112420590733, |
| "learning_rate": 2.8687208059926904e-05, |
| "loss": 0.3353, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.150193263390392, |
| "grad_norm": 0.17559998407873006, |
| "learning_rate": 2.8209793023563833e-05, |
| "loss": 0.3304, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.167863059083379, |
| "grad_norm": 0.1478401374423511, |
| "learning_rate": 2.7734210362666637e-05, |
| "loss": 0.3301, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.185532854776367, |
| "grad_norm": 0.15148759846687243, |
| "learning_rate": 2.7260533989932628e-05, |
| "loss": 0.3332, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.203202650469354, |
| "grad_norm": 0.13080567876743235, |
| "learning_rate": 2.678883752179333e-05, |
| "loss": 0.3296, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.2208724461623413, |
| "grad_norm": 0.14032258509798645, |
| "learning_rate": 2.6319194266973256e-05, |
| "loss": 0.3272, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.2385422418553285, |
| "grad_norm": 0.1162785431430188, |
| "learning_rate": 2.5851677215096745e-05, |
| "loss": 0.3316, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.2562120375483157, |
| "grad_norm": 0.13212366326804822, |
| "learning_rate": 2.53863590253442e-05, |
| "loss": 0.3357, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.2738818332413033, |
| "grad_norm": 0.11293978492936926, |
| "learning_rate": 2.4923312015159794e-05, |
| "loss": 0.3301, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.2915516289342905, |
| "grad_norm": 0.11627210040478692, |
| "learning_rate": 2.4462608149012215e-05, |
| "loss": 0.3372, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.3092214246272778, |
| "grad_norm": 0.1116155121257842, |
| "learning_rate": 2.400431902721037e-05, |
| "loss": 0.332, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.326891220320265, |
| "grad_norm": 0.10499190887903717, |
| "learning_rate": 2.3548515874775547e-05, |
| "loss": 0.3258, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.344561016013252, |
| "grad_norm": 0.11985499625363151, |
| "learning_rate": 2.3095269530372032e-05, |
| "loss": 0.3356, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.36223081170624, |
| "grad_norm": 0.10946277088364416, |
| "learning_rate": 2.264465043529768e-05, |
| "loss": 0.3339, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.379900607399227, |
| "grad_norm": 0.10946281010542962, |
| "learning_rate": 2.2196728622536304e-05, |
| "loss": 0.3324, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.3975704030922143, |
| "grad_norm": 0.10470866039213844, |
| "learning_rate": 2.175157370587348e-05, |
| "loss": 0.3333, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.4152401987852015, |
| "grad_norm": 0.11496566123138528, |
| "learning_rate": 2.130925486907752e-05, |
| "loss": 0.3299, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.4329099944781887, |
| "grad_norm": 0.10141893666492717, |
| "learning_rate": 2.0869840855147286e-05, |
| "loss": 0.3415, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.4505797901711763, |
| "grad_norm": 0.11362446781863374, |
| "learning_rate": 2.0433399955628443e-05, |
| "loss": 0.3325, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.4682495858641635, |
| "grad_norm": 0.09706953644786295, |
| "learning_rate": 2.0000000000000012e-05, |
| "loss": 0.3385, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.4859193815571508, |
| "grad_norm": 0.10343474750084429, |
| "learning_rate": 1.956970834513259e-05, |
| "loss": 0.3324, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.503589177250138, |
| "grad_norm": 0.1115997799981753, |
| "learning_rate": 1.914259186482008e-05, |
| "loss": 0.3304, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.5212589729431256, |
| "grad_norm": 0.100665947226005, |
| "learning_rate": 1.8718716939386543e-05, |
| "loss": 0.341, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.5389287686361124, |
| "grad_norm": 0.1112389145071717, |
| "learning_rate": 1.829814944536963e-05, |
| "loss": 0.3311, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.5565985643291, |
| "grad_norm": 0.10508913738753005, |
| "learning_rate": 1.7880954745282425e-05, |
| "loss": 0.3262, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.5742683600220873, |
| "grad_norm": 0.10715189168423515, |
| "learning_rate": 1.7467197677455118e-05, |
| "loss": 0.3387, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.5919381557150745, |
| "grad_norm": 0.12309528662432599, |
| "learning_rate": 1.7056942545958167e-05, |
| "loss": 0.3272, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.609607951408062, |
| "grad_norm": 0.09503224614910906, |
| "learning_rate": 1.6650253110608415e-05, |
| "loss": 0.3361, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.627277747101049, |
| "grad_norm": 0.10336176601010666, |
| "learning_rate": 1.6247192577059943e-05, |
| "loss": 0.3394, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.6449475427940365, |
| "grad_norm": 0.10325570712107657, |
| "learning_rate": 1.5847823586980897e-05, |
| "loss": 0.3329, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.6626173384870238, |
| "grad_norm": 0.09392755935325038, |
| "learning_rate": 1.545220820831811e-05, |
| "loss": 0.3273, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.680287134180011, |
| "grad_norm": 0.09365769242973894, |
| "learning_rate": 1.5060407925650662e-05, |
| "loss": 0.3366, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.697956929872998, |
| "grad_norm": 0.11814878497030305, |
| "learning_rate": 1.4672483630634414e-05, |
| "loss": 0.3365, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.7156267255659854, |
| "grad_norm": 0.08867405697122034, |
| "learning_rate": 1.4288495612538427e-05, |
| "loss": 0.3344, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.733296521258973, |
| "grad_norm": 0.09623647972155307, |
| "learning_rate": 1.3908503548875167e-05, |
| "loss": 0.334, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.7509663169519603, |
| "grad_norm": 0.10214752499383144, |
| "learning_rate": 1.3532566496125634e-05, |
| "loss": 0.3319, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.7686361126449475, |
| "grad_norm": 0.09305403874654943, |
| "learning_rate": 1.3160742880561204e-05, |
| "loss": 0.3327, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.7863059083379347, |
| "grad_norm": 0.09492686150258188, |
| "learning_rate": 1.2793090489163218e-05, |
| "loss": 0.3276, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.803975704030922, |
| "grad_norm": 0.08712346665776856, |
| "learning_rate": 1.242966646064212e-05, |
| "loss": 0.3378, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.8216454997239095, |
| "grad_norm": 0.09302902241662848, |
| "learning_rate": 1.2070527276557092e-05, |
| "loss": 0.3276, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.8393152954168968, |
| "grad_norm": 0.09984664869036139, |
| "learning_rate": 1.1715728752538103e-05, |
| "loss": 0.335, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.856985091109884, |
| "grad_norm": 0.08136111333958188, |
| "learning_rate": 1.1365326029611263e-05, |
| "loss": 0.325, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.874654886802871, |
| "grad_norm": 0.09067392795608281, |
| "learning_rate": 1.1019373565629094e-05, |
| "loss": 0.3326, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.8923246824958584, |
| "grad_norm": 0.1151467415131904, |
| "learning_rate": 1.0677925126806956e-05, |
| "loss": 0.3338, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.909994478188846, |
| "grad_norm": 0.08540355002710472, |
| "learning_rate": 1.0341033779366931e-05, |
| "loss": 0.3281, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.9276642738818333, |
| "grad_norm": 0.08890862244357016, |
| "learning_rate": 1.0008751881290628e-05, |
| "loss": 0.3279, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.9453340695748205, |
| "grad_norm": 0.08886465646079035, |
| "learning_rate": 9.681131074181876e-06, |
| "loss": 0.3331, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.9630038652678077, |
| "grad_norm": 0.08382860075616222, |
| "learning_rate": 9.358222275240884e-06, |
| "loss": 0.3301, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.980673660960795, |
| "grad_norm": 0.08493360832822018, |
| "learning_rate": 9.040075669350905e-06, |
| "loss": 0.3321, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.9983434566537825, |
| "grad_norm": 0.12681148128353786, |
| "learning_rate": 8.72674070127881e-06, |
| "loss": 0.4956, |
| "step": 226 |
| }, |
| { |
| "epoch": 4.017669795692988, |
| "grad_norm": 0.11963433162616376, |
| "learning_rate": 8.418266067990588e-06, |
| "loss": 0.3171, |
| "step": 227 |
| }, |
| { |
| "epoch": 4.035339591385974, |
| "grad_norm": 0.11337795603837922, |
| "learning_rate": 8.114699711083113e-06, |
| "loss": 0.3207, |
| "step": 228 |
| }, |
| { |
| "epoch": 4.053009387078962, |
| "grad_norm": 0.08872533010955261, |
| "learning_rate": 7.816088809333266e-06, |
| "loss": 0.3165, |
| "step": 229 |
| }, |
| { |
| "epoch": 4.070679182771949, |
| "grad_norm": 0.09241216784587171, |
| "learning_rate": 7.52247977136574e-06, |
| "loss": 0.328, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.0883489784649365, |
| "grad_norm": 0.09282854796647831, |
| "learning_rate": 7.233918228440324e-06, |
| "loss": 0.3119, |
| "step": 231 |
| }, |
| { |
| "epoch": 4.106018774157924, |
| "grad_norm": 0.10157632018918776, |
| "learning_rate": 6.950449027360213e-06, |
| "loss": 0.3182, |
| "step": 232 |
| }, |
| { |
| "epoch": 4.123688569850911, |
| "grad_norm": 0.10277617383975274, |
| "learning_rate": 6.6721162235020476e-06, |
| "loss": 0.319, |
| "step": 233 |
| }, |
| { |
| "epoch": 4.141358365543899, |
| "grad_norm": 0.10056534251093886, |
| "learning_rate": 6.398963073969144e-06, |
| "loss": 0.3171, |
| "step": 234 |
| }, |
| { |
| "epoch": 4.159028161236885, |
| "grad_norm": 0.09505805141388292, |
| "learning_rate": 6.1310320308686354e-06, |
| "loss": 0.3147, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.176697956929873, |
| "grad_norm": 0.09268654061240998, |
| "learning_rate": 5.868364734713776e-06, |
| "loss": 0.3191, |
| "step": 236 |
| }, |
| { |
| "epoch": 4.194367752622861, |
| "grad_norm": 0.08706624460622792, |
| "learning_rate": 5.611002007952389e-06, |
| "loss": 0.3208, |
| "step": 237 |
| }, |
| { |
| "epoch": 4.212037548315847, |
| "grad_norm": 0.08951679698719879, |
| "learning_rate": 5.358983848622452e-06, |
| "loss": 0.3172, |
| "step": 238 |
| }, |
| { |
| "epoch": 4.229707344008835, |
| "grad_norm": 0.09131830865477111, |
| "learning_rate": 5.112349424135788e-06, |
| "loss": 0.3164, |
| "step": 239 |
| }, |
| { |
| "epoch": 4.247377139701822, |
| "grad_norm": 0.08750009357915652, |
| "learning_rate": 4.871137065190854e-06, |
| "loss": 0.3106, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.2650469353948095, |
| "grad_norm": 0.0850783896741063, |
| "learning_rate": 4.635384259815614e-06, |
| "loss": 0.3169, |
| "step": 241 |
| }, |
| { |
| "epoch": 4.282716731087797, |
| "grad_norm": 0.08475820034880008, |
| "learning_rate": 4.405127647541259e-06, |
| "loss": 0.3196, |
| "step": 242 |
| }, |
| { |
| "epoch": 4.300386526780784, |
| "grad_norm": 0.08682717964250186, |
| "learning_rate": 4.180403013707963e-06, |
| "loss": 0.3109, |
| "step": 243 |
| }, |
| { |
| "epoch": 4.318056322473772, |
| "grad_norm": 0.08602559336503131, |
| "learning_rate": 3.961245283903239e-06, |
| "loss": 0.3118, |
| "step": 244 |
| }, |
| { |
| "epoch": 4.335726118166758, |
| "grad_norm": 0.08629377402093397, |
| "learning_rate": 3.747688518534003e-06, |
| "loss": 0.3153, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.353395913859746, |
| "grad_norm": 0.08163055098829672, |
| "learning_rate": 3.5397659075330748e-06, |
| "loss": 0.3139, |
| "step": 246 |
| }, |
| { |
| "epoch": 4.371065709552734, |
| "grad_norm": 0.07572785159509317, |
| "learning_rate": 3.3375097652009526e-06, |
| "loss": 0.313, |
| "step": 247 |
| }, |
| { |
| "epoch": 4.38873550524572, |
| "grad_norm": 0.08030300979420142, |
| "learning_rate": 3.140951525183691e-06, |
| "loss": 0.3154, |
| "step": 248 |
| }, |
| { |
| "epoch": 4.406405300938708, |
| "grad_norm": 0.07708287661350902, |
| "learning_rate": 2.950121735587654e-06, |
| "loss": 0.3168, |
| "step": 249 |
| }, |
| { |
| "epoch": 4.424075096631695, |
| "grad_norm": 0.09203966729547491, |
| "learning_rate": 2.765050054231835e-06, |
| "loss": 0.314, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.4417448923246825, |
| "grad_norm": 0.08041597052637767, |
| "learning_rate": 2.5857652440386404e-06, |
| "loss": 0.3197, |
| "step": 251 |
| }, |
| { |
| "epoch": 4.45941468801767, |
| "grad_norm": 0.08037954632735851, |
| "learning_rate": 2.4122951685636674e-06, |
| "loss": 0.3185, |
| "step": 252 |
| }, |
| { |
| "epoch": 4.477084483710657, |
| "grad_norm": 0.07294469295727943, |
| "learning_rate": 2.244666787665297e-06, |
| "loss": 0.3198, |
| "step": 253 |
| }, |
| { |
| "epoch": 4.494754279403645, |
| "grad_norm": 0.07372400441204935, |
| "learning_rate": 2.0829061533147322e-06, |
| "loss": 0.3125, |
| "step": 254 |
| }, |
| { |
| "epoch": 4.512424075096631, |
| "grad_norm": 0.07696683693078588, |
| "learning_rate": 1.927038405547106e-06, |
| "loss": 0.3153, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.530093870789619, |
| "grad_norm": 0.07761898190749474, |
| "learning_rate": 1.7770877685543687e-06, |
| "loss": 0.3164, |
| "step": 256 |
| }, |
| { |
| "epoch": 4.547763666482607, |
| "grad_norm": 0.07405297885815232, |
| "learning_rate": 1.6330775469204895e-06, |
| "loss": 0.3165, |
| "step": 257 |
| }, |
| { |
| "epoch": 4.565433462175593, |
| "grad_norm": 0.07194679469449752, |
| "learning_rate": 1.495030121999519e-06, |
| "loss": 0.3174, |
| "step": 258 |
| }, |
| { |
| "epoch": 4.583103257868581, |
| "grad_norm": 0.07372588870054934, |
| "learning_rate": 1.3629669484372722e-06, |
| "loss": 0.3125, |
| "step": 259 |
| }, |
| { |
| "epoch": 4.600773053561568, |
| "grad_norm": 0.0736484855706964, |
| "learning_rate": 1.2369085508368862e-06, |
| "loss": 0.3117, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.6184428492545555, |
| "grad_norm": 0.06929860862646713, |
| "learning_rate": 1.1168745205690202e-06, |
| "loss": 0.3188, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.636112644947543, |
| "grad_norm": 0.06998294296555901, |
| "learning_rate": 1.0028835127270553e-06, |
| "loss": 0.3111, |
| "step": 262 |
| }, |
| { |
| "epoch": 4.65378244064053, |
| "grad_norm": 0.07010415717148563, |
| "learning_rate": 8.949532432278185e-07, |
| "loss": 0.3157, |
| "step": 263 |
| }, |
| { |
| "epoch": 4.671452236333518, |
| "grad_norm": 0.07068983473178043, |
| "learning_rate": 7.93100486058247e-07, |
| "loss": 0.322, |
| "step": 264 |
| }, |
| { |
| "epoch": 4.689122032026504, |
| "grad_norm": 0.07415661278084447, |
| "learning_rate": 6.973410706684691e-07, |
| "loss": 0.3122, |
| "step": 265 |
| }, |
| { |
| "epoch": 4.706791827719492, |
| "grad_norm": 0.07147977962991126, |
| "learning_rate": 6.076898795116792e-07, |
| "loss": 0.3162, |
| "step": 266 |
| }, |
| { |
| "epoch": 4.72446162341248, |
| "grad_norm": 0.07009164551218153, |
| "learning_rate": 5.241608457311565e-07, |
| "loss": 0.3174, |
| "step": 267 |
| }, |
| { |
| "epoch": 4.742131419105466, |
| "grad_norm": 0.07130206559582739, |
| "learning_rate": 4.467669509948591e-07, |
| "loss": 0.3114, |
| "step": 268 |
| }, |
| { |
| "epoch": 4.759801214798454, |
| "grad_norm": 0.06774782729285217, |
| "learning_rate": 3.7552022347788766e-07, |
| "loss": 0.3138, |
| "step": 269 |
| }, |
| { |
| "epoch": 4.777471010491441, |
| "grad_norm": 0.07138164733421887, |
| "learning_rate": 3.104317359931175e-07, |
| "loss": 0.3209, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.7951408061844285, |
| "grad_norm": 0.06931798887191189, |
| "learning_rate": 2.5151160427029584e-07, |
| "loss": 0.3171, |
| "step": 271 |
| }, |
| { |
| "epoch": 4.812810601877416, |
| "grad_norm": 0.07008995274999451, |
| "learning_rate": 1.9876898538394362e-07, |
| "loss": 0.311, |
| "step": 272 |
| }, |
| { |
| "epoch": 4.830480397570403, |
| "grad_norm": 0.06835397987857686, |
| "learning_rate": 1.522120763301782e-07, |
| "loss": 0.323, |
| "step": 273 |
| }, |
| { |
| "epoch": 4.848150193263391, |
| "grad_norm": 0.06982710466662823, |
| "learning_rate": 1.1184811275279483e-07, |
| "loss": 0.3209, |
| "step": 274 |
| }, |
| { |
| "epoch": 4.865819988956377, |
| "grad_norm": 0.06719415454501948, |
| "learning_rate": 7.76833678187261e-08, |
| "loss": 0.3125, |
| "step": 275 |
| }, |
| { |
| "epoch": 4.883489784649365, |
| "grad_norm": 0.06985964768247895, |
| "learning_rate": 4.9723151243106225e-08, |
| "loss": 0.3192, |
| "step": 276 |
| }, |
| { |
| "epoch": 4.901159580342353, |
| "grad_norm": 0.06760152861997361, |
| "learning_rate": 2.797180846405567e-08, |
| "loss": 0.3176, |
| "step": 277 |
| }, |
| { |
| "epoch": 4.918829376035339, |
| "grad_norm": 0.07336231991253046, |
| "learning_rate": 1.2432719967350182e-08, |
| "loss": 0.3201, |
| "step": 278 |
| }, |
| { |
| "epoch": 4.936499171728327, |
| "grad_norm": 0.06901399516812774, |
| "learning_rate": 3.108300761005545e-09, |
| "loss": 0.3193, |
| "step": 279 |
| }, |
| { |
| "epoch": 4.954168967421314, |
| "grad_norm": 0.06958592985440362, |
| "learning_rate": 0.0, |
| "loss": 0.3178, |
| "step": 280 |
| }, |
| { |
| "epoch": 4.954168967421314, |
| "step": 280, |
| "total_flos": 7.445251410192499e+18, |
| "train_loss": 0.3906520079289164, |
| "train_runtime": 65492.582, |
| "train_samples_per_second": 2.212, |
| "train_steps_per_second": 0.004 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 280, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.445251410192499e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|