{ "best_metric": null, "best_model_checkpoint": null, "epoch": 64.48, "eval_steps": 500, "global_step": 4030, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "grad_norm": 2.246424436569214, "learning_rate": 2.3573200992555833e-06, "loss": 2.826, "step": 20 }, { "epoch": 0.64, "grad_norm": 0.9050242900848389, "learning_rate": 4.838709677419355e-06, "loss": 2.72, "step": 40 }, { "epoch": 0.96, "grad_norm": 2.6034655570983887, "learning_rate": 7.320099255583126e-06, "loss": 2.4912, "step": 60 }, { "epoch": 1.28, "grad_norm": 1.3487274646759033, "learning_rate": 9.801488833746898e-06, "loss": 2.0561, "step": 80 }, { "epoch": 1.6, "grad_norm": 1.6185756921768188, "learning_rate": 1.2282878411910669e-05, "loss": 1.7744, "step": 100 }, { "epoch": 1.92, "grad_norm": 3.017139196395874, "learning_rate": 1.4764267990074444e-05, "loss": 1.8387, "step": 120 }, { "epoch": 2.24, "grad_norm": 2.2100813388824463, "learning_rate": 1.7245657568238215e-05, "loss": 1.4478, "step": 140 }, { "epoch": 2.56, "grad_norm": 1.574629545211792, "learning_rate": 1.9727047146401986e-05, "loss": 1.285, "step": 160 }, { "epoch": 2.88, "grad_norm": 4.586638450622559, "learning_rate": 2.2208436724565757e-05, "loss": 1.2235, "step": 180 }, { "epoch": 3.2, "grad_norm": 2.7081515789031982, "learning_rate": 2.468982630272953e-05, "loss": 0.9575, "step": 200 }, { "epoch": 3.52, "grad_norm": 0.9670729041099548, "learning_rate": 2.7171215880893302e-05, "loss": 0.7086, "step": 220 }, { "epoch": 3.84, "grad_norm": 3.229243040084839, "learning_rate": 2.9652605459057077e-05, "loss": 0.8587, "step": 240 }, { "epoch": 4.16, "grad_norm": 1.1293463706970215, "learning_rate": 3.2133995037220844e-05, "loss": 0.5978, "step": 260 }, { "epoch": 4.48, "grad_norm": 1.7043830156326294, "learning_rate": 3.461538461538462e-05, "loss": 0.4668, "step": 280 }, { "epoch": 4.8, "grad_norm": 2.565268039703369, "learning_rate": 3.7096774193548386e-05, "loss": 0.5667, "step": 300 }, { "epoch": 5.12, "grad_norm": 1.158849835395813, "learning_rate": 3.957816377171216e-05, "loss": 0.4373, "step": 320 }, { "epoch": 5.44, "grad_norm": 2.714164972305298, "learning_rate": 4.205955334987593e-05, "loss": 0.3492, "step": 340 }, { "epoch": 5.76, "grad_norm": 2.2089672088623047, "learning_rate": 4.45409429280397e-05, "loss": 0.4018, "step": 360 }, { "epoch": 6.08, "grad_norm": 1.8179335594177246, "learning_rate": 4.702233250620348e-05, "loss": 0.279, "step": 380 }, { "epoch": 6.4, "grad_norm": 1.4858269691467285, "learning_rate": 4.950372208436725e-05, "loss": 0.2362, "step": 400 }, { "epoch": 6.72, "grad_norm": 1.7704375982284546, "learning_rate": 4.99975992459978e-05, "loss": 0.2665, "step": 420 }, { "epoch": 7.04, "grad_norm": 1.2611212730407715, "learning_rate": 4.9987846973104825e-05, "loss": 0.2029, "step": 440 }, { "epoch": 7.36, "grad_norm": 2.994542360305786, "learning_rate": 4.9970596058519116e-05, "loss": 0.1747, "step": 460 }, { "epoch": 7.68, "grad_norm": 2.7456889152526855, "learning_rate": 4.994585167909436e-05, "loss": 0.1486, "step": 480 }, { "epoch": 8.0, "grad_norm": 1.8236416578292847, "learning_rate": 4.9913621260409695e-05, "loss": 0.1866, "step": 500 }, { "epoch": 8.32, "grad_norm": 2.636003017425537, "learning_rate": 4.987391447454136e-05, "loss": 0.1476, "step": 520 }, { "epoch": 8.64, "grad_norm": 2.879154920578003, "learning_rate": 4.982674323716023e-05, "loss": 0.1403, "step": 540 }, { "epoch": 8.96, "grad_norm": 0.9377075433731079, "learning_rate": 4.977212170395598e-05, "loss": 0.1018, "step": 560 }, { "epoch": 9.28, "grad_norm": 0.311233788728714, "learning_rate": 4.9710066266389074e-05, "loss": 0.0992, "step": 580 }, { "epoch": 9.6, "grad_norm": 0.8316205739974976, "learning_rate": 4.964059554677187e-05, "loss": 0.1134, "step": 600 }, { "epoch": 9.92, "grad_norm": 2.567354679107666, "learning_rate": 4.956373039268022e-05, "loss": 0.0781, "step": 620 }, { "epoch": 10.24, "grad_norm": 0.0829504132270813, "learning_rate": 4.947949387069721e-05, "loss": 0.0892, "step": 640 }, { "epoch": 10.56, "grad_norm": 0.8588472008705139, "learning_rate": 4.938791125949119e-05, "loss": 0.0499, "step": 660 }, { "epoch": 10.88, "grad_norm": 1.2792423963546753, "learning_rate": 4.9289010042229765e-05, "loss": 0.0831, "step": 680 }, { "epoch": 11.2, "grad_norm": 0.4728279709815979, "learning_rate": 4.918281989833238e-05, "loss": 0.0715, "step": 700 }, { "epoch": 11.52, "grad_norm": 2.5855355262756348, "learning_rate": 4.9069372694563756e-05, "loss": 0.0718, "step": 720 }, { "epoch": 11.84, "grad_norm": 0.8059779405593872, "learning_rate": 4.8948702475470933e-05, "loss": 0.0849, "step": 740 }, { "epoch": 12.16, "grad_norm": 1.2841193675994873, "learning_rate": 4.882084545316684e-05, "loss": 0.0683, "step": 760 }, { "epoch": 12.48, "grad_norm": 1.3422589302062988, "learning_rate": 4.868583999646329e-05, "loss": 0.0808, "step": 780 }, { "epoch": 12.8, "grad_norm": 1.3376965522766113, "learning_rate": 4.8543726619356846e-05, "loss": 0.0607, "step": 800 }, { "epoch": 13.12, "grad_norm": 1.008899450302124, "learning_rate": 4.83945479688709e-05, "loss": 0.062, "step": 820 }, { "epoch": 13.44, "grad_norm": 0.441413551568985, "learning_rate": 4.8238348812257684e-05, "loss": 0.0461, "step": 840 }, { "epoch": 13.76, "grad_norm": 1.296985149383545, "learning_rate": 4.808349953928184e-05, "loss": 0.0482, "step": 860 }, { "epoch": 14.08, "grad_norm": 0.035805635154247284, "learning_rate": 4.791374712344622e-05, "loss": 0.0388, "step": 880 }, { "epoch": 14.4, "grad_norm": 0.10618308186531067, "learning_rate": 4.7737118485753564e-05, "loss": 0.0251, "step": 900 }, { "epoch": 14.72, "grad_norm": 0.866423487663269, "learning_rate": 4.75536666309653e-05, "loss": 0.0515, "step": 920 }, { "epoch": 15.04, "grad_norm": 0.5916399955749512, "learning_rate": 4.73634466114326e-05, "loss": 0.0536, "step": 940 }, { "epoch": 15.36, "grad_norm": 0.1653570532798767, "learning_rate": 4.7166515510575676e-05, "loss": 0.0392, "step": 960 }, { "epoch": 15.68, "grad_norm": 0.027391331270337105, "learning_rate": 4.696293242575356e-05, "loss": 0.0369, "step": 980 }, { "epoch": 16.0, "grad_norm": 2.17256760597229, "learning_rate": 4.675275845052942e-05, "loss": 0.0651, "step": 1000 }, { "epoch": 16.32, "grad_norm": 0.8612786531448364, "learning_rate": 4.6536056656336947e-05, "loss": 0.037, "step": 1020 }, { "epoch": 16.64, "grad_norm": 4.489969253540039, "learning_rate": 4.631289207355313e-05, "loss": 0.0272, "step": 1040 }, { "epoch": 16.96, "grad_norm": 0.4311043918132782, "learning_rate": 4.6083331671983185e-05, "loss": 0.0507, "step": 1060 }, { "epoch": 17.28, "grad_norm": 0.4327545762062073, "learning_rate": 4.584744434076352e-05, "loss": 0.0274, "step": 1080 }, { "epoch": 17.6, "grad_norm": 0.12099918723106384, "learning_rate": 4.560530086768863e-05, "loss": 0.0565, "step": 1100 }, { "epoch": 17.92, "grad_norm": 0.103216253221035, "learning_rate": 4.535697391796832e-05, "loss": 0.0425, "step": 1120 }, { "epoch": 18.24, "grad_norm": 0.419209748506546, "learning_rate": 4.510253801242147e-05, "loss": 0.0273, "step": 1140 }, { "epoch": 18.56, "grad_norm": 1.3193784952163696, "learning_rate": 4.4842069505112984e-05, "loss": 0.0438, "step": 1160 }, { "epoch": 18.88, "grad_norm": 1.5185387134552002, "learning_rate": 4.457564656044056e-05, "loss": 0.0544, "step": 1180 }, { "epoch": 19.2, "grad_norm": 0.4024270474910736, "learning_rate": 4.430334912967824e-05, "loss": 0.0283, "step": 1200 }, { "epoch": 19.52, "grad_norm": 0.16141988337039948, "learning_rate": 4.402525892698367e-05, "loss": 0.0393, "step": 1220 }, { "epoch": 19.84, "grad_norm": 0.07228437811136246, "learning_rate": 4.374145940487641e-05, "loss": 0.0249, "step": 1240 }, { "epoch": 20.16, "grad_norm": 0.7919737696647644, "learning_rate": 4.345203572919454e-05, "loss": 0.0293, "step": 1260 }, { "epoch": 20.48, "grad_norm": 0.26585039496421814, "learning_rate": 4.315707475353706e-05, "loss": 0.0287, "step": 1280 }, { "epoch": 20.8, "grad_norm": 0.5761149525642395, "learning_rate": 4.285666499319992e-05, "loss": 0.0521, "step": 1300 }, { "epoch": 21.12, "grad_norm": 0.018601374700665474, "learning_rate": 4.25508965986133e-05, "loss": 0.0285, "step": 1320 }, { "epoch": 21.44, "grad_norm": 0.00528874434530735, "learning_rate": 4.2239861328288214e-05, "loss": 0.0346, "step": 1340 }, { "epoch": 21.76, "grad_norm": 0.3073647618293762, "learning_rate": 4.1923652521280585e-05, "loss": 0.022, "step": 1360 }, { "epoch": 22.08, "grad_norm": 0.42911043763160706, "learning_rate": 4.160236506918098e-05, "loss": 0.0482, "step": 1380 }, { "epoch": 22.4, "grad_norm": 0.6457176804542542, "learning_rate": 4.127609538763842e-05, "loss": 0.019, "step": 1400 }, { "epoch": 22.72, "grad_norm": 2.3716557025909424, "learning_rate": 4.094494138742685e-05, "loss": 0.0312, "step": 1420 }, { "epoch": 23.04, "grad_norm": 0.01667410507798195, "learning_rate": 4.0609002445063036e-05, "loss": 0.0377, "step": 1440 }, { "epoch": 23.36, "grad_norm": 0.6381007432937622, "learning_rate": 4.02683793729844e-05, "loss": 0.0307, "step": 1460 }, { "epoch": 23.68, "grad_norm": 0.42919328808784485, "learning_rate": 3.9923174389296085e-05, "loss": 0.0419, "step": 1480 }, { "epoch": 24.0, "grad_norm": 0.01456019002944231, "learning_rate": 3.957349108709623e-05, "loss": 0.0223, "step": 1500 }, { "epoch": 24.32, "grad_norm": 0.31073492765426636, "learning_rate": 3.921943440338849e-05, "loss": 0.0209, "step": 1520 }, { "epoch": 24.64, "grad_norm": 0.38279736042022705, "learning_rate": 3.886111058759132e-05, "loss": 0.0491, "step": 1540 }, { "epoch": 24.96, "grad_norm": 0.30651962757110596, "learning_rate": 3.849862716965352e-05, "loss": 0.0298, "step": 1560 }, { "epoch": 25.28, "grad_norm": 0.4538489580154419, "learning_rate": 3.813209292778527e-05, "loss": 0.0319, "step": 1580 }, { "epoch": 25.6, "grad_norm": 0.11643072962760925, "learning_rate": 3.776161785581481e-05, "loss": 0.0302, "step": 1600 }, { "epoch": 25.92, "grad_norm": 0.008515519089996815, "learning_rate": 3.738731313018019e-05, "loss": 0.04, "step": 1620 }, { "epoch": 26.24, "grad_norm": 0.002214708598330617, "learning_rate": 3.700929107656614e-05, "loss": 0.0354, "step": 1640 }, { "epoch": 26.56, "grad_norm": 0.02200801856815815, "learning_rate": 3.662766513619611e-05, "loss": 0.0186, "step": 1660 }, { "epoch": 26.88, "grad_norm": 0.1882447600364685, "learning_rate": 3.62425498317895e-05, "loss": 0.022, "step": 1680 }, { "epoch": 27.2, "grad_norm": 0.004948125686496496, "learning_rate": 3.585406073319439e-05, "loss": 0.015, "step": 1700 }, { "epoch": 27.52, "grad_norm": 0.3387264013290405, "learning_rate": 3.546231442270596e-05, "loss": 0.0381, "step": 1720 }, { "epoch": 27.84, "grad_norm": 0.09048642963171005, "learning_rate": 3.506742846008116e-05, "loss": 0.0277, "step": 1740 }, { "epoch": 28.16, "grad_norm": 0.6405784487724304, "learning_rate": 3.4669521347259996e-05, "loss": 0.0423, "step": 1760 }, { "epoch": 28.48, "grad_norm": 0.16012047231197357, "learning_rate": 3.426871249280414e-05, "loss": 0.0115, "step": 1780 }, { "epoch": 28.8, "grad_norm": 0.3279825448989868, "learning_rate": 3.386512217606339e-05, "loss": 0.0275, "step": 1800 }, { "epoch": 29.12, "grad_norm": 0.005494344513863325, "learning_rate": 3.345887151108087e-05, "loss": 0.0309, "step": 1820 }, { "epoch": 29.44, "grad_norm": 0.0037028896622359753, "learning_rate": 3.305008241024774e-05, "loss": 0.0294, "step": 1840 }, { "epoch": 29.76, "grad_norm": 0.003084386931732297, "learning_rate": 3.2638877547718264e-05, "loss": 0.0213, "step": 1860 }, { "epoch": 30.08, "grad_norm": 0.0017954249633476138, "learning_rate": 3.222538032259643e-05, "loss": 0.0326, "step": 1880 }, { "epoch": 30.4, "grad_norm": 0.26840922236442566, "learning_rate": 3.1809714821904834e-05, "loss": 0.0249, "step": 1900 }, { "epoch": 30.72, "grad_norm": 0.7214370965957642, "learning_rate": 3.1392005783347244e-05, "loss": 0.0115, "step": 1920 }, { "epoch": 31.04, "grad_norm": 0.1613769233226776, "learning_rate": 3.0972378557875884e-05, "loss": 0.0322, "step": 1940 }, { "epoch": 31.36, "grad_norm": 0.18066717684268951, "learning_rate": 3.055095907207465e-05, "loss": 0.0316, "step": 1960 }, { "epoch": 31.68, "grad_norm": 0.24756371974945068, "learning_rate": 3.0127873790369627e-05, "loss": 0.0248, "step": 1980 }, { "epoch": 32.0, "grad_norm": 0.08604203909635544, "learning_rate": 2.9703249677078156e-05, "loss": 0.0234, "step": 2000 }, { "epoch": 32.32, "grad_norm": 0.0022385423071682453, "learning_rate": 2.9277214158307937e-05, "loss": 0.0277, "step": 2020 }, { "epoch": 32.64, "grad_norm": 0.0020592950750142336, "learning_rate": 2.8849895083717537e-05, "loss": 0.0162, "step": 2040 }, { "epoch": 32.96, "grad_norm": 0.20633552968502045, "learning_rate": 2.842142068814977e-05, "loss": 0.022, "step": 2060 }, { "epoch": 33.28, "grad_norm": 0.0019172705942764878, "learning_rate": 2.7991919553149497e-05, "loss": 0.0278, "step": 2080 }, { "epoch": 33.6, "grad_norm": 0.0013098755152896047, "learning_rate": 2.756152056837743e-05, "loss": 0.0189, "step": 2100 }, { "epoch": 33.92, "grad_norm": 0.09349821507930756, "learning_rate": 2.7130352892931388e-05, "loss": 0.0228, "step": 2120 }, { "epoch": 34.24, "grad_norm": 0.0017231553792953491, "learning_rate": 2.669854591658679e-05, "loss": 0.0319, "step": 2140 }, { "epoch": 34.56, "grad_norm": 0.047173839062452316, "learning_rate": 2.6266229220967818e-05, "loss": 0.0153, "step": 2160 }, { "epoch": 34.88, "grad_norm": 0.2877206802368164, "learning_rate": 2.5833532540661127e-05, "loss": 0.0267, "step": 2180 }, { "epoch": 35.2, "grad_norm": 0.25823402404785156, "learning_rate": 2.540058572428356e-05, "loss": 0.0178, "step": 2200 }, { "epoch": 35.52, "grad_norm": 0.23003694415092468, "learning_rate": 2.496751869551567e-05, "loss": 0.0217, "step": 2220 }, { "epoch": 35.84, "grad_norm": 0.23193888366222382, "learning_rate": 2.453446141411273e-05, "loss": 0.017, "step": 2240 }, { "epoch": 36.16, "grad_norm": 0.1941184252500534, "learning_rate": 2.4101543836904938e-05, "loss": 0.0257, "step": 2260 }, { "epoch": 36.48, "grad_norm": 0.012731954455375671, "learning_rate": 2.3668895878798424e-05, "loss": 0.0237, "step": 2280 }, { "epoch": 36.8, "grad_norm": 0.18219026923179626, "learning_rate": 2.32366473737889e-05, "loss": 0.024, "step": 2300 }, { "epoch": 37.12, "grad_norm": 0.256547212600708, "learning_rate": 2.2804928035999594e-05, "loss": 0.0225, "step": 2320 }, { "epoch": 37.44, "grad_norm": 0.45314905047416687, "learning_rate": 2.23738674207551e-05, "loss": 0.0239, "step": 2340 }, { "epoch": 37.76, "grad_norm": 0.3919714689254761, "learning_rate": 2.1943594885702984e-05, "loss": 0.0235, "step": 2360 }, { "epoch": 38.08, "grad_norm": 0.0769328773021698, "learning_rate": 2.151423955199456e-05, "loss": 0.0286, "step": 2380 }, { "epoch": 38.4, "grad_norm": 0.3520802855491638, "learning_rate": 2.108593026553681e-05, "loss": 0.0323, "step": 2400 }, { "epoch": 38.72, "grad_norm": 0.3691672384738922, "learning_rate": 2.0658795558326743e-05, "loss": 0.0241, "step": 2420 }, { "epoch": 39.04, "grad_norm": 0.001480752951465547, "learning_rate": 2.0232963609880093e-05, "loss": 0.0158, "step": 2440 }, { "epoch": 39.36, "grad_norm": 0.31921085715293884, "learning_rate": 1.9808562208765667e-05, "loss": 0.0241, "step": 2460 }, { "epoch": 39.68, "grad_norm": 0.20936931669712067, "learning_rate": 1.938571871425715e-05, "loss": 0.0174, "step": 2480 }, { "epoch": 40.0, "grad_norm": 0.0011563162552192807, "learning_rate": 1.896456001811357e-05, "loss": 0.0183, "step": 2500 }, { "epoch": 40.32, "grad_norm": 0.19230084121227264, "learning_rate": 1.854521250650026e-05, "loss": 0.012, "step": 2520 }, { "epoch": 40.64, "grad_norm": 0.32013317942619324, "learning_rate": 1.8127802022061334e-05, "loss": 0.0225, "step": 2540 }, { "epoch": 40.96, "grad_norm": 0.11989307403564453, "learning_rate": 1.7712453826155457e-05, "loss": 0.0391, "step": 2560 }, { "epoch": 41.28, "grad_norm": 0.0009496643324382603, "learning_rate": 1.72992925612659e-05, "loss": 0.0229, "step": 2580 }, { "epoch": 41.6, "grad_norm": 0.0012078011641278863, "learning_rate": 1.688844221359645e-05, "loss": 0.015, "step": 2600 }, { "epoch": 41.92, "grad_norm": 0.0012093032710254192, "learning_rate": 1.6480026075864163e-05, "loss": 0.0287, "step": 2620 }, { "epoch": 42.24, "grad_norm": 0.2027181088924408, "learning_rate": 1.6074166710300247e-05, "loss": 0.0229, "step": 2640 }, { "epoch": 42.56, "grad_norm": 0.2977555990219116, "learning_rate": 1.567098591187021e-05, "loss": 0.0352, "step": 2660 }, { "epoch": 42.88, "grad_norm": 0.36129167675971985, "learning_rate": 1.5270604671724188e-05, "loss": 0.0242, "step": 2680 }, { "epoch": 43.2, "grad_norm": 0.001115540275350213, "learning_rate": 1.4873143140888538e-05, "loss": 0.0165, "step": 2700 }, { "epoch": 43.52, "grad_norm": 0.19148553907871246, "learning_rate": 1.4478720594209532e-05, "loss": 0.0274, "step": 2720 }, { "epoch": 43.84, "grad_norm": 0.057757727801799774, "learning_rate": 1.4087455394559984e-05, "loss": 0.0185, "step": 2740 }, { "epoch": 44.16, "grad_norm": 0.0009874219540506601, "learning_rate": 1.369946495731954e-05, "loss": 0.0509, "step": 2760 }, { "epoch": 44.48, "grad_norm": 0.3896861672401428, "learning_rate": 1.3314865715139346e-05, "loss": 0.027, "step": 2780 }, { "epoch": 44.8, "grad_norm": 0.19004037976264954, "learning_rate": 1.2933773083001517e-05, "loss": 0.0163, "step": 2800 }, { "epoch": 45.12, "grad_norm": 0.0009183284710161388, "learning_rate": 1.255630142358421e-05, "loss": 0.0125, "step": 2820 }, { "epoch": 45.44, "grad_norm": 0.1238480657339096, "learning_rate": 1.2182564012942193e-05, "loss": 0.0327, "step": 2840 }, { "epoch": 45.76, "grad_norm": 0.0009572324343025684, "learning_rate": 1.1812673006513789e-05, "loss": 0.0302, "step": 2860 }, { "epoch": 46.08, "grad_norm": 0.0011610776418820024, "learning_rate": 1.14467394054639e-05, "loss": 0.0209, "step": 2880 }, { "epoch": 46.4, "grad_norm": 0.04993343725800514, "learning_rate": 1.108487302337353e-05, "loss": 0.025, "step": 2900 }, { "epoch": 46.72, "grad_norm": 0.1806841343641281, "learning_rate": 1.0727182453285647e-05, "loss": 0.0284, "step": 2920 }, { "epoch": 47.04, "grad_norm": 0.0011777572799474, "learning_rate": 1.0373775035117305e-05, "loss": 0.0174, "step": 2940 }, { "epoch": 47.36, "grad_norm": 0.14497865736484528, "learning_rate": 1.002475682344792e-05, "loss": 0.0115, "step": 2960 }, { "epoch": 47.68, "grad_norm": 0.0014984839363023639, "learning_rate": 9.680232555693067e-06, "loss": 0.0238, "step": 2980 }, { "epoch": 48.0, "grad_norm": 0.07430601865053177, "learning_rate": 9.340305620673778e-06, "loss": 0.0294, "step": 3000 }, { "epoch": 48.32, "grad_norm": 0.07801785320043564, "learning_rate": 9.005078027590375e-06, "loss": 0.0226, "step": 3020 }, { "epoch": 48.64, "grad_norm": 0.0007196432561613619, "learning_rate": 8.67465037541038e-06, "loss": 0.0196, "step": 3040 }, { "epoch": 48.96, "grad_norm": 0.0008374506141990423, "learning_rate": 8.34912182267959e-06, "loss": 0.0175, "step": 3060 }, { "epoch": 49.28, "grad_norm": 0.0010465418454259634, "learning_rate": 8.028590057765523e-06, "loss": 0.015, "step": 3080 }, { "epoch": 49.6, "grad_norm": 0.0007761380402371287, "learning_rate": 7.713151269541844e-06, "loss": 0.0221, "step": 3100 }, { "epoch": 49.92, "grad_norm": 0.0216947291046381, "learning_rate": 7.402900118522979e-06, "loss": 0.0161, "step": 3120 }, { "epoch": 50.24, "grad_norm": 0.26546710729599, "learning_rate": 7.097929708457282e-06, "loss": 0.0237, "step": 3140 }, { "epoch": 50.56, "grad_norm": 0.0011781662469729781, "learning_rate": 6.7983315583873695e-06, "loss": 0.0172, "step": 3160 }, { "epoch": 50.88, "grad_norm": 0.39518535137176514, "learning_rate": 6.504195575186009e-06, "loss": 0.0198, "step": 3180 }, { "epoch": 51.2, "grad_norm": 0.3506232500076294, "learning_rate": 6.215610026575916e-06, "loss": 0.0227, "step": 3200 }, { "epoch": 51.52, "grad_norm": 0.31244903802871704, "learning_rate": 5.93266151464123e-06, "loss": 0.0156, "step": 3220 }, { "epoch": 51.84, "grad_norm": 0.17840787768363953, "learning_rate": 5.655434949839061e-06, "loss": 0.0268, "step": 3240 }, { "epoch": 52.16, "grad_norm": 0.1670505702495575, "learning_rate": 5.384013525518541e-06, "loss": 0.0209, "step": 3260 }, { "epoch": 52.48, "grad_norm": 0.0010594127234071493, "learning_rate": 5.118478692955194e-06, "loss": 0.0202, "step": 3280 }, { "epoch": 52.8, "grad_norm": 0.0015649694250896573, "learning_rate": 4.858910136908123e-06, "loss": 0.0192, "step": 3300 }, { "epoch": 53.12, "grad_norm": 0.19762022793293, "learning_rate": 4.605385751707248e-06, "loss": 0.0205, "step": 3320 }, { "epoch": 53.44, "grad_norm": 0.2010522186756134, "learning_rate": 4.357981617877932e-06, "loss": 0.0129, "step": 3340 }, { "epoch": 53.76, "grad_norm": 0.19793441891670227, "learning_rate": 4.116771979309797e-06, "loss": 0.0258, "step": 3360 }, { "epoch": 54.08, "grad_norm": 0.2605569064617157, "learning_rate": 3.881829220976807e-06, "loss": 0.0306, "step": 3380 }, { "epoch": 54.4, "grad_norm": 0.037421807646751404, "learning_rate": 3.653223847215126e-06, "loss": 0.0198, "step": 3400 }, { "epoch": 54.72, "grad_norm": 0.0007586870342493057, "learning_rate": 3.4310244605653797e-06, "loss": 0.0257, "step": 3420 }, { "epoch": 55.04, "grad_norm": 0.27584579586982727, "learning_rate": 3.215297741185572e-06, "loss": 0.0125, "step": 3440 }, { "epoch": 55.36, "grad_norm": 0.0007228174363262951, "learning_rate": 3.0061084268410006e-06, "loss": 0.0124, "step": 3460 }, { "epoch": 55.68, "grad_norm": 0.04090801998972893, "learning_rate": 2.8035192934769362e-06, "loss": 0.023, "step": 3480 }, { "epoch": 56.0, "grad_norm": 0.3518761694431305, "learning_rate": 2.607591136380122e-06, "loss": 0.0194, "step": 3500 }, { "epoch": 56.32, "grad_norm": 0.06331823766231537, "learning_rate": 2.4183827519346308e-06, "loss": 0.0162, "step": 3520 }, { "epoch": 56.64, "grad_norm": 0.22303640842437744, "learning_rate": 2.235950919977545e-06, "loss": 0.0337, "step": 3540 }, { "epoch": 56.96, "grad_norm": 0.08465743064880371, "learning_rate": 2.0603503867598182e-06, "loss": 0.0139, "step": 3560 }, { "epoch": 57.28, "grad_norm": 0.20135080814361572, "learning_rate": 1.8916338485173823e-06, "loss": 0.0193, "step": 3580 }, { "epoch": 57.6, "grad_norm": 0.0006721566896885633, "learning_rate": 1.7298519356574727e-06, "loss": 0.0203, "step": 3600 }, { "epoch": 57.92, "grad_norm": 0.10799671709537506, "learning_rate": 1.5750531975648324e-06, "loss": 0.0212, "step": 3620 }, { "epoch": 58.24, "grad_norm": 0.0010109569411724806, "learning_rate": 1.4272840880324934e-06, "loss": 0.0173, "step": 3640 }, { "epoch": 58.56, "grad_norm": 0.0008448906592093408, "learning_rate": 1.286588951321363e-06, "loss": 0.0139, "step": 3660 }, { "epoch": 58.88, "grad_norm": 0.0010856656590476632, "learning_rate": 1.1530100088528867e-06, "loss": 0.0268, "step": 3680 }, { "epoch": 59.2, "grad_norm": 0.23958024382591248, "learning_rate": 1.0265873465387516e-06, "loss": 0.0191, "step": 3700 }, { "epoch": 59.52, "grad_norm": 0.20584586262702942, "learning_rate": 9.073589027514789e-07, "loss": 0.0168, "step": 3720 }, { "epoch": 59.84, "grad_norm": 0.031580936163663864, "learning_rate": 7.953604569393841e-07, "loss": 0.0246, "step": 3740 }, { "epoch": 60.16, "grad_norm": 0.14215555787086487, "learning_rate": 6.906256188895038e-07, "loss": 0.019, "step": 3760 }, { "epoch": 60.48, "grad_norm": 0.0012006442993879318, "learning_rate": 5.931858186415756e-07, "loss": 0.0168, "step": 3780 }, { "epoch": 60.8, "grad_norm": 0.0063135698437690735, "learning_rate": 5.03070297056149e-07, "loss": 0.0197, "step": 3800 }, { "epoch": 61.12, "grad_norm": 0.07496818155050278, "learning_rate": 4.203060970396383e-07, "loss": 0.0207, "step": 3820 }, { "epoch": 61.44, "grad_norm": 0.16551247239112854, "learning_rate": 3.4491805542899157e-07, "loss": 0.0224, "step": 3840 }, { "epoch": 61.76, "grad_norm": 0.0008456969517283142, "learning_rate": 2.769287955383532e-07, "loss": 0.0151, "step": 3860 }, { "epoch": 62.08, "grad_norm": 0.0008134017698466778, "learning_rate": 2.1635872037001626e-07, "loss": 0.0284, "step": 3880 }, { "epoch": 62.4, "grad_norm": 0.18878595530986786, "learning_rate": 1.6322600649162356e-07, "loss": 0.0217, "step": 3900 }, { "epoch": 62.72, "grad_norm": 0.0008310906123369932, "learning_rate": 1.1754659858156659e-07, "loss": 0.0103, "step": 3920 }, { "epoch": 63.04, "grad_norm": 0.38621172308921814, "learning_rate": 7.933420464410201e-08, "loss": 0.0333, "step": 3940 }, { "epoch": 63.36, "grad_norm": 0.016794312745332718, "learning_rate": 4.860029189569237e-08, "loss": 0.0231, "step": 3960 }, { "epoch": 63.68, "grad_norm": 0.16253815591335297, "learning_rate": 2.535408332381417e-08, "loss": 0.0226, "step": 3980 }, { "epoch": 64.0, "grad_norm": 0.2387680560350418, "learning_rate": 9.60255491919415e-09, "loss": 0.0218, "step": 4000 }, { "epoch": 64.32, "grad_norm": 0.16293394565582275, "learning_rate": 1.3504335823810722e-09, "loss": 0.0219, "step": 4020 }, { "epoch": 64.48, "step": 4030, "total_flos": 2.3325606118844006e+17, "train_loss": 0.1495482857003993, "train_runtime": 6882.5617, "train_samples_per_second": 4.722, "train_steps_per_second": 0.586 } ], "logging_steps": 20, "max_steps": 4030, "num_input_tokens_seen": 0, "num_train_epochs": 65, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3325606118844006e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }