{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998592210229939, "eval_steps": 100, "global_step": 3195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009385265133740028, "grad_norm": 0.4665657877922058, "learning_rate": 1.6000000000000001e-06, "loss": 2.9241, "step": 10 }, { "epoch": 0.018770530267480056, "grad_norm": 0.36958208680152893, "learning_rate": 3.4000000000000005e-06, "loss": 2.9288, "step": 20 }, { "epoch": 0.028155795401220086, "grad_norm": 0.3911522626876831, "learning_rate": 5.2e-06, "loss": 2.8784, "step": 30 }, { "epoch": 0.03754106053496011, "grad_norm": 0.6396021842956543, "learning_rate": 7.2e-06, "loss": 2.9314, "step": 40 }, { "epoch": 0.04692632566870014, "grad_norm": 0.5952326059341431, "learning_rate": 9.2e-06, "loss": 2.8987, "step": 50 }, { "epoch": 0.05631159080244017, "grad_norm": 0.570318341255188, "learning_rate": 1.1200000000000001e-05, "loss": 2.7701, "step": 60 }, { "epoch": 0.0656968559361802, "grad_norm": 0.5945647358894348, "learning_rate": 1.32e-05, "loss": 2.7628, "step": 70 }, { "epoch": 0.07508212106992022, "grad_norm": 0.5424015522003174, "learning_rate": 1.52e-05, "loss": 2.8025, "step": 80 }, { "epoch": 0.08446738620366025, "grad_norm": 0.5131893157958984, "learning_rate": 1.7199999999999998e-05, "loss": 2.6489, "step": 90 }, { "epoch": 0.09385265133740028, "grad_norm": 0.550221860408783, "learning_rate": 1.9200000000000003e-05, "loss": 2.6995, "step": 100 }, { "epoch": 0.09385265133740028, "eval_loss": 2.63047194480896, "eval_runtime": 937.5041, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 100 }, { "epoch": 0.1032379164711403, "grad_norm": 0.42704904079437256, "learning_rate": 2.12e-05, "loss": 2.6395, "step": 110 }, { "epoch": 0.11262318160488034, "grad_norm": 0.44245535135269165, "learning_rate": 2.32e-05, "loss": 2.6048, "step": 120 }, { "epoch": 0.12200844673862037, "grad_norm": 0.3578210473060608, "learning_rate": 2.5200000000000003e-05, "loss": 2.5695, "step": 130 }, { "epoch": 0.1313937118723604, "grad_norm": 0.5043906569480896, "learning_rate": 2.7200000000000004e-05, "loss": 2.5239, "step": 140 }, { "epoch": 0.14077897700610043, "grad_norm": 0.4712482988834381, "learning_rate": 2.9199999999999998e-05, "loss": 2.4199, "step": 150 }, { "epoch": 0.15016424213984045, "grad_norm": 0.5192570090293884, "learning_rate": 3.12e-05, "loss": 2.5526, "step": 160 }, { "epoch": 0.15954950727358047, "grad_norm": 0.48030856251716614, "learning_rate": 3.32e-05, "loss": 2.4876, "step": 170 }, { "epoch": 0.1689347724073205, "grad_norm": 0.9590848088264465, "learning_rate": 3.52e-05, "loss": 2.439, "step": 180 }, { "epoch": 0.17832003754106054, "grad_norm": 0.5423749685287476, "learning_rate": 3.72e-05, "loss": 2.3547, "step": 190 }, { "epoch": 0.18770530267480057, "grad_norm": 0.4835156798362732, "learning_rate": 3.9200000000000004e-05, "loss": 2.4199, "step": 200 }, { "epoch": 0.18770530267480057, "eval_loss": 2.3978819847106934, "eval_runtime": 937.0546, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 200 }, { "epoch": 0.1970905678085406, "grad_norm": 0.7263341546058655, "learning_rate": 4.12e-05, "loss": 2.4937, "step": 210 }, { "epoch": 0.2064758329422806, "grad_norm": 0.6794207096099854, "learning_rate": 4.32e-05, "loss": 2.3535, "step": 220 }, { "epoch": 0.21586109807602064, "grad_norm": 0.6615617275238037, "learning_rate": 4.52e-05, "loss": 2.3753, "step": 230 }, { "epoch": 0.22524636320976069, 
"grad_norm": 0.7630943059921265, "learning_rate": 4.72e-05, "loss": 2.3971, "step": 240 }, { "epoch": 0.2346316283435007, "grad_norm": 0.7536003589630127, "learning_rate": 4.92e-05, "loss": 2.3282, "step": 250 }, { "epoch": 0.24401689347724073, "grad_norm": 1.0338555574417114, "learning_rate": 5.1200000000000004e-05, "loss": 2.333, "step": 260 }, { "epoch": 0.25340215861098075, "grad_norm": 0.7328284382820129, "learning_rate": 5.3200000000000006e-05, "loss": 2.3966, "step": 270 }, { "epoch": 0.2627874237447208, "grad_norm": 0.8608214855194092, "learning_rate": 5.520000000000001e-05, "loss": 2.2628, "step": 280 }, { "epoch": 0.2721726888784608, "grad_norm": 0.8262203931808472, "learning_rate": 5.72e-05, "loss": 2.3092, "step": 290 }, { "epoch": 0.28155795401220085, "grad_norm": 0.8401615619659424, "learning_rate": 5.92e-05, "loss": 2.2722, "step": 300 }, { "epoch": 0.28155795401220085, "eval_loss": 2.217956066131592, "eval_runtime": 936.7717, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 300 }, { "epoch": 0.29094321914594085, "grad_norm": 0.7364845275878906, "learning_rate": 6.12e-05, "loss": 2.2179, "step": 310 }, { "epoch": 0.3003284842796809, "grad_norm": 0.8354003429412842, "learning_rate": 6.3e-05, "loss": 2.3654, "step": 320 }, { "epoch": 0.30971374941342095, "grad_norm": 1.0776605606079102, "learning_rate": 6.500000000000001e-05, "loss": 2.2143, "step": 330 }, { "epoch": 0.31909901454716094, "grad_norm": 0.8842081427574158, "learning_rate": 6.7e-05, "loss": 2.2705, "step": 340 }, { "epoch": 0.328484279680901, "grad_norm": 1.1752427816390991, "learning_rate": 6.9e-05, "loss": 2.2334, "step": 350 }, { "epoch": 0.337869544814641, "grad_norm": 0.7227160334587097, "learning_rate": 7.1e-05, "loss": 2.228, "step": 360 }, { "epoch": 0.34725480994838104, "grad_norm": 1.1053409576416016, "learning_rate": 7.3e-05, "loss": 2.2071, "step": 370 }, { "epoch": 0.3566400750821211, "grad_norm": 0.9307537078857422, "learning_rate": 7.500000000000001e-05, "loss": 2.2334, "step": 380 }, { "epoch": 0.3660253402158611, "grad_norm": 0.9264342188835144, "learning_rate": 7.7e-05, "loss": 2.2815, "step": 390 }, { "epoch": 0.37541060534960113, "grad_norm": 1.0281509160995483, "learning_rate": 7.900000000000001e-05, "loss": 2.0762, "step": 400 }, { "epoch": 0.37541060534960113, "eval_loss": 2.12506103515625, "eval_runtime": 936.7185, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 400 }, { "epoch": 0.38479587048334113, "grad_norm": 0.979566216468811, "learning_rate": 8.1e-05, "loss": 2.1801, "step": 410 }, { "epoch": 0.3941811356170812, "grad_norm": 1.4208861589431763, "learning_rate": 8.3e-05, "loss": 2.263, "step": 420 }, { "epoch": 0.40356640075082123, "grad_norm": 1.0267932415008545, "learning_rate": 8.5e-05, "loss": 2.1086, "step": 430 }, { "epoch": 0.4129516658845612, "grad_norm": 1.010489583015442, "learning_rate": 8.7e-05, "loss": 2.1647, "step": 440 }, { "epoch": 0.4223369310183013, "grad_norm": 0.784968912601471, "learning_rate": 8.900000000000001e-05, "loss": 2.2024, "step": 450 }, { "epoch": 0.43172219615204127, "grad_norm": 1.1498106718063354, "learning_rate": 9.1e-05, "loss": 2.0905, "step": 460 }, { "epoch": 0.4411074612857813, "grad_norm": 0.7570444345474243, "learning_rate": 9.300000000000001e-05, "loss": 2.0965, "step": 470 }, { "epoch": 0.45049272641952137, "grad_norm": 1.162133812904358, "learning_rate": 9.5e-05, "loss": 2.1354, "step": 480 }, { "epoch": 0.45987799155326137, "grad_norm": 0.963750422000885, "learning_rate": 9.7e-05, "loss": 
2.1286, "step": 490 }, { "epoch": 0.4692632566870014, "grad_norm": 0.9272373914718628, "learning_rate": 9.900000000000001e-05, "loss": 1.9652, "step": 500 }, { "epoch": 0.4692632566870014, "eval_loss": 2.085766077041626, "eval_runtime": 937.6717, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 500 }, { "epoch": 0.4786485218207414, "grad_norm": 0.7409124374389648, "learning_rate": 9.999915070025401e-05, "loss": 2.099, "step": 510 }, { "epoch": 0.48803378695448146, "grad_norm": 0.8032079339027405, "learning_rate": 9.999235647539953e-05, "loss": 2.1874, "step": 520 }, { "epoch": 0.4974190520882215, "grad_norm": 1.00968599319458, "learning_rate": 9.997876894893606e-05, "loss": 2.2589, "step": 530 }, { "epoch": 0.5068043172219615, "grad_norm": 0.7752834558486938, "learning_rate": 9.995838996722914e-05, "loss": 2.1808, "step": 540 }, { "epoch": 0.5161895823557016, "grad_norm": 0.8999218940734863, "learning_rate": 9.993122229951354e-05, "loss": 2.2034, "step": 550 }, { "epoch": 0.5255748474894416, "grad_norm": 0.8991349339485168, "learning_rate": 9.989726963751682e-05, "loss": 2.1284, "step": 560 }, { "epoch": 0.5349601126231815, "grad_norm": 0.9258519411087036, "learning_rate": 9.985653659495773e-05, "loss": 2.0642, "step": 570 }, { "epoch": 0.5443453777569216, "grad_norm": 0.7985184788703918, "learning_rate": 9.980902870691931e-05, "loss": 1.9404, "step": 580 }, { "epoch": 0.5537306428906617, "grad_norm": 0.9277619123458862, "learning_rate": 9.975475242909667e-05, "loss": 1.9928, "step": 590 }, { "epoch": 0.5631159080244017, "grad_norm": 0.8406923413276672, "learning_rate": 9.969371513691982e-05, "loss": 2.1893, "step": 600 }, { "epoch": 0.5631159080244017, "eval_loss": 2.0628976821899414, "eval_runtime": 937.447, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 600 }, { "epoch": 0.5725011731581418, "grad_norm": 0.678254246711731, "learning_rate": 9.962592512455138e-05, "loss": 2.1181, "step": 610 }, { "epoch": 0.5818864382918817, "grad_norm": 0.7161825895309448, "learning_rate": 9.955139160375959e-05, "loss": 2.1328, "step": 620 }, { "epoch": 0.5912717034256217, "grad_norm": 0.7554607391357422, "learning_rate": 9.947012470266645e-05, "loss": 2.0865, "step": 630 }, { "epoch": 0.6006569685593618, "grad_norm": 0.9590722322463989, "learning_rate": 9.938213546437154e-05, "loss": 2.1012, "step": 640 }, { "epoch": 0.6100422336931018, "grad_norm": 0.8377043008804321, "learning_rate": 9.928743584545132e-05, "loss": 2.1155, "step": 650 }, { "epoch": 0.6194274988268419, "grad_norm": 0.7528682947158813, "learning_rate": 9.91860387143345e-05, "loss": 2.2455, "step": 660 }, { "epoch": 0.6288127639605818, "grad_norm": 0.7164850234985352, "learning_rate": 9.907795784955327e-05, "loss": 2.155, "step": 670 }, { "epoch": 0.6381980290943219, "grad_norm": 0.7665808200836182, "learning_rate": 9.896320793787106e-05, "loss": 2.0722, "step": 680 }, { "epoch": 0.6475832942280619, "grad_norm": 0.8012193441390991, "learning_rate": 9.884180457228678e-05, "loss": 2.1045, "step": 690 }, { "epoch": 0.656968559361802, "grad_norm": 0.8847366571426392, "learning_rate": 9.871376424991589e-05, "loss": 2.0153, "step": 700 }, { "epoch": 0.656968559361802, "eval_loss": 2.047290802001953, "eval_runtime": 937.2712, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 700 }, { "epoch": 0.666353824495542, "grad_norm": 0.7482985854148865, "learning_rate": 9.85791043697488e-05, "loss": 1.981, "step": 710 }, { "epoch": 0.675739089629282, "grad_norm": 0.776764988899231, 
"learning_rate": 9.843784323028638e-05, "loss": 2.062, "step": 720 }, { "epoch": 0.685124354763022, "grad_norm": 0.8061379790306091, "learning_rate": 9.82900000270536e-05, "loss": 1.9919, "step": 730 }, { "epoch": 0.6945096198967621, "grad_norm": 0.7650445699691772, "learning_rate": 9.813559484999102e-05, "loss": 2.0104, "step": 740 }, { "epoch": 0.7038948850305021, "grad_norm": 1.2102171182632446, "learning_rate": 9.797464868072488e-05, "loss": 2.0407, "step": 750 }, { "epoch": 0.7132801501642422, "grad_norm": 0.7476488351821899, "learning_rate": 9.780718338971591e-05, "loss": 1.8859, "step": 760 }, { "epoch": 0.7226654152979821, "grad_norm": 0.7931784391403198, "learning_rate": 9.763322173328753e-05, "loss": 2.1804, "step": 770 }, { "epoch": 0.7320506804317222, "grad_norm": 1.1580551862716675, "learning_rate": 9.745278735053343e-05, "loss": 2.1195, "step": 780 }, { "epoch": 0.7414359455654622, "grad_norm": 0.7719026803970337, "learning_rate": 9.726590476010548e-05, "loss": 1.97, "step": 790 }, { "epoch": 0.7508212106992023, "grad_norm": 0.9482008218765259, "learning_rate": 9.707259935688187e-05, "loss": 1.9911, "step": 800 }, { "epoch": 0.7508212106992023, "eval_loss": 2.0318124294281006, "eval_runtime": 937.1438, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 800 }, { "epoch": 0.7602064758329423, "grad_norm": 0.7328064441680908, "learning_rate": 9.687289740851622e-05, "loss": 2.1643, "step": 810 }, { "epoch": 0.7695917409666823, "grad_norm": 0.7916940450668335, "learning_rate": 9.666682605186835e-05, "loss": 2.166, "step": 820 }, { "epoch": 0.7789770061004223, "grad_norm": 0.9480199217796326, "learning_rate": 9.645441328931654e-05, "loss": 2.108, "step": 830 }, { "epoch": 0.7883622712341624, "grad_norm": 0.8467483520507812, "learning_rate": 9.62356879849525e-05, "loss": 2.0794, "step": 840 }, { "epoch": 0.7977475363679024, "grad_norm": 0.6742168664932251, "learning_rate": 9.601067986065909e-05, "loss": 2.1227, "step": 850 }, { "epoch": 0.8071328015016425, "grad_norm": 0.7015961408615112, "learning_rate": 9.577941949207146e-05, "loss": 2.0288, "step": 860 }, { "epoch": 0.8165180666353824, "grad_norm": 0.8289031386375427, "learning_rate": 9.556596544693951e-05, "loss": 2.037, "step": 870 }, { "epoch": 0.8259033317691225, "grad_norm": 0.8550999760627747, "learning_rate": 9.53229130894619e-05, "loss": 2.1908, "step": 880 }, { "epoch": 0.8352885969028625, "grad_norm": 0.7619972229003906, "learning_rate": 9.50737019461194e-05, "loss": 2.1759, "step": 890 }, { "epoch": 0.8446738620366026, "grad_norm": 0.7303412556648254, "learning_rate": 9.481836588141808e-05, "loss": 2.1041, "step": 900 }, { "epoch": 0.8446738620366026, "eval_loss": 2.019794225692749, "eval_runtime": 938.1446, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.505, "step": 900 }, { "epoch": 0.8540591271703426, "grad_norm": 0.6646521687507629, "learning_rate": 9.455693959216005e-05, "loss": 2.0648, "step": 910 }, { "epoch": 0.8634443923040825, "grad_norm": 0.910926878452301, "learning_rate": 9.428945860272858e-05, "loss": 2.0945, "step": 920 }, { "epoch": 0.8728296574378226, "grad_norm": 0.7175999283790588, "learning_rate": 9.401595926026077e-05, "loss": 2.0488, "step": 930 }, { "epoch": 0.8822149225715626, "grad_norm": 1.090782642364502, "learning_rate": 9.373647872970852e-05, "loss": 1.9902, "step": 940 }, { "epoch": 0.8916001877053027, "grad_norm": 1.0074050426483154, "learning_rate": 9.345105498878826e-05, "loss": 1.9974, "step": 950 }, { "epoch": 0.9009854528390427, "grad_norm": 
0.6742076277732849, "learning_rate": 9.315972682282031e-05, "loss": 2.0359, "step": 960 }, { "epoch": 0.9103707179727827, "grad_norm": 0.9036829471588135, "learning_rate": 9.286253381945837e-05, "loss": 2.1047, "step": 970 }, { "epoch": 0.9197559831065227, "grad_norm": 1.02723228931427, "learning_rate": 9.255951636331028e-05, "loss": 1.9049, "step": 980 }, { "epoch": 0.9291412482402628, "grad_norm": 0.9634993076324463, "learning_rate": 9.225071563045007e-05, "loss": 2.045, "step": 990 }, { "epoch": 0.9385265133740028, "grad_norm": 0.8380990624427795, "learning_rate": 9.193617358282277e-05, "loss": 2.0488, "step": 1000 }, { "epoch": 0.9385265133740028, "eval_loss": 2.0117270946502686, "eval_runtime": 939.385, "eval_samples_per_second": 1.008, "eval_steps_per_second": 0.505, "step": 1000 }, { "epoch": 0.9479117785077429, "grad_norm": 0.8317145705223083, "learning_rate": 9.161593296254235e-05, "loss": 2.0196, "step": 1010 }, { "epoch": 0.9572970436414828, "grad_norm": 0.9297407865524292, "learning_rate": 9.129003728608367e-05, "loss": 2.0798, "step": 1020 }, { "epoch": 0.9666823087752229, "grad_norm": 0.6938600540161133, "learning_rate": 9.095853083836902e-05, "loss": 2.1225, "step": 1030 }, { "epoch": 0.9760675739089629, "grad_norm": 0.7786221504211426, "learning_rate": 9.062145866675048e-05, "loss": 2.0098, "step": 1040 }, { "epoch": 0.985452839042703, "grad_norm": 0.7615213394165039, "learning_rate": 9.027886657488862e-05, "loss": 2.1385, "step": 1050 }, { "epoch": 0.994838104176443, "grad_norm": 0.9867497086524963, "learning_rate": 8.993080111652831e-05, "loss": 2.1039, "step": 1060 }, { "epoch": 1.004223369310183, "grad_norm": 0.747900664806366, "learning_rate": 8.95773095891727e-05, "loss": 2.104, "step": 1070 }, { "epoch": 1.013608634443923, "grad_norm": 0.6495920419692993, "learning_rate": 8.921844002765613e-05, "loss": 2.0333, "step": 1080 }, { "epoch": 1.022993899577663, "grad_norm": 0.8173234462738037, "learning_rate": 8.885424119761684e-05, "loss": 2.0524, "step": 1090 }, { "epoch": 1.0323791647114031, "grad_norm": 0.8507694602012634, "learning_rate": 8.848476258887031e-05, "loss": 1.897, "step": 1100 }, { "epoch": 1.0323791647114031, "eval_loss": 2.001809597015381, "eval_runtime": 940.385, "eval_samples_per_second": 1.007, "eval_steps_per_second": 0.504, "step": 1100 }, { "epoch": 1.0417644298451432, "grad_norm": 1.4220669269561768, "learning_rate": 8.814775911179585e-05, "loss": 1.9925, "step": 1110 }, { "epoch": 1.0511496949788832, "grad_norm": 0.7997964024543762, "learning_rate": 8.776838783200623e-05, "loss": 1.9378, "step": 1120 }, { "epoch": 1.0605349601126233, "grad_norm": 2.4897735118865967, "learning_rate": 8.738388432665424e-05, "loss": 1.855, "step": 1130 }, { "epoch": 1.069920225246363, "grad_norm": 0.8852785229682922, "learning_rate": 8.699430084469276e-05, "loss": 2.0958, "step": 1140 }, { "epoch": 1.0793054903801031, "grad_norm": 0.698052167892456, "learning_rate": 8.65996903253766e-05, "loss": 2.1623, "step": 1150 }, { "epoch": 1.0886907555138432, "grad_norm": 0.8183425068855286, "learning_rate": 8.620010639106853e-05, "loss": 2.0938, "step": 1160 }, { "epoch": 1.0980760206475833, "grad_norm": 0.8702631592750549, "learning_rate": 8.57956033399528e-05, "loss": 1.842, "step": 1170 }, { "epoch": 1.1074612857813233, "grad_norm": 0.7535381317138672, "learning_rate": 8.538623613865678e-05, "loss": 2.069, "step": 1180 }, { "epoch": 1.1168465509150634, "grad_norm": 0.7525476813316345, "learning_rate": 8.497206041478162e-05, "loss": 2.0564, "step": 1190 }, { "epoch": 
1.1262318160488034, "grad_norm": 0.80061274766922, "learning_rate": 8.455313244934324e-05, "loss": 2.0298, "step": 1200 }, { "epoch": 1.1262318160488034, "eval_loss": 1.9951938390731812, "eval_runtime": 939.3487, "eval_samples_per_second": 1.008, "eval_steps_per_second": 0.505, "step": 1200 }, { "epoch": 1.1356170811825435, "grad_norm": 0.7788612246513367, "learning_rate": 8.412950916912451e-05, "loss": 2.1235, "step": 1210 }, { "epoch": 1.1450023463162835, "grad_norm": 0.7296183705329895, "learning_rate": 8.370124813893962e-05, "loss": 2.1001, "step": 1220 }, { "epoch": 1.1543876114500236, "grad_norm": 0.9630563855171204, "learning_rate": 8.326840755381176e-05, "loss": 2.1847, "step": 1230 }, { "epoch": 1.1637728765837636, "grad_norm": 0.8061946630477905, "learning_rate": 8.283104623106525e-05, "loss": 2.0888, "step": 1240 }, { "epoch": 1.1731581417175034, "grad_norm": 0.8088260889053345, "learning_rate": 8.238922360233297e-05, "loss": 1.9784, "step": 1250 }, { "epoch": 1.1825434068512435, "grad_norm": 0.8710635900497437, "learning_rate": 8.194299970548045e-05, "loss": 1.915, "step": 1260 }, { "epoch": 1.1919286719849835, "grad_norm": 0.6845631003379822, "learning_rate": 8.149243517644745e-05, "loss": 2.2073, "step": 1270 }, { "epoch": 1.2013139371187236, "grad_norm": 0.697823166847229, "learning_rate": 8.103759124100839e-05, "loss": 2.0622, "step": 1280 }, { "epoch": 1.2106992022524636, "grad_norm": 0.7662308216094971, "learning_rate": 8.057852970645254e-05, "loss": 2.0764, "step": 1290 }, { "epoch": 1.2200844673862037, "grad_norm": 0.7569838166236877, "learning_rate": 8.011531295318526e-05, "loss": 2.0989, "step": 1300 }, { "epoch": 1.2200844673862037, "eval_loss": 1.9889544248580933, "eval_runtime": 940.2035, "eval_samples_per_second": 1.007, "eval_steps_per_second": 0.504, "step": 1300 }, { "epoch": 1.2294697325199437, "grad_norm": 0.7303356528282166, "learning_rate": 7.964800392625129e-05, "loss": 1.9281, "step": 1310 }, { "epoch": 1.2388549976536838, "grad_norm": 0.8145589232444763, "learning_rate": 7.917666612678138e-05, "loss": 2.0838, "step": 1320 }, { "epoch": 1.2482402627874238, "grad_norm": 0.9209080934524536, "learning_rate": 7.870136360336328e-05, "loss": 2.0761, "step": 1330 }, { "epoch": 1.2576255279211637, "grad_norm": 0.8099146485328674, "learning_rate": 7.822216094333847e-05, "loss": 2.098, "step": 1340 }, { "epoch": 1.267010793054904, "grad_norm": 0.7984501719474792, "learning_rate": 7.773912326402543e-05, "loss": 1.8043, "step": 1350 }, { "epoch": 1.2763960581886438, "grad_norm": 0.713470995426178, "learning_rate": 7.72523162038713e-05, "loss": 1.9197, "step": 1360 }, { "epoch": 1.2857813233223838, "grad_norm": 0.8801192045211792, "learning_rate": 7.676180591353219e-05, "loss": 2.1053, "step": 1370 }, { "epoch": 1.2951665884561239, "grad_norm": 0.7982367873191833, "learning_rate": 7.626765904688447e-05, "loss": 2.2708, "step": 1380 }, { "epoch": 1.304551853589864, "grad_norm": 0.840467095375061, "learning_rate": 7.576994275196712e-05, "loss": 2.0068, "step": 1390 }, { "epoch": 1.313937118723604, "grad_norm": 0.8295313715934753, "learning_rate": 7.526872466185742e-05, "loss": 1.8695, "step": 1400 }, { "epoch": 1.313937118723604, "eval_loss": 1.9837820529937744, "eval_runtime": 938.2236, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.505, "step": 1400 }, { "epoch": 1.323322383857344, "grad_norm": 0.7913984060287476, "learning_rate": 7.476407288548036e-05, "loss": 1.9639, "step": 1410 }, { "epoch": 1.332707648991084, "grad_norm": 0.819983720779419, 
"learning_rate": 7.425605599835361e-05, "loss": 2.0229, "step": 1420 }, { "epoch": 1.342092914124824, "grad_norm": 0.8383524417877197, "learning_rate": 7.374474303326896e-05, "loss": 1.9001, "step": 1430 }, { "epoch": 1.3514781792585642, "grad_norm": 1.1012581586837769, "learning_rate": 7.323020347091177e-05, "loss": 1.9938, "step": 1440 }, { "epoch": 1.360863444392304, "grad_norm": 0.8445969223976135, "learning_rate": 7.271250723041932e-05, "loss": 2.0726, "step": 1450 }, { "epoch": 1.370248709526044, "grad_norm": 0.8882044553756714, "learning_rate": 7.21917246598798e-05, "loss": 2.0695, "step": 1460 }, { "epoch": 1.379633974659784, "grad_norm": 0.7796063423156738, "learning_rate": 7.1667926526773e-05, "loss": 2.0607, "step": 1470 }, { "epoch": 1.3890192397935242, "grad_norm": 0.7294184565544128, "learning_rate": 7.114118400835382e-05, "loss": 2.1345, "step": 1480 }, { "epoch": 1.3984045049272642, "grad_norm": 0.7649526000022888, "learning_rate": 7.061156868198028e-05, "loss": 1.999, "step": 1490 }, { "epoch": 1.4077897700610043, "grad_norm": 0.8375232219696045, "learning_rate": 7.007915251538708e-05, "loss": 2.1573, "step": 1500 }, { "epoch": 1.4077897700610043, "eval_loss": 1.976365566253662, "eval_runtime": 937.4612, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 1500 }, { "epoch": 1.4171750351947443, "grad_norm": 0.7321649193763733, "learning_rate": 6.954400785690622e-05, "loss": 2.0845, "step": 1510 }, { "epoch": 1.4265603003284844, "grad_norm": 0.778896152973175, "learning_rate": 6.900620742563562e-05, "loss": 1.9401, "step": 1520 }, { "epoch": 1.4359455654622244, "grad_norm": 0.7842182517051697, "learning_rate": 6.846582430155783e-05, "loss": 1.8992, "step": 1530 }, { "epoch": 1.4453308305959642, "grad_norm": 0.6991093754768372, "learning_rate": 6.792293191560914e-05, "loss": 2.0625, "step": 1540 }, { "epoch": 1.4547160957297043, "grad_norm": 0.9950138330459595, "learning_rate": 6.737760403970152e-05, "loss": 2.0905, "step": 1550 }, { "epoch": 1.4641013608634443, "grad_norm": 0.6939354538917542, "learning_rate": 6.682991477669781e-05, "loss": 2.2633, "step": 1560 }, { "epoch": 1.4734866259971844, "grad_norm": 0.842707633972168, "learning_rate": 6.627993855034228e-05, "loss": 1.8811, "step": 1570 }, { "epoch": 1.4828718911309244, "grad_norm": 0.8008860945701599, "learning_rate": 6.572775009514725e-05, "loss": 1.8528, "step": 1580 }, { "epoch": 1.4922571562646645, "grad_norm": 0.7409046292304993, "learning_rate": 6.517342444623784e-05, "loss": 1.9773, "step": 1590 }, { "epoch": 1.5016424213984045, "grad_norm": 0.7854930758476257, "learning_rate": 6.461703692915553e-05, "loss": 2.0183, "step": 1600 }, { "epoch": 1.5016424213984045, "eval_loss": 1.9713027477264404, "eval_runtime": 938.8142, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.505, "step": 1600 }, { "epoch": 1.5110276865321446, "grad_norm": 0.8054217100143433, "learning_rate": 6.405866314962252e-05, "loss": 2.1303, "step": 1610 }, { "epoch": 1.5204129516658846, "grad_norm": 0.7017131447792053, "learning_rate": 6.349837898326784e-05, "loss": 2.0846, "step": 1620 }, { "epoch": 1.5297982167996245, "grad_norm": 0.8393527865409851, "learning_rate": 6.293626056531693e-05, "loss": 1.8327, "step": 1630 }, { "epoch": 1.5391834819333647, "grad_norm": 0.8798466920852661, "learning_rate": 6.237238428024572e-05, "loss": 1.8657, "step": 1640 }, { "epoch": 1.5485687470671046, "grad_norm": 0.7530277371406555, "learning_rate": 6.180682675140121e-05, "loss": 2.245, "step": 1650 }, { "epoch": 
1.5579540122008448, "grad_norm": 0.7642443776130676, "learning_rate": 6.123966483058916e-05, "loss": 1.9058, "step": 1660 }, { "epoch": 1.5673392773345847, "grad_norm": 0.7459161281585693, "learning_rate": 6.067097558763106e-05, "loss": 1.9482, "step": 1670 }, { "epoch": 1.5767245424683247, "grad_norm": 0.7460825443267822, "learning_rate": 6.0100836299891314e-05, "loss": 2.127, "step": 1680 }, { "epoch": 1.5861098076020648, "grad_norm": 0.710259735584259, "learning_rate": 5.9529324441776314e-05, "loss": 2.1407, "step": 1690 }, { "epoch": 1.5954950727358048, "grad_norm": 0.7227075695991516, "learning_rate": 5.8956517674206605e-05, "loss": 1.9229, "step": 1700 }, { "epoch": 1.5954950727358048, "eval_loss": 1.967227816581726, "eval_runtime": 937.7597, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 1700 }, { "epoch": 1.6048803378695449, "grad_norm": 1.0246224403381348, "learning_rate": 5.838249383406387e-05, "loss": 2.0563, "step": 1710 }, { "epoch": 1.6142656030032847, "grad_norm": 0.8386335968971252, "learning_rate": 5.780733092361388e-05, "loss": 1.8553, "step": 1720 }, { "epoch": 1.623650868137025, "grad_norm": 0.7936443090438843, "learning_rate": 5.723110709990707e-05, "loss": 2.1631, "step": 1730 }, { "epoch": 1.6330361332707648, "grad_norm": 0.7047923803329468, "learning_rate": 5.6653900664157934e-05, "loss": 1.9989, "step": 1740 }, { "epoch": 1.642421398404505, "grad_norm": 0.8624520897865295, "learning_rate": 5.6075790051105023e-05, "loss": 2.0198, "step": 1750 }, { "epoch": 1.651806663538245, "grad_norm": 0.8698344826698303, "learning_rate": 5.5496853818352614e-05, "loss": 2.1045, "step": 1760 }, { "epoch": 1.661191928671985, "grad_norm": 0.78273606300354, "learning_rate": 5.491717063569582e-05, "loss": 1.9399, "step": 1770 }, { "epoch": 1.670577193805725, "grad_norm": 0.7217704057693481, "learning_rate": 5.433681927443043e-05, "loss": 2.0161, "step": 1780 }, { "epoch": 1.679962458939465, "grad_norm": 0.7136641144752502, "learning_rate": 5.375587859664885e-05, "loss": 2.1437, "step": 1790 }, { "epoch": 1.689347724073205, "grad_norm": 0.7752694487571716, "learning_rate": 5.317442754452379e-05, "loss": 1.9732, "step": 1800 }, { "epoch": 1.689347724073205, "eval_loss": 1.9616819620132446, "eval_runtime": 938.0777, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 1800 }, { "epoch": 1.698732989206945, "grad_norm": 0.7387381196022034, "learning_rate": 5.2592545129581185e-05, "loss": 1.8547, "step": 1810 }, { "epoch": 1.7081182543406852, "grad_norm": 0.9277381300926208, "learning_rate": 5.2010310421963415e-05, "loss": 1.8679, "step": 1820 }, { "epoch": 1.717503519474425, "grad_norm": 0.7474396824836731, "learning_rate": 5.142780253968481e-05, "loss": 2.1186, "step": 1830 }, { "epoch": 1.7268887846081653, "grad_norm": 0.8091953992843628, "learning_rate": 5.084510063788056e-05, "loss": 2.0762, "step": 1840 }, { "epoch": 1.7362740497419051, "grad_norm": 0.7326928973197937, "learning_rate": 5.02622838980505e-05, "loss": 1.9626, "step": 1850 }, { "epoch": 1.7456593148756452, "grad_norm": 0.6803217530250549, "learning_rate": 4.967943151729945e-05, "loss": 1.8606, "step": 1860 }, { "epoch": 1.7550445800093852, "grad_norm": 0.7370252013206482, "learning_rate": 4.9096622697575394e-05, "loss": 1.9649, "step": 1870 }, { "epoch": 1.7644298451431253, "grad_norm": 0.7405309677124023, "learning_rate": 4.851393663490689e-05, "loss": 2.0119, "step": 1880 }, { "epoch": 1.7738151102768653, "grad_norm": 0.8256239891052246, "learning_rate": 
4.793145250864151e-05, "loss": 1.9313, "step": 1890 }, { "epoch": 1.7832003754106054, "grad_norm": 0.8547274470329285, "learning_rate": 4.7349249470686266e-05, "loss": 1.6835, "step": 1900 }, { "epoch": 1.7832003754106054, "eval_loss": 1.9573733806610107, "eval_runtime": 937.9887, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 1900 }, { "epoch": 1.7925856405443454, "grad_norm": 0.9643909335136414, "learning_rate": 4.676740663475198e-05, "loss": 1.8968, "step": 1910 }, { "epoch": 1.8019709056780853, "grad_norm": 0.8356963992118835, "learning_rate": 4.6186003065602827e-05, "loss": 1.9651, "step": 1920 }, { "epoch": 1.8113561708118255, "grad_norm": 0.7793695330619812, "learning_rate": 4.560511776831235e-05, "loss": 2.0038, "step": 1930 }, { "epoch": 1.8207414359455654, "grad_norm": 0.7343499660491943, "learning_rate": 4.502482967752786e-05, "loss": 1.7593, "step": 1940 }, { "epoch": 1.8301267010793056, "grad_norm": 0.7215515971183777, "learning_rate": 4.444521764674411e-05, "loss": 2.0668, "step": 1950 }, { "epoch": 1.8395119662130455, "grad_norm": 0.8646712303161621, "learning_rate": 4.3866360437588294e-05, "loss": 1.9422, "step": 1960 }, { "epoch": 1.8488972313467855, "grad_norm": 0.8201174736022949, "learning_rate": 4.328833670911724e-05, "loss": 2.2159, "step": 1970 }, { "epoch": 1.8582824964805256, "grad_norm": 0.8474441766738892, "learning_rate": 4.2711225007128765e-05, "loss": 2.0485, "step": 1980 }, { "epoch": 1.8676677616142656, "grad_norm": 0.679102897644043, "learning_rate": 4.213510375348837e-05, "loss": 1.9853, "step": 1990 }, { "epoch": 1.8770530267480057, "grad_norm": 0.7910708785057068, "learning_rate": 4.15600512354726e-05, "loss": 1.9874, "step": 2000 }, { "epoch": 1.8770530267480057, "eval_loss": 1.9538992643356323, "eval_runtime": 937.8684, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 2000 }, { "epoch": 1.8864382918817455, "grad_norm": 0.6856018900871277, "learning_rate": 4.0986145595131055e-05, "loss": 1.9927, "step": 2010 }, { "epoch": 1.8958235570154858, "grad_norm": 0.7743241786956787, "learning_rate": 4.041346481866768e-05, "loss": 1.9437, "step": 2020 }, { "epoch": 1.9052088221492256, "grad_norm": 0.7838016748428345, "learning_rate": 3.9842086725843625e-05, "loss": 1.953, "step": 2030 }, { "epoch": 1.9145940872829659, "grad_norm": 0.7300184965133667, "learning_rate": 3.9272088959402534e-05, "loss": 2.0461, "step": 2040 }, { "epoch": 1.9239793524167057, "grad_norm": 0.8169785141944885, "learning_rate": 3.8703548974519874e-05, "loss": 2.1075, "step": 2050 }, { "epoch": 1.9333646175504458, "grad_norm": 0.724233090877533, "learning_rate": 3.8136544028277894e-05, "loss": 1.897, "step": 2060 }, { "epoch": 1.9427498826841858, "grad_norm": 0.7821764945983887, "learning_rate": 3.757115116916727e-05, "loss": 2.1728, "step": 2070 }, { "epoch": 1.9521351478179259, "grad_norm": 0.7325600981712341, "learning_rate": 3.7007447226617366e-05, "loss": 1.9865, "step": 2080 }, { "epoch": 1.961520412951666, "grad_norm": 0.7352250814437866, "learning_rate": 3.6445508800556036e-05, "loss": 2.0352, "step": 2090 }, { "epoch": 1.970905678085406, "grad_norm": 0.6944911479949951, "learning_rate": 3.5885412251000745e-05, "loss": 1.7607, "step": 2100 }, { "epoch": 1.970905678085406, "eval_loss": 1.9512391090393066, "eval_runtime": 935.8673, "eval_samples_per_second": 1.012, "eval_steps_per_second": 0.506, "step": 2100 }, { "epoch": 1.980290943219146, "grad_norm": 1.345037579536438, "learning_rate": 3.532723368768228e-05, "loss": 1.8189, "step": 
2110 }, { "epoch": 1.9896762083528858, "grad_norm": 0.8876499533653259, "learning_rate": 3.477104895970234e-05, "loss": 2.0414, "step": 2120 }, { "epoch": 1.999061473486626, "grad_norm": 0.832295835018158, "learning_rate": 3.4216933645226776e-05, "loss": 1.9307, "step": 2130 }, { "epoch": 2.008446738620366, "grad_norm": 0.7709248661994934, "learning_rate": 3.3664963041215406e-05, "loss": 2.0266, "step": 2140 }, { "epoch": 2.017832003754106, "grad_norm": 0.8151468634605408, "learning_rate": 3.311521215319021e-05, "loss": 1.9638, "step": 2150 }, { "epoch": 2.027217268887846, "grad_norm": 0.731529951095581, "learning_rate": 3.256775568504305e-05, "loss": 1.9209, "step": 2160 }, { "epoch": 2.0366025340215863, "grad_norm": 0.7643239498138428, "learning_rate": 3.202266802888439e-05, "loss": 2.0578, "step": 2170 }, { "epoch": 2.045987799155326, "grad_norm": 0.7743598222732544, "learning_rate": 3.148002325493445e-05, "loss": 1.9511, "step": 2180 }, { "epoch": 2.055373064289066, "grad_norm": 0.7745042443275452, "learning_rate": 3.0939895101457916e-05, "loss": 1.9773, "step": 2190 }, { "epoch": 2.0647583294228062, "grad_norm": 0.8307595252990723, "learning_rate": 3.0402356964744027e-05, "loss": 1.9459, "step": 2200 }, { "epoch": 2.0647583294228062, "eval_loss": 1.947997808456421, "eval_runtime": 937.6638, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 2200 }, { "epoch": 2.074143594556546, "grad_norm": 0.7969174981117249, "learning_rate": 2.986748188913287e-05, "loss": 1.8188, "step": 2210 }, { "epoch": 2.0835288596902863, "grad_norm": 2.2084882259368896, "learning_rate": 2.9335342557089668e-05, "loss": 1.8969, "step": 2220 }, { "epoch": 2.092914124824026, "grad_norm": 0.7004146575927734, "learning_rate": 2.8806011279328256e-05, "loss": 1.7638, "step": 2230 }, { "epoch": 2.1022993899577664, "grad_norm": 0.6949226260185242, "learning_rate": 2.827955998498482e-05, "loss": 2.1413, "step": 2240 }, { "epoch": 2.1116846550915063, "grad_norm": 0.733250081539154, "learning_rate": 2.775606021184396e-05, "loss": 2.0401, "step": 2250 }, { "epoch": 2.1210699202252465, "grad_norm": 0.845836341381073, "learning_rate": 2.7235583096617346e-05, "loss": 1.9184, "step": 2260 }, { "epoch": 2.1304551853589864, "grad_norm": 0.7574362754821777, "learning_rate": 2.6718199365277397e-05, "loss": 2.0152, "step": 2270 }, { "epoch": 2.139840450492726, "grad_norm": 0.9965910911560059, "learning_rate": 2.6203979323446454e-05, "loss": 1.8746, "step": 2280 }, { "epoch": 2.1492257156264665, "grad_norm": 0.8181989192962646, "learning_rate": 2.5692992846843206e-05, "loss": 1.8114, "step": 2290 }, { "epoch": 2.1586109807602063, "grad_norm": 0.9191744327545166, "learning_rate": 2.5185309371787513e-05, "loss": 1.7611, "step": 2300 }, { "epoch": 2.1586109807602063, "eval_loss": 1.9463104009628296, "eval_runtime": 936.8197, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 2300 }, { "epoch": 2.1679962458939466, "grad_norm": 0.7393763065338135, "learning_rate": 2.468099788576482e-05, "loss": 1.9138, "step": 2310 }, { "epoch": 2.1773815110276864, "grad_norm": 0.8604278564453125, "learning_rate": 2.418012691805191e-05, "loss": 1.9187, "step": 2320 }, { "epoch": 2.1867667761614267, "grad_norm": 0.8256701827049255, "learning_rate": 2.3682764530404365e-05, "loss": 1.9313, "step": 2330 }, { "epoch": 2.1961520412951665, "grad_norm": 0.8674435019493103, "learning_rate": 2.3188978307808125e-05, "loss": 2.1127, "step": 2340 }, { "epoch": 2.2055373064289068, "grad_norm": 0.8510717153549194, 
"learning_rate": 2.2698835349295472e-05, "loss": 1.9931, "step": 2350 }, { "epoch": 2.2149225715626466, "grad_norm": 0.8951923847198486, "learning_rate": 2.2212402258827115e-05, "loss": 1.8811, "step": 2360 }, { "epoch": 2.224307836696387, "grad_norm": 0.7499418258666992, "learning_rate": 2.172974513624176e-05, "loss": 1.9194, "step": 2370 }, { "epoch": 2.2336931018301267, "grad_norm": 0.7999364137649536, "learning_rate": 2.1250929568273774e-05, "loss": 1.9925, "step": 2380 }, { "epoch": 2.2430783669638665, "grad_norm": 0.8435518145561218, "learning_rate": 2.0776020619641024e-05, "loss": 1.9746, "step": 2390 }, { "epoch": 2.252463632097607, "grad_norm": 0.8621445298194885, "learning_rate": 2.0305082824203343e-05, "loss": 1.8491, "step": 2400 }, { "epoch": 2.252463632097607, "eval_loss": 1.9441428184509277, "eval_runtime": 935.9838, "eval_samples_per_second": 1.012, "eval_steps_per_second": 0.506, "step": 2400 }, { "epoch": 2.2618488972313466, "grad_norm": 0.8402264714241028, "learning_rate": 1.9838180176193178e-05, "loss": 1.9554, "step": 2410 }, { "epoch": 2.271234162365087, "grad_norm": 0.8322455883026123, "learning_rate": 1.9375376121519807e-05, "loss": 1.9463, "step": 2420 }, { "epoch": 2.2806194274988267, "grad_norm": 0.8249323964118958, "learning_rate": 1.891673354914761e-05, "loss": 1.8215, "step": 2430 }, { "epoch": 2.290004692632567, "grad_norm": 0.772278904914856, "learning_rate": 1.8462314782550578e-05, "loss": 1.9064, "step": 2440 }, { "epoch": 2.299389957766307, "grad_norm": 0.7047111392021179, "learning_rate": 1.8012181571243097e-05, "loss": 2.0491, "step": 2450 }, { "epoch": 2.308775222900047, "grad_norm": 0.9021138548851013, "learning_rate": 1.756639508238922e-05, "loss": 1.9212, "step": 2460 }, { "epoch": 2.318160488033787, "grad_norm": 0.9528132677078247, "learning_rate": 1.7125015892490753e-05, "loss": 2.0436, "step": 2470 }, { "epoch": 2.327545753167527, "grad_norm": 0.826156735420227, "learning_rate": 1.668810397915568e-05, "loss": 1.9951, "step": 2480 }, { "epoch": 2.336931018301267, "grad_norm": 0.7792008519172668, "learning_rate": 1.6255718712948143e-05, "loss": 1.9846, "step": 2490 }, { "epoch": 2.346316283435007, "grad_norm": 0.8387865424156189, "learning_rate": 1.5827918849320567e-05, "loss": 1.9121, "step": 2500 }, { "epoch": 2.346316283435007, "eval_loss": 1.9427493810653687, "eval_runtime": 936.6706, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 2500 }, { "epoch": 2.355701548568747, "grad_norm": 0.7230735421180725, "learning_rate": 1.5404762520629724e-05, "loss": 1.8782, "step": 2510 }, { "epoch": 2.365086813702487, "grad_norm": 0.7646508812904358, "learning_rate": 1.4986307228237268e-05, "loss": 1.8842, "step": 2520 }, { "epoch": 2.3744720788362272, "grad_norm": 0.7921754121780396, "learning_rate": 1.4572609834695971e-05, "loss": 2.1265, "step": 2530 }, { "epoch": 2.383857343969967, "grad_norm": 0.8904575109481812, "learning_rate": 1.4163726556023054e-05, "loss": 1.8978, "step": 2540 }, { "epoch": 2.3932426091037073, "grad_norm": 0.9155052900314331, "learning_rate": 1.3759712954060921e-05, "loss": 1.8854, "step": 2550 }, { "epoch": 2.402627874237447, "grad_norm": 0.8808565139770508, "learning_rate": 1.3360623928927291e-05, "loss": 1.8698, "step": 2560 }, { "epoch": 2.4120131393711874, "grad_norm": 0.9437252879142761, "learning_rate": 1.2966513711554744e-05, "loss": 1.7782, "step": 2570 }, { "epoch": 2.4213984045049273, "grad_norm": 0.853032112121582, "learning_rate": 1.2577435856321668e-05, "loss": 1.953, "step": 2580 }, { "epoch": 
2.430783669638667, "grad_norm": 0.8684459328651428, "learning_rate": 1.219344323377482e-05, "loss": 2.2737, "step": 2590 }, { "epoch": 2.4401689347724074, "grad_norm": 0.7905233502388, "learning_rate": 1.1814588023444878e-05, "loss": 1.8849, "step": 2600 }, { "epoch": 2.4401689347724074, "eval_loss": 1.941327452659607, "eval_runtime": 937.1681, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 2600 }, { "epoch": 2.449554199906147, "grad_norm": 0.769902765750885, "learning_rate": 1.1440921706756092e-05, "loss": 2.057, "step": 2610 }, { "epoch": 2.4589394650398875, "grad_norm": 0.8708339333534241, "learning_rate": 1.1072495060030418e-05, "loss": 1.8389, "step": 2620 }, { "epoch": 2.4683247301736273, "grad_norm": 0.7585152387619019, "learning_rate": 1.0709358147587884e-05, "loss": 1.9889, "step": 2630 }, { "epoch": 2.4777099953073676, "grad_norm": 0.8954083919525146, "learning_rate": 1.0351560314943392e-05, "loss": 2.0466, "step": 2640 }, { "epoch": 2.4870952604411074, "grad_norm": 0.8597133755683899, "learning_rate": 9.999150182101319e-06, "loss": 1.8554, "step": 2650 }, { "epoch": 2.4964805255748477, "grad_norm": 0.7284257411956787, "learning_rate": 9.652175636948807e-06, "loss": 1.9854, "step": 2660 }, { "epoch": 2.5058657907085875, "grad_norm": 0.8639576435089111, "learning_rate": 9.310683828748251e-06, "loss": 1.924, "step": 2670 }, { "epoch": 2.5152510558423273, "grad_norm": 0.8123131990432739, "learning_rate": 8.974721161730553e-06, "loss": 1.9737, "step": 2680 }, { "epoch": 2.5246363209760676, "grad_norm": 0.8097216486930847, "learning_rate": 8.64433328878917e-06, "loss": 2.0566, "step": 2690 }, { "epoch": 2.534021586109808, "grad_norm": 0.8345467448234558, "learning_rate": 8.319565105276678e-06, "loss": 2.0679, "step": 2700 }, { "epoch": 2.534021586109808, "eval_loss": 1.9400410652160645, "eval_runtime": 937.8634, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 2700 }, { "epoch": 2.5434068512435477, "grad_norm": 0.8246074318885803, "learning_rate": 8.000460742903987e-06, "loss": 2.0611, "step": 2710 }, { "epoch": 2.5527921163772875, "grad_norm": 0.8071450591087341, "learning_rate": 7.687063563743413e-06, "loss": 1.8266, "step": 2720 }, { "epoch": 2.562177381511028, "grad_norm": 0.8657225966453552, "learning_rate": 7.379416154336455e-06, "loss": 2.0888, "step": 2730 }, { "epoch": 2.5715626466447676, "grad_norm": 0.914188027381897, "learning_rate": 7.077560319906695e-06, "loss": 1.8913, "step": 2740 }, { "epoch": 2.580947911778508, "grad_norm": 1.0282979011535645, "learning_rate": 6.781537078679134e-06, "loss": 2.0157, "step": 2750 }, { "epoch": 2.5903331769122477, "grad_norm": 0.7493969202041626, "learning_rate": 6.491386656306319e-06, "loss": 2.0123, "step": 2760 }, { "epoch": 2.5997184420459876, "grad_norm": 0.8242475986480713, "learning_rate": 6.2071484804021475e-06, "loss": 1.8812, "step": 2770 }, { "epoch": 2.609103707179728, "grad_norm": 0.9322003722190857, "learning_rate": 5.928861175184336e-06, "loss": 1.9338, "step": 2780 }, { "epoch": 2.618488972313468, "grad_norm": 0.7862038016319275, "learning_rate": 5.656562556225692e-06, "loss": 2.0133, "step": 2790 }, { "epoch": 2.627874237447208, "grad_norm": 0.7466335892677307, "learning_rate": 5.3902896253156365e-06, "loss": 1.9908, "step": 2800 }, { "epoch": 2.627874237447208, "eval_loss": 1.939355492591858, "eval_runtime": 937.8925, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.505, "step": 2800 }, { "epoch": 2.6372595025809478, "grad_norm": 0.7764114737510681, 
"learning_rate": 5.13007856543209e-06, "loss": 1.9348, "step": 2810 }, { "epoch": 2.646644767714688, "grad_norm": 0.8519911766052246, "learning_rate": 4.875964735824645e-06, "loss": 1.9125, "step": 2820 }, { "epoch": 2.656030032848428, "grad_norm": 0.8778414726257324, "learning_rate": 4.627982667209818e-06, "loss": 2.0829, "step": 2830 }, { "epoch": 2.665415297982168, "grad_norm": 0.7963272929191589, "learning_rate": 4.386166057078639e-06, "loss": 1.989, "step": 2840 }, { "epoch": 2.674800563115908, "grad_norm": 0.790169894695282, "learning_rate": 4.150547765117746e-06, "loss": 1.9568, "step": 2850 }, { "epoch": 2.684185828249648, "grad_norm": 0.8890179395675659, "learning_rate": 3.921159808744085e-06, "loss": 2.104, "step": 2860 }, { "epoch": 2.693571093383388, "grad_norm": 0.8634796738624573, "learning_rate": 3.698033358754205e-06, "loss": 2.0033, "step": 2870 }, { "epoch": 2.7029563585171283, "grad_norm": 0.7101777791976929, "learning_rate": 3.481198735088581e-06, "loss": 1.9882, "step": 2880 }, { "epoch": 2.712341623650868, "grad_norm": 0.8689360022544861, "learning_rate": 3.270685402711471e-06, "loss": 1.9517, "step": 2890 }, { "epoch": 2.721726888784608, "grad_norm": 0.7733302712440491, "learning_rate": 3.0665219676071057e-06, "loss": 1.9557, "step": 2900 }, { "epoch": 2.721726888784608, "eval_loss": 1.9387511014938354, "eval_runtime": 937.6722, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.506, "step": 2900 }, { "epoch": 2.7311121539183483, "grad_norm": 0.8051707148551941, "learning_rate": 2.8687361728924056e-06, "loss": 1.9718, "step": 2910 }, { "epoch": 2.740497419052088, "grad_norm": 0.8936708569526672, "learning_rate": 2.6773548950471572e-06, "loss": 2.0474, "step": 2920 }, { "epoch": 2.7498826841858284, "grad_norm": 0.7301138639450073, "learning_rate": 2.492404140261795e-06, "loss": 1.9602, "step": 2930 }, { "epoch": 2.759267949319568, "grad_norm": 0.7741363644599915, "learning_rate": 2.3139090409034946e-06, "loss": 2.0386, "step": 2940 }, { "epoch": 2.768653214453308, "grad_norm": 0.771488606929779, "learning_rate": 2.1418938521010954e-06, "loss": 1.9822, "step": 2950 }, { "epoch": 2.7780384795870483, "grad_norm": 0.7883902788162231, "learning_rate": 1.9763819484490355e-06, "loss": 2.0061, "step": 2960 }, { "epoch": 2.7874237447207886, "grad_norm": 6.553975582122803, "learning_rate": 1.8173958208311526e-06, "loss": 1.9747, "step": 2970 }, { "epoch": 2.7968090098545284, "grad_norm": 0.8313648104667664, "learning_rate": 1.6649570733643982e-06, "loss": 2.0416, "step": 2980 }, { "epoch": 2.8061942749882682, "grad_norm": 0.867647647857666, "learning_rate": 1.5190864204631672e-06, "loss": 1.8462, "step": 2990 }, { "epoch": 2.8155795401220085, "grad_norm": 0.740106999874115, "learning_rate": 1.3798036840244667e-06, "loss": 1.9627, "step": 3000 }, { "epoch": 2.8155795401220085, "eval_loss": 1.9384320974349976, "eval_runtime": 936.8374, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 3000 }, { "epoch": 2.8249648052557483, "grad_norm": 0.7124555706977844, "learning_rate": 1.2471277907343703e-06, "loss": 2.1284, "step": 3010 }, { "epoch": 2.8343500703894886, "grad_norm": 0.809935986995697, "learning_rate": 1.1210767694961655e-06, "loss": 2.0541, "step": 3020 }, { "epoch": 2.8437353355232284, "grad_norm": 0.801823079586029, "learning_rate": 1.0016677489804171e-06, "loss": 1.9914, "step": 3030 }, { "epoch": 2.8531206006569687, "grad_norm": 0.7602815628051758, "learning_rate": 8.88916955297453e-07, "loss": 1.9802, "step": 3040 }, { "epoch": 
2.8625058657907085, "grad_norm": 0.7440112829208374, "learning_rate": 7.8283970979241e-07, "loss": 1.9312, "step": 3050 }, { "epoch": 2.871891130924449, "grad_norm": 2.742288112640381, "learning_rate": 6.834504269632835e-07, "loss": 2.03, "step": 3060 }, { "epoch": 2.8812763960581886, "grad_norm": 0.815190315246582, "learning_rate": 5.907626125022159e-07, "loss": 1.9215, "step": 3070 }, { "epoch": 2.8906616611919285, "grad_norm": 0.8376407027244568, "learning_rate": 5.04788861460187e-07, "loss": 1.8776, "step": 3080 }, { "epoch": 2.9000469263256687, "grad_norm": 0.9095739126205444, "learning_rate": 4.255408565355612e-07, "loss": 1.9567, "step": 3090 }, { "epoch": 2.9094321914594086, "grad_norm": 0.858132541179657, "learning_rate": 3.530293664865514e-07, "loss": 1.8339, "step": 3100 }, { "epoch": 2.9094321914594086, "eval_loss": 1.9382846355438232, "eval_runtime": 937.0394, "eval_samples_per_second": 1.011, "eval_steps_per_second": 0.506, "step": 3100 }, { "epoch": 2.918817456593149, "grad_norm": 0.851668119430542, "learning_rate": 2.872642446678897e-07, "loss": 2.1549, "step": 3110 }, { "epoch": 2.9282027217268887, "grad_norm": 1.059181571006775, "learning_rate": 2.2825442769188188e-07, "loss": 2.0087, "step": 3120 }, { "epoch": 2.937587986860629, "grad_norm": 1.1176784038543701, "learning_rate": 1.7600793421402307e-07, "loss": 1.9762, "step": 3130 }, { "epoch": 2.9469732519943688, "grad_norm": 0.7648433446884155, "learning_rate": 1.305318638434083e-07, "loss": 1.9598, "step": 3140 }, { "epoch": 2.956358517128109, "grad_norm": 0.8515041470527649, "learning_rate": 9.183239617795436e-08, "loss": 2.0792, "step": 3150 }, { "epoch": 2.965743782261849, "grad_norm": 3.4551055431365967, "learning_rate": 5.991478996468236e-08, "loss": 1.9946, "step": 3160 }, { "epoch": 2.9751290473955887, "grad_norm": 0.7828194499015808, "learning_rate": 3.4783382385139565e-08, "loss": 1.9363, "step": 3170 }, { "epoch": 2.984514312529329, "grad_norm": 0.9214401245117188, "learning_rate": 1.644158846600963e-08, "loss": 1.9451, "step": 3180 }, { "epoch": 2.9938995776630692, "grad_norm": 0.8548945188522339, "learning_rate": 4.8919006150727195e-09, "loss": 1.8411, "step": 3190 }, { "epoch": 2.998592210229939, "step": 3195, "total_flos": 1.0917373877893988e+19, "train_loss": 2.069366264641751, "train_runtime": 97122.8592, "train_samples_per_second": 0.263, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 3195, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.0917373877893988e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }
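
The object above appears to be the `trainer_state.json` written by the Hugging Face `transformers` `Trainer`: per-step training entries carry `loss`, and the periodic evaluation entries carry `eval_loss`, both inside `log_history`. The sketch below is one way to pull those two curves out for inspection; the filename `trainer_state.json` is an assumption, not part of the file itself.

```python
# Minimal sketch: split the "log_history" array above into the training-loss
# and eval-loss series. The path "trainer_state.json" is assumed here.
import json

with open("trainer_state.json") as f:  # assumed location of the state shown above
    state = json.load(f)

# Training entries have a "loss" key; evaluation entries have "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_curve)} training points, {len(eval_curve)} eval points")
print("final eval points:", eval_curve[-3:])
```

The final summary entry (with `train_loss`, `train_runtime`, `total_flos`) has neither key, so it is skipped by both filters.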