{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9994747347410442, "eval_steps": 500, "global_step": 2379, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008404244143292362, "grad_norm": 0.7383340001106262, "learning_rate": 2e-05, "loss": 2.4066, "step": 10 }, { "epoch": 0.016808488286584725, "grad_norm": 0.46394309401512146, "learning_rate": 4e-05, "loss": 2.0375, "step": 20 }, { "epoch": 0.025212732429877087, "grad_norm": 0.4739478528499603, "learning_rate": 6e-05, "loss": 1.5044, "step": 30 }, { "epoch": 0.03361697657316945, "grad_norm": 0.20930196344852448, "learning_rate": 8e-05, "loss": 0.8704, "step": 40 }, { "epoch": 0.04202122071646181, "grad_norm": 0.15288038551807404, "learning_rate": 0.0001, "loss": 0.6533, "step": 50 }, { "epoch": 0.050425464859754174, "grad_norm": 0.13073962926864624, "learning_rate": 0.00012, "loss": 0.586, "step": 60 }, { "epoch": 0.058829709003046536, "grad_norm": 0.14555367827415466, "learning_rate": 0.00014, "loss": 0.5793, "step": 70 }, { "epoch": 0.0672339531463389, "grad_norm": 0.12397414445877075, "learning_rate": 0.00016, "loss": 0.581, "step": 80 }, { "epoch": 0.07563819728963127, "grad_norm": 0.13021130859851837, "learning_rate": 0.00018, "loss": 0.5512, "step": 90 }, { "epoch": 0.08404244143292362, "grad_norm": 0.13012883067131042, "learning_rate": 0.0002, "loss": 0.5403, "step": 100 }, { "epoch": 0.09244668557621599, "grad_norm": 0.11942347884178162, "learning_rate": 0.00019942313239111625, "loss": 0.5247, "step": 110 }, { "epoch": 0.10085092971950835, "grad_norm": 0.11690942198038101, "learning_rate": 0.0001988462647822325, "loss": 0.5417, "step": 120 }, { "epoch": 0.10925517386280072, "grad_norm": 0.1355101615190506, "learning_rate": 0.00019826939717334873, "loss": 0.5273, "step": 130 }, { "epoch": 0.11765941800609307, "grad_norm": 0.1345665603876114, "learning_rate": 0.00019769252956446497, "loss": 0.5243, "step": 140 }, { "epoch": 0.12606366214938544, "grad_norm": 0.12515193223953247, "learning_rate": 0.0001971156619555812, "loss": 0.5344, "step": 150 }, { "epoch": 0.1344679062926778, "grad_norm": 0.15686553716659546, "learning_rate": 0.00019653879434669745, "loss": 0.5118, "step": 160 }, { "epoch": 0.14287215043597015, "grad_norm": 0.12068944424390793, "learning_rate": 0.0001959619267378137, "loss": 0.4979, "step": 170 }, { "epoch": 0.15127639457926254, "grad_norm": 0.13319459557533264, "learning_rate": 0.00019538505912892993, "loss": 0.503, "step": 180 }, { "epoch": 0.1596806387225549, "grad_norm": 0.11806949228048325, "learning_rate": 0.00019480819152004617, "loss": 0.49, "step": 190 }, { "epoch": 0.16808488286584725, "grad_norm": 0.12932075560092926, "learning_rate": 0.00019423132391116238, "loss": 0.514, "step": 200 }, { "epoch": 0.17648912700913963, "grad_norm": 0.11743929982185364, "learning_rate": 0.00019365445630227862, "loss": 0.4788, "step": 210 }, { "epoch": 0.18489337115243198, "grad_norm": 0.11788313835859299, "learning_rate": 0.00019307758869339486, "loss": 0.4891, "step": 220 }, { "epoch": 0.19329761529572434, "grad_norm": 0.11414741724729538, "learning_rate": 0.0001925007210845111, "loss": 0.5033, "step": 230 }, { "epoch": 0.2017018594390167, "grad_norm": 0.11419043689966202, "learning_rate": 0.00019192385347562737, "loss": 0.4844, "step": 240 }, { "epoch": 0.21010610358230908, "grad_norm": 0.12788020074367523, "learning_rate": 0.0001913469858667436, "loss": 0.4697, "step": 250 }, { "epoch": 0.21851034772560143, "grad_norm": 0.13661302626132965, "learning_rate": 0.00019077011825785982, "loss": 0.4627, "step": 260 }, { "epoch": 0.2269145918688938, "grad_norm": 0.12041325867176056, "learning_rate": 0.00019019325064897606, "loss": 0.4964, "step": 270 }, { "epoch": 0.23531883601218614, "grad_norm": 0.133742094039917, "learning_rate": 0.0001896163830400923, "loss": 0.4658, "step": 280 }, { "epoch": 0.24372308015547853, "grad_norm": 0.1261977106332779, "learning_rate": 0.00018903951543120854, "loss": 0.4781, "step": 290 }, { "epoch": 0.2521273242987709, "grad_norm": 0.130150705575943, "learning_rate": 0.00018846264782232478, "loss": 0.4922, "step": 300 }, { "epoch": 0.26053156844206327, "grad_norm": 0.13174410164356232, "learning_rate": 0.00018788578021344102, "loss": 0.4559, "step": 310 }, { "epoch": 0.2689358125853556, "grad_norm": 0.1186077669262886, "learning_rate": 0.00018730891260455726, "loss": 0.4722, "step": 320 }, { "epoch": 0.277340056728648, "grad_norm": 0.116569384932518, "learning_rate": 0.0001867320449956735, "loss": 0.4457, "step": 330 }, { "epoch": 0.2857443008719403, "grad_norm": 0.12219471484422684, "learning_rate": 0.00018615517738678974, "loss": 0.4849, "step": 340 }, { "epoch": 0.2941485450152327, "grad_norm": 0.12746909260749817, "learning_rate": 0.00018557830977790598, "loss": 0.4821, "step": 350 }, { "epoch": 0.30255278915852507, "grad_norm": 0.14125944674015045, "learning_rate": 0.00018500144216902222, "loss": 0.4605, "step": 360 }, { "epoch": 0.3109570333018174, "grad_norm": 0.19157269597053528, "learning_rate": 0.00018442457456013846, "loss": 0.4541, "step": 370 }, { "epoch": 0.3193612774451098, "grad_norm": 0.12603330612182617, "learning_rate": 0.0001838477069512547, "loss": 0.4536, "step": 380 }, { "epoch": 0.32776552158840216, "grad_norm": 0.12653909623622894, "learning_rate": 0.00018327083934237091, "loss": 0.4468, "step": 390 }, { "epoch": 0.3361697657316945, "grad_norm": 0.15930472314357758, "learning_rate": 0.00018269397173348718, "loss": 0.4542, "step": 400 }, { "epoch": 0.3445740098749869, "grad_norm": 0.13266988098621368, "learning_rate": 0.00018211710412460342, "loss": 0.4335, "step": 410 }, { "epoch": 0.35297825401827926, "grad_norm": 0.12103667855262756, "learning_rate": 0.00018154023651571966, "loss": 0.4575, "step": 420 }, { "epoch": 0.3613824981615716, "grad_norm": 0.14439740777015686, "learning_rate": 0.0001809633689068359, "loss": 0.4377, "step": 430 }, { "epoch": 0.36978674230486397, "grad_norm": 0.12652407586574554, "learning_rate": 0.00018038650129795214, "loss": 0.4363, "step": 440 }, { "epoch": 0.3781909864481563, "grad_norm": 0.14594405889511108, "learning_rate": 0.00017980963368906835, "loss": 0.4306, "step": 450 }, { "epoch": 0.3865952305914487, "grad_norm": 0.12562687695026398, "learning_rate": 0.0001792327660801846, "loss": 0.4501, "step": 460 }, { "epoch": 0.39499947473474106, "grad_norm": 0.14584492146968842, "learning_rate": 0.00017865589847130083, "loss": 0.4509, "step": 470 }, { "epoch": 0.4034037188780334, "grad_norm": 0.13192500174045563, "learning_rate": 0.00017807903086241707, "loss": 0.4505, "step": 480 }, { "epoch": 0.4118079630213258, "grad_norm": 0.14266645908355713, "learning_rate": 0.00017750216325353331, "loss": 0.4585, "step": 490 }, { "epoch": 0.42021220716461816, "grad_norm": 0.1400412619113922, "learning_rate": 0.00017692529564464958, "loss": 0.4365, "step": 500 }, { "epoch": 0.4286164513079105, "grad_norm": 0.14728468656539917, "learning_rate": 0.0001763484280357658, "loss": 0.4303, "step": 510 }, { "epoch": 0.43702069545120287, "grad_norm": 0.15791365504264832, "learning_rate": 0.00017577156042688203, "loss": 0.4407, "step": 520 }, { "epoch": 0.4454249395944952, "grad_norm": 0.15447258949279785, "learning_rate": 0.00017519469281799827, "loss": 0.4365, "step": 530 }, { "epoch": 0.4538291837377876, "grad_norm": 0.1518252044916153, "learning_rate": 0.00017461782520911451, "loss": 0.4305, "step": 540 }, { "epoch": 0.46223342788107996, "grad_norm": 0.1154065877199173, "learning_rate": 0.00017404095760023075, "loss": 0.4212, "step": 550 }, { "epoch": 0.4706376720243723, "grad_norm": 0.12900012731552124, "learning_rate": 0.000173464089991347, "loss": 0.4277, "step": 560 }, { "epoch": 0.47904191616766467, "grad_norm": 0.1349458247423172, "learning_rate": 0.00017288722238246323, "loss": 0.4051, "step": 570 }, { "epoch": 0.48744616031095706, "grad_norm": 0.16337165236473083, "learning_rate": 0.00017231035477357947, "loss": 0.407, "step": 580 }, { "epoch": 0.4958504044542494, "grad_norm": 0.13420593738555908, "learning_rate": 0.0001717334871646957, "loss": 0.4138, "step": 590 }, { "epoch": 0.5042546485975418, "grad_norm": 0.13840581476688385, "learning_rate": 0.00017115661955581195, "loss": 0.4099, "step": 600 }, { "epoch": 0.5126588927408341, "grad_norm": 0.1378021389245987, "learning_rate": 0.0001705797519469282, "loss": 0.4254, "step": 610 }, { "epoch": 0.5210631368841265, "grad_norm": 0.1607150137424469, "learning_rate": 0.00017000288433804443, "loss": 0.4353, "step": 620 }, { "epoch": 0.5294673810274189, "grad_norm": 0.13462169468402863, "learning_rate": 0.00016942601672916067, "loss": 0.4267, "step": 630 }, { "epoch": 0.5378716251707112, "grad_norm": 0.14311543107032776, "learning_rate": 0.00016884914912027689, "loss": 0.4301, "step": 640 }, { "epoch": 0.5462758693140036, "grad_norm": 0.15559442341327667, "learning_rate": 0.00016827228151139313, "loss": 0.4102, "step": 650 }, { "epoch": 0.554680113457296, "grad_norm": 0.15557149052619934, "learning_rate": 0.00016769541390250937, "loss": 0.4136, "step": 660 }, { "epoch": 0.5630843576005883, "grad_norm": 0.135511115193367, "learning_rate": 0.00016711854629362563, "loss": 0.4153, "step": 670 }, { "epoch": 0.5714886017438806, "grad_norm": 0.13760776817798615, "learning_rate": 0.00016654167868474187, "loss": 0.4145, "step": 680 }, { "epoch": 0.579892845887173, "grad_norm": 0.14971590042114258, "learning_rate": 0.0001659648110758581, "loss": 0.3875, "step": 690 }, { "epoch": 0.5882970900304654, "grad_norm": 0.16005663573741913, "learning_rate": 0.00016538794346697433, "loss": 0.3938, "step": 700 }, { "epoch": 0.5967013341737577, "grad_norm": 0.1625218689441681, "learning_rate": 0.00016481107585809057, "loss": 0.3871, "step": 710 }, { "epoch": 0.6051055783170501, "grad_norm": 0.17047689855098724, "learning_rate": 0.0001642342082492068, "loss": 0.412, "step": 720 }, { "epoch": 0.6135098224603425, "grad_norm": 0.13825903832912445, "learning_rate": 0.00016365734064032305, "loss": 0.3948, "step": 730 }, { "epoch": 0.6219140666036348, "grad_norm": 0.14830929040908813, "learning_rate": 0.00016308047303143929, "loss": 0.3927, "step": 740 }, { "epoch": 0.6303183107469272, "grad_norm": 0.13950933516025543, "learning_rate": 0.00016250360542255553, "loss": 0.4051, "step": 750 }, { "epoch": 0.6387225548902196, "grad_norm": 0.15511371195316315, "learning_rate": 0.0001619267378136718, "loss": 0.4041, "step": 760 }, { "epoch": 0.6471267990335119, "grad_norm": 0.14828190207481384, "learning_rate": 0.000161349870204788, "loss": 0.3824, "step": 770 }, { "epoch": 0.6555310431768043, "grad_norm": 0.144051194190979, "learning_rate": 0.00016077300259590425, "loss": 0.3829, "step": 780 }, { "epoch": 0.6639352873200967, "grad_norm": 0.14780694246292114, "learning_rate": 0.00016019613498702049, "loss": 0.3814, "step": 790 }, { "epoch": 0.672339531463389, "grad_norm": 0.15042325854301453, "learning_rate": 0.00015961926737813673, "loss": 0.3962, "step": 800 }, { "epoch": 0.6807437756066814, "grad_norm": 0.16325107216835022, "learning_rate": 0.00015904239976925297, "loss": 0.3801, "step": 810 }, { "epoch": 0.6891480197499738, "grad_norm": 0.14843328297138214, "learning_rate": 0.0001584655321603692, "loss": 0.4082, "step": 820 }, { "epoch": 0.6975522638932661, "grad_norm": 0.16731064021587372, "learning_rate": 0.00015788866455148545, "loss": 0.4192, "step": 830 }, { "epoch": 0.7059565080365585, "grad_norm": 0.18703435361385345, "learning_rate": 0.00015731179694260169, "loss": 0.4009, "step": 840 }, { "epoch": 0.7143607521798508, "grad_norm": 0.13935630023479462, "learning_rate": 0.00015673492933371793, "loss": 0.3618, "step": 850 }, { "epoch": 0.7227649963231432, "grad_norm": 0.13263636827468872, "learning_rate": 0.00015615806172483417, "loss": 0.3963, "step": 860 }, { "epoch": 0.7311692404664355, "grad_norm": 0.14940643310546875, "learning_rate": 0.0001555811941159504, "loss": 0.3585, "step": 870 }, { "epoch": 0.7395734846097279, "grad_norm": 0.14807912707328796, "learning_rate": 0.00015500432650706665, "loss": 0.3748, "step": 880 }, { "epoch": 0.7479777287530203, "grad_norm": 0.15254080295562744, "learning_rate": 0.00015442745889818286, "loss": 0.3718, "step": 890 }, { "epoch": 0.7563819728963126, "grad_norm": 0.16590768098831177, "learning_rate": 0.0001538505912892991, "loss": 0.386, "step": 900 }, { "epoch": 0.764786217039605, "grad_norm": 0.15733902156352997, "learning_rate": 0.00015327372368041534, "loss": 0.3756, "step": 910 }, { "epoch": 0.7731904611828974, "grad_norm": 0.13757385313510895, "learning_rate": 0.00015269685607153158, "loss": 0.3843, "step": 920 }, { "epoch": 0.7815947053261897, "grad_norm": 0.14952607452869415, "learning_rate": 0.00015211998846264784, "loss": 0.3634, "step": 930 }, { "epoch": 0.7899989494694821, "grad_norm": 0.1516282558441162, "learning_rate": 0.00015154312085376408, "loss": 0.3798, "step": 940 }, { "epoch": 0.7984031936127745, "grad_norm": 0.17785628139972687, "learning_rate": 0.00015096625324488032, "loss": 0.3681, "step": 950 }, { "epoch": 0.8068074377560668, "grad_norm": 0.171351820230484, "learning_rate": 0.00015038938563599654, "loss": 0.3686, "step": 960 }, { "epoch": 0.8152116818993592, "grad_norm": 0.1742231398820877, "learning_rate": 0.00014981251802711278, "loss": 0.3792, "step": 970 }, { "epoch": 0.8236159260426515, "grad_norm": 0.16650599241256714, "learning_rate": 0.00014923565041822902, "loss": 0.3577, "step": 980 }, { "epoch": 0.8320201701859439, "grad_norm": 0.1497887670993805, "learning_rate": 0.00014865878280934526, "loss": 0.3553, "step": 990 }, { "epoch": 0.8404244143292363, "grad_norm": 0.14781557023525238, "learning_rate": 0.0001480819152004615, "loss": 0.3538, "step": 1000 }, { "epoch": 0.8488286584725286, "grad_norm": 0.15724751353263855, "learning_rate": 0.00014750504759157774, "loss": 0.3597, "step": 1010 }, { "epoch": 0.857232902615821, "grad_norm": 0.18635571002960205, "learning_rate": 0.00014692817998269398, "loss": 0.3615, "step": 1020 }, { "epoch": 0.8656371467591134, "grad_norm": 0.17742526531219482, "learning_rate": 0.00014635131237381022, "loss": 0.348, "step": 1030 }, { "epoch": 0.8740413909024057, "grad_norm": 0.20535768568515778, "learning_rate": 0.00014577444476492646, "loss": 0.3343, "step": 1040 }, { "epoch": 0.8824456350456981, "grad_norm": 0.18968522548675537, "learning_rate": 0.0001451975771560427, "loss": 0.3615, "step": 1050 }, { "epoch": 0.8908498791889904, "grad_norm": 0.1528492122888565, "learning_rate": 0.00014462070954715894, "loss": 0.3786, "step": 1060 }, { "epoch": 0.8992541233322828, "grad_norm": 0.15841075778007507, "learning_rate": 0.00014404384193827518, "loss": 0.3761, "step": 1070 }, { "epoch": 0.9076583674755752, "grad_norm": 0.15167982876300812, "learning_rate": 0.0001434669743293914, "loss": 0.3528, "step": 1080 }, { "epoch": 0.9160626116188675, "grad_norm": 0.14096671342849731, "learning_rate": 0.00014289010672050766, "loss": 0.371, "step": 1090 }, { "epoch": 0.9244668557621599, "grad_norm": 0.1579194813966751, "learning_rate": 0.0001423132391116239, "loss": 0.3491, "step": 1100 }, { "epoch": 0.9328710999054523, "grad_norm": 0.16789057850837708, "learning_rate": 0.00014173637150274014, "loss": 0.3536, "step": 1110 }, { "epoch": 0.9412753440487446, "grad_norm": 0.13980717957019806, "learning_rate": 0.00014115950389385638, "loss": 0.3423, "step": 1120 }, { "epoch": 0.949679588192037, "grad_norm": 0.19879643619060516, "learning_rate": 0.00014058263628497262, "loss": 0.3285, "step": 1130 }, { "epoch": 0.9580838323353293, "grad_norm": 0.16574440896511078, "learning_rate": 0.00014000576867608886, "loss": 0.3568, "step": 1140 }, { "epoch": 0.9664880764786217, "grad_norm": 0.15376180410385132, "learning_rate": 0.00013942890106720507, "loss": 0.3558, "step": 1150 }, { "epoch": 0.9748923206219141, "grad_norm": 0.17232170701026917, "learning_rate": 0.0001388520334583213, "loss": 0.342, "step": 1160 }, { "epoch": 0.9832965647652064, "grad_norm": 0.1959993690252304, "learning_rate": 0.00013827516584943755, "loss": 0.3458, "step": 1170 }, { "epoch": 0.9917008089084988, "grad_norm": 0.14029347896575928, "learning_rate": 0.0001376982982405538, "loss": 0.3297, "step": 1180 }, { "epoch": 1.0002101061035824, "grad_norm": 0.20758652687072754, "learning_rate": 0.00013712143063167006, "loss": 0.3642, "step": 1190 }, { "epoch": 1.0086143502468747, "grad_norm": 0.15599438548088074, "learning_rate": 0.0001365445630227863, "loss": 0.3004, "step": 1200 }, { "epoch": 1.017018594390167, "grad_norm": 0.16680683195590973, "learning_rate": 0.0001359676954139025, "loss": 0.2915, "step": 1210 }, { "epoch": 1.0254228385334594, "grad_norm": 0.1668105274438858, "learning_rate": 0.00013539082780501875, "loss": 0.2963, "step": 1220 }, { "epoch": 1.0338270826767517, "grad_norm": 0.16461539268493652, "learning_rate": 0.000134813960196135, "loss": 0.3041, "step": 1230 }, { "epoch": 1.042231326820044, "grad_norm": 0.18869394063949585, "learning_rate": 0.00013423709258725123, "loss": 0.3046, "step": 1240 }, { "epoch": 1.0506355709633364, "grad_norm": 0.16899700462818146, "learning_rate": 0.00013366022497836747, "loss": 0.2921, "step": 1250 }, { "epoch": 1.059039815106629, "grad_norm": 0.1905297338962555, "learning_rate": 0.0001330833573694837, "loss": 0.2879, "step": 1260 }, { "epoch": 1.0674440592499213, "grad_norm": 0.17273731529712677, "learning_rate": 0.00013250648976059995, "loss": 0.3038, "step": 1270 }, { "epoch": 1.0758483033932136, "grad_norm": 0.1947745531797409, "learning_rate": 0.0001319296221517162, "loss": 0.3029, "step": 1280 }, { "epoch": 1.084252547536506, "grad_norm": 0.1741725355386734, "learning_rate": 0.00013135275454283243, "loss": 0.3073, "step": 1290 }, { "epoch": 1.0926567916797982, "grad_norm": 0.18244194984436035, "learning_rate": 0.00013077588693394867, "loss": 0.287, "step": 1300 }, { "epoch": 1.1010610358230906, "grad_norm": 0.18360966444015503, "learning_rate": 0.0001301990193250649, "loss": 0.307, "step": 1310 }, { "epoch": 1.1094652799663831, "grad_norm": 0.16066686809062958, "learning_rate": 0.00012962215171618115, "loss": 0.2712, "step": 1320 }, { "epoch": 1.1178695241096754, "grad_norm": 0.16239213943481445, "learning_rate": 0.00012904528410729736, "loss": 0.2857, "step": 1330 }, { "epoch": 1.1262737682529678, "grad_norm": 0.16966617107391357, "learning_rate": 0.0001284684164984136, "loss": 0.3087, "step": 1340 }, { "epoch": 1.13467801239626, "grad_norm": 0.16753819584846497, "learning_rate": 0.00012789154888952984, "loss": 0.2852, "step": 1350 }, { "epoch": 1.1430822565395524, "grad_norm": 0.19184084236621857, "learning_rate": 0.0001273146812806461, "loss": 0.3138, "step": 1360 }, { "epoch": 1.1514865006828447, "grad_norm": 0.15949766337871552, "learning_rate": 0.00012673781367176235, "loss": 0.2812, "step": 1370 }, { "epoch": 1.159890744826137, "grad_norm": 0.16187496483325958, "learning_rate": 0.0001261609460628786, "loss": 0.2841, "step": 1380 }, { "epoch": 1.1682949889694296, "grad_norm": 0.1778268665075302, "learning_rate": 0.00012558407845399483, "loss": 0.3181, "step": 1390 }, { "epoch": 1.176699233112722, "grad_norm": 0.17179737985134125, "learning_rate": 0.00012500721084511104, "loss": 0.2904, "step": 1400 }, { "epoch": 1.1851034772560143, "grad_norm": 0.16989010572433472, "learning_rate": 0.00012443034323622728, "loss": 0.2856, "step": 1410 }, { "epoch": 1.1935077213993066, "grad_norm": 0.21040703356266022, "learning_rate": 0.00012385347562734352, "loss": 0.2743, "step": 1420 }, { "epoch": 1.201911965542599, "grad_norm": 0.19255656003952026, "learning_rate": 0.00012327660801845976, "loss": 0.316, "step": 1430 }, { "epoch": 1.2103162096858915, "grad_norm": 0.16303245723247528, "learning_rate": 0.000122699740409576, "loss": 0.2671, "step": 1440 }, { "epoch": 1.2187204538291838, "grad_norm": 0.21385671198368073, "learning_rate": 0.00012212287280069227, "loss": 0.2865, "step": 1450 }, { "epoch": 1.2271246979724761, "grad_norm": 0.18770861625671387, "learning_rate": 0.00012154600519180848, "loss": 0.2795, "step": 1460 }, { "epoch": 1.2355289421157685, "grad_norm": 0.20827870070934296, "learning_rate": 0.00012096913758292472, "loss": 0.2769, "step": 1470 }, { "epoch": 1.2439331862590608, "grad_norm": 0.1704486757516861, "learning_rate": 0.00012039226997404096, "loss": 0.2993, "step": 1480 }, { "epoch": 1.2523374304023531, "grad_norm": 0.21233461797237396, "learning_rate": 0.0001198154023651572, "loss": 0.2912, "step": 1490 }, { "epoch": 1.2607416745456455, "grad_norm": 0.1879620999097824, "learning_rate": 0.00011923853475627344, "loss": 0.2885, "step": 1500 }, { "epoch": 1.2691459186889378, "grad_norm": 0.14288674294948578, "learning_rate": 0.00011866166714738968, "loss": 0.2794, "step": 1510 }, { "epoch": 1.2775501628322303, "grad_norm": 0.1654644012451172, "learning_rate": 0.00011808479953850591, "loss": 0.2762, "step": 1520 }, { "epoch": 1.2859544069755227, "grad_norm": 0.15648572146892548, "learning_rate": 0.00011750793192962215, "loss": 0.2853, "step": 1530 }, { "epoch": 1.294358651118815, "grad_norm": 0.14321617782115936, "learning_rate": 0.00011693106432073839, "loss": 0.2949, "step": 1540 }, { "epoch": 1.3027628952621073, "grad_norm": 0.18823479115962982, "learning_rate": 0.00011635419671185464, "loss": 0.2734, "step": 1550 }, { "epoch": 1.3111671394053999, "grad_norm": 0.1524640917778015, "learning_rate": 0.00011577732910297088, "loss": 0.2668, "step": 1560 }, { "epoch": 1.3195713835486922, "grad_norm": 0.1731933057308197, "learning_rate": 0.00011520046149408712, "loss": 0.2815, "step": 1570 }, { "epoch": 1.3279756276919845, "grad_norm": 0.19858598709106445, "learning_rate": 0.00011462359388520336, "loss": 0.2863, "step": 1580 }, { "epoch": 1.3363798718352768, "grad_norm": 0.20350554585456848, "learning_rate": 0.00011404672627631959, "loss": 0.2974, "step": 1590 }, { "epoch": 1.3447841159785692, "grad_norm": 0.16735605895519257, "learning_rate": 0.00011346985866743583, "loss": 0.2742, "step": 1600 }, { "epoch": 1.3531883601218615, "grad_norm": 0.18708328902721405, "learning_rate": 0.00011289299105855207, "loss": 0.2877, "step": 1610 }, { "epoch": 1.3615926042651538, "grad_norm": 0.19334456324577332, "learning_rate": 0.00011231612344966831, "loss": 0.2735, "step": 1620 }, { "epoch": 1.3699968484084462, "grad_norm": 0.20367129147052765, "learning_rate": 0.00011173925584078455, "loss": 0.2801, "step": 1630 }, { "epoch": 1.3784010925517387, "grad_norm": 0.18539854884147644, "learning_rate": 0.00011116238823190079, "loss": 0.2842, "step": 1640 }, { "epoch": 1.386805336695031, "grad_norm": 0.2150140106678009, "learning_rate": 0.00011058552062301701, "loss": 0.2611, "step": 1650 }, { "epoch": 1.3952095808383234, "grad_norm": 0.162113755941391, "learning_rate": 0.00011000865301413325, "loss": 0.289, "step": 1660 }, { "epoch": 1.4036138249816157, "grad_norm": 0.18180853128433228, "learning_rate": 0.0001094317854052495, "loss": 0.2808, "step": 1670 }, { "epoch": 1.412018069124908, "grad_norm": 0.17916476726531982, "learning_rate": 0.00010885491779636575, "loss": 0.2912, "step": 1680 }, { "epoch": 1.4204223132682006, "grad_norm": 0.22721944749355316, "learning_rate": 0.00010827805018748199, "loss": 0.2611, "step": 1690 }, { "epoch": 1.428826557411493, "grad_norm": 0.16184848546981812, "learning_rate": 0.00010770118257859823, "loss": 0.2722, "step": 1700 }, { "epoch": 1.4372308015547852, "grad_norm": 0.19588448107242584, "learning_rate": 0.00010712431496971444, "loss": 0.2817, "step": 1710 }, { "epoch": 1.4456350456980775, "grad_norm": 0.1870766133069992, "learning_rate": 0.0001065474473608307, "loss": 0.2835, "step": 1720 }, { "epoch": 1.4540392898413699, "grad_norm": 0.1768248826265335, "learning_rate": 0.00010597057975194693, "loss": 0.2643, "step": 1730 }, { "epoch": 1.4624435339846622, "grad_norm": 0.1726955771446228, "learning_rate": 0.00010539371214306317, "loss": 0.2674, "step": 1740 }, { "epoch": 1.4708477781279545, "grad_norm": 0.1709883064031601, "learning_rate": 0.00010481684453417941, "loss": 0.262, "step": 1750 }, { "epoch": 1.4792520222712469, "grad_norm": 0.2008083164691925, "learning_rate": 0.00010423997692529565, "loss": 0.2634, "step": 1760 }, { "epoch": 1.4876562664145394, "grad_norm": 0.17773209512233734, "learning_rate": 0.0001036631093164119, "loss": 0.2805, "step": 1770 }, { "epoch": 1.4960605105578317, "grad_norm": 0.18000538647174835, "learning_rate": 0.00010308624170752812, "loss": 0.2443, "step": 1780 }, { "epoch": 1.504464754701124, "grad_norm": 0.2176659256219864, "learning_rate": 0.00010250937409864436, "loss": 0.2594, "step": 1790 }, { "epoch": 1.5128689988444164, "grad_norm": 0.15863171219825745, "learning_rate": 0.0001019325064897606, "loss": 0.2751, "step": 1800 }, { "epoch": 1.521273242987709, "grad_norm": 0.19906319677829742, "learning_rate": 0.00010135563888087685, "loss": 0.2865, "step": 1810 }, { "epoch": 1.5296774871310013, "grad_norm": 0.21247649192810059, "learning_rate": 0.00010077877127199309, "loss": 0.2892, "step": 1820 }, { "epoch": 1.5380817312742936, "grad_norm": 0.21099700033664703, "learning_rate": 0.00010020190366310933, "loss": 0.3008, "step": 1830 }, { "epoch": 1.546485975417586, "grad_norm": 0.15469135344028473, "learning_rate": 9.962503605422556e-05, "loss": 0.2672, "step": 1840 }, { "epoch": 1.5548902195608783, "grad_norm": 0.16477440297603607, "learning_rate": 9.90481684453418e-05, "loss": 0.2799, "step": 1850 }, { "epoch": 1.5632944637041706, "grad_norm": 0.17361459136009216, "learning_rate": 9.847130083645804e-05, "loss": 0.2756, "step": 1860 }, { "epoch": 1.571698707847463, "grad_norm": 0.15138483047485352, "learning_rate": 9.789443322757428e-05, "loss": 0.2785, "step": 1870 }, { "epoch": 1.5801029519907552, "grad_norm": 0.16653598845005035, "learning_rate": 9.731756561869052e-05, "loss": 0.2814, "step": 1880 }, { "epoch": 1.5885071961340476, "grad_norm": 0.16785801947116852, "learning_rate": 9.674069800980675e-05, "loss": 0.2752, "step": 1890 }, { "epoch": 1.59691144027734, "grad_norm": 0.21643054485321045, "learning_rate": 9.6163830400923e-05, "loss": 0.2623, "step": 1900 }, { "epoch": 1.6053156844206324, "grad_norm": 0.15368995070457458, "learning_rate": 9.558696279203924e-05, "loss": 0.2722, "step": 1910 }, { "epoch": 1.6137199285639248, "grad_norm": 0.21962004899978638, "learning_rate": 9.501009518315547e-05, "loss": 0.2563, "step": 1920 }, { "epoch": 1.622124172707217, "grad_norm": 0.14919191598892212, "learning_rate": 9.44332275742717e-05, "loss": 0.2502, "step": 1930 }, { "epoch": 1.6305284168505096, "grad_norm": 0.2036961317062378, "learning_rate": 9.385635996538795e-05, "loss": 0.2539, "step": 1940 }, { "epoch": 1.638932660993802, "grad_norm": 0.19002236425876617, "learning_rate": 9.327949235650419e-05, "loss": 0.2464, "step": 1950 }, { "epoch": 1.6473369051370943, "grad_norm": 0.16677500307559967, "learning_rate": 9.270262474762043e-05, "loss": 0.2684, "step": 1960 }, { "epoch": 1.6557411492803866, "grad_norm": 0.15206314623355865, "learning_rate": 9.212575713873667e-05, "loss": 0.242, "step": 1970 }, { "epoch": 1.664145393423679, "grad_norm": 0.17641034722328186, "learning_rate": 9.15488895298529e-05, "loss": 0.2604, "step": 1980 }, { "epoch": 1.6725496375669713, "grad_norm": 0.17574937641620636, "learning_rate": 9.097202192096915e-05, "loss": 0.2547, "step": 1990 }, { "epoch": 1.6809538817102636, "grad_norm": 0.16344806551933289, "learning_rate": 9.039515431208539e-05, "loss": 0.2681, "step": 2000 }, { "epoch": 1.689358125853556, "grad_norm": 0.18498322367668152, "learning_rate": 8.981828670320161e-05, "loss": 0.2713, "step": 2010 }, { "epoch": 1.6977623699968483, "grad_norm": 0.14767137169837952, "learning_rate": 8.924141909431785e-05, "loss": 0.2604, "step": 2020 }, { "epoch": 1.7061666141401408, "grad_norm": 0.1902410387992859, "learning_rate": 8.86645514854341e-05, "loss": 0.2516, "step": 2030 }, { "epoch": 1.7145708582834331, "grad_norm": 0.1728687733411789, "learning_rate": 8.808768387655033e-05, "loss": 0.2711, "step": 2040 }, { "epoch": 1.7229751024267255, "grad_norm": 0.1836615651845932, "learning_rate": 8.751081626766657e-05, "loss": 0.2717, "step": 2050 }, { "epoch": 1.731379346570018, "grad_norm": 0.1553170531988144, "learning_rate": 8.693394865878281e-05, "loss": 0.2303, "step": 2060 }, { "epoch": 1.7397835907133103, "grad_norm": 0.1942613571882248, "learning_rate": 8.635708104989905e-05, "loss": 0.2581, "step": 2070 }, { "epoch": 1.7481878348566027, "grad_norm": 0.1734922230243683, "learning_rate": 8.578021344101529e-05, "loss": 0.259, "step": 2080 }, { "epoch": 1.756592078999895, "grad_norm": 0.1309240758419037, "learning_rate": 8.520334583213153e-05, "loss": 0.2381, "step": 2090 }, { "epoch": 1.7649963231431873, "grad_norm": 0.17716042697429657, "learning_rate": 8.462647822324777e-05, "loss": 0.2413, "step": 2100 }, { "epoch": 1.7734005672864797, "grad_norm": 0.16437722742557526, "learning_rate": 8.404961061436401e-05, "loss": 0.2699, "step": 2110 }, { "epoch": 1.781804811429772, "grad_norm": 0.15865294635295868, "learning_rate": 8.347274300548025e-05, "loss": 0.2515, "step": 2120 }, { "epoch": 1.7902090555730643, "grad_norm": 0.16365793347358704, "learning_rate": 8.289587539659649e-05, "loss": 0.2507, "step": 2130 }, { "epoch": 1.7986132997163566, "grad_norm": 0.19089579582214355, "learning_rate": 8.231900778771272e-05, "loss": 0.2572, "step": 2140 }, { "epoch": 1.807017543859649, "grad_norm": 0.1750141978263855, "learning_rate": 8.174214017882896e-05, "loss": 0.2692, "step": 2150 }, { "epoch": 1.8154217880029415, "grad_norm": 0.14101552963256836, "learning_rate": 8.116527256994521e-05, "loss": 0.2658, "step": 2160 }, { "epoch": 1.8238260321462338, "grad_norm": 0.14396284520626068, "learning_rate": 8.058840496106144e-05, "loss": 0.2556, "step": 2170 }, { "epoch": 1.8322302762895262, "grad_norm": 0.15593650937080383, "learning_rate": 8.001153735217768e-05, "loss": 0.2442, "step": 2180 }, { "epoch": 1.8406345204328187, "grad_norm": 0.18202078342437744, "learning_rate": 7.943466974329392e-05, "loss": 0.2509, "step": 2190 }, { "epoch": 1.849038764576111, "grad_norm": 0.17855936288833618, "learning_rate": 7.885780213441016e-05, "loss": 0.2595, "step": 2200 }, { "epoch": 1.8574430087194034, "grad_norm": 0.16823212802410126, "learning_rate": 7.82809345255264e-05, "loss": 0.2469, "step": 2210 }, { "epoch": 1.8658472528626957, "grad_norm": 0.15248893201351166, "learning_rate": 7.770406691664264e-05, "loss": 0.2603, "step": 2220 }, { "epoch": 1.874251497005988, "grad_norm": 0.16229604184627533, "learning_rate": 7.712719930775886e-05, "loss": 0.2434, "step": 2230 }, { "epoch": 1.8826557411492804, "grad_norm": 0.18594375252723694, "learning_rate": 7.655033169887512e-05, "loss": 0.266, "step": 2240 }, { "epoch": 1.8910599852925727, "grad_norm": 0.18467053771018982, "learning_rate": 7.597346408999136e-05, "loss": 0.2535, "step": 2250 }, { "epoch": 1.899464229435865, "grad_norm": 0.18451227247714996, "learning_rate": 7.539659648110758e-05, "loss": 0.2579, "step": 2260 }, { "epoch": 1.9078684735791573, "grad_norm": 0.15458305180072784, "learning_rate": 7.481972887222382e-05, "loss": 0.2506, "step": 2270 }, { "epoch": 1.91627271772245, "grad_norm": 0.17949137091636658, "learning_rate": 7.424286126334006e-05, "loss": 0.2659, "step": 2280 }, { "epoch": 1.9246769618657422, "grad_norm": 0.1898379623889923, "learning_rate": 7.366599365445632e-05, "loss": 0.2882, "step": 2290 }, { "epoch": 1.9330812060090345, "grad_norm": 0.14720788598060608, "learning_rate": 7.308912604557254e-05, "loss": 0.2367, "step": 2300 }, { "epoch": 1.9414854501523269, "grad_norm": 0.15253467857837677, "learning_rate": 7.251225843668878e-05, "loss": 0.256, "step": 2310 }, { "epoch": 1.9498896942956194, "grad_norm": 0.1564057618379593, "learning_rate": 7.193539082780502e-05, "loss": 0.2536, "step": 2320 }, { "epoch": 1.9582939384389118, "grad_norm": 0.15893864631652832, "learning_rate": 7.135852321892126e-05, "loss": 0.2347, "step": 2330 }, { "epoch": 1.966698182582204, "grad_norm": 0.20592626929283142, "learning_rate": 7.07816556100375e-05, "loss": 0.2419, "step": 2340 }, { "epoch": 1.9751024267254964, "grad_norm": 0.20137999951839447, "learning_rate": 7.020478800115374e-05, "loss": 0.2415, "step": 2350 }, { "epoch": 1.9835066708687887, "grad_norm": 0.19287312030792236, "learning_rate": 6.962792039226997e-05, "loss": 0.2484, "step": 2360 }, { "epoch": 1.991910915012081, "grad_norm": 0.1620776355266571, "learning_rate": 6.905105278338622e-05, "loss": 0.2599, "step": 2370 } ], "logging_steps": 10, "max_steps": 3567, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.613824550319268e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }