{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 10818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036975411351451285, "grad_norm": 589585.4375, "learning_rate": 9.981512294324275e-06, "loss": 1.1475, "step": 20 }, { "epoch": 0.007395082270290257, "grad_norm": 156054.1875, "learning_rate": 9.96302458864855e-06, "loss": 0.8189, "step": 40 }, { "epoch": 0.011092623405435386, "grad_norm": 473015.4375, "learning_rate": 9.944536882972824e-06, "loss": 0.8013, "step": 60 }, { "epoch": 0.014790164540580514, "grad_norm": 591414.4375, "learning_rate": 9.926049177297098e-06, "loss": 0.7376, "step": 80 }, { "epoch": 0.018487705675725642, "grad_norm": 277118.34375, "learning_rate": 9.907561471621374e-06, "loss": 0.6362, "step": 100 }, { "epoch": 0.022185246810870772, "grad_norm": 380905.34375, "learning_rate": 9.889073765945647e-06, "loss": 0.6071, "step": 120 }, { "epoch": 0.0258827879460159, "grad_norm": 484594.5625, "learning_rate": 9.870586060269921e-06, "loss": 0.6172, "step": 140 }, { "epoch": 0.029580329081161028, "grad_norm": 651200.875, "learning_rate": 9.852098354594197e-06, "loss": 0.5612, "step": 160 }, { "epoch": 0.033277870216306155, "grad_norm": 385674.75, "learning_rate": 9.83361064891847e-06, "loss": 0.5435, "step": 180 }, { "epoch": 0.036975411351451284, "grad_norm": 248445.234375, "learning_rate": 9.815122943242745e-06, "loss": 0.5084, "step": 200 }, { "epoch": 0.040672952486596414, "grad_norm": 375588.0, "learning_rate": 9.79663523756702e-06, "loss": 0.5284, "step": 220 }, { "epoch": 0.044370493621741544, "grad_norm": 255991.15625, "learning_rate": 9.778147531891294e-06, "loss": 0.5286, "step": 240 }, { "epoch": 0.04806803475688667, "grad_norm": 428790.03125, "learning_rate": 9.759659826215568e-06, "loss": 0.5246, "step": 260 }, { "epoch": 0.0517655758920318, "grad_norm": 374524.40625, "learning_rate": 9.741172120539842e-06, "loss": 0.5035, "step": 280 }, { "epoch": 0.05546311702717693, "grad_norm": 545644.125, "learning_rate": 9.722684414864116e-06, "loss": 0.5088, "step": 300 }, { "epoch": 0.059160658162322056, "grad_norm": 433521.78125, "learning_rate": 9.704196709188391e-06, "loss": 0.4777, "step": 320 }, { "epoch": 0.06285819929746718, "grad_norm": 1164997.25, "learning_rate": 9.685709003512665e-06, "loss": 0.4866, "step": 340 }, { "epoch": 0.06655574043261231, "grad_norm": 393447.1875, "learning_rate": 9.667221297836939e-06, "loss": 0.4095, "step": 360 }, { "epoch": 0.07025328156775744, "grad_norm": 1360361.75, "learning_rate": 9.648733592161213e-06, "loss": 0.4031, "step": 380 }, { "epoch": 0.07395082270290257, "grad_norm": 533357.375, "learning_rate": 9.630245886485488e-06, "loss": 0.3758, "step": 400 }, { "epoch": 0.0776483638380477, "grad_norm": 980064.375, "learning_rate": 9.611758180809762e-06, "loss": 0.344, "step": 420 }, { "epoch": 0.08134590497319283, "grad_norm": 656362.5625, "learning_rate": 9.593270475134036e-06, "loss": 0.3117, "step": 440 }, { "epoch": 0.08504344610833796, "grad_norm": 661492.875, "learning_rate": 9.57478276945831e-06, "loss": 0.3077, "step": 460 }, { "epoch": 0.08874098724348309, "grad_norm": 553728.25, "learning_rate": 9.556295063782585e-06, "loss": 0.324, "step": 480 }, { "epoch": 0.09243852837862822, "grad_norm": 1357614.375, "learning_rate": 9.53780735810686e-06, "loss": 0.2724, "step": 500 }, { "epoch": 0.09613606951377333, "grad_norm": 519362.1875, "learning_rate": 9.519319652431133e-06, "loss": 0.2831, "step": 520 }, { "epoch": 0.09983361064891846, "grad_norm": 637897.4375, "learning_rate": 9.500831946755409e-06, "loss": 0.2703, "step": 540 }, { "epoch": 0.1035311517840636, "grad_norm": 823037.125, "learning_rate": 9.482344241079683e-06, "loss": 0.2425, "step": 560 }, { "epoch": 0.10722869291920872, "grad_norm": 905703.5, "learning_rate": 9.463856535403956e-06, "loss": 0.2729, "step": 580 }, { "epoch": 0.11092623405435385, "grad_norm": 1196128.75, "learning_rate": 9.445368829728232e-06, "loss": 0.2347, "step": 600 }, { "epoch": 0.11462377518949898, "grad_norm": 968333.625, "learning_rate": 9.426881124052506e-06, "loss": 0.2454, "step": 620 }, { "epoch": 0.11832131632464411, "grad_norm": 1234395.875, "learning_rate": 9.40839341837678e-06, "loss": 0.2284, "step": 640 }, { "epoch": 0.12201885745978924, "grad_norm": 767672.9375, "learning_rate": 9.389905712701055e-06, "loss": 0.2095, "step": 660 }, { "epoch": 0.12571639859493436, "grad_norm": 494605.09375, "learning_rate": 9.37141800702533e-06, "loss": 0.1952, "step": 680 }, { "epoch": 0.1294139397300795, "grad_norm": 1483855.875, "learning_rate": 9.352930301349603e-06, "loss": 0.2304, "step": 700 }, { "epoch": 0.13311148086522462, "grad_norm": 566157.4375, "learning_rate": 9.334442595673877e-06, "loss": 0.2251, "step": 720 }, { "epoch": 0.13680902200036976, "grad_norm": 1291356.0, "learning_rate": 9.315954889998153e-06, "loss": 0.1878, "step": 740 }, { "epoch": 0.14050656313551488, "grad_norm": 664807.3125, "learning_rate": 9.297467184322426e-06, "loss": 0.1692, "step": 760 }, { "epoch": 0.14420410427066002, "grad_norm": 450389.625, "learning_rate": 9.2789794786467e-06, "loss": 0.172, "step": 780 }, { "epoch": 0.14790164540580514, "grad_norm": 776172.4375, "learning_rate": 9.260491772970976e-06, "loss": 0.195, "step": 800 }, { "epoch": 0.15159918654095028, "grad_norm": 582122.625, "learning_rate": 9.24200406729525e-06, "loss": 0.183, "step": 820 }, { "epoch": 0.1552967276760954, "grad_norm": 631486.0, "learning_rate": 9.223516361619524e-06, "loss": 0.1687, "step": 840 }, { "epoch": 0.1589942688112405, "grad_norm": 502021.9375, "learning_rate": 9.205028655943799e-06, "loss": 0.174, "step": 860 }, { "epoch": 0.16269180994638566, "grad_norm": 882197.1875, "learning_rate": 9.186540950268073e-06, "loss": 0.1805, "step": 880 }, { "epoch": 0.16638935108153077, "grad_norm": 642990.0, "learning_rate": 9.168053244592347e-06, "loss": 0.1752, "step": 900 }, { "epoch": 0.17008689221667592, "grad_norm": 711188.5625, "learning_rate": 9.149565538916622e-06, "loss": 0.1544, "step": 920 }, { "epoch": 0.17378443335182103, "grad_norm": 333228.09375, "learning_rate": 9.131077833240896e-06, "loss": 0.1566, "step": 940 }, { "epoch": 0.17748197448696618, "grad_norm": 890502.75, "learning_rate": 9.11259012756517e-06, "loss": 0.1611, "step": 960 }, { "epoch": 0.1811795156221113, "grad_norm": 1566534.125, "learning_rate": 9.094102421889444e-06, "loss": 0.1774, "step": 980 }, { "epoch": 0.18487705675725644, "grad_norm": 726765.6875, "learning_rate": 9.07561471621372e-06, "loss": 0.164, "step": 1000 }, { "epoch": 0.18857459789240155, "grad_norm": 552103.1875, "learning_rate": 9.057127010537993e-06, "loss": 0.1604, "step": 1020 }, { "epoch": 0.19227213902754667, "grad_norm": 454113.75, "learning_rate": 9.038639304862267e-06, "loss": 0.1568, "step": 1040 }, { "epoch": 0.1959696801626918, "grad_norm": 1157071.125, "learning_rate": 9.020151599186541e-06, "loss": 0.1486, "step": 1060 }, { "epoch": 0.19966722129783693, "grad_norm": 1144903.125, "learning_rate": 9.001663893510815e-06, "loss": 0.1451, "step": 1080 }, { "epoch": 0.20336476243298207, "grad_norm": 953759.0, "learning_rate": 8.98317618783509e-06, "loss": 0.1497, "step": 1100 }, { "epoch": 0.2070623035681272, "grad_norm": 280053.78125, "learning_rate": 8.964688482159364e-06, "loss": 0.1532, "step": 1120 }, { "epoch": 0.21075984470327233, "grad_norm": 725245.9375, "learning_rate": 8.946200776483638e-06, "loss": 0.1367, "step": 1140 }, { "epoch": 0.21445738583841745, "grad_norm": 453623.09375, "learning_rate": 8.927713070807912e-06, "loss": 0.1498, "step": 1160 }, { "epoch": 0.2181549269735626, "grad_norm": 675041.0625, "learning_rate": 8.909225365132188e-06, "loss": 0.1409, "step": 1180 }, { "epoch": 0.2218524681087077, "grad_norm": 652479.5, "learning_rate": 8.890737659456462e-06, "loss": 0.1494, "step": 1200 }, { "epoch": 0.22555000924385285, "grad_norm": 393038.4375, "learning_rate": 8.872249953780735e-06, "loss": 0.146, "step": 1220 }, { "epoch": 0.22924755037899797, "grad_norm": 616304.6875, "learning_rate": 8.853762248105011e-06, "loss": 0.1346, "step": 1240 }, { "epoch": 0.23294509151414308, "grad_norm": 736962.0625, "learning_rate": 8.835274542429285e-06, "loss": 0.1457, "step": 1260 }, { "epoch": 0.23664263264928823, "grad_norm": 527761.5625, "learning_rate": 8.816786836753559e-06, "loss": 0.1432, "step": 1280 }, { "epoch": 0.24034017378443334, "grad_norm": 593494.5, "learning_rate": 8.798299131077834e-06, "loss": 0.1282, "step": 1300 }, { "epoch": 0.24403771491957849, "grad_norm": 515513.5, "learning_rate": 8.779811425402108e-06, "loss": 0.1369, "step": 1320 }, { "epoch": 0.2477352560547236, "grad_norm": 733587.375, "learning_rate": 8.761323719726382e-06, "loss": 0.1309, "step": 1340 }, { "epoch": 0.2514327971898687, "grad_norm": 660185.4375, "learning_rate": 8.742836014050658e-06, "loss": 0.1367, "step": 1360 }, { "epoch": 0.25513033832501386, "grad_norm": 692261.375, "learning_rate": 8.724348308374931e-06, "loss": 0.1438, "step": 1380 }, { "epoch": 0.258827879460159, "grad_norm": 733542.75, "learning_rate": 8.705860602699205e-06, "loss": 0.1414, "step": 1400 }, { "epoch": 0.26252542059530415, "grad_norm": 569422.8125, "learning_rate": 8.687372897023481e-06, "loss": 0.1356, "step": 1420 }, { "epoch": 0.26622296173044924, "grad_norm": 737659.5625, "learning_rate": 8.668885191347755e-06, "loss": 0.1238, "step": 1440 }, { "epoch": 0.2699205028655944, "grad_norm": 588390.3125, "learning_rate": 8.650397485672029e-06, "loss": 0.1315, "step": 1460 }, { "epoch": 0.2736180440007395, "grad_norm": 469582.0, "learning_rate": 8.631909779996304e-06, "loss": 0.1241, "step": 1480 }, { "epoch": 0.2773155851358846, "grad_norm": 877184.875, "learning_rate": 8.613422074320578e-06, "loss": 0.1217, "step": 1500 }, { "epoch": 0.28101312627102976, "grad_norm": 610305.125, "learning_rate": 8.594934368644852e-06, "loss": 0.1197, "step": 1520 }, { "epoch": 0.2847106674061749, "grad_norm": 435616.65625, "learning_rate": 8.576446662969127e-06, "loss": 0.128, "step": 1540 }, { "epoch": 0.28840820854132004, "grad_norm": 1002288.1875, "learning_rate": 8.557958957293401e-06, "loss": 0.1177, "step": 1560 }, { "epoch": 0.29210574967646513, "grad_norm": 407610.78125, "learning_rate": 8.539471251617675e-06, "loss": 0.1296, "step": 1580 }, { "epoch": 0.2958032908116103, "grad_norm": 628192.1875, "learning_rate": 8.520983545941949e-06, "loss": 0.121, "step": 1600 }, { "epoch": 0.2995008319467554, "grad_norm": 470853.6875, "learning_rate": 8.502495840266225e-06, "loss": 0.113, "step": 1620 }, { "epoch": 0.30319837308190056, "grad_norm": 420262.5, "learning_rate": 8.484008134590499e-06, "loss": 0.1156, "step": 1640 }, { "epoch": 0.30689591421704565, "grad_norm": 729730.0, "learning_rate": 8.465520428914772e-06, "loss": 0.1227, "step": 1660 }, { "epoch": 0.3105934553521908, "grad_norm": 493498.28125, "learning_rate": 8.447032723239046e-06, "loss": 0.1208, "step": 1680 }, { "epoch": 0.31429099648733594, "grad_norm": 690842.625, "learning_rate": 8.428545017563322e-06, "loss": 0.1112, "step": 1700 }, { "epoch": 0.317988537622481, "grad_norm": 473233.1875, "learning_rate": 8.410057311887596e-06, "loss": 0.1129, "step": 1720 }, { "epoch": 0.32168607875762617, "grad_norm": 780393.375, "learning_rate": 8.39156960621187e-06, "loss": 0.1126, "step": 1740 }, { "epoch": 0.3253836198927713, "grad_norm": 422343.6875, "learning_rate": 8.373081900536143e-06, "loss": 0.106, "step": 1760 }, { "epoch": 0.32908116102791646, "grad_norm": 665555.875, "learning_rate": 8.354594194860419e-06, "loss": 0.1148, "step": 1780 }, { "epoch": 0.33277870216306155, "grad_norm": 593427.9375, "learning_rate": 8.336106489184693e-06, "loss": 0.113, "step": 1800 }, { "epoch": 0.3364762432982067, "grad_norm": 370478.46875, "learning_rate": 8.317618783508967e-06, "loss": 0.1072, "step": 1820 }, { "epoch": 0.34017378443335183, "grad_norm": 280958.96875, "learning_rate": 8.29913107783324e-06, "loss": 0.1064, "step": 1840 }, { "epoch": 0.343871325568497, "grad_norm": 467909.65625, "learning_rate": 8.280643372157516e-06, "loss": 0.1051, "step": 1860 }, { "epoch": 0.34756886670364207, "grad_norm": 1186302.125, "learning_rate": 8.26215566648179e-06, "loss": 0.1191, "step": 1880 }, { "epoch": 0.3512664078387872, "grad_norm": 346905.46875, "learning_rate": 8.243667960806064e-06, "loss": 0.1157, "step": 1900 }, { "epoch": 0.35496394897393235, "grad_norm": 639500.4375, "learning_rate": 8.22518025513034e-06, "loss": 0.1092, "step": 1920 }, { "epoch": 0.35866149010907744, "grad_norm": 398886.0625, "learning_rate": 8.206692549454613e-06, "loss": 0.0992, "step": 1940 }, { "epoch": 0.3623590312442226, "grad_norm": 551855.8125, "learning_rate": 8.188204843778887e-06, "loss": 0.1108, "step": 1960 }, { "epoch": 0.36605657237936773, "grad_norm": 520954.9375, "learning_rate": 8.169717138103163e-06, "loss": 0.1035, "step": 1980 }, { "epoch": 0.36975411351451287, "grad_norm": 667530.5625, "learning_rate": 8.151229432427437e-06, "loss": 0.0984, "step": 2000 }, { "epoch": 0.37345165464965796, "grad_norm": 461173.25, "learning_rate": 8.13274172675171e-06, "loss": 0.1077, "step": 2020 }, { "epoch": 0.3771491957848031, "grad_norm": 797304.125, "learning_rate": 8.114254021075984e-06, "loss": 0.0921, "step": 2040 }, { "epoch": 0.38084673691994825, "grad_norm": 530894.0625, "learning_rate": 8.09576631540026e-06, "loss": 0.095, "step": 2060 }, { "epoch": 0.38454427805509334, "grad_norm": 551120.8125, "learning_rate": 8.077278609724534e-06, "loss": 0.1042, "step": 2080 }, { "epoch": 0.3882418191902385, "grad_norm": 423327.59375, "learning_rate": 8.058790904048808e-06, "loss": 0.1086, "step": 2100 }, { "epoch": 0.3919393603253836, "grad_norm": 373503.53125, "learning_rate": 8.040303198373083e-06, "loss": 0.0966, "step": 2120 }, { "epoch": 0.39563690146052877, "grad_norm": 431499.09375, "learning_rate": 8.021815492697357e-06, "loss": 0.0996, "step": 2140 }, { "epoch": 0.39933444259567386, "grad_norm": 799203.5625, "learning_rate": 8.003327787021631e-06, "loss": 0.0988, "step": 2160 }, { "epoch": 0.403031983730819, "grad_norm": 519873.125, "learning_rate": 7.984840081345906e-06, "loss": 0.0999, "step": 2180 }, { "epoch": 0.40672952486596414, "grad_norm": 450627.4375, "learning_rate": 7.96635237567018e-06, "loss": 0.0817, "step": 2200 }, { "epoch": 0.4104270660011093, "grad_norm": 854112.3125, "learning_rate": 7.947864669994454e-06, "loss": 0.0983, "step": 2220 }, { "epoch": 0.4141246071362544, "grad_norm": 616815.8125, "learning_rate": 7.92937696431873e-06, "loss": 0.099, "step": 2240 }, { "epoch": 0.4178221482713995, "grad_norm": 379120.875, "learning_rate": 7.910889258643004e-06, "loss": 0.0931, "step": 2260 }, { "epoch": 0.42151968940654466, "grad_norm": 347684.78125, "learning_rate": 7.892401552967277e-06, "loss": 0.0903, "step": 2280 }, { "epoch": 0.42521723054168975, "grad_norm": 394592.6875, "learning_rate": 7.873913847291551e-06, "loss": 0.09, "step": 2300 }, { "epoch": 0.4289147716768349, "grad_norm": 406101.0625, "learning_rate": 7.855426141615827e-06, "loss": 0.1019, "step": 2320 }, { "epoch": 0.43261231281198004, "grad_norm": 392444.125, "learning_rate": 7.8369384359401e-06, "loss": 0.0828, "step": 2340 }, { "epoch": 0.4363098539471252, "grad_norm": 751139.6875, "learning_rate": 7.818450730264375e-06, "loss": 0.0922, "step": 2360 }, { "epoch": 0.44000739508227027, "grad_norm": 324115.75, "learning_rate": 7.799963024588648e-06, "loss": 0.0923, "step": 2380 }, { "epoch": 0.4437049362174154, "grad_norm": 444140.375, "learning_rate": 7.781475318912924e-06, "loss": 0.0834, "step": 2400 }, { "epoch": 0.44740247735256056, "grad_norm": 476187.125, "learning_rate": 7.762987613237198e-06, "loss": 0.0889, "step": 2420 }, { "epoch": 0.4511000184877057, "grad_norm": 511409.96875, "learning_rate": 7.744499907561472e-06, "loss": 0.0828, "step": 2440 }, { "epoch": 0.4547975596228508, "grad_norm": 822892.375, "learning_rate": 7.726012201885746e-06, "loss": 0.0876, "step": 2460 }, { "epoch": 0.45849510075799593, "grad_norm": 282021.03125, "learning_rate": 7.707524496210021e-06, "loss": 0.091, "step": 2480 }, { "epoch": 0.4621926418931411, "grad_norm": 585528.8125, "learning_rate": 7.689036790534295e-06, "loss": 0.0871, "step": 2500 }, { "epoch": 0.46589018302828616, "grad_norm": 387105.9375, "learning_rate": 7.670549084858569e-06, "loss": 0.0808, "step": 2520 }, { "epoch": 0.4695877241634313, "grad_norm": 567624.375, "learning_rate": 7.652061379182843e-06, "loss": 0.0925, "step": 2540 }, { "epoch": 0.47328526529857645, "grad_norm": 314457.0, "learning_rate": 7.633573673507118e-06, "loss": 0.0865, "step": 2560 }, { "epoch": 0.4769828064337216, "grad_norm": 398509.59375, "learning_rate": 7.615085967831392e-06, "loss": 0.0904, "step": 2580 }, { "epoch": 0.4806803475688667, "grad_norm": 552215.5625, "learning_rate": 7.596598262155667e-06, "loss": 0.0922, "step": 2600 }, { "epoch": 0.4843778887040118, "grad_norm": 259611.421875, "learning_rate": 7.578110556479942e-06, "loss": 0.0817, "step": 2620 }, { "epoch": 0.48807542983915697, "grad_norm": 471336.875, "learning_rate": 7.5596228508042155e-06, "loss": 0.0819, "step": 2640 }, { "epoch": 0.4917729709743021, "grad_norm": 405907.1875, "learning_rate": 7.541135145128489e-06, "loss": 0.0886, "step": 2660 }, { "epoch": 0.4954705121094472, "grad_norm": 471205.25, "learning_rate": 7.522647439452765e-06, "loss": 0.0863, "step": 2680 }, { "epoch": 0.49916805324459235, "grad_norm": 357797.8125, "learning_rate": 7.504159733777039e-06, "loss": 0.0817, "step": 2700 }, { "epoch": 0.5028655943797374, "grad_norm": 490397.75, "learning_rate": 7.485672028101313e-06, "loss": 0.0794, "step": 2720 }, { "epoch": 0.5065631355148826, "grad_norm": 317551.46875, "learning_rate": 7.467184322425588e-06, "loss": 0.0761, "step": 2740 }, { "epoch": 0.5102606766500277, "grad_norm": 391744.40625, "learning_rate": 7.448696616749862e-06, "loss": 0.0817, "step": 2760 }, { "epoch": 0.5139582177851728, "grad_norm": 559249.4375, "learning_rate": 7.430208911074136e-06, "loss": 0.0803, "step": 2780 }, { "epoch": 0.517655758920318, "grad_norm": 245447.3125, "learning_rate": 7.411721205398411e-06, "loss": 0.0799, "step": 2800 }, { "epoch": 0.5213533000554631, "grad_norm": 203768.40625, "learning_rate": 7.393233499722685e-06, "loss": 0.0825, "step": 2820 }, { "epoch": 0.5250508411906083, "grad_norm": 256229.953125, "learning_rate": 7.374745794046959e-06, "loss": 0.0842, "step": 2840 }, { "epoch": 0.5287483823257534, "grad_norm": 748886.6875, "learning_rate": 7.356258088371234e-06, "loss": 0.0829, "step": 2860 }, { "epoch": 0.5324459234608985, "grad_norm": 392705.1875, "learning_rate": 7.337770382695508e-06, "loss": 0.0853, "step": 2880 }, { "epoch": 0.5361434645960437, "grad_norm": 372998.21875, "learning_rate": 7.3192826770197826e-06, "loss": 0.0763, "step": 2900 }, { "epoch": 0.5398410057311888, "grad_norm": 419793.71875, "learning_rate": 7.300794971344057e-06, "loss": 0.0809, "step": 2920 }, { "epoch": 0.5435385468663338, "grad_norm": 401173.0, "learning_rate": 7.282307265668331e-06, "loss": 0.0652, "step": 2940 }, { "epoch": 0.547236088001479, "grad_norm": 270387.03125, "learning_rate": 7.263819559992605e-06, "loss": 0.0742, "step": 2960 }, { "epoch": 0.5509336291366241, "grad_norm": 353803.25, "learning_rate": 7.24533185431688e-06, "loss": 0.0793, "step": 2980 }, { "epoch": 0.5546311702717692, "grad_norm": 339236.09375, "learning_rate": 7.2268441486411544e-06, "loss": 0.0802, "step": 3000 }, { "epoch": 0.5583287114069144, "grad_norm": 398772.21875, "learning_rate": 7.208356442965428e-06, "loss": 0.0774, "step": 3020 }, { "epoch": 0.5620262525420595, "grad_norm": 403407.84375, "learning_rate": 7.189868737289702e-06, "loss": 0.0758, "step": 3040 }, { "epoch": 0.5657237936772047, "grad_norm": 384857.3125, "learning_rate": 7.171381031613978e-06, "loss": 0.0692, "step": 3060 }, { "epoch": 0.5694213348123498, "grad_norm": 295177.6875, "learning_rate": 7.152893325938252e-06, "loss": 0.0748, "step": 3080 }, { "epoch": 0.5731188759474949, "grad_norm": 320366.5625, "learning_rate": 7.1344056202625255e-06, "loss": 0.0724, "step": 3100 }, { "epoch": 0.5768164170826401, "grad_norm": 455169.15625, "learning_rate": 7.115917914586801e-06, "loss": 0.0848, "step": 3120 }, { "epoch": 0.5805139582177852, "grad_norm": 313038.40625, "learning_rate": 7.097430208911075e-06, "loss": 0.0803, "step": 3140 }, { "epoch": 0.5842114993529303, "grad_norm": 545048.125, "learning_rate": 7.078942503235349e-06, "loss": 0.0737, "step": 3160 }, { "epoch": 0.5879090404880755, "grad_norm": 519241.8125, "learning_rate": 7.0604547975596235e-06, "loss": 0.0718, "step": 3180 }, { "epoch": 0.5916065816232206, "grad_norm": 505339.90625, "learning_rate": 7.041967091883898e-06, "loss": 0.074, "step": 3200 }, { "epoch": 0.5953041227583656, "grad_norm": 304366.46875, "learning_rate": 7.023479386208172e-06, "loss": 0.0685, "step": 3220 }, { "epoch": 0.5990016638935108, "grad_norm": 573642.75, "learning_rate": 7.004991680532447e-06, "loss": 0.0674, "step": 3240 }, { "epoch": 0.6026992050286559, "grad_norm": 440082.625, "learning_rate": 6.986503974856721e-06, "loss": 0.0713, "step": 3260 }, { "epoch": 0.6063967461638011, "grad_norm": 435252.34375, "learning_rate": 6.968016269180995e-06, "loss": 0.0645, "step": 3280 }, { "epoch": 0.6100942872989462, "grad_norm": 186664.71875, "learning_rate": 6.94952856350527e-06, "loss": 0.0687, "step": 3300 }, { "epoch": 0.6137918284340913, "grad_norm": 371479.125, "learning_rate": 6.931040857829544e-06, "loss": 0.0708, "step": 3320 }, { "epoch": 0.6174893695692365, "grad_norm": 162696.5, "learning_rate": 6.912553152153818e-06, "loss": 0.0638, "step": 3340 }, { "epoch": 0.6211869107043816, "grad_norm": 506640.0625, "learning_rate": 6.894065446478093e-06, "loss": 0.0707, "step": 3360 }, { "epoch": 0.6248844518395267, "grad_norm": 307854.53125, "learning_rate": 6.875577740802367e-06, "loss": 0.0648, "step": 3380 }, { "epoch": 0.6285819929746719, "grad_norm": 252089.015625, "learning_rate": 6.857090035126641e-06, "loss": 0.0785, "step": 3400 }, { "epoch": 0.632279534109817, "grad_norm": 218276.875, "learning_rate": 6.838602329450915e-06, "loss": 0.0679, "step": 3420 }, { "epoch": 0.635977075244962, "grad_norm": 262298.65625, "learning_rate": 6.8201146237751905e-06, "loss": 0.0725, "step": 3440 }, { "epoch": 0.6396746163801073, "grad_norm": 460630.875, "learning_rate": 6.801626918099464e-06, "loss": 0.0654, "step": 3460 }, { "epoch": 0.6433721575152523, "grad_norm": 313295.375, "learning_rate": 6.783139212423738e-06, "loss": 0.0718, "step": 3480 }, { "epoch": 0.6470696986503975, "grad_norm": 425139.90625, "learning_rate": 6.764651506748014e-06, "loss": 0.0648, "step": 3500 }, { "epoch": 0.6507672397855426, "grad_norm": 643511.1875, "learning_rate": 6.746163801072288e-06, "loss": 0.0667, "step": 3520 }, { "epoch": 0.6544647809206877, "grad_norm": 203010.78125, "learning_rate": 6.7276760953965615e-06, "loss": 0.0587, "step": 3540 }, { "epoch": 0.6581623220558329, "grad_norm": 738180.5, "learning_rate": 6.709188389720836e-06, "loss": 0.058, "step": 3560 }, { "epoch": 0.661859863190978, "grad_norm": 374890.40625, "learning_rate": 6.69070068404511e-06, "loss": 0.0703, "step": 3580 }, { "epoch": 0.6655574043261231, "grad_norm": 321488.34375, "learning_rate": 6.672212978369385e-06, "loss": 0.0626, "step": 3600 }, { "epoch": 0.6692549454612683, "grad_norm": 421561.0625, "learning_rate": 6.6537252726936595e-06, "loss": 0.0728, "step": 3620 }, { "epoch": 0.6729524865964134, "grad_norm": 372063.625, "learning_rate": 6.635237567017933e-06, "loss": 0.0637, "step": 3640 }, { "epoch": 0.6766500277315585, "grad_norm": 261809.125, "learning_rate": 6.616749861342207e-06, "loss": 0.0619, "step": 3660 }, { "epoch": 0.6803475688667037, "grad_norm": 290989.65625, "learning_rate": 6.598262155666483e-06, "loss": 0.0657, "step": 3680 }, { "epoch": 0.6840451100018488, "grad_norm": 242986.140625, "learning_rate": 6.579774449990757e-06, "loss": 0.0576, "step": 3700 }, { "epoch": 0.687742651136994, "grad_norm": 406223.15625, "learning_rate": 6.5612867443150306e-06, "loss": 0.0599, "step": 3720 }, { "epoch": 0.691440192272139, "grad_norm": 737275.5625, "learning_rate": 6.542799038639306e-06, "loss": 0.0687, "step": 3740 }, { "epoch": 0.6951377334072841, "grad_norm": 205932.21875, "learning_rate": 6.52431133296358e-06, "loss": 0.0654, "step": 3760 }, { "epoch": 0.6988352745424293, "grad_norm": 355624.5, "learning_rate": 6.505823627287854e-06, "loss": 0.0616, "step": 3780 }, { "epoch": 0.7025328156775744, "grad_norm": 269638.0, "learning_rate": 6.487335921612129e-06, "loss": 0.0549, "step": 3800 }, { "epoch": 0.7062303568127195, "grad_norm": 520156.78125, "learning_rate": 6.468848215936403e-06, "loss": 0.0687, "step": 3820 }, { "epoch": 0.7099278979478647, "grad_norm": 435804.65625, "learning_rate": 6.450360510260677e-06, "loss": 0.0551, "step": 3840 }, { "epoch": 0.7136254390830098, "grad_norm": 287883.53125, "learning_rate": 6.431872804584951e-06, "loss": 0.0585, "step": 3860 }, { "epoch": 0.7173229802181549, "grad_norm": 513667.65625, "learning_rate": 6.413385098909226e-06, "loss": 0.0588, "step": 3880 }, { "epoch": 0.7210205213533001, "grad_norm": 560210.9375, "learning_rate": 6.3948973932335004e-06, "loss": 0.0601, "step": 3900 }, { "epoch": 0.7247180624884452, "grad_norm": 570266.3125, "learning_rate": 6.376409687557774e-06, "loss": 0.0656, "step": 3920 }, { "epoch": 0.7284156036235903, "grad_norm": 360799.875, "learning_rate": 6.357921981882049e-06, "loss": 0.0554, "step": 3940 }, { "epoch": 0.7321131447587355, "grad_norm": 477566.0, "learning_rate": 6.339434276206323e-06, "loss": 0.0526, "step": 3960 }, { "epoch": 0.7358106858938805, "grad_norm": 333243.9375, "learning_rate": 6.320946570530598e-06, "loss": 0.0612, "step": 3980 }, { "epoch": 0.7395082270290257, "grad_norm": 318909.875, "learning_rate": 6.302458864854872e-06, "loss": 0.0621, "step": 4000 }, { "epoch": 0.7432057681641708, "grad_norm": 359029.1875, "learning_rate": 6.283971159179146e-06, "loss": 0.0536, "step": 4020 }, { "epoch": 0.7469033092993159, "grad_norm": 640655.25, "learning_rate": 6.26548345350342e-06, "loss": 0.0546, "step": 4040 }, { "epoch": 0.7506008504344611, "grad_norm": 488760.09375, "learning_rate": 6.246995747827696e-06, "loss": 0.058, "step": 4060 }, { "epoch": 0.7542983915696062, "grad_norm": 312268.46875, "learning_rate": 6.2285080421519695e-06, "loss": 0.0598, "step": 4080 }, { "epoch": 0.7579959327047513, "grad_norm": 342215.8125, "learning_rate": 6.210020336476243e-06, "loss": 0.0571, "step": 4100 }, { "epoch": 0.7616934738398965, "grad_norm": 287534.5625, "learning_rate": 6.191532630800519e-06, "loss": 0.0592, "step": 4120 }, { "epoch": 0.7653910149750416, "grad_norm": 189238.671875, "learning_rate": 6.173044925124793e-06, "loss": 0.0557, "step": 4140 }, { "epoch": 0.7690885561101867, "grad_norm": 267354.3125, "learning_rate": 6.154557219449067e-06, "loss": 0.052, "step": 4160 }, { "epoch": 0.7727860972453319, "grad_norm": 343019.71875, "learning_rate": 6.136069513773341e-06, "loss": 0.0552, "step": 4180 }, { "epoch": 0.776483638380477, "grad_norm": 232639.453125, "learning_rate": 6.117581808097616e-06, "loss": 0.0547, "step": 4200 }, { "epoch": 0.7801811795156222, "grad_norm": 282819.8125, "learning_rate": 6.09909410242189e-06, "loss": 0.0562, "step": 4220 }, { "epoch": 0.7838787206507672, "grad_norm": 230195.125, "learning_rate": 6.080606396746165e-06, "loss": 0.06, "step": 4240 }, { "epoch": 0.7875762617859123, "grad_norm": 301125.625, "learning_rate": 6.0621186910704385e-06, "loss": 0.0533, "step": 4260 }, { "epoch": 0.7912738029210575, "grad_norm": 460118.625, "learning_rate": 6.043630985394713e-06, "loss": 0.0517, "step": 4280 }, { "epoch": 0.7949713440562026, "grad_norm": 432589.125, "learning_rate": 6.025143279718987e-06, "loss": 0.0562, "step": 4300 }, { "epoch": 0.7986688851913477, "grad_norm": 237171.75, "learning_rate": 6.006655574043262e-06, "loss": 0.0554, "step": 4320 }, { "epoch": 0.8023664263264929, "grad_norm": 216067.484375, "learning_rate": 5.988167868367536e-06, "loss": 0.0521, "step": 4340 }, { "epoch": 0.806063967461638, "grad_norm": 472261.09375, "learning_rate": 5.9696801626918095e-06, "loss": 0.0574, "step": 4360 }, { "epoch": 0.8097615085967831, "grad_norm": 278600.25, "learning_rate": 5.951192457016085e-06, "loss": 0.0563, "step": 4380 }, { "epoch": 0.8134590497319283, "grad_norm": 308273.0625, "learning_rate": 5.932704751340359e-06, "loss": 0.0503, "step": 4400 }, { "epoch": 0.8171565908670734, "grad_norm": 367411.625, "learning_rate": 5.914217045664633e-06, "loss": 0.0591, "step": 4420 }, { "epoch": 0.8208541320022186, "grad_norm": 246328.625, "learning_rate": 5.895729339988908e-06, "loss": 0.0505, "step": 4440 }, { "epoch": 0.8245516731373637, "grad_norm": 208687.65625, "learning_rate": 5.877241634313182e-06, "loss": 0.0514, "step": 4460 }, { "epoch": 0.8282492142725087, "grad_norm": 188789.828125, "learning_rate": 5.858753928637456e-06, "loss": 0.0484, "step": 4480 }, { "epoch": 0.831946755407654, "grad_norm": 565020.375, "learning_rate": 5.840266222961732e-06, "loss": 0.0507, "step": 4500 }, { "epoch": 0.835644296542799, "grad_norm": 253717.375, "learning_rate": 5.8217785172860055e-06, "loss": 0.052, "step": 4520 }, { "epoch": 0.8393418376779441, "grad_norm": 240820.609375, "learning_rate": 5.803290811610279e-06, "loss": 0.0453, "step": 4540 }, { "epoch": 0.8430393788130893, "grad_norm": 285280.9375, "learning_rate": 5.784803105934554e-06, "loss": 0.0462, "step": 4560 }, { "epoch": 0.8467369199482344, "grad_norm": 496788.34375, "learning_rate": 5.766315400258828e-06, "loss": 0.0494, "step": 4580 }, { "epoch": 0.8504344610833795, "grad_norm": 270688.40625, "learning_rate": 5.747827694583103e-06, "loss": 0.0551, "step": 4600 }, { "epoch": 0.8541320022185247, "grad_norm": 352414.625, "learning_rate": 5.729339988907377e-06, "loss": 0.0493, "step": 4620 }, { "epoch": 0.8578295433536698, "grad_norm": 236690.390625, "learning_rate": 5.710852283231651e-06, "loss": 0.0504, "step": 4640 }, { "epoch": 0.861527084488815, "grad_norm": 354131.8125, "learning_rate": 5.692364577555925e-06, "loss": 0.0505, "step": 4660 }, { "epoch": 0.8652246256239601, "grad_norm": 200457.0625, "learning_rate": 5.673876871880201e-06, "loss": 0.04, "step": 4680 }, { "epoch": 0.8689221667591052, "grad_norm": 311071.1875, "learning_rate": 5.6553891662044746e-06, "loss": 0.0487, "step": 4700 }, { "epoch": 0.8726197078942504, "grad_norm": 159538.90625, "learning_rate": 5.6369014605287484e-06, "loss": 0.0405, "step": 4720 }, { "epoch": 0.8763172490293955, "grad_norm": 135958.078125, "learning_rate": 5.618413754853022e-06, "loss": 0.0425, "step": 4740 }, { "epoch": 0.8800147901645405, "grad_norm": 377985.625, "learning_rate": 5.599926049177298e-06, "loss": 0.0478, "step": 4760 }, { "epoch": 0.8837123312996857, "grad_norm": 351438.78125, "learning_rate": 5.581438343501572e-06, "loss": 0.0432, "step": 4780 }, { "epoch": 0.8874098724348308, "grad_norm": 203324.046875, "learning_rate": 5.562950637825846e-06, "loss": 0.0452, "step": 4800 }, { "epoch": 0.8911074135699759, "grad_norm": 265801.84375, "learning_rate": 5.544462932150121e-06, "loss": 0.0458, "step": 4820 }, { "epoch": 0.8948049547051211, "grad_norm": 237225.953125, "learning_rate": 5.525975226474395e-06, "loss": 0.0441, "step": 4840 }, { "epoch": 0.8985024958402662, "grad_norm": 248215.796875, "learning_rate": 5.507487520798669e-06, "loss": 0.0464, "step": 4860 }, { "epoch": 0.9022000369754114, "grad_norm": 216602.59375, "learning_rate": 5.488999815122944e-06, "loss": 0.0496, "step": 4880 }, { "epoch": 0.9058975781105565, "grad_norm": 212512.0, "learning_rate": 5.470512109447218e-06, "loss": 0.0426, "step": 4900 }, { "epoch": 0.9095951192457016, "grad_norm": 264672.21875, "learning_rate": 5.452024403771492e-06, "loss": 0.0431, "step": 4920 }, { "epoch": 0.9132926603808468, "grad_norm": 195740.625, "learning_rate": 5.433536698095767e-06, "loss": 0.0454, "step": 4940 }, { "epoch": 0.9169902015159919, "grad_norm": 239070.890625, "learning_rate": 5.415048992420041e-06, "loss": 0.0388, "step": 4960 }, { "epoch": 0.920687742651137, "grad_norm": 327097.84375, "learning_rate": 5.3965612867443155e-06, "loss": 0.0418, "step": 4980 }, { "epoch": 0.9243852837862822, "grad_norm": 319707.625, "learning_rate": 5.37807358106859e-06, "loss": 0.0519, "step": 5000 }, { "epoch": 0.9280828249214272, "grad_norm": 166737.625, "learning_rate": 5.359585875392864e-06, "loss": 0.0437, "step": 5020 }, { "epoch": 0.9317803660565723, "grad_norm": 489044.0625, "learning_rate": 5.341098169717138e-06, "loss": 0.0438, "step": 5040 }, { "epoch": 0.9354779071917175, "grad_norm": 288754.90625, "learning_rate": 5.3226104640414135e-06, "loss": 0.0447, "step": 5060 }, { "epoch": 0.9391754483268626, "grad_norm": 205332.171875, "learning_rate": 5.304122758365687e-06, "loss": 0.044, "step": 5080 }, { "epoch": 0.9428729894620078, "grad_norm": 266008.21875, "learning_rate": 5.285635052689961e-06, "loss": 0.0448, "step": 5100 }, { "epoch": 0.9465705305971529, "grad_norm": 433041.125, "learning_rate": 5.267147347014237e-06, "loss": 0.0428, "step": 5120 }, { "epoch": 0.950268071732298, "grad_norm": 176340.359375, "learning_rate": 5.248659641338511e-06, "loss": 0.041, "step": 5140 }, { "epoch": 0.9539656128674432, "grad_norm": 238454.6875, "learning_rate": 5.2301719356627845e-06, "loss": 0.0372, "step": 5160 }, { "epoch": 0.9576631540025883, "grad_norm": 269030.46875, "learning_rate": 5.211684229987059e-06, "loss": 0.0461, "step": 5180 }, { "epoch": 0.9613606951377334, "grad_norm": 270103.5, "learning_rate": 5.193196524311334e-06, "loss": 0.039, "step": 5200 }, { "epoch": 0.9650582362728786, "grad_norm": 152555.5625, "learning_rate": 5.174708818635608e-06, "loss": 0.0425, "step": 5220 }, { "epoch": 0.9687557774080237, "grad_norm": 308841.71875, "learning_rate": 5.156221112959882e-06, "loss": 0.0419, "step": 5240 }, { "epoch": 0.9724533185431687, "grad_norm": 298958.28125, "learning_rate": 5.137733407284156e-06, "loss": 0.043, "step": 5260 }, { "epoch": 0.9761508596783139, "grad_norm": 293175.53125, "learning_rate": 5.119245701608431e-06, "loss": 0.0488, "step": 5280 }, { "epoch": 0.979848400813459, "grad_norm": 306048.71875, "learning_rate": 5.100757995932705e-06, "loss": 0.0414, "step": 5300 }, { "epoch": 0.9835459419486042, "grad_norm": 263478.65625, "learning_rate": 5.08227029025698e-06, "loss": 0.0377, "step": 5320 }, { "epoch": 0.9872434830837493, "grad_norm": 269423.96875, "learning_rate": 5.0637825845812535e-06, "loss": 0.042, "step": 5340 }, { "epoch": 0.9909410242188944, "grad_norm": 373293.375, "learning_rate": 5.045294878905527e-06, "loss": 0.0404, "step": 5360 }, { "epoch": 0.9946385653540396, "grad_norm": 230399.25, "learning_rate": 5.026807173229803e-06, "loss": 0.0363, "step": 5380 }, { "epoch": 0.9983361064891847, "grad_norm": 297080.53125, "learning_rate": 5.008319467554077e-06, "loss": 0.0368, "step": 5400 }, { "epoch": 1.00203364762433, "grad_norm": 193461.09375, "learning_rate": 4.9898317618783515e-06, "loss": 0.0318, "step": 5420 }, { "epoch": 1.0057311887594749, "grad_norm": 313585.34375, "learning_rate": 4.971344056202625e-06, "loss": 0.0321, "step": 5440 }, { "epoch": 1.00942872989462, "grad_norm": 305049.40625, "learning_rate": 4.9528563505269e-06, "loss": 0.0319, "step": 5460 }, { "epoch": 1.0131262710297653, "grad_norm": 184038.984375, "learning_rate": 4.934368644851175e-06, "loss": 0.0337, "step": 5480 }, { "epoch": 1.0168238121649102, "grad_norm": 250623.484375, "learning_rate": 4.915880939175449e-06, "loss": 0.0339, "step": 5500 }, { "epoch": 1.0205213533000554, "grad_norm": 389090.90625, "learning_rate": 4.897393233499723e-06, "loss": 0.0361, "step": 5520 }, { "epoch": 1.0242188944352006, "grad_norm": 496750.34375, "learning_rate": 4.878905527823997e-06, "loss": 0.0345, "step": 5540 }, { "epoch": 1.0279164355703456, "grad_norm": 173545.296875, "learning_rate": 4.860417822148272e-06, "loss": 0.0335, "step": 5560 }, { "epoch": 1.0316139767054908, "grad_norm": 135819.625, "learning_rate": 4.841930116472547e-06, "loss": 0.0341, "step": 5580 }, { "epoch": 1.035311517840636, "grad_norm": 331638.21875, "learning_rate": 4.8234424107968206e-06, "loss": 0.031, "step": 5600 }, { "epoch": 1.0390090589757812, "grad_norm": 129892.90625, "learning_rate": 4.8049547051210944e-06, "loss": 0.0322, "step": 5620 }, { "epoch": 1.0427066001109262, "grad_norm": 185758.609375, "learning_rate": 4.786466999445369e-06, "loss": 0.0329, "step": 5640 }, { "epoch": 1.0464041412460714, "grad_norm": 123809.5390625, "learning_rate": 4.767979293769643e-06, "loss": 0.0321, "step": 5660 }, { "epoch": 1.0501016823812166, "grad_norm": 254882.265625, "learning_rate": 4.749491588093918e-06, "loss": 0.0345, "step": 5680 }, { "epoch": 1.0537992235163616, "grad_norm": 243718.71875, "learning_rate": 4.7310038824181924e-06, "loss": 0.0338, "step": 5700 }, { "epoch": 1.0574967646515068, "grad_norm": 282382.6875, "learning_rate": 4.712516176742466e-06, "loss": 0.0314, "step": 5720 }, { "epoch": 1.061194305786652, "grad_norm": 224000.4375, "learning_rate": 4.694028471066741e-06, "loss": 0.0276, "step": 5740 }, { "epoch": 1.064891846921797, "grad_norm": 134676.859375, "learning_rate": 4.675540765391015e-06, "loss": 0.0313, "step": 5760 }, { "epoch": 1.0685893880569421, "grad_norm": 277227.3125, "learning_rate": 4.65705305971529e-06, "loss": 0.0312, "step": 5780 }, { "epoch": 1.0722869291920873, "grad_norm": 193661.1875, "learning_rate": 4.638565354039564e-06, "loss": 0.0328, "step": 5800 }, { "epoch": 1.0759844703272323, "grad_norm": 240792.234375, "learning_rate": 4.620077648363838e-06, "loss": 0.0299, "step": 5820 }, { "epoch": 1.0796820114623775, "grad_norm": 205885.5625, "learning_rate": 4.601589942688113e-06, "loss": 0.0313, "step": 5840 }, { "epoch": 1.0833795525975227, "grad_norm": 325387.34375, "learning_rate": 4.583102237012388e-06, "loss": 0.033, "step": 5860 }, { "epoch": 1.0870770937326677, "grad_norm": 141092.296875, "learning_rate": 4.5646145313366615e-06, "loss": 0.0339, "step": 5880 }, { "epoch": 1.090774634867813, "grad_norm": 571566.5625, "learning_rate": 4.546126825660936e-06, "loss": 0.0336, "step": 5900 }, { "epoch": 1.094472176002958, "grad_norm": 176920.59375, "learning_rate": 4.52763911998521e-06, "loss": 0.0312, "step": 5920 }, { "epoch": 1.098169717138103, "grad_norm": 310769.78125, "learning_rate": 4.509151414309485e-06, "loss": 0.0324, "step": 5940 }, { "epoch": 1.1018672582732483, "grad_norm": 385363.90625, "learning_rate": 4.490663708633759e-06, "loss": 0.0286, "step": 5960 }, { "epoch": 1.1055647994083935, "grad_norm": 245921.84375, "learning_rate": 4.472176002958033e-06, "loss": 0.0307, "step": 5980 }, { "epoch": 1.1092623405435384, "grad_norm": 295249.21875, "learning_rate": 4.453688297282307e-06, "loss": 0.0275, "step": 6000 }, { "epoch": 1.1129598816786836, "grad_norm": 200136.734375, "learning_rate": 4.435200591606582e-06, "loss": 0.0321, "step": 6020 }, { "epoch": 1.1166574228138288, "grad_norm": 317951.15625, "learning_rate": 4.416712885930856e-06, "loss": 0.0295, "step": 6040 }, { "epoch": 1.1203549639489738, "grad_norm": 407278.71875, "learning_rate": 4.3982251802551305e-06, "loss": 0.0262, "step": 6060 }, { "epoch": 1.124052505084119, "grad_norm": 197725.1875, "learning_rate": 4.379737474579405e-06, "loss": 0.0254, "step": 6080 }, { "epoch": 1.1277500462192642, "grad_norm": 209825.9375, "learning_rate": 4.361249768903679e-06, "loss": 0.0283, "step": 6100 }, { "epoch": 1.1314475873544092, "grad_norm": 165184.359375, "learning_rate": 4.342762063227954e-06, "loss": 0.0265, "step": 6120 }, { "epoch": 1.1351451284895544, "grad_norm": 176458.5, "learning_rate": 4.3242743575522285e-06, "loss": 0.0285, "step": 6140 }, { "epoch": 1.1388426696246996, "grad_norm": 177583.390625, "learning_rate": 4.305786651876502e-06, "loss": 0.0295, "step": 6160 }, { "epoch": 1.1425402107598448, "grad_norm": 229201.890625, "learning_rate": 4.287298946200777e-06, "loss": 0.0279, "step": 6180 }, { "epoch": 1.1462377518949898, "grad_norm": 407142.15625, "learning_rate": 4.268811240525051e-06, "loss": 0.031, "step": 6200 }, { "epoch": 1.149935293030135, "grad_norm": 150456.34375, "learning_rate": 4.250323534849326e-06, "loss": 0.0315, "step": 6220 }, { "epoch": 1.1536328341652802, "grad_norm": 148167.5, "learning_rate": 4.2318358291736e-06, "loss": 0.0274, "step": 6240 }, { "epoch": 1.1573303753004252, "grad_norm": 207488.640625, "learning_rate": 4.213348123497874e-06, "loss": 0.0285, "step": 6260 }, { "epoch": 1.1610279164355704, "grad_norm": 193723.734375, "learning_rate": 4.194860417822149e-06, "loss": 0.0273, "step": 6280 }, { "epoch": 1.1647254575707155, "grad_norm": 367727.40625, "learning_rate": 4.176372712146423e-06, "loss": 0.0286, "step": 6300 }, { "epoch": 1.1684229987058605, "grad_norm": 170541.34375, "learning_rate": 4.1578850064706975e-06, "loss": 0.0242, "step": 6320 }, { "epoch": 1.1721205398410057, "grad_norm": 183535.828125, "learning_rate": 4.139397300794971e-06, "loss": 0.0262, "step": 6340 }, { "epoch": 1.175818080976151, "grad_norm": 286020.03125, "learning_rate": 4.120909595119246e-06, "loss": 0.0291, "step": 6360 }, { "epoch": 1.179515622111296, "grad_norm": 173424.8125, "learning_rate": 4.10242188944352e-06, "loss": 0.0259, "step": 6380 }, { "epoch": 1.183213163246441, "grad_norm": 127936.703125, "learning_rate": 4.083934183767795e-06, "loss": 0.0261, "step": 6400 }, { "epoch": 1.1869107043815863, "grad_norm": 293457.0625, "learning_rate": 4.065446478092069e-06, "loss": 0.0289, "step": 6420 }, { "epoch": 1.1906082455167313, "grad_norm": 175406.0625, "learning_rate": 4.046958772416343e-06, "loss": 0.0281, "step": 6440 }, { "epoch": 1.1943057866518765, "grad_norm": 163937.984375, "learning_rate": 4.028471066740618e-06, "loss": 0.0247, "step": 6460 }, { "epoch": 1.1980033277870217, "grad_norm": 229977.171875, "learning_rate": 4.009983361064892e-06, "loss": 0.026, "step": 6480 }, { "epoch": 1.2017008689221669, "grad_norm": 343183.40625, "learning_rate": 3.991495655389167e-06, "loss": 0.0305, "step": 6500 }, { "epoch": 1.2053984100573119, "grad_norm": 303342.8125, "learning_rate": 3.973007949713441e-06, "loss": 0.0269, "step": 6520 }, { "epoch": 1.209095951192457, "grad_norm": 175805.6875, "learning_rate": 3.954520244037715e-06, "loss": 0.0267, "step": 6540 }, { "epoch": 1.2127934923276023, "grad_norm": 266918.03125, "learning_rate": 3.93603253836199e-06, "loss": 0.03, "step": 6560 }, { "epoch": 1.2164910334627472, "grad_norm": 81738.5546875, "learning_rate": 3.917544832686265e-06, "loss": 0.0239, "step": 6580 }, { "epoch": 1.2201885745978924, "grad_norm": 159759.4375, "learning_rate": 3.8990571270105384e-06, "loss": 0.0262, "step": 6600 }, { "epoch": 1.2238861157330376, "grad_norm": 131182.578125, "learning_rate": 3.880569421334812e-06, "loss": 0.0252, "step": 6620 }, { "epoch": 1.2275836568681826, "grad_norm": 215293.34375, "learning_rate": 3.862081715659087e-06, "loss": 0.025, "step": 6640 }, { "epoch": 1.2312811980033278, "grad_norm": 246891.640625, "learning_rate": 3.843594009983361e-06, "loss": 0.0267, "step": 6660 }, { "epoch": 1.234978739138473, "grad_norm": 203203.875, "learning_rate": 3.825106304307636e-06, "loss": 0.0269, "step": 6680 }, { "epoch": 1.238676280273618, "grad_norm": 218348.234375, "learning_rate": 3.80661859863191e-06, "loss": 0.0264, "step": 6700 }, { "epoch": 1.2423738214087632, "grad_norm": 378088.65625, "learning_rate": 3.7881308929561846e-06, "loss": 0.0262, "step": 6720 }, { "epoch": 1.2460713625439084, "grad_norm": 142539.5625, "learning_rate": 3.769643187280459e-06, "loss": 0.0226, "step": 6740 }, { "epoch": 1.2497689036790534, "grad_norm": 285223.40625, "learning_rate": 3.7511554816047328e-06, "loss": 0.0256, "step": 6760 }, { "epoch": 1.2534664448141986, "grad_norm": 355714.0625, "learning_rate": 3.7326677759290075e-06, "loss": 0.0244, "step": 6780 }, { "epoch": 1.2571639859493438, "grad_norm": 227273.59375, "learning_rate": 3.714180070253282e-06, "loss": 0.027, "step": 6800 }, { "epoch": 1.2608615270844887, "grad_norm": 155393.6875, "learning_rate": 3.695692364577556e-06, "loss": 0.0203, "step": 6820 }, { "epoch": 1.264559068219634, "grad_norm": 325504.15625, "learning_rate": 3.6772046589018308e-06, "loss": 0.0245, "step": 6840 }, { "epoch": 1.2682566093547791, "grad_norm": 240933.125, "learning_rate": 3.658716953226105e-06, "loss": 0.0258, "step": 6860 }, { "epoch": 1.271954150489924, "grad_norm": 106693.859375, "learning_rate": 3.6402292475503794e-06, "loss": 0.0226, "step": 6880 }, { "epoch": 1.2756516916250693, "grad_norm": 146650.640625, "learning_rate": 3.6217415418746536e-06, "loss": 0.0194, "step": 6900 }, { "epoch": 1.2793492327602145, "grad_norm": 310544.5625, "learning_rate": 3.603253836198928e-06, "loss": 0.0273, "step": 6920 }, { "epoch": 1.2830467738953595, "grad_norm": 176867.46875, "learning_rate": 3.5847661305232022e-06, "loss": 0.0245, "step": 6940 }, { "epoch": 1.2867443150305047, "grad_norm": 128429.8125, "learning_rate": 3.566278424847477e-06, "loss": 0.022, "step": 6960 }, { "epoch": 1.2904418561656499, "grad_norm": 139811.3125, "learning_rate": 3.547790719171751e-06, "loss": 0.0229, "step": 6980 }, { "epoch": 1.2941393973007949, "grad_norm": 252133.390625, "learning_rate": 3.5293030134960255e-06, "loss": 0.0254, "step": 7000 }, { "epoch": 1.29783693843594, "grad_norm": 151203.578125, "learning_rate": 3.5108153078203e-06, "loss": 0.0232, "step": 7020 }, { "epoch": 1.3015344795710853, "grad_norm": 147495.046875, "learning_rate": 3.492327602144574e-06, "loss": 0.0222, "step": 7040 }, { "epoch": 1.3052320207062302, "grad_norm": 278537.0, "learning_rate": 3.4738398964688484e-06, "loss": 0.0204, "step": 7060 }, { "epoch": 1.3089295618413754, "grad_norm": 201168.078125, "learning_rate": 3.455352190793123e-06, "loss": 0.0258, "step": 7080 }, { "epoch": 1.3126271029765206, "grad_norm": 291186.3125, "learning_rate": 3.436864485117397e-06, "loss": 0.0255, "step": 7100 }, { "epoch": 1.3163246441116656, "grad_norm": 105182.578125, "learning_rate": 3.4183767794416717e-06, "loss": 0.0213, "step": 7120 }, { "epoch": 1.3200221852468108, "grad_norm": 192964.09375, "learning_rate": 3.3998890737659455e-06, "loss": 0.0234, "step": 7140 }, { "epoch": 1.323719726381956, "grad_norm": 187349.546875, "learning_rate": 3.3814013680902203e-06, "loss": 0.0231, "step": 7160 }, { "epoch": 1.3274172675171012, "grad_norm": 91186.453125, "learning_rate": 3.362913662414495e-06, "loss": 0.0216, "step": 7180 }, { "epoch": 1.3311148086522462, "grad_norm": 197646.78125, "learning_rate": 3.344425956738769e-06, "loss": 0.0216, "step": 7200 }, { "epoch": 1.3348123497873914, "grad_norm": 113568.28125, "learning_rate": 3.3259382510630435e-06, "loss": 0.0167, "step": 7220 }, { "epoch": 1.3385098909225366, "grad_norm": 171078.3125, "learning_rate": 3.307450545387318e-06, "loss": 0.0245, "step": 7240 }, { "epoch": 1.3422074320576816, "grad_norm": 280644.15625, "learning_rate": 3.2889628397115917e-06, "loss": 0.0196, "step": 7260 }, { "epoch": 1.3459049731928268, "grad_norm": 107269.703125, "learning_rate": 3.2704751340358664e-06, "loss": 0.0191, "step": 7280 }, { "epoch": 1.349602514327972, "grad_norm": 152088.6875, "learning_rate": 3.251987428360141e-06, "loss": 0.0214, "step": 7300 }, { "epoch": 1.3533000554631172, "grad_norm": 169087.515625, "learning_rate": 3.233499722684415e-06, "loss": 0.0226, "step": 7320 }, { "epoch": 1.3569975965982621, "grad_norm": 121175.640625, "learning_rate": 3.2150120170086897e-06, "loss": 0.0209, "step": 7340 }, { "epoch": 1.3606951377334073, "grad_norm": 149657.40625, "learning_rate": 3.1965243113329636e-06, "loss": 0.0188, "step": 7360 }, { "epoch": 1.3643926788685525, "grad_norm": 102583.3125, "learning_rate": 3.1780366056572383e-06, "loss": 0.0227, "step": 7380 }, { "epoch": 1.3680902200036975, "grad_norm": 220809.03125, "learning_rate": 3.1595488999815126e-06, "loss": 0.0184, "step": 7400 }, { "epoch": 1.3717877611388427, "grad_norm": 164637.875, "learning_rate": 3.141061194305787e-06, "loss": 0.0206, "step": 7420 }, { "epoch": 1.375485302273988, "grad_norm": 160252.15625, "learning_rate": 3.122573488630061e-06, "loss": 0.0179, "step": 7440 }, { "epoch": 1.3791828434091329, "grad_norm": 113583.3984375, "learning_rate": 3.104085782954336e-06, "loss": 0.0191, "step": 7460 }, { "epoch": 1.382880384544278, "grad_norm": 171301.390625, "learning_rate": 3.0855980772786097e-06, "loss": 0.0189, "step": 7480 }, { "epoch": 1.3865779256794233, "grad_norm": 141760.046875, "learning_rate": 3.0671103716028845e-06, "loss": 0.0189, "step": 7500 }, { "epoch": 1.3902754668145683, "grad_norm": 164532.734375, "learning_rate": 3.0486226659271587e-06, "loss": 0.0211, "step": 7520 }, { "epoch": 1.3939730079497135, "grad_norm": 201083.53125, "learning_rate": 3.030134960251433e-06, "loss": 0.0203, "step": 7540 }, { "epoch": 1.3976705490848587, "grad_norm": 114150.078125, "learning_rate": 3.0116472545757073e-06, "loss": 0.0202, "step": 7560 }, { "epoch": 1.4013680902200036, "grad_norm": 105227.3046875, "learning_rate": 2.9931595488999816e-06, "loss": 0.0179, "step": 7580 }, { "epoch": 1.4050656313551488, "grad_norm": 137791.65625, "learning_rate": 2.974671843224256e-06, "loss": 0.0178, "step": 7600 }, { "epoch": 1.408763172490294, "grad_norm": 93458.9921875, "learning_rate": 2.9561841375485306e-06, "loss": 0.0188, "step": 7620 }, { "epoch": 1.412460713625439, "grad_norm": 132947.59375, "learning_rate": 2.9376964318728045e-06, "loss": 0.0231, "step": 7640 }, { "epoch": 1.4161582547605842, "grad_norm": 137827.40625, "learning_rate": 2.919208726197079e-06, "loss": 0.0188, "step": 7660 }, { "epoch": 1.4198557958957294, "grad_norm": 221912.953125, "learning_rate": 2.900721020521354e-06, "loss": 0.0192, "step": 7680 }, { "epoch": 1.4235533370308744, "grad_norm": 167596.15625, "learning_rate": 2.8822333148456278e-06, "loss": 0.0188, "step": 7700 }, { "epoch": 1.4272508781660196, "grad_norm": 222441.71875, "learning_rate": 2.8637456091699025e-06, "loss": 0.0215, "step": 7720 }, { "epoch": 1.4309484193011648, "grad_norm": 133030.28125, "learning_rate": 2.8452579034941768e-06, "loss": 0.0175, "step": 7740 }, { "epoch": 1.4346459604363098, "grad_norm": 170401.625, "learning_rate": 2.826770197818451e-06, "loss": 0.0184, "step": 7760 }, { "epoch": 1.438343501571455, "grad_norm": 249026.03125, "learning_rate": 2.8082824921427254e-06, "loss": 0.0187, "step": 7780 }, { "epoch": 1.4420410427066002, "grad_norm": 153504.90625, "learning_rate": 2.7897947864669992e-06, "loss": 0.0184, "step": 7800 }, { "epoch": 1.4457385838417451, "grad_norm": 116464.4765625, "learning_rate": 2.771307080791274e-06, "loss": 0.0195, "step": 7820 }, { "epoch": 1.4494361249768903, "grad_norm": 134998.984375, "learning_rate": 2.7528193751155486e-06, "loss": 0.0191, "step": 7840 }, { "epoch": 1.4531336661120355, "grad_norm": 177865.984375, "learning_rate": 2.7343316694398225e-06, "loss": 0.0169, "step": 7860 }, { "epoch": 1.4568312072471805, "grad_norm": 217818.40625, "learning_rate": 2.7158439637640972e-06, "loss": 0.0165, "step": 7880 }, { "epoch": 1.4605287483823257, "grad_norm": 267455.9375, "learning_rate": 2.6973562580883715e-06, "loss": 0.0202, "step": 7900 }, { "epoch": 1.464226289517471, "grad_norm": 91404.4296875, "learning_rate": 2.678868552412646e-06, "loss": 0.017, "step": 7920 }, { "epoch": 1.467923830652616, "grad_norm": 253760.109375, "learning_rate": 2.66038084673692e-06, "loss": 0.0157, "step": 7940 }, { "epoch": 1.471621371787761, "grad_norm": 108044.6015625, "learning_rate": 2.641893141061195e-06, "loss": 0.0187, "step": 7960 }, { "epoch": 1.4753189129229063, "grad_norm": 114390.0, "learning_rate": 2.6234054353854687e-06, "loss": 0.0168, "step": 7980 }, { "epoch": 1.4790164540580513, "grad_norm": 213150.25, "learning_rate": 2.6049177297097434e-06, "loss": 0.0165, "step": 8000 }, { "epoch": 1.4827139951931965, "grad_norm": 124014.53125, "learning_rate": 2.5864300240340173e-06, "loss": 0.0138, "step": 8020 }, { "epoch": 1.4864115363283417, "grad_norm": 125034.8984375, "learning_rate": 2.567942318358292e-06, "loss": 0.0151, "step": 8040 }, { "epoch": 1.4901090774634869, "grad_norm": 93910.4453125, "learning_rate": 2.5494546126825663e-06, "loss": 0.0175, "step": 8060 }, { "epoch": 1.4938066185986318, "grad_norm": 99099.5390625, "learning_rate": 2.5309669070068405e-06, "loss": 0.017, "step": 8080 }, { "epoch": 1.497504159733777, "grad_norm": 101766.75, "learning_rate": 2.512479201331115e-06, "loss": 0.0158, "step": 8100 }, { "epoch": 1.501201700868922, "grad_norm": 146969.875, "learning_rate": 2.493991495655389e-06, "loss": 0.0171, "step": 8120 }, { "epoch": 1.5048992420040674, "grad_norm": 113886.953125, "learning_rate": 2.475503789979664e-06, "loss": 0.0157, "step": 8140 }, { "epoch": 1.5085967831392124, "grad_norm": 310139.90625, "learning_rate": 2.457016084303938e-06, "loss": 0.0163, "step": 8160 }, { "epoch": 1.5122943242743574, "grad_norm": 78983.7890625, "learning_rate": 2.4385283786282124e-06, "loss": 0.0177, "step": 8180 }, { "epoch": 1.5159918654095028, "grad_norm": 155035.984375, "learning_rate": 2.4200406729524867e-06, "loss": 0.0172, "step": 8200 }, { "epoch": 1.5196894065446478, "grad_norm": 111692.59375, "learning_rate": 2.4015529672767614e-06, "loss": 0.0135, "step": 8220 }, { "epoch": 1.523386947679793, "grad_norm": 89581.5546875, "learning_rate": 2.3830652616010357e-06, "loss": 0.016, "step": 8240 }, { "epoch": 1.5270844888149382, "grad_norm": 74326.9140625, "learning_rate": 2.36457755592531e-06, "loss": 0.0146, "step": 8260 }, { "epoch": 1.5307820299500832, "grad_norm": 86257.5625, "learning_rate": 2.3460898502495843e-06, "loss": 0.0158, "step": 8280 }, { "epoch": 1.5344795710852284, "grad_norm": 204998.53125, "learning_rate": 2.3276021445738586e-06, "loss": 0.0166, "step": 8300 }, { "epoch": 1.5381771122203736, "grad_norm": 131477.296875, "learning_rate": 2.309114438898133e-06, "loss": 0.0163, "step": 8320 }, { "epoch": 1.5418746533555185, "grad_norm": 62000.34375, "learning_rate": 2.290626733222407e-06, "loss": 0.0141, "step": 8340 }, { "epoch": 1.5455721944906637, "grad_norm": 131954.5, "learning_rate": 2.272139027546682e-06, "loss": 0.0143, "step": 8360 }, { "epoch": 1.549269735625809, "grad_norm": 176431.921875, "learning_rate": 2.253651321870956e-06, "loss": 0.014, "step": 8380 }, { "epoch": 1.552967276760954, "grad_norm": 168082.734375, "learning_rate": 2.2351636161952305e-06, "loss": 0.0163, "step": 8400 }, { "epoch": 1.5566648178960991, "grad_norm": 213557.40625, "learning_rate": 2.2166759105195047e-06, "loss": 0.0138, "step": 8420 }, { "epoch": 1.5603623590312443, "grad_norm": 149194.671875, "learning_rate": 2.198188204843779e-06, "loss": 0.0145, "step": 8440 }, { "epoch": 1.5640599001663893, "grad_norm": 108979.765625, "learning_rate": 2.1797004991680533e-06, "loss": 0.015, "step": 8460 }, { "epoch": 1.5677574413015345, "grad_norm": 113606.09375, "learning_rate": 2.1612127934923276e-06, "loss": 0.0148, "step": 8480 }, { "epoch": 1.5714549824366797, "grad_norm": 54511.078125, "learning_rate": 2.1427250878166023e-06, "loss": 0.013, "step": 8500 }, { "epoch": 1.5751525235718247, "grad_norm": 223038.484375, "learning_rate": 2.1242373821408766e-06, "loss": 0.0123, "step": 8520 }, { "epoch": 1.5788500647069699, "grad_norm": 81127.296875, "learning_rate": 2.105749676465151e-06, "loss": 0.0179, "step": 8540 }, { "epoch": 1.582547605842115, "grad_norm": 205924.984375, "learning_rate": 2.087261970789425e-06, "loss": 0.0139, "step": 8560 }, { "epoch": 1.58624514697726, "grad_norm": 136897.609375, "learning_rate": 2.0687742651136995e-06, "loss": 0.013, "step": 8580 }, { "epoch": 1.5899426881124052, "grad_norm": 74720.484375, "learning_rate": 2.0502865594379738e-06, "loss": 0.0144, "step": 8600 }, { "epoch": 1.5936402292475504, "grad_norm": 131154.421875, "learning_rate": 2.031798853762248e-06, "loss": 0.0141, "step": 8620 }, { "epoch": 1.5973377703826954, "grad_norm": 141421.515625, "learning_rate": 2.0133111480865224e-06, "loss": 0.0141, "step": 8640 }, { "epoch": 1.6010353115178406, "grad_norm": 95285.7890625, "learning_rate": 1.994823442410797e-06, "loss": 0.0149, "step": 8660 }, { "epoch": 1.6047328526529858, "grad_norm": 259090.546875, "learning_rate": 1.9763357367350714e-06, "loss": 0.0143, "step": 8680 }, { "epoch": 1.6084303937881308, "grad_norm": 129890.265625, "learning_rate": 1.9578480310593456e-06, "loss": 0.0124, "step": 8700 }, { "epoch": 1.612127934923276, "grad_norm": 199939.984375, "learning_rate": 1.9393603253836204e-06, "loss": 0.0124, "step": 8720 }, { "epoch": 1.6158254760584212, "grad_norm": 105627.1640625, "learning_rate": 1.9208726197078946e-06, "loss": 0.0146, "step": 8740 }, { "epoch": 1.6195230171935662, "grad_norm": 111494.3125, "learning_rate": 1.9023849140321687e-06, "loss": 0.0118, "step": 8760 }, { "epoch": 1.6232205583287114, "grad_norm": 162020.4375, "learning_rate": 1.883897208356443e-06, "loss": 0.0133, "step": 8780 }, { "epoch": 1.6269180994638566, "grad_norm": 151192.859375, "learning_rate": 1.8654095026807175e-06, "loss": 0.0146, "step": 8800 }, { "epoch": 1.6306156405990015, "grad_norm": 92355.328125, "learning_rate": 1.8469217970049918e-06, "loss": 0.0125, "step": 8820 }, { "epoch": 1.6343131817341467, "grad_norm": 86571.875, "learning_rate": 1.828434091329266e-06, "loss": 0.0128, "step": 8840 }, { "epoch": 1.638010722869292, "grad_norm": 155513.75, "learning_rate": 1.8099463856535404e-06, "loss": 0.0149, "step": 8860 }, { "epoch": 1.641708264004437, "grad_norm": 140762.703125, "learning_rate": 1.7914586799778149e-06, "loss": 0.0141, "step": 8880 }, { "epoch": 1.6454058051395821, "grad_norm": 129466.796875, "learning_rate": 1.7729709743020892e-06, "loss": 0.0122, "step": 8900 }, { "epoch": 1.6491033462747273, "grad_norm": 134758.546875, "learning_rate": 1.7544832686263635e-06, "loss": 0.0133, "step": 8920 }, { "epoch": 1.6528008874098723, "grad_norm": 81380.0546875, "learning_rate": 1.7359955629506382e-06, "loss": 0.0131, "step": 8940 }, { "epoch": 1.6564984285450177, "grad_norm": 102433.5234375, "learning_rate": 1.7175078572749125e-06, "loss": 0.0126, "step": 8960 }, { "epoch": 1.6601959696801627, "grad_norm": 95113.1640625, "learning_rate": 1.6990201515991865e-06, "loss": 0.0133, "step": 8980 }, { "epoch": 1.6638935108153077, "grad_norm": 194246.734375, "learning_rate": 1.6805324459234608e-06, "loss": 0.0116, "step": 9000 }, { "epoch": 1.667591051950453, "grad_norm": 128436.3125, "learning_rate": 1.6620447402477356e-06, "loss": 0.0139, "step": 9020 }, { "epoch": 1.671288593085598, "grad_norm": 93527.5546875, "learning_rate": 1.6435570345720098e-06, "loss": 0.0112, "step": 9040 }, { "epoch": 1.674986134220743, "grad_norm": 85030.171875, "learning_rate": 1.6250693288962841e-06, "loss": 0.0117, "step": 9060 }, { "epoch": 1.6786836753558885, "grad_norm": 193011.28125, "learning_rate": 1.6065816232205584e-06, "loss": 0.0122, "step": 9080 }, { "epoch": 1.6823812164910334, "grad_norm": 167224.15625, "learning_rate": 1.588093917544833e-06, "loss": 0.0137, "step": 9100 }, { "epoch": 1.6860787576261784, "grad_norm": 97394.6328125, "learning_rate": 1.5696062118691072e-06, "loss": 0.0114, "step": 9120 }, { "epoch": 1.6897762987613238, "grad_norm": 61188.08203125, "learning_rate": 1.5511185061933815e-06, "loss": 0.0123, "step": 9140 }, { "epoch": 1.6934738398964688, "grad_norm": 109640.2265625, "learning_rate": 1.532630800517656e-06, "loss": 0.0119, "step": 9160 }, { "epoch": 1.697171381031614, "grad_norm": 96847.4296875, "learning_rate": 1.5141430948419303e-06, "loss": 0.0109, "step": 9180 }, { "epoch": 1.7008689221667592, "grad_norm": 138873.15625, "learning_rate": 1.4956553891662046e-06, "loss": 0.0125, "step": 9200 }, { "epoch": 1.7045664633019042, "grad_norm": 92997.3203125, "learning_rate": 1.4771676834904789e-06, "loss": 0.0115, "step": 9220 }, { "epoch": 1.7082640044370494, "grad_norm": 108250.3359375, "learning_rate": 1.4586799778147534e-06, "loss": 0.0116, "step": 9240 }, { "epoch": 1.7119615455721946, "grad_norm": 96745.375, "learning_rate": 1.4401922721390277e-06, "loss": 0.0108, "step": 9260 }, { "epoch": 1.7156590867073396, "grad_norm": 71503.3515625, "learning_rate": 1.421704566463302e-06, "loss": 0.0117, "step": 9280 }, { "epoch": 1.7193566278424848, "grad_norm": 131563.34375, "learning_rate": 1.4032168607875762e-06, "loss": 0.0111, "step": 9300 }, { "epoch": 1.72305416897763, "grad_norm": 152288.3125, "learning_rate": 1.3847291551118507e-06, "loss": 0.0131, "step": 9320 }, { "epoch": 1.726751710112775, "grad_norm": 188523.875, "learning_rate": 1.366241449436125e-06, "loss": 0.0117, "step": 9340 }, { "epoch": 1.7304492512479202, "grad_norm": 135298.65625, "learning_rate": 1.3477537437603993e-06, "loss": 0.0098, "step": 9360 }, { "epoch": 1.7341467923830653, "grad_norm": 73664.140625, "learning_rate": 1.3292660380846738e-06, "loss": 0.0113, "step": 9380 }, { "epoch": 1.7378443335182103, "grad_norm": 157000.625, "learning_rate": 1.3107783324089481e-06, "loss": 0.0111, "step": 9400 }, { "epoch": 1.7415418746533555, "grad_norm": 114460.046875, "learning_rate": 1.2922906267332224e-06, "loss": 0.0101, "step": 9420 }, { "epoch": 1.7452394157885007, "grad_norm": 86580.3046875, "learning_rate": 1.2738029210574967e-06, "loss": 0.0109, "step": 9440 }, { "epoch": 1.7489369569236457, "grad_norm": 105799.8984375, "learning_rate": 1.2553152153817714e-06, "loss": 0.0113, "step": 9460 }, { "epoch": 1.752634498058791, "grad_norm": 141433.953125, "learning_rate": 1.2368275097060457e-06, "loss": 0.0104, "step": 9480 }, { "epoch": 1.756332039193936, "grad_norm": 232147.828125, "learning_rate": 1.2183398040303198e-06, "loss": 0.0105, "step": 9500 }, { "epoch": 1.760029580329081, "grad_norm": 91419.984375, "learning_rate": 1.1998520983545943e-06, "loss": 0.0099, "step": 9520 }, { "epoch": 1.7637271214642263, "grad_norm": 105158.8515625, "learning_rate": 1.1813643926788688e-06, "loss": 0.0086, "step": 9540 }, { "epoch": 1.7674246625993715, "grad_norm": 99868.90625, "learning_rate": 1.162876687003143e-06, "loss": 0.0103, "step": 9560 }, { "epoch": 1.7711222037345165, "grad_norm": 91508.3046875, "learning_rate": 1.1443889813274174e-06, "loss": 0.0097, "step": 9580 }, { "epoch": 1.7748197448696617, "grad_norm": 124763.6640625, "learning_rate": 1.1259012756516916e-06, "loss": 0.0101, "step": 9600 }, { "epoch": 1.7785172860048069, "grad_norm": 68321.0078125, "learning_rate": 1.1074135699759661e-06, "loss": 0.0106, "step": 9620 }, { "epoch": 1.7822148271399518, "grad_norm": 54323.26171875, "learning_rate": 1.0889258643002404e-06, "loss": 0.0104, "step": 9640 }, { "epoch": 1.785912368275097, "grad_norm": 170615.03125, "learning_rate": 1.0704381586245147e-06, "loss": 0.009, "step": 9660 }, { "epoch": 1.7896099094102422, "grad_norm": 54507.4921875, "learning_rate": 1.051950452948789e-06, "loss": 0.0113, "step": 9680 }, { "epoch": 1.7933074505453872, "grad_norm": 224390.078125, "learning_rate": 1.0334627472730635e-06, "loss": 0.0081, "step": 9700 }, { "epoch": 1.7970049916805324, "grad_norm": 114423.484375, "learning_rate": 1.0149750415973378e-06, "loss": 0.0092, "step": 9720 }, { "epoch": 1.8007025328156776, "grad_norm": 40952.4609375, "learning_rate": 9.96487335921612e-07, "loss": 0.008, "step": 9740 }, { "epoch": 1.8044000739508226, "grad_norm": 125467.2421875, "learning_rate": 9.779996302458866e-07, "loss": 0.012, "step": 9760 }, { "epoch": 1.8080976150859678, "grad_norm": 67106.8046875, "learning_rate": 9.595119245701609e-07, "loss": 0.0093, "step": 9780 }, { "epoch": 1.811795156221113, "grad_norm": 53429.3359375, "learning_rate": 9.410242188944353e-07, "loss": 0.0099, "step": 9800 }, { "epoch": 1.815492697356258, "grad_norm": 66503.3515625, "learning_rate": 9.225365132187096e-07, "loss": 0.0098, "step": 9820 }, { "epoch": 1.8191902384914034, "grad_norm": 86430.2734375, "learning_rate": 9.04048807542984e-07, "loss": 0.009, "step": 9840 }, { "epoch": 1.8228877796265484, "grad_norm": 165746.984375, "learning_rate": 8.855611018672583e-07, "loss": 0.0091, "step": 9860 }, { "epoch": 1.8265853207616933, "grad_norm": 105324.359375, "learning_rate": 8.670733961915328e-07, "loss": 0.0091, "step": 9880 }, { "epoch": 1.8302828618968388, "grad_norm": 89842.6015625, "learning_rate": 8.485856905158069e-07, "loss": 0.0092, "step": 9900 }, { "epoch": 1.8339804030319837, "grad_norm": 206000.046875, "learning_rate": 8.300979848400814e-07, "loss": 0.0098, "step": 9920 }, { "epoch": 1.8376779441671287, "grad_norm": 79479.296875, "learning_rate": 8.116102791643558e-07, "loss": 0.0081, "step": 9940 }, { "epoch": 1.8413754853022741, "grad_norm": 42255.9140625, "learning_rate": 7.931225734886301e-07, "loss": 0.0092, "step": 9960 }, { "epoch": 1.845073026437419, "grad_norm": 48692.4453125, "learning_rate": 7.746348678129045e-07, "loss": 0.0096, "step": 9980 }, { "epoch": 1.848770567572564, "grad_norm": 60314.51171875, "learning_rate": 7.561471621371788e-07, "loss": 0.0087, "step": 10000 }, { "epoch": 1.8524681087077095, "grad_norm": 67695.015625, "learning_rate": 7.376594564614532e-07, "loss": 0.0095, "step": 10020 }, { "epoch": 1.8561656498428545, "grad_norm": 57189.34765625, "learning_rate": 7.191717507857275e-07, "loss": 0.008, "step": 10040 }, { "epoch": 1.8598631909779997, "grad_norm": 47331.48046875, "learning_rate": 7.006840451100019e-07, "loss": 0.0087, "step": 10060 }, { "epoch": 1.8635607321131449, "grad_norm": 78001.078125, "learning_rate": 6.821963394342762e-07, "loss": 0.0086, "step": 10080 }, { "epoch": 1.8672582732482899, "grad_norm": 134432.046875, "learning_rate": 6.637086337585506e-07, "loss": 0.0101, "step": 10100 }, { "epoch": 1.870955814383435, "grad_norm": 70737.28125, "learning_rate": 6.452209280828249e-07, "loss": 0.0091, "step": 10120 }, { "epoch": 1.8746533555185803, "grad_norm": 100655.2578125, "learning_rate": 6.267332224070994e-07, "loss": 0.0072, "step": 10140 }, { "epoch": 1.8783508966537252, "grad_norm": 86067.2890625, "learning_rate": 6.082455167313737e-07, "loss": 0.0081, "step": 10160 }, { "epoch": 1.8820484377888704, "grad_norm": 113390.1640625, "learning_rate": 5.897578110556481e-07, "loss": 0.0096, "step": 10180 }, { "epoch": 1.8857459789240156, "grad_norm": 148825.921875, "learning_rate": 5.712701053799224e-07, "loss": 0.0102, "step": 10200 }, { "epoch": 1.8894435200591606, "grad_norm": 81128.765625, "learning_rate": 5.527823997041967e-07, "loss": 0.0089, "step": 10220 }, { "epoch": 1.8931410611943058, "grad_norm": 91735.953125, "learning_rate": 5.342946940284711e-07, "loss": 0.0081, "step": 10240 }, { "epoch": 1.896838602329451, "grad_norm": 171614.328125, "learning_rate": 5.158069883527455e-07, "loss": 0.0078, "step": 10260 }, { "epoch": 1.900536143464596, "grad_norm": 136330.6875, "learning_rate": 4.973192826770198e-07, "loss": 0.0075, "step": 10280 }, { "epoch": 1.9042336845997412, "grad_norm": 72486.4453125, "learning_rate": 4.788315770012942e-07, "loss": 0.0096, "step": 10300 }, { "epoch": 1.9079312257348864, "grad_norm": 119471.4453125, "learning_rate": 4.6034387132556857e-07, "loss": 0.0079, "step": 10320 }, { "epoch": 1.9116287668700314, "grad_norm": 83491.46875, "learning_rate": 4.418561656498429e-07, "loss": 0.0087, "step": 10340 }, { "epoch": 1.9153263080051766, "grad_norm": 161448.078125, "learning_rate": 4.2336845997411725e-07, "loss": 0.0088, "step": 10360 }, { "epoch": 1.9190238491403218, "grad_norm": 190009.40625, "learning_rate": 4.048807542983916e-07, "loss": 0.009, "step": 10380 }, { "epoch": 1.9227213902754667, "grad_norm": 105300.2421875, "learning_rate": 3.8639304862266594e-07, "loss": 0.0087, "step": 10400 }, { "epoch": 1.926418931410612, "grad_norm": 59896.84375, "learning_rate": 3.679053429469403e-07, "loss": 0.0082, "step": 10420 }, { "epoch": 1.9301164725457571, "grad_norm": 86433.484375, "learning_rate": 3.494176372712146e-07, "loss": 0.008, "step": 10440 }, { "epoch": 1.933814013680902, "grad_norm": 82433.390625, "learning_rate": 3.3092993159548907e-07, "loss": 0.0073, "step": 10460 }, { "epoch": 1.9375115548160473, "grad_norm": 158433.75, "learning_rate": 3.1244222591976336e-07, "loss": 0.0087, "step": 10480 }, { "epoch": 1.9412090959511925, "grad_norm": 74148.828125, "learning_rate": 2.9395452024403776e-07, "loss": 0.0078, "step": 10500 }, { "epoch": 1.9449066370863375, "grad_norm": 122634.9140625, "learning_rate": 2.754668145683121e-07, "loss": 0.0088, "step": 10520 }, { "epoch": 1.9486041782214827, "grad_norm": 81306.5703125, "learning_rate": 2.5697910889258644e-07, "loss": 0.0082, "step": 10540 }, { "epoch": 1.9523017193566279, "grad_norm": 62461.99609375, "learning_rate": 2.384914032168608e-07, "loss": 0.0074, "step": 10560 }, { "epoch": 1.9559992604917729, "grad_norm": 293536.875, "learning_rate": 2.2000369754113515e-07, "loss": 0.0076, "step": 10580 }, { "epoch": 1.959696801626918, "grad_norm": 98041.9765625, "learning_rate": 2.015159918654095e-07, "loss": 0.009, "step": 10600 }, { "epoch": 1.9633943427620633, "grad_norm": 102565.984375, "learning_rate": 1.830282861896839e-07, "loss": 0.0083, "step": 10620 }, { "epoch": 1.9670918838972082, "grad_norm": 53710.390625, "learning_rate": 1.6454058051395823e-07, "loss": 0.007, "step": 10640 }, { "epoch": 1.9707894250323534, "grad_norm": 150167.515625, "learning_rate": 1.4605287483823258e-07, "loss": 0.0072, "step": 10660 }, { "epoch": 1.9744869661674986, "grad_norm": 134856.953125, "learning_rate": 1.2756516916250695e-07, "loss": 0.0095, "step": 10680 }, { "epoch": 1.9781845073026436, "grad_norm": 86079.8125, "learning_rate": 1.090774634867813e-07, "loss": 0.0075, "step": 10700 }, { "epoch": 1.981882048437789, "grad_norm": 142126.03125, "learning_rate": 9.058975781105564e-08, "loss": 0.0072, "step": 10720 }, { "epoch": 1.985579589572934, "grad_norm": 62874.44140625, "learning_rate": 7.210205213533001e-08, "loss": 0.0071, "step": 10740 }, { "epoch": 1.989277130708079, "grad_norm": 87455.859375, "learning_rate": 5.361434645960437e-08, "loss": 0.0082, "step": 10760 }, { "epoch": 1.9929746718432244, "grad_norm": 109955.4140625, "learning_rate": 3.5126640783878725e-08, "loss": 0.0082, "step": 10780 }, { "epoch": 1.9966722129783694, "grad_norm": 66628.1484375, "learning_rate": 1.663893510815308e-08, "loss": 0.0076, "step": 10800 } ], "logging_steps": 20, "max_steps": 10818, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.922032294823199e+17, "train_batch_size": 200, "trial_name": null, "trial_params": null }