{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 438, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00684931506849315, "grad_norm": 5.92502498626709, "learning_rate": 2.2727272727272729e-07, "loss": 0.8592, "step": 1 }, { "epoch": 0.0136986301369863, "grad_norm": 5.975358009338379, "learning_rate": 4.5454545454545457e-07, "loss": 0.8441, "step": 2 }, { "epoch": 0.02054794520547945, "grad_norm": 6.038137435913086, "learning_rate": 6.818181818181818e-07, "loss": 0.8847, "step": 3 }, { "epoch": 0.0273972602739726, "grad_norm": 6.023561000823975, "learning_rate": 9.090909090909091e-07, "loss": 0.8802, "step": 4 }, { "epoch": 0.03424657534246575, "grad_norm": 5.958974361419678, "learning_rate": 1.1363636363636364e-06, "loss": 0.9015, "step": 5 }, { "epoch": 0.0410958904109589, "grad_norm": 5.522011756896973, "learning_rate": 1.3636363636363636e-06, "loss": 0.8771, "step": 6 }, { "epoch": 0.04794520547945205, "grad_norm": 5.2923994064331055, "learning_rate": 1.590909090909091e-06, "loss": 0.8668, "step": 7 }, { "epoch": 0.0547945205479452, "grad_norm": 4.468110084533691, "learning_rate": 1.8181818181818183e-06, "loss": 0.8269, "step": 8 }, { "epoch": 0.06164383561643835, "grad_norm": 3.9916787147521973, "learning_rate": 2.0454545454545457e-06, "loss": 0.7879, "step": 9 }, { "epoch": 0.0684931506849315, "grad_norm": 2.3367724418640137, "learning_rate": 2.2727272727272728e-06, "loss": 0.7631, "step": 10 }, { "epoch": 0.07534246575342465, "grad_norm": 2.0509274005889893, "learning_rate": 2.5e-06, "loss": 0.7555, "step": 11 }, { "epoch": 0.0821917808219178, "grad_norm": 1.9280574321746826, "learning_rate": 2.7272727272727272e-06, "loss": 0.7413, "step": 12 }, { "epoch": 0.08904109589041095, "grad_norm": 1.8343186378479004, "learning_rate": 2.954545454545455e-06, "loss": 0.7588, "step": 13 }, { "epoch": 0.0958904109589041, "grad_norm": 3.5027122497558594, "learning_rate": 3.181818181818182e-06, "loss": 0.7552, "step": 14 }, { "epoch": 0.10273972602739725, "grad_norm": 3.766814947128296, "learning_rate": 3.409090909090909e-06, "loss": 0.7653, "step": 15 }, { "epoch": 0.1095890410958904, "grad_norm": 3.9283227920532227, "learning_rate": 3.6363636363636366e-06, "loss": 0.7895, "step": 16 }, { "epoch": 0.11643835616438356, "grad_norm": 3.853430986404419, "learning_rate": 3.863636363636364e-06, "loss": 0.7636, "step": 17 }, { "epoch": 0.1232876712328767, "grad_norm": 3.0980968475341797, "learning_rate": 4.0909090909090915e-06, "loss": 0.7197, "step": 18 }, { "epoch": 0.13013698630136986, "grad_norm": 2.2356159687042236, "learning_rate": 4.3181818181818185e-06, "loss": 0.6567, "step": 19 }, { "epoch": 0.136986301369863, "grad_norm": 2.1344797611236572, "learning_rate": 4.5454545454545455e-06, "loss": 0.7121, "step": 20 }, { "epoch": 0.14383561643835616, "grad_norm": 1.604323148727417, "learning_rate": 4.772727272727273e-06, "loss": 0.7, "step": 21 }, { "epoch": 0.1506849315068493, "grad_norm": 1.3561691045761108, "learning_rate": 5e-06, "loss": 0.6897, "step": 22 }, { "epoch": 0.15753424657534246, "grad_norm": 1.3645248413085938, "learning_rate": 5.2272727272727274e-06, "loss": 0.668, "step": 23 }, { "epoch": 0.1643835616438356, "grad_norm": 1.4382814168930054, "learning_rate": 5.4545454545454545e-06, "loss": 0.6585, "step": 24 }, { "epoch": 0.17123287671232876, "grad_norm": 1.3407988548278809, "learning_rate": 5.681818181818183e-06, "loss": 0.6639, "step": 25 }, { "epoch": 0.1780821917808219, "grad_norm": 1.243182897567749, "learning_rate": 5.90909090909091e-06, "loss": 0.6519, "step": 26 }, { "epoch": 0.18493150684931506, "grad_norm": 0.9758120775222778, "learning_rate": 6.136363636363637e-06, "loss": 0.6133, "step": 27 }, { "epoch": 0.1917808219178082, "grad_norm": 0.9279192686080933, "learning_rate": 6.363636363636364e-06, "loss": 0.6404, "step": 28 }, { "epoch": 0.19863013698630136, "grad_norm": 1.1322438716888428, "learning_rate": 6.590909090909091e-06, "loss": 0.6385, "step": 29 }, { "epoch": 0.2054794520547945, "grad_norm": 1.0254402160644531, "learning_rate": 6.818181818181818e-06, "loss": 0.6447, "step": 30 }, { "epoch": 0.21232876712328766, "grad_norm": 0.7659736275672913, "learning_rate": 7.045454545454546e-06, "loss": 0.6244, "step": 31 }, { "epoch": 0.2191780821917808, "grad_norm": 0.8017905354499817, "learning_rate": 7.272727272727273e-06, "loss": 0.6137, "step": 32 }, { "epoch": 0.22602739726027396, "grad_norm": 0.8775509595870972, "learning_rate": 7.500000000000001e-06, "loss": 0.6211, "step": 33 }, { "epoch": 0.2328767123287671, "grad_norm": 0.8729256987571716, "learning_rate": 7.727272727272727e-06, "loss": 0.6034, "step": 34 }, { "epoch": 0.23972602739726026, "grad_norm": 0.6592076420783997, "learning_rate": 7.954545454545455e-06, "loss": 0.6151, "step": 35 }, { "epoch": 0.2465753424657534, "grad_norm": 0.5613800883293152, "learning_rate": 8.181818181818183e-06, "loss": 0.5855, "step": 36 }, { "epoch": 0.2534246575342466, "grad_norm": 0.6701193451881409, "learning_rate": 8.40909090909091e-06, "loss": 0.6001, "step": 37 }, { "epoch": 0.2602739726027397, "grad_norm": 0.708198070526123, "learning_rate": 8.636363636363637e-06, "loss": 0.5876, "step": 38 }, { "epoch": 0.2671232876712329, "grad_norm": 0.5600025653839111, "learning_rate": 8.863636363636365e-06, "loss": 0.5883, "step": 39 }, { "epoch": 0.273972602739726, "grad_norm": 0.5653926134109497, "learning_rate": 9.090909090909091e-06, "loss": 0.5614, "step": 40 }, { "epoch": 0.2808219178082192, "grad_norm": 0.7606906890869141, "learning_rate": 9.318181818181819e-06, "loss": 0.6156, "step": 41 }, { "epoch": 0.2876712328767123, "grad_norm": 0.5844655632972717, "learning_rate": 9.545454545454547e-06, "loss": 0.586, "step": 42 }, { "epoch": 0.2945205479452055, "grad_norm": 0.49894917011260986, "learning_rate": 9.772727272727273e-06, "loss": 0.5747, "step": 43 }, { "epoch": 0.3013698630136986, "grad_norm": 0.5528267621994019, "learning_rate": 1e-05, "loss": 0.5419, "step": 44 }, { "epoch": 0.3082191780821918, "grad_norm": 0.6437212228775024, "learning_rate": 9.999841055681184e-06, "loss": 0.568, "step": 45 }, { "epoch": 0.3150684931506849, "grad_norm": 0.5419601798057556, "learning_rate": 9.999364232830053e-06, "loss": 0.5715, "step": 46 }, { "epoch": 0.3219178082191781, "grad_norm": 0.4963567554950714, "learning_rate": 9.99856956176192e-06, "loss": 0.5492, "step": 47 }, { "epoch": 0.3287671232876712, "grad_norm": 0.7037913203239441, "learning_rate": 9.997457093000165e-06, "loss": 0.5849, "step": 48 }, { "epoch": 0.3356164383561644, "grad_norm": 0.7691269516944885, "learning_rate": 9.996026897273024e-06, "loss": 0.585, "step": 49 }, { "epoch": 0.3424657534246575, "grad_norm": 0.4761511981487274, "learning_rate": 9.994279065509094e-06, "loss": 0.5813, "step": 50 }, { "epoch": 0.3493150684931507, "grad_norm": 0.60807204246521, "learning_rate": 9.992213708831542e-06, "loss": 0.5576, "step": 51 }, { "epoch": 0.3561643835616438, "grad_norm": 0.7713252902030945, "learning_rate": 9.989830958551058e-06, "loss": 0.5776, "step": 52 }, { "epoch": 0.363013698630137, "grad_norm": 0.5777241587638855, "learning_rate": 9.987130966157486e-06, "loss": 0.5973, "step": 53 }, { "epoch": 0.3698630136986301, "grad_norm": 0.4666934907436371, "learning_rate": 9.984113903310206e-06, "loss": 0.5383, "step": 54 }, { "epoch": 0.3767123287671233, "grad_norm": 0.6798987984657288, "learning_rate": 9.98077996182722e-06, "loss": 0.5371, "step": 55 }, { "epoch": 0.3835616438356164, "grad_norm": 0.6998324394226074, "learning_rate": 9.977129353672951e-06, "loss": 0.5708, "step": 56 }, { "epoch": 0.3904109589041096, "grad_norm": 0.46906980872154236, "learning_rate": 9.973162310944769e-06, "loss": 0.5777, "step": 57 }, { "epoch": 0.3972602739726027, "grad_norm": 0.595076858997345, "learning_rate": 9.968879085858234e-06, "loss": 0.5366, "step": 58 }, { "epoch": 0.4041095890410959, "grad_norm": 0.6636199355125427, "learning_rate": 9.964279950731066e-06, "loss": 0.5822, "step": 59 }, { "epoch": 0.410958904109589, "grad_norm": 0.4243064224720001, "learning_rate": 9.959365197965824e-06, "loss": 0.5814, "step": 60 }, { "epoch": 0.4178082191780822, "grad_norm": 0.49584805965423584, "learning_rate": 9.954135140031322e-06, "loss": 0.5594, "step": 61 }, { "epoch": 0.4246575342465753, "grad_norm": 0.6090409755706787, "learning_rate": 9.948590109442755e-06, "loss": 0.5516, "step": 62 }, { "epoch": 0.4315068493150685, "grad_norm": 0.5092521905899048, "learning_rate": 9.942730458740568e-06, "loss": 0.5335, "step": 63 }, { "epoch": 0.4383561643835616, "grad_norm": 0.4809435307979584, "learning_rate": 9.936556560468037e-06, "loss": 0.5421, "step": 64 }, { "epoch": 0.4452054794520548, "grad_norm": 0.545276939868927, "learning_rate": 9.930068807147585e-06, "loss": 0.5375, "step": 65 }, { "epoch": 0.4520547945205479, "grad_norm": 0.4461447596549988, "learning_rate": 9.923267611255824e-06, "loss": 0.5605, "step": 66 }, { "epoch": 0.4589041095890411, "grad_norm": 0.4646362364292145, "learning_rate": 9.916153405197333e-06, "loss": 0.5229, "step": 67 }, { "epoch": 0.4657534246575342, "grad_norm": 0.40728962421417236, "learning_rate": 9.908726641277167e-06, "loss": 0.5364, "step": 68 }, { "epoch": 0.4726027397260274, "grad_norm": 0.4487316906452179, "learning_rate": 9.9009877916721e-06, "loss": 0.5351, "step": 69 }, { "epoch": 0.4794520547945205, "grad_norm": 0.4371180832386017, "learning_rate": 9.8929373484006e-06, "loss": 0.543, "step": 70 }, { "epoch": 0.4863013698630137, "grad_norm": 0.40577659010887146, "learning_rate": 9.884575823291561e-06, "loss": 0.5959, "step": 71 }, { "epoch": 0.4931506849315068, "grad_norm": 0.45803892612457275, "learning_rate": 9.875903747951742e-06, "loss": 0.569, "step": 72 }, { "epoch": 0.5, "grad_norm": 0.41057565808296204, "learning_rate": 9.866921673731991e-06, "loss": 0.5376, "step": 73 }, { "epoch": 0.5068493150684932, "grad_norm": 0.40417274832725525, "learning_rate": 9.857630171692175e-06, "loss": 0.5356, "step": 74 }, { "epoch": 0.5136986301369864, "grad_norm": 0.4480303227901459, "learning_rate": 9.848029832564875e-06, "loss": 0.5151, "step": 75 }, { "epoch": 0.5205479452054794, "grad_norm": 0.4397015869617462, "learning_rate": 9.83812126671784e-06, "loss": 0.5274, "step": 76 }, { "epoch": 0.5273972602739726, "grad_norm": 0.4616395831108093, "learning_rate": 9.827905104115167e-06, "loss": 0.5365, "step": 77 }, { "epoch": 0.5342465753424658, "grad_norm": 0.40124353766441345, "learning_rate": 9.81738199427726e-06, "loss": 0.5463, "step": 78 }, { "epoch": 0.541095890410959, "grad_norm": 0.4343346059322357, "learning_rate": 9.80655260623953e-06, "loss": 0.5346, "step": 79 }, { "epoch": 0.547945205479452, "grad_norm": 0.44946178793907166, "learning_rate": 9.795417628509857e-06, "loss": 0.5553, "step": 80 }, { "epoch": 0.5547945205479452, "grad_norm": 0.416344553232193, "learning_rate": 9.783977769024821e-06, "loss": 0.5467, "step": 81 }, { "epoch": 0.5616438356164384, "grad_norm": 0.41620340943336487, "learning_rate": 9.772233755104695e-06, "loss": 0.5365, "step": 82 }, { "epoch": 0.5684931506849316, "grad_norm": 0.43192237615585327, "learning_rate": 9.76018633340719e-06, "loss": 0.5366, "step": 83 }, { "epoch": 0.5753424657534246, "grad_norm": 0.4866390526294708, "learning_rate": 9.747836269880005e-06, "loss": 0.5549, "step": 84 }, { "epoch": 0.5821917808219178, "grad_norm": 0.5592001676559448, "learning_rate": 9.73518434971211e-06, "loss": 0.5605, "step": 85 }, { "epoch": 0.589041095890411, "grad_norm": 0.4245613217353821, "learning_rate": 9.722231377283841e-06, "loss": 0.5304, "step": 86 }, { "epoch": 0.5958904109589042, "grad_norm": 0.434048056602478, "learning_rate": 9.70897817611575e-06, "loss": 0.5392, "step": 87 }, { "epoch": 0.6027397260273972, "grad_norm": 0.36968135833740234, "learning_rate": 9.695425588816248e-06, "loss": 0.536, "step": 88 }, { "epoch": 0.6095890410958904, "grad_norm": 0.4373226463794708, "learning_rate": 9.681574477028039e-06, "loss": 0.531, "step": 89 }, { "epoch": 0.6164383561643836, "grad_norm": 0.3857197165489197, "learning_rate": 9.667425721373333e-06, "loss": 0.5239, "step": 90 }, { "epoch": 0.6232876712328768, "grad_norm": 0.4224616587162018, "learning_rate": 9.65298022139786e-06, "loss": 0.5462, "step": 91 }, { "epoch": 0.6301369863013698, "grad_norm": 0.4477905035018921, "learning_rate": 9.638238895513687e-06, "loss": 0.496, "step": 92 }, { "epoch": 0.636986301369863, "grad_norm": 0.38324639201164246, "learning_rate": 9.623202680940811e-06, "loss": 0.5521, "step": 93 }, { "epoch": 0.6438356164383562, "grad_norm": 0.5522112250328064, "learning_rate": 9.607872533647584e-06, "loss": 0.5576, "step": 94 }, { "epoch": 0.6506849315068494, "grad_norm": 0.45175299048423767, "learning_rate": 9.592249428289935e-06, "loss": 0.5461, "step": 95 }, { "epoch": 0.6575342465753424, "grad_norm": 0.5615484714508057, "learning_rate": 9.5763343581494e-06, "loss": 0.5065, "step": 96 }, { "epoch": 0.6643835616438356, "grad_norm": 0.46789291501045227, "learning_rate": 9.560128335069971e-06, "loss": 0.5649, "step": 97 }, { "epoch": 0.6712328767123288, "grad_norm": 0.45602020621299744, "learning_rate": 9.543632389393767e-06, "loss": 0.5235, "step": 98 }, { "epoch": 0.678082191780822, "grad_norm": 0.5114532113075256, "learning_rate": 9.526847569895529e-06, "loss": 0.5454, "step": 99 }, { "epoch": 0.684931506849315, "grad_norm": 0.39697399735450745, "learning_rate": 9.50977494371594e-06, "loss": 0.5445, "step": 100 }, { "epoch": 0.6917808219178082, "grad_norm": 0.47998112440109253, "learning_rate": 9.49241559629377e-06, "loss": 0.5241, "step": 101 }, { "epoch": 0.6986301369863014, "grad_norm": 0.4278751313686371, "learning_rate": 9.474770631296882e-06, "loss": 0.512, "step": 102 }, { "epoch": 0.7054794520547946, "grad_norm": 0.5100277662277222, "learning_rate": 9.456841170552054e-06, "loss": 0.5387, "step": 103 }, { "epoch": 0.7123287671232876, "grad_norm": 0.41954633593559265, "learning_rate": 9.438628353973654e-06, "loss": 0.5454, "step": 104 }, { "epoch": 0.7191780821917808, "grad_norm": 0.42173242568969727, "learning_rate": 9.420133339491171e-06, "loss": 0.4957, "step": 105 }, { "epoch": 0.726027397260274, "grad_norm": 0.5260700583457947, "learning_rate": 9.4013573029756e-06, "loss": 0.5745, "step": 106 }, { "epoch": 0.7328767123287672, "grad_norm": 0.3974344730377197, "learning_rate": 9.382301438164673e-06, "loss": 0.5289, "step": 107 }, { "epoch": 0.7397260273972602, "grad_norm": 0.4914883077144623, "learning_rate": 9.36296695658697e-06, "loss": 0.5337, "step": 108 }, { "epoch": 0.7465753424657534, "grad_norm": 0.454052597284317, "learning_rate": 9.343355087484893e-06, "loss": 0.5377, "step": 109 }, { "epoch": 0.7534246575342466, "grad_norm": 0.42050087451934814, "learning_rate": 9.323467077736513e-06, "loss": 0.5216, "step": 110 }, { "epoch": 0.7602739726027398, "grad_norm": 0.506266713142395, "learning_rate": 9.303304191776291e-06, "loss": 0.502, "step": 111 }, { "epoch": 0.7671232876712328, "grad_norm": 0.44245612621307373, "learning_rate": 9.282867711514703e-06, "loss": 0.548, "step": 112 }, { "epoch": 0.773972602739726, "grad_norm": 0.4844461679458618, "learning_rate": 9.262158936256717e-06, "loss": 0.5564, "step": 113 }, { "epoch": 0.7808219178082192, "grad_norm": 0.44854727387428284, "learning_rate": 9.241179182619207e-06, "loss": 0.495, "step": 114 }, { "epoch": 0.7876712328767124, "grad_norm": 0.44603100419044495, "learning_rate": 9.219929784447232e-06, "loss": 0.5371, "step": 115 }, { "epoch": 0.7945205479452054, "grad_norm": 0.4621168076992035, "learning_rate": 9.19841209272924e-06, "loss": 0.5636, "step": 116 }, { "epoch": 0.8013698630136986, "grad_norm": 0.36872074007987976, "learning_rate": 9.176627475511171e-06, "loss": 0.5362, "step": 117 }, { "epoch": 0.8082191780821918, "grad_norm": 0.442421019077301, "learning_rate": 9.154577317809483e-06, "loss": 0.5426, "step": 118 }, { "epoch": 0.815068493150685, "grad_norm": 0.47269773483276367, "learning_rate": 9.132263021523096e-06, "loss": 0.5525, "step": 119 }, { "epoch": 0.821917808219178, "grad_norm": 0.39111027121543884, "learning_rate": 9.109686005344258e-06, "loss": 0.5411, "step": 120 }, { "epoch": 0.8287671232876712, "grad_norm": 0.4781222641468048, "learning_rate": 9.086847704668352e-06, "loss": 0.5368, "step": 121 }, { "epoch": 0.8356164383561644, "grad_norm": 0.49406808614730835, "learning_rate": 9.063749571502633e-06, "loss": 0.5375, "step": 122 }, { "epoch": 0.8424657534246576, "grad_norm": 0.4259958565235138, "learning_rate": 9.040393074373921e-06, "loss": 0.5504, "step": 123 }, { "epoch": 0.8493150684931506, "grad_norm": 0.41925352811813354, "learning_rate": 9.016779698235227e-06, "loss": 0.5184, "step": 124 }, { "epoch": 0.8561643835616438, "grad_norm": 0.5044152736663818, "learning_rate": 8.992910944371343e-06, "loss": 0.5405, "step": 125 }, { "epoch": 0.863013698630137, "grad_norm": 0.4105503261089325, "learning_rate": 8.9687883303034e-06, "loss": 0.5234, "step": 126 }, { "epoch": 0.8698630136986302, "grad_norm": 0.46981051564216614, "learning_rate": 8.94441338969238e-06, "loss": 0.5467, "step": 127 }, { "epoch": 0.8767123287671232, "grad_norm": 0.4437629282474518, "learning_rate": 8.919787672241619e-06, "loss": 0.5259, "step": 128 }, { "epoch": 0.8835616438356164, "grad_norm": 0.41010263562202454, "learning_rate": 8.894912743598269e-06, "loss": 0.557, "step": 129 }, { "epoch": 0.8904109589041096, "grad_norm": 0.4265572130680084, "learning_rate": 8.869790185253766e-06, "loss": 0.4936, "step": 130 }, { "epoch": 0.8972602739726028, "grad_norm": 0.516581118106842, "learning_rate": 8.84442159444328e-06, "loss": 0.5336, "step": 131 }, { "epoch": 0.9041095890410958, "grad_norm": 0.4106625020503998, "learning_rate": 8.818808584044163e-06, "loss": 0.5374, "step": 132 }, { "epoch": 0.910958904109589, "grad_norm": 0.397277295589447, "learning_rate": 8.792952782473415e-06, "loss": 0.5485, "step": 133 }, { "epoch": 0.9178082191780822, "grad_norm": 0.40992820262908936, "learning_rate": 8.76685583358414e-06, "loss": 0.5257, "step": 134 }, { "epoch": 0.9246575342465754, "grad_norm": 0.4277738332748413, "learning_rate": 8.740519396561045e-06, "loss": 0.5102, "step": 135 }, { "epoch": 0.9315068493150684, "grad_norm": 0.40470704436302185, "learning_rate": 8.713945145814948e-06, "loss": 0.5169, "step": 136 }, { "epoch": 0.9383561643835616, "grad_norm": 0.4239768385887146, "learning_rate": 8.68713477087632e-06, "loss": 0.5474, "step": 137 }, { "epoch": 0.9452054794520548, "grad_norm": 0.4196901023387909, "learning_rate": 8.660089976287875e-06, "loss": 0.511, "step": 138 }, { "epoch": 0.952054794520548, "grad_norm": 0.48978888988494873, "learning_rate": 8.632812481496195e-06, "loss": 0.5108, "step": 139 }, { "epoch": 0.958904109589041, "grad_norm": 0.4350135922431946, "learning_rate": 8.60530402074241e-06, "loss": 0.5004, "step": 140 }, { "epoch": 0.9657534246575342, "grad_norm": 0.4894917607307434, "learning_rate": 8.577566342951944e-06, "loss": 0.5342, "step": 141 }, { "epoch": 0.9726027397260274, "grad_norm": 0.4722926616668701, "learning_rate": 8.549601211623316e-06, "loss": 0.5374, "step": 142 }, { "epoch": 0.9794520547945206, "grad_norm": 0.40020692348480225, "learning_rate": 8.521410404716029e-06, "loss": 0.5241, "step": 143 }, { "epoch": 0.9863013698630136, "grad_norm": 0.4775395691394806, "learning_rate": 8.492995714537519e-06, "loss": 0.5149, "step": 144 }, { "epoch": 0.9931506849315068, "grad_norm": 0.4094436764717102, "learning_rate": 8.46435894762922e-06, "loss": 0.5025, "step": 145 }, { "epoch": 1.0, "grad_norm": 0.3945600688457489, "learning_rate": 8.43550192465169e-06, "loss": 0.5152, "step": 146 }, { "epoch": 1.0068493150684932, "grad_norm": 0.48859384655952454, "learning_rate": 8.406426480268881e-06, "loss": 0.4662, "step": 147 }, { "epoch": 1.0136986301369864, "grad_norm": 0.42010384798049927, "learning_rate": 8.377134463031468e-06, "loss": 0.4996, "step": 148 }, { "epoch": 1.0205479452054795, "grad_norm": 0.6074568033218384, "learning_rate": 8.347627735259344e-06, "loss": 0.4944, "step": 149 }, { "epoch": 1.0273972602739727, "grad_norm": 0.40178409218788147, "learning_rate": 8.317908172923207e-06, "loss": 0.5116, "step": 150 }, { "epoch": 1.0342465753424657, "grad_norm": 0.4420071244239807, "learning_rate": 8.287977665525292e-06, "loss": 0.4948, "step": 151 }, { "epoch": 1.0410958904109588, "grad_norm": 0.4358409643173218, "learning_rate": 8.257838115979244e-06, "loss": 0.4929, "step": 152 }, { "epoch": 1.047945205479452, "grad_norm": 0.4392963945865631, "learning_rate": 8.227491440489134e-06, "loss": 0.4923, "step": 153 }, { "epoch": 1.0547945205479452, "grad_norm": 0.44299256801605225, "learning_rate": 8.196939568427624e-06, "loss": 0.4921, "step": 154 }, { "epoch": 1.0616438356164384, "grad_norm": 0.41032472252845764, "learning_rate": 8.166184442213314e-06, "loss": 0.4864, "step": 155 }, { "epoch": 1.0684931506849316, "grad_norm": 0.4872773289680481, "learning_rate": 8.135228017187238e-06, "loss": 0.4849, "step": 156 }, { "epoch": 1.0753424657534247, "grad_norm": 0.4646698236465454, "learning_rate": 8.10407226148855e-06, "loss": 0.4813, "step": 157 }, { "epoch": 1.0821917808219177, "grad_norm": 0.42592811584472656, "learning_rate": 8.0727191559294e-06, "loss": 0.4919, "step": 158 }, { "epoch": 1.0890410958904109, "grad_norm": 0.39574742317199707, "learning_rate": 8.041170693868985e-06, "loss": 0.4844, "step": 159 }, { "epoch": 1.095890410958904, "grad_norm": 0.4392130970954895, "learning_rate": 8.009428881086836e-06, "loss": 0.4963, "step": 160 }, { "epoch": 1.1027397260273972, "grad_norm": 0.47602227330207825, "learning_rate": 7.977495735655271e-06, "loss": 0.5001, "step": 161 }, { "epoch": 1.1095890410958904, "grad_norm": 0.39845725893974304, "learning_rate": 7.945373287811116e-06, "loss": 0.4943, "step": 162 }, { "epoch": 1.1164383561643836, "grad_norm": 0.41520991921424866, "learning_rate": 7.913063579826601e-06, "loss": 0.5106, "step": 163 }, { "epoch": 1.1232876712328768, "grad_norm": 0.3922985792160034, "learning_rate": 7.880568665879542e-06, "loss": 0.4906, "step": 164 }, { "epoch": 1.13013698630137, "grad_norm": 0.46458759903907776, "learning_rate": 7.847890611922721e-06, "loss": 0.4927, "step": 165 }, { "epoch": 1.1369863013698631, "grad_norm": 0.37780556082725525, "learning_rate": 7.81503149555255e-06, "loss": 0.4639, "step": 166 }, { "epoch": 1.143835616438356, "grad_norm": 0.3818896412849426, "learning_rate": 7.781993405876974e-06, "loss": 0.4851, "step": 167 }, { "epoch": 1.1506849315068493, "grad_norm": 0.5063320398330688, "learning_rate": 7.748778443382658e-06, "loss": 0.4807, "step": 168 }, { "epoch": 1.1575342465753424, "grad_norm": 0.4002656936645508, "learning_rate": 7.715388719801437e-06, "loss": 0.4678, "step": 169 }, { "epoch": 1.1643835616438356, "grad_norm": 0.41656002402305603, "learning_rate": 7.68182635797606e-06, "loss": 0.4762, "step": 170 }, { "epoch": 1.1712328767123288, "grad_norm": 0.42518362402915955, "learning_rate": 7.648093491725224e-06, "loss": 0.5075, "step": 171 }, { "epoch": 1.178082191780822, "grad_norm": 0.3999499976634979, "learning_rate": 7.6141922657079045e-06, "loss": 0.4519, "step": 172 }, { "epoch": 1.1849315068493151, "grad_norm": 0.40690121054649353, "learning_rate": 7.580124835287013e-06, "loss": 0.4662, "step": 173 }, { "epoch": 1.191780821917808, "grad_norm": 0.4337243139743805, "learning_rate": 7.545893366392358e-06, "loss": 0.4894, "step": 174 }, { "epoch": 1.1986301369863013, "grad_norm": 0.40752294659614563, "learning_rate": 7.511500035382943e-06, "loss": 0.4788, "step": 175 }, { "epoch": 1.2054794520547945, "grad_norm": 0.4354107975959778, "learning_rate": 7.476947028908595e-06, "loss": 0.5209, "step": 176 }, { "epoch": 1.2123287671232876, "grad_norm": 0.37461692094802856, "learning_rate": 7.442236543770945e-06, "loss": 0.4733, "step": 177 }, { "epoch": 1.2191780821917808, "grad_norm": 0.3634370267391205, "learning_rate": 7.407370786783757e-06, "loss": 0.4707, "step": 178 }, { "epoch": 1.226027397260274, "grad_norm": 0.41078001260757446, "learning_rate": 7.372351974632634e-06, "loss": 0.4897, "step": 179 }, { "epoch": 1.2328767123287672, "grad_norm": 0.3920040428638458, "learning_rate": 7.33718233373407e-06, "loss": 0.4995, "step": 180 }, { "epoch": 1.2397260273972603, "grad_norm": 0.400307297706604, "learning_rate": 7.3018641000939115e-06, "loss": 0.5126, "step": 181 }, { "epoch": 1.2465753424657535, "grad_norm": 0.390546590089798, "learning_rate": 7.266399519165193e-06, "loss": 0.4777, "step": 182 }, { "epoch": 1.2534246575342465, "grad_norm": 0.3763103187084198, "learning_rate": 7.2307908457053786e-06, "loss": 0.4714, "step": 183 }, { "epoch": 1.2602739726027397, "grad_norm": 0.4261626899242401, "learning_rate": 7.195040343633006e-06, "loss": 0.4964, "step": 184 }, { "epoch": 1.2671232876712328, "grad_norm": 0.38287729024887085, "learning_rate": 7.159150285883757e-06, "loss": 0.5032, "step": 185 }, { "epoch": 1.273972602739726, "grad_norm": 0.34323829412460327, "learning_rate": 7.123122954265942e-06, "loss": 0.4533, "step": 186 }, { "epoch": 1.2808219178082192, "grad_norm": 0.3638916313648224, "learning_rate": 7.086960639315437e-06, "loss": 0.4923, "step": 187 }, { "epoch": 1.2876712328767124, "grad_norm": 0.41420572996139526, "learning_rate": 7.050665640150045e-06, "loss": 0.4993, "step": 188 }, { "epoch": 1.2945205479452055, "grad_norm": 0.3671460449695587, "learning_rate": 7.0142402643233346e-06, "loss": 0.4941, "step": 189 }, { "epoch": 1.3013698630136985, "grad_norm": 0.4007507860660553, "learning_rate": 6.977686827677926e-06, "loss": 0.5012, "step": 190 }, { "epoch": 1.308219178082192, "grad_norm": 0.4122563302516937, "learning_rate": 6.941007654198254e-06, "loss": 0.4887, "step": 191 }, { "epoch": 1.3150684931506849, "grad_norm": 0.4061591327190399, "learning_rate": 6.904205075862816e-06, "loss": 0.455, "step": 192 }, { "epoch": 1.321917808219178, "grad_norm": 0.38111311197280884, "learning_rate": 6.867281432495911e-06, "loss": 0.4815, "step": 193 }, { "epoch": 1.3287671232876712, "grad_norm": 0.41904938220977783, "learning_rate": 6.830239071618874e-06, "loss": 0.473, "step": 194 }, { "epoch": 1.3356164383561644, "grad_norm": 0.35697585344314575, "learning_rate": 6.793080348300834e-06, "loss": 0.5002, "step": 195 }, { "epoch": 1.3424657534246576, "grad_norm": 0.3706590533256531, "learning_rate": 6.755807625008974e-06, "loss": 0.4928, "step": 196 }, { "epoch": 1.3493150684931507, "grad_norm": 0.37768539786338806, "learning_rate": 6.718423271458343e-06, "loss": 0.4932, "step": 197 }, { "epoch": 1.356164383561644, "grad_norm": 0.3710521161556244, "learning_rate": 6.680929664461184e-06, "loss": 0.4774, "step": 198 }, { "epoch": 1.3630136986301369, "grad_norm": 0.36909735202789307, "learning_rate": 6.643329187775827e-06, "loss": 0.45, "step": 199 }, { "epoch": 1.36986301369863, "grad_norm": 0.3874911069869995, "learning_rate": 6.6056242319551315e-06, "loss": 0.4698, "step": 200 }, { "epoch": 1.3767123287671232, "grad_norm": 0.39832133054733276, "learning_rate": 6.567817194194508e-06, "loss": 0.4626, "step": 201 }, { "epoch": 1.3835616438356164, "grad_norm": 0.4187299311161041, "learning_rate": 6.529910478179499e-06, "loss": 0.4854, "step": 202 }, { "epoch": 1.3904109589041096, "grad_norm": 0.3984171450138092, "learning_rate": 6.491906493932968e-06, "loss": 0.4633, "step": 203 }, { "epoch": 1.3972602739726028, "grad_norm": 0.4085666835308075, "learning_rate": 6.45380765766187e-06, "loss": 0.477, "step": 204 }, { "epoch": 1.404109589041096, "grad_norm": 0.4559183716773987, "learning_rate": 6.415616391603639e-06, "loss": 0.478, "step": 205 }, { "epoch": 1.410958904109589, "grad_norm": 0.4116540551185608, "learning_rate": 6.377335123872177e-06, "loss": 0.4867, "step": 206 }, { "epoch": 1.4178082191780823, "grad_norm": 0.34374645352363586, "learning_rate": 6.338966288303499e-06, "loss": 0.4915, "step": 207 }, { "epoch": 1.4246575342465753, "grad_norm": 0.45456692576408386, "learning_rate": 6.300512324300975e-06, "loss": 0.4754, "step": 208 }, { "epoch": 1.4315068493150684, "grad_norm": 0.3950651288032532, "learning_rate": 6.261975676680252e-06, "loss": 0.4742, "step": 209 }, { "epoch": 1.4383561643835616, "grad_norm": 0.3701169192790985, "learning_rate": 6.223358795513812e-06, "loss": 0.4917, "step": 210 }, { "epoch": 1.4452054794520548, "grad_norm": 0.45799094438552856, "learning_rate": 6.184664135975202e-06, "loss": 0.4665, "step": 211 }, { "epoch": 1.452054794520548, "grad_norm": 0.40342605113983154, "learning_rate": 6.145894158182945e-06, "loss": 0.4827, "step": 212 }, { "epoch": 1.4589041095890412, "grad_norm": 0.3885434567928314, "learning_rate": 6.107051327044124e-06, "loss": 0.4705, "step": 213 }, { "epoch": 1.4657534246575343, "grad_norm": 0.41012731194496155, "learning_rate": 6.0681381120976745e-06, "loss": 0.4676, "step": 214 }, { "epoch": 1.4726027397260273, "grad_norm": 0.40364816784858704, "learning_rate": 6.029156987357373e-06, "loss": 0.4675, "step": 215 }, { "epoch": 1.4794520547945205, "grad_norm": 0.40886637568473816, "learning_rate": 5.990110431154549e-06, "loss": 0.4718, "step": 216 }, { "epoch": 1.4863013698630136, "grad_norm": 0.35799840092658997, "learning_rate": 5.951000925980509e-06, "loss": 0.5037, "step": 217 }, { "epoch": 1.4931506849315068, "grad_norm": 0.39343416690826416, "learning_rate": 5.9118309583287205e-06, "loss": 0.4883, "step": 218 }, { "epoch": 1.5, "grad_norm": 0.4305885434150696, "learning_rate": 5.872603018536713e-06, "loss": 0.5251, "step": 219 }, { "epoch": 1.5068493150684932, "grad_norm": 0.3737599551677704, "learning_rate": 5.8333196006277536e-06, "loss": 0.4737, "step": 220 }, { "epoch": 1.5136986301369864, "grad_norm": 0.3764914572238922, "learning_rate": 5.793983202152283e-06, "loss": 0.508, "step": 221 }, { "epoch": 1.5205479452054793, "grad_norm": 0.416117787361145, "learning_rate": 5.754596324029125e-06, "loss": 0.4693, "step": 222 }, { "epoch": 1.5273972602739727, "grad_norm": 0.3617672026157379, "learning_rate": 5.715161470386485e-06, "loss": 0.4683, "step": 223 }, { "epoch": 1.5342465753424657, "grad_norm": 0.3951134979724884, "learning_rate": 5.675681148402743e-06, "loss": 0.4884, "step": 224 }, { "epoch": 1.541095890410959, "grad_norm": 0.3958381414413452, "learning_rate": 5.636157868147054e-06, "loss": 0.4914, "step": 225 }, { "epoch": 1.547945205479452, "grad_norm": 0.38841086626052856, "learning_rate": 5.596594142419759e-06, "loss": 0.4879, "step": 226 }, { "epoch": 1.5547945205479452, "grad_norm": 0.3935975730419159, "learning_rate": 5.556992486592634e-06, "loss": 0.4929, "step": 227 }, { "epoch": 1.5616438356164384, "grad_norm": 0.4356209337711334, "learning_rate": 5.517355418448961e-06, "loss": 0.485, "step": 228 }, { "epoch": 1.5684931506849316, "grad_norm": 0.36613333225250244, "learning_rate": 5.47768545802346e-06, "loss": 0.4623, "step": 229 }, { "epoch": 1.5753424657534247, "grad_norm": 0.37810492515563965, "learning_rate": 5.437985127442065e-06, "loss": 0.478, "step": 230 }, { "epoch": 1.5821917808219177, "grad_norm": 0.41265738010406494, "learning_rate": 5.398256950761578e-06, "loss": 0.508, "step": 231 }, { "epoch": 1.589041095890411, "grad_norm": 0.3820304870605469, "learning_rate": 5.3585034538091885e-06, "loss": 0.468, "step": 232 }, { "epoch": 1.595890410958904, "grad_norm": 0.36171847581863403, "learning_rate": 5.318727164021896e-06, "loss": 0.474, "step": 233 }, { "epoch": 1.6027397260273972, "grad_norm": 0.4011186361312866, "learning_rate": 5.278930610285813e-06, "loss": 0.4931, "step": 234 }, { "epoch": 1.6095890410958904, "grad_norm": 0.3681487441062927, "learning_rate": 5.239116322775392e-06, "loss": 0.464, "step": 235 }, { "epoch": 1.6164383561643836, "grad_norm": 0.3996686339378357, "learning_rate": 5.199286832792553e-06, "loss": 0.5076, "step": 236 }, { "epoch": 1.6232876712328768, "grad_norm": 0.40732523798942566, "learning_rate": 5.159444672605759e-06, "loss": 0.4675, "step": 237 }, { "epoch": 1.6301369863013697, "grad_norm": 0.4019952714443207, "learning_rate": 5.119592375289015e-06, "loss": 0.4653, "step": 238 }, { "epoch": 1.6369863013698631, "grad_norm": 0.4392927587032318, "learning_rate": 5.079732474560821e-06, "loss": 0.4886, "step": 239 }, { "epoch": 1.643835616438356, "grad_norm": 0.42995181679725647, "learning_rate": 5.039867504623084e-06, "loss": 0.5046, "step": 240 }, { "epoch": 1.6506849315068495, "grad_norm": 0.36546292901039124, "learning_rate": 5e-06, "loss": 0.4801, "step": 241 }, { "epoch": 1.6575342465753424, "grad_norm": 0.4144984781742096, "learning_rate": 4.960132495376919e-06, "loss": 0.4831, "step": 242 }, { "epoch": 1.6643835616438356, "grad_norm": 0.38707658648490906, "learning_rate": 4.92026752543918e-06, "loss": 0.4937, "step": 243 }, { "epoch": 1.6712328767123288, "grad_norm": 0.37487655878067017, "learning_rate": 4.880407624710986e-06, "loss": 0.4648, "step": 244 }, { "epoch": 1.678082191780822, "grad_norm": 0.38767164945602417, "learning_rate": 4.8405553273942415e-06, "loss": 0.4995, "step": 245 }, { "epoch": 1.6849315068493151, "grad_norm": 0.401931494474411, "learning_rate": 4.800713167207449e-06, "loss": 0.4724, "step": 246 }, { "epoch": 1.691780821917808, "grad_norm": 0.35886698961257935, "learning_rate": 4.760883677224609e-06, "loss": 0.4681, "step": 247 }, { "epoch": 1.6986301369863015, "grad_norm": 0.3592085540294647, "learning_rate": 4.721069389714188e-06, "loss": 0.485, "step": 248 }, { "epoch": 1.7054794520547945, "grad_norm": 0.3601101040840149, "learning_rate": 4.6812728359781064e-06, "loss": 0.4881, "step": 249 }, { "epoch": 1.7123287671232876, "grad_norm": 0.37123242020606995, "learning_rate": 4.641496546190813e-06, "loss": 0.4868, "step": 250 }, { "epoch": 1.7191780821917808, "grad_norm": 0.3683170974254608, "learning_rate": 4.601743049238425e-06, "loss": 0.4827, "step": 251 }, { "epoch": 1.726027397260274, "grad_norm": 0.3582712411880493, "learning_rate": 4.562014872557936e-06, "loss": 0.4646, "step": 252 }, { "epoch": 1.7328767123287672, "grad_norm": 0.369663268327713, "learning_rate": 4.522314541976541e-06, "loss": 0.4802, "step": 253 }, { "epoch": 1.7397260273972601, "grad_norm": 0.3558022677898407, "learning_rate": 4.48264458155104e-06, "loss": 0.4901, "step": 254 }, { "epoch": 1.7465753424657535, "grad_norm": 0.3494178354740143, "learning_rate": 4.443007513407368e-06, "loss": 0.4881, "step": 255 }, { "epoch": 1.7534246575342465, "grad_norm": 0.3548129200935364, "learning_rate": 4.403405857580243e-06, "loss": 0.5079, "step": 256 }, { "epoch": 1.7602739726027399, "grad_norm": 0.3643389940261841, "learning_rate": 4.363842131852948e-06, "loss": 0.4889, "step": 257 }, { "epoch": 1.7671232876712328, "grad_norm": 0.3427479863166809, "learning_rate": 4.3243188515972575e-06, "loss": 0.4791, "step": 258 }, { "epoch": 1.773972602739726, "grad_norm": 0.38511818647384644, "learning_rate": 4.2848385296135165e-06, "loss": 0.473, "step": 259 }, { "epoch": 1.7808219178082192, "grad_norm": 0.37743064761161804, "learning_rate": 4.245403675970877e-06, "loss": 0.5084, "step": 260 }, { "epoch": 1.7876712328767124, "grad_norm": 0.33601808547973633, "learning_rate": 4.206016797847718e-06, "loss": 0.4822, "step": 261 }, { "epoch": 1.7945205479452055, "grad_norm": 0.3497495949268341, "learning_rate": 4.166680399372248e-06, "loss": 0.4936, "step": 262 }, { "epoch": 1.8013698630136985, "grad_norm": 0.32923176884651184, "learning_rate": 4.127396981463289e-06, "loss": 0.4987, "step": 263 }, { "epoch": 1.808219178082192, "grad_norm": 0.37325793504714966, "learning_rate": 4.08816904167128e-06, "loss": 0.4708, "step": 264 }, { "epoch": 1.8150684931506849, "grad_norm": 0.3285587728023529, "learning_rate": 4.048999074019493e-06, "loss": 0.4612, "step": 265 }, { "epoch": 1.821917808219178, "grad_norm": 0.34880074858665466, "learning_rate": 4.009889568845453e-06, "loss": 0.4805, "step": 266 }, { "epoch": 1.8287671232876712, "grad_norm": 0.366190105676651, "learning_rate": 3.9708430126426286e-06, "loss": 0.478, "step": 267 }, { "epoch": 1.8356164383561644, "grad_norm": 0.37551647424697876, "learning_rate": 3.9318618879023255e-06, "loss": 0.4897, "step": 268 }, { "epoch": 1.8424657534246576, "grad_norm": 0.3399874269962311, "learning_rate": 3.8929486729558775e-06, "loss": 0.4558, "step": 269 }, { "epoch": 1.8493150684931505, "grad_norm": 0.37752845883369446, "learning_rate": 3.854105841817056e-06, "loss": 0.476, "step": 270 }, { "epoch": 1.856164383561644, "grad_norm": 0.38380375504493713, "learning_rate": 3.8153358640248e-06, "loss": 0.4845, "step": 271 }, { "epoch": 1.8630136986301369, "grad_norm": 0.35891273617744446, "learning_rate": 3.776641204486191e-06, "loss": 0.4911, "step": 272 }, { "epoch": 1.8698630136986303, "grad_norm": 0.34820806980133057, "learning_rate": 3.738024323319749e-06, "loss": 0.5001, "step": 273 }, { "epoch": 1.8767123287671232, "grad_norm": 0.381046861410141, "learning_rate": 3.699487675699027e-06, "loss": 0.4655, "step": 274 }, { "epoch": 1.8835616438356164, "grad_norm": 0.3240637183189392, "learning_rate": 3.661033711696501e-06, "loss": 0.4779, "step": 275 }, { "epoch": 1.8904109589041096, "grad_norm": 0.32344701886177063, "learning_rate": 3.6226648761278238e-06, "loss": 0.4519, "step": 276 }, { "epoch": 1.8972602739726028, "grad_norm": 0.3502044677734375, "learning_rate": 3.5843836083963625e-06, "loss": 0.4843, "step": 277 }, { "epoch": 1.904109589041096, "grad_norm": 0.31665265560150146, "learning_rate": 3.5461923423381313e-06, "loss": 0.4679, "step": 278 }, { "epoch": 1.910958904109589, "grad_norm": 0.35308602452278137, "learning_rate": 3.5080935060670345e-06, "loss": 0.4798, "step": 279 }, { "epoch": 1.9178082191780823, "grad_norm": 0.32667192816734314, "learning_rate": 3.4700895218205026e-06, "loss": 0.4814, "step": 280 }, { "epoch": 1.9246575342465753, "grad_norm": 0.34751468896865845, "learning_rate": 3.432182805805495e-06, "loss": 0.4553, "step": 281 }, { "epoch": 1.9315068493150684, "grad_norm": 0.32501402497291565, "learning_rate": 3.3943757680448697e-06, "loss": 0.4725, "step": 282 }, { "epoch": 1.9383561643835616, "grad_norm": 0.3369636535644531, "learning_rate": 3.3566708122241753e-06, "loss": 0.4462, "step": 283 }, { "epoch": 1.9452054794520548, "grad_norm": 0.3620050549507141, "learning_rate": 3.3190703355388165e-06, "loss": 0.4758, "step": 284 }, { "epoch": 1.952054794520548, "grad_norm": 0.33129626512527466, "learning_rate": 3.2815767285416576e-06, "loss": 0.492, "step": 285 }, { "epoch": 1.958904109589041, "grad_norm": 0.3341040015220642, "learning_rate": 3.244192374991027e-06, "loss": 0.4611, "step": 286 }, { "epoch": 1.9657534246575343, "grad_norm": 0.3279862701892853, "learning_rate": 3.2069196516991685e-06, "loss": 0.4802, "step": 287 }, { "epoch": 1.9726027397260273, "grad_norm": 0.3518829643726349, "learning_rate": 3.169760928381127e-06, "loss": 0.4795, "step": 288 }, { "epoch": 1.9794520547945207, "grad_norm": 0.3467582166194916, "learning_rate": 3.1327185675040907e-06, "loss": 0.4993, "step": 289 }, { "epoch": 1.9863013698630136, "grad_norm": 0.34387102723121643, "learning_rate": 3.0957949241371845e-06, "loss": 0.4846, "step": 290 }, { "epoch": 1.9931506849315068, "grad_norm": 0.3198384642601013, "learning_rate": 3.0589923458017467e-06, "loss": 0.4681, "step": 291 }, { "epoch": 2.0, "grad_norm": 0.3447113633155823, "learning_rate": 3.0223131723220756e-06, "loss": 0.4804, "step": 292 }, { "epoch": 2.006849315068493, "grad_norm": 0.3846930265426636, "learning_rate": 2.9857597356766675e-06, "loss": 0.4599, "step": 293 }, { "epoch": 2.0136986301369864, "grad_norm": 0.3663610517978668, "learning_rate": 2.949334359849957e-06, "loss": 0.4611, "step": 294 }, { "epoch": 2.0205479452054793, "grad_norm": 0.34175989031791687, "learning_rate": 2.913039360684565e-06, "loss": 0.4806, "step": 295 }, { "epoch": 2.0273972602739727, "grad_norm": 0.29779112339019775, "learning_rate": 2.876877045734058e-06, "loss": 0.4257, "step": 296 }, { "epoch": 2.0342465753424657, "grad_norm": 0.32330426573753357, "learning_rate": 2.840849714116244e-06, "loss": 0.4458, "step": 297 }, { "epoch": 2.041095890410959, "grad_norm": 0.3471793234348297, "learning_rate": 2.8049596563669936e-06, "loss": 0.4457, "step": 298 }, { "epoch": 2.047945205479452, "grad_norm": 0.36945298314094543, "learning_rate": 2.769209154294623e-06, "loss": 0.4343, "step": 299 }, { "epoch": 2.0547945205479454, "grad_norm": 0.354952335357666, "learning_rate": 2.7336004808348094e-06, "loss": 0.4463, "step": 300 }, { "epoch": 2.0616438356164384, "grad_norm": 0.3860822916030884, "learning_rate": 2.69813589990609e-06, "loss": 0.4459, "step": 301 }, { "epoch": 2.0684931506849313, "grad_norm": 0.3559304475784302, "learning_rate": 2.662817666265932e-06, "loss": 0.4594, "step": 302 }, { "epoch": 2.0753424657534247, "grad_norm": 0.3339487314224243, "learning_rate": 2.6276480253673663e-06, "loss": 0.4612, "step": 303 }, { "epoch": 2.0821917808219177, "grad_norm": 0.404588520526886, "learning_rate": 2.5926292132162432e-06, "loss": 0.4606, "step": 304 }, { "epoch": 2.089041095890411, "grad_norm": 0.3664696216583252, "learning_rate": 2.5577634562290567e-06, "loss": 0.4416, "step": 305 }, { "epoch": 2.095890410958904, "grad_norm": 0.35626354813575745, "learning_rate": 2.5230529710914074e-06, "loss": 0.47, "step": 306 }, { "epoch": 2.1027397260273974, "grad_norm": 0.32117801904678345, "learning_rate": 2.48849996461706e-06, "loss": 0.4475, "step": 307 }, { "epoch": 2.1095890410958904, "grad_norm": 0.34588146209716797, "learning_rate": 2.4541066336076434e-06, "loss": 0.4436, "step": 308 }, { "epoch": 2.1164383561643834, "grad_norm": 0.31559157371520996, "learning_rate": 2.4198751647129896e-06, "loss": 0.458, "step": 309 }, { "epoch": 2.1232876712328768, "grad_norm": 0.3515860140323639, "learning_rate": 2.385807734292097e-06, "loss": 0.4564, "step": 310 }, { "epoch": 2.1301369863013697, "grad_norm": 0.35864606499671936, "learning_rate": 2.3519065082747777e-06, "loss": 0.4636, "step": 311 }, { "epoch": 2.136986301369863, "grad_norm": 0.34572356939315796, "learning_rate": 2.318173642023939e-06, "loss": 0.4431, "step": 312 }, { "epoch": 2.143835616438356, "grad_norm": 0.33984869718551636, "learning_rate": 2.284611280198563e-06, "loss": 0.4658, "step": 313 }, { "epoch": 2.1506849315068495, "grad_norm": 0.3217160105705261, "learning_rate": 2.251221556617344e-06, "loss": 0.4493, "step": 314 }, { "epoch": 2.1575342465753424, "grad_norm": 0.3655526340007782, "learning_rate": 2.218006594123028e-06, "loss": 0.4517, "step": 315 }, { "epoch": 2.1643835616438354, "grad_norm": 0.35365915298461914, "learning_rate": 2.184968504447453e-06, "loss": 0.4542, "step": 316 }, { "epoch": 2.171232876712329, "grad_norm": 0.30709370970726013, "learning_rate": 2.15210938807728e-06, "loss": 0.4265, "step": 317 }, { "epoch": 2.1780821917808217, "grad_norm": 0.31368520855903625, "learning_rate": 2.11943133412046e-06, "loss": 0.4563, "step": 318 }, { "epoch": 2.184931506849315, "grad_norm": 0.3410292863845825, "learning_rate": 2.086936420173399e-06, "loss": 0.4515, "step": 319 }, { "epoch": 2.191780821917808, "grad_norm": 0.3358498215675354, "learning_rate": 2.0546267121888863e-06, "loss": 0.4197, "step": 320 }, { "epoch": 2.1986301369863015, "grad_norm": 0.3185282051563263, "learning_rate": 2.0225042643447283e-06, "loss": 0.4613, "step": 321 }, { "epoch": 2.2054794520547945, "grad_norm": 0.3557981252670288, "learning_rate": 1.990571118913166e-06, "loss": 0.4824, "step": 322 }, { "epoch": 2.212328767123288, "grad_norm": 0.31085366010665894, "learning_rate": 1.9588293061310165e-06, "loss": 0.4413, "step": 323 }, { "epoch": 2.219178082191781, "grad_norm": 0.3326927125453949, "learning_rate": 1.9272808440706024e-06, "loss": 0.453, "step": 324 }, { "epoch": 2.2260273972602738, "grad_norm": 0.32726767659187317, "learning_rate": 1.8959277385114516e-06, "loss": 0.454, "step": 325 }, { "epoch": 2.232876712328767, "grad_norm": 0.3480210602283478, "learning_rate": 1.864771982812763e-06, "loss": 0.4168, "step": 326 }, { "epoch": 2.23972602739726, "grad_norm": 0.311626672744751, "learning_rate": 1.8338155577866873e-06, "loss": 0.4566, "step": 327 }, { "epoch": 2.2465753424657535, "grad_norm": 0.2938528060913086, "learning_rate": 1.8030604315723765e-06, "loss": 0.4372, "step": 328 }, { "epoch": 2.2534246575342465, "grad_norm": 0.2981064021587372, "learning_rate": 1.7725085595108682e-06, "loss": 0.4396, "step": 329 }, { "epoch": 2.26027397260274, "grad_norm": 0.3302954435348511, "learning_rate": 1.7421618840207576e-06, "loss": 0.4634, "step": 330 }, { "epoch": 2.267123287671233, "grad_norm": 0.31649407744407654, "learning_rate": 1.71202233447471e-06, "loss": 0.4425, "step": 331 }, { "epoch": 2.2739726027397262, "grad_norm": 0.30021733045578003, "learning_rate": 1.682091827076796e-06, "loss": 0.4365, "step": 332 }, { "epoch": 2.280821917808219, "grad_norm": 0.30878108739852905, "learning_rate": 1.6523722647406575e-06, "loss": 0.435, "step": 333 }, { "epoch": 2.287671232876712, "grad_norm": 0.33258000016212463, "learning_rate": 1.6228655369685342e-06, "loss": 0.4679, "step": 334 }, { "epoch": 2.2945205479452055, "grad_norm": 0.3215084373950958, "learning_rate": 1.5935735197311204e-06, "loss": 0.4452, "step": 335 }, { "epoch": 2.3013698630136985, "grad_norm": 0.32681819796562195, "learning_rate": 1.5644980753483109e-06, "loss": 0.4886, "step": 336 }, { "epoch": 2.308219178082192, "grad_norm": 0.3088940382003784, "learning_rate": 1.5356410523707828e-06, "loss": 0.4603, "step": 337 }, { "epoch": 2.315068493150685, "grad_norm": 0.3307572901248932, "learning_rate": 1.5070042854624833e-06, "loss": 0.4558, "step": 338 }, { "epoch": 2.3219178082191783, "grad_norm": 0.31423696875572205, "learning_rate": 1.4785895952839735e-06, "loss": 0.4485, "step": 339 }, { "epoch": 2.328767123287671, "grad_norm": 0.32906076312065125, "learning_rate": 1.4503987883766857e-06, "loss": 0.4587, "step": 340 }, { "epoch": 2.3356164383561646, "grad_norm": 0.31784313917160034, "learning_rate": 1.4224336570480574e-06, "loss": 0.457, "step": 341 }, { "epoch": 2.3424657534246576, "grad_norm": 0.30230167508125305, "learning_rate": 1.3946959792575915e-06, "loss": 0.4805, "step": 342 }, { "epoch": 2.3493150684931505, "grad_norm": 0.33398470282554626, "learning_rate": 1.3671875185038064e-06, "loss": 0.4369, "step": 343 }, { "epoch": 2.356164383561644, "grad_norm": 0.35357654094696045, "learning_rate": 1.3399100237121266e-06, "loss": 0.4409, "step": 344 }, { "epoch": 2.363013698630137, "grad_norm": 0.344047486782074, "learning_rate": 1.312865229123681e-06, "loss": 0.4811, "step": 345 }, { "epoch": 2.3698630136986303, "grad_norm": 0.30871498584747314, "learning_rate": 1.2860548541850542e-06, "loss": 0.4625, "step": 346 }, { "epoch": 2.3767123287671232, "grad_norm": 0.3363014757633209, "learning_rate": 1.2594806034389556e-06, "loss": 0.4463, "step": 347 }, { "epoch": 2.383561643835616, "grad_norm": 0.3206148147583008, "learning_rate": 1.233144166415861e-06, "loss": 0.4366, "step": 348 }, { "epoch": 2.3904109589041096, "grad_norm": 0.3465731143951416, "learning_rate": 1.2070472175265857e-06, "loss": 0.4448, "step": 349 }, { "epoch": 2.3972602739726026, "grad_norm": 0.29774489998817444, "learning_rate": 1.1811914159558374e-06, "loss": 0.4441, "step": 350 }, { "epoch": 2.404109589041096, "grad_norm": 0.29331234097480774, "learning_rate": 1.155578405556722e-06, "loss": 0.4299, "step": 351 }, { "epoch": 2.410958904109589, "grad_norm": 0.3341335952281952, "learning_rate": 1.1302098147462348e-06, "loss": 0.4509, "step": 352 }, { "epoch": 2.4178082191780823, "grad_norm": 0.3187620937824249, "learning_rate": 1.1050872564017329e-06, "loss": 0.4656, "step": 353 }, { "epoch": 2.4246575342465753, "grad_norm": 0.33351418375968933, "learning_rate": 1.080212327758382e-06, "loss": 0.437, "step": 354 }, { "epoch": 2.4315068493150687, "grad_norm": 0.311325341463089, "learning_rate": 1.0555866103076212e-06, "loss": 0.4551, "step": 355 }, { "epoch": 2.4383561643835616, "grad_norm": 0.30476653575897217, "learning_rate": 1.0312116696966012e-06, "loss": 0.4347, "step": 356 }, { "epoch": 2.4452054794520546, "grad_norm": 0.3320573568344116, "learning_rate": 1.0070890556286578e-06, "loss": 0.4355, "step": 357 }, { "epoch": 2.452054794520548, "grad_norm": 0.2996487021446228, "learning_rate": 9.832203017647746e-07, "loss": 0.4531, "step": 358 }, { "epoch": 2.458904109589041, "grad_norm": 0.31164291501045227, "learning_rate": 9.596069256260792e-07, "loss": 0.4914, "step": 359 }, { "epoch": 2.4657534246575343, "grad_norm": 0.3011796176433563, "learning_rate": 9.362504284973683e-07, "loss": 0.425, "step": 360 }, { "epoch": 2.4726027397260273, "grad_norm": 0.29375994205474854, "learning_rate": 9.131522953316502e-07, "loss": 0.4695, "step": 361 }, { "epoch": 2.4794520547945207, "grad_norm": 0.30826085805892944, "learning_rate": 8.903139946557437e-07, "loss": 0.457, "step": 362 }, { "epoch": 2.4863013698630136, "grad_norm": 0.32142892479896545, "learning_rate": 8.677369784769041e-07, "loss": 0.4567, "step": 363 }, { "epoch": 2.493150684931507, "grad_norm": 0.2887185215950012, "learning_rate": 8.454226821905171e-07, "loss": 0.4601, "step": 364 }, { "epoch": 2.5, "grad_norm": 0.33354097604751587, "learning_rate": 8.233725244888291e-07, "loss": 0.4549, "step": 365 }, { "epoch": 2.506849315068493, "grad_norm": 0.29360541701316833, "learning_rate": 8.015879072707611e-07, "loss": 0.4755, "step": 366 }, { "epoch": 2.5136986301369864, "grad_norm": 0.30357393622398376, "learning_rate": 7.800702155527695e-07, "loss": 0.437, "step": 367 }, { "epoch": 2.5205479452054793, "grad_norm": 0.2883985638618469, "learning_rate": 7.588208173807943e-07, "loss": 0.4511, "step": 368 }, { "epoch": 2.5273972602739727, "grad_norm": 0.31914854049682617, "learning_rate": 7.378410637432848e-07, "loss": 0.4499, "step": 369 }, { "epoch": 2.5342465753424657, "grad_norm": 0.30720996856689453, "learning_rate": 7.171322884852988e-07, "loss": 0.4815, "step": 370 }, { "epoch": 2.541095890410959, "grad_norm": 0.2876569926738739, "learning_rate": 6.966958082237096e-07, "loss": 0.4205, "step": 371 }, { "epoch": 2.547945205479452, "grad_norm": 0.3016420900821686, "learning_rate": 6.765329222634892e-07, "loss": 0.48, "step": 372 }, { "epoch": 2.5547945205479454, "grad_norm": 0.3060811161994934, "learning_rate": 6.566449125151086e-07, "loss": 0.4609, "step": 373 }, { "epoch": 2.5616438356164384, "grad_norm": 0.2952597141265869, "learning_rate": 6.370330434130317e-07, "loss": 0.4407, "step": 374 }, { "epoch": 2.5684931506849313, "grad_norm": 0.3057716488838196, "learning_rate": 6.176985618353282e-07, "loss": 0.4477, "step": 375 }, { "epoch": 2.5753424657534247, "grad_norm": 0.2792918086051941, "learning_rate": 5.986426970244009e-07, "loss": 0.4489, "step": 376 }, { "epoch": 2.5821917808219177, "grad_norm": 0.3092077672481537, "learning_rate": 5.798666605088293e-07, "loss": 0.4356, "step": 377 }, { "epoch": 2.589041095890411, "grad_norm": 0.27638155221939087, "learning_rate": 5.613716460263485e-07, "loss": 0.4553, "step": 378 }, { "epoch": 2.595890410958904, "grad_norm": 0.29136520624160767, "learning_rate": 5.431588294479479e-07, "loss": 0.4275, "step": 379 }, { "epoch": 2.602739726027397, "grad_norm": 0.2942065894603729, "learning_rate": 5.252293687031196e-07, "loss": 0.4159, "step": 380 }, { "epoch": 2.6095890410958904, "grad_norm": 0.3121497929096222, "learning_rate": 5.075844037062322e-07, "loss": 0.4503, "step": 381 }, { "epoch": 2.616438356164384, "grad_norm": 0.3024226129055023, "learning_rate": 4.902250562840622e-07, "loss": 0.4525, "step": 382 }, { "epoch": 2.6232876712328768, "grad_norm": 0.30542418360710144, "learning_rate": 4.7315243010447156e-07, "loss": 0.4335, "step": 383 }, { "epoch": 2.6301369863013697, "grad_norm": 0.3106946051120758, "learning_rate": 4.5636761060623314e-07, "loss": 0.4627, "step": 384 }, { "epoch": 2.636986301369863, "grad_norm": 0.294289231300354, "learning_rate": 4.398716649300311e-07, "loss": 0.4503, "step": 385 }, { "epoch": 2.643835616438356, "grad_norm": 0.29368045926094055, "learning_rate": 4.2366564185060134e-07, "loss": 0.4447, "step": 386 }, { "epoch": 2.6506849315068495, "grad_norm": 0.2810214161872864, "learning_rate": 4.077505717100666e-07, "loss": 0.4497, "step": 387 }, { "epoch": 2.6575342465753424, "grad_norm": 0.33020564913749695, "learning_rate": 3.921274663524183e-07, "loss": 0.4356, "step": 388 }, { "epoch": 2.6643835616438354, "grad_norm": 0.2780897319316864, "learning_rate": 3.767973190591906e-07, "loss": 0.3993, "step": 389 }, { "epoch": 2.671232876712329, "grad_norm": 0.3095431625843048, "learning_rate": 3.61761104486314e-07, "loss": 0.446, "step": 390 }, { "epoch": 2.678082191780822, "grad_norm": 0.3015943765640259, "learning_rate": 3.4701977860213956e-07, "loss": 0.4296, "step": 391 }, { "epoch": 2.684931506849315, "grad_norm": 0.2791678011417389, "learning_rate": 3.3257427862666894e-07, "loss": 0.442, "step": 392 }, { "epoch": 2.691780821917808, "grad_norm": 0.3039053678512573, "learning_rate": 3.184255229719624e-07, "loss": 0.4256, "step": 393 }, { "epoch": 2.6986301369863015, "grad_norm": 0.2986087203025818, "learning_rate": 3.045744111837529e-07, "loss": 0.4683, "step": 394 }, { "epoch": 2.7054794520547945, "grad_norm": 0.30011793971061707, "learning_rate": 2.9102182388425106e-07, "loss": 0.4459, "step": 395 }, { "epoch": 2.712328767123288, "grad_norm": 0.26771464943885803, "learning_rate": 2.777686227161591e-07, "loss": 0.4355, "step": 396 }, { "epoch": 2.719178082191781, "grad_norm": 0.28580793738365173, "learning_rate": 2.648156502878907e-07, "loss": 0.4607, "step": 397 }, { "epoch": 2.7260273972602738, "grad_norm": 0.30609655380249023, "learning_rate": 2.5216373011999697e-07, "loss": 0.4665, "step": 398 }, { "epoch": 2.732876712328767, "grad_norm": 0.30261853337287903, "learning_rate": 2.3981366659281135e-07, "loss": 0.4277, "step": 399 }, { "epoch": 2.73972602739726, "grad_norm": 0.2930530905723572, "learning_rate": 2.2776624489530664e-07, "loss": 0.4553, "step": 400 }, { "epoch": 2.7465753424657535, "grad_norm": 0.3095199465751648, "learning_rate": 2.1602223097517915e-07, "loss": 0.426, "step": 401 }, { "epoch": 2.7534246575342465, "grad_norm": 0.3025897741317749, "learning_rate": 2.0458237149014347e-07, "loss": 0.4385, "step": 402 }, { "epoch": 2.76027397260274, "grad_norm": 0.27555036544799805, "learning_rate": 1.9344739376047083e-07, "loss": 0.418, "step": 403 }, { "epoch": 2.767123287671233, "grad_norm": 0.27457016706466675, "learning_rate": 1.8261800572274001e-07, "loss": 0.4213, "step": 404 }, { "epoch": 2.7739726027397262, "grad_norm": 0.2816792130470276, "learning_rate": 1.7209489588483396e-07, "loss": 0.4562, "step": 405 }, { "epoch": 2.780821917808219, "grad_norm": 0.2918860614299774, "learning_rate": 1.6187873328216142e-07, "loss": 0.4405, "step": 406 }, { "epoch": 2.787671232876712, "grad_norm": 0.30940842628479004, "learning_rate": 1.519701674351265e-07, "loss": 0.4667, "step": 407 }, { "epoch": 2.7945205479452055, "grad_norm": 0.2933136224746704, "learning_rate": 1.4236982830782676e-07, "loss": 0.4426, "step": 408 }, { "epoch": 2.8013698630136985, "grad_norm": 0.2901608347892761, "learning_rate": 1.3307832626800966e-07, "loss": 0.4664, "step": 409 }, { "epoch": 2.808219178082192, "grad_norm": 0.2942031919956207, "learning_rate": 1.2409625204825802e-07, "loss": 0.4149, "step": 410 }, { "epoch": 2.815068493150685, "grad_norm": 0.29800334572792053, "learning_rate": 1.1542417670844075e-07, "loss": 0.4464, "step": 411 }, { "epoch": 2.821917808219178, "grad_norm": 0.27716100215911865, "learning_rate": 1.0706265159939944e-07, "loss": 0.4472, "step": 412 }, { "epoch": 2.828767123287671, "grad_norm": 0.2767964005470276, "learning_rate": 9.901220832790104e-08, "loss": 0.4535, "step": 413 }, { "epoch": 2.8356164383561646, "grad_norm": 0.2769363522529602, "learning_rate": 9.12733587228326e-08, "loss": 0.4219, "step": 414 }, { "epoch": 2.8424657534246576, "grad_norm": 0.29575496912002563, "learning_rate": 8.384659480266733e-08, "loss": 0.4549, "step": 415 }, { "epoch": 2.8493150684931505, "grad_norm": 0.2830677032470703, "learning_rate": 7.673238874417676e-08, "loss": 0.4469, "step": 416 }, { "epoch": 2.856164383561644, "grad_norm": 0.31160372495651245, "learning_rate": 6.9931192852416e-08, "loss": 0.4427, "step": 417 }, { "epoch": 2.863013698630137, "grad_norm": 0.28588008880615234, "learning_rate": 6.344343953196386e-08, "loss": 0.4769, "step": 418 }, { "epoch": 2.8698630136986303, "grad_norm": 0.2827588617801666, "learning_rate": 5.726954125943318e-08, "loss": 0.439, "step": 419 }, { "epoch": 2.8767123287671232, "grad_norm": 0.2822471857070923, "learning_rate": 5.1409890557246876e-08, "loss": 0.4169, "step": 420 }, { "epoch": 2.883561643835616, "grad_norm": 0.2951059937477112, "learning_rate": 4.586485996867951e-08, "loss": 0.4413, "step": 421 }, { "epoch": 2.8904109589041096, "grad_norm": 0.28772932291030884, "learning_rate": 4.063480203417625e-08, "loss": 0.4463, "step": 422 }, { "epoch": 2.897260273972603, "grad_norm": 0.2849860191345215, "learning_rate": 3.572004926893413e-08, "loss": 0.4184, "step": 423 }, { "epoch": 2.904109589041096, "grad_norm": 0.2915169298648834, "learning_rate": 3.1120914141766214e-08, "loss": 0.4391, "step": 424 }, { "epoch": 2.910958904109589, "grad_norm": 0.3080134987831116, "learning_rate": 2.683768905523243e-08, "loss": 0.4579, "step": 425 }, { "epoch": 2.9178082191780823, "grad_norm": 0.2908354103565216, "learning_rate": 2.287064632705005e-08, "loss": 0.456, "step": 426 }, { "epoch": 2.9246575342465753, "grad_norm": 0.29851260781288147, "learning_rate": 1.9220038172780843e-08, "loss": 0.4306, "step": 427 }, { "epoch": 2.9315068493150687, "grad_norm": 0.2807960510253906, "learning_rate": 1.588609668979446e-08, "loss": 0.4539, "step": 428 }, { "epoch": 2.9383561643835616, "grad_norm": 0.2773742079734802, "learning_rate": 1.286903384251581e-08, "loss": 0.4576, "step": 429 }, { "epoch": 2.9452054794520546, "grad_norm": 0.2927360534667969, "learning_rate": 1.016904144894304e-08, "loss": 0.4464, "step": 430 }, { "epoch": 2.952054794520548, "grad_norm": 0.28278273344039917, "learning_rate": 7.78629116845786e-09, "loss": 0.471, "step": 431 }, { "epoch": 2.958904109589041, "grad_norm": 0.2733604907989502, "learning_rate": 5.720934490907604e-09, "loss": 0.4357, "step": 432 }, { "epoch": 2.9657534246575343, "grad_norm": 0.2910250723361969, "learning_rate": 3.973102726976819e-09, "loss": 0.4451, "step": 433 }, { "epoch": 2.9726027397260273, "grad_norm": 0.29007551074028015, "learning_rate": 2.542906999836725e-09, "loss": 0.4503, "step": 434 }, { "epoch": 2.9794520547945207, "grad_norm": 0.2818272113800049, "learning_rate": 1.4304382380819771e-09, "loss": 0.4465, "step": 435 }, { "epoch": 2.9863013698630136, "grad_norm": 0.29641109704971313, "learning_rate": 6.357671699486201e-10, "loss": 0.4726, "step": 436 }, { "epoch": 2.993150684931507, "grad_norm": 0.3044302463531494, "learning_rate": 1.5894431881657845e-10, "loss": 0.4642, "step": 437 }, { "epoch": 3.0, "grad_norm": 0.2847689986228943, "learning_rate": 0.0, "loss": 0.4255, "step": 438 }, { "epoch": 3.0, "step": 438, "total_flos": 666359318511616.0, "train_loss": 0.5066342979125237, "train_runtime": 9306.6951, "train_samples_per_second": 4.514, "train_steps_per_second": 0.047 } ], "logging_steps": 1.0, "max_steps": 438, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 666359318511616.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }