{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "global_step": 400000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.7499999999999996e-07, "loss": 1.0134, "step": 1000 }, { "epoch": 0.0, "learning_rate": 7.499999999999999e-07, "loss": 0.8099, "step": 2000 }, { "epoch": 0.0, "learning_rate": 1.1249999999999998e-06, "loss": 0.766, "step": 3000 }, { "epoch": 0.0, "learning_rate": 1.4999999999999998e-06, "loss": 0.7487, "step": 4000 }, { "epoch": 0.01, "learning_rate": 1.8749999999999998e-06, "loss": 0.7437, "step": 5000 }, { "epoch": 0.01, "eval_runtime": 3361.9865, "eval_samples_per_second": 101.027, "eval_steps_per_second": 12.629, "step": 5000 }, { "epoch": 0.01, "learning_rate": 2.2499999999999996e-06, "loss": 0.7429, "step": 6000 }, { "epoch": 0.01, "learning_rate": 2.625e-06, "loss": 0.7413, "step": 7000 }, { "epoch": 0.01, "learning_rate": 2.9999999999999997e-06, "loss": 0.7393, "step": 8000 }, { "epoch": 0.01, "learning_rate": 3.3749999999999995e-06, "loss": 0.7396, "step": 9000 }, { "epoch": 0.01, "learning_rate": 3.7499999999999997e-06, "loss": 0.7387, "step": 10000 }, { "epoch": 0.01, "eval_runtime": 3096.2761, "eval_samples_per_second": 109.697, "eval_steps_per_second": 13.712, "step": 10000 }, { "epoch": 0.01, "learning_rate": 4.1249999999999995e-06, "loss": 0.739, "step": 11000 }, { "epoch": 0.01, "learning_rate": 4.499999999999999e-06, "loss": 0.7386, "step": 12000 }, { "epoch": 0.01, "learning_rate": 4.875e-06, "loss": 0.7336, "step": 13000 }, { "epoch": 0.01, "learning_rate": 5.25e-06, "loss": 0.7324, "step": 14000 }, { "epoch": 0.01, "learning_rate": 5.6249999999999995e-06, "loss": 0.7306, "step": 15000 }, { "epoch": 0.01, "eval_runtime": 3374.1441, "eval_samples_per_second": 100.663, "eval_steps_per_second": 12.583, "step": 15000 }, { "epoch": 0.02, "learning_rate": 5.999999999999999e-06, "loss": 0.7337, "step": 16000 }, { "epoch": 0.02, "learning_rate": 6.375e-06, "loss": 0.7369, "step": 17000 }, { "epoch": 0.02, "learning_rate": 6.749999999999999e-06, "loss": 0.7364, "step": 18000 }, { "epoch": 0.02, "learning_rate": 7.1249999999999995e-06, "loss": 0.738, "step": 19000 }, { "epoch": 0.02, "learning_rate": 7.499999999999999e-06, "loss": 0.7384, "step": 20000 }, { "epoch": 0.02, "eval_runtime": 3264.4039, "eval_samples_per_second": 104.047, "eval_steps_per_second": 13.006, "step": 20000 }, { "epoch": 0.02, "learning_rate": 7.874999999999998e-06, "loss": 0.736, "step": 21000 }, { "epoch": 0.02, "learning_rate": 8.249999999999999e-06, "loss": 0.7356, "step": 22000 }, { "epoch": 0.02, "learning_rate": 8.625e-06, "loss": 0.7345, "step": 23000 }, { "epoch": 0.02, "learning_rate": 8.999999999999999e-06, "loss": 0.7352, "step": 24000 }, { "epoch": 0.03, "learning_rate": 9.375e-06, "loss": 0.7363, "step": 25000 }, { "epoch": 0.03, "eval_runtime": 3202.6438, "eval_samples_per_second": 106.054, "eval_steps_per_second": 13.257, "step": 25000 }, { "epoch": 0.03, "learning_rate": 9.75e-06, "loss": 0.7336, "step": 26000 }, { "epoch": 0.03, "learning_rate": 1.0125e-05, "loss": 0.7194, "step": 27000 }, { "epoch": 0.03, "learning_rate": 1.05e-05, "loss": 0.7003, "step": 28000 }, { "epoch": 0.03, "learning_rate": 1.0874999999999998e-05, "loss": 0.6844, "step": 29000 }, { "epoch": 0.03, "learning_rate": 1.1249999999999999e-05, "loss": 0.6632, "step": 30000 }, { "epoch": 0.03, "eval_runtime": 3180.4112, "eval_samples_per_second": 106.795, "eval_steps_per_second": 13.35, "step": 30000 }, { "epoch": 0.03, "learning_rate": 1.1625e-05, "loss": 0.6374, "step": 31000 }, { "epoch": 0.03, "learning_rate": 1.1999999999999999e-05, "loss": 0.6217, "step": 32000 }, { "epoch": 0.03, "learning_rate": 1.2375e-05, "loss": 0.609, "step": 33000 }, { "epoch": 0.03, "learning_rate": 1.275e-05, "loss": 0.5931, "step": 34000 }, { "epoch": 0.04, "learning_rate": 1.3124999999999999e-05, "loss": 0.5809, "step": 35000 }, { "epoch": 0.04, "eval_runtime": 3129.6915, "eval_samples_per_second": 108.526, "eval_steps_per_second": 13.566, "step": 35000 }, { "epoch": 0.04, "learning_rate": 1.3499999999999998e-05, "loss": 0.5707, "step": 36000 }, { "epoch": 0.04, "learning_rate": 1.3874999999999998e-05, "loss": 0.5619, "step": 37000 }, { "epoch": 0.04, "learning_rate": 1.4249999999999999e-05, "loss": 0.5552, "step": 38000 }, { "epoch": 0.04, "learning_rate": 1.4625e-05, "loss": 0.5502, "step": 39000 }, { "epoch": 0.04, "learning_rate": 1.4999999999999999e-05, "loss": 0.5409, "step": 40000 }, { "epoch": 0.04, "eval_runtime": 3042.0268, "eval_samples_per_second": 111.653, "eval_steps_per_second": 13.957, "step": 40000 }, { "epoch": 0.04, "learning_rate": 1.5374999999999998e-05, "loss": 0.5352, "step": 41000 }, { "epoch": 0.04, "learning_rate": 1.5749999999999997e-05, "loss": 0.5329, "step": 42000 }, { "epoch": 0.04, "learning_rate": 1.6125e-05, "loss": 0.5304, "step": 43000 }, { "epoch": 0.04, "learning_rate": 1.6499999999999998e-05, "loss": 0.5232, "step": 44000 }, { "epoch": 0.04, "learning_rate": 1.6875e-05, "loss": 0.5164, "step": 45000 }, { "epoch": 0.04, "eval_runtime": 3334.4968, "eval_samples_per_second": 101.86, "eval_steps_per_second": 12.733, "step": 45000 }, { "epoch": 0.05, "learning_rate": 1.725e-05, "loss": 0.5133, "step": 46000 }, { "epoch": 0.05, "learning_rate": 1.7624999999999998e-05, "loss": 0.5104, "step": 47000 }, { "epoch": 0.05, "learning_rate": 1.7999999999999997e-05, "loss": 0.507, "step": 48000 }, { "epoch": 0.05, "learning_rate": 1.8375e-05, "loss": 0.5034, "step": 49000 }, { "epoch": 0.05, "learning_rate": 1.875e-05, "loss": 0.5007, "step": 50000 }, { "epoch": 0.05, "eval_runtime": 3361.5856, "eval_samples_per_second": 101.039, "eval_steps_per_second": 12.63, "step": 50000 }, { "epoch": 0.05, "learning_rate": 1.874997607785047e-05, "loss": 0.497, "step": 51000 }, { "epoch": 0.05, "learning_rate": 1.874990431166348e-05, "loss": 0.498, "step": 52000 }, { "epoch": 0.05, "learning_rate": 1.8749784702223863e-05, "loss": 0.4921, "step": 53000 }, { "epoch": 0.05, "learning_rate": 1.8749617250839647e-05, "loss": 0.4876, "step": 54000 }, { "epoch": 0.06, "learning_rate": 1.8749401959342052e-05, "loss": 0.483, "step": 55000 }, { "epoch": 0.06, "eval_runtime": 3145.913, "eval_samples_per_second": 107.966, "eval_steps_per_second": 13.496, "step": 55000 }, { "epoch": 0.06, "learning_rate": 1.874913883008547e-05, "loss": 0.4809, "step": 56000 }, { "epoch": 0.06, "learning_rate": 1.8748827865947437e-05, "loss": 0.4788, "step": 57000 }, { "epoch": 0.06, "learning_rate": 1.8748469070328614e-05, "loss": 0.4767, "step": 58000 }, { "epoch": 0.06, "learning_rate": 1.8748062447152732e-05, "loss": 0.475, "step": 59000 }, { "epoch": 0.06, "learning_rate": 1.874760800086655e-05, "loss": 0.4696, "step": 60000 }, { "epoch": 0.06, "eval_runtime": 3128.1932, "eval_samples_per_second": 108.578, "eval_steps_per_second": 13.572, "step": 60000 }, { "epoch": 0.06, "learning_rate": 1.8747105736439825e-05, "loss": 0.4672, "step": 61000 }, { "epoch": 0.06, "learning_rate": 1.8746555659365244e-05, "loss": 0.4634, "step": 62000 }, { "epoch": 0.06, "learning_rate": 1.8745957775658352e-05, "loss": 0.4605, "step": 63000 }, { "epoch": 0.06, "learning_rate": 1.8745312091857516e-05, "loss": 0.4554, "step": 64000 }, { "epoch": 0.07, "learning_rate": 1.8744618615023832e-05, "loss": 0.4539, "step": 65000 }, { "epoch": 0.07, "eval_runtime": 3195.2405, "eval_samples_per_second": 106.299, "eval_steps_per_second": 13.288, "step": 65000 }, { "epoch": 0.07, "learning_rate": 1.874387735274105e-05, "loss": 0.4547, "step": 66000 }, { "epoch": 0.07, "learning_rate": 1.8743088313115487e-05, "loss": 0.4536, "step": 67000 }, { "epoch": 0.07, "learning_rate": 1.8742251504775967e-05, "loss": 0.4548, "step": 68000 }, { "epoch": 0.07, "learning_rate": 1.8741366936873687e-05, "loss": 0.4528, "step": 69000 }, { "epoch": 0.07, "learning_rate": 1.8740434619082138e-05, "loss": 0.4525, "step": 70000 }, { "epoch": 0.07, "eval_runtime": 2998.8361, "eval_samples_per_second": 113.261, "eval_steps_per_second": 14.158, "step": 70000 }, { "epoch": 0.07, "learning_rate": 1.8739454561597e-05, "loss": 0.4555, "step": 71000 }, { "epoch": 0.07, "learning_rate": 1.873842677513602e-05, "loss": 0.4481, "step": 72000 }, { "epoch": 0.07, "learning_rate": 1.873735127093891e-05, "loss": 0.4446, "step": 73000 }, { "epoch": 0.07, "learning_rate": 1.8736228060767216e-05, "loss": 0.4417, "step": 74000 }, { "epoch": 0.07, "learning_rate": 1.873505715690418e-05, "loss": 0.4408, "step": 75000 }, { "epoch": 0.07, "eval_runtime": 3078.4956, "eval_samples_per_second": 110.331, "eval_steps_per_second": 13.791, "step": 75000 }, { "epoch": 0.08, "learning_rate": 1.8733838572154616e-05, "loss": 0.4382, "step": 76000 }, { "epoch": 0.08, "learning_rate": 1.8732572319844778e-05, "loss": 0.4349, "step": 77000 }, { "epoch": 0.08, "learning_rate": 1.8731258413822188e-05, "loss": 0.4359, "step": 78000 }, { "epoch": 0.08, "learning_rate": 1.8729896868455526e-05, "loss": 0.4317, "step": 79000 }, { "epoch": 0.08, "learning_rate": 1.872848769863442e-05, "loss": 0.4278, "step": 80000 }, { "epoch": 0.08, "eval_runtime": 3097.0612, "eval_samples_per_second": 109.669, "eval_steps_per_second": 13.709, "step": 80000 }, { "epoch": 0.08, "learning_rate": 1.872703091976934e-05, "loss": 0.4257, "step": 81000 }, { "epoch": 0.08, "learning_rate": 1.8725526547791372e-05, "loss": 0.4284, "step": 82000 }, { "epoch": 0.08, "learning_rate": 1.8723974599152097e-05, "loss": 0.4267, "step": 83000 }, { "epoch": 0.08, "learning_rate": 1.8722375090823373e-05, "loss": 0.4296, "step": 84000 }, { "epoch": 0.09, "learning_rate": 1.8720728040297167e-05, "loss": 0.4309, "step": 85000 }, { "epoch": 0.09, "eval_runtime": 3441.7811, "eval_samples_per_second": 98.685, "eval_steps_per_second": 12.336, "step": 85000 }, { "epoch": 0.09, "learning_rate": 1.8719033465585356e-05, "loss": 0.4298, "step": 86000 }, { "epoch": 0.09, "learning_rate": 1.8717291385219546e-05, "loss": 0.431, "step": 87000 }, { "epoch": 0.09, "learning_rate": 1.871550181825084e-05, "loss": 0.4296, "step": 88000 }, { "epoch": 0.09, "learning_rate": 1.8713664784249657e-05, "loss": 0.4262, "step": 89000 }, { "epoch": 0.09, "learning_rate": 1.8711780303305515e-05, "loss": 0.4246, "step": 90000 }, { "epoch": 0.09, "eval_runtime": 3097.2989, "eval_samples_per_second": 109.661, "eval_steps_per_second": 13.708, "step": 90000 }, { "epoch": 0.09, "learning_rate": 1.8709848396026785e-05, "loss": 0.4236, "step": 91000 }, { "epoch": 0.09, "learning_rate": 1.8707869083540504e-05, "loss": 0.422, "step": 92000 }, { "epoch": 0.09, "learning_rate": 1.870584238749211e-05, "loss": 0.4201, "step": 93000 }, { "epoch": 0.09, "learning_rate": 1.870376833004523e-05, "loss": 0.4159, "step": 94000 }, { "epoch": 0.1, "learning_rate": 1.8701646933881424e-05, "loss": 0.411, "step": 95000 }, { "epoch": 0.1, "eval_runtime": 3116.6941, "eval_samples_per_second": 108.978, "eval_steps_per_second": 13.622, "step": 95000 }, { "epoch": 0.1, "learning_rate": 1.8699478222199936e-05, "loss": 0.4144, "step": 96000 }, { "epoch": 0.1, "learning_rate": 1.8697262218717457e-05, "loss": 0.4151, "step": 97000 }, { "epoch": 0.1, "learning_rate": 1.8694998947667835e-05, "loss": 0.4098, "step": 98000 }, { "epoch": 0.1, "learning_rate": 1.869268843380185e-05, "loss": 0.416, "step": 99000 }, { "epoch": 0.1, "learning_rate": 1.8690330702386908e-05, "loss": 0.4171, "step": 100000 }, { "epoch": 0.1, "eval_runtime": 3064.7461, "eval_samples_per_second": 110.825, "eval_steps_per_second": 13.853, "step": 100000 }, { "epoch": 0.1, "learning_rate": 1.868792577920678e-05, "loss": 0.4142, "step": 101000 }, { "epoch": 0.1, "learning_rate": 1.8685473690561325e-05, "loss": 0.4115, "step": 102000 }, { "epoch": 0.1, "learning_rate": 1.868297446326619e-05, "loss": 0.4095, "step": 103000 }, { "epoch": 0.1, "learning_rate": 1.868042812465252e-05, "loss": 0.4088, "step": 104000 }, { "epoch": 0.1, "learning_rate": 1.8677834702566673e-05, "loss": 0.4067, "step": 105000 }, { "epoch": 0.1, "eval_runtime": 3225.8883, "eval_samples_per_second": 105.289, "eval_steps_per_second": 13.161, "step": 105000 }, { "epoch": 0.11, "learning_rate": 1.867519422536989e-05, "loss": 0.4071, "step": 106000 }, { "epoch": 0.11, "learning_rate": 1.867250672193801e-05, "loss": 0.4059, "step": 107000 }, { "epoch": 0.11, "learning_rate": 1.8669772221661142e-05, "loss": 0.4046, "step": 108000 }, { "epoch": 0.11, "learning_rate": 1.8666990754443344e-05, "loss": 0.4045, "step": 109000 }, { "epoch": 0.11, "learning_rate": 1.866416235070229e-05, "loss": 0.4051, "step": 110000 }, { "epoch": 0.11, "eval_runtime": 3164.1479, "eval_samples_per_second": 107.344, "eval_steps_per_second": 13.418, "step": 110000 }, { "epoch": 0.11, "learning_rate": 1.8661287041368955e-05, "loss": 0.4072, "step": 111000 }, { "epoch": 0.11, "learning_rate": 1.8658364857887257e-05, "loss": 0.4055, "step": 112000 }, { "epoch": 0.11, "learning_rate": 1.865539583221373e-05, "loss": 0.4046, "step": 113000 }, { "epoch": 0.11, "learning_rate": 1.865237999681716e-05, "loss": 0.4075, "step": 114000 }, { "epoch": 0.12, "learning_rate": 1.8649317384678245e-05, "loss": 0.4068, "step": 115000 }, { "epoch": 0.12, "eval_runtime": 3343.7877, "eval_samples_per_second": 101.577, "eval_steps_per_second": 12.697, "step": 115000 }, { "epoch": 0.12, "learning_rate": 1.8646208029289217e-05, "loss": 0.4054, "step": 116000 }, { "epoch": 0.12, "learning_rate": 1.8643051964653488e-05, "loss": 0.4031, "step": 117000 }, { "epoch": 0.12, "learning_rate": 1.8639849225285276e-05, "loss": 0.4035, "step": 118000 }, { "epoch": 0.12, "learning_rate": 1.8636599846209226e-05, "loss": 0.4039, "step": 119000 }, { "epoch": 0.12, "learning_rate": 1.8633303862960024e-05, "loss": 0.4067, "step": 120000 }, { "epoch": 0.12, "eval_runtime": 3601.8354, "eval_samples_per_second": 94.3, "eval_steps_per_second": 11.788, "step": 120000 }, { "epoch": 0.12, "learning_rate": 1.8629961311582018e-05, "loss": 0.4048, "step": 121000 }, { "epoch": 0.12, "learning_rate": 1.8626572228628815e-05, "loss": 0.4005, "step": 122000 }, { "epoch": 0.12, "learning_rate": 1.862313665116288e-05, "loss": 0.3918, "step": 123000 }, { "epoch": 0.12, "learning_rate": 1.8619654616755143e-05, "loss": 0.3931, "step": 124000 }, { "epoch": 0.12, "learning_rate": 1.861612616348457e-05, "loss": 0.394, "step": 125000 }, { "epoch": 0.12, "eval_runtime": 3425.9917, "eval_samples_per_second": 99.14, "eval_steps_per_second": 12.393, "step": 125000 }, { "epoch": 0.13, "learning_rate": 1.861255132993776e-05, "loss": 0.3948, "step": 126000 }, { "epoch": 0.13, "learning_rate": 1.860893015520852e-05, "loss": 0.3987, "step": 127000 }, { "epoch": 0.13, "learning_rate": 1.860526267889744e-05, "loss": 0.3978, "step": 128000 }, { "epoch": 0.13, "learning_rate": 1.8601548941111453e-05, "loss": 0.3989, "step": 129000 }, { "epoch": 0.13, "learning_rate": 1.85977889824634e-05, "loss": 0.3958, "step": 130000 }, { "epoch": 0.13, "eval_runtime": 3324.1954, "eval_samples_per_second": 102.176, "eval_steps_per_second": 12.772, "step": 130000 }, { "epoch": 0.13, "learning_rate": 1.859398284407158e-05, "loss": 0.3973, "step": 131000 }, { "epoch": 0.13, "learning_rate": 1.8590130567559324e-05, "loss": 0.3958, "step": 132000 }, { "epoch": 0.13, "learning_rate": 1.85862321950545e-05, "loss": 0.3932, "step": 133000 }, { "epoch": 0.13, "learning_rate": 1.8582287769189092e-05, "loss": 0.3953, "step": 134000 }, { "epoch": 0.14, "learning_rate": 1.857829733309871e-05, "loss": 0.3947, "step": 135000 }, { "epoch": 0.14, "eval_runtime": 3357.919, "eval_samples_per_second": 101.15, "eval_steps_per_second": 12.644, "step": 135000 }, { "epoch": 0.14, "learning_rate": 1.8574260930422114e-05, "loss": 0.392, "step": 136000 }, { "epoch": 0.14, "learning_rate": 1.857017860530077e-05, "loss": 0.3907, "step": 137000 }, { "epoch": 0.14, "learning_rate": 1.8566050402378328e-05, "loss": 0.3891, "step": 138000 }, { "epoch": 0.14, "learning_rate": 1.8561876366800144e-05, "loss": 0.385, "step": 139000 }, { "epoch": 0.14, "learning_rate": 1.8557656544212814e-05, "loss": 0.3867, "step": 140000 }, { "epoch": 0.14, "eval_runtime": 3296.2788, "eval_samples_per_second": 103.041, "eval_steps_per_second": 12.88, "step": 140000 }, { "epoch": 0.14, "learning_rate": 1.8553390980763637e-05, "loss": 0.3846, "step": 141000 }, { "epoch": 0.14, "learning_rate": 1.854907972310013e-05, "loss": 0.386, "step": 142000 }, { "epoch": 0.14, "learning_rate": 1.8544722818369517e-05, "loss": 0.3895, "step": 143000 }, { "epoch": 0.14, "learning_rate": 1.8540320314218213e-05, "loss": 0.3888, "step": 144000 }, { "epoch": 0.14, "learning_rate": 1.8535872258791296e-05, "loss": 0.3887, "step": 145000 }, { "epoch": 0.14, "eval_runtime": 3475.3376, "eval_samples_per_second": 97.732, "eval_steps_per_second": 12.217, "step": 145000 }, { "epoch": 0.15, "learning_rate": 1.8531378700731987e-05, "loss": 0.3886, "step": 146000 }, { "epoch": 0.15, "learning_rate": 1.8526839689181117e-05, "loss": 0.3891, "step": 147000 }, { "epoch": 0.15, "learning_rate": 1.852225527377659e-05, "loss": 0.3876, "step": 148000 }, { "epoch": 0.15, "learning_rate": 1.851762550465284e-05, "loss": 0.3882, "step": 149000 }, { "epoch": 0.15, "learning_rate": 1.8512950432440274e-05, "loss": 0.3892, "step": 150000 }, { "epoch": 0.15, "eval_runtime": 3067.6279, "eval_samples_per_second": 110.721, "eval_steps_per_second": 13.84, "step": 150000 }, { "epoch": 0.15, "learning_rate": 1.8508230108264738e-05, "loss": 0.3869, "step": 151000 }, { "epoch": 0.15, "learning_rate": 1.8503464583746943e-05, "loss": 0.3857, "step": 152000 }, { "epoch": 0.15, "learning_rate": 1.8498653911001896e-05, "loss": 0.3856, "step": 153000 }, { "epoch": 0.15, "learning_rate": 1.8493798142638353e-05, "loss": 0.3848, "step": 154000 }, { "epoch": 0.15, "learning_rate": 1.8488897331758204e-05, "loss": 0.3867, "step": 155000 }, { "epoch": 0.15, "eval_runtime": 3067.5049, "eval_samples_per_second": 110.726, "eval_steps_per_second": 13.841, "step": 155000 }, { "epoch": 0.16, "learning_rate": 1.8483951531955943e-05, "loss": 0.3869, "step": 156000 }, { "epoch": 0.16, "learning_rate": 1.8478960797318037e-05, "loss": 0.3855, "step": 157000 }, { "epoch": 0.16, "learning_rate": 1.847392518242237e-05, "loss": 0.3866, "step": 158000 }, { "epoch": 0.16, "learning_rate": 1.846884474233761e-05, "loss": 0.3881, "step": 159000 }, { "epoch": 0.16, "learning_rate": 1.846371953262264e-05, "loss": 0.3871, "step": 160000 }, { "epoch": 0.16, "eval_runtime": 3122.6166, "eval_samples_per_second": 108.772, "eval_steps_per_second": 13.597, "step": 160000 }, { "epoch": 0.16, "learning_rate": 1.845854960932593e-05, "loss": 0.3863, "step": 161000 }, { "epoch": 0.16, "learning_rate": 1.845333502898494e-05, "loss": 0.3895, "step": 162000 }, { "epoch": 0.16, "learning_rate": 1.844807584862548e-05, "loss": 0.3875, "step": 163000 }, { "epoch": 0.16, "learning_rate": 1.84427721257611e-05, "loss": 0.3877, "step": 164000 }, { "epoch": 0.17, "learning_rate": 1.8437423918392468e-05, "loss": 0.3874, "step": 165000 }, { "epoch": 0.17, "eval_runtime": 3037.4441, "eval_samples_per_second": 111.822, "eval_steps_per_second": 13.978, "step": 165000 }, { "epoch": 0.17, "learning_rate": 1.8432031285006723e-05, "loss": 0.3855, "step": 166000 }, { "epoch": 0.17, "learning_rate": 1.842659428457684e-05, "loss": 0.3856, "step": 167000 }, { "epoch": 0.17, "learning_rate": 1.842111297656098e-05, "loss": 0.3842, "step": 168000 }, { "epoch": 0.17, "learning_rate": 1.8415587420901857e-05, "loss": 0.3865, "step": 169000 }, { "epoch": 0.17, "learning_rate": 1.8410017678026057e-05, "loss": 0.3883, "step": 170000 }, { "epoch": 0.17, "eval_runtime": 3269.0941, "eval_samples_per_second": 103.898, "eval_steps_per_second": 12.987, "step": 170000 }, { "epoch": 0.17, "learning_rate": 1.8404403808843397e-05, "loss": 0.388, "step": 171000 }, { "epoch": 0.17, "learning_rate": 1.8398745874746253e-05, "loss": 0.3842, "step": 172000 }, { "epoch": 0.17, "learning_rate": 1.839304393760888e-05, "loss": 0.3848, "step": 173000 }, { "epoch": 0.17, "learning_rate": 1.8387298059786754e-05, "loss": 0.3832, "step": 174000 }, { "epoch": 0.17, "learning_rate": 1.8381508304115872e-05, "loss": 0.3823, "step": 175000 }, { "epoch": 0.17, "eval_runtime": 3160.104, "eval_samples_per_second": 107.481, "eval_steps_per_second": 13.435, "step": 175000 }, { "epoch": 0.18, "learning_rate": 1.837567473391208e-05, "loss": 0.3828, "step": 176000 }, { "epoch": 0.18, "learning_rate": 1.836979741297036e-05, "loss": 0.3827, "step": 177000 }, { "epoch": 0.18, "learning_rate": 1.8363876405564156e-05, "loss": 0.3839, "step": 178000 }, { "epoch": 0.18, "learning_rate": 1.8357911776444656e-05, "loss": 0.384, "step": 179000 }, { "epoch": 0.18, "learning_rate": 1.8351903590840085e-05, "loss": 0.3841, "step": 180000 }, { "epoch": 0.18, "eval_runtime": 3320.0581, "eval_samples_per_second": 102.303, "eval_steps_per_second": 12.788, "step": 180000 }, { "epoch": 0.18, "learning_rate": 1.8345851914455004e-05, "loss": 0.3833, "step": 181000 }, { "epoch": 0.18, "learning_rate": 1.8339756813469568e-05, "loss": 0.3806, "step": 182000 }, { "epoch": 0.18, "learning_rate": 1.8333618354538833e-05, "loss": 0.3814, "step": 183000 }, { "epoch": 0.18, "learning_rate": 1.8327436604792002e-05, "loss": 0.3798, "step": 184000 }, { "epoch": 0.18, "learning_rate": 1.8321211631831694e-05, "loss": 0.3788, "step": 185000 }, { "epoch": 0.18, "eval_runtime": 3164.8004, "eval_samples_per_second": 107.322, "eval_steps_per_second": 13.415, "step": 185000 }, { "epoch": 0.19, "learning_rate": 1.831494350373322e-05, "loss": 0.3794, "step": 186000 }, { "epoch": 0.19, "learning_rate": 1.8308632289043824e-05, "loss": 0.3795, "step": 187000 }, { "epoch": 0.19, "learning_rate": 1.830227805678194e-05, "loss": 0.3783, "step": 188000 }, { "epoch": 0.19, "learning_rate": 1.829588087643643e-05, "loss": 0.3765, "step": 189000 }, { "epoch": 0.19, "learning_rate": 1.8289440817965837e-05, "loss": 0.3752, "step": 190000 }, { "epoch": 0.19, "eval_runtime": 3330.5005, "eval_samples_per_second": 101.982, "eval_steps_per_second": 12.748, "step": 190000 }, { "epoch": 0.19, "learning_rate": 1.828295795179761e-05, "loss": 0.3747, "step": 191000 }, { "epoch": 0.19, "learning_rate": 1.8276432348827332e-05, "loss": 0.3753, "step": 192000 }, { "epoch": 0.19, "learning_rate": 1.8269864080417946e-05, "loss": 0.3741, "step": 193000 }, { "epoch": 0.19, "learning_rate": 1.8263253218398995e-05, "loss": 0.3717, "step": 194000 }, { "epoch": 0.2, "learning_rate": 1.82565998350658e-05, "loss": 0.3707, "step": 195000 }, { "epoch": 0.2, "eval_runtime": 3281.5026, "eval_samples_per_second": 103.505, "eval_steps_per_second": 12.938, "step": 195000 }, { "epoch": 0.2, "learning_rate": 1.8249904003178695e-05, "loss": 0.3758, "step": 196000 }, { "epoch": 0.2, "learning_rate": 1.8243165795962227e-05, "loss": 0.3735, "step": 197000 }, { "epoch": 0.2, "learning_rate": 1.8236385287104348e-05, "loss": 0.3765, "step": 198000 }, { "epoch": 0.2, "learning_rate": 1.8229562550755617e-05, "loss": 0.3753, "step": 199000 }, { "epoch": 0.2, "learning_rate": 1.8222697661528388e-05, "loss": 0.3713, "step": 200000 }, { "epoch": 0.2, "eval_runtime": 3419.5195, "eval_samples_per_second": 99.327, "eval_steps_per_second": 12.416, "step": 200000 }, { "epoch": 0.2, "learning_rate": 1.821579069449599e-05, "loss": 0.3706, "step": 201000 }, { "epoch": 0.2, "learning_rate": 1.82088417251919e-05, "loss": 0.373, "step": 202000 }, { "epoch": 0.2, "learning_rate": 1.8201850829608947e-05, "loss": 0.3777, "step": 203000 }, { "epoch": 0.2, "learning_rate": 1.819481808419843e-05, "loss": 0.3748, "step": 204000 }, { "epoch": 0.2, "learning_rate": 1.818774356586934e-05, "loss": 0.3742, "step": 205000 }, { "epoch": 0.2, "eval_runtime": 3221.436, "eval_samples_per_second": 105.435, "eval_steps_per_second": 13.18, "step": 205000 }, { "epoch": 0.21, "learning_rate": 1.8180627351987462e-05, "loss": 0.3724, "step": 206000 }, { "epoch": 0.21, "learning_rate": 1.8173469520374583e-05, "loss": 0.3716, "step": 207000 }, { "epoch": 0.21, "learning_rate": 1.8166270149307596e-05, "loss": 0.3721, "step": 208000 }, { "epoch": 0.21, "learning_rate": 1.8159029317517672e-05, "loss": 0.3724, "step": 209000 }, { "epoch": 0.21, "learning_rate": 1.8151747104189386e-05, "loss": 0.3721, "step": 210000 }, { "epoch": 0.21, "eval_runtime": 3177.8077, "eval_samples_per_second": 106.882, "eval_steps_per_second": 13.36, "step": 210000 }, { "epoch": 0.21, "learning_rate": 1.8144423588959855e-05, "loss": 0.3706, "step": 211000 }, { "epoch": 0.21, "learning_rate": 1.8137058851917872e-05, "loss": 0.3709, "step": 212000 }, { "epoch": 0.21, "learning_rate": 1.812965297360302e-05, "loss": 0.372, "step": 213000 }, { "epoch": 0.21, "learning_rate": 1.81222060350048e-05, "loss": 0.3734, "step": 214000 }, { "epoch": 0.21, "learning_rate": 1.811471811756173e-05, "loss": 0.3724, "step": 215000 }, { "epoch": 0.21, "eval_runtime": 3189.5397, "eval_samples_per_second": 106.489, "eval_steps_per_second": 13.311, "step": 215000 }, { "epoch": 0.22, "learning_rate": 1.8107189303160486e-05, "loss": 0.3723, "step": 216000 }, { "epoch": 0.22, "learning_rate": 1.8099619674134973e-05, "loss": 0.3714, "step": 217000 }, { "epoch": 0.22, "learning_rate": 1.809200931326544e-05, "loss": 0.3712, "step": 218000 }, { "epoch": 0.22, "learning_rate": 1.8084358303777576e-05, "loss": 0.3704, "step": 219000 }, { "epoch": 0.22, "learning_rate": 1.807666672934159e-05, "loss": 0.37, "step": 220000 }, { "epoch": 0.22, "eval_runtime": 3198.1902, "eval_samples_per_second": 106.201, "eval_steps_per_second": 13.275, "step": 220000 }, { "epoch": 0.22, "learning_rate": 1.8068934674071315e-05, "loss": 0.3679, "step": 221000 }, { "epoch": 0.22, "learning_rate": 1.8061162222523262e-05, "loss": 0.3655, "step": 222000 }, { "epoch": 0.22, "learning_rate": 1.8053349459695724e-05, "loss": 0.3657, "step": 223000 }, { "epoch": 0.22, "learning_rate": 1.8045496471027813e-05, "loss": 0.3655, "step": 224000 }, { "epoch": 0.23, "learning_rate": 1.803760334239856e-05, "loss": 0.3667, "step": 225000 }, { "epoch": 0.23, "eval_runtime": 3366.2132, "eval_samples_per_second": 100.9, "eval_steps_per_second": 12.613, "step": 225000 }, { "epoch": 0.23, "learning_rate": 1.802967016012596e-05, "loss": 0.3668, "step": 226000 }, { "epoch": 0.23, "learning_rate": 1.8021697010966016e-05, "loss": 0.3669, "step": 227000 }, { "epoch": 0.23, "learning_rate": 1.8013683982111812e-05, "loss": 0.366, "step": 228000 }, { "epoch": 0.23, "learning_rate": 1.8005631161192552e-05, "loss": 0.3692, "step": 229000 }, { "epoch": 0.23, "learning_rate": 1.7997538636272585e-05, "loss": 0.3683, "step": 230000 }, { "epoch": 0.23, "eval_runtime": 3198.8818, "eval_samples_per_second": 106.178, "eval_steps_per_second": 13.272, "step": 230000 }, { "epoch": 0.23, "learning_rate": 1.798940649585048e-05, "loss": 0.3703, "step": 231000 }, { "epoch": 0.23, "learning_rate": 1.7981234828858012e-05, "loss": 0.3696, "step": 232000 }, { "epoch": 0.23, "learning_rate": 1.7973023724659226e-05, "loss": 0.3696, "step": 233000 }, { "epoch": 0.23, "learning_rate": 1.7964773273049443e-05, "loss": 0.3696, "step": 234000 }, { "epoch": 0.23, "learning_rate": 1.795648356425428e-05, "loss": 0.3689, "step": 235000 }, { "epoch": 0.23, "eval_runtime": 3392.6969, "eval_samples_per_second": 100.113, "eval_steps_per_second": 12.514, "step": 235000 }, { "epoch": 0.24, "learning_rate": 1.7948154688928657e-05, "loss": 0.3691, "step": 236000 }, { "epoch": 0.24, "learning_rate": 1.793978673815583e-05, "loss": 0.368, "step": 237000 }, { "epoch": 0.24, "learning_rate": 1.7931379803446365e-05, "loss": 0.3662, "step": 238000 }, { "epoch": 0.24, "learning_rate": 1.792293397673715e-05, "loss": 0.3634, "step": 239000 }, { "epoch": 0.24, "learning_rate": 1.791444935039039e-05, "loss": 0.365, "step": 240000 }, { "epoch": 0.24, "eval_runtime": 3401.6269, "eval_samples_per_second": 99.85, "eval_steps_per_second": 12.481, "step": 240000 }, { "epoch": 0.24, "learning_rate": 1.7905926017192613e-05, "loss": 0.3632, "step": 241000 }, { "epoch": 0.24, "learning_rate": 1.7897364070353612e-05, "loss": 0.3633, "step": 242000 }, { "epoch": 0.24, "learning_rate": 1.788876360350547e-05, "loss": 0.3628, "step": 243000 }, { "epoch": 0.24, "learning_rate": 1.7880124710701515e-05, "loss": 0.3628, "step": 244000 }, { "epoch": 0.24, "learning_rate": 1.7871447486415292e-05, "loss": 0.363, "step": 245000 }, { "epoch": 0.24, "eval_runtime": 3598.9717, "eval_samples_per_second": 94.375, "eval_steps_per_second": 11.797, "step": 245000 }, { "epoch": 0.25, "learning_rate": 1.7862732025539543e-05, "loss": 0.364, "step": 246000 }, { "epoch": 0.25, "learning_rate": 1.7853978423385145e-05, "loss": 0.3626, "step": 247000 }, { "epoch": 0.25, "learning_rate": 1.784518677568009e-05, "loss": 0.361, "step": 248000 }, { "epoch": 0.25, "learning_rate": 1.783635717856843e-05, "loss": 0.3597, "step": 249000 }, { "epoch": 0.25, "learning_rate": 1.782748972860922e-05, "loss": 0.3626, "step": 250000 }, { "epoch": 0.25, "eval_runtime": 2943.8602, "eval_samples_per_second": 115.376, "eval_steps_per_second": 14.422, "step": 250000 }, { "epoch": 0.25, "learning_rate": 1.7818584522775476e-05, "loss": 0.3617, "step": 251000 }, { "epoch": 0.25, "learning_rate": 1.7809641658453108e-05, "loss": 0.3627, "step": 252000 }, { "epoch": 0.25, "learning_rate": 1.7800661233439838e-05, "loss": 0.3633, "step": 253000 }, { "epoch": 0.25, "learning_rate": 1.7791643345944158e-05, "loss": 0.3654, "step": 254000 }, { "epoch": 0.26, "learning_rate": 1.778258809458424e-05, "loss": 0.3657, "step": 255000 }, { "epoch": 0.26, "eval_runtime": 3099.287, "eval_samples_per_second": 109.59, "eval_steps_per_second": 13.699, "step": 255000 }, { "epoch": 0.26, "learning_rate": 1.7773495578386868e-05, "loss": 0.3615, "step": 256000 }, { "epoch": 0.26, "learning_rate": 1.7764365896786334e-05, "loss": 0.3594, "step": 257000 }, { "epoch": 0.26, "learning_rate": 1.7755199149623376e-05, "loss": 0.3595, "step": 258000 }, { "epoch": 0.26, "learning_rate": 1.7745995437144077e-05, "loss": 0.3598, "step": 259000 }, { "epoch": 0.26, "learning_rate": 1.7736754859998755e-05, "loss": 0.3604, "step": 260000 }, { "epoch": 0.26, "eval_runtime": 2888.4568, "eval_samples_per_second": 117.589, "eval_steps_per_second": 14.699, "step": 260000 }, { "epoch": 0.26, "learning_rate": 1.772747751924089e-05, "loss": 0.3593, "step": 261000 }, { "epoch": 0.26, "learning_rate": 1.7718163516325983e-05, "loss": 0.3609, "step": 262000 }, { "epoch": 0.26, "learning_rate": 1.770881295311049e-05, "loss": 0.3585, "step": 263000 }, { "epoch": 0.26, "learning_rate": 1.7699425931850663e-05, "loss": 0.3588, "step": 264000 }, { "epoch": 0.27, "learning_rate": 1.7690002555201464e-05, "loss": 0.3612, "step": 265000 }, { "epoch": 0.27, "eval_runtime": 2882.355, "eval_samples_per_second": 117.838, "eval_steps_per_second": 14.73, "step": 265000 }, { "epoch": 0.27, "learning_rate": 1.7680542926215433e-05, "loss": 0.3619, "step": 266000 }, { "epoch": 0.27, "learning_rate": 1.7671047148341554e-05, "loss": 0.3621, "step": 267000 }, { "epoch": 0.27, "learning_rate": 1.7661515325424137e-05, "loss": 0.3617, "step": 268000 }, { "epoch": 0.27, "learning_rate": 1.7651947561701667e-05, "loss": 0.3582, "step": 269000 }, { "epoch": 0.27, "learning_rate": 1.764234396180567e-05, "loss": 0.3601, "step": 270000 }, { "epoch": 0.27, "eval_runtime": 3115.4481, "eval_samples_per_second": 109.022, "eval_steps_per_second": 13.628, "step": 270000 }, { "epoch": 0.27, "learning_rate": 1.7632704630759587e-05, "loss": 0.3595, "step": 271000 }, { "epoch": 0.27, "learning_rate": 1.7623029673977594e-05, "loss": 0.3606, "step": 272000 }, { "epoch": 0.27, "learning_rate": 1.761331919726346e-05, "loss": 0.3608, "step": 273000 }, { "epoch": 0.27, "learning_rate": 1.7603573306809416e-05, "loss": 0.3603, "step": 274000 }, { "epoch": 0.28, "learning_rate": 1.759379210919495e-05, "loss": 0.3607, "step": 275000 }, { "epoch": 0.28, "eval_runtime": 3107.9101, "eval_samples_per_second": 109.286, "eval_steps_per_second": 13.661, "step": 275000 }, { "epoch": 0.28, "learning_rate": 1.7583975711385674e-05, "loss": 0.3608, "step": 276000 }, { "epoch": 0.28, "learning_rate": 1.757412422073214e-05, "loss": 0.3601, "step": 277000 }, { "epoch": 0.28, "learning_rate": 1.7564237744968674e-05, "loss": 0.3609, "step": 278000 }, { "epoch": 0.28, "learning_rate": 1.755431639221219e-05, "loss": 0.3618, "step": 279000 }, { "epoch": 0.28, "learning_rate": 1.7544360270961013e-05, "loss": 0.3606, "step": 280000 }, { "epoch": 0.28, "eval_runtime": 3254.5533, "eval_samples_per_second": 104.362, "eval_steps_per_second": 13.045, "step": 280000 }, { "epoch": 0.28, "learning_rate": 1.7534369490093688e-05, "loss": 0.3583, "step": 281000 }, { "epoch": 0.28, "learning_rate": 1.7524344158867797e-05, "loss": 0.3586, "step": 282000 }, { "epoch": 0.28, "learning_rate": 1.7514284386918754e-05, "loss": 0.3523, "step": 283000 }, { "epoch": 0.28, "learning_rate": 1.750419028425861e-05, "loss": 0.3599, "step": 284000 }, { "epoch": 0.28, "learning_rate": 1.749406196127486e-05, "loss": 0.3604, "step": 285000 }, { "epoch": 0.28, "eval_runtime": 3099.6554, "eval_samples_per_second": 109.577, "eval_steps_per_second": 13.697, "step": 285000 }, { "epoch": 0.29, "learning_rate": 1.7483899528729222e-05, "loss": 0.3573, "step": 286000 }, { "epoch": 0.29, "learning_rate": 1.7473703097756425e-05, "loss": 0.3546, "step": 287000 }, { "epoch": 0.29, "learning_rate": 1.7463472779863007e-05, "loss": 0.3524, "step": 288000 }, { "epoch": 0.29, "learning_rate": 1.7453208686926084e-05, "loss": 0.3532, "step": 289000 }, { "epoch": 0.29, "learning_rate": 1.7442910931192138e-05, "loss": 0.3555, "step": 290000 }, { "epoch": 0.29, "eval_runtime": 3271.082, "eval_samples_per_second": 103.835, "eval_steps_per_second": 12.979, "step": 290000 }, { "epoch": 0.29, "learning_rate": 1.7432579625275767e-05, "loss": 0.3557, "step": 291000 }, { "epoch": 0.29, "learning_rate": 1.7422214882158484e-05, "loss": 0.3569, "step": 292000 }, { "epoch": 0.29, "learning_rate": 1.7411816815187455e-05, "loss": 0.3564, "step": 293000 }, { "epoch": 0.29, "learning_rate": 1.7401385538074276e-05, "loss": 0.3574, "step": 294000 }, { "epoch": 0.29, "learning_rate": 1.7390921164893724e-05, "loss": 0.3573, "step": 295000 }, { "epoch": 0.29, "eval_runtime": 3244.7523, "eval_samples_per_second": 104.677, "eval_steps_per_second": 13.085, "step": 295000 }, { "epoch": 0.3, "learning_rate": 1.7380423810082507e-05, "loss": 0.353, "step": 296000 }, { "epoch": 0.3, "learning_rate": 1.7369893588438012e-05, "loss": 0.3568, "step": 297000 }, { "epoch": 0.3, "learning_rate": 1.7359330615117058e-05, "loss": 0.3537, "step": 298000 }, { "epoch": 0.3, "learning_rate": 1.734873500563463e-05, "loss": 0.3576, "step": 299000 }, { "epoch": 0.3, "learning_rate": 1.7338106875862617e-05, "loss": 0.3575, "step": 300000 }, { "epoch": 0.3, "eval_runtime": 3130.8127, "eval_samples_per_second": 108.487, "eval_steps_per_second": 13.561, "step": 300000 }, { "epoch": 0.3, "learning_rate": 1.732744634202854e-05, "loss": 0.3579, "step": 301000 }, { "epoch": 0.3, "learning_rate": 1.731675352071429e-05, "loss": 0.3569, "step": 302000 }, { "epoch": 0.3, "learning_rate": 1.7306028528854846e-05, "loss": 0.3561, "step": 303000 }, { "epoch": 0.3, "learning_rate": 1.7295271483737004e-05, "loss": 0.3563, "step": 304000 }, { "epoch": 0.3, "learning_rate": 1.7284482502998086e-05, "loss": 0.3567, "step": 305000 }, { "epoch": 0.3, "eval_runtime": 3353.5685, "eval_samples_per_second": 101.281, "eval_steps_per_second": 12.66, "step": 305000 }, { "epoch": 0.31, "learning_rate": 1.7273661704624656e-05, "loss": 0.3559, "step": 306000 }, { "epoch": 0.31, "learning_rate": 1.7262809206951228e-05, "loss": 0.355, "step": 307000 }, { "epoch": 0.31, "learning_rate": 1.725192512865898e-05, "loss": 0.3576, "step": 308000 }, { "epoch": 0.31, "learning_rate": 1.7241009588774453e-05, "loss": 0.3555, "step": 309000 }, { "epoch": 0.31, "learning_rate": 1.7230062706668237e-05, "loss": 0.3523, "step": 310000 }, { "epoch": 0.31, "eval_runtime": 3333.1448, "eval_samples_per_second": 101.901, "eval_steps_per_second": 12.738, "step": 310000 }, { "epoch": 0.31, "learning_rate": 1.721908460205368e-05, "loss": 0.3526, "step": 311000 }, { "epoch": 0.31, "learning_rate": 1.7208075394985582e-05, "loss": 0.3558, "step": 312000 }, { "epoch": 0.31, "learning_rate": 1.719703520585886e-05, "loss": 0.3536, "step": 313000 }, { "epoch": 0.31, "learning_rate": 1.718596415540726e-05, "loss": 0.3463, "step": 314000 }, { "epoch": 0.32, "learning_rate": 1.717486236470201e-05, "loss": 0.3538, "step": 315000 }, { "epoch": 0.32, "eval_runtime": 3107.5393, "eval_samples_per_second": 109.299, "eval_steps_per_second": 13.663, "step": 315000 }, { "epoch": 0.32, "learning_rate": 1.7163729955150515e-05, "loss": 0.353, "step": 316000 }, { "epoch": 0.32, "learning_rate": 1.7152567048495027e-05, "loss": 0.3533, "step": 317000 }, { "epoch": 0.32, "learning_rate": 1.7141373766811305e-05, "loss": 0.3519, "step": 318000 }, { "epoch": 0.32, "learning_rate": 1.713015023250728e-05, "loss": 0.352, "step": 319000 }, { "epoch": 0.32, "learning_rate": 1.7118896568321722e-05, "loss": 0.3507, "step": 320000 }, { "epoch": 0.32, "eval_runtime": 3244.0396, "eval_samples_per_second": 104.7, "eval_steps_per_second": 13.088, "step": 320000 }, { "epoch": 0.32, "learning_rate": 1.7107612897322908e-05, "loss": 0.3468, "step": 321000 }, { "epoch": 0.32, "learning_rate": 1.7096299342907253e-05, "loss": 0.348, "step": 322000 }, { "epoch": 0.32, "learning_rate": 1.708495602879797e-05, "loss": 0.3455, "step": 323000 }, { "epoch": 0.32, "learning_rate": 1.7073583079043734e-05, "loss": 0.3453, "step": 324000 }, { "epoch": 0.33, "learning_rate": 1.7062180618017294e-05, "loss": 0.3451, "step": 325000 }, { "epoch": 0.33, "eval_runtime": 3171.68, "eval_samples_per_second": 107.089, "eval_steps_per_second": 13.386, "step": 325000 }, { "epoch": 0.33, "learning_rate": 1.7050748770414142e-05, "loss": 0.3469, "step": 326000 }, { "epoch": 0.33, "learning_rate": 1.703928766125113e-05, "loss": 0.3489, "step": 327000 }, { "epoch": 0.33, "learning_rate": 1.7027797415865108e-05, "loss": 0.3472, "step": 328000 }, { "epoch": 0.33, "learning_rate": 1.7016278159911565e-05, "loss": 0.3488, "step": 329000 }, { "epoch": 0.33, "learning_rate": 1.7004730019363233e-05, "loss": 0.3518, "step": 330000 }, { "epoch": 0.33, "eval_runtime": 3098.1629, "eval_samples_per_second": 109.63, "eval_steps_per_second": 13.704, "step": 330000 }, { "epoch": 0.33, "learning_rate": 1.699315312050873e-05, "loss": 0.3498, "step": 331000 }, { "epoch": 0.33, "learning_rate": 1.698154758995117e-05, "loss": 0.348, "step": 332000 }, { "epoch": 0.33, "learning_rate": 1.6969913554606766e-05, "loss": 0.3523, "step": 333000 }, { "epoch": 0.33, "learning_rate": 1.6958251141703475e-05, "loss": 0.3521, "step": 334000 }, { "epoch": 0.34, "learning_rate": 1.6946560478779568e-05, "loss": 0.352, "step": 335000 }, { "epoch": 0.34, "eval_runtime": 3130.2899, "eval_samples_per_second": 108.505, "eval_steps_per_second": 13.563, "step": 335000 }, { "epoch": 0.34, "learning_rate": 1.693484169368227e-05, "loss": 0.3507, "step": 336000 }, { "epoch": 0.34, "learning_rate": 1.692309491456633e-05, "loss": 0.3482, "step": 337000 }, { "epoch": 0.34, "learning_rate": 1.6911320269892646e-05, "loss": 0.3505, "step": 338000 }, { "epoch": 0.34, "learning_rate": 1.6899517888426844e-05, "loss": 0.3528, "step": 339000 }, { "epoch": 0.34, "learning_rate": 1.6887687899237877e-05, "loss": 0.3518, "step": 340000 }, { "epoch": 0.34, "eval_runtime": 3206.3294, "eval_samples_per_second": 105.932, "eval_steps_per_second": 13.242, "step": 340000 }, { "epoch": 0.34, "learning_rate": 1.6875830431696614e-05, "loss": 0.3519, "step": 341000 }, { "epoch": 0.34, "learning_rate": 1.6863945615474417e-05, "loss": 0.3529, "step": 342000 }, { "epoch": 0.34, "learning_rate": 1.6852033580541736e-05, "loss": 0.3515, "step": 343000 }, { "epoch": 0.34, "learning_rate": 1.6840094457166672e-05, "loss": 0.3528, "step": 344000 }, { "epoch": 0.34, "learning_rate": 1.6828128375913567e-05, "loss": 0.3524, "step": 345000 }, { "epoch": 0.34, "eval_runtime": 3254.6061, "eval_samples_per_second": 104.36, "eval_steps_per_second": 13.045, "step": 345000 }, { "epoch": 0.35, "learning_rate": 1.681613546764157e-05, "loss": 0.3525, "step": 346000 }, { "epoch": 0.35, "learning_rate": 1.6804115863503203e-05, "loss": 0.3507, "step": 347000 }, { "epoch": 0.35, "learning_rate": 1.6792069694942933e-05, "loss": 0.3501, "step": 348000 }, { "epoch": 0.35, "learning_rate": 1.6779997093695726e-05, "loss": 0.3509, "step": 349000 }, { "epoch": 0.35, "learning_rate": 1.6767898191785617e-05, "loss": 0.3518, "step": 350000 }, { "epoch": 0.35, "eval_runtime": 3147.5706, "eval_samples_per_second": 107.909, "eval_steps_per_second": 13.489, "step": 350000 }, { "epoch": 0.35, "learning_rate": 1.675577312152426e-05, "loss": 0.3515, "step": 351000 }, { "epoch": 0.35, "learning_rate": 1.674362201550948e-05, "loss": 0.3515, "step": 352000 }, { "epoch": 0.35, "learning_rate": 1.673144500662382e-05, "loss": 0.3516, "step": 353000 }, { "epoch": 0.35, "learning_rate": 1.671924222803311e-05, "loss": 0.3496, "step": 354000 }, { "epoch": 0.35, "learning_rate": 1.6707013813184972e-05, "loss": 0.3533, "step": 355000 }, { "epoch": 0.35, "eval_runtime": 3316.1479, "eval_samples_per_second": 102.424, "eval_steps_per_second": 12.803, "step": 355000 }, { "epoch": 0.36, "learning_rate": 1.6694759895807394e-05, "loss": 0.3504, "step": 356000 }, { "epoch": 0.36, "learning_rate": 1.6682480609907256e-05, "loss": 0.3474, "step": 357000 }, { "epoch": 0.36, "learning_rate": 1.667017608976886e-05, "loss": 0.3462, "step": 358000 }, { "epoch": 0.36, "learning_rate": 1.665784646995246e-05, "loss": 0.3461, "step": 359000 }, { "epoch": 0.36, "learning_rate": 1.6645491885292816e-05, "loss": 0.3449, "step": 360000 }, { "epoch": 0.36, "eval_runtime": 3318.971, "eval_samples_per_second": 102.337, "eval_steps_per_second": 12.792, "step": 360000 }, { "epoch": 0.36, "learning_rate": 1.6633112470897675e-05, "loss": 0.3442, "step": 361000 }, { "epoch": 0.36, "learning_rate": 1.6620708362146338e-05, "loss": 0.3441, "step": 362000 }, { "epoch": 0.36, "learning_rate": 1.6608279694688143e-05, "loss": 0.3441, "step": 363000 }, { "epoch": 0.36, "learning_rate": 1.659582660444101e-05, "loss": 0.3419, "step": 364000 }, { "epoch": 0.36, "learning_rate": 1.658334922758994e-05, "loss": 0.3405, "step": 365000 }, { "epoch": 0.36, "eval_runtime": 3198.5996, "eval_samples_per_second": 106.188, "eval_steps_per_second": 13.274, "step": 365000 }, { "epoch": 0.37, "learning_rate": 1.6570847700585524e-05, "loss": 0.3406, "step": 366000 }, { "epoch": 0.37, "learning_rate": 1.6558322160142462e-05, "loss": 0.341, "step": 367000 }, { "epoch": 0.37, "learning_rate": 1.654577274323806e-05, "loss": 0.342, "step": 368000 }, { "epoch": 0.37, "learning_rate": 1.653319958711072e-05, "loss": 0.3479, "step": 369000 }, { "epoch": 0.37, "learning_rate": 1.6520602829258474e-05, "loss": 0.3487, "step": 370000 }, { "epoch": 0.37, "eval_runtime": 3164.3305, "eval_samples_per_second": 107.338, "eval_steps_per_second": 13.417, "step": 370000 }, { "epoch": 0.37, "learning_rate": 1.650798260743744e-05, "loss": 0.3488, "step": 371000 }, { "epoch": 0.37, "learning_rate": 1.6495339059660347e-05, "loss": 0.3483, "step": 372000 }, { "epoch": 0.37, "learning_rate": 1.6482672324195004e-05, "loss": 0.3477, "step": 373000 }, { "epoch": 0.37, "learning_rate": 1.6469982539562804e-05, "loss": 0.3446, "step": 374000 }, { "epoch": 0.38, "learning_rate": 1.6457269844537196e-05, "loss": 0.3442, "step": 375000 }, { "epoch": 0.38, "eval_runtime": 3159.753, "eval_samples_per_second": 107.493, "eval_steps_per_second": 13.437, "step": 375000 }, { "epoch": 0.38, "learning_rate": 1.644453437814218e-05, "loss": 0.3423, "step": 376000 }, { "epoch": 0.38, "learning_rate": 1.643177627965077e-05, "loss": 0.3422, "step": 377000 }, { "epoch": 0.38, "learning_rate": 1.641899568858349e-05, "loss": 0.3442, "step": 378000 }, { "epoch": 0.38, "learning_rate": 1.640619274470684e-05, "loss": 0.3418, "step": 379000 }, { "epoch": 0.38, "learning_rate": 1.639336758803176e-05, "loss": 0.3446, "step": 380000 }, { "epoch": 0.38, "eval_runtime": 3155.9637, "eval_samples_per_second": 107.622, "eval_steps_per_second": 13.453, "step": 380000 }, { "epoch": 0.38, "learning_rate": 1.6380520358812106e-05, "loss": 0.3466, "step": 381000 }, { "epoch": 0.38, "learning_rate": 1.636765119754312e-05, "loss": 0.345, "step": 382000 }, { "epoch": 0.38, "learning_rate": 1.635476024495989e-05, "loss": 0.3439, "step": 383000 }, { "epoch": 0.38, "learning_rate": 1.6341847642035807e-05, "loss": 0.3447, "step": 384000 }, { "epoch": 0.39, "learning_rate": 1.632891352998103e-05, "loss": 0.3453, "step": 385000 }, { "epoch": 0.39, "eval_runtime": 3143.6271, "eval_samples_per_second": 108.045, "eval_steps_per_second": 13.506, "step": 385000 }, { "epoch": 0.39, "learning_rate": 1.631595805024093e-05, "loss": 0.3451, "step": 386000 }, { "epoch": 0.39, "learning_rate": 1.6302981344494562e-05, "loss": 0.3411, "step": 387000 }, { "epoch": 0.39, "learning_rate": 1.62899835546531e-05, "loss": 0.3435, "step": 388000 }, { "epoch": 0.39, "learning_rate": 1.6276964822858297e-05, "loss": 0.3437, "step": 389000 }, { "epoch": 0.39, "learning_rate": 1.6263925291480904e-05, "loss": 0.3446, "step": 390000 }, { "epoch": 0.39, "eval_runtime": 3211.4597, "eval_samples_per_second": 105.762, "eval_steps_per_second": 13.22, "step": 390000 }, { "epoch": 0.39, "learning_rate": 1.625086510311916e-05, "loss": 0.3443, "step": 391000 }, { "epoch": 0.39, "learning_rate": 1.623778440059719e-05, "loss": 0.3439, "step": 392000 }, { "epoch": 0.39, "learning_rate": 1.622468332696346e-05, "loss": 0.3444, "step": 393000 }, { "epoch": 0.39, "learning_rate": 1.6211562025489212e-05, "loss": 0.3426, "step": 394000 }, { "epoch": 0.4, "learning_rate": 1.6198420639666893e-05, "loss": 0.3395, "step": 395000 }, { "epoch": 0.4, "eval_runtime": 3116.6643, "eval_samples_per_second": 108.979, "eval_steps_per_second": 13.623, "step": 395000 }, { "epoch": 0.4, "learning_rate": 1.61852593132086e-05, "loss": 0.3363, "step": 396000 }, { "epoch": 0.4, "learning_rate": 1.6172078190044487e-05, "loss": 0.3362, "step": 397000 }, { "epoch": 0.4, "learning_rate": 1.61588774143212e-05, "loss": 0.3414, "step": 398000 }, { "epoch": 0.4, "learning_rate": 1.614565713040032e-05, "loss": 0.3447, "step": 399000 }, { "epoch": 0.4, "learning_rate": 1.613241748285674e-05, "loss": 0.3438, "step": 400000 }, { "epoch": 0.4, "eval_runtime": 3111.8012, "eval_samples_per_second": 109.15, "eval_steps_per_second": 13.644, "step": 400000 } ], "max_steps": 1000000, "num_train_epochs": 9223372036854775807, "total_flos": 3.5418440692924416e+21, "trial_name": null, "trial_params": null }