{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 20, "global_step": 1125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-06, "loss": 1.0235, "step": 1 }, { "epoch": 0.01, "learning_rate": 1e-05, "loss": 1.0135, "step": 2 }, { "epoch": 0.01, "learning_rate": 1.5e-05, "loss": 1.0056, "step": 3 }, { "epoch": 0.02, "learning_rate": 2e-05, "loss": 1.073, "step": 4 }, { "epoch": 0.02, "learning_rate": 2.5e-05, "loss": 1.052, "step": 5 }, { "epoch": 0.03, "learning_rate": 3e-05, "loss": 0.9612, "step": 6 }, { "epoch": 0.03, "learning_rate": 3.5e-05, "loss": 1.0725, "step": 7 }, { "epoch": 0.04, "learning_rate": 4e-05, "loss": 1.045, "step": 8 }, { "epoch": 0.04, "learning_rate": 4.5e-05, "loss": 1.0201, "step": 9 }, { "epoch": 0.04, "learning_rate": 5e-05, "loss": 0.9867, "step": 10 }, { "epoch": 0.05, "learning_rate": 5.500000000000001e-05, "loss": 1.0, "step": 11 }, { "epoch": 0.05, "learning_rate": 6e-05, "loss": 1.0433, "step": 12 }, { "epoch": 0.06, "learning_rate": 6.500000000000001e-05, "loss": 0.9577, "step": 13 }, { "epoch": 0.06, "learning_rate": 7e-05, "loss": 0.9348, "step": 14 }, { "epoch": 0.07, "learning_rate": 7.500000000000001e-05, "loss": 1.0122, "step": 15 }, { "epoch": 0.07, "learning_rate": 8e-05, "loss": 1.0221, "step": 16 }, { "epoch": 0.08, "learning_rate": 8.5e-05, "loss": 0.911, "step": 17 }, { "epoch": 0.08, "learning_rate": 9e-05, "loss": 0.8725, "step": 18 }, { "epoch": 0.08, "learning_rate": 9.5e-05, "loss": 0.9493, "step": 19 }, { "epoch": 0.09, "learning_rate": 0.0001, "loss": 0.9235, "step": 20 }, { "epoch": 0.09, "eval_loss": 0.9229408502578735, "eval_runtime": 16.01, "eval_samples_per_second": 5.059, "eval_steps_per_second": 1.312, "step": 20 }, { "epoch": 0.09, "learning_rate": 9.999979792392145e-05, "loss": 0.8717, "step": 21 }, { "epoch": 0.1, "learning_rate": 9.999919169731915e-05, "loss": 0.929, "step": 22 }, { "epoch": 0.1, "learning_rate": 9.999818132509326e-05, "loss": 0.9449, "step": 23 }, { "epoch": 0.11, "learning_rate": 9.99967668154107e-05, "loss": 0.911, "step": 24 }, { "epoch": 0.11, "learning_rate": 9.999494817970498e-05, "loss": 0.9356, "step": 25 }, { "epoch": 0.12, "learning_rate": 9.999272543267621e-05, "loss": 0.9319, "step": 26 }, { "epoch": 0.12, "learning_rate": 9.999009859229096e-05, "loss": 0.8534, "step": 27 }, { "epoch": 0.12, "learning_rate": 9.998706767978208e-05, "loss": 0.8563, "step": 28 }, { "epoch": 0.13, "learning_rate": 9.998363271964859e-05, "loss": 0.9176, "step": 29 }, { "epoch": 0.13, "learning_rate": 9.997979373965541e-05, "loss": 0.8752, "step": 30 }, { "epoch": 0.14, "learning_rate": 9.997555077083319e-05, "loss": 0.9263, "step": 31 }, { "epoch": 0.14, "learning_rate": 9.9970903847478e-05, "loss": 0.8348, "step": 32 }, { "epoch": 0.15, "learning_rate": 9.996585300715116e-05, "loss": 0.9279, "step": 33 }, { "epoch": 0.15, "learning_rate": 9.99603982906788e-05, "loss": 0.8997, "step": 34 }, { "epoch": 0.16, "learning_rate": 9.995453974215163e-05, "loss": 0.9019, "step": 35 }, { "epoch": 0.16, "learning_rate": 9.994827740892458e-05, "loss": 0.865, "step": 36 }, { "epoch": 0.16, "learning_rate": 9.994161134161634e-05, "loss": 0.8226, "step": 37 }, { "epoch": 0.17, "learning_rate": 9.9934541594109e-05, "loss": 0.8094, "step": 38 }, { "epoch": 0.17, "learning_rate": 9.992706822354768e-05, "loss": 0.8706, "step": 39 }, { "epoch": 0.18, "learning_rate": 9.991919129033994e-05, "loss": 0.8544, "step": 40 }, { "epoch": 0.18, "eval_loss": 0.8436371684074402, "eval_runtime": 16.1045, "eval_samples_per_second": 5.03, "eval_steps_per_second": 1.304, "step": 40 }, { "epoch": 0.18, "learning_rate": 9.991091085815532e-05, "loss": 0.8041, "step": 41 }, { "epoch": 0.19, "learning_rate": 9.990222699392498e-05, "loss": 0.8563, "step": 42 }, { "epoch": 0.19, "learning_rate": 9.989313976784094e-05, "loss": 0.8284, "step": 43 }, { "epoch": 0.2, "learning_rate": 9.988364925335565e-05, "loss": 0.8608, "step": 44 }, { "epoch": 0.2, "learning_rate": 9.987375552718133e-05, "loss": 0.8893, "step": 45 }, { "epoch": 0.2, "learning_rate": 9.986345866928941e-05, "loss": 0.8383, "step": 46 }, { "epoch": 0.21, "learning_rate": 9.985275876290982e-05, "loss": 0.7732, "step": 47 }, { "epoch": 0.21, "learning_rate": 9.98416558945304e-05, "loss": 0.8397, "step": 48 }, { "epoch": 0.22, "learning_rate": 9.983015015389607e-05, "loss": 0.8497, "step": 49 }, { "epoch": 0.22, "learning_rate": 9.981824163400827e-05, "loss": 0.8805, "step": 50 }, { "epoch": 0.23, "learning_rate": 9.980593043112404e-05, "loss": 0.8751, "step": 51 }, { "epoch": 0.23, "learning_rate": 9.979321664475541e-05, "loss": 0.849, "step": 52 }, { "epoch": 0.24, "learning_rate": 9.978010037766841e-05, "loss": 0.8635, "step": 53 }, { "epoch": 0.24, "learning_rate": 9.976658173588244e-05, "loss": 0.7918, "step": 54 }, { "epoch": 0.24, "learning_rate": 9.975266082866923e-05, "loss": 0.8353, "step": 55 }, { "epoch": 0.25, "learning_rate": 9.97383377685521e-05, "loss": 0.8522, "step": 56 }, { "epoch": 0.25, "learning_rate": 9.972361267130495e-05, "loss": 0.8983, "step": 57 }, { "epoch": 0.26, "learning_rate": 9.970848565595137e-05, "loss": 0.8749, "step": 58 }, { "epoch": 0.26, "learning_rate": 9.96929568447637e-05, "loss": 0.8603, "step": 59 }, { "epoch": 0.27, "learning_rate": 9.967702636326194e-05, "loss": 0.8139, "step": 60 }, { "epoch": 0.27, "eval_loss": 0.8146741986274719, "eval_runtime": 16.1449, "eval_samples_per_second": 5.017, "eval_steps_per_second": 1.301, "step": 60 }, { "epoch": 0.27, "learning_rate": 9.966069434021293e-05, "loss": 0.7807, "step": 61 }, { "epoch": 0.28, "learning_rate": 9.964396090762908e-05, "loss": 0.7711, "step": 62 }, { "epoch": 0.28, "learning_rate": 9.962682620076744e-05, "loss": 0.8169, "step": 63 }, { "epoch": 0.28, "learning_rate": 9.96092903581286e-05, "loss": 0.8259, "step": 64 }, { "epoch": 0.29, "learning_rate": 9.959135352145552e-05, "loss": 0.8244, "step": 65 }, { "epoch": 0.29, "learning_rate": 9.957301583573244e-05, "loss": 0.7796, "step": 66 }, { "epoch": 0.3, "learning_rate": 9.955427744918367e-05, "loss": 0.8131, "step": 67 }, { "epoch": 0.3, "learning_rate": 9.953513851327236e-05, "loss": 0.7746, "step": 68 }, { "epoch": 0.31, "learning_rate": 9.951559918269939e-05, "loss": 0.7514, "step": 69 }, { "epoch": 0.31, "learning_rate": 9.9495659615402e-05, "loss": 0.7851, "step": 70 }, { "epoch": 0.32, "learning_rate": 9.947531997255256e-05, "loss": 0.8309, "step": 71 }, { "epoch": 0.32, "learning_rate": 9.94545804185573e-05, "loss": 0.7847, "step": 72 }, { "epoch": 0.32, "learning_rate": 9.943344112105492e-05, "loss": 0.844, "step": 73 }, { "epoch": 0.33, "learning_rate": 9.94119022509153e-05, "loss": 0.8984, "step": 74 }, { "epoch": 0.33, "learning_rate": 9.938996398223801e-05, "loss": 0.7758, "step": 75 }, { "epoch": 0.34, "learning_rate": 9.936762649235105e-05, "loss": 0.827, "step": 76 }, { "epoch": 0.34, "learning_rate": 9.934488996180931e-05, "loss": 0.8177, "step": 77 }, { "epoch": 0.35, "learning_rate": 9.932175457439317e-05, "loss": 0.8236, "step": 78 }, { "epoch": 0.35, "learning_rate": 9.92982205171069e-05, "loss": 0.7999, "step": 79 }, { "epoch": 0.36, "learning_rate": 9.927428798017738e-05, "loss": 0.8399, "step": 80 }, { "epoch": 0.36, "eval_loss": 0.7993953227996826, "eval_runtime": 16.1588, "eval_samples_per_second": 5.013, "eval_steps_per_second": 1.3, "step": 80 }, { "epoch": 0.36, "learning_rate": 9.924995715705229e-05, "loss": 0.7619, "step": 81 }, { "epoch": 0.36, "learning_rate": 9.922522824439872e-05, "loss": 0.8059, "step": 82 }, { "epoch": 0.37, "learning_rate": 9.920010144210157e-05, "loss": 0.797, "step": 83 }, { "epoch": 0.37, "learning_rate": 9.917457695326184e-05, "loss": 0.804, "step": 84 }, { "epoch": 0.38, "learning_rate": 9.91486549841951e-05, "loss": 0.8354, "step": 85 }, { "epoch": 0.38, "learning_rate": 9.912233574442971e-05, "loss": 0.831, "step": 86 }, { "epoch": 0.39, "learning_rate": 9.909561944670525e-05, "loss": 0.8985, "step": 87 }, { "epoch": 0.39, "learning_rate": 9.906850630697068e-05, "loss": 0.7296, "step": 88 }, { "epoch": 0.4, "learning_rate": 9.90409965443827e-05, "loss": 0.8307, "step": 89 }, { "epoch": 0.4, "learning_rate": 9.90130903813039e-05, "loss": 0.8038, "step": 90 }, { "epoch": 0.4, "learning_rate": 9.8984788043301e-05, "loss": 0.7976, "step": 91 }, { "epoch": 0.41, "learning_rate": 9.895608975914304e-05, "loss": 0.7875, "step": 92 }, { "epoch": 0.41, "learning_rate": 9.892699576079944e-05, "loss": 0.8176, "step": 93 }, { "epoch": 0.42, "learning_rate": 9.88975062834383e-05, "loss": 0.7454, "step": 94 }, { "epoch": 0.42, "learning_rate": 9.886762156542428e-05, "loss": 0.8036, "step": 95 }, { "epoch": 0.43, "learning_rate": 9.88373418483169e-05, "loss": 0.7755, "step": 96 }, { "epoch": 0.43, "learning_rate": 9.880666737686838e-05, "loss": 0.8433, "step": 97 }, { "epoch": 0.44, "learning_rate": 9.877559839902184e-05, "loss": 0.846, "step": 98 }, { "epoch": 0.44, "learning_rate": 9.874413516590912e-05, "loss": 0.7174, "step": 99 }, { "epoch": 0.44, "learning_rate": 9.871227793184892e-05, "loss": 0.8729, "step": 100 }, { "epoch": 0.44, "eval_loss": 0.792314887046814, "eval_runtime": 16.1661, "eval_samples_per_second": 5.01, "eval_steps_per_second": 1.299, "step": 100 }, { "epoch": 0.45, "learning_rate": 9.868002695434462e-05, "loss": 0.7872, "step": 101 }, { "epoch": 0.45, "learning_rate": 9.864738249408227e-05, "loss": 0.7842, "step": 102 }, { "epoch": 0.46, "learning_rate": 9.861434481492846e-05, "loss": 0.8254, "step": 103 }, { "epoch": 0.46, "learning_rate": 9.858091418392815e-05, "loss": 0.795, "step": 104 }, { "epoch": 0.47, "learning_rate": 9.85470908713026e-05, "loss": 0.7682, "step": 105 }, { "epoch": 0.47, "learning_rate": 9.85128751504471e-05, "loss": 0.8486, "step": 106 }, { "epoch": 0.48, "learning_rate": 9.84782672979288e-05, "loss": 0.7597, "step": 107 }, { "epoch": 0.48, "learning_rate": 9.844326759348443e-05, "loss": 0.8089, "step": 108 }, { "epoch": 0.48, "learning_rate": 9.840787632001817e-05, "loss": 0.8206, "step": 109 }, { "epoch": 0.49, "learning_rate": 9.837209376359917e-05, "loss": 0.8977, "step": 110 }, { "epoch": 0.49, "learning_rate": 9.833592021345937e-05, "loss": 0.803, "step": 111 }, { "epoch": 0.5, "learning_rate": 9.829935596199118e-05, "loss": 0.7868, "step": 112 }, { "epoch": 0.5, "learning_rate": 9.826240130474498e-05, "loss": 0.8622, "step": 113 }, { "epoch": 0.51, "learning_rate": 9.822505654042686e-05, "loss": 0.8019, "step": 114 }, { "epoch": 0.51, "learning_rate": 9.818732197089619e-05, "loss": 0.8117, "step": 115 }, { "epoch": 0.52, "learning_rate": 9.814919790116312e-05, "loss": 0.826, "step": 116 }, { "epoch": 0.52, "learning_rate": 9.811068463938613e-05, "loss": 0.7429, "step": 117 }, { "epoch": 0.52, "learning_rate": 9.807178249686958e-05, "loss": 0.8234, "step": 118 }, { "epoch": 0.53, "learning_rate": 9.803249178806117e-05, "loss": 0.6938, "step": 119 }, { "epoch": 0.53, "learning_rate": 9.79928128305494e-05, "loss": 0.7565, "step": 120 }, { "epoch": 0.53, "eval_loss": 0.7824007272720337, "eval_runtime": 16.1648, "eval_samples_per_second": 5.011, "eval_steps_per_second": 1.299, "step": 120 }, { "epoch": 0.54, "learning_rate": 9.7952745945061e-05, "loss": 0.7787, "step": 121 }, { "epoch": 0.54, "learning_rate": 9.791229145545831e-05, "loss": 0.7321, "step": 122 }, { "epoch": 0.55, "learning_rate": 9.787144968873674e-05, "loss": 0.7939, "step": 123 }, { "epoch": 0.55, "learning_rate": 9.783022097502204e-05, "loss": 0.7418, "step": 124 }, { "epoch": 0.56, "learning_rate": 9.77886056475677e-05, "loss": 0.717, "step": 125 }, { "epoch": 0.56, "learning_rate": 9.774660404275216e-05, "loss": 0.7836, "step": 126 }, { "epoch": 0.56, "learning_rate": 9.770421650007626e-05, "loss": 0.8028, "step": 127 }, { "epoch": 0.57, "learning_rate": 9.76614433621603e-05, "loss": 0.7333, "step": 128 }, { "epoch": 0.57, "learning_rate": 9.761828497474141e-05, "loss": 0.7807, "step": 129 }, { "epoch": 0.58, "learning_rate": 9.757474168667071e-05, "loss": 0.7477, "step": 130 }, { "epoch": 0.58, "learning_rate": 9.753081384991045e-05, "loss": 0.7751, "step": 131 }, { "epoch": 0.59, "learning_rate": 9.748650181953126e-05, "loss": 0.7487, "step": 132 }, { "epoch": 0.59, "learning_rate": 9.744180595370917e-05, "loss": 0.7202, "step": 133 }, { "epoch": 0.6, "learning_rate": 9.739672661372279e-05, "loss": 0.7786, "step": 134 }, { "epoch": 0.6, "learning_rate": 9.73512641639504e-05, "loss": 0.7736, "step": 135 }, { "epoch": 0.6, "learning_rate": 9.730541897186689e-05, "loss": 0.7865, "step": 136 }, { "epoch": 0.61, "learning_rate": 9.725919140804099e-05, "loss": 0.8128, "step": 137 }, { "epoch": 0.61, "learning_rate": 9.721258184613203e-05, "loss": 0.8746, "step": 138 }, { "epoch": 0.62, "learning_rate": 9.716559066288715e-05, "loss": 0.742, "step": 139 }, { "epoch": 0.62, "learning_rate": 9.711821823813812e-05, "loss": 0.7634, "step": 140 }, { "epoch": 0.62, "eval_loss": 0.7769067883491516, "eval_runtime": 16.1735, "eval_samples_per_second": 5.008, "eval_steps_per_second": 1.298, "step": 140 }, { "epoch": 0.63, "learning_rate": 9.707046495479826e-05, "loss": 0.8214, "step": 141 }, { "epoch": 0.63, "learning_rate": 9.702233119885943e-05, "loss": 0.7602, "step": 142 }, { "epoch": 0.64, "learning_rate": 9.697381735938887e-05, "loss": 0.8803, "step": 143 }, { "epoch": 0.64, "learning_rate": 9.6924923828526e-05, "loss": 0.8203, "step": 144 }, { "epoch": 0.64, "learning_rate": 9.687565100147939e-05, "loss": 0.7716, "step": 145 }, { "epoch": 0.65, "learning_rate": 9.682599927652341e-05, "loss": 0.7369, "step": 146 }, { "epoch": 0.65, "learning_rate": 9.677596905499507e-05, "loss": 0.8006, "step": 147 }, { "epoch": 0.66, "learning_rate": 9.672556074129085e-05, "loss": 0.7668, "step": 148 }, { "epoch": 0.66, "learning_rate": 9.667477474286328e-05, "loss": 0.7686, "step": 149 }, { "epoch": 0.67, "learning_rate": 9.662361147021779e-05, "loss": 0.7507, "step": 150 }, { "epoch": 0.67, "learning_rate": 9.657207133690936e-05, "loss": 0.7517, "step": 151 }, { "epoch": 0.68, "learning_rate": 9.652015475953904e-05, "loss": 0.818, "step": 152 }, { "epoch": 0.68, "learning_rate": 9.646786215775083e-05, "loss": 0.8582, "step": 153 }, { "epoch": 0.68, "learning_rate": 9.641519395422806e-05, "loss": 0.7712, "step": 154 }, { "epoch": 0.69, "learning_rate": 9.636215057469008e-05, "loss": 0.7736, "step": 155 }, { "epoch": 0.69, "learning_rate": 9.630873244788883e-05, "loss": 0.7466, "step": 156 }, { "epoch": 0.7, "learning_rate": 9.625494000560533e-05, "loss": 0.7985, "step": 157 }, { "epoch": 0.7, "learning_rate": 9.620077368264622e-05, "loss": 0.8343, "step": 158 }, { "epoch": 0.71, "learning_rate": 9.614623391684021e-05, "loss": 0.7508, "step": 159 }, { "epoch": 0.71, "learning_rate": 9.609132114903458e-05, "loss": 0.779, "step": 160 }, { "epoch": 0.71, "eval_loss": 0.7734922766685486, "eval_runtime": 16.2259, "eval_samples_per_second": 4.992, "eval_steps_per_second": 1.294, "step": 160 }, { "epoch": 0.72, "learning_rate": 9.603603582309162e-05, "loss": 0.8098, "step": 161 }, { "epoch": 0.72, "learning_rate": 9.598037838588499e-05, "loss": 0.8056, "step": 162 }, { "epoch": 0.72, "learning_rate": 9.592434928729616e-05, "loss": 0.7972, "step": 163 }, { "epoch": 0.73, "learning_rate": 9.586794898021075e-05, "loss": 0.7694, "step": 164 }, { "epoch": 0.73, "learning_rate": 9.581117792051486e-05, "loss": 0.7313, "step": 165 }, { "epoch": 0.74, "learning_rate": 9.575403656709146e-05, "loss": 0.76, "step": 166 }, { "epoch": 0.74, "learning_rate": 9.569652538181653e-05, "loss": 0.7332, "step": 167 }, { "epoch": 0.75, "learning_rate": 9.563864482955547e-05, "loss": 0.7197, "step": 168 }, { "epoch": 0.75, "learning_rate": 9.558039537815929e-05, "loss": 0.7504, "step": 169 }, { "epoch": 0.76, "learning_rate": 9.552177749846083e-05, "loss": 0.7278, "step": 170 }, { "epoch": 0.76, "learning_rate": 9.546279166427092e-05, "loss": 0.7565, "step": 171 }, { "epoch": 0.76, "learning_rate": 9.54034383523746e-05, "loss": 0.7512, "step": 172 }, { "epoch": 0.77, "learning_rate": 9.534371804252728e-05, "loss": 0.7861, "step": 173 }, { "epoch": 0.77, "learning_rate": 9.528363121745076e-05, "loss": 0.7935, "step": 174 }, { "epoch": 0.78, "learning_rate": 9.522317836282948e-05, "loss": 0.7924, "step": 175 }, { "epoch": 0.78, "learning_rate": 9.516235996730645e-05, "loss": 0.7461, "step": 176 }, { "epoch": 0.79, "learning_rate": 9.510117652247938e-05, "loss": 0.7863, "step": 177 }, { "epoch": 0.79, "learning_rate": 9.503962852289672e-05, "loss": 0.8284, "step": 178 }, { "epoch": 0.8, "learning_rate": 9.497771646605355e-05, "loss": 0.7805, "step": 179 }, { "epoch": 0.8, "learning_rate": 9.491544085238777e-05, "loss": 0.7776, "step": 180 }, { "epoch": 0.8, "eval_loss": 0.7671141028404236, "eval_runtime": 16.2323, "eval_samples_per_second": 4.99, "eval_steps_per_second": 1.294, "step": 180 }, { "epoch": 0.8, "learning_rate": 9.485280218527581e-05, "loss": 0.8404, "step": 181 }, { "epoch": 0.81, "learning_rate": 9.478980097102872e-05, "loss": 0.7056, "step": 182 }, { "epoch": 0.81, "learning_rate": 9.472643771888803e-05, "loss": 0.8308, "step": 183 }, { "epoch": 0.82, "learning_rate": 9.466271294102167e-05, "loss": 0.7528, "step": 184 }, { "epoch": 0.82, "learning_rate": 9.459862715251973e-05, "loss": 0.8127, "step": 185 }, { "epoch": 0.83, "learning_rate": 9.453418087139043e-05, "loss": 0.7245, "step": 186 }, { "epoch": 0.83, "learning_rate": 9.446937461855583e-05, "loss": 0.7629, "step": 187 }, { "epoch": 0.84, "learning_rate": 9.440420891784766e-05, "loss": 0.7152, "step": 188 }, { "epoch": 0.84, "learning_rate": 9.43386842960031e-05, "loss": 0.7143, "step": 189 }, { "epoch": 0.84, "learning_rate": 9.42728012826605e-05, "loss": 0.7566, "step": 190 }, { "epoch": 0.85, "learning_rate": 9.42065604103551e-05, "loss": 0.7708, "step": 191 }, { "epoch": 0.85, "learning_rate": 9.41399622145147e-05, "loss": 0.7418, "step": 192 }, { "epoch": 0.86, "learning_rate": 9.407300723345543e-05, "loss": 0.7811, "step": 193 }, { "epoch": 0.86, "learning_rate": 9.400569600837728e-05, "loss": 0.7059, "step": 194 }, { "epoch": 0.87, "learning_rate": 9.393802908335977e-05, "loss": 0.751, "step": 195 }, { "epoch": 0.87, "learning_rate": 9.387000700535758e-05, "loss": 0.7053, "step": 196 }, { "epoch": 0.88, "learning_rate": 9.380163032419611e-05, "loss": 0.7926, "step": 197 }, { "epoch": 0.88, "learning_rate": 9.373289959256701e-05, "loss": 0.8703, "step": 198 }, { "epoch": 0.88, "learning_rate": 9.366381536602378e-05, "loss": 0.7272, "step": 199 }, { "epoch": 0.89, "learning_rate": 9.359437820297717e-05, "loss": 0.7694, "step": 200 }, { "epoch": 0.89, "eval_loss": 0.7667359709739685, "eval_runtime": 16.1834, "eval_samples_per_second": 5.005, "eval_steps_per_second": 1.298, "step": 200 }, { "epoch": 0.89, "learning_rate": 9.352458866469076e-05, "loss": 0.8422, "step": 201 }, { "epoch": 0.9, "learning_rate": 9.345444731527642e-05, "loss": 0.8142, "step": 202 }, { "epoch": 0.9, "learning_rate": 9.338395472168971e-05, "loss": 0.7815, "step": 203 }, { "epoch": 0.91, "learning_rate": 9.331311145372528e-05, "loss": 0.8582, "step": 204 }, { "epoch": 0.91, "learning_rate": 9.324191808401235e-05, "loss": 0.7516, "step": 205 }, { "epoch": 0.92, "learning_rate": 9.317037518800998e-05, "loss": 0.7781, "step": 206 }, { "epoch": 0.92, "learning_rate": 9.309848334400246e-05, "loss": 0.7884, "step": 207 }, { "epoch": 0.92, "learning_rate": 9.302624313309471e-05, "loss": 0.7071, "step": 208 }, { "epoch": 0.93, "learning_rate": 9.295365513920748e-05, "loss": 0.6806, "step": 209 }, { "epoch": 0.93, "learning_rate": 9.288071994907261e-05, "loss": 0.7055, "step": 210 }, { "epoch": 0.94, "learning_rate": 9.280743815222841e-05, "loss": 0.7375, "step": 211 }, { "epoch": 0.94, "learning_rate": 9.273381034101482e-05, "loss": 0.73, "step": 212 }, { "epoch": 0.95, "learning_rate": 9.26598371105686e-05, "loss": 0.7278, "step": 213 }, { "epoch": 0.95, "learning_rate": 9.258551905881856e-05, "loss": 0.7403, "step": 214 }, { "epoch": 0.96, "learning_rate": 9.251085678648072e-05, "loss": 0.7537, "step": 215 }, { "epoch": 0.96, "learning_rate": 9.243585089705344e-05, "loss": 0.803, "step": 216 }, { "epoch": 0.96, "learning_rate": 9.236050199681258e-05, "loss": 0.7842, "step": 217 }, { "epoch": 0.97, "learning_rate": 9.228481069480655e-05, "loss": 0.6973, "step": 218 }, { "epoch": 0.97, "learning_rate": 9.220877760285141e-05, "loss": 0.7908, "step": 219 }, { "epoch": 0.98, "learning_rate": 9.213240333552589e-05, "loss": 0.7201, "step": 220 }, { "epoch": 0.98, "eval_loss": 0.761328935623169, "eval_runtime": 16.1777, "eval_samples_per_second": 5.007, "eval_steps_per_second": 1.298, "step": 220 }, { "epoch": 0.98, "learning_rate": 9.205568851016652e-05, "loss": 0.7408, "step": 221 }, { "epoch": 0.99, "learning_rate": 9.197863374686256e-05, "loss": 0.8622, "step": 222 }, { "epoch": 0.99, "learning_rate": 9.190123966845092e-05, "loss": 0.8189, "step": 223 }, { "epoch": 1.0, "learning_rate": 9.182350690051133e-05, "loss": 0.7666, "step": 224 }, { "epoch": 1.0, "learning_rate": 9.174543607136112e-05, "loss": 0.8323, "step": 225 }, { "epoch": 1.0, "learning_rate": 9.166702781205012e-05, "loss": 0.8211, "step": 226 }, { "epoch": 1.01, "learning_rate": 9.158828275635569e-05, "loss": 0.7923, "step": 227 }, { "epoch": 1.01, "learning_rate": 9.150920154077754e-05, "loss": 0.7626, "step": 228 }, { "epoch": 1.02, "learning_rate": 9.142978480453251e-05, "loss": 0.7593, "step": 229 }, { "epoch": 1.02, "learning_rate": 9.135003318954954e-05, "loss": 0.7953, "step": 230 }, { "epoch": 1.03, "learning_rate": 9.126994734046432e-05, "loss": 0.7352, "step": 231 }, { "epoch": 1.03, "learning_rate": 9.11895279046143e-05, "loss": 0.7583, "step": 232 }, { "epoch": 1.04, "learning_rate": 9.110877553203319e-05, "loss": 0.7573, "step": 233 }, { "epoch": 1.04, "learning_rate": 9.102769087544592e-05, "loss": 0.7988, "step": 234 }, { "epoch": 1.04, "learning_rate": 9.094627459026325e-05, "loss": 0.7627, "step": 235 }, { "epoch": 1.05, "learning_rate": 9.086452733457656e-05, "loss": 0.8511, "step": 236 }, { "epoch": 1.05, "learning_rate": 9.078244976915244e-05, "loss": 0.7316, "step": 237 }, { "epoch": 1.06, "learning_rate": 9.070004255742737e-05, "loss": 0.7108, "step": 238 }, { "epoch": 1.06, "learning_rate": 9.06173063655024e-05, "loss": 0.797, "step": 239 }, { "epoch": 1.07, "learning_rate": 9.053424186213775e-05, "loss": 0.7498, "step": 240 }, { "epoch": 1.07, "eval_loss": 0.758377194404602, "eval_runtime": 16.1613, "eval_samples_per_second": 5.012, "eval_steps_per_second": 1.299, "step": 240 }, { "epoch": 1.07, "learning_rate": 9.045084971874738e-05, "loss": 0.6338, "step": 241 }, { "epoch": 1.08, "learning_rate": 9.036713060939358e-05, "loss": 0.8083, "step": 242 }, { "epoch": 1.08, "learning_rate": 9.028308521078152e-05, "loss": 0.7359, "step": 243 }, { "epoch": 1.08, "learning_rate": 9.019871420225381e-05, "loss": 0.6618, "step": 244 }, { "epoch": 1.09, "learning_rate": 9.011401826578492e-05, "loss": 0.6512, "step": 245 }, { "epoch": 1.09, "learning_rate": 9.002899808597576e-05, "loss": 0.6483, "step": 246 }, { "epoch": 1.1, "learning_rate": 8.994365435004815e-05, "loss": 0.727, "step": 247 }, { "epoch": 1.1, "learning_rate": 8.985798774783913e-05, "loss": 0.7561, "step": 248 }, { "epoch": 1.11, "learning_rate": 8.977199897179558e-05, "loss": 0.7811, "step": 249 }, { "epoch": 1.11, "learning_rate": 8.968568871696847e-05, "loss": 0.6946, "step": 250 }, { "epoch": 1.12, "learning_rate": 8.959905768100733e-05, "loss": 0.713, "step": 251 }, { "epoch": 1.12, "learning_rate": 8.951210656415456e-05, "loss": 0.6828, "step": 252 }, { "epoch": 1.12, "learning_rate": 8.942483606923979e-05, "loss": 0.6932, "step": 253 }, { "epoch": 1.13, "learning_rate": 8.933724690167417e-05, "loss": 0.7629, "step": 254 }, { "epoch": 1.13, "learning_rate": 8.924933976944474e-05, "loss": 0.6765, "step": 255 }, { "epoch": 1.14, "learning_rate": 8.916111538310863e-05, "loss": 0.7267, "step": 256 }, { "epoch": 1.14, "learning_rate": 8.907257445578739e-05, "loss": 0.729, "step": 257 }, { "epoch": 1.15, "learning_rate": 8.898371770316111e-05, "loss": 0.6951, "step": 258 }, { "epoch": 1.15, "learning_rate": 8.889454584346282e-05, "loss": 0.8412, "step": 259 }, { "epoch": 1.16, "learning_rate": 8.880505959747244e-05, "loss": 0.7852, "step": 260 }, { "epoch": 1.16, "eval_loss": 0.7558982372283936, "eval_runtime": 16.1507, "eval_samples_per_second": 5.015, "eval_steps_per_second": 1.3, "step": 260 }, { "epoch": 1.16, "learning_rate": 8.87152596885112e-05, "loss": 0.6959, "step": 261 }, { "epoch": 1.16, "learning_rate": 8.862514684243562e-05, "loss": 0.7405, "step": 262 }, { "epoch": 1.17, "learning_rate": 8.853472178763171e-05, "loss": 0.6729, "step": 263 }, { "epoch": 1.17, "learning_rate": 8.844398525500914e-05, "loss": 0.752, "step": 264 }, { "epoch": 1.18, "learning_rate": 8.835293797799517e-05, "loss": 0.7236, "step": 265 }, { "epoch": 1.18, "learning_rate": 8.826158069252888e-05, "loss": 0.6427, "step": 266 }, { "epoch": 1.19, "learning_rate": 8.816991413705516e-05, "loss": 0.7169, "step": 267 }, { "epoch": 1.19, "learning_rate": 8.80779390525187e-05, "loss": 0.7441, "step": 268 }, { "epoch": 1.2, "learning_rate": 8.798565618235813e-05, "loss": 0.7225, "step": 269 }, { "epoch": 1.2, "learning_rate": 8.789306627249985e-05, "loss": 0.7396, "step": 270 }, { "epoch": 1.2, "learning_rate": 8.780017007135207e-05, "loss": 0.7642, "step": 271 }, { "epoch": 1.21, "learning_rate": 8.770696832979881e-05, "loss": 0.7186, "step": 272 }, { "epoch": 1.21, "learning_rate": 8.761346180119376e-05, "loss": 0.6484, "step": 273 }, { "epoch": 1.22, "learning_rate": 8.751965124135425e-05, "loss": 0.731, "step": 274 }, { "epoch": 1.22, "learning_rate": 8.742553740855506e-05, "loss": 0.7666, "step": 275 }, { "epoch": 1.23, "learning_rate": 8.733112106352237e-05, "loss": 0.8041, "step": 276 }, { "epoch": 1.23, "learning_rate": 8.723640296942755e-05, "loss": 0.7403, "step": 277 }, { "epoch": 1.24, "learning_rate": 8.714138389188106e-05, "loss": 0.689, "step": 278 }, { "epoch": 1.24, "learning_rate": 8.704606459892623e-05, "loss": 0.7662, "step": 279 }, { "epoch": 1.24, "learning_rate": 8.695044586103296e-05, "loss": 0.8143, "step": 280 }, { "epoch": 1.24, "eval_loss": 0.7563924193382263, "eval_runtime": 16.1537, "eval_samples_per_second": 5.014, "eval_steps_per_second": 1.3, "step": 280 }, { "epoch": 1.25, "learning_rate": 8.685452845109168e-05, "loss": 0.7726, "step": 281 }, { "epoch": 1.25, "learning_rate": 8.675831314440693e-05, "loss": 0.7412, "step": 282 }, { "epoch": 1.26, "learning_rate": 8.666180071869117e-05, "loss": 0.7479, "step": 283 }, { "epoch": 1.26, "learning_rate": 8.656499195405852e-05, "loss": 0.8669, "step": 284 }, { "epoch": 1.27, "learning_rate": 8.646788763301841e-05, "loss": 0.8918, "step": 285 }, { "epoch": 1.27, "learning_rate": 8.637048854046926e-05, "loss": 0.8616, "step": 286 }, { "epoch": 1.28, "learning_rate": 8.62727954636921e-05, "loss": 0.7365, "step": 287 }, { "epoch": 1.28, "learning_rate": 8.617480919234433e-05, "loss": 0.672, "step": 288 }, { "epoch": 1.28, "learning_rate": 8.607653051845316e-05, "loss": 0.7442, "step": 289 }, { "epoch": 1.29, "learning_rate": 8.597796023640941e-05, "loss": 0.7626, "step": 290 }, { "epoch": 1.29, "learning_rate": 8.587909914296089e-05, "loss": 0.731, "step": 291 }, { "epoch": 1.3, "learning_rate": 8.577994803720606e-05, "loss": 0.7305, "step": 292 }, { "epoch": 1.3, "learning_rate": 8.568050772058762e-05, "loss": 0.7216, "step": 293 }, { "epoch": 1.31, "learning_rate": 8.558077899688592e-05, "loss": 0.7631, "step": 294 }, { "epoch": 1.31, "learning_rate": 8.548076267221256e-05, "loss": 0.6488, "step": 295 }, { "epoch": 1.32, "learning_rate": 8.538045955500379e-05, "loss": 0.7594, "step": 296 }, { "epoch": 1.32, "learning_rate": 8.527987045601404e-05, "loss": 0.6487, "step": 297 }, { "epoch": 1.32, "learning_rate": 8.517899618830931e-05, "loss": 0.7834, "step": 298 }, { "epoch": 1.33, "learning_rate": 8.50778375672607e-05, "loss": 0.734, "step": 299 }, { "epoch": 1.33, "learning_rate": 8.49763954105377e-05, "loss": 0.8413, "step": 300 }, { "epoch": 1.33, "eval_loss": 0.7527250647544861, "eval_runtime": 16.1668, "eval_samples_per_second": 5.01, "eval_steps_per_second": 1.299, "step": 300 }, { "epoch": 1.34, "learning_rate": 8.487467053810161e-05, "loss": 0.765, "step": 301 }, { "epoch": 1.34, "learning_rate": 8.477266377219898e-05, "loss": 0.7672, "step": 302 }, { "epoch": 1.35, "learning_rate": 8.46703759373549e-05, "loss": 0.7322, "step": 303 }, { "epoch": 1.35, "learning_rate": 8.456780786036635e-05, "loss": 0.771, "step": 304 }, { "epoch": 1.36, "learning_rate": 8.446496037029555e-05, "loss": 0.7365, "step": 305 }, { "epoch": 1.36, "learning_rate": 8.436183429846313e-05, "loss": 0.73, "step": 306 }, { "epoch": 1.36, "learning_rate": 8.425843047844165e-05, "loss": 0.7694, "step": 307 }, { "epoch": 1.37, "learning_rate": 8.415474974604862e-05, "loss": 0.6973, "step": 308 }, { "epoch": 1.37, "learning_rate": 8.405079293933986e-05, "loss": 0.6781, "step": 309 }, { "epoch": 1.38, "learning_rate": 8.394656089860274e-05, "loss": 0.7672, "step": 310 }, { "epoch": 1.38, "learning_rate": 8.384205446634935e-05, "loss": 0.7333, "step": 311 }, { "epoch": 1.39, "learning_rate": 8.373727448730966e-05, "loss": 0.6644, "step": 312 }, { "epoch": 1.39, "learning_rate": 8.363222180842478e-05, "loss": 0.7538, "step": 313 }, { "epoch": 1.4, "learning_rate": 8.352689727884005e-05, "loss": 0.6857, "step": 314 }, { "epoch": 1.4, "learning_rate": 8.342130174989818e-05, "loss": 0.7679, "step": 315 }, { "epoch": 1.4, "learning_rate": 8.33154360751324e-05, "loss": 0.6924, "step": 316 }, { "epoch": 1.41, "learning_rate": 8.320930111025951e-05, "loss": 0.7447, "step": 317 }, { "epoch": 1.41, "learning_rate": 8.3102897713173e-05, "loss": 0.7555, "step": 318 }, { "epoch": 1.42, "learning_rate": 8.299622674393614e-05, "loss": 0.6906, "step": 319 }, { "epoch": 1.42, "learning_rate": 8.288928906477496e-05, "loss": 0.7673, "step": 320 }, { "epoch": 1.42, "eval_loss": 0.7525838017463684, "eval_runtime": 16.2133, "eval_samples_per_second": 4.996, "eval_steps_per_second": 1.295, "step": 320 }, { "epoch": 1.43, "learning_rate": 8.278208554007136e-05, "loss": 0.6646, "step": 321 }, { "epoch": 1.43, "learning_rate": 8.267461703635604e-05, "loss": 0.7592, "step": 322 }, { "epoch": 1.44, "learning_rate": 8.256688442230154e-05, "loss": 0.6533, "step": 323 }, { "epoch": 1.44, "learning_rate": 8.245888856871525e-05, "loss": 0.7016, "step": 324 }, { "epoch": 1.44, "learning_rate": 8.235063034853228e-05, "loss": 0.694, "step": 325 }, { "epoch": 1.45, "learning_rate": 8.224211063680853e-05, "loss": 0.7522, "step": 326 }, { "epoch": 1.45, "learning_rate": 8.213333031071351e-05, "loss": 0.7599, "step": 327 }, { "epoch": 1.46, "learning_rate": 8.202429024952327e-05, "loss": 0.7784, "step": 328 }, { "epoch": 1.46, "learning_rate": 8.191499133461332e-05, "loss": 0.7284, "step": 329 }, { "epoch": 1.47, "learning_rate": 8.180543444945153e-05, "loss": 0.7748, "step": 330 }, { "epoch": 1.47, "learning_rate": 8.169562047959093e-05, "loss": 0.7928, "step": 331 }, { "epoch": 1.48, "learning_rate": 8.158555031266254e-05, "loss": 0.7354, "step": 332 }, { "epoch": 1.48, "learning_rate": 8.147522483836833e-05, "loss": 0.7134, "step": 333 }, { "epoch": 1.48, "learning_rate": 8.13646449484738e-05, "loss": 0.6899, "step": 334 }, { "epoch": 1.49, "learning_rate": 8.125381153680103e-05, "loss": 0.7251, "step": 335 }, { "epoch": 1.49, "learning_rate": 8.114272549922122e-05, "loss": 0.75, "step": 336 }, { "epoch": 1.5, "learning_rate": 8.103138773364763e-05, "loss": 0.7581, "step": 337 }, { "epoch": 1.5, "learning_rate": 8.091979914002823e-05, "loss": 0.7181, "step": 338 }, { "epoch": 1.51, "learning_rate": 8.08079606203384e-05, "loss": 0.7017, "step": 339 }, { "epoch": 1.51, "learning_rate": 8.069587307857375e-05, "loss": 0.7212, "step": 340 }, { "epoch": 1.51, "eval_loss": 0.7467713356018066, "eval_runtime": 16.2203, "eval_samples_per_second": 4.994, "eval_steps_per_second": 1.295, "step": 340 }, { "epoch": 1.52, "learning_rate": 8.058353742074274e-05, "loss": 0.7242, "step": 341 }, { "epoch": 1.52, "learning_rate": 8.047095455485926e-05, "loss": 0.641, "step": 342 }, { "epoch": 1.52, "learning_rate": 8.035812539093557e-05, "loss": 0.6913, "step": 343 }, { "epoch": 1.53, "learning_rate": 8.024505084097461e-05, "loss": 0.6621, "step": 344 }, { "epoch": 1.53, "learning_rate": 8.013173181896283e-05, "loss": 0.6989, "step": 345 }, { "epoch": 1.54, "learning_rate": 8.001816924086281e-05, "loss": 0.7018, "step": 346 }, { "epoch": 1.54, "learning_rate": 7.990436402460575e-05, "loss": 0.6998, "step": 347 }, { "epoch": 1.55, "learning_rate": 7.979031709008415e-05, "loss": 0.6177, "step": 348 }, { "epoch": 1.55, "learning_rate": 7.967602935914427e-05, "loss": 0.7046, "step": 349 }, { "epoch": 1.56, "learning_rate": 7.95615017555788e-05, "loss": 0.7031, "step": 350 }, { "epoch": 1.56, "learning_rate": 7.944673520511925e-05, "loss": 0.8082, "step": 351 }, { "epoch": 1.56, "learning_rate": 7.933173063542865e-05, "loss": 0.6821, "step": 352 }, { "epoch": 1.57, "learning_rate": 7.921648897609388e-05, "loss": 0.7141, "step": 353 }, { "epoch": 1.57, "learning_rate": 7.910101115861826e-05, "loss": 0.7485, "step": 354 }, { "epoch": 1.58, "learning_rate": 7.898529811641394e-05, "loss": 0.6949, "step": 355 }, { "epoch": 1.58, "learning_rate": 7.886935078479445e-05, "loss": 0.6738, "step": 356 }, { "epoch": 1.59, "learning_rate": 7.875317010096705e-05, "loss": 0.7596, "step": 357 }, { "epoch": 1.59, "learning_rate": 7.863675700402526e-05, "loss": 0.7138, "step": 358 }, { "epoch": 1.6, "learning_rate": 7.852011243494115e-05, "loss": 0.7938, "step": 359 }, { "epoch": 1.6, "learning_rate": 7.840323733655778e-05, "loss": 0.7411, "step": 360 }, { "epoch": 1.6, "eval_loss": 0.7443473935127258, "eval_runtime": 16.2109, "eval_samples_per_second": 4.997, "eval_steps_per_second": 1.295, "step": 360 }, { "epoch": 1.6, "learning_rate": 7.828613265358167e-05, "loss": 0.7394, "step": 361 }, { "epoch": 1.61, "learning_rate": 7.816879933257494e-05, "loss": 0.7386, "step": 362 }, { "epoch": 1.61, "learning_rate": 7.805123832194797e-05, "loss": 0.6989, "step": 363 }, { "epoch": 1.62, "learning_rate": 7.793345057195143e-05, "loss": 0.6786, "step": 364 }, { "epoch": 1.62, "learning_rate": 7.78154370346688e-05, "loss": 0.6932, "step": 365 }, { "epoch": 1.63, "learning_rate": 7.76971986640086e-05, "loss": 0.7703, "step": 366 }, { "epoch": 1.63, "learning_rate": 7.757873641569666e-05, "loss": 0.7093, "step": 367 }, { "epoch": 1.64, "learning_rate": 7.746005124726847e-05, "loss": 0.7387, "step": 368 }, { "epoch": 1.64, "learning_rate": 7.734114411806133e-05, "loss": 0.7532, "step": 369 }, { "epoch": 1.64, "learning_rate": 7.722201598920673e-05, "loss": 0.6431, "step": 370 }, { "epoch": 1.65, "learning_rate": 7.710266782362247e-05, "loss": 0.6197, "step": 371 }, { "epoch": 1.65, "learning_rate": 7.698310058600492e-05, "loss": 0.7008, "step": 372 }, { "epoch": 1.66, "learning_rate": 7.68633152428212e-05, "loss": 0.7036, "step": 373 }, { "epoch": 1.66, "learning_rate": 7.674331276230143e-05, "loss": 0.6732, "step": 374 }, { "epoch": 1.67, "learning_rate": 7.662309411443082e-05, "loss": 0.7795, "step": 375 }, { "epoch": 1.67, "learning_rate": 7.65026602709419e-05, "loss": 0.807, "step": 376 }, { "epoch": 1.68, "learning_rate": 7.638201220530665e-05, "loss": 0.7432, "step": 377 }, { "epoch": 1.68, "learning_rate": 7.626115089272852e-05, "loss": 0.7394, "step": 378 }, { "epoch": 1.68, "learning_rate": 7.614007731013477e-05, "loss": 0.7725, "step": 379 }, { "epoch": 1.69, "learning_rate": 7.601879243616839e-05, "loss": 0.7654, "step": 380 }, { "epoch": 1.69, "eval_loss": 0.7455257177352905, "eval_runtime": 16.2214, "eval_samples_per_second": 4.993, "eval_steps_per_second": 1.295, "step": 380 }, { "epoch": 1.69, "learning_rate": 7.58972972511802e-05, "loss": 0.7174, "step": 381 }, { "epoch": 1.7, "learning_rate": 7.577559273722107e-05, "loss": 0.7298, "step": 382 }, { "epoch": 1.7, "learning_rate": 7.565367987803382e-05, "loss": 0.8, "step": 383 }, { "epoch": 1.71, "learning_rate": 7.553155965904535e-05, "loss": 0.7282, "step": 384 }, { "epoch": 1.71, "learning_rate": 7.540923306735868e-05, "loss": 0.7088, "step": 385 }, { "epoch": 1.72, "learning_rate": 7.528670109174489e-05, "loss": 0.7395, "step": 386 }, { "epoch": 1.72, "learning_rate": 7.516396472263524e-05, "loss": 0.7584, "step": 387 }, { "epoch": 1.72, "learning_rate": 7.504102495211312e-05, "loss": 0.7816, "step": 388 }, { "epoch": 1.73, "learning_rate": 7.491788277390595e-05, "loss": 0.7837, "step": 389 }, { "epoch": 1.73, "learning_rate": 7.479453918337734e-05, "loss": 0.7132, "step": 390 }, { "epoch": 1.74, "learning_rate": 7.467099517751879e-05, "loss": 0.7289, "step": 391 }, { "epoch": 1.74, "learning_rate": 7.454725175494183e-05, "loss": 0.7453, "step": 392 }, { "epoch": 1.75, "learning_rate": 7.442330991586995e-05, "loss": 0.7424, "step": 393 }, { "epoch": 1.75, "learning_rate": 7.42991706621303e-05, "loss": 0.7824, "step": 394 }, { "epoch": 1.76, "learning_rate": 7.417483499714589e-05, "loss": 0.6926, "step": 395 }, { "epoch": 1.76, "learning_rate": 7.405030392592723e-05, "loss": 0.72, "step": 396 }, { "epoch": 1.76, "learning_rate": 7.392557845506432e-05, "loss": 0.7812, "step": 397 }, { "epoch": 1.77, "learning_rate": 7.380065959271858e-05, "loss": 0.6893, "step": 398 }, { "epoch": 1.77, "learning_rate": 7.367554834861452e-05, "loss": 0.712, "step": 399 }, { "epoch": 1.78, "learning_rate": 7.355024573403174e-05, "loss": 0.7619, "step": 400 }, { "epoch": 1.78, "eval_loss": 0.7424753904342651, "eval_runtime": 16.2095, "eval_samples_per_second": 4.997, "eval_steps_per_second": 1.296, "step": 400 }, { "epoch": 1.78, "learning_rate": 7.342475276179668e-05, "loss": 0.7277, "step": 401 }, { "epoch": 1.79, "learning_rate": 7.329907044627444e-05, "loss": 0.6948, "step": 402 }, { "epoch": 1.79, "learning_rate": 7.31731998033606e-05, "loss": 0.7299, "step": 403 }, { "epoch": 1.8, "learning_rate": 7.3047141850473e-05, "loss": 0.7115, "step": 404 }, { "epoch": 1.8, "learning_rate": 7.292089760654351e-05, "loss": 0.7089, "step": 405 }, { "epoch": 1.8, "learning_rate": 7.279446809200981e-05, "loss": 0.6761, "step": 406 }, { "epoch": 1.81, "learning_rate": 7.266785432880711e-05, "loss": 0.837, "step": 407 }, { "epoch": 1.81, "learning_rate": 7.254105734035991e-05, "loss": 0.7154, "step": 408 }, { "epoch": 1.82, "learning_rate": 7.241407815157376e-05, "loss": 0.7904, "step": 409 }, { "epoch": 1.82, "learning_rate": 7.228691778882693e-05, "loss": 0.713, "step": 410 }, { "epoch": 1.83, "learning_rate": 7.215957727996207e-05, "loss": 0.7723, "step": 411 }, { "epoch": 1.83, "learning_rate": 7.203205765427806e-05, "loss": 0.6863, "step": 412 }, { "epoch": 1.84, "learning_rate": 7.190435994252149e-05, "loss": 0.6906, "step": 413 }, { "epoch": 1.84, "learning_rate": 7.177648517687852e-05, "loss": 0.6936, "step": 414 }, { "epoch": 1.84, "learning_rate": 7.164843439096636e-05, "loss": 0.7206, "step": 415 }, { "epoch": 1.85, "learning_rate": 7.152020861982505e-05, "loss": 0.7572, "step": 416 }, { "epoch": 1.85, "learning_rate": 7.139180889990903e-05, "loss": 0.7206, "step": 417 }, { "epoch": 1.86, "learning_rate": 7.126323626907878e-05, "loss": 0.7221, "step": 418 }, { "epoch": 1.86, "learning_rate": 7.113449176659241e-05, "loss": 0.6963, "step": 419 }, { "epoch": 1.87, "learning_rate": 7.100557643309732e-05, "loss": 0.7152, "step": 420 }, { "epoch": 1.87, "eval_loss": 0.7406202554702759, "eval_runtime": 16.1712, "eval_samples_per_second": 5.009, "eval_steps_per_second": 1.299, "step": 420 }, { "epoch": 1.87, "learning_rate": 7.087649131062167e-05, "loss": 0.6858, "step": 421 }, { "epoch": 1.88, "learning_rate": 7.074723744256609e-05, "loss": 0.886, "step": 422 }, { "epoch": 1.88, "learning_rate": 7.061781587369519e-05, "loss": 0.7167, "step": 423 }, { "epoch": 1.88, "learning_rate": 7.048822765012906e-05, "loss": 0.6678, "step": 424 }, { "epoch": 1.89, "learning_rate": 7.035847381933493e-05, "loss": 0.7218, "step": 425 }, { "epoch": 1.89, "learning_rate": 7.02285554301186e-05, "loss": 0.7909, "step": 426 }, { "epoch": 1.9, "learning_rate": 7.009847353261601e-05, "loss": 0.7765, "step": 427 }, { "epoch": 1.9, "learning_rate": 6.996822917828477e-05, "loss": 0.7674, "step": 428 }, { "epoch": 1.91, "learning_rate": 6.983782341989562e-05, "loss": 0.7803, "step": 429 }, { "epoch": 1.91, "learning_rate": 6.970725731152388e-05, "loss": 0.7154, "step": 430 }, { "epoch": 1.92, "learning_rate": 6.95765319085411e-05, "loss": 0.7647, "step": 431 }, { "epoch": 1.92, "learning_rate": 6.944564826760631e-05, "loss": 0.7262, "step": 432 }, { "epoch": 1.92, "learning_rate": 6.931460744665763e-05, "loss": 0.6987, "step": 433 }, { "epoch": 1.93, "learning_rate": 6.918341050490369e-05, "loss": 0.6296, "step": 434 }, { "epoch": 1.93, "learning_rate": 6.905205850281502e-05, "loss": 0.6835, "step": 435 }, { "epoch": 1.94, "learning_rate": 6.892055250211552e-05, "loss": 0.7104, "step": 436 }, { "epoch": 1.94, "learning_rate": 6.878889356577386e-05, "loss": 0.7301, "step": 437 }, { "epoch": 1.95, "learning_rate": 6.865708275799492e-05, "loss": 0.7071, "step": 438 }, { "epoch": 1.95, "learning_rate": 6.852512114421112e-05, "loss": 0.6801, "step": 439 }, { "epoch": 1.96, "learning_rate": 6.83930097910739e-05, "loss": 0.7442, "step": 440 }, { "epoch": 1.96, "eval_loss": 0.7402726411819458, "eval_runtime": 16.1717, "eval_samples_per_second": 5.009, "eval_steps_per_second": 1.299, "step": 440 }, { "epoch": 1.96, "learning_rate": 6.826074976644502e-05, "loss": 0.6664, "step": 441 }, { "epoch": 1.96, "learning_rate": 6.812834213938794e-05, "loss": 0.6879, "step": 442 }, { "epoch": 1.97, "learning_rate": 6.799578798015926e-05, "loss": 0.704, "step": 443 }, { "epoch": 1.97, "learning_rate": 6.786308836019997e-05, "loss": 0.6899, "step": 444 }, { "epoch": 1.98, "learning_rate": 6.773024435212678e-05, "loss": 0.6909, "step": 445 }, { "epoch": 1.98, "learning_rate": 6.759725702972358e-05, "loss": 0.6586, "step": 446 }, { "epoch": 1.99, "learning_rate": 6.746412746793263e-05, "loss": 0.6694, "step": 447 }, { "epoch": 1.99, "learning_rate": 6.733085674284589e-05, "loss": 0.7332, "step": 448 }, { "epoch": 2.0, "learning_rate": 6.719744593169641e-05, "loss": 0.7097, "step": 449 }, { "epoch": 2.0, "learning_rate": 6.706389611284954e-05, "loss": 0.7246, "step": 450 }, { "epoch": 2.0, "learning_rate": 6.693020836579418e-05, "loss": 0.7681, "step": 451 }, { "epoch": 2.01, "learning_rate": 6.679638377113419e-05, "loss": 0.6958, "step": 452 }, { "epoch": 2.01, "learning_rate": 6.666242341057958e-05, "loss": 0.7621, "step": 453 }, { "epoch": 2.02, "learning_rate": 6.652832836693764e-05, "loss": 0.695, "step": 454 }, { "epoch": 2.02, "learning_rate": 6.639409972410446e-05, "loss": 0.8196, "step": 455 }, { "epoch": 2.03, "learning_rate": 6.625973856705593e-05, "loss": 0.7137, "step": 456 }, { "epoch": 2.03, "learning_rate": 6.612524598183907e-05, "loss": 0.7218, "step": 457 }, { "epoch": 2.04, "learning_rate": 6.599062305556325e-05, "loss": 0.6928, "step": 458 }, { "epoch": 2.04, "learning_rate": 6.58558708763914e-05, "loss": 0.7925, "step": 459 }, { "epoch": 2.04, "learning_rate": 6.57209905335312e-05, "loss": 0.709, "step": 460 }, { "epoch": 2.04, "eval_loss": 0.7391338348388672, "eval_runtime": 16.1675, "eval_samples_per_second": 5.01, "eval_steps_per_second": 1.299, "step": 460 }, { "epoch": 2.05, "learning_rate": 6.558598311722626e-05, "loss": 0.6628, "step": 461 }, { "epoch": 2.05, "learning_rate": 6.545084971874738e-05, "loss": 0.7818, "step": 462 }, { "epoch": 2.06, "learning_rate": 6.531559143038362e-05, "loss": 0.7831, "step": 463 }, { "epoch": 2.06, "learning_rate": 6.518020934543359e-05, "loss": 0.688, "step": 464 }, { "epoch": 2.07, "learning_rate": 6.504470455819652e-05, "loss": 0.7234, "step": 465 }, { "epoch": 2.07, "learning_rate": 6.49090781639634e-05, "loss": 0.6163, "step": 466 }, { "epoch": 2.08, "learning_rate": 6.477333125900831e-05, "loss": 0.6454, "step": 467 }, { "epoch": 2.08, "learning_rate": 6.463746494057928e-05, "loss": 0.6303, "step": 468 }, { "epoch": 2.08, "learning_rate": 6.450148030688962e-05, "loss": 0.7002, "step": 469 }, { "epoch": 2.09, "learning_rate": 6.436537845710903e-05, "loss": 0.6609, "step": 470 }, { "epoch": 2.09, "learning_rate": 6.422916049135462e-05, "loss": 0.6433, "step": 471 }, { "epoch": 2.1, "learning_rate": 6.409282751068206e-05, "loss": 0.7334, "step": 472 }, { "epoch": 2.1, "learning_rate": 6.395638061707674e-05, "loss": 0.7172, "step": 473 }, { "epoch": 2.11, "learning_rate": 6.381982091344477e-05, "loss": 0.6729, "step": 474 }, { "epoch": 2.11, "learning_rate": 6.368314950360415e-05, "loss": 0.6502, "step": 475 }, { "epoch": 2.12, "learning_rate": 6.354636749227576e-05, "loss": 0.7175, "step": 476 }, { "epoch": 2.12, "learning_rate": 6.340947598507449e-05, "loss": 0.6623, "step": 477 }, { "epoch": 2.12, "learning_rate": 6.327247608850034e-05, "loss": 0.6991, "step": 478 }, { "epoch": 2.13, "learning_rate": 6.313536890992935e-05, "loss": 0.6839, "step": 479 }, { "epoch": 2.13, "learning_rate": 6.299815555760477e-05, "loss": 0.6784, "step": 480 }, { "epoch": 2.13, "eval_loss": 0.7386890649795532, "eval_runtime": 16.1698, "eval_samples_per_second": 5.009, "eval_steps_per_second": 1.299, "step": 480 }, { "epoch": 2.14, "learning_rate": 6.286083714062804e-05, "loss": 0.6487, "step": 481 }, { "epoch": 2.14, "learning_rate": 6.272341476894985e-05, "loss": 0.7033, "step": 482 }, { "epoch": 2.15, "learning_rate": 6.258588955336117e-05, "loss": 0.7006, "step": 483 }, { "epoch": 2.15, "learning_rate": 6.244826260548426e-05, "loss": 0.7005, "step": 484 }, { "epoch": 2.16, "learning_rate": 6.231053503776362e-05, "loss": 0.6183, "step": 485 }, { "epoch": 2.16, "learning_rate": 6.217270796345721e-05, "loss": 0.7022, "step": 486 }, { "epoch": 2.16, "learning_rate": 6.203478249662714e-05, "loss": 0.6653, "step": 487 }, { "epoch": 2.17, "learning_rate": 6.189675975213094e-05, "loss": 0.6226, "step": 488 }, { "epoch": 2.17, "learning_rate": 6.175864084561242e-05, "loss": 0.6435, "step": 489 }, { "epoch": 2.18, "learning_rate": 6.162042689349264e-05, "loss": 0.6788, "step": 490 }, { "epoch": 2.18, "learning_rate": 6.148211901296095e-05, "loss": 0.669, "step": 491 }, { "epoch": 2.19, "learning_rate": 6.13437183219659e-05, "loss": 0.6735, "step": 492 }, { "epoch": 2.19, "learning_rate": 6.120522593920628e-05, "loss": 0.6594, "step": 493 }, { "epoch": 2.2, "learning_rate": 6.106664298412196e-05, "loss": 0.6089, "step": 494 }, { "epoch": 2.2, "learning_rate": 6.092797057688495e-05, "loss": 0.6393, "step": 495 }, { "epoch": 2.2, "learning_rate": 6.078920983839031e-05, "loss": 0.6794, "step": 496 }, { "epoch": 2.21, "learning_rate": 6.0650361890247075e-05, "loss": 0.6395, "step": 497 }, { "epoch": 2.21, "learning_rate": 6.051142785476921e-05, "loss": 0.6425, "step": 498 }, { "epoch": 2.22, "learning_rate": 6.037240885496649e-05, "loss": 0.6211, "step": 499 }, { "epoch": 2.22, "learning_rate": 6.023330601453551e-05, "loss": 0.7064, "step": 500 }, { "epoch": 2.22, "eval_loss": 0.7412368655204773, "eval_runtime": 16.197, "eval_samples_per_second": 5.001, "eval_steps_per_second": 1.297, "step": 500 }, { "epoch": 2.23, "learning_rate": 6.009412045785051e-05, "loss": 0.7181, "step": 501 }, { "epoch": 2.23, "learning_rate": 5.995485330995438e-05, "loss": 0.6969, "step": 502 }, { "epoch": 2.24, "learning_rate": 5.981550569654947e-05, "loss": 0.7208, "step": 503 }, { "epoch": 2.24, "learning_rate": 5.967607874398854e-05, "loss": 0.747, "step": 504 }, { "epoch": 2.24, "learning_rate": 5.953657357926569e-05, "loss": 0.774, "step": 505 }, { "epoch": 2.25, "learning_rate": 5.939699133000714e-05, "loss": 0.7115, "step": 506 }, { "epoch": 2.25, "learning_rate": 5.925733312446228e-05, "loss": 0.7553, "step": 507 }, { "epoch": 2.26, "learning_rate": 5.911760009149438e-05, "loss": 0.6754, "step": 508 }, { "epoch": 2.26, "learning_rate": 5.8977793360571596e-05, "loss": 0.6937, "step": 509 }, { "epoch": 2.27, "learning_rate": 5.883791406175775e-05, "loss": 0.7532, "step": 510 }, { "epoch": 2.27, "learning_rate": 5.8697963325703255e-05, "loss": 0.7376, "step": 511 }, { "epoch": 2.28, "learning_rate": 5.8557942283635934e-05, "loss": 0.6842, "step": 512 }, { "epoch": 2.28, "learning_rate": 5.841785206735192e-05, "loss": 0.6874, "step": 513 }, { "epoch": 2.28, "learning_rate": 5.82776938092065e-05, "loss": 0.7199, "step": 514 }, { "epoch": 2.29, "learning_rate": 5.813746864210489e-05, "loss": 0.7406, "step": 515 }, { "epoch": 2.29, "learning_rate": 5.7997177699493175e-05, "loss": 0.7413, "step": 516 }, { "epoch": 2.3, "learning_rate": 5.785682211534911e-05, "loss": 0.6762, "step": 517 }, { "epoch": 2.3, "learning_rate": 5.771640302417291e-05, "loss": 0.6858, "step": 518 }, { "epoch": 2.31, "learning_rate": 5.7575921560978154e-05, "loss": 0.6122, "step": 519 }, { "epoch": 2.31, "learning_rate": 5.7435378861282585e-05, "loss": 0.718, "step": 520 }, { "epoch": 2.31, "eval_loss": 0.7355238795280457, "eval_runtime": 16.2114, "eval_samples_per_second": 4.996, "eval_steps_per_second": 1.295, "step": 520 }, { "epoch": 2.32, "learning_rate": 5.72947760610989e-05, "loss": 0.736, "step": 521 }, { "epoch": 2.32, "learning_rate": 5.71541142969256e-05, "loss": 0.6424, "step": 522 }, { "epoch": 2.32, "learning_rate": 5.701339470573779e-05, "loss": 0.6246, "step": 523 }, { "epoch": 2.33, "learning_rate": 5.6872618424977996e-05, "loss": 0.7625, "step": 524 }, { "epoch": 2.33, "learning_rate": 5.673178659254697e-05, "loss": 0.6304, "step": 525 }, { "epoch": 2.34, "learning_rate": 5.659090034679451e-05, "loss": 0.6563, "step": 526 }, { "epoch": 2.34, "learning_rate": 5.644996082651017e-05, "loss": 0.7153, "step": 527 }, { "epoch": 2.35, "learning_rate": 5.6308969170914214e-05, "loss": 0.6987, "step": 528 }, { "epoch": 2.35, "learning_rate": 5.6167926519648275e-05, "loss": 0.7146, "step": 529 }, { "epoch": 2.36, "learning_rate": 5.602683401276615e-05, "loss": 0.6472, "step": 530 }, { "epoch": 2.36, "learning_rate": 5.588569279072471e-05, "loss": 0.6651, "step": 531 }, { "epoch": 2.36, "learning_rate": 5.574450399437452e-05, "loss": 0.6967, "step": 532 }, { "epoch": 2.37, "learning_rate": 5.56032687649507e-05, "loss": 0.6471, "step": 533 }, { "epoch": 2.37, "learning_rate": 5.546198824406372e-05, "loss": 0.6559, "step": 534 }, { "epoch": 2.38, "learning_rate": 5.532066357369012e-05, "loss": 0.7009, "step": 535 }, { "epoch": 2.38, "learning_rate": 5.5179295896163306e-05, "loss": 0.6793, "step": 536 }, { "epoch": 2.39, "learning_rate": 5.5037886354164315e-05, "loss": 0.7662, "step": 537 }, { "epoch": 2.39, "learning_rate": 5.489643609071259e-05, "loss": 0.6045, "step": 538 }, { "epoch": 2.4, "learning_rate": 5.475494624915668e-05, "loss": 0.6578, "step": 539 }, { "epoch": 2.4, "learning_rate": 5.4613417973165106e-05, "loss": 0.6901, "step": 540 }, { "epoch": 2.4, "eval_loss": 0.7356687188148499, "eval_runtime": 16.1873, "eval_samples_per_second": 5.004, "eval_steps_per_second": 1.297, "step": 540 }, { "epoch": 2.4, "learning_rate": 5.447185240671703e-05, "loss": 0.7187, "step": 541 }, { "epoch": 2.41, "learning_rate": 5.433025069409301e-05, "loss": 0.6906, "step": 542 }, { "epoch": 2.41, "learning_rate": 5.418861397986581e-05, "loss": 0.6637, "step": 543 }, { "epoch": 2.42, "learning_rate": 5.404694340889112e-05, "loss": 0.7194, "step": 544 }, { "epoch": 2.42, "learning_rate": 5.390524012629824e-05, "loss": 0.6827, "step": 545 }, { "epoch": 2.43, "learning_rate": 5.376350527748094e-05, "loss": 0.6821, "step": 546 }, { "epoch": 2.43, "learning_rate": 5.3621740008088126e-05, "loss": 0.7305, "step": 547 }, { "epoch": 2.44, "learning_rate": 5.347994546401457e-05, "loss": 0.5709, "step": 548 }, { "epoch": 2.44, "learning_rate": 5.333812279139169e-05, "loss": 0.6363, "step": 549 }, { "epoch": 2.44, "learning_rate": 5.3196273136578286e-05, "loss": 0.7283, "step": 550 }, { "epoch": 2.45, "learning_rate": 5.305439764615121e-05, "loss": 0.7929, "step": 551 }, { "epoch": 2.45, "learning_rate": 5.291249746689619e-05, "loss": 0.7669, "step": 552 }, { "epoch": 2.46, "learning_rate": 5.27705737457985e-05, "loss": 0.722, "step": 553 }, { "epoch": 2.46, "learning_rate": 5.262862763003369e-05, "loss": 0.6837, "step": 554 }, { "epoch": 2.47, "learning_rate": 5.248666026695834e-05, "loss": 0.753, "step": 555 }, { "epoch": 2.47, "learning_rate": 5.234467280410078e-05, "loss": 0.7146, "step": 556 }, { "epoch": 2.48, "learning_rate": 5.220266638915178e-05, "loss": 0.6773, "step": 557 }, { "epoch": 2.48, "learning_rate": 5.206064216995532e-05, "loss": 0.7773, "step": 558 }, { "epoch": 2.48, "learning_rate": 5.191860129449931e-05, "loss": 0.7431, "step": 559 }, { "epoch": 2.49, "learning_rate": 5.177654491090627e-05, "loss": 0.7091, "step": 560 }, { "epoch": 2.49, "eval_loss": 0.7365412712097168, "eval_runtime": 16.1735, "eval_samples_per_second": 5.008, "eval_steps_per_second": 1.298, "step": 560 }, { "epoch": 2.49, "learning_rate": 5.163447416742405e-05, "loss": 0.6741, "step": 561 }, { "epoch": 2.5, "learning_rate": 5.149239021241663e-05, "loss": 0.6587, "step": 562 }, { "epoch": 2.5, "learning_rate": 5.1350294194354744e-05, "loss": 0.6847, "step": 563 }, { "epoch": 2.51, "learning_rate": 5.1208187261806615e-05, "loss": 0.7163, "step": 564 }, { "epoch": 2.51, "learning_rate": 5.106607056342874e-05, "loss": 0.6476, "step": 565 }, { "epoch": 2.52, "learning_rate": 5.092394524795649e-05, "loss": 0.6942, "step": 566 }, { "epoch": 2.52, "learning_rate": 5.0781812464194955e-05, "loss": 0.6458, "step": 567 }, { "epoch": 2.52, "learning_rate": 5.0639673361009544e-05, "loss": 0.6589, "step": 568 }, { "epoch": 2.53, "learning_rate": 5.049752908731675e-05, "loss": 0.6806, "step": 569 }, { "epoch": 2.53, "learning_rate": 5.035538079207488e-05, "loss": 0.6733, "step": 570 }, { "epoch": 2.54, "learning_rate": 5.021322962427475e-05, "loss": 0.7576, "step": 571 }, { "epoch": 2.54, "learning_rate": 5.0071076732930355e-05, "loss": 0.7129, "step": 572 }, { "epoch": 2.55, "learning_rate": 4.9928923267069656e-05, "loss": 0.7436, "step": 573 }, { "epoch": 2.55, "learning_rate": 4.978677037572526e-05, "loss": 0.6551, "step": 574 }, { "epoch": 2.56, "learning_rate": 4.964461920792512e-05, "loss": 0.6978, "step": 575 }, { "epoch": 2.56, "learning_rate": 4.950247091268326e-05, "loss": 0.7198, "step": 576 }, { "epoch": 2.56, "learning_rate": 4.936032663899046e-05, "loss": 0.6327, "step": 577 }, { "epoch": 2.57, "learning_rate": 4.921818753580506e-05, "loss": 0.7077, "step": 578 }, { "epoch": 2.57, "learning_rate": 4.907605475204352e-05, "loss": 0.7271, "step": 579 }, { "epoch": 2.58, "learning_rate": 4.893392943657127e-05, "loss": 0.6951, "step": 580 }, { "epoch": 2.58, "eval_loss": 0.7344306111335754, "eval_runtime": 16.1779, "eval_samples_per_second": 5.007, "eval_steps_per_second": 1.298, "step": 580 }, { "epoch": 2.58, "learning_rate": 4.87918127381934e-05, "loss": 0.6326, "step": 581 }, { "epoch": 2.59, "learning_rate": 4.8649705805645274e-05, "loss": 0.733, "step": 582 }, { "epoch": 2.59, "learning_rate": 4.850760978758338e-05, "loss": 0.6771, "step": 583 }, { "epoch": 2.6, "learning_rate": 4.836552583257597e-05, "loss": 0.673, "step": 584 }, { "epoch": 2.6, "learning_rate": 4.822345508909376e-05, "loss": 0.7064, "step": 585 }, { "epoch": 2.6, "learning_rate": 4.8081398705500706e-05, "loss": 0.6944, "step": 586 }, { "epoch": 2.61, "learning_rate": 4.79393578300447e-05, "loss": 0.6399, "step": 587 }, { "epoch": 2.61, "learning_rate": 4.7797333610848246e-05, "loss": 0.646, "step": 588 }, { "epoch": 2.62, "learning_rate": 4.7655327195899244e-05, "loss": 0.6829, "step": 589 }, { "epoch": 2.62, "learning_rate": 4.7513339733041663e-05, "loss": 0.6971, "step": 590 }, { "epoch": 2.63, "learning_rate": 4.737137236996631e-05, "loss": 0.6354, "step": 591 }, { "epoch": 2.63, "learning_rate": 4.72294262542015e-05, "loss": 0.6524, "step": 592 }, { "epoch": 2.64, "learning_rate": 4.7087502533103806e-05, "loss": 0.6613, "step": 593 }, { "epoch": 2.64, "learning_rate": 4.694560235384879e-05, "loss": 0.6854, "step": 594 }, { "epoch": 2.64, "learning_rate": 4.6803726863421725e-05, "loss": 0.6929, "step": 595 }, { "epoch": 2.65, "learning_rate": 4.666187720860831e-05, "loss": 0.6584, "step": 596 }, { "epoch": 2.65, "learning_rate": 4.652005453598544e-05, "loss": 0.7853, "step": 597 }, { "epoch": 2.66, "learning_rate": 4.6378259991911886e-05, "loss": 0.6494, "step": 598 }, { "epoch": 2.66, "learning_rate": 4.623649472251907e-05, "loss": 0.617, "step": 599 }, { "epoch": 2.67, "learning_rate": 4.609475987370177e-05, "loss": 0.5959, "step": 600 }, { "epoch": 2.67, "eval_loss": 0.7361752390861511, "eval_runtime": 16.2017, "eval_samples_per_second": 4.999, "eval_steps_per_second": 1.296, "step": 600 }, { "epoch": 2.67, "learning_rate": 4.595305659110889e-05, "loss": 0.7532, "step": 601 }, { "epoch": 2.68, "learning_rate": 4.5811386020134205e-05, "loss": 0.7152, "step": 602 }, { "epoch": 2.68, "learning_rate": 4.566974930590701e-05, "loss": 0.6756, "step": 603 }, { "epoch": 2.68, "learning_rate": 4.552814759328299e-05, "loss": 0.7294, "step": 604 }, { "epoch": 2.69, "learning_rate": 4.5386582026834906e-05, "loss": 0.7813, "step": 605 }, { "epoch": 2.69, "learning_rate": 4.524505375084333e-05, "loss": 0.721, "step": 606 }, { "epoch": 2.7, "learning_rate": 4.510356390928742e-05, "loss": 0.7941, "step": 607 }, { "epoch": 2.7, "learning_rate": 4.49621136458357e-05, "loss": 0.7977, "step": 608 }, { "epoch": 2.71, "learning_rate": 4.4820704103836705e-05, "loss": 0.6903, "step": 609 }, { "epoch": 2.71, "learning_rate": 4.467933642630989e-05, "loss": 0.7076, "step": 610 }, { "epoch": 2.72, "learning_rate": 4.453801175593629e-05, "loss": 0.6853, "step": 611 }, { "epoch": 2.72, "learning_rate": 4.439673123504931e-05, "loss": 0.6855, "step": 612 }, { "epoch": 2.72, "learning_rate": 4.425549600562549e-05, "loss": 0.7278, "step": 613 }, { "epoch": 2.73, "learning_rate": 4.411430720927531e-05, "loss": 0.7256, "step": 614 }, { "epoch": 2.73, "learning_rate": 4.397316598723385e-05, "loss": 0.7019, "step": 615 }, { "epoch": 2.74, "learning_rate": 4.383207348035175e-05, "loss": 0.6817, "step": 616 }, { "epoch": 2.74, "learning_rate": 4.369103082908581e-05, "loss": 0.7147, "step": 617 }, { "epoch": 2.75, "learning_rate": 4.3550039173489845e-05, "loss": 0.6466, "step": 618 }, { "epoch": 2.75, "learning_rate": 4.340909965320552e-05, "loss": 0.7691, "step": 619 }, { "epoch": 2.76, "learning_rate": 4.326821340745304e-05, "loss": 0.619, "step": 620 }, { "epoch": 2.76, "eval_loss": 0.7325919270515442, "eval_runtime": 16.2144, "eval_samples_per_second": 4.996, "eval_steps_per_second": 1.295, "step": 620 }, { "epoch": 2.76, "learning_rate": 4.312738157502202e-05, "loss": 0.5971, "step": 621 }, { "epoch": 2.76, "learning_rate": 4.298660529426223e-05, "loss": 0.6656, "step": 622 }, { "epoch": 2.77, "learning_rate": 4.284588570307442e-05, "loss": 0.7092, "step": 623 }, { "epoch": 2.77, "learning_rate": 4.270522393890112e-05, "loss": 0.6887, "step": 624 }, { "epoch": 2.78, "learning_rate": 4.2564621138717407e-05, "loss": 0.739, "step": 625 }, { "epoch": 2.78, "learning_rate": 4.2424078439021844e-05, "loss": 0.6589, "step": 626 }, { "epoch": 2.79, "learning_rate": 4.22835969758271e-05, "loss": 0.6241, "step": 627 }, { "epoch": 2.79, "learning_rate": 4.21431778846509e-05, "loss": 0.6595, "step": 628 }, { "epoch": 2.8, "learning_rate": 4.200282230050683e-05, "loss": 0.6708, "step": 629 }, { "epoch": 2.8, "learning_rate": 4.186253135789511e-05, "loss": 0.661, "step": 630 }, { "epoch": 2.8, "learning_rate": 4.1722306190793495e-05, "loss": 0.7229, "step": 631 }, { "epoch": 2.81, "learning_rate": 4.1582147932648074e-05, "loss": 0.5988, "step": 632 }, { "epoch": 2.81, "learning_rate": 4.144205771636407e-05, "loss": 0.7131, "step": 633 }, { "epoch": 2.82, "learning_rate": 4.1302036674296756e-05, "loss": 0.6935, "step": 634 }, { "epoch": 2.82, "learning_rate": 4.116208593824227e-05, "loss": 0.7132, "step": 635 }, { "epoch": 2.83, "learning_rate": 4.102220663942841e-05, "loss": 0.7209, "step": 636 }, { "epoch": 2.83, "learning_rate": 4.088239990850562e-05, "loss": 0.712, "step": 637 }, { "epoch": 2.84, "learning_rate": 4.074266687553773e-05, "loss": 0.7514, "step": 638 }, { "epoch": 2.84, "learning_rate": 4.060300866999286e-05, "loss": 0.6708, "step": 639 }, { "epoch": 2.84, "learning_rate": 4.0463426420734323e-05, "loss": 0.6953, "step": 640 }, { "epoch": 2.84, "eval_loss": 0.7320604920387268, "eval_runtime": 16.1931, "eval_samples_per_second": 5.002, "eval_steps_per_second": 1.297, "step": 640 }, { "epoch": 2.85, "learning_rate": 4.032392125601147e-05, "loss": 0.6768, "step": 641 }, { "epoch": 2.85, "learning_rate": 4.018449430345054e-05, "loss": 0.6666, "step": 642 }, { "epoch": 2.86, "learning_rate": 4.004514669004562e-05, "loss": 0.6697, "step": 643 }, { "epoch": 2.86, "learning_rate": 3.99058795421495e-05, "loss": 0.6809, "step": 644 }, { "epoch": 2.87, "learning_rate": 3.976669398546451e-05, "loss": 0.6717, "step": 645 }, { "epoch": 2.87, "learning_rate": 3.962759114503353e-05, "loss": 0.6091, "step": 646 }, { "epoch": 2.88, "learning_rate": 3.948857214523081e-05, "loss": 0.6579, "step": 647 }, { "epoch": 2.88, "learning_rate": 3.934963810975294e-05, "loss": 0.7374, "step": 648 }, { "epoch": 2.88, "learning_rate": 3.92107901616097e-05, "loss": 0.6407, "step": 649 }, { "epoch": 2.89, "learning_rate": 3.9072029423115055e-05, "loss": 0.6414, "step": 650 }, { "epoch": 2.89, "learning_rate": 3.8933357015878064e-05, "loss": 0.7717, "step": 651 }, { "epoch": 2.9, "learning_rate": 3.8794774060793735e-05, "loss": 0.6792, "step": 652 }, { "epoch": 2.9, "learning_rate": 3.86562816780341e-05, "loss": 0.7022, "step": 653 }, { "epoch": 2.91, "learning_rate": 3.851788098703907e-05, "loss": 0.729, "step": 654 }, { "epoch": 2.91, "learning_rate": 3.837957310650738e-05, "loss": 0.7356, "step": 655 }, { "epoch": 2.92, "learning_rate": 3.8241359154387595e-05, "loss": 0.6932, "step": 656 }, { "epoch": 2.92, "learning_rate": 3.8103240247869075e-05, "loss": 0.6775, "step": 657 }, { "epoch": 2.92, "learning_rate": 3.7965217503372877e-05, "loss": 0.6587, "step": 658 }, { "epoch": 2.93, "learning_rate": 3.7827292036542805e-05, "loss": 0.7077, "step": 659 }, { "epoch": 2.93, "learning_rate": 3.768946496223637e-05, "loss": 0.6363, "step": 660 }, { "epoch": 2.93, "eval_loss": 0.7314029932022095, "eval_runtime": 16.2089, "eval_samples_per_second": 4.997, "eval_steps_per_second": 1.296, "step": 660 }, { "epoch": 2.94, "learning_rate": 3.755173739451575e-05, "loss": 0.6865, "step": 661 }, { "epoch": 2.94, "learning_rate": 3.7414110446638825e-05, "loss": 0.7104, "step": 662 }, { "epoch": 2.95, "learning_rate": 3.7276585231050145e-05, "loss": 0.6059, "step": 663 }, { "epoch": 2.95, "learning_rate": 3.7139162859371955e-05, "loss": 0.6314, "step": 664 }, { "epoch": 2.96, "learning_rate": 3.700184444239524e-05, "loss": 0.7693, "step": 665 }, { "epoch": 2.96, "learning_rate": 3.6864631090070655e-05, "loss": 0.6617, "step": 666 }, { "epoch": 2.96, "learning_rate": 3.672752391149966e-05, "loss": 0.6895, "step": 667 }, { "epoch": 2.97, "learning_rate": 3.659052401492551e-05, "loss": 0.7244, "step": 668 }, { "epoch": 2.97, "learning_rate": 3.645363250772425e-05, "loss": 0.6243, "step": 669 }, { "epoch": 2.98, "learning_rate": 3.631685049639586e-05, "loss": 0.5942, "step": 670 }, { "epoch": 2.98, "learning_rate": 3.618017908655523e-05, "loss": 0.6391, "step": 671 }, { "epoch": 2.99, "learning_rate": 3.6043619382923274e-05, "loss": 0.6535, "step": 672 }, { "epoch": 2.99, "learning_rate": 3.590717248931794e-05, "loss": 0.602, "step": 673 }, { "epoch": 3.0, "learning_rate": 3.5770839508645385e-05, "loss": 0.6396, "step": 674 }, { "epoch": 3.0, "learning_rate": 3.563462154289098e-05, "loss": 0.725, "step": 675 }, { "epoch": 3.0, "learning_rate": 3.5498519693110375e-05, "loss": 0.8034, "step": 676 }, { "epoch": 3.01, "learning_rate": 3.536253505942073e-05, "loss": 0.6909, "step": 677 }, { "epoch": 3.01, "learning_rate": 3.5226668740991705e-05, "loss": 0.7072, "step": 678 }, { "epoch": 3.02, "learning_rate": 3.509092183603659e-05, "loss": 0.7086, "step": 679 }, { "epoch": 3.02, "learning_rate": 3.4955295441803495e-05, "loss": 0.7257, "step": 680 }, { "epoch": 3.02, "eval_loss": 0.7309479117393494, "eval_runtime": 16.2583, "eval_samples_per_second": 4.982, "eval_steps_per_second": 1.292, "step": 680 }, { "epoch": 3.03, "learning_rate": 3.481979065456642e-05, "loss": 0.6641, "step": 681 }, { "epoch": 3.03, "learning_rate": 3.468440856961639e-05, "loss": 0.675, "step": 682 }, { "epoch": 3.04, "learning_rate": 3.4549150281252636e-05, "loss": 0.6748, "step": 683 }, { "epoch": 3.04, "learning_rate": 3.4414016882773757e-05, "loss": 0.6625, "step": 684 }, { "epoch": 3.04, "learning_rate": 3.427900946646882e-05, "loss": 0.6625, "step": 685 }, { "epoch": 3.05, "learning_rate": 3.414412912360861e-05, "loss": 0.6641, "step": 686 }, { "epoch": 3.05, "learning_rate": 3.400937694443678e-05, "loss": 0.6629, "step": 687 }, { "epoch": 3.06, "learning_rate": 3.387475401816096e-05, "loss": 0.6882, "step": 688 }, { "epoch": 3.06, "learning_rate": 3.37402614329441e-05, "loss": 0.6896, "step": 689 }, { "epoch": 3.07, "learning_rate": 3.360590027589556e-05, "loss": 0.6455, "step": 690 }, { "epoch": 3.07, "learning_rate": 3.3471671633062376e-05, "loss": 0.6925, "step": 691 }, { "epoch": 3.08, "learning_rate": 3.333757658942045e-05, "loss": 0.5948, "step": 692 }, { "epoch": 3.08, "learning_rate": 3.320361622886581e-05, "loss": 0.6794, "step": 693 }, { "epoch": 3.08, "learning_rate": 3.306979163420582e-05, "loss": 0.733, "step": 694 }, { "epoch": 3.09, "learning_rate": 3.293610388715048e-05, "loss": 0.6766, "step": 695 }, { "epoch": 3.09, "learning_rate": 3.2802554068303596e-05, "loss": 0.6086, "step": 696 }, { "epoch": 3.1, "learning_rate": 3.2669143257154113e-05, "loss": 0.6252, "step": 697 }, { "epoch": 3.1, "learning_rate": 3.253587253206738e-05, "loss": 0.6326, "step": 698 }, { "epoch": 3.11, "learning_rate": 3.2402742970276426e-05, "loss": 0.7659, "step": 699 }, { "epoch": 3.11, "learning_rate": 3.226975564787322e-05, "loss": 0.6001, "step": 700 }, { "epoch": 3.11, "eval_loss": 0.7316482067108154, "eval_runtime": 16.2988, "eval_samples_per_second": 4.97, "eval_steps_per_second": 1.288, "step": 700 }, { "epoch": 3.12, "learning_rate": 3.213691163980004e-05, "loss": 0.6963, "step": 701 }, { "epoch": 3.12, "learning_rate": 3.200421201984074e-05, "loss": 0.6698, "step": 702 }, { "epoch": 3.12, "learning_rate": 3.187165786061206e-05, "loss": 0.6681, "step": 703 }, { "epoch": 3.13, "learning_rate": 3.1739250233554996e-05, "loss": 0.6811, "step": 704 }, { "epoch": 3.13, "learning_rate": 3.160699020892612e-05, "loss": 0.6669, "step": 705 }, { "epoch": 3.14, "learning_rate": 3.1474878855788895e-05, "loss": 0.6001, "step": 706 }, { "epoch": 3.14, "learning_rate": 3.13429172420051e-05, "loss": 0.6296, "step": 707 }, { "epoch": 3.15, "learning_rate": 3.121110643422615e-05, "loss": 0.6481, "step": 708 }, { "epoch": 3.15, "learning_rate": 3.107944749788449e-05, "loss": 0.6581, "step": 709 }, { "epoch": 3.16, "learning_rate": 3.094794149718498e-05, "loss": 0.7019, "step": 710 }, { "epoch": 3.16, "learning_rate": 3.0816589495096317e-05, "loss": 0.6645, "step": 711 }, { "epoch": 3.16, "learning_rate": 3.068539255334237e-05, "loss": 0.6239, "step": 712 }, { "epoch": 3.17, "learning_rate": 3.05543517323937e-05, "loss": 0.6124, "step": 713 }, { "epoch": 3.17, "learning_rate": 3.0423468091458918e-05, "loss": 0.6291, "step": 714 }, { "epoch": 3.18, "learning_rate": 3.0292742688476122e-05, "loss": 0.6312, "step": 715 }, { "epoch": 3.18, "learning_rate": 3.0162176580104397e-05, "loss": 0.6532, "step": 716 }, { "epoch": 3.19, "learning_rate": 3.003177082171523e-05, "loss": 0.6687, "step": 717 }, { "epoch": 3.19, "learning_rate": 2.9901526467383987e-05, "loss": 0.5563, "step": 718 }, { "epoch": 3.2, "learning_rate": 2.9771444569881413e-05, "loss": 0.666, "step": 719 }, { "epoch": 3.2, "learning_rate": 2.964152618066508e-05, "loss": 0.6246, "step": 720 }, { "epoch": 3.2, "eval_loss": 0.7331957221031189, "eval_runtime": 16.3691, "eval_samples_per_second": 4.948, "eval_steps_per_second": 1.283, "step": 720 }, { "epoch": 3.2, "learning_rate": 2.9511772349870958e-05, "loss": 0.6019, "step": 721 }, { "epoch": 3.21, "learning_rate": 2.9382184126304834e-05, "loss": 0.7261, "step": 722 }, { "epoch": 3.21, "learning_rate": 2.9252762557433922e-05, "loss": 0.6073, "step": 723 }, { "epoch": 3.22, "learning_rate": 2.9123508689378352e-05, "loss": 0.6041, "step": 724 }, { "epoch": 3.22, "learning_rate": 2.8994423566902707e-05, "loss": 0.6246, "step": 725 }, { "epoch": 3.23, "learning_rate": 2.8865508233407595e-05, "loss": 0.7411, "step": 726 }, { "epoch": 3.23, "learning_rate": 2.8736763730921228e-05, "loss": 0.7378, "step": 727 }, { "epoch": 3.24, "learning_rate": 2.8608191100090977e-05, "loss": 0.694, "step": 728 }, { "epoch": 3.24, "learning_rate": 2.847979138017496e-05, "loss": 0.7049, "step": 729 }, { "epoch": 3.24, "learning_rate": 2.835156560903365e-05, "loss": 0.6776, "step": 730 }, { "epoch": 3.25, "learning_rate": 2.822351482312149e-05, "loss": 0.7103, "step": 731 }, { "epoch": 3.25, "learning_rate": 2.80956400574785e-05, "loss": 0.6989, "step": 732 }, { "epoch": 3.26, "learning_rate": 2.7967942345721964e-05, "loss": 0.7524, "step": 733 }, { "epoch": 3.26, "learning_rate": 2.784042272003794e-05, "loss": 0.7126, "step": 734 }, { "epoch": 3.27, "learning_rate": 2.771308221117309e-05, "loss": 0.6255, "step": 735 }, { "epoch": 3.27, "learning_rate": 2.7585921848426242e-05, "loss": 0.663, "step": 736 }, { "epoch": 3.28, "learning_rate": 2.7458942659640085e-05, "loss": 0.6511, "step": 737 }, { "epoch": 3.28, "learning_rate": 2.73321456711929e-05, "loss": 0.6723, "step": 738 }, { "epoch": 3.28, "learning_rate": 2.720553190799019e-05, "loss": 0.6504, "step": 739 }, { "epoch": 3.29, "learning_rate": 2.7079102393456503e-05, "loss": 0.6465, "step": 740 }, { "epoch": 3.29, "eval_loss": 0.7326642274856567, "eval_runtime": 16.2926, "eval_samples_per_second": 4.972, "eval_steps_per_second": 1.289, "step": 740 }, { "epoch": 3.29, "learning_rate": 2.6952858149527017e-05, "loss": 0.6594, "step": 741 }, { "epoch": 3.3, "learning_rate": 2.6826800196639412e-05, "loss": 0.6149, "step": 742 }, { "epoch": 3.3, "learning_rate": 2.6700929553725573e-05, "loss": 0.6817, "step": 743 }, { "epoch": 3.31, "learning_rate": 2.6575247238203328e-05, "loss": 0.6066, "step": 744 }, { "epoch": 3.31, "learning_rate": 2.6449754265968264e-05, "loss": 0.6141, "step": 745 }, { "epoch": 3.32, "learning_rate": 2.6324451651385495e-05, "loss": 0.6716, "step": 746 }, { "epoch": 3.32, "learning_rate": 2.6199340407281435e-05, "loss": 0.6897, "step": 747 }, { "epoch": 3.32, "learning_rate": 2.607442154493568e-05, "loss": 0.6212, "step": 748 }, { "epoch": 3.33, "learning_rate": 2.5949696074072786e-05, "loss": 0.6493, "step": 749 }, { "epoch": 3.33, "learning_rate": 2.5825165002854124e-05, "loss": 0.6363, "step": 750 }, { "epoch": 3.34, "learning_rate": 2.57008293378697e-05, "loss": 0.6591, "step": 751 }, { "epoch": 3.34, "learning_rate": 2.5576690084130085e-05, "loss": 0.6951, "step": 752 }, { "epoch": 3.35, "learning_rate": 2.5452748245058177e-05, "loss": 0.6369, "step": 753 }, { "epoch": 3.35, "learning_rate": 2.532900482248124e-05, "loss": 0.6685, "step": 754 }, { "epoch": 3.36, "learning_rate": 2.5205460816622684e-05, "loss": 0.6467, "step": 755 }, { "epoch": 3.36, "learning_rate": 2.5082117226094047e-05, "loss": 0.6581, "step": 756 }, { "epoch": 3.36, "learning_rate": 2.49589750478869e-05, "loss": 0.7478, "step": 757 }, { "epoch": 3.37, "learning_rate": 2.4836035277364767e-05, "loss": 0.7247, "step": 758 }, { "epoch": 3.37, "learning_rate": 2.471329890825514e-05, "loss": 0.6335, "step": 759 }, { "epoch": 3.38, "learning_rate": 2.4590766932641353e-05, "loss": 0.6239, "step": 760 }, { "epoch": 3.38, "eval_loss": 0.7320056557655334, "eval_runtime": 16.2722, "eval_samples_per_second": 4.978, "eval_steps_per_second": 1.291, "step": 760 }, { "epoch": 3.38, "learning_rate": 2.446844034095466e-05, "loss": 0.6315, "step": 761 }, { "epoch": 3.39, "learning_rate": 2.4346320121966194e-05, "loss": 0.6445, "step": 762 }, { "epoch": 3.39, "learning_rate": 2.4224407262778925e-05, "loss": 0.5742, "step": 763 }, { "epoch": 3.4, "learning_rate": 2.410270274881981e-05, "loss": 0.581, "step": 764 }, { "epoch": 3.4, "learning_rate": 2.398120756383163e-05, "loss": 0.6181, "step": 765 }, { "epoch": 3.4, "learning_rate": 2.3859922689865234e-05, "loss": 0.624, "step": 766 }, { "epoch": 3.41, "learning_rate": 2.3738849107271476e-05, "loss": 0.6646, "step": 767 }, { "epoch": 3.41, "learning_rate": 2.361798779469336e-05, "loss": 0.6389, "step": 768 }, { "epoch": 3.42, "learning_rate": 2.3497339729058083e-05, "loss": 0.6622, "step": 769 }, { "epoch": 3.42, "learning_rate": 2.3376905885569182e-05, "loss": 0.6558, "step": 770 }, { "epoch": 3.43, "learning_rate": 2.325668723769858e-05, "loss": 0.6956, "step": 771 }, { "epoch": 3.43, "learning_rate": 2.3136684757178806e-05, "loss": 0.6325, "step": 772 }, { "epoch": 3.44, "learning_rate": 2.3016899413995092e-05, "loss": 0.5749, "step": 773 }, { "epoch": 3.44, "learning_rate": 2.2897332176377528e-05, "loss": 0.5596, "step": 774 }, { "epoch": 3.44, "learning_rate": 2.2777984010793264e-05, "loss": 0.6344, "step": 775 }, { "epoch": 3.45, "learning_rate": 2.2658855881938685e-05, "loss": 0.7134, "step": 776 }, { "epoch": 3.45, "learning_rate": 2.2539948752731555e-05, "loss": 0.6477, "step": 777 }, { "epoch": 3.46, "learning_rate": 2.2421263584303355e-05, "loss": 0.7192, "step": 778 }, { "epoch": 3.46, "learning_rate": 2.2302801335991413e-05, "loss": 0.7168, "step": 779 }, { "epoch": 3.47, "learning_rate": 2.2184562965331202e-05, "loss": 0.754, "step": 780 }, { "epoch": 3.47, "eval_loss": 0.7345401048660278, "eval_runtime": 16.2731, "eval_samples_per_second": 4.978, "eval_steps_per_second": 1.29, "step": 780 }, { "epoch": 3.47, "learning_rate": 2.206654942804857e-05, "loss": 0.7341, "step": 781 }, { "epoch": 3.48, "learning_rate": 2.1948761678052028e-05, "loss": 0.7202, "step": 782 }, { "epoch": 3.48, "learning_rate": 2.183120066742506e-05, "loss": 0.7192, "step": 783 }, { "epoch": 3.48, "learning_rate": 2.1713867346418354e-05, "loss": 0.8172, "step": 784 }, { "epoch": 3.49, "learning_rate": 2.1596762663442218e-05, "loss": 0.6996, "step": 785 }, { "epoch": 3.49, "learning_rate": 2.147988756505886e-05, "loss": 0.7763, "step": 786 }, { "epoch": 3.5, "learning_rate": 2.136324299597474e-05, "loss": 0.6673, "step": 787 }, { "epoch": 3.5, "learning_rate": 2.124682989903295e-05, "loss": 0.6465, "step": 788 }, { "epoch": 3.51, "learning_rate": 2.1130649215205584e-05, "loss": 0.7541, "step": 789 }, { "epoch": 3.51, "learning_rate": 2.1014701883586085e-05, "loss": 0.669, "step": 790 }, { "epoch": 3.52, "learning_rate": 2.089898884138177e-05, "loss": 0.7063, "step": 791 }, { "epoch": 3.52, "learning_rate": 2.0783511023906128e-05, "loss": 0.6963, "step": 792 }, { "epoch": 3.52, "learning_rate": 2.0668269364571358e-05, "loss": 0.6246, "step": 793 }, { "epoch": 3.53, "learning_rate": 2.0553264794880756e-05, "loss": 0.658, "step": 794 }, { "epoch": 3.53, "learning_rate": 2.043849824442124e-05, "loss": 0.6113, "step": 795 }, { "epoch": 3.54, "learning_rate": 2.0323970640855748e-05, "loss": 0.6036, "step": 796 }, { "epoch": 3.54, "learning_rate": 2.0209682909915855e-05, "loss": 0.6178, "step": 797 }, { "epoch": 3.55, "learning_rate": 2.0095635975394238e-05, "loss": 0.7395, "step": 798 }, { "epoch": 3.55, "learning_rate": 1.9981830759137186e-05, "loss": 0.6803, "step": 799 }, { "epoch": 3.56, "learning_rate": 1.9868268181037185e-05, "loss": 0.6661, "step": 800 }, { "epoch": 3.56, "eval_loss": 0.7309223413467407, "eval_runtime": 16.2587, "eval_samples_per_second": 4.982, "eval_steps_per_second": 1.292, "step": 800 }, { "epoch": 3.56, "learning_rate": 1.9754949159025415e-05, "loss": 0.665, "step": 801 }, { "epoch": 3.56, "learning_rate": 1.9641874609064443e-05, "loss": 0.6056, "step": 802 }, { "epoch": 3.57, "learning_rate": 1.9529045445140725e-05, "loss": 0.6633, "step": 803 }, { "epoch": 3.57, "learning_rate": 1.9416462579257273e-05, "loss": 0.6446, "step": 804 }, { "epoch": 3.58, "learning_rate": 1.9304126921426236e-05, "loss": 0.6842, "step": 805 }, { "epoch": 3.58, "learning_rate": 1.919203937966161e-05, "loss": 0.6465, "step": 806 }, { "epoch": 3.59, "learning_rate": 1.9080200859971793e-05, "loss": 0.6094, "step": 807 }, { "epoch": 3.59, "learning_rate": 1.8968612266352376e-05, "loss": 0.684, "step": 808 }, { "epoch": 3.6, "learning_rate": 1.885727450077879e-05, "loss": 0.6624, "step": 809 }, { "epoch": 3.6, "learning_rate": 1.8746188463198982e-05, "loss": 0.6649, "step": 810 }, { "epoch": 3.6, "learning_rate": 1.8635355051526195e-05, "loss": 0.7099, "step": 811 }, { "epoch": 3.61, "learning_rate": 1.8524775161631675e-05, "loss": 0.658, "step": 812 }, { "epoch": 3.61, "learning_rate": 1.8414449687337464e-05, "loss": 0.6599, "step": 813 }, { "epoch": 3.62, "learning_rate": 1.8304379520409087e-05, "loss": 0.6879, "step": 814 }, { "epoch": 3.62, "learning_rate": 1.8194565550548476e-05, "loss": 0.6552, "step": 815 }, { "epoch": 3.63, "learning_rate": 1.8085008665386687e-05, "loss": 0.6411, "step": 816 }, { "epoch": 3.63, "learning_rate": 1.7975709750476744e-05, "loss": 0.6946, "step": 817 }, { "epoch": 3.64, "learning_rate": 1.7866669689286496e-05, "loss": 0.693, "step": 818 }, { "epoch": 3.64, "learning_rate": 1.7757889363191483e-05, "loss": 0.6364, "step": 819 }, { "epoch": 3.64, "learning_rate": 1.764936965146773e-05, "loss": 0.6127, "step": 820 }, { "epoch": 3.64, "eval_loss": 0.730964720249176, "eval_runtime": 16.277, "eval_samples_per_second": 4.976, "eval_steps_per_second": 1.29, "step": 820 }, { "epoch": 3.65, "learning_rate": 1.7541111431284773e-05, "loss": 0.5951, "step": 821 }, { "epoch": 3.65, "learning_rate": 1.743311557769847e-05, "loss": 0.6106, "step": 822 }, { "epoch": 3.66, "learning_rate": 1.7325382963643976e-05, "loss": 0.5896, "step": 823 }, { "epoch": 3.66, "learning_rate": 1.7217914459928646e-05, "loss": 0.7827, "step": 824 }, { "epoch": 3.67, "learning_rate": 1.7110710935225055e-05, "loss": 0.5832, "step": 825 }, { "epoch": 3.67, "learning_rate": 1.700377325606388e-05, "loss": 0.7961, "step": 826 }, { "epoch": 3.68, "learning_rate": 1.689710228682702e-05, "loss": 0.6715, "step": 827 }, { "epoch": 3.68, "learning_rate": 1.679069888974052e-05, "loss": 0.6424, "step": 828 }, { "epoch": 3.68, "learning_rate": 1.668456392486762e-05, "loss": 0.7368, "step": 829 }, { "epoch": 3.69, "learning_rate": 1.6578698250101825e-05, "loss": 0.7096, "step": 830 }, { "epoch": 3.69, "learning_rate": 1.6473102721159957e-05, "loss": 0.718, "step": 831 }, { "epoch": 3.7, "learning_rate": 1.6367778191575223e-05, "loss": 0.7177, "step": 832 }, { "epoch": 3.7, "learning_rate": 1.6262725512690345e-05, "loss": 0.7212, "step": 833 }, { "epoch": 3.71, "learning_rate": 1.615794553365066e-05, "loss": 0.6808, "step": 834 }, { "epoch": 3.71, "learning_rate": 1.6053439101397255e-05, "loss": 0.7125, "step": 835 }, { "epoch": 3.72, "learning_rate": 1.5949207060660138e-05, "loss": 0.6934, "step": 836 }, { "epoch": 3.72, "learning_rate": 1.5845250253951393e-05, "loss": 0.6788, "step": 837 }, { "epoch": 3.72, "learning_rate": 1.5741569521558352e-05, "loss": 0.6613, "step": 838 }, { "epoch": 3.73, "learning_rate": 1.5638165701536868e-05, "loss": 0.6954, "step": 839 }, { "epoch": 3.73, "learning_rate": 1.553503962970447e-05, "loss": 0.6538, "step": 840 }, { "epoch": 3.73, "eval_loss": 0.731265664100647, "eval_runtime": 16.2788, "eval_samples_per_second": 4.976, "eval_steps_per_second": 1.29, "step": 840 }, { "epoch": 3.74, "learning_rate": 1.5432192139633645e-05, "loss": 0.6583, "step": 841 }, { "epoch": 3.74, "learning_rate": 1.53296240626451e-05, "loss": 0.6118, "step": 842 }, { "epoch": 3.75, "learning_rate": 1.5227336227801037e-05, "loss": 0.5973, "step": 843 }, { "epoch": 3.75, "learning_rate": 1.5125329461898408e-05, "loss": 0.6266, "step": 844 }, { "epoch": 3.76, "learning_rate": 1.502360458946232e-05, "loss": 0.6101, "step": 845 }, { "epoch": 3.76, "learning_rate": 1.49221624327393e-05, "loss": 0.7363, "step": 846 }, { "epoch": 3.76, "learning_rate": 1.4821003811690686e-05, "loss": 0.6602, "step": 847 }, { "epoch": 3.77, "learning_rate": 1.4720129543985973e-05, "loss": 0.6837, "step": 848 }, { "epoch": 3.77, "learning_rate": 1.4619540444996227e-05, "loss": 0.6254, "step": 849 }, { "epoch": 3.78, "learning_rate": 1.4519237327787449e-05, "loss": 0.6692, "step": 850 }, { "epoch": 3.78, "learning_rate": 1.4419221003114081e-05, "loss": 0.6508, "step": 851 }, { "epoch": 3.79, "learning_rate": 1.4319492279412388e-05, "loss": 0.6578, "step": 852 }, { "epoch": 3.79, "learning_rate": 1.422005196279395e-05, "loss": 0.6257, "step": 853 }, { "epoch": 3.8, "learning_rate": 1.4120900857039126e-05, "loss": 0.6523, "step": 854 }, { "epoch": 3.8, "learning_rate": 1.4022039763590594e-05, "loss": 0.6703, "step": 855 }, { "epoch": 3.8, "learning_rate": 1.3923469481546841e-05, "loss": 0.6597, "step": 856 }, { "epoch": 3.81, "learning_rate": 1.3825190807655692e-05, "loss": 0.6728, "step": 857 }, { "epoch": 3.81, "learning_rate": 1.3727204536307908e-05, "loss": 0.6369, "step": 858 }, { "epoch": 3.82, "learning_rate": 1.3629511459530758e-05, "loss": 0.6164, "step": 859 }, { "epoch": 3.82, "learning_rate": 1.3532112366981598e-05, "loss": 0.6646, "step": 860 }, { "epoch": 3.82, "eval_loss": 0.7303179502487183, "eval_runtime": 16.2683, "eval_samples_per_second": 4.979, "eval_steps_per_second": 1.291, "step": 860 }, { "epoch": 3.83, "learning_rate": 1.3435008045941483e-05, "loss": 0.6108, "step": 861 }, { "epoch": 3.83, "learning_rate": 1.3338199281308855e-05, "loss": 0.7907, "step": 862 }, { "epoch": 3.84, "learning_rate": 1.3241686855593105e-05, "loss": 0.6953, "step": 863 }, { "epoch": 3.84, "learning_rate": 1.3145471548908345e-05, "loss": 0.5835, "step": 864 }, { "epoch": 3.84, "learning_rate": 1.3049554138967051e-05, "loss": 0.6939, "step": 865 }, { "epoch": 3.85, "learning_rate": 1.2953935401073786e-05, "loss": 0.6388, "step": 866 }, { "epoch": 3.85, "learning_rate": 1.2858616108118932e-05, "loss": 0.5938, "step": 867 }, { "epoch": 3.86, "learning_rate": 1.2763597030572449e-05, "loss": 0.6354, "step": 868 }, { "epoch": 3.86, "learning_rate": 1.266887893647764e-05, "loss": 0.6286, "step": 869 }, { "epoch": 3.87, "learning_rate": 1.257446259144494e-05, "loss": 0.6327, "step": 870 }, { "epoch": 3.87, "learning_rate": 1.248034875864575e-05, "loss": 0.5474, "step": 871 }, { "epoch": 3.88, "learning_rate": 1.238653819880623e-05, "loss": 0.6045, "step": 872 }, { "epoch": 3.88, "learning_rate": 1.2293031670201205e-05, "loss": 0.6502, "step": 873 }, { "epoch": 3.88, "learning_rate": 1.2199829928647949e-05, "loss": 0.6524, "step": 874 }, { "epoch": 3.89, "learning_rate": 1.2106933727500169e-05, "loss": 0.6121, "step": 875 }, { "epoch": 3.89, "learning_rate": 1.2014343817641872e-05, "loss": 0.6873, "step": 876 }, { "epoch": 3.9, "learning_rate": 1.192206094748129e-05, "loss": 0.7149, "step": 877 }, { "epoch": 3.9, "learning_rate": 1.183008586294485e-05, "loss": 0.708, "step": 878 }, { "epoch": 3.91, "learning_rate": 1.1738419307471138e-05, "loss": 0.7516, "step": 879 }, { "epoch": 3.91, "learning_rate": 1.1647062022004845e-05, "loss": 0.7091, "step": 880 }, { "epoch": 3.91, "eval_loss": 0.7308124303817749, "eval_runtime": 16.2664, "eval_samples_per_second": 4.98, "eval_steps_per_second": 1.291, "step": 880 }, { "epoch": 3.92, "learning_rate": 1.1556014744990874e-05, "loss": 0.7198, "step": 881 }, { "epoch": 3.92, "learning_rate": 1.1465278212368285e-05, "loss": 0.6556, "step": 882 }, { "epoch": 3.92, "learning_rate": 1.1374853157564391e-05, "loss": 0.6451, "step": 883 }, { "epoch": 3.93, "learning_rate": 1.1284740311488812e-05, "loss": 0.6918, "step": 884 }, { "epoch": 3.93, "learning_rate": 1.1194940402527564e-05, "loss": 0.6494, "step": 885 }, { "epoch": 3.94, "learning_rate": 1.1105454156537204e-05, "loss": 0.6872, "step": 886 }, { "epoch": 3.94, "learning_rate": 1.1016282296838887e-05, "loss": 0.6875, "step": 887 }, { "epoch": 3.95, "learning_rate": 1.0927425544212622e-05, "loss": 0.6745, "step": 888 }, { "epoch": 3.95, "learning_rate": 1.083888461689137e-05, "loss": 0.6712, "step": 889 }, { "epoch": 3.96, "learning_rate": 1.0750660230555271e-05, "loss": 0.6447, "step": 890 }, { "epoch": 3.96, "learning_rate": 1.066275309832584e-05, "loss": 0.6174, "step": 891 }, { "epoch": 3.96, "learning_rate": 1.0575163930760235e-05, "loss": 0.6796, "step": 892 }, { "epoch": 3.97, "learning_rate": 1.0487893435845447e-05, "loss": 0.5846, "step": 893 }, { "epoch": 3.97, "learning_rate": 1.0400942318992668e-05, "loss": 0.6586, "step": 894 }, { "epoch": 3.98, "learning_rate": 1.031431128303153e-05, "loss": 0.6335, "step": 895 }, { "epoch": 3.98, "learning_rate": 1.0228001028204426e-05, "loss": 0.6112, "step": 896 }, { "epoch": 3.99, "learning_rate": 1.0142012252160877e-05, "loss": 0.6517, "step": 897 }, { "epoch": 3.99, "learning_rate": 1.0056345649951881e-05, "loss": 0.5948, "step": 898 }, { "epoch": 4.0, "learning_rate": 9.971001914024247e-06, "loss": 0.6068, "step": 899 }, { "epoch": 4.0, "learning_rate": 9.885981734215094e-06, "loss": 0.6261, "step": 900 }, { "epoch": 4.0, "eval_loss": 0.7309070229530334, "eval_runtime": 16.3022, "eval_samples_per_second": 4.969, "eval_steps_per_second": 1.288, "step": 900 }, { "epoch": 4.0, "learning_rate": 9.80128579774619e-06, "loss": 0.7467, "step": 901 }, { "epoch": 4.01, "learning_rate": 9.716914789218468e-06, "loss": 0.6475, "step": 902 }, { "epoch": 4.01, "learning_rate": 9.63286939060643e-06, "loss": 0.6467, "step": 903 }, { "epoch": 4.02, "learning_rate": 9.549150281252633e-06, "loss": 0.7299, "step": 904 }, { "epoch": 4.02, "learning_rate": 9.465758137862263e-06, "loss": 0.6303, "step": 905 }, { "epoch": 4.03, "learning_rate": 9.382693634497609e-06, "loss": 0.6799, "step": 906 }, { "epoch": 4.03, "learning_rate": 9.299957442572643e-06, "loss": 0.6997, "step": 907 }, { "epoch": 4.04, "learning_rate": 9.217550230847565e-06, "loss": 0.7221, "step": 908 }, { "epoch": 4.04, "learning_rate": 9.135472665423433e-06, "loss": 0.6812, "step": 909 }, { "epoch": 4.04, "learning_rate": 9.053725409736751e-06, "loss": 0.8037, "step": 910 }, { "epoch": 4.05, "learning_rate": 8.972309124554101e-06, "loss": 0.6067, "step": 911 }, { "epoch": 4.05, "learning_rate": 8.891224467966824e-06, "loss": 0.6613, "step": 912 }, { "epoch": 4.06, "learning_rate": 8.810472095385713e-06, "loss": 0.6712, "step": 913 }, { "epoch": 4.06, "learning_rate": 8.730052659535675e-06, "loss": 0.6991, "step": 914 }, { "epoch": 4.07, "learning_rate": 8.649966810450472e-06, "loss": 0.6237, "step": 915 }, { "epoch": 4.07, "learning_rate": 8.570215195467502e-06, "loss": 0.6039, "step": 916 }, { "epoch": 4.08, "learning_rate": 8.490798459222476e-06, "loss": 0.6738, "step": 917 }, { "epoch": 4.08, "learning_rate": 8.411717243644312e-06, "loss": 0.6461, "step": 918 }, { "epoch": 4.08, "learning_rate": 8.332972187949889e-06, "loss": 0.7234, "step": 919 }, { "epoch": 4.09, "learning_rate": 8.254563928638893e-06, "loss": 0.6206, "step": 920 }, { "epoch": 4.09, "eval_loss": 0.7307308316230774, "eval_runtime": 16.2336, "eval_samples_per_second": 4.99, "eval_steps_per_second": 1.294, "step": 920 }, { "epoch": 4.09, "learning_rate": 8.176493099488663e-06, "loss": 0.5918, "step": 921 }, { "epoch": 4.1, "learning_rate": 8.098760331549087e-06, "loss": 0.7177, "step": 922 }, { "epoch": 4.1, "learning_rate": 8.02136625313747e-06, "loss": 0.6874, "step": 923 }, { "epoch": 4.11, "learning_rate": 7.94431148983349e-06, "loss": 0.631, "step": 924 }, { "epoch": 4.11, "learning_rate": 7.86759666447412e-06, "loss": 0.6572, "step": 925 }, { "epoch": 4.12, "learning_rate": 7.791222397148612e-06, "loss": 0.6798, "step": 926 }, { "epoch": 4.12, "learning_rate": 7.715189305193454e-06, "loss": 0.6469, "step": 927 }, { "epoch": 4.12, "learning_rate": 7.639498003187417e-06, "loss": 0.6807, "step": 928 }, { "epoch": 4.13, "learning_rate": 7.564149102946572e-06, "loss": 0.6737, "step": 929 }, { "epoch": 4.13, "learning_rate": 7.489143213519301e-06, "loss": 0.608, "step": 930 }, { "epoch": 4.14, "learning_rate": 7.414480941181462e-06, "loss": 0.6284, "step": 931 }, { "epoch": 4.14, "learning_rate": 7.340162889431418e-06, "loss": 0.5924, "step": 932 }, { "epoch": 4.15, "learning_rate": 7.2661896589851895e-06, "loss": 0.6561, "step": 933 }, { "epoch": 4.15, "learning_rate": 7.192561847771589e-06, "loss": 0.5905, "step": 934 }, { "epoch": 4.16, "learning_rate": 7.119280050927407e-06, "loss": 0.6716, "step": 935 }, { "epoch": 4.16, "learning_rate": 7.046344860792525e-06, "loss": 0.6473, "step": 936 }, { "epoch": 4.16, "learning_rate": 6.973756866905279e-06, "loss": 0.6545, "step": 937 }, { "epoch": 4.17, "learning_rate": 6.901516655997536e-06, "loss": 0.6174, "step": 938 }, { "epoch": 4.17, "learning_rate": 6.829624811990038e-06, "loss": 0.636, "step": 939 }, { "epoch": 4.18, "learning_rate": 6.758081915987669e-06, "loss": 0.6257, "step": 940 }, { "epoch": 4.18, "eval_loss": 0.730650007724762, "eval_runtime": 16.243, "eval_samples_per_second": 4.987, "eval_steps_per_second": 1.293, "step": 940 }, { "epoch": 4.18, "learning_rate": 6.6868885462747276e-06, "loss": 0.6443, "step": 941 }, { "epoch": 4.19, "learning_rate": 6.616045278310301e-06, "loss": 0.672, "step": 942 }, { "epoch": 4.19, "learning_rate": 6.5455526847235825e-06, "loss": 0.6328, "step": 943 }, { "epoch": 4.2, "learning_rate": 6.475411335309245e-06, "loss": 0.5984, "step": 944 }, { "epoch": 4.2, "learning_rate": 6.405621797022848e-06, "loss": 0.5437, "step": 945 }, { "epoch": 4.2, "learning_rate": 6.3361846339762386e-06, "loss": 0.6539, "step": 946 }, { "epoch": 4.21, "learning_rate": 6.2671004074329965e-06, "loss": 0.6656, "step": 947 }, { "epoch": 4.21, "learning_rate": 6.1983696758039e-06, "loss": 0.5519, "step": 948 }, { "epoch": 4.22, "learning_rate": 6.129992994642425e-06, "loss": 0.6081, "step": 949 }, { "epoch": 4.22, "learning_rate": 6.061970916640236e-06, "loss": 0.6414, "step": 950 }, { "epoch": 4.23, "learning_rate": 5.9943039916227294e-06, "loss": 0.7595, "step": 951 }, { "epoch": 4.23, "learning_rate": 5.926992766544576e-06, "loss": 0.6684, "step": 952 }, { "epoch": 4.24, "learning_rate": 5.860037785485301e-06, "loss": 0.6643, "step": 953 }, { "epoch": 4.24, "learning_rate": 5.793439589644917e-06, "loss": 0.6311, "step": 954 }, { "epoch": 4.24, "learning_rate": 5.727198717339511e-06, "loss": 0.7277, "step": 955 }, { "epoch": 4.25, "learning_rate": 5.6613157039969055e-06, "loss": 0.6828, "step": 956 }, { "epoch": 4.25, "learning_rate": 5.595791082152352e-06, "loss": 0.6596, "step": 957 }, { "epoch": 4.26, "learning_rate": 5.530625381444182e-06, "loss": 0.6875, "step": 958 }, { "epoch": 4.26, "learning_rate": 5.4658191286095895e-06, "loss": 0.7416, "step": 959 }, { "epoch": 4.27, "learning_rate": 5.4013728474802846e-06, "loss": 0.7023, "step": 960 }, { "epoch": 4.27, "eval_loss": 0.7306858897209167, "eval_runtime": 16.2653, "eval_samples_per_second": 4.98, "eval_steps_per_second": 1.291, "step": 960 }, { "epoch": 4.27, "learning_rate": 5.337287058978347e-06, "loss": 0.6474, "step": 961 }, { "epoch": 4.28, "learning_rate": 5.2735622811119715e-06, "loss": 0.5983, "step": 962 }, { "epoch": 4.28, "learning_rate": 5.210199028971291e-06, "loss": 0.6468, "step": 963 }, { "epoch": 4.28, "learning_rate": 5.1471978147241974e-06, "loss": 0.7072, "step": 964 }, { "epoch": 4.29, "learning_rate": 5.0845591476122444e-06, "loss": 0.6677, "step": 965 }, { "epoch": 4.29, "learning_rate": 5.022283533946448e-06, "loss": 0.655, "step": 966 }, { "epoch": 4.3, "learning_rate": 4.960371477103304e-06, "loss": 0.6534, "step": 967 }, { "epoch": 4.3, "learning_rate": 4.898823477520625e-06, "loss": 0.5957, "step": 968 }, { "epoch": 4.31, "learning_rate": 4.837640032693558e-06, "loss": 0.6261, "step": 969 }, { "epoch": 4.31, "learning_rate": 4.776821637170526e-06, "loss": 0.6309, "step": 970 }, { "epoch": 4.32, "learning_rate": 4.7163687825492355e-06, "loss": 0.5786, "step": 971 }, { "epoch": 4.32, "learning_rate": 4.65628195747273e-06, "loss": 0.6396, "step": 972 }, { "epoch": 4.32, "learning_rate": 4.596561647625397e-06, "loss": 0.6884, "step": 973 }, { "epoch": 4.33, "learning_rate": 4.537208335729088e-06, "loss": 0.5858, "step": 974 }, { "epoch": 4.33, "learning_rate": 4.478222501539175e-06, "loss": 0.7217, "step": 975 }, { "epoch": 4.34, "learning_rate": 4.419604621840717e-06, "loss": 0.6265, "step": 976 }, { "epoch": 4.34, "learning_rate": 4.3613551704445365e-06, "loss": 0.5914, "step": 977 }, { "epoch": 4.35, "learning_rate": 4.303474618183484e-06, "loss": 0.662, "step": 978 }, { "epoch": 4.35, "learning_rate": 4.245963432908556e-06, "loss": 0.6491, "step": 979 }, { "epoch": 4.36, "learning_rate": 4.188822079485139e-06, "loss": 0.6101, "step": 980 }, { "epoch": 4.36, "eval_loss": 0.7306349277496338, "eval_runtime": 16.2893, "eval_samples_per_second": 4.973, "eval_steps_per_second": 1.289, "step": 980 }, { "epoch": 4.36, "learning_rate": 4.132051019789263e-06, "loss": 0.6338, "step": 981 }, { "epoch": 4.36, "learning_rate": 4.075650712703849e-06, "loss": 0.5961, "step": 982 }, { "epoch": 4.37, "learning_rate": 4.0196216141150214e-06, "loss": 0.622, "step": 983 }, { "epoch": 4.37, "learning_rate": 3.963964176908391e-06, "loss": 0.6119, "step": 984 }, { "epoch": 4.38, "learning_rate": 3.908678850965425e-06, "loss": 0.6194, "step": 985 }, { "epoch": 4.38, "learning_rate": 3.853766083159799e-06, "loss": 0.6721, "step": 986 }, { "epoch": 4.39, "learning_rate": 3.799226317353788e-06, "loss": 0.6138, "step": 987 }, { "epoch": 4.39, "learning_rate": 3.7450599943946728e-06, "loss": 0.6869, "step": 988 }, { "epoch": 4.4, "learning_rate": 3.691267552111183e-06, "loss": 0.6511, "step": 989 }, { "epoch": 4.4, "learning_rate": 3.6378494253099304e-06, "loss": 0.6247, "step": 990 }, { "epoch": 4.4, "learning_rate": 3.5848060457719547e-06, "loss": 0.6228, "step": 991 }, { "epoch": 4.41, "learning_rate": 3.532137842249178e-06, "loss": 0.5655, "step": 992 }, { "epoch": 4.41, "learning_rate": 3.4798452404609603e-06, "loss": 0.5822, "step": 993 }, { "epoch": 4.42, "learning_rate": 3.4279286630906572e-06, "loss": 0.6357, "step": 994 }, { "epoch": 4.42, "learning_rate": 3.376388529782215e-06, "loss": 0.6089, "step": 995 }, { "epoch": 4.43, "learning_rate": 3.325225257136738e-06, "loss": 0.6363, "step": 996 }, { "epoch": 4.43, "learning_rate": 3.2744392587091722e-06, "loss": 0.5946, "step": 997 }, { "epoch": 4.44, "learning_rate": 3.2240309450049355e-06, "loss": 0.5804, "step": 998 }, { "epoch": 4.44, "learning_rate": 3.1740007234766002e-06, "loss": 0.6326, "step": 999 }, { "epoch": 4.44, "learning_rate": 3.1243489985206097e-06, "loss": 0.6465, "step": 1000 }, { "epoch": 4.44, "eval_loss": 0.7306743860244751, "eval_runtime": 16.3111, "eval_samples_per_second": 4.966, "eval_steps_per_second": 1.287, "step": 1000 }, { "epoch": 4.45, "learning_rate": 3.075076171473995e-06, "loss": 0.7062, "step": 1001 }, { "epoch": 4.45, "learning_rate": 3.0261826406111525e-06, "loss": 0.7212, "step": 1002 }, { "epoch": 4.46, "learning_rate": 2.9776688011405817e-06, "loss": 0.7193, "step": 1003 }, { "epoch": 4.46, "learning_rate": 2.9295350452017535e-06, "loss": 0.7064, "step": 1004 }, { "epoch": 4.47, "learning_rate": 2.8817817618618847e-06, "loss": 0.7188, "step": 1005 }, { "epoch": 4.47, "learning_rate": 2.8344093371128424e-06, "loss": 0.7245, "step": 1006 }, { "epoch": 4.48, "learning_rate": 2.787418153867971e-06, "loss": 0.7176, "step": 1007 }, { "epoch": 4.48, "learning_rate": 2.7408085919590264e-06, "loss": 0.7244, "step": 1008 }, { "epoch": 4.48, "learning_rate": 2.6945810281331085e-06, "loss": 0.6673, "step": 1009 }, { "epoch": 4.49, "learning_rate": 2.648735836049615e-06, "loss": 0.6861, "step": 1010 }, { "epoch": 4.49, "learning_rate": 2.6032733862772105e-06, "loss": 0.6591, "step": 1011 }, { "epoch": 4.5, "learning_rate": 2.558194046290835e-06, "loss": 0.6897, "step": 1012 }, { "epoch": 4.5, "learning_rate": 2.5134981804687484e-06, "loss": 0.6069, "step": 1013 }, { "epoch": 4.51, "learning_rate": 2.4691861500895474e-06, "loss": 0.6301, "step": 1014 }, { "epoch": 4.51, "learning_rate": 2.4252583133292927e-06, "loss": 0.6238, "step": 1015 }, { "epoch": 4.52, "learning_rate": 2.381715025258585e-06, "loss": 0.6159, "step": 1016 }, { "epoch": 4.52, "learning_rate": 2.3385566378397007e-06, "loss": 0.6171, "step": 1017 }, { "epoch": 4.52, "learning_rate": 2.2957834999237424e-06, "loss": 0.5524, "step": 1018 }, { "epoch": 4.53, "learning_rate": 2.2533959572478392e-06, "loss": 0.6197, "step": 1019 }, { "epoch": 4.53, "learning_rate": 2.2113943524323167e-06, "loss": 0.7135, "step": 1020 }, { "epoch": 4.53, "eval_loss": 0.730741024017334, "eval_runtime": 16.3211, "eval_samples_per_second": 4.963, "eval_steps_per_second": 1.287, "step": 1020 }, { "epoch": 4.54, "learning_rate": 2.1697790249779636e-06, "loss": 0.6207, "step": 1021 }, { "epoch": 4.54, "learning_rate": 2.128550311263261e-06, "loss": 0.6479, "step": 1022 }, { "epoch": 4.55, "learning_rate": 2.087708544541689e-06, "loss": 0.6496, "step": 1023 }, { "epoch": 4.55, "learning_rate": 2.0472540549390074e-06, "loss": 0.6942, "step": 1024 }, { "epoch": 4.56, "learning_rate": 2.007187169450603e-06, "loss": 0.7154, "step": 1025 }, { "epoch": 4.56, "learning_rate": 1.9675082119388344e-06, "loss": 0.6424, "step": 1026 }, { "epoch": 4.56, "learning_rate": 1.9282175031304305e-06, "loss": 0.6996, "step": 1027 }, { "epoch": 4.57, "learning_rate": 1.8893153606138802e-06, "loss": 0.633, "step": 1028 }, { "epoch": 4.57, "learning_rate": 1.85080209883689e-06, "loss": 0.6168, "step": 1029 }, { "epoch": 4.58, "learning_rate": 1.8126780291038037e-06, "loss": 0.5781, "step": 1030 }, { "epoch": 4.58, "learning_rate": 1.7749434595731363e-06, "loss": 0.663, "step": 1031 }, { "epoch": 4.59, "learning_rate": 1.7375986952550326e-06, "loss": 0.655, "step": 1032 }, { "epoch": 4.59, "learning_rate": 1.7006440380088362e-06, "loss": 0.6346, "step": 1033 }, { "epoch": 4.6, "learning_rate": 1.6640797865406288e-06, "loss": 0.6138, "step": 1034 }, { "epoch": 4.6, "learning_rate": 1.6279062364008447e-06, "loss": 0.5901, "step": 1035 }, { "epoch": 4.6, "learning_rate": 1.5921236799818384e-06, "loss": 0.5759, "step": 1036 }, { "epoch": 4.61, "learning_rate": 1.5567324065155653e-06, "loss": 0.6452, "step": 1037 }, { "epoch": 4.61, "learning_rate": 1.5217327020712158e-06, "loss": 0.5874, "step": 1038 }, { "epoch": 4.62, "learning_rate": 1.4871248495529011e-06, "loss": 0.6349, "step": 1039 }, { "epoch": 4.62, "learning_rate": 1.4529091286973995e-06, "loss": 0.6658, "step": 1040 }, { "epoch": 4.62, "eval_loss": 0.7307910919189453, "eval_runtime": 16.3086, "eval_samples_per_second": 4.967, "eval_steps_per_second": 1.288, "step": 1040 }, { "epoch": 4.63, "learning_rate": 1.4190858160718467e-06, "loss": 0.6129, "step": 1041 }, { "epoch": 4.63, "learning_rate": 1.3856551850715494e-06, "loss": 0.6186, "step": 1042 }, { "epoch": 4.64, "learning_rate": 1.352617505917736e-06, "loss": 0.6641, "step": 1043 }, { "epoch": 4.64, "learning_rate": 1.3199730456553926e-06, "loss": 0.5646, "step": 1044 }, { "epoch": 4.64, "learning_rate": 1.2877220681510927e-06, "loss": 0.5679, "step": 1045 }, { "epoch": 4.65, "learning_rate": 1.2558648340908862e-06, "loss": 0.6184, "step": 1046 }, { "epoch": 4.65, "learning_rate": 1.2244016009781701e-06, "loss": 0.6944, "step": 1047 }, { "epoch": 4.66, "learning_rate": 1.19333262313161e-06, "loss": 0.5864, "step": 1048 }, { "epoch": 4.66, "learning_rate": 1.1626581516831048e-06, "loss": 0.5279, "step": 1049 }, { "epoch": 4.67, "learning_rate": 1.1323784345757205e-06, "loss": 0.6324, "step": 1050 }, { "epoch": 4.67, "learning_rate": 1.1024937165617145e-06, "loss": 0.7198, "step": 1051 }, { "epoch": 4.68, "learning_rate": 1.0730042392005647e-06, "loss": 0.7036, "step": 1052 }, { "epoch": 4.68, "learning_rate": 1.0439102408569769e-06, "loss": 0.73, "step": 1053 }, { "epoch": 4.68, "learning_rate": 1.0152119566990025e-06, "loss": 0.7607, "step": 1054 }, { "epoch": 4.69, "learning_rate": 9.869096186961025e-07, "loss": 0.7188, "step": 1055 }, { "epoch": 4.69, "learning_rate": 9.590034556173079e-07, "loss": 0.7229, "step": 1056 }, { "epoch": 4.7, "learning_rate": 9.314936930293283e-07, "loss": 0.7239, "step": 1057 }, { "epoch": 4.7, "learning_rate": 9.043805532947647e-07, "loss": 0.664, "step": 1058 }, { "epoch": 4.71, "learning_rate": 8.776642555702985e-07, "loss": 0.7025, "step": 1059 }, { "epoch": 4.71, "learning_rate": 8.513450158049108e-07, "loss": 0.6855, "step": 1060 }, { "epoch": 4.71, "eval_loss": 0.7306917309761047, "eval_runtime": 16.3221, "eval_samples_per_second": 4.963, "eval_steps_per_second": 1.287, "step": 1060 }, { "epoch": 4.72, "learning_rate": 8.254230467381551e-07, "loss": 0.6641, "step": 1061 }, { "epoch": 4.72, "learning_rate": 7.99898557898432e-07, "loss": 0.7383, "step": 1062 }, { "epoch": 4.72, "learning_rate": 7.747717556012779e-07, "loss": 0.6531, "step": 1063 }, { "epoch": 4.73, "learning_rate": 7.50042842947718e-07, "loss": 0.6781, "step": 1064 }, { "epoch": 4.73, "learning_rate": 7.25712019822622e-07, "loss": 0.6386, "step": 1065 }, { "epoch": 4.74, "learning_rate": 7.01779482893089e-07, "loss": 0.6938, "step": 1066 }, { "epoch": 4.74, "learning_rate": 6.782454256068438e-07, "loss": 0.7193, "step": 1067 }, { "epoch": 4.75, "learning_rate": 6.551100381906872e-07, "loss": 0.5737, "step": 1068 }, { "epoch": 4.75, "learning_rate": 6.323735076489535e-07, "loss": 0.6546, "step": 1069 }, { "epoch": 4.76, "learning_rate": 6.100360177619946e-07, "loss": 0.6558, "step": 1070 }, { "epoch": 4.76, "learning_rate": 5.880977490847151e-07, "loss": 0.5834, "step": 1071 }, { "epoch": 4.76, "learning_rate": 5.665588789450782e-07, "loss": 0.6137, "step": 1072 }, { "epoch": 4.77, "learning_rate": 5.454195814427021e-07, "loss": 0.7225, "step": 1073 }, { "epoch": 4.77, "learning_rate": 5.246800274474439e-07, "loss": 0.6166, "step": 1074 }, { "epoch": 4.78, "learning_rate": 5.043403845980122e-07, "loss": 0.6703, "step": 1075 }, { "epoch": 4.78, "learning_rate": 4.844008173006232e-07, "loss": 0.5972, "step": 1076 }, { "epoch": 4.79, "learning_rate": 4.6486148672764687e-07, "loss": 0.6436, "step": 1077 }, { "epoch": 4.79, "learning_rate": 4.457225508163465e-07, "loss": 0.6859, "step": 1078 }, { "epoch": 4.8, "learning_rate": 4.269841642675576e-07, "loss": 0.6171, "step": 1079 }, { "epoch": 4.8, "learning_rate": 4.086464785444777e-07, "loss": 0.5769, "step": 1080 }, { "epoch": 4.8, "eval_loss": 0.7307865023612976, "eval_runtime": 16.328, "eval_samples_per_second": 4.961, "eval_steps_per_second": 1.286, "step": 1080 }, { "epoch": 4.8, "learning_rate": 3.907096418714062e-07, "loss": 0.6674, "step": 1081 }, { "epoch": 4.81, "learning_rate": 3.7317379923256233e-07, "loss": 0.7353, "step": 1082 }, { "epoch": 4.81, "learning_rate": 3.5603909237092445e-07, "loss": 0.7192, "step": 1083 }, { "epoch": 4.82, "learning_rate": 3.393056597870703e-07, "loss": 0.7237, "step": 1084 }, { "epoch": 4.82, "learning_rate": 3.229736367380498e-07, "loss": 0.7342, "step": 1085 }, { "epoch": 4.83, "learning_rate": 3.0704315523631953e-07, "loss": 0.6708, "step": 1086 }, { "epoch": 4.83, "learning_rate": 2.915143440486379e-07, "loss": 0.5742, "step": 1087 }, { "epoch": 4.84, "learning_rate": 2.7638732869506025e-07, "loss": 0.5832, "step": 1088 }, { "epoch": 4.84, "learning_rate": 2.616622314479067e-07, "loss": 0.5766, "step": 1089 }, { "epoch": 4.84, "learning_rate": 2.473391713307738e-07, "loss": 0.6198, "step": 1090 }, { "epoch": 4.85, "learning_rate": 2.334182641175686e-07, "loss": 0.6962, "step": 1091 }, { "epoch": 4.85, "learning_rate": 2.19899622331593e-07, "loss": 0.596, "step": 1092 }, { "epoch": 4.86, "learning_rate": 2.0678335524460524e-07, "loss": 0.6843, "step": 1093 }, { "epoch": 4.86, "learning_rate": 1.9406956887595418e-07, "loss": 0.6217, "step": 1094 }, { "epoch": 4.87, "learning_rate": 1.8175836599173546e-07, "loss": 0.6389, "step": 1095 }, { "epoch": 4.87, "learning_rate": 1.6984984610392552e-07, "loss": 0.6922, "step": 1096 }, { "epoch": 4.88, "learning_rate": 1.5834410546960443e-07, "loss": 0.649, "step": 1097 }, { "epoch": 4.88, "learning_rate": 1.4724123709017878e-07, "loss": 0.6382, "step": 1098 }, { "epoch": 4.88, "learning_rate": 1.3654133071059893e-07, "loss": 0.6124, "step": 1099 }, { "epoch": 4.89, "learning_rate": 1.2624447281867624e-07, "loss": 0.5613, "step": 1100 }, { "epoch": 4.89, "eval_loss": 0.7309412956237793, "eval_runtime": 16.3302, "eval_samples_per_second": 4.96, "eval_steps_per_second": 1.286, "step": 1100 }, { "epoch": 4.89, "learning_rate": 1.1635074664436141e-07, "loss": 0.7437, "step": 1101 }, { "epoch": 4.9, "learning_rate": 1.0686023215906171e-07, "loss": 0.6838, "step": 1102 }, { "epoch": 4.9, "learning_rate": 9.777300607501927e-08, "loss": 0.7432, "step": 1103 }, { "epoch": 4.91, "learning_rate": 8.90891418446782e-08, "loss": 0.6784, "step": 1104 }, { "epoch": 4.91, "learning_rate": 8.080870966008514e-08, "loss": 0.7191, "step": 1105 }, { "epoch": 4.92, "learning_rate": 7.293177645232297e-08, "loss": 0.7513, "step": 1106 }, { "epoch": 4.92, "learning_rate": 6.545840589099461e-08, "loss": 0.673, "step": 1107 }, { "epoch": 4.92, "learning_rate": 5.838865838366792e-08, "loss": 0.6339, "step": 1108 }, { "epoch": 4.93, "learning_rate": 5.1722591075425986e-08, "loss": 0.7272, "step": 1109 }, { "epoch": 4.93, "learning_rate": 4.5460257848373156e-08, "loss": 0.7292, "step": 1110 }, { "epoch": 4.94, "learning_rate": 3.96017093212131e-08, "loss": 0.6564, "step": 1111 }, { "epoch": 4.94, "learning_rate": 3.4146992848854695e-08, "loss": 0.6613, "step": 1112 }, { "epoch": 4.95, "learning_rate": 2.9096152522006815e-08, "loss": 0.6325, "step": 1113 }, { "epoch": 4.95, "learning_rate": 2.4449229166823016e-08, "loss": 0.6954, "step": 1114 }, { "epoch": 4.96, "learning_rate": 2.020626034459072e-08, "loss": 0.7139, "step": 1115 }, { "epoch": 4.96, "learning_rate": 1.636728035140922e-08, "loss": 0.6613, "step": 1116 }, { "epoch": 4.96, "learning_rate": 1.2932320217917682e-08, "loss": 0.6576, "step": 1117 }, { "epoch": 4.97, "learning_rate": 9.901407709050902e-09, "loss": 0.6583, "step": 1118 }, { "epoch": 4.97, "learning_rate": 7.2745673238006076e-09, "loss": 0.6937, "step": 1119 }, { "epoch": 4.98, "learning_rate": 5.051820295032261e-09, "loss": 0.6591, "step": 1120 }, { "epoch": 4.98, "eval_loss": 0.7308295965194702, "eval_runtime": 16.298, "eval_samples_per_second": 4.97, "eval_steps_per_second": 1.288, "step": 1120 }, { "epoch": 4.98, "learning_rate": 3.2331845893074366e-09, "loss": 0.7024, "step": 1121 }, { "epoch": 4.99, "learning_rate": 1.8186749067339303e-09, "loss": 0.6562, "step": 1122 }, { "epoch": 4.99, "learning_rate": 8.083026808602956e-10, "loss": 0.5939, "step": 1123 }, { "epoch": 5.0, "learning_rate": 2.020760785648168e-10, "loss": 0.6105, "step": 1124 }, { "epoch": 5.0, "learning_rate": 0.0, "loss": 0.6582, "step": 1125 } ], "logging_steps": 1, "max_steps": 1125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 20, "total_flos": 2.6726410071834624e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }