{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "global_step": 862850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.112668990819203e-05, "loss": 8.0034, "step": 1000 }, { "epoch": 0.0, "learning_rate": 4.533733258325905e-05, "loss": 7.5343, "step": 2000 }, { "epoch": 0.01, "learning_rate": 4.7778615089460934e-05, "loss": 7.2281, "step": 3000 }, { "epoch": 0.01, "learning_rate": 4.9509713131658946e-05, "loss": 6.5225, "step": 4000 }, { "epoch": 0.01, "learning_rate": 5e-05, "loss": 6.0666, "step": 5000 }, { "epoch": 0.01, "learning_rate": 5e-05, "loss": 5.7596, "step": 6000 }, { "epoch": 0.02, "learning_rate": 5e-05, "loss": 5.5413, "step": 7000 }, { "epoch": 0.02, "learning_rate": 5e-05, "loss": 5.3661, "step": 8000 }, { "epoch": 0.02, "learning_rate": 5e-05, "loss": 5.2017, "step": 9000 }, { "epoch": 0.02, "learning_rate": 5e-05, "loss": 5.052, "step": 10000 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 4.9005, "step": 11000 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 4.7597, "step": 12000 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 4.6439, "step": 13000 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 4.5544, "step": 14000 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 4.4742, "step": 15000 }, { "epoch": 0.04, "learning_rate": 5e-05, "loss": 4.4085, "step": 16000 }, { "epoch": 0.04, "learning_rate": 5e-05, "loss": 4.3437, "step": 17000 }, { "epoch": 0.04, "learning_rate": 5e-05, "loss": 4.2957, "step": 18000 }, { "epoch": 0.04, "learning_rate": 5e-05, "loss": 4.2381, "step": 19000 }, { "epoch": 0.05, "learning_rate": 5e-05, "loss": 4.1846, "step": 20000 }, { "epoch": 0.05, "learning_rate": 5e-05, "loss": 4.1399, "step": 21000 }, { "epoch": 0.05, "learning_rate": 5e-05, "loss": 4.1097, "step": 22000 }, { "epoch": 0.05, "learning_rate": 5e-05, "loss": 4.0513, "step": 23000 }, { "epoch": 0.06, "learning_rate": 5e-05, "loss": 4.0177, "step": 24000 }, { "epoch": 0.06, "learning_rate": 5e-05, "loss": 3.9807, "step": 25000 }, { "epoch": 0.06, "learning_rate": 5e-05, "loss": 3.9417, "step": 26000 }, { "epoch": 0.06, "learning_rate": 5e-05, "loss": 3.9119, "step": 27000 }, { "epoch": 0.06, "learning_rate": 5e-05, "loss": 3.8789, "step": 28000 }, { "epoch": 0.07, "learning_rate": 5e-05, "loss": 3.8467, "step": 29000 }, { "epoch": 0.07, "learning_rate": 5e-05, "loss": 3.8162, "step": 30000 }, { "epoch": 0.07, "learning_rate": 5e-05, "loss": 3.7815, "step": 31000 }, { "epoch": 0.07, "learning_rate": 5e-05, "loss": 3.755, "step": 32000 }, { "epoch": 0.08, "learning_rate": 5e-05, "loss": 3.7321, "step": 33000 }, { "epoch": 0.08, "learning_rate": 5e-05, "loss": 3.7082, "step": 34000 }, { "epoch": 0.08, "learning_rate": 5e-05, "loss": 3.6796, "step": 35000 }, { "epoch": 0.08, "learning_rate": 5e-05, "loss": 3.6481, "step": 36000 }, { "epoch": 0.09, "learning_rate": 5e-05, "loss": 3.6342, "step": 37000 }, { "epoch": 0.09, "learning_rate": 5e-05, "loss": 3.5971, "step": 38000 }, { "epoch": 0.09, "learning_rate": 5e-05, "loss": 3.5824, "step": 39000 }, { "epoch": 0.09, "learning_rate": 5e-05, "loss": 3.5602, "step": 40000 }, { "epoch": 0.1, "learning_rate": 5e-05, "loss": 3.5299, "step": 41000 }, { "epoch": 0.1, "learning_rate": 5e-05, "loss": 3.5166, "step": 42000 }, { "epoch": 0.1, "learning_rate": 5e-05, "loss": 3.494, "step": 43000 }, { "epoch": 0.1, "learning_rate": 5e-05, "loss": 3.4759, "step": 44000 }, { "epoch": 0.1, "learning_rate": 5e-05, "loss": 3.4558, "step": 45000 }, { "epoch": 0.11, "learning_rate": 5e-05, "loss": 3.4323, "step": 46000 }, { "epoch": 0.11, "learning_rate": 5e-05, "loss": 3.415, "step": 47000 }, { "epoch": 0.11, "learning_rate": 5e-05, "loss": 3.396, "step": 48000 }, { "epoch": 0.11, "learning_rate": 5e-05, "loss": 3.3864, "step": 49000 }, { "epoch": 0.12, "learning_rate": 5e-05, "loss": 3.3646, "step": 50000 }, { "epoch": 0.12, "learning_rate": 5e-05, "loss": 3.3441, "step": 51000 }, { "epoch": 0.12, "learning_rate": 5e-05, "loss": 3.3341, "step": 52000 }, { "epoch": 0.12, "learning_rate": 5e-05, "loss": 3.3227, "step": 53000 }, { "epoch": 0.13, "learning_rate": 5e-05, "loss": 3.3056, "step": 54000 }, { "epoch": 0.13, "learning_rate": 5e-05, "loss": 3.289, "step": 55000 }, { "epoch": 0.13, "learning_rate": 5e-05, "loss": 3.2696, "step": 56000 }, { "epoch": 0.13, "learning_rate": 5e-05, "loss": 3.258, "step": 57000 }, { "epoch": 0.13, "learning_rate": 5e-05, "loss": 3.2438, "step": 58000 }, { "epoch": 0.14, "learning_rate": 5e-05, "loss": 3.223, "step": 59000 }, { "epoch": 0.14, "learning_rate": 5e-05, "loss": 3.2181, "step": 60000 }, { "epoch": 0.14, "learning_rate": 5e-05, "loss": 3.2049, "step": 61000 }, { "epoch": 0.14, "learning_rate": 5e-05, "loss": 3.1886, "step": 62000 }, { "epoch": 0.15, "learning_rate": 5e-05, "loss": 3.183, "step": 63000 }, { "epoch": 0.15, "learning_rate": 5e-05, "loss": 3.1623, "step": 64000 }, { "epoch": 0.15, "learning_rate": 5e-05, "loss": 3.1553, "step": 65000 }, { "epoch": 0.15, "learning_rate": 5e-05, "loss": 3.1407, "step": 66000 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 3.1316, "step": 67000 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 3.1209, "step": 68000 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 3.1047, "step": 69000 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 3.0984, "step": 70000 }, { "epoch": 0.16, "learning_rate": 5e-05, "loss": 3.0901, "step": 71000 }, { "epoch": 0.17, "learning_rate": 5e-05, "loss": 3.0713, "step": 72000 }, { "epoch": 0.17, "learning_rate": 5e-05, "loss": 3.0674, "step": 73000 }, { "epoch": 0.17, "learning_rate": 5e-05, "loss": 3.0568, "step": 74000 }, { "epoch": 0.17, "learning_rate": 5e-05, "loss": 3.0448, "step": 75000 }, { "epoch": 0.18, "learning_rate": 5e-05, "loss": 3.0282, "step": 76000 }, { "epoch": 0.18, "learning_rate": 5e-05, "loss": 3.0217, "step": 77000 }, { "epoch": 0.18, "learning_rate": 5e-05, "loss": 3.0215, "step": 78000 }, { "epoch": 0.18, "learning_rate": 5e-05, "loss": 3.0051, "step": 79000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 2.9998, "step": 80000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 2.9937, "step": 81000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 2.9874, "step": 82000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 2.9712, "step": 83000 }, { "epoch": 0.19, "learning_rate": 5e-05, "loss": 2.9656, "step": 84000 }, { "epoch": 0.2, "learning_rate": 5e-05, "loss": 2.9621, "step": 85000 }, { "epoch": 0.2, "learning_rate": 5e-05, "loss": 2.9509, "step": 86000 }, { "epoch": 0.2, "learning_rate": 5e-05, "loss": 2.9423, "step": 87000 }, { "epoch": 0.2, "learning_rate": 5e-05, "loss": 2.9436, "step": 88000 }, { "epoch": 0.21, "learning_rate": 5e-05, "loss": 2.9297, "step": 89000 }, { "epoch": 0.21, "learning_rate": 5e-05, "loss": 2.9205, "step": 90000 }, { "epoch": 0.21, "learning_rate": 5e-05, "loss": 2.9103, "step": 91000 }, { "epoch": 0.21, "learning_rate": 5e-05, "loss": 2.9019, "step": 92000 }, { "epoch": 0.22, "learning_rate": 5e-05, "loss": 2.8987, "step": 93000 }, { "epoch": 0.22, "learning_rate": 5e-05, "loss": 2.8913, "step": 94000 }, { "epoch": 0.22, "learning_rate": 5e-05, "loss": 2.8863, "step": 95000 }, { "epoch": 0.22, "learning_rate": 5e-05, "loss": 2.8781, "step": 96000 }, { "epoch": 0.22, "learning_rate": 5e-05, "loss": 2.8695, "step": 97000 }, { "epoch": 0.23, "learning_rate": 5e-05, "loss": 2.8731, "step": 98000 }, { "epoch": 0.23, "learning_rate": 5e-05, "loss": 2.8595, "step": 99000 }, { "epoch": 0.23, "learning_rate": 5e-05, "loss": 2.8548, "step": 100000 }, { "epoch": 0.23, "learning_rate": 5e-05, "loss": 2.8471, "step": 101000 }, { "epoch": 0.24, "learning_rate": 5e-05, "loss": 2.84, "step": 102000 }, { "epoch": 0.24, "learning_rate": 5e-05, "loss": 2.8328, "step": 103000 }, { "epoch": 0.24, "learning_rate": 5e-05, "loss": 2.8317, "step": 104000 }, { "epoch": 0.24, "learning_rate": 5e-05, "loss": 2.8278, "step": 105000 }, { "epoch": 0.25, "learning_rate": 5e-05, "loss": 2.821, "step": 106000 }, { "epoch": 0.25, "learning_rate": 5e-05, "loss": 2.8145, "step": 107000 }, { "epoch": 0.25, "learning_rate": 5e-05, "loss": 2.8083, "step": 108000 }, { "epoch": 0.25, "learning_rate": 5e-05, "loss": 2.7986, "step": 109000 }, { "epoch": 0.25, "learning_rate": 5e-05, "loss": 2.7942, "step": 110000 }, { "epoch": 0.26, "learning_rate": 5e-05, "loss": 2.7862, "step": 111000 }, { "epoch": 0.26, "learning_rate": 5e-05, "loss": 2.782, "step": 112000 }, { "epoch": 0.26, "learning_rate": 5e-05, "loss": 2.7773, "step": 113000 }, { "epoch": 0.26, "learning_rate": 5e-05, "loss": 2.7767, "step": 114000 }, { "epoch": 0.27, "learning_rate": 5e-05, "loss": 2.771, "step": 115000 }, { "epoch": 0.27, "learning_rate": 5e-05, "loss": 2.7649, "step": 116000 }, { "epoch": 0.27, "learning_rate": 5e-05, "loss": 2.7632, "step": 117000 }, { "epoch": 0.27, "learning_rate": 5e-05, "loss": 2.755, "step": 118000 }, { "epoch": 0.28, "learning_rate": 5e-05, "loss": 2.7415, "step": 119000 }, { "epoch": 0.28, "learning_rate": 5e-05, "loss": 2.7435, "step": 120000 }, { "epoch": 0.28, "learning_rate": 5e-05, "loss": 2.7405, "step": 121000 }, { "epoch": 0.28, "learning_rate": 5e-05, "loss": 2.7348, "step": 122000 }, { "epoch": 0.29, "learning_rate": 5e-05, "loss": 2.7296, "step": 123000 }, { "epoch": 0.29, "learning_rate": 5e-05, "loss": 2.7244, "step": 124000 }, { "epoch": 0.29, "learning_rate": 5e-05, "loss": 2.7208, "step": 125000 }, { "epoch": 0.29, "learning_rate": 5e-05, "loss": 2.7157, "step": 126000 }, { "epoch": 0.29, "learning_rate": 5e-05, "loss": 2.7143, "step": 127000 }, { "epoch": 0.3, "learning_rate": 5e-05, "loss": 2.7108, "step": 128000 }, { "epoch": 0.3, "learning_rate": 5e-05, "loss": 2.7009, "step": 129000 }, { "epoch": 0.3, "learning_rate": 5e-05, "loss": 2.7023, "step": 130000 }, { "epoch": 0.3, "learning_rate": 5e-05, "loss": 2.6931, "step": 131000 }, { "epoch": 0.31, "learning_rate": 5e-05, "loss": 2.6901, "step": 132000 }, { "epoch": 0.31, "learning_rate": 5e-05, "loss": 2.6871, "step": 133000 }, { "epoch": 0.31, "learning_rate": 5e-05, "loss": 2.6843, "step": 134000 }, { "epoch": 0.31, "learning_rate": 5e-05, "loss": 2.6842, "step": 135000 }, { "epoch": 0.32, "learning_rate": 5e-05, "loss": 2.6771, "step": 136000 }, { "epoch": 0.32, "learning_rate": 5e-05, "loss": 2.6708, "step": 137000 }, { "epoch": 0.32, "learning_rate": 5e-05, "loss": 2.6638, "step": 138000 }, { "epoch": 0.32, "learning_rate": 5e-05, "loss": 2.6617, "step": 139000 }, { "epoch": 0.32, "learning_rate": 5e-05, "loss": 2.6626, "step": 140000 }, { "epoch": 0.33, "learning_rate": 5e-05, "loss": 2.655, "step": 141000 }, { "epoch": 0.33, "learning_rate": 5e-05, "loss": 2.6576, "step": 142000 }, { "epoch": 0.33, "learning_rate": 5e-05, "loss": 2.6408, "step": 143000 }, { "epoch": 0.33, "learning_rate": 5e-05, "loss": 2.6422, "step": 144000 }, { "epoch": 0.34, "learning_rate": 5e-05, "loss": 2.6416, "step": 145000 }, { "epoch": 0.34, "learning_rate": 5e-05, "loss": 2.6411, "step": 146000 }, { "epoch": 0.34, "learning_rate": 5e-05, "loss": 2.6327, "step": 147000 }, { "epoch": 0.34, "learning_rate": 5e-05, "loss": 2.6334, "step": 148000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 2.6208, "step": 149000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 2.6232, "step": 150000 }, { "epoch": 0.35, "eval_accuracy": 0.5274870522655163, "eval_loss": 2.509765625, "eval_runtime": 4883.386, "eval_samples_per_second": 114.367, "eval_steps_per_second": 0.894, "step": 150000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 2.6229, "step": 151000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 2.6185, "step": 152000 }, { "epoch": 0.35, "learning_rate": 5e-05, "loss": 2.6103, "step": 153000 }, { "epoch": 0.36, "learning_rate": 5e-05, "loss": 2.6089, "step": 154000 }, { "epoch": 0.36, "learning_rate": 5e-05, "loss": 2.6157, "step": 155000 }, { "epoch": 0.36, "learning_rate": 5e-05, "loss": 2.6045, "step": 156000 }, { "epoch": 0.36, "learning_rate": 5e-05, "loss": 2.6045, "step": 157000 }, { "epoch": 0.37, "learning_rate": 5e-05, "loss": 2.5938, "step": 158000 }, { "epoch": 0.37, "learning_rate": 5e-05, "loss": 2.5924, "step": 159000 }, { "epoch": 0.37, "learning_rate": 5e-05, "loss": 2.5943, "step": 160000 }, { "epoch": 0.37, "learning_rate": 5e-05, "loss": 2.589, "step": 161000 }, { "epoch": 0.38, "learning_rate": 5e-05, "loss": 2.5906, "step": 162000 }, { "epoch": 0.38, "learning_rate": 5e-05, "loss": 2.575, "step": 163000 }, { "epoch": 0.38, "learning_rate": 5e-05, "loss": 2.5799, "step": 164000 }, { "epoch": 0.38, "learning_rate": 5e-05, "loss": 2.576, "step": 165000 }, { "epoch": 0.38, "learning_rate": 5e-05, "loss": 2.574, "step": 166000 }, { "epoch": 0.39, "learning_rate": 5e-05, "loss": 2.5697, "step": 167000 }, { "epoch": 0.39, "learning_rate": 5e-05, "loss": 2.5699, "step": 168000 }, { "epoch": 0.39, "learning_rate": 5e-05, "loss": 2.5694, "step": 169000 }, { "epoch": 0.39, "learning_rate": 5e-05, "loss": 2.5629, "step": 170000 }, { "epoch": 0.4, "learning_rate": 5e-05, "loss": 2.563, "step": 171000 }, { "epoch": 0.4, "learning_rate": 5e-05, "loss": 2.5618, "step": 172000 }, { "epoch": 0.4, "learning_rate": 5e-05, "loss": 2.551, "step": 173000 }, { "epoch": 0.4, "learning_rate": 5e-05, "loss": 2.5494, "step": 174000 }, { "epoch": 0.41, "learning_rate": 5e-05, "loss": 2.544, "step": 175000 }, { "epoch": 0.41, "learning_rate": 5e-05, "loss": 2.5483, "step": 176000 }, { "epoch": 0.41, "learning_rate": 5e-05, "loss": 2.5458, "step": 177000 }, { "epoch": 0.41, "learning_rate": 5e-05, "loss": 2.5463, "step": 178000 }, { "epoch": 0.41, "learning_rate": 5e-05, "loss": 2.5353, "step": 179000 }, { "epoch": 0.42, "learning_rate": 5e-05, "loss": 2.5317, "step": 180000 }, { "epoch": 0.42, "learning_rate": 5e-05, "loss": 2.5352, "step": 181000 }, { "epoch": 0.42, "learning_rate": 5e-05, "loss": 2.5295, "step": 182000 }, { "epoch": 0.42, "learning_rate": 5e-05, "loss": 2.5286, "step": 183000 }, { "epoch": 0.43, "learning_rate": 5e-05, "loss": 2.5196, "step": 184000 }, { "epoch": 0.43, "learning_rate": 5e-05, "loss": 2.5222, "step": 185000 }, { "epoch": 0.43, "learning_rate": 5e-05, "loss": 2.5245, "step": 186000 }, { "epoch": 0.43, "learning_rate": 5e-05, "loss": 2.5154, "step": 187000 }, { "epoch": 0.44, "learning_rate": 5e-05, "loss": 2.509, "step": 188000 }, { "epoch": 0.44, "learning_rate": 5e-05, "loss": 2.5131, "step": 189000 }, { "epoch": 0.44, "learning_rate": 5e-05, "loss": 2.5072, "step": 190000 }, { "epoch": 0.44, "learning_rate": 5e-05, "loss": 2.5079, "step": 191000 }, { "epoch": 0.45, "learning_rate": 5e-05, "loss": 2.5062, "step": 192000 }, { "epoch": 0.45, "learning_rate": 5e-05, "loss": 2.5008, "step": 193000 }, { "epoch": 0.45, "learning_rate": 5e-05, "loss": 2.504, "step": 194000 }, { "epoch": 0.45, "learning_rate": 5e-05, "loss": 2.5013, "step": 195000 }, { "epoch": 0.45, "learning_rate": 5e-05, "loss": 2.4979, "step": 196000 }, { "epoch": 0.46, "learning_rate": 5e-05, "loss": 2.4919, "step": 197000 }, { "epoch": 0.46, "learning_rate": 5e-05, "loss": 2.4892, "step": 198000 }, { "epoch": 0.46, "learning_rate": 5e-05, "loss": 2.4897, "step": 199000 }, { "epoch": 0.46, "learning_rate": 5e-05, "loss": 2.4897, "step": 200000 }, { "epoch": 0.47, "learning_rate": 5e-05, "loss": 2.4859, "step": 201000 }, { "epoch": 0.47, "learning_rate": 5e-05, "loss": 2.4766, "step": 202000 }, { "epoch": 0.47, "learning_rate": 5e-05, "loss": 2.4815, "step": 203000 }, { "epoch": 0.47, "learning_rate": 5e-05, "loss": 2.4797, "step": 204000 }, { "epoch": 0.48, "learning_rate": 5e-05, "loss": 2.4782, "step": 205000 }, { "epoch": 0.48, "learning_rate": 5e-05, "loss": 2.477, "step": 206000 }, { "epoch": 0.48, "learning_rate": 5e-05, "loss": 2.4714, "step": 207000 }, { "epoch": 0.48, "learning_rate": 5e-05, "loss": 2.4693, "step": 208000 }, { "epoch": 0.48, "learning_rate": 5e-05, "loss": 2.4647, "step": 209000 }, { "epoch": 0.49, "learning_rate": 5e-05, "loss": 2.4675, "step": 210000 }, { "epoch": 0.49, "learning_rate": 5e-05, "loss": 2.4628, "step": 211000 }, { "epoch": 0.49, "learning_rate": 5e-05, "loss": 2.458, "step": 212000 }, { "epoch": 0.49, "learning_rate": 5e-05, "loss": 2.4575, "step": 213000 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 2.4588, "step": 214000 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 2.4541, "step": 215000 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 2.4556, "step": 216000 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 2.4419, "step": 217000 }, { "epoch": 0.51, "learning_rate": 5e-05, "loss": 2.4505, "step": 218000 }, { "epoch": 0.51, "learning_rate": 5e-05, "loss": 2.4469, "step": 219000 }, { "epoch": 0.51, "learning_rate": 5e-05, "loss": 2.4488, "step": 220000 }, { "epoch": 0.51, "learning_rate": 5e-05, "loss": 2.4493, "step": 221000 }, { "epoch": 0.51, "learning_rate": 5e-05, "loss": 2.4399, "step": 222000 }, { "epoch": 0.52, "learning_rate": 5e-05, "loss": 2.4408, "step": 223000 }, { "epoch": 0.52, "learning_rate": 5e-05, "loss": 2.4413, "step": 224000 }, { "epoch": 0.52, "learning_rate": 5e-05, "loss": 2.4375, "step": 225000 }, { "epoch": 0.52, "learning_rate": 5e-05, "loss": 2.4306, "step": 226000 }, { "epoch": 0.53, "learning_rate": 5e-05, "loss": 2.4354, "step": 227000 }, { "epoch": 0.53, "learning_rate": 5e-05, "loss": 2.426, "step": 228000 }, { "epoch": 0.53, "learning_rate": 5e-05, "loss": 2.4307, "step": 229000 }, { "epoch": 0.53, "learning_rate": 5e-05, "loss": 2.4297, "step": 230000 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 2.4268, "step": 231000 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 2.4249, "step": 232000 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 2.4211, "step": 233000 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 2.4233, "step": 234000 }, { "epoch": 0.54, "learning_rate": 5e-05, "loss": 2.415, "step": 235000 }, { "epoch": 0.55, "learning_rate": 5e-05, "loss": 2.4163, "step": 236000 }, { "epoch": 0.55, "learning_rate": 5e-05, "loss": 2.4185, "step": 237000 }, { "epoch": 0.55, "learning_rate": 5e-05, "loss": 2.4148, "step": 238000 }, { "epoch": 0.55, "learning_rate": 5e-05, "loss": 2.4089, "step": 239000 }, { "epoch": 0.56, "learning_rate": 5e-05, "loss": 2.411, "step": 240000 }, { "epoch": 0.56, "learning_rate": 5e-05, "loss": 2.4061, "step": 241000 }, { "epoch": 0.56, "learning_rate": 5e-05, "loss": 2.4056, "step": 242000 }, { "epoch": 0.56, "learning_rate": 5e-05, "loss": 2.402, "step": 243000 }, { "epoch": 0.57, "learning_rate": 5e-05, "loss": 2.4042, "step": 244000 }, { "epoch": 0.57, "learning_rate": 5e-05, "loss": 2.4025, "step": 245000 }, { "epoch": 0.57, "learning_rate": 5e-05, "loss": 2.3981, "step": 246000 }, { "epoch": 0.57, "learning_rate": 5e-05, "loss": 2.4005, "step": 247000 }, { "epoch": 0.57, "learning_rate": 5e-05, "loss": 2.3938, "step": 248000 }, { "epoch": 0.58, "learning_rate": 5e-05, "loss": 2.3914, "step": 249000 }, { "epoch": 0.58, "learning_rate": 5e-05, "loss": 2.3901, "step": 250000 }, { "epoch": 0.58, "learning_rate": 5e-05, "loss": 2.3912, "step": 251000 }, { "epoch": 0.58, "learning_rate": 5e-05, "loss": 2.3881, "step": 252000 }, { "epoch": 0.59, "learning_rate": 5e-05, "loss": 2.3904, "step": 253000 }, { "epoch": 0.59, "learning_rate": 5e-05, "loss": 2.3907, "step": 254000 }, { "epoch": 0.59, "learning_rate": 5e-05, "loss": 2.3827, "step": 255000 }, { "epoch": 0.59, "learning_rate": 5e-05, "loss": 2.3832, "step": 256000 }, { "epoch": 0.6, "learning_rate": 5e-05, "loss": 2.3824, "step": 257000 }, { "epoch": 0.6, "learning_rate": 5e-05, "loss": 2.3831, "step": 258000 }, { "epoch": 0.6, "learning_rate": 5e-05, "loss": 2.377, "step": 259000 }, { "epoch": 0.6, "learning_rate": 5e-05, "loss": 2.3732, "step": 260000 }, { "epoch": 0.6, "learning_rate": 5e-05, "loss": 2.3753, "step": 261000 }, { "epoch": 0.61, "learning_rate": 5e-05, "loss": 2.3747, "step": 262000 }, { "epoch": 0.61, "learning_rate": 5e-05, "loss": 2.3684, "step": 263000 }, { "epoch": 0.61, "learning_rate": 5e-05, "loss": 2.3716, "step": 264000 }, { "epoch": 0.61, "learning_rate": 5e-05, "loss": 2.3681, "step": 265000 }, { "epoch": 0.62, "learning_rate": 5e-05, "loss": 2.3694, "step": 266000 }, { "epoch": 0.62, "learning_rate": 5e-05, "loss": 2.3622, "step": 267000 }, { "epoch": 0.62, "learning_rate": 5e-05, "loss": 2.3646, "step": 268000 }, { "epoch": 0.62, "learning_rate": 5e-05, "loss": 2.367, "step": 269000 }, { "epoch": 0.63, "learning_rate": 5e-05, "loss": 2.365, "step": 270000 }, { "epoch": 0.63, "learning_rate": 5e-05, "loss": 2.3608, "step": 271000 }, { "epoch": 0.63, "learning_rate": 5e-05, "loss": 2.3558, "step": 272000 }, { "epoch": 0.63, "learning_rate": 5e-05, "loss": 2.3627, "step": 273000 }, { "epoch": 0.64, "learning_rate": 5e-05, "loss": 2.3541, "step": 274000 }, { "epoch": 0.64, "learning_rate": 5e-05, "loss": 2.3541, "step": 275000 }, { "epoch": 0.64, "learning_rate": 5e-05, "loss": 2.3516, "step": 276000 }, { "epoch": 0.64, "learning_rate": 5e-05, "loss": 2.354, "step": 277000 }, { "epoch": 0.64, "learning_rate": 5e-05, "loss": 2.3494, "step": 278000 }, { "epoch": 0.65, "learning_rate": 5e-05, "loss": 2.3529, "step": 279000 }, { "epoch": 0.65, "learning_rate": 5e-05, "loss": 2.3493, "step": 280000 }, { "epoch": 0.65, "learning_rate": 5e-05, "loss": 2.3519, "step": 281000 }, { "epoch": 0.65, "learning_rate": 5e-05, "loss": 2.3442, "step": 282000 }, { "epoch": 0.66, "learning_rate": 5e-05, "loss": 2.3431, "step": 283000 }, { "epoch": 0.66, "learning_rate": 5e-05, "loss": 2.3489, "step": 284000 }, { "epoch": 0.66, "learning_rate": 5e-05, "loss": 2.3446, "step": 285000 }, { "epoch": 0.66, "learning_rate": 5e-05, "loss": 2.3451, "step": 286000 }, { "epoch": 0.67, "learning_rate": 5e-05, "loss": 2.3307, "step": 287000 }, { "epoch": 0.67, "learning_rate": 5e-05, "loss": 2.3336, "step": 288000 }, { "epoch": 0.67, "learning_rate": 5e-05, "loss": 2.3383, "step": 289000 }, { "epoch": 0.67, "learning_rate": 5e-05, "loss": 2.3376, "step": 290000 }, { "epoch": 0.67, "learning_rate": 5e-05, "loss": 2.3336, "step": 291000 }, { "epoch": 0.68, "learning_rate": 5e-05, "loss": 2.3351, "step": 292000 }, { "epoch": 0.68, "learning_rate": 5e-05, "loss": 2.3296, "step": 293000 }, { "epoch": 0.68, "learning_rate": 5e-05, "loss": 2.3295, "step": 294000 }, { "epoch": 0.68, "learning_rate": 5e-05, "loss": 2.3274, "step": 295000 }, { "epoch": 0.69, "learning_rate": 5e-05, "loss": 2.3268, "step": 296000 }, { "epoch": 0.69, "learning_rate": 5e-05, "loss": 2.3289, "step": 297000 }, { "epoch": 0.69, "learning_rate": 5e-05, "loss": 2.3293, "step": 298000 }, { "epoch": 0.69, "learning_rate": 5e-05, "loss": 2.3212, "step": 299000 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 2.3189, "step": 300000 }, { "epoch": 0.7, "eval_accuracy": 0.5684444981680046, "eval_loss": 2.2109375, "eval_runtime": 4880.4554, "eval_samples_per_second": 114.436, "eval_steps_per_second": 0.894, "step": 300000 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 2.3193, "step": 301000 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 2.317, "step": 302000 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 2.3212, "step": 303000 }, { "epoch": 0.7, "learning_rate": 5e-05, "loss": 2.317, "step": 304000 }, { "epoch": 0.71, "learning_rate": 5e-05, "loss": 2.3183, "step": 305000 }, { "epoch": 0.71, "learning_rate": 5e-05, "loss": 2.3178, "step": 306000 }, { "epoch": 0.71, "learning_rate": 5e-05, "loss": 2.3085, "step": 307000 }, { "epoch": 0.71, "learning_rate": 5e-05, "loss": 2.3127, "step": 308000 }, { "epoch": 0.72, "learning_rate": 5e-05, "loss": 2.3149, "step": 309000 }, { "epoch": 0.72, "learning_rate": 5e-05, "loss": 2.3091, "step": 310000 }, { "epoch": 0.72, "learning_rate": 5e-05, "loss": 2.3088, "step": 311000 }, { "epoch": 0.72, "learning_rate": 5e-05, "loss": 2.3097, "step": 312000 }, { "epoch": 0.73, "learning_rate": 5e-05, "loss": 2.3131, "step": 313000 }, { "epoch": 0.73, "learning_rate": 5e-05, "loss": 2.3031, "step": 314000 }, { "epoch": 0.73, "learning_rate": 5e-05, "loss": 2.302, "step": 315000 }, { "epoch": 0.73, "learning_rate": 5e-05, "loss": 2.3056, "step": 316000 }, { "epoch": 0.73, "learning_rate": 5e-05, "loss": 2.3046, "step": 317000 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 2.2997, "step": 318000 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 2.2933, "step": 319000 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 2.2992, "step": 320000 }, { "epoch": 0.74, "learning_rate": 5e-05, "loss": 2.2947, "step": 321000 }, { "epoch": 0.75, "learning_rate": 5e-05, "loss": 2.2933, "step": 322000 }, { "epoch": 0.75, "learning_rate": 5e-05, "loss": 2.295, "step": 323000 }, { "epoch": 0.75, "learning_rate": 5e-05, "loss": 2.2935, "step": 324000 }, { "epoch": 0.75, "learning_rate": 5e-05, "loss": 2.2926, "step": 325000 }, { "epoch": 0.76, "learning_rate": 5e-05, "loss": 2.2887, "step": 326000 }, { "epoch": 0.76, "learning_rate": 5e-05, "loss": 2.2966, "step": 327000 }, { "epoch": 0.76, "learning_rate": 5e-05, "loss": 2.2924, "step": 328000 }, { "epoch": 0.76, "learning_rate": 5e-05, "loss": 2.2897, "step": 329000 }, { "epoch": 0.76, "learning_rate": 5e-05, "loss": 2.2903, "step": 330000 }, { "epoch": 0.77, "learning_rate": 5e-05, "loss": 2.282, "step": 331000 }, { "epoch": 0.77, "learning_rate": 5e-05, "loss": 2.281, "step": 332000 }, { "epoch": 0.77, "learning_rate": 5e-05, "loss": 2.2903, "step": 333000 }, { "epoch": 0.77, "learning_rate": 5e-05, "loss": 2.2831, "step": 334000 }, { "epoch": 0.78, "learning_rate": 5e-05, "loss": 2.2874, "step": 335000 }, { "epoch": 0.78, "learning_rate": 5e-05, "loss": 2.2722, "step": 336000 }, { "epoch": 0.78, "learning_rate": 5e-05, "loss": 2.2823, "step": 337000 }, { "epoch": 0.78, "learning_rate": 5e-05, "loss": 2.2793, "step": 338000 }, { "epoch": 0.79, "learning_rate": 5e-05, "loss": 2.2839, "step": 339000 }, { "epoch": 0.79, "learning_rate": 5e-05, "loss": 2.2808, "step": 340000 }, { "epoch": 0.79, "learning_rate": 5e-05, "loss": 2.2782, "step": 341000 }, { "epoch": 0.79, "learning_rate": 5e-05, "loss": 2.2751, "step": 342000 }, { "epoch": 0.8, "learning_rate": 5e-05, "loss": 2.2728, "step": 343000 }, { "epoch": 0.8, "learning_rate": 5e-05, "loss": 2.2736, "step": 344000 }, { "epoch": 0.8, "learning_rate": 5e-05, "loss": 2.2761, "step": 345000 }, { "epoch": 0.8, "learning_rate": 5e-05, "loss": 2.2692, "step": 346000 }, { "epoch": 0.8, "learning_rate": 5e-05, "loss": 2.269, "step": 347000 }, { "epoch": 0.81, "learning_rate": 5e-05, "loss": 2.2704, "step": 348000 }, { "epoch": 0.81, "learning_rate": 5e-05, "loss": 2.2673, "step": 349000 }, { "epoch": 0.81, "learning_rate": 5e-05, "loss": 2.2657, "step": 350000 }, { "epoch": 0.81, "learning_rate": 5e-05, "loss": 2.2731, "step": 351000 }, { "epoch": 0.82, "learning_rate": 5e-05, "loss": 2.2677, "step": 352000 }, { "epoch": 0.82, "learning_rate": 5e-05, "loss": 2.2628, "step": 353000 }, { "epoch": 0.82, "learning_rate": 5e-05, "loss": 2.2665, "step": 354000 }, { "epoch": 0.82, "learning_rate": 5e-05, "loss": 2.2649, "step": 355000 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 2.266, "step": 356000 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 2.2626, "step": 357000 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 2.2605, "step": 358000 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 2.2598, "step": 359000 }, { "epoch": 0.83, "learning_rate": 5e-05, "loss": 2.2541, "step": 360000 }, { "epoch": 0.84, "learning_rate": 5e-05, "loss": 2.2527, "step": 361000 }, { "epoch": 0.84, "learning_rate": 5e-05, "loss": 2.2495, "step": 362000 }, { "epoch": 0.84, "learning_rate": 5e-05, "loss": 2.2552, "step": 363000 }, { "epoch": 0.84, "learning_rate": 5e-05, "loss": 2.2575, "step": 364000 }, { "epoch": 0.85, "learning_rate": 5e-05, "loss": 2.2532, "step": 365000 }, { "epoch": 0.85, "learning_rate": 5e-05, "loss": 2.2531, "step": 366000 }, { "epoch": 0.85, "learning_rate": 5e-05, "loss": 2.2512, "step": 367000 }, { "epoch": 0.85, "learning_rate": 5e-05, "loss": 2.2536, "step": 368000 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 2.25, "step": 369000 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 2.2494, "step": 370000 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 2.2454, "step": 371000 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 2.2416, "step": 372000 }, { "epoch": 0.86, "learning_rate": 5e-05, "loss": 2.2417, "step": 373000 }, { "epoch": 0.87, "learning_rate": 5e-05, "loss": 2.2418, "step": 374000 }, { "epoch": 0.87, "learning_rate": 5e-05, "loss": 2.2467, "step": 375000 }, { "epoch": 0.87, "learning_rate": 5e-05, "loss": 2.2409, "step": 376000 }, { "epoch": 0.87, "learning_rate": 5e-05, "loss": 2.2451, "step": 377000 }, { "epoch": 0.88, "learning_rate": 5e-05, "loss": 2.2397, "step": 378000 }, { "epoch": 0.88, "learning_rate": 5e-05, "loss": 2.2392, "step": 379000 }, { "epoch": 0.88, "learning_rate": 5e-05, "loss": 2.2388, "step": 380000 }, { "epoch": 0.88, "learning_rate": 5e-05, "loss": 2.2369, "step": 381000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 2.2354, "step": 382000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 2.2367, "step": 383000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 2.2383, "step": 384000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 2.2352, "step": 385000 }, { "epoch": 0.89, "learning_rate": 5e-05, "loss": 2.236, "step": 386000 }, { "epoch": 0.9, "learning_rate": 5e-05, "loss": 2.2327, "step": 387000 }, { "epoch": 0.9, "learning_rate": 5e-05, "loss": 2.2345, "step": 388000 }, { "epoch": 0.9, "learning_rate": 5e-05, "loss": 2.2361, "step": 389000 }, { "epoch": 0.9, "learning_rate": 5e-05, "loss": 2.2316, "step": 390000 }, { "epoch": 0.91, "learning_rate": 5e-05, "loss": 2.2315, "step": 391000 }, { "epoch": 0.91, "learning_rate": 5e-05, "loss": 2.2315, "step": 392000 }, { "epoch": 0.91, "learning_rate": 5e-05, "loss": 2.227, "step": 393000 }, { "epoch": 0.91, "learning_rate": 5e-05, "loss": 2.2307, "step": 394000 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 2.2276, "step": 395000 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 2.219, "step": 396000 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 2.222, "step": 397000 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 2.2262, "step": 398000 }, { "epoch": 0.92, "learning_rate": 5e-05, "loss": 2.2224, "step": 399000 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 2.217, "step": 400000 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 2.2236, "step": 401000 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 2.2256, "step": 402000 }, { "epoch": 0.93, "learning_rate": 5e-05, "loss": 2.2181, "step": 403000 }, { "epoch": 0.94, "learning_rate": 5e-05, "loss": 2.2209, "step": 404000 }, { "epoch": 0.94, "learning_rate": 5e-05, "loss": 2.2246, "step": 405000 }, { "epoch": 0.94, "learning_rate": 5e-05, "loss": 2.223, "step": 406000 }, { "epoch": 0.94, "learning_rate": 5e-05, "loss": 2.2164, "step": 407000 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 2.2105, "step": 408000 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 2.2181, "step": 409000 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 2.2212, "step": 410000 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 2.2138, "step": 411000 }, { "epoch": 0.95, "learning_rate": 5e-05, "loss": 2.208, "step": 412000 }, { "epoch": 0.96, "learning_rate": 5e-05, "loss": 2.217, "step": 413000 }, { "epoch": 0.96, "learning_rate": 5e-05, "loss": 2.2128, "step": 414000 }, { "epoch": 0.96, "learning_rate": 5e-05, "loss": 2.2139, "step": 415000 }, { "epoch": 0.96, "learning_rate": 5e-05, "loss": 2.206, "step": 416000 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 2.213, "step": 417000 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 2.2072, "step": 418000 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 2.2115, "step": 419000 }, { "epoch": 0.97, "learning_rate": 5e-05, "loss": 2.2064, "step": 420000 }, { "epoch": 0.98, "learning_rate": 5e-05, "loss": 2.2061, "step": 421000 }, { "epoch": 0.98, "learning_rate": 5e-05, "loss": 2.2079, "step": 422000 }, { "epoch": 0.98, "learning_rate": 5e-05, "loss": 2.2088, "step": 423000 }, { "epoch": 0.98, "learning_rate": 5e-05, "loss": 2.2069, "step": 424000 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 2.2037, "step": 425000 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 2.2074, "step": 426000 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 2.2036, "step": 427000 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 2.202, "step": 428000 }, { "epoch": 0.99, "learning_rate": 5e-05, "loss": 2.2022, "step": 429000 }, { "epoch": 1.0, "learning_rate": 5e-05, "loss": 2.2031, "step": 430000 }, { "epoch": 1.0, "learning_rate": 5e-05, "loss": 2.1998, "step": 431000 }, { "epoch": 1.0, "learning_rate": 5e-05, "loss": 2.202, "step": 432000 }, { "epoch": 1.0, "learning_rate": 5e-05, "loss": 2.1916, "step": 433000 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 2.1934, "step": 434000 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 2.1958, "step": 435000 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 2.1933, "step": 436000 }, { "epoch": 1.01, "learning_rate": 5e-05, "loss": 2.1945, "step": 437000 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 2.1889, "step": 438000 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 2.1929, "step": 439000 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 2.1935, "step": 440000 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 2.1883, "step": 441000 }, { "epoch": 1.02, "learning_rate": 5e-05, "loss": 2.1882, "step": 442000 }, { "epoch": 1.03, "learning_rate": 5e-05, "loss": 2.1835, "step": 443000 }, { "epoch": 1.03, "learning_rate": 5e-05, "loss": 2.1862, "step": 444000 }, { "epoch": 1.03, "learning_rate": 5e-05, "loss": 2.1869, "step": 445000 }, { "epoch": 1.03, "learning_rate": 5e-05, "loss": 2.1906, "step": 446000 }, { "epoch": 1.04, "learning_rate": 5e-05, "loss": 2.1825, "step": 447000 }, { "epoch": 1.04, "learning_rate": 5e-05, "loss": 2.1902, "step": 448000 }, { "epoch": 1.04, "learning_rate": 5e-05, "loss": 2.1843, "step": 449000 }, { "epoch": 1.04, "learning_rate": 5e-05, "loss": 2.1811, "step": 450000 }, { "epoch": 1.04, "eval_accuracy": 0.5877310925035356, "eval_loss": 2.078125, "eval_runtime": 4896.1088, "eval_samples_per_second": 114.07, "eval_steps_per_second": 0.891, "step": 450000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 2.1848, "step": 451000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 2.1788, "step": 452000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 2.185, "step": 453000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 2.1827, "step": 454000 }, { "epoch": 1.05, "learning_rate": 5e-05, "loss": 2.1772, "step": 455000 }, { "epoch": 1.06, "learning_rate": 5e-05, "loss": 2.1792, "step": 456000 }, { "epoch": 1.06, "learning_rate": 5e-05, "loss": 2.1799, "step": 457000 }, { "epoch": 1.06, "learning_rate": 5e-05, "loss": 2.1831, "step": 458000 }, { "epoch": 1.06, "learning_rate": 5e-05, "loss": 2.1763, "step": 459000 }, { "epoch": 1.07, "learning_rate": 5e-05, "loss": 2.1773, "step": 460000 }, { "epoch": 1.07, "learning_rate": 5e-05, "loss": 2.1804, "step": 461000 }, { "epoch": 1.07, "learning_rate": 5e-05, "loss": 2.1711, "step": 462000 }, { "epoch": 1.07, "learning_rate": 5e-05, "loss": 2.1788, "step": 463000 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 2.174, "step": 464000 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 2.1708, "step": 465000 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 2.1772, "step": 466000 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 2.174, "step": 467000 }, { "epoch": 1.08, "learning_rate": 5e-05, "loss": 2.1663, "step": 468000 }, { "epoch": 1.09, "learning_rate": 5e-05, "loss": 2.1737, "step": 469000 }, { "epoch": 1.09, "learning_rate": 5e-05, "loss": 2.1696, "step": 470000 }, { "epoch": 1.09, "learning_rate": 5e-05, "loss": 2.1726, "step": 471000 }, { "epoch": 1.09, "learning_rate": 5e-05, "loss": 2.1681, "step": 472000 }, { "epoch": 1.1, "learning_rate": 5e-05, "loss": 2.174, "step": 473000 }, { "epoch": 1.1, "learning_rate": 5e-05, "loss": 2.1679, "step": 474000 }, { "epoch": 1.1, "learning_rate": 5e-05, "loss": 2.17, "step": 475000 }, { "epoch": 1.1, "learning_rate": 5e-05, "loss": 2.1664, "step": 476000 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 2.1657, "step": 477000 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 2.1664, "step": 478000 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 2.1674, "step": 479000 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 2.1616, "step": 480000 }, { "epoch": 1.11, "learning_rate": 5e-05, "loss": 2.1597, "step": 481000 }, { "epoch": 1.12, "learning_rate": 5e-05, "loss": 2.161, "step": 482000 }, { "epoch": 1.12, "learning_rate": 5e-05, "loss": 2.1626, "step": 483000 }, { "epoch": 1.12, "learning_rate": 5e-05, "loss": 2.1643, "step": 484000 }, { "epoch": 1.12, "learning_rate": 5e-05, "loss": 2.1597, "step": 485000 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 2.1575, "step": 486000 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 2.1551, "step": 487000 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 2.1561, "step": 488000 }, { "epoch": 1.13, "learning_rate": 5e-05, "loss": 2.1524, "step": 489000 }, { "epoch": 1.14, "learning_rate": 5e-05, "loss": 2.1631, "step": 490000 }, { "epoch": 1.14, "learning_rate": 5e-05, "loss": 2.1553, "step": 491000 }, { "epoch": 1.14, "learning_rate": 5e-05, "loss": 2.1595, "step": 492000 }, { "epoch": 1.14, "learning_rate": 5e-05, "loss": 2.1556, "step": 493000 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 2.1572, "step": 494000 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 2.1586, "step": 495000 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 2.161, "step": 496000 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 2.152, "step": 497000 }, { "epoch": 1.15, "learning_rate": 5e-05, "loss": 2.1611, "step": 498000 }, { "epoch": 1.16, "learning_rate": 5e-05, "loss": 2.1484, "step": 499000 }, { "epoch": 1.16, "learning_rate": 5e-05, "loss": 2.1516, "step": 500000 }, { "epoch": 1.16, "learning_rate": 5e-05, "loss": 2.1504, "step": 501000 }, { "epoch": 1.16, "learning_rate": 5e-05, "loss": 2.1493, "step": 502000 }, { "epoch": 1.17, "learning_rate": 5e-05, "loss": 2.1494, "step": 503000 }, { "epoch": 1.17, "learning_rate": 5e-05, "loss": 2.1507, "step": 504000 }, { "epoch": 1.17, "learning_rate": 5e-05, "loss": 2.1555, "step": 505000 }, { "epoch": 1.17, "learning_rate": 5e-05, "loss": 2.1522, "step": 506000 }, { "epoch": 1.18, "learning_rate": 5e-05, "loss": 2.1477, "step": 507000 }, { "epoch": 1.18, "learning_rate": 5e-05, "loss": 2.1452, "step": 508000 }, { "epoch": 1.18, "learning_rate": 5e-05, "loss": 2.1516, "step": 509000 }, { "epoch": 1.18, "learning_rate": 5e-05, "loss": 2.1486, "step": 510000 }, { "epoch": 1.18, "learning_rate": 5e-05, "loss": 2.1481, "step": 511000 }, { "epoch": 1.19, "learning_rate": 5e-05, "loss": 2.1461, "step": 512000 }, { "epoch": 1.19, "learning_rate": 5e-05, "loss": 2.1393, "step": 513000 }, { "epoch": 1.19, "learning_rate": 5e-05, "loss": 2.1419, "step": 514000 }, { "epoch": 1.19, "learning_rate": 5e-05, "loss": 2.1376, "step": 515000 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 2.1486, "step": 516000 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 2.1425, "step": 517000 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 2.147, "step": 518000 }, { "epoch": 1.2, "learning_rate": 5e-05, "loss": 2.1458, "step": 519000 }, { "epoch": 1.21, "learning_rate": 5e-05, "loss": 2.1393, "step": 520000 }, { "epoch": 1.21, "learning_rate": 5e-05, "loss": 2.141, "step": 521000 }, { "epoch": 1.21, "learning_rate": 5e-05, "loss": 2.1375, "step": 522000 }, { "epoch": 1.21, "learning_rate": 5e-05, "loss": 2.143, "step": 523000 }, { "epoch": 1.21, "learning_rate": 5e-05, "loss": 2.1395, "step": 524000 }, { "epoch": 1.22, "learning_rate": 5e-05, "loss": 2.1376, "step": 525000 }, { "epoch": 1.22, "learning_rate": 5e-05, "loss": 2.1365, "step": 526000 }, { "epoch": 1.22, "learning_rate": 5e-05, "loss": 2.1378, "step": 527000 }, { "epoch": 1.22, "learning_rate": 5e-05, "loss": 2.1361, "step": 528000 }, { "epoch": 1.23, "learning_rate": 5e-05, "loss": 2.1397, "step": 529000 }, { "epoch": 1.23, "learning_rate": 5e-05, "loss": 2.1319, "step": 530000 }, { "epoch": 1.23, "learning_rate": 5e-05, "loss": 2.1341, "step": 531000 }, { "epoch": 1.23, "learning_rate": 5e-05, "loss": 2.1307, "step": 532000 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 2.1273, "step": 533000 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 2.1317, "step": 534000 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 2.1268, "step": 535000 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 2.1342, "step": 536000 }, { "epoch": 1.24, "learning_rate": 5e-05, "loss": 2.1372, "step": 537000 }, { "epoch": 1.25, "learning_rate": 5e-05, "loss": 2.1311, "step": 538000 }, { "epoch": 1.25, "learning_rate": 5e-05, "loss": 2.1301, "step": 539000 }, { "epoch": 1.25, "learning_rate": 5e-05, "loss": 2.1283, "step": 540000 }, { "epoch": 1.25, "learning_rate": 5e-05, "loss": 2.1264, "step": 541000 }, { "epoch": 1.26, "learning_rate": 5e-05, "loss": 2.1296, "step": 542000 }, { "epoch": 1.26, "learning_rate": 5e-05, "loss": 2.1277, "step": 543000 }, { "epoch": 1.26, "learning_rate": 5e-05, "loss": 2.1296, "step": 544000 }, { "epoch": 1.26, "learning_rate": 5e-05, "loss": 2.1264, "step": 545000 }, { "epoch": 1.27, "learning_rate": 5e-05, "loss": 2.1244, "step": 546000 }, { "epoch": 1.27, "learning_rate": 5e-05, "loss": 2.1227, "step": 547000 }, { "epoch": 1.27, "learning_rate": 5e-05, "loss": 2.1258, "step": 548000 }, { "epoch": 1.27, "learning_rate": 5e-05, "loss": 2.1238, "step": 549000 }, { "epoch": 1.27, "learning_rate": 5e-05, "loss": 2.1293, "step": 550000 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 2.1263, "step": 551000 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 2.117, "step": 552000 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 2.1237, "step": 553000 }, { "epoch": 1.28, "learning_rate": 5e-05, "loss": 2.1251, "step": 554000 }, { "epoch": 1.29, "learning_rate": 5e-05, "loss": 2.1243, "step": 555000 }, { "epoch": 1.29, "learning_rate": 5e-05, "loss": 2.1187, "step": 556000 }, { "epoch": 1.29, "learning_rate": 5e-05, "loss": 2.1179, "step": 557000 }, { "epoch": 1.29, "learning_rate": 5e-05, "loss": 2.1226, "step": 558000 }, { "epoch": 1.3, "learning_rate": 5e-05, "loss": 2.1182, "step": 559000 }, { "epoch": 1.3, "learning_rate": 5e-05, "loss": 2.1159, "step": 560000 }, { "epoch": 1.3, "learning_rate": 5e-05, "loss": 2.1186, "step": 561000 }, { "epoch": 1.3, "learning_rate": 5e-05, "loss": 2.116, "step": 562000 }, { "epoch": 1.3, "learning_rate": 5e-05, "loss": 2.1152, "step": 563000 }, { "epoch": 1.31, "learning_rate": 5e-05, "loss": 2.1195, "step": 564000 }, { "epoch": 1.31, "learning_rate": 5e-05, "loss": 2.1144, "step": 565000 }, { "epoch": 1.31, "learning_rate": 5e-05, "loss": 2.125, "step": 566000 }, { "epoch": 1.31, "learning_rate": 5e-05, "loss": 2.1127, "step": 567000 }, { "epoch": 1.32, "learning_rate": 5e-05, "loss": 2.1177, "step": 568000 }, { "epoch": 1.32, "learning_rate": 5e-05, "loss": 2.1148, "step": 569000 }, { "epoch": 1.32, "learning_rate": 5e-05, "loss": 2.1185, "step": 570000 }, { "epoch": 1.32, "learning_rate": 5e-05, "loss": 2.115, "step": 571000 }, { "epoch": 1.33, "learning_rate": 5e-05, "loss": 2.1127, "step": 572000 }, { "epoch": 1.33, "learning_rate": 5e-05, "loss": 2.1075, "step": 573000 }, { "epoch": 1.33, "learning_rate": 5e-05, "loss": 2.111, "step": 574000 }, { "epoch": 1.33, "learning_rate": 5e-05, "loss": 2.1139, "step": 575000 }, { "epoch": 1.34, "learning_rate": 5e-05, "loss": 2.1092, "step": 576000 }, { "epoch": 1.34, "learning_rate": 5e-05, "loss": 2.1126, "step": 577000 }, { "epoch": 1.34, "learning_rate": 5e-05, "loss": 2.112, "step": 578000 }, { "epoch": 1.34, "learning_rate": 5e-05, "loss": 2.1093, "step": 579000 }, { "epoch": 1.34, "learning_rate": 5e-05, "loss": 2.1008, "step": 580000 }, { "epoch": 1.35, "learning_rate": 5e-05, "loss": 2.1119, "step": 581000 }, { "epoch": 1.35, "learning_rate": 5e-05, "loss": 2.1105, "step": 582000 }, { "epoch": 1.35, "learning_rate": 5e-05, "loss": 2.1078, "step": 583000 }, { "epoch": 1.35, "learning_rate": 5e-05, "loss": 2.1109, "step": 584000 }, { "epoch": 1.36, "learning_rate": 5e-05, "loss": 2.1093, "step": 585000 }, { "epoch": 1.36, "learning_rate": 5e-05, "loss": 2.1047, "step": 586000 }, { "epoch": 1.36, "learning_rate": 5e-05, "loss": 2.1038, "step": 587000 }, { "epoch": 1.36, "learning_rate": 5e-05, "loss": 2.1074, "step": 588000 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 2.105, "step": 589000 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 2.1114, "step": 590000 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 2.1002, "step": 591000 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 2.0985, "step": 592000 }, { "epoch": 1.37, "learning_rate": 5e-05, "loss": 2.1004, "step": 593000 }, { "epoch": 1.38, "learning_rate": 5e-05, "loss": 2.1008, "step": 594000 }, { "epoch": 1.38, "learning_rate": 5e-05, "loss": 2.0989, "step": 595000 }, { "epoch": 1.38, "learning_rate": 5e-05, "loss": 2.102, "step": 596000 }, { "epoch": 1.38, "learning_rate": 5e-05, "loss": 2.1039, "step": 597000 }, { "epoch": 1.39, "learning_rate": 5e-05, "loss": 2.0943, "step": 598000 }, { "epoch": 1.39, "learning_rate": 5e-05, "loss": 2.1032, "step": 599000 }, { "epoch": 1.39, "learning_rate": 5e-05, "loss": 2.1048, "step": 600000 }, { "epoch": 1.39, "eval_accuracy": 0.6001050830453463, "eval_loss": 1.994140625, "eval_runtime": 4891.4227, "eval_samples_per_second": 114.179, "eval_steps_per_second": 0.892, "step": 600000 }, { "epoch": 1.39, "learning_rate": 5e-05, "loss": 2.0963, "step": 601000 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 2.1025, "step": 602000 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 2.0995, "step": 603000 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 2.1012, "step": 604000 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 2.0948, "step": 605000 }, { "epoch": 1.4, "learning_rate": 5e-05, "loss": 2.0975, "step": 606000 }, { "epoch": 1.41, "learning_rate": 5e-05, "loss": 2.0955, "step": 607000 }, { "epoch": 1.41, "learning_rate": 5e-05, "loss": 2.0978, "step": 608000 }, { "epoch": 1.41, "learning_rate": 5e-05, "loss": 2.0966, "step": 609000 }, { "epoch": 1.41, "learning_rate": 5e-05, "loss": 2.0921, "step": 610000 }, { "epoch": 1.42, "learning_rate": 5e-05, "loss": 2.092, "step": 611000 }, { "epoch": 1.42, "learning_rate": 5e-05, "loss": 2.0944, "step": 612000 }, { "epoch": 1.42, "learning_rate": 5e-05, "loss": 2.095, "step": 613000 }, { "epoch": 1.42, "learning_rate": 5e-05, "loss": 2.0939, "step": 614000 }, { "epoch": 1.43, "learning_rate": 5e-05, "loss": 2.0952, "step": 615000 }, { "epoch": 1.43, "learning_rate": 5e-05, "loss": 2.0928, "step": 616000 }, { "epoch": 1.43, "learning_rate": 5e-05, "loss": 2.0932, "step": 617000 }, { "epoch": 1.43, "learning_rate": 5e-05, "loss": 2.0881, "step": 618000 }, { "epoch": 1.43, "learning_rate": 5e-05, "loss": 2.0912, "step": 619000 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 2.0909, "step": 620000 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 2.0856, "step": 621000 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 2.0906, "step": 622000 }, { "epoch": 1.44, "learning_rate": 5e-05, "loss": 2.0905, "step": 623000 }, { "epoch": 1.45, "learning_rate": 5e-05, "loss": 2.0824, "step": 624000 }, { "epoch": 1.45, "learning_rate": 5e-05, "loss": 2.0874, "step": 625000 }, { "epoch": 1.45, "learning_rate": 5e-05, "loss": 2.0894, "step": 626000 }, { "epoch": 1.45, "learning_rate": 5e-05, "loss": 2.0824, "step": 627000 }, { "epoch": 1.46, "learning_rate": 5e-05, "loss": 2.0874, "step": 628000 }, { "epoch": 1.46, "learning_rate": 5e-05, "loss": 2.088, "step": 629000 }, { "epoch": 1.46, "learning_rate": 5e-05, "loss": 2.088, "step": 630000 }, { "epoch": 1.46, "learning_rate": 5e-05, "loss": 2.0815, "step": 631000 }, { "epoch": 1.46, "learning_rate": 5e-05, "loss": 2.0835, "step": 632000 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 2.0886, "step": 633000 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 2.0842, "step": 634000 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 2.0814, "step": 635000 }, { "epoch": 1.47, "learning_rate": 5e-05, "loss": 2.0822, "step": 636000 }, { "epoch": 1.48, "learning_rate": 5e-05, "loss": 2.0811, "step": 637000 }, { "epoch": 1.48, "learning_rate": 5e-05, "loss": 2.0883, "step": 638000 }, { "epoch": 1.48, "learning_rate": 5e-05, "loss": 2.0858, "step": 639000 }, { "epoch": 1.48, "learning_rate": 5e-05, "loss": 2.0841, "step": 640000 }, { "epoch": 1.49, "learning_rate": 5e-05, "loss": 2.0893, "step": 641000 }, { "epoch": 1.49, "learning_rate": 5e-05, "loss": 2.0807, "step": 642000 }, { "epoch": 1.49, "learning_rate": 5e-05, "loss": 2.0841, "step": 643000 }, { "epoch": 1.49, "learning_rate": 5e-05, "loss": 2.0803, "step": 644000 }, { "epoch": 1.5, "learning_rate": 5e-05, "loss": 2.086, "step": 645000 }, { "epoch": 1.5, "learning_rate": 5e-05, "loss": 2.0816, "step": 646000 }, { "epoch": 1.5, "learning_rate": 5e-05, "loss": 2.083, "step": 647000 }, { "epoch": 1.5, "learning_rate": 5e-05, "loss": 2.0751, "step": 648000 }, { "epoch": 1.5, "learning_rate": 5e-05, "loss": 2.0836, "step": 649000 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 2.0757, "step": 650000 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 2.0795, "step": 651000 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 2.0755, "step": 652000 }, { "epoch": 1.51, "learning_rate": 5e-05, "loss": 2.0811, "step": 653000 }, { "epoch": 1.52, "learning_rate": 5e-05, "loss": 2.0769, "step": 654000 }, { "epoch": 1.52, "learning_rate": 5e-05, "loss": 2.0737, "step": 655000 }, { "epoch": 1.52, "learning_rate": 5e-05, "loss": 2.0803, "step": 656000 }, { "epoch": 1.52, "learning_rate": 5e-05, "loss": 2.0739, "step": 657000 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 2.0783, "step": 658000 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 2.0744, "step": 659000 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 2.0696, "step": 660000 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 2.0785, "step": 661000 }, { "epoch": 1.53, "learning_rate": 5e-05, "loss": 2.0742, "step": 662000 }, { "epoch": 1.54, "learning_rate": 5e-05, "loss": 2.0715, "step": 663000 }, { "epoch": 1.54, "learning_rate": 5e-05, "loss": 2.0705, "step": 664000 }, { "epoch": 1.54, "learning_rate": 5e-05, "loss": 2.0698, "step": 665000 }, { "epoch": 1.54, "learning_rate": 5e-05, "loss": 2.0703, "step": 666000 }, { "epoch": 1.55, "learning_rate": 5e-05, "loss": 2.075, "step": 667000 }, { "epoch": 1.55, "learning_rate": 5e-05, "loss": 2.0694, "step": 668000 }, { "epoch": 1.55, "learning_rate": 5e-05, "loss": 2.0708, "step": 669000 }, { "epoch": 1.55, "learning_rate": 5e-05, "loss": 2.0656, "step": 670000 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 2.069, "step": 671000 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 2.0724, "step": 672000 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 2.0699, "step": 673000 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 2.0695, "step": 674000 }, { "epoch": 1.56, "learning_rate": 5e-05, "loss": 2.068, "step": 675000 }, { "epoch": 1.57, "learning_rate": 5e-05, "loss": 2.0671, "step": 676000 }, { "epoch": 1.57, "learning_rate": 5e-05, "loss": 2.0716, "step": 677000 }, { "epoch": 1.57, "learning_rate": 5e-05, "loss": 2.0689, "step": 678000 }, { "epoch": 1.57, "learning_rate": 5e-05, "loss": 2.062, "step": 679000 }, { "epoch": 1.58, "learning_rate": 5e-05, "loss": 2.0733, "step": 680000 }, { "epoch": 1.58, "learning_rate": 5e-05, "loss": 2.0671, "step": 681000 }, { "epoch": 1.58, "learning_rate": 5e-05, "loss": 2.0676, "step": 682000 }, { "epoch": 1.58, "learning_rate": 5e-05, "loss": 2.0651, "step": 683000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 2.0594, "step": 684000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 2.0658, "step": 685000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 2.0686, "step": 686000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 2.0685, "step": 687000 }, { "epoch": 1.59, "learning_rate": 5e-05, "loss": 2.0632, "step": 688000 }, { "epoch": 1.6, "learning_rate": 5e-05, "loss": 2.0662, "step": 689000 }, { "epoch": 1.6, "learning_rate": 5e-05, "loss": 2.0615, "step": 690000 }, { "epoch": 1.6, "learning_rate": 5e-05, "loss": 2.0546, "step": 691000 }, { "epoch": 1.6, "learning_rate": 5e-05, "loss": 2.0593, "step": 692000 }, { "epoch": 1.61, "learning_rate": 5e-05, "loss": 2.0599, "step": 693000 }, { "epoch": 1.61, "learning_rate": 5e-05, "loss": 2.0635, "step": 694000 }, { "epoch": 1.61, "learning_rate": 5e-05, "loss": 2.0621, "step": 695000 }, { "epoch": 1.61, "learning_rate": 5e-05, "loss": 2.0654, "step": 696000 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 2.0576, "step": 697000 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 2.0563, "step": 698000 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 2.059, "step": 699000 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 2.0616, "step": 700000 }, { "epoch": 1.62, "learning_rate": 5e-05, "loss": 2.0642, "step": 701000 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 2.052, "step": 702000 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 2.0569, "step": 703000 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 2.0617, "step": 704000 }, { "epoch": 1.63, "learning_rate": 5e-05, "loss": 2.0613, "step": 705000 }, { "epoch": 1.64, "learning_rate": 5e-05, "loss": 2.0593, "step": 706000 }, { "epoch": 1.64, "learning_rate": 5e-05, "loss": 2.0582, "step": 707000 }, { "epoch": 1.64, "learning_rate": 5e-05, "loss": 2.0587, "step": 708000 }, { "epoch": 1.64, "learning_rate": 5e-05, "loss": 2.0518, "step": 709000 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 2.0585, "step": 710000 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 2.052, "step": 711000 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 2.056, "step": 712000 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 2.0559, "step": 713000 }, { "epoch": 1.65, "learning_rate": 5e-05, "loss": 2.0538, "step": 714000 }, { "epoch": 1.66, "learning_rate": 5e-05, "loss": 2.0566, "step": 715000 }, { "epoch": 1.66, "learning_rate": 5e-05, "loss": 2.0519, "step": 716000 }, { "epoch": 1.66, "learning_rate": 5e-05, "loss": 2.058, "step": 717000 }, { "epoch": 1.66, "learning_rate": 5e-05, "loss": 2.0627, "step": 718000 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 2.0476, "step": 719000 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 2.0484, "step": 720000 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 2.0545, "step": 721000 }, { "epoch": 1.67, "learning_rate": 5e-05, "loss": 2.0558, "step": 722000 }, { "epoch": 1.68, "learning_rate": 5e-05, "loss": 2.0547, "step": 723000 }, { "epoch": 1.68, "learning_rate": 5e-05, "loss": 2.0501, "step": 724000 }, { "epoch": 1.68, "learning_rate": 5e-05, "loss": 2.0562, "step": 725000 }, { "epoch": 1.68, "learning_rate": 5e-05, "loss": 2.0467, "step": 726000 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 2.0438, "step": 727000 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 2.0458, "step": 728000 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 2.0504, "step": 729000 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 2.0501, "step": 730000 }, { "epoch": 1.69, "learning_rate": 5e-05, "loss": 2.0498, "step": 731000 }, { "epoch": 1.7, "learning_rate": 5e-05, "loss": 2.0495, "step": 732000 }, { "epoch": 1.7, "learning_rate": 5e-05, "loss": 2.0494, "step": 733000 }, { "epoch": 1.7, "learning_rate": 5e-05, "loss": 2.0469, "step": 734000 }, { "epoch": 1.7, "learning_rate": 5e-05, "loss": 2.0503, "step": 735000 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 2.0492, "step": 736000 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 2.0445, "step": 737000 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 2.044, "step": 738000 }, { "epoch": 1.71, "learning_rate": 5e-05, "loss": 2.0418, "step": 739000 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 2.0473, "step": 740000 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 2.0503, "step": 741000 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 2.0513, "step": 742000 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 2.0492, "step": 743000 }, { "epoch": 1.72, "learning_rate": 5e-05, "loss": 2.0471, "step": 744000 }, { "epoch": 1.73, "learning_rate": 5e-05, "loss": 2.0357, "step": 745000 }, { "epoch": 1.73, "learning_rate": 5e-05, "loss": 2.045, "step": 746000 }, { "epoch": 1.73, "learning_rate": 5e-05, "loss": 2.042, "step": 747000 }, { "epoch": 1.73, "learning_rate": 5e-05, "loss": 2.0485, "step": 748000 }, { "epoch": 1.74, "learning_rate": 5e-05, "loss": 2.043, "step": 749000 }, { "epoch": 1.74, "learning_rate": 5e-05, "loss": 2.0345, "step": 750000 }, { "epoch": 1.74, "eval_accuracy": 0.6086964451170365, "eval_loss": 1.935546875, "eval_runtime": 4905.4193, "eval_samples_per_second": 113.853, "eval_steps_per_second": 0.89, "step": 750000 }, { "epoch": 1.74, "learning_rate": 5e-05, "loss": 2.0407, "step": 751000 }, { "epoch": 1.74, "learning_rate": 5e-05, "loss": 2.042, "step": 752000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 2.0446, "step": 753000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 2.0447, "step": 754000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 2.0402, "step": 755000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 2.0392, "step": 756000 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 2.037, "step": 757000 }, { "epoch": 1.76, "learning_rate": 5e-05, "loss": 2.0341, "step": 758000 }, { "epoch": 1.76, "learning_rate": 5e-05, "loss": 2.0381, "step": 759000 }, { "epoch": 1.76, "learning_rate": 5e-05, "loss": 2.0374, "step": 760000 }, { "epoch": 1.76, "learning_rate": 5e-05, "loss": 2.0347, "step": 761000 }, { "epoch": 1.77, "learning_rate": 5e-05, "loss": 2.0345, "step": 762000 }, { "epoch": 1.77, "learning_rate": 5e-05, "loss": 2.036, "step": 763000 }, { "epoch": 1.77, "learning_rate": 5e-05, "loss": 2.0357, "step": 764000 }, { "epoch": 1.77, "learning_rate": 5e-05, "loss": 2.0386, "step": 765000 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 2.0363, "step": 766000 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 2.0406, "step": 767000 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 2.0332, "step": 768000 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 2.0358, "step": 769000 }, { "epoch": 1.78, "learning_rate": 5e-05, "loss": 2.0359, "step": 770000 }, { "epoch": 1.79, "learning_rate": 5e-05, "loss": 2.0429, "step": 771000 }, { "epoch": 1.79, "learning_rate": 5e-05, "loss": 2.0402, "step": 772000 }, { "epoch": 1.79, "learning_rate": 5e-05, "loss": 2.0372, "step": 773000 }, { "epoch": 1.79, "learning_rate": 5e-05, "loss": 2.0343, "step": 774000 }, { "epoch": 1.8, "learning_rate": 5e-05, "loss": 2.0351, "step": 775000 }, { "epoch": 1.8, "learning_rate": 5e-05, "loss": 2.0366, "step": 776000 }, { "epoch": 1.8, "learning_rate": 5e-05, "loss": 2.0364, "step": 777000 }, { "epoch": 1.8, "learning_rate": 5e-05, "loss": 2.034, "step": 778000 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 2.0313, "step": 779000 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 2.0281, "step": 780000 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 2.0316, "step": 781000 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 2.0298, "step": 782000 }, { "epoch": 1.81, "learning_rate": 5e-05, "loss": 2.0292, "step": 783000 }, { "epoch": 1.82, "learning_rate": 5e-05, "loss": 2.0269, "step": 784000 }, { "epoch": 1.82, "learning_rate": 5e-05, "loss": 2.0309, "step": 785000 }, { "epoch": 1.82, "learning_rate": 5e-05, "loss": 2.0316, "step": 786000 }, { "epoch": 1.82, "learning_rate": 5e-05, "loss": 2.0379, "step": 787000 }, { "epoch": 1.83, "learning_rate": 5e-05, "loss": 2.0298, "step": 788000 }, { "epoch": 1.83, "learning_rate": 5e-05, "loss": 2.028, "step": 789000 }, { "epoch": 1.83, "learning_rate": 5e-05, "loss": 2.0278, "step": 790000 }, { "epoch": 1.83, "learning_rate": 5e-05, "loss": 2.0316, "step": 791000 }, { "epoch": 1.84, "learning_rate": 5e-05, "loss": 2.0269, "step": 792000 }, { "epoch": 1.84, "learning_rate": 5e-05, "loss": 2.031, "step": 793000 }, { "epoch": 1.84, "learning_rate": 5e-05, "loss": 2.0281, "step": 794000 }, { "epoch": 1.84, "learning_rate": 5e-05, "loss": 2.0237, "step": 795000 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 2.0244, "step": 796000 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 2.0262, "step": 797000 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 2.0275, "step": 798000 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 2.0253, "step": 799000 }, { "epoch": 1.85, "learning_rate": 5e-05, "loss": 2.0336, "step": 800000 }, { "epoch": 1.86, "learning_rate": 5e-05, "loss": 2.029, "step": 801000 }, { "epoch": 1.86, "learning_rate": 5e-05, "loss": 2.0271, "step": 802000 }, { "epoch": 1.86, "learning_rate": 5e-05, "loss": 2.0221, "step": 803000 }, { "epoch": 1.86, "learning_rate": 5e-05, "loss": 2.0287, "step": 804000 }, { "epoch": 1.87, "learning_rate": 5e-05, "loss": 2.0264, "step": 805000 }, { "epoch": 1.87, "learning_rate": 5e-05, "loss": 2.0263, "step": 806000 }, { "epoch": 1.87, "learning_rate": 5e-05, "loss": 2.0242, "step": 807000 }, { "epoch": 1.87, "learning_rate": 5e-05, "loss": 2.0221, "step": 808000 }, { "epoch": 1.88, "learning_rate": 5e-05, "loss": 2.0242, "step": 809000 }, { "epoch": 1.88, "learning_rate": 5e-05, "loss": 2.0212, "step": 810000 }, { "epoch": 1.88, "learning_rate": 5e-05, "loss": 2.0213, "step": 811000 }, { "epoch": 1.88, "learning_rate": 5e-05, "loss": 2.0182, "step": 812000 }, { "epoch": 1.88, "learning_rate": 5e-05, "loss": 2.0195, "step": 813000 }, { "epoch": 1.89, "learning_rate": 5e-05, "loss": 2.0245, "step": 814000 }, { "epoch": 1.89, "learning_rate": 5e-05, "loss": 2.018, "step": 815000 }, { "epoch": 1.89, "learning_rate": 5e-05, "loss": 2.0219, "step": 816000 }, { "epoch": 1.89, "learning_rate": 5e-05, "loss": 2.0221, "step": 817000 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 2.019, "step": 818000 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 2.0199, "step": 819000 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 2.0166, "step": 820000 }, { "epoch": 1.9, "learning_rate": 5e-05, "loss": 2.0192, "step": 821000 }, { "epoch": 1.91, "learning_rate": 5e-05, "loss": 2.0217, "step": 822000 }, { "epoch": 1.91, "learning_rate": 5e-05, "loss": 2.0179, "step": 823000 }, { "epoch": 1.91, "learning_rate": 5e-05, "loss": 2.0242, "step": 824000 }, { "epoch": 1.91, "learning_rate": 5e-05, "loss": 2.0158, "step": 825000 }, { "epoch": 1.91, "learning_rate": 5e-05, "loss": 2.013, "step": 826000 }, { "epoch": 1.92, "learning_rate": 5e-05, "loss": 2.0196, "step": 827000 }, { "epoch": 1.92, "learning_rate": 5e-05, "loss": 2.02, "step": 828000 }, { "epoch": 1.92, "learning_rate": 5e-05, "loss": 2.021, "step": 829000 }, { "epoch": 1.92, "learning_rate": 5e-05, "loss": 2.0192, "step": 830000 }, { "epoch": 1.93, "learning_rate": 5e-05, "loss": 2.0191, "step": 831000 }, { "epoch": 1.93, "learning_rate": 5e-05, "loss": 2.0184, "step": 832000 }, { "epoch": 1.93, "learning_rate": 5e-05, "loss": 2.0196, "step": 833000 }, { "epoch": 1.93, "learning_rate": 5e-05, "loss": 2.0183, "step": 834000 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 2.0134, "step": 835000 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 2.0163, "step": 836000 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 2.0113, "step": 837000 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 2.0206, "step": 838000 }, { "epoch": 1.94, "learning_rate": 5e-05, "loss": 2.0166, "step": 839000 }, { "epoch": 1.95, "learning_rate": 5e-05, "loss": 2.0181, "step": 840000 }, { "epoch": 1.95, "learning_rate": 5e-05, "loss": 2.0168, "step": 841000 }, { "epoch": 1.95, "learning_rate": 5e-05, "loss": 2.01, "step": 842000 }, { "epoch": 1.95, "learning_rate": 5e-05, "loss": 2.0153, "step": 843000 }, { "epoch": 1.96, "learning_rate": 5e-05, "loss": 2.0084, "step": 844000 }, { "epoch": 1.96, "learning_rate": 5e-05, "loss": 2.0169, "step": 845000 }, { "epoch": 1.96, "learning_rate": 5e-05, "loss": 2.0142, "step": 846000 }, { "epoch": 1.96, "learning_rate": 5e-05, "loss": 2.0132, "step": 847000 }, { "epoch": 1.97, "learning_rate": 5e-05, "loss": 2.0126, "step": 848000 }, { "epoch": 1.97, "learning_rate": 5e-05, "loss": 2.0121, "step": 849000 }, { "epoch": 1.97, "learning_rate": 5e-05, "loss": 2.0122, "step": 850000 }, { "epoch": 1.97, "learning_rate": 5e-05, "loss": 2.0149, "step": 851000 }, { "epoch": 1.97, "learning_rate": 5e-05, "loss": 2.0125, "step": 852000 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 2.0074, "step": 853000 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 2.0136, "step": 854000 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 2.0118, "step": 855000 }, { "epoch": 1.98, "learning_rate": 5e-05, "loss": 2.0139, "step": 856000 }, { "epoch": 1.99, "learning_rate": 5e-05, "loss": 2.0158, "step": 857000 }, { "epoch": 1.99, "learning_rate": 5e-05, "loss": 2.0105, "step": 858000 }, { "epoch": 1.99, "learning_rate": 5e-05, "loss": 2.0075, "step": 859000 }, { "epoch": 1.99, "learning_rate": 5e-05, "loss": 2.007, "step": 860000 }, { "epoch": 2.0, "learning_rate": 5e-05, "loss": 2.0143, "step": 861000 }, { "epoch": 2.0, "learning_rate": 5e-05, "loss": 2.0067, "step": 862000 }, { "epoch": 2.0, "step": 862850, "total_flos": 1.5713271682523202e+19, "train_loss": 0.02994945877374942, "train_runtime": 24111.6962, "train_samples_per_second": 9161.097, "train_steps_per_second": 35.786 } ], "max_steps": 862850, "num_train_epochs": 2, "total_flos": 1.5713271682523202e+19, "trial_name": null, "trial_params": null }