|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 500.0, |
|
"global_step": 22000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 4.55, |
|
"eval_loss": 1.7275390625, |
|
"eval_runtime": 2.8452, |
|
"eval_samples_per_second": 113.875, |
|
"eval_steps_per_second": 14.41, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"eval_loss": 1.052734375, |
|
"eval_runtime": 2.8537, |
|
"eval_samples_per_second": 113.538, |
|
"eval_steps_per_second": 14.367, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 11.36, |
|
"learning_rate": 5e-05, |
|
"loss": 1.9751, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"eval_loss": 0.64697265625, |
|
"eval_runtime": 2.8455, |
|
"eval_samples_per_second": 113.862, |
|
"eval_steps_per_second": 14.409, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"eval_loss": 0.3955078125, |
|
"eval_runtime": 2.8586, |
|
"eval_samples_per_second": 113.342, |
|
"eval_steps_per_second": 14.343, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 22.73, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7515, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 22.73, |
|
"eval_loss": 0.25244140625, |
|
"eval_runtime": 2.8476, |
|
"eval_samples_per_second": 113.78, |
|
"eval_steps_per_second": 14.398, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 27.27, |
|
"eval_loss": 0.167236328125, |
|
"eval_runtime": 2.8473, |
|
"eval_samples_per_second": 113.792, |
|
"eval_steps_per_second": 14.4, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 31.82, |
|
"eval_loss": 0.1171875, |
|
"eval_runtime": 2.8502, |
|
"eval_samples_per_second": 113.678, |
|
"eval_steps_per_second": 14.385, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 34.09, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3181, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 36.36, |
|
"eval_loss": 0.089599609375, |
|
"eval_runtime": 2.853, |
|
"eval_samples_per_second": 113.564, |
|
"eval_steps_per_second": 14.371, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 40.91, |
|
"eval_loss": 0.07159423828125, |
|
"eval_runtime": 2.8542, |
|
"eval_samples_per_second": 113.519, |
|
"eval_steps_per_second": 14.365, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 45.45, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1654, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 45.45, |
|
"eval_loss": 0.06207275390625, |
|
"eval_runtime": 2.8559, |
|
"eval_samples_per_second": 113.449, |
|
"eval_steps_per_second": 14.356, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 0.053802490234375, |
|
"eval_runtime": 2.8461, |
|
"eval_samples_per_second": 113.841, |
|
"eval_steps_per_second": 14.406, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 54.55, |
|
"eval_loss": 0.049713134765625, |
|
"eval_runtime": 2.8533, |
|
"eval_samples_per_second": 113.551, |
|
"eval_steps_per_second": 14.369, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 56.82, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1059, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 59.09, |
|
"eval_loss": 0.047698974609375, |
|
"eval_runtime": 2.8541, |
|
"eval_samples_per_second": 113.523, |
|
"eval_steps_per_second": 14.366, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 63.64, |
|
"eval_loss": 0.04449462890625, |
|
"eval_runtime": 2.8515, |
|
"eval_samples_per_second": 113.625, |
|
"eval_steps_per_second": 14.379, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 68.18, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0771, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 68.18, |
|
"eval_loss": 0.04400634765625, |
|
"eval_runtime": 2.853, |
|
"eval_samples_per_second": 113.565, |
|
"eval_steps_per_second": 14.371, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 72.73, |
|
"eval_loss": 0.03985595703125, |
|
"eval_runtime": 2.8483, |
|
"eval_samples_per_second": 113.753, |
|
"eval_steps_per_second": 14.395, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 77.27, |
|
"eval_loss": 0.039398193359375, |
|
"eval_runtime": 2.849, |
|
"eval_samples_per_second": 113.725, |
|
"eval_steps_per_second": 14.391, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 79.55, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0604, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 81.82, |
|
"eval_loss": 0.03973388671875, |
|
"eval_runtime": 2.8542, |
|
"eval_samples_per_second": 113.518, |
|
"eval_steps_per_second": 14.365, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 86.36, |
|
"eval_loss": 0.038177490234375, |
|
"eval_runtime": 2.85, |
|
"eval_samples_per_second": 113.682, |
|
"eval_steps_per_second": 14.386, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 90.91, |
|
"learning_rate": 5e-05, |
|
"loss": 0.05, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 90.91, |
|
"eval_loss": 0.037384033203125, |
|
"eval_runtime": 2.8526, |
|
"eval_samples_per_second": 113.581, |
|
"eval_steps_per_second": 14.373, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 95.45, |
|
"eval_loss": 0.037322998046875, |
|
"eval_runtime": 2.8554, |
|
"eval_samples_per_second": 113.468, |
|
"eval_steps_per_second": 14.359, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 0.035858154296875, |
|
"eval_runtime": 2.8533, |
|
"eval_samples_per_second": 113.555, |
|
"eval_steps_per_second": 14.37, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 102.27, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0423, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 104.55, |
|
"eval_loss": 0.036163330078125, |
|
"eval_runtime": 2.8573, |
|
"eval_samples_per_second": 113.392, |
|
"eval_steps_per_second": 14.349, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 109.09, |
|
"eval_loss": 0.0374755859375, |
|
"eval_runtime": 2.8521, |
|
"eval_samples_per_second": 113.599, |
|
"eval_steps_per_second": 14.375, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 113.64, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0364, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 113.64, |
|
"eval_loss": 0.035125732421875, |
|
"eval_runtime": 2.8541, |
|
"eval_samples_per_second": 113.521, |
|
"eval_steps_per_second": 14.365, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 118.18, |
|
"eval_loss": 0.034515380859375, |
|
"eval_runtime": 2.8551, |
|
"eval_samples_per_second": 113.483, |
|
"eval_steps_per_second": 14.361, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 122.73, |
|
"eval_loss": 0.036224365234375, |
|
"eval_runtime": 2.8548, |
|
"eval_samples_per_second": 113.491, |
|
"eval_steps_per_second": 14.362, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 125.0, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0321, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 127.27, |
|
"eval_loss": 0.0335693359375, |
|
"eval_runtime": 2.8567, |
|
"eval_samples_per_second": 113.418, |
|
"eval_steps_per_second": 14.352, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 131.82, |
|
"eval_loss": 0.03485107421875, |
|
"eval_runtime": 2.853, |
|
"eval_samples_per_second": 113.566, |
|
"eval_steps_per_second": 14.371, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 136.36, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0285, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 136.36, |
|
"eval_loss": 0.03436279296875, |
|
"eval_runtime": 2.8569, |
|
"eval_samples_per_second": 113.41, |
|
"eval_steps_per_second": 14.351, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 140.91, |
|
"eval_loss": 0.03497314453125, |
|
"eval_runtime": 2.8533, |
|
"eval_samples_per_second": 113.554, |
|
"eval_steps_per_second": 14.37, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 145.45, |
|
"eval_loss": 0.034088134765625, |
|
"eval_runtime": 2.8565, |
|
"eval_samples_per_second": 113.424, |
|
"eval_steps_per_second": 14.353, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 147.73, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0257, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"eval_loss": 0.0341796875, |
|
"eval_runtime": 2.8575, |
|
"eval_samples_per_second": 113.385, |
|
"eval_steps_per_second": 14.348, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 154.55, |
|
"eval_loss": 0.035675048828125, |
|
"eval_runtime": 2.8571, |
|
"eval_samples_per_second": 113.402, |
|
"eval_steps_per_second": 14.35, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 159.09, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0235, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 159.09, |
|
"eval_loss": 0.0352783203125, |
|
"eval_runtime": 2.854, |
|
"eval_samples_per_second": 113.526, |
|
"eval_steps_per_second": 14.366, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 163.64, |
|
"eval_loss": 0.034881591796875, |
|
"eval_runtime": 2.8546, |
|
"eval_samples_per_second": 113.502, |
|
"eval_steps_per_second": 14.363, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 168.18, |
|
"eval_loss": 0.035003662109375, |
|
"eval_runtime": 2.8564, |
|
"eval_samples_per_second": 113.428, |
|
"eval_steps_per_second": 14.354, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 170.45, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0216, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 172.73, |
|
"eval_loss": 0.034149169921875, |
|
"eval_runtime": 2.8551, |
|
"eval_samples_per_second": 113.483, |
|
"eval_steps_per_second": 14.36, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 177.27, |
|
"eval_loss": 0.035186767578125, |
|
"eval_runtime": 2.8549, |
|
"eval_samples_per_second": 113.49, |
|
"eval_steps_per_second": 14.361, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 181.82, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0199, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 181.82, |
|
"eval_loss": 0.034149169921875, |
|
"eval_runtime": 2.8566, |
|
"eval_samples_per_second": 113.422, |
|
"eval_steps_per_second": 14.353, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 186.36, |
|
"eval_loss": 0.035308837890625, |
|
"eval_runtime": 2.8525, |
|
"eval_samples_per_second": 113.584, |
|
"eval_steps_per_second": 14.373, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 190.91, |
|
"eval_loss": 0.03472900390625, |
|
"eval_runtime": 2.8548, |
|
"eval_samples_per_second": 113.494, |
|
"eval_steps_per_second": 14.362, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 193.18, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0184, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 195.45, |
|
"eval_loss": 0.034210205078125, |
|
"eval_runtime": 2.8577, |
|
"eval_samples_per_second": 113.377, |
|
"eval_steps_per_second": 14.347, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"eval_loss": 0.03448486328125, |
|
"eval_runtime": 2.8551, |
|
"eval_samples_per_second": 113.48, |
|
"eval_steps_per_second": 14.36, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 204.55, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0173, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 204.55, |
|
"eval_loss": 0.034454345703125, |
|
"eval_runtime": 2.8538, |
|
"eval_samples_per_second": 113.533, |
|
"eval_steps_per_second": 14.367, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 209.09, |
|
"eval_loss": 0.0343017578125, |
|
"eval_runtime": 2.8484, |
|
"eval_samples_per_second": 113.747, |
|
"eval_steps_per_second": 14.394, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 213.64, |
|
"eval_loss": 0.0355224609375, |
|
"eval_runtime": 2.852, |
|
"eval_samples_per_second": 113.606, |
|
"eval_steps_per_second": 14.376, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 215.91, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0162, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 218.18, |
|
"eval_loss": 0.0350341796875, |
|
"eval_runtime": 2.8527, |
|
"eval_samples_per_second": 113.575, |
|
"eval_steps_per_second": 14.372, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 222.73, |
|
"eval_loss": 0.035247802734375, |
|
"eval_runtime": 2.8538, |
|
"eval_samples_per_second": 113.532, |
|
"eval_steps_per_second": 14.367, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 227.27, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0152, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 227.27, |
|
"eval_loss": 0.0350341796875, |
|
"eval_runtime": 2.856, |
|
"eval_samples_per_second": 113.444, |
|
"eval_steps_per_second": 14.356, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 231.82, |
|
"eval_loss": 0.0350341796875, |
|
"eval_runtime": 2.8472, |
|
"eval_samples_per_second": 113.795, |
|
"eval_steps_per_second": 14.4, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 236.36, |
|
"eval_loss": 0.035400390625, |
|
"eval_runtime": 2.8527, |
|
"eval_samples_per_second": 113.575, |
|
"eval_steps_per_second": 14.372, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 238.64, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0145, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 240.91, |
|
"eval_loss": 0.035125732421875, |
|
"eval_runtime": 2.8574, |
|
"eval_samples_per_second": 113.392, |
|
"eval_steps_per_second": 14.349, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 245.45, |
|
"eval_loss": 0.03546142578125, |
|
"eval_runtime": 2.8478, |
|
"eval_samples_per_second": 113.773, |
|
"eval_steps_per_second": 14.397, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 250.0, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0137, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 250.0, |
|
"eval_loss": 0.035552978515625, |
|
"eval_runtime": 2.8538, |
|
"eval_samples_per_second": 113.533, |
|
"eval_steps_per_second": 14.367, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 254.55, |
|
"eval_loss": 0.035736083984375, |
|
"eval_runtime": 2.848, |
|
"eval_samples_per_second": 113.764, |
|
"eval_steps_per_second": 14.396, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 259.09, |
|
"eval_loss": 0.03582763671875, |
|
"eval_runtime": 2.8521, |
|
"eval_samples_per_second": 113.601, |
|
"eval_steps_per_second": 14.375, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 261.36, |
|
"learning_rate": 5e-05, |
|
"loss": 0.013, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 263.64, |
|
"eval_loss": 0.03546142578125, |
|
"eval_runtime": 2.8549, |
|
"eval_samples_per_second": 113.489, |
|
"eval_steps_per_second": 14.361, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 268.18, |
|
"eval_loss": 0.035919189453125, |
|
"eval_runtime": 2.8533, |
|
"eval_samples_per_second": 113.553, |
|
"eval_steps_per_second": 14.369, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 272.73, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0127, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 272.73, |
|
"eval_loss": 0.0361328125, |
|
"eval_runtime": 2.8539, |
|
"eval_samples_per_second": 113.529, |
|
"eval_steps_per_second": 14.366, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 277.27, |
|
"eval_loss": 0.03619384765625, |
|
"eval_runtime": 2.8463, |
|
"eval_samples_per_second": 113.832, |
|
"eval_steps_per_second": 14.405, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 281.82, |
|
"eval_loss": 0.0361328125, |
|
"eval_runtime": 2.8549, |
|
"eval_samples_per_second": 113.491, |
|
"eval_steps_per_second": 14.362, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 284.09, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0118, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 286.36, |
|
"eval_loss": 0.035736083984375, |
|
"eval_runtime": 2.8573, |
|
"eval_samples_per_second": 113.395, |
|
"eval_steps_per_second": 14.349, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 290.91, |
|
"eval_loss": 0.036529541015625, |
|
"eval_runtime": 2.8571, |
|
"eval_samples_per_second": 113.402, |
|
"eval_steps_per_second": 14.35, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 295.45, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0114, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 295.45, |
|
"eval_loss": 0.0372314453125, |
|
"eval_runtime": 2.8553, |
|
"eval_samples_per_second": 113.473, |
|
"eval_steps_per_second": 14.359, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 300.0, |
|
"eval_loss": 0.037384033203125, |
|
"eval_runtime": 2.8451, |
|
"eval_samples_per_second": 113.881, |
|
"eval_steps_per_second": 14.411, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 304.55, |
|
"eval_loss": 0.0396728515625, |
|
"eval_runtime": 2.8561, |
|
"eval_samples_per_second": 113.44, |
|
"eval_steps_per_second": 14.355, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 306.82, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0106, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 309.09, |
|
"eval_loss": 0.036529541015625, |
|
"eval_runtime": 2.8552, |
|
"eval_samples_per_second": 113.478, |
|
"eval_steps_per_second": 14.36, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 313.64, |
|
"eval_loss": 0.036346435546875, |
|
"eval_runtime": 2.8561, |
|
"eval_samples_per_second": 113.442, |
|
"eval_steps_per_second": 14.355, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 318.18, |
|
"learning_rate": 5e-05, |
|
"loss": 0.01, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 318.18, |
|
"eval_loss": 0.037811279296875, |
|
"eval_runtime": 2.8552, |
|
"eval_samples_per_second": 113.477, |
|
"eval_steps_per_second": 14.36, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 322.73, |
|
"eval_loss": 0.036865234375, |
|
"eval_runtime": 2.8537, |
|
"eval_samples_per_second": 113.537, |
|
"eval_steps_per_second": 14.367, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 327.27, |
|
"eval_loss": 0.037261962890625, |
|
"eval_runtime": 2.8546, |
|
"eval_samples_per_second": 113.503, |
|
"eval_steps_per_second": 14.363, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 329.55, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0096, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 331.82, |
|
"eval_loss": 0.036590576171875, |
|
"eval_runtime": 2.8583, |
|
"eval_samples_per_second": 113.355, |
|
"eval_steps_per_second": 14.344, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 336.36, |
|
"eval_loss": 0.037109375, |
|
"eval_runtime": 2.8515, |
|
"eval_samples_per_second": 113.625, |
|
"eval_steps_per_second": 14.378, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 340.91, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0093, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 340.91, |
|
"eval_loss": 0.037567138671875, |
|
"eval_runtime": 2.8557, |
|
"eval_samples_per_second": 113.459, |
|
"eval_steps_per_second": 14.357, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 345.45, |
|
"eval_loss": 0.03717041015625, |
|
"eval_runtime": 2.8575, |
|
"eval_samples_per_second": 113.384, |
|
"eval_steps_per_second": 14.348, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 350.0, |
|
"eval_loss": 0.03729248046875, |
|
"eval_runtime": 2.8457, |
|
"eval_samples_per_second": 113.856, |
|
"eval_steps_per_second": 14.408, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 352.27, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0087, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 354.55, |
|
"eval_loss": 0.037506103515625, |
|
"eval_runtime": 2.8601, |
|
"eval_samples_per_second": 113.284, |
|
"eval_steps_per_second": 14.335, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 359.09, |
|
"eval_loss": 0.037445068359375, |
|
"eval_runtime": 2.8482, |
|
"eval_samples_per_second": 113.755, |
|
"eval_steps_per_second": 14.395, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 363.64, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0086, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 363.64, |
|
"eval_loss": 0.037841796875, |
|
"eval_runtime": 2.8578, |
|
"eval_samples_per_second": 113.373, |
|
"eval_steps_per_second": 14.347, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 368.18, |
|
"eval_loss": 0.03704833984375, |
|
"eval_runtime": 2.8552, |
|
"eval_samples_per_second": 113.477, |
|
"eval_steps_per_second": 14.36, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 372.73, |
|
"eval_loss": 0.0380859375, |
|
"eval_runtime": 2.851, |
|
"eval_samples_per_second": 113.644, |
|
"eval_steps_per_second": 14.381, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 375.0, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0084, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 377.27, |
|
"eval_loss": 0.037200927734375, |
|
"eval_runtime": 2.851, |
|
"eval_samples_per_second": 113.644, |
|
"eval_steps_per_second": 14.381, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 381.82, |
|
"eval_loss": 0.03680419921875, |
|
"eval_runtime": 2.8529, |
|
"eval_samples_per_second": 113.567, |
|
"eval_steps_per_second": 14.371, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 386.36, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0078, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 386.36, |
|
"eval_loss": 0.036956787109375, |
|
"eval_runtime": 2.8565, |
|
"eval_samples_per_second": 113.425, |
|
"eval_steps_per_second": 14.353, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 390.91, |
|
"eval_loss": 0.03759765625, |
|
"eval_runtime": 2.8479, |
|
"eval_samples_per_second": 113.768, |
|
"eval_steps_per_second": 14.397, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 395.45, |
|
"eval_loss": 0.038238525390625, |
|
"eval_runtime": 2.8538, |
|
"eval_samples_per_second": 113.534, |
|
"eval_steps_per_second": 14.367, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 397.73, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0076, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"eval_loss": 0.03778076171875, |
|
"eval_runtime": 2.8549, |
|
"eval_samples_per_second": 113.488, |
|
"eval_steps_per_second": 14.361, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 404.55, |
|
"eval_loss": 0.038482666015625, |
|
"eval_runtime": 2.8533, |
|
"eval_samples_per_second": 113.554, |
|
"eval_steps_per_second": 14.37, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 409.09, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0073, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 409.09, |
|
"eval_loss": 0.037872314453125, |
|
"eval_runtime": 2.8518, |
|
"eval_samples_per_second": 113.614, |
|
"eval_steps_per_second": 14.377, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 413.64, |
|
"eval_loss": 0.038787841796875, |
|
"eval_runtime": 2.8524, |
|
"eval_samples_per_second": 113.588, |
|
"eval_steps_per_second": 14.374, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 418.18, |
|
"eval_loss": 0.0379638671875, |
|
"eval_runtime": 2.848, |
|
"eval_samples_per_second": 113.766, |
|
"eval_steps_per_second": 14.396, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 420.45, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0072, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 422.73, |
|
"eval_loss": 0.0380859375, |
|
"eval_runtime": 2.8569, |
|
"eval_samples_per_second": 113.409, |
|
"eval_steps_per_second": 14.351, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 427.27, |
|
"eval_loss": 0.037506103515625, |
|
"eval_runtime": 2.8521, |
|
"eval_samples_per_second": 113.599, |
|
"eval_steps_per_second": 14.375, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 431.82, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0072, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 431.82, |
|
"eval_loss": 0.038818359375, |
|
"eval_runtime": 2.8529, |
|
"eval_samples_per_second": 113.569, |
|
"eval_steps_per_second": 14.371, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 436.36, |
|
"eval_loss": 0.0380859375, |
|
"eval_runtime": 2.8546, |
|
"eval_samples_per_second": 113.501, |
|
"eval_steps_per_second": 14.363, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 440.91, |
|
"eval_loss": 0.037322998046875, |
|
"eval_runtime": 2.8547, |
|
"eval_samples_per_second": 113.496, |
|
"eval_steps_per_second": 14.362, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 443.18, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0066, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 445.45, |
|
"eval_loss": 0.037567138671875, |
|
"eval_runtime": 2.8553, |
|
"eval_samples_per_second": 113.474, |
|
"eval_steps_per_second": 14.359, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 450.0, |
|
"eval_loss": 0.038421630859375, |
|
"eval_runtime": 2.8536, |
|
"eval_samples_per_second": 113.543, |
|
"eval_steps_per_second": 14.368, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 454.55, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0066, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 454.55, |
|
"eval_loss": 0.0384521484375, |
|
"eval_runtime": 2.8565, |
|
"eval_samples_per_second": 113.424, |
|
"eval_steps_per_second": 14.353, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 459.09, |
|
"eval_loss": 0.039764404296875, |
|
"eval_runtime": 2.858, |
|
"eval_samples_per_second": 113.365, |
|
"eval_steps_per_second": 14.346, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 463.64, |
|
"eval_loss": 0.0384521484375, |
|
"eval_runtime": 2.8472, |
|
"eval_samples_per_second": 113.796, |
|
"eval_steps_per_second": 14.4, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 465.91, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0064, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 468.18, |
|
"eval_loss": 0.038665771484375, |
|
"eval_runtime": 2.8586, |
|
"eval_samples_per_second": 113.343, |
|
"eval_steps_per_second": 14.343, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 472.73, |
|
"eval_loss": 0.038970947265625, |
|
"eval_runtime": 2.8462, |
|
"eval_samples_per_second": 113.837, |
|
"eval_steps_per_second": 14.405, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 477.27, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0063, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 477.27, |
|
"eval_loss": 0.037750244140625, |
|
"eval_runtime": 2.8576, |
|
"eval_samples_per_second": 113.383, |
|
"eval_steps_per_second": 14.348, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 481.82, |
|
"eval_loss": 0.039154052734375, |
|
"eval_runtime": 2.8571, |
|
"eval_samples_per_second": 113.4, |
|
"eval_steps_per_second": 14.35, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 486.36, |
|
"eval_loss": 0.037994384765625, |
|
"eval_runtime": 2.8502, |
|
"eval_samples_per_second": 113.674, |
|
"eval_steps_per_second": 14.385, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 488.64, |
|
"learning_rate": 5e-05, |
|
"loss": 0.006, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 490.91, |
|
"eval_loss": 0.0377197265625, |
|
"eval_runtime": 2.8583, |
|
"eval_samples_per_second": 113.356, |
|
"eval_steps_per_second": 14.344, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 495.45, |
|
"eval_loss": 0.03857421875, |
|
"eval_runtime": 2.8515, |
|
"eval_samples_per_second": 113.623, |
|
"eval_steps_per_second": 14.378, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 500.0, |
|
"learning_rate": 5e-05, |
|
"loss": 0.006, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 500.0, |
|
"eval_loss": 0.038116455078125, |
|
"eval_runtime": 2.8565, |
|
"eval_samples_per_second": 113.426, |
|
"eval_steps_per_second": 14.353, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 500.0, |
|
"step": 22000, |
|
"total_flos": 9.1976456208384e+16, |
|
"train_loss": 0.09131514180790294, |
|
"train_runtime": 19622.0994, |
|
"train_samples_per_second": 17.939, |
|
"train_steps_per_second": 1.121 |
|
} |
|
], |
|
"max_steps": 22000, |
|
"num_train_epochs": 500, |
|
"total_flos": 9.1976456208384e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|