|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 10, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 22.5, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 1.9102, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 1.9285269975662231, |
|
"eval_runtime": 2.9533, |
|
"eval_samples_per_second": 52.822, |
|
"eval_steps_per_second": 2.709, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 14.0, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 1.8734, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.907572627067566, |
|
"eval_runtime": 2.9586, |
|
"eval_samples_per_second": 52.728, |
|
"eval_steps_per_second": 2.704, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 1.77, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.8689841032028198, |
|
"eval_runtime": 2.9565, |
|
"eval_samples_per_second": 52.765, |
|
"eval_steps_per_second": 2.706, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 1.8568, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.812107801437378, |
|
"eval_runtime": 2.9619, |
|
"eval_samples_per_second": 52.669, |
|
"eval_steps_per_second": 2.701, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.7562, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.7258977890014648, |
|
"eval_runtime": 2.9652, |
|
"eval_samples_per_second": 52.61, |
|
"eval_steps_per_second": 2.698, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 1.6208, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.5986799001693726, |
|
"eval_runtime": 2.9727, |
|
"eval_samples_per_second": 52.478, |
|
"eval_steps_per_second": 2.691, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 1.4408, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.3833414316177368, |
|
"eval_runtime": 2.9613, |
|
"eval_samples_per_second": 52.679, |
|
"eval_steps_per_second": 2.701, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 1.1332, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.1267226934432983, |
|
"eval_runtime": 2.97, |
|
"eval_samples_per_second": 52.525, |
|
"eval_steps_per_second": 2.694, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 1.0659, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 1.0400179624557495, |
|
"eval_runtime": 2.9774, |
|
"eval_samples_per_second": 52.395, |
|
"eval_steps_per_second": 2.687, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9587, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.9716721773147583, |
|
"eval_runtime": 2.9692, |
|
"eval_samples_per_second": 52.54, |
|
"eval_steps_per_second": 2.694, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.9241, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 0.9269797205924988, |
|
"eval_runtime": 2.9883, |
|
"eval_samples_per_second": 52.204, |
|
"eval_steps_per_second": 2.677, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.8071, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.8944205045700073, |
|
"eval_runtime": 2.9805, |
|
"eval_samples_per_second": 52.341, |
|
"eval_steps_per_second": 2.684, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.7872, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 0.874414324760437, |
|
"eval_runtime": 2.9734, |
|
"eval_samples_per_second": 52.465, |
|
"eval_steps_per_second": 2.691, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.7182, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 0.8589532375335693, |
|
"eval_runtime": 2.9762, |
|
"eval_samples_per_second": 52.415, |
|
"eval_steps_per_second": 2.688, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 6e-06, |
|
"loss": 0.7139, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.8482792973518372, |
|
"eval_runtime": 2.974, |
|
"eval_samples_per_second": 52.455, |
|
"eval_steps_per_second": 2.69, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.7533, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.8366221785545349, |
|
"eval_runtime": 2.9775, |
|
"eval_samples_per_second": 52.393, |
|
"eval_steps_per_second": 2.687, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.625, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.808, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 0.8267260789871216, |
|
"eval_runtime": 2.9805, |
|
"eval_samples_per_second": 52.341, |
|
"eval_steps_per_second": 2.684, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.5991, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 0.8209083676338196, |
|
"eval_runtime": 2.9739, |
|
"eval_samples_per_second": 52.456, |
|
"eval_steps_per_second": 2.69, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.6681, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 0.8131274580955505, |
|
"eval_runtime": 2.9722, |
|
"eval_samples_per_second": 52.486, |
|
"eval_steps_per_second": 2.692, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6707, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.8074705600738525, |
|
"eval_runtime": 2.9825, |
|
"eval_samples_per_second": 52.305, |
|
"eval_steps_per_second": 2.682, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.7301, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 0.8011156916618347, |
|
"eval_runtime": 2.9798, |
|
"eval_samples_per_second": 52.352, |
|
"eval_steps_per_second": 2.685, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.6486, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.7965356111526489, |
|
"eval_runtime": 2.9734, |
|
"eval_samples_per_second": 52.465, |
|
"eval_steps_per_second": 2.69, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.6744, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 0.7928165197372437, |
|
"eval_runtime": 2.9833, |
|
"eval_samples_per_second": 52.29, |
|
"eval_steps_per_second": 2.682, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.7228, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.7894927859306335, |
|
"eval_runtime": 2.9769, |
|
"eval_samples_per_second": 52.403, |
|
"eval_steps_per_second": 2.687, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7532, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.7849324345588684, |
|
"eval_runtime": 2.9712, |
|
"eval_samples_per_second": 52.504, |
|
"eval_steps_per_second": 2.693, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.7019, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 0.7828314304351807, |
|
"eval_runtime": 2.979, |
|
"eval_samples_per_second": 52.366, |
|
"eval_steps_per_second": 2.685, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.0800000000000002e-05, |
|
"loss": 0.7255, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.7806087136268616, |
|
"eval_runtime": 2.9708, |
|
"eval_samples_per_second": 52.512, |
|
"eval_steps_per_second": 2.693, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.6541, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.7776769995689392, |
|
"eval_runtime": 2.972, |
|
"eval_samples_per_second": 52.489, |
|
"eval_steps_per_second": 2.692, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.16e-05, |
|
"loss": 0.6201, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 0.7737365961074829, |
|
"eval_runtime": 2.9739, |
|
"eval_samples_per_second": 52.456, |
|
"eval_steps_per_second": 2.69, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.6149, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7721969485282898, |
|
"eval_runtime": 2.9719, |
|
"eval_samples_per_second": 52.492, |
|
"eval_steps_per_second": 2.692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.2400000000000002e-05, |
|
"loss": 0.6927, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 0.7704524397850037, |
|
"eval_runtime": 2.9722, |
|
"eval_samples_per_second": 52.487, |
|
"eval_steps_per_second": 2.692, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.6573, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.7695561051368713, |
|
"eval_runtime": 2.9675, |
|
"eval_samples_per_second": 52.57, |
|
"eval_steps_per_second": 2.696, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.3200000000000002e-05, |
|
"loss": 0.6786, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 0.7669094800949097, |
|
"eval_runtime": 2.9692, |
|
"eval_samples_per_second": 52.54, |
|
"eval_steps_per_second": 2.694, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.3600000000000002e-05, |
|
"loss": 0.7484, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 0.7643001079559326, |
|
"eval_runtime": 2.9737, |
|
"eval_samples_per_second": 52.46, |
|
"eval_steps_per_second": 2.69, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.5991, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.7622258067131042, |
|
"eval_runtime": 2.9746, |
|
"eval_samples_per_second": 52.444, |
|
"eval_steps_per_second": 2.689, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.6621, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.761843204498291, |
|
"eval_runtime": 2.9802, |
|
"eval_samples_per_second": 52.345, |
|
"eval_steps_per_second": 2.684, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.48e-05, |
|
"loss": 0.697, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 0.7606173753738403, |
|
"eval_runtime": 2.9702, |
|
"eval_samples_per_second": 52.522, |
|
"eval_steps_per_second": 2.693, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.6213, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.7589120268821716, |
|
"eval_runtime": 2.9752, |
|
"eval_samples_per_second": 52.434, |
|
"eval_steps_per_second": 2.689, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.5600000000000003e-05, |
|
"loss": 0.6767, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 0.7586512565612793, |
|
"eval_runtime": 2.9725, |
|
"eval_samples_per_second": 52.482, |
|
"eval_steps_per_second": 2.691, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.8044, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7563453316688538, |
|
"eval_runtime": 2.9763, |
|
"eval_samples_per_second": 52.414, |
|
"eval_steps_per_second": 2.688, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.64e-05, |
|
"loss": 0.6338, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 0.7531482577323914, |
|
"eval_runtime": 2.9746, |
|
"eval_samples_per_second": 52.444, |
|
"eval_steps_per_second": 2.689, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 0.6242, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 0.7514854669570923, |
|
"eval_runtime": 2.9804, |
|
"eval_samples_per_second": 52.342, |
|
"eval_steps_per_second": 2.684, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.72e-05, |
|
"loss": 0.6462, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 0.7528823614120483, |
|
"eval_runtime": 2.9744, |
|
"eval_samples_per_second": 52.448, |
|
"eval_steps_per_second": 2.69, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.6832, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.7538066506385803, |
|
"eval_runtime": 2.9692, |
|
"eval_samples_per_second": 52.54, |
|
"eval_steps_per_second": 2.694, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.5959, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.7551948428153992, |
|
"eval_runtime": 2.9739, |
|
"eval_samples_per_second": 52.456, |
|
"eval_steps_per_second": 2.69, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 1.8400000000000003e-05, |
|
"loss": 0.6891, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 0.7493566870689392, |
|
"eval_runtime": 2.976, |
|
"eval_samples_per_second": 52.42, |
|
"eval_steps_per_second": 2.688, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.88e-05, |
|
"loss": 0.6819, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.7475622296333313, |
|
"eval_runtime": 2.9745, |
|
"eval_samples_per_second": 52.445, |
|
"eval_steps_per_second": 2.69, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.6456, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.7493394613265991, |
|
"eval_runtime": 2.968, |
|
"eval_samples_per_second": 52.56, |
|
"eval_steps_per_second": 2.695, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.9600000000000002e-05, |
|
"loss": 0.6415, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 0.7468904256820679, |
|
"eval_runtime": 2.972, |
|
"eval_samples_per_second": 52.491, |
|
"eval_steps_per_second": 2.692, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 2e-05, |
|
"loss": 0.658, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7486743330955505, |
|
"eval_runtime": 2.9736, |
|
"eval_samples_per_second": 52.462, |
|
"eval_steps_per_second": 2.69, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.9999756307053947e-05, |
|
"loss": 0.547, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_loss": 0.7537599205970764, |
|
"eval_runtime": 2.957, |
|
"eval_samples_per_second": 52.757, |
|
"eval_steps_per_second": 2.705, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.9999025240093045e-05, |
|
"loss": 0.5551, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 0.7580664157867432, |
|
"eval_runtime": 2.9622, |
|
"eval_samples_per_second": 52.664, |
|
"eval_steps_per_second": 2.701, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.9997806834748455e-05, |
|
"loss": 0.596, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 0.7601476907730103, |
|
"eval_runtime": 2.9678, |
|
"eval_samples_per_second": 52.564, |
|
"eval_steps_per_second": 2.696, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.9996101150403543e-05, |
|
"loss": 0.5611, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"eval_loss": 0.7624161243438721, |
|
"eval_runtime": 2.9644, |
|
"eval_samples_per_second": 52.624, |
|
"eval_steps_per_second": 2.699, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.999390827019096e-05, |
|
"loss": 0.5514, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 0.7590138912200928, |
|
"eval_runtime": 2.9709, |
|
"eval_samples_per_second": 52.51, |
|
"eval_steps_per_second": 2.693, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9991228300988586e-05, |
|
"loss": 0.5109, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.754431426525116, |
|
"eval_runtime": 2.9642, |
|
"eval_samples_per_second": 52.628, |
|
"eval_steps_per_second": 2.699, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.9988061373414342e-05, |
|
"loss": 0.5984, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_loss": 0.7549035549163818, |
|
"eval_runtime": 2.9676, |
|
"eval_samples_per_second": 52.567, |
|
"eval_steps_per_second": 2.696, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.9984407641819812e-05, |
|
"loss": 0.5346, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 0.7530075311660767, |
|
"eval_runtime": 2.9726, |
|
"eval_samples_per_second": 52.48, |
|
"eval_steps_per_second": 2.691, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.9980267284282718e-05, |
|
"loss": 0.6438, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_loss": 0.7559706568717957, |
|
"eval_runtime": 2.9783, |
|
"eval_samples_per_second": 52.378, |
|
"eval_steps_per_second": 2.686, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.5193, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7533587217330933, |
|
"eval_runtime": 2.9733, |
|
"eval_samples_per_second": 52.468, |
|
"eval_steps_per_second": 2.691, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9970527522269204e-05, |
|
"loss": 0.6232, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"eval_loss": 0.7505587339401245, |
|
"eval_runtime": 2.9729, |
|
"eval_samples_per_second": 52.474, |
|
"eval_steps_per_second": 2.691, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.9964928592495046e-05, |
|
"loss": 0.6118, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 0.747382402420044, |
|
"eval_runtime": 2.9751, |
|
"eval_samples_per_second": 52.436, |
|
"eval_steps_per_second": 2.689, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1.9958843986159705e-05, |
|
"loss": 0.6072, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 0.7488105893135071, |
|
"eval_runtime": 2.9725, |
|
"eval_samples_per_second": 52.481, |
|
"eval_steps_per_second": 2.691, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9952273999818312e-05, |
|
"loss": 0.6088, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 0.7461674809455872, |
|
"eval_runtime": 2.9722, |
|
"eval_samples_per_second": 52.486, |
|
"eval_steps_per_second": 2.692, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 0.6276, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 0.7464598417282104, |
|
"eval_runtime": 2.9781, |
|
"eval_samples_per_second": 52.383, |
|
"eval_steps_per_second": 2.686, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.9937679191605964e-05, |
|
"loss": 0.5798, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 0.7467732429504395, |
|
"eval_runtime": 2.977, |
|
"eval_samples_per_second": 52.403, |
|
"eval_steps_per_second": 2.687, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.992965508106537e-05, |
|
"loss": 0.6283, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_loss": 0.7440893054008484, |
|
"eval_runtime": 2.9772, |
|
"eval_samples_per_second": 52.398, |
|
"eval_steps_per_second": 2.687, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.9921147013144782e-05, |
|
"loss": 0.5477, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 0.745173454284668, |
|
"eval_runtime": 2.9776, |
|
"eval_samples_per_second": 52.392, |
|
"eval_steps_per_second": 2.687, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.991215540251542e-05, |
|
"loss": 0.5479, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_loss": 0.7425445318222046, |
|
"eval_runtime": 2.9811, |
|
"eval_samples_per_second": 52.33, |
|
"eval_steps_per_second": 2.684, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 0.6648, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.7418652772903442, |
|
"eval_runtime": 2.9712, |
|
"eval_samples_per_second": 52.503, |
|
"eval_steps_per_second": 2.692, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.9892723329629885e-05, |
|
"loss": 0.5988, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_loss": 0.7394148707389832, |
|
"eval_runtime": 2.9689, |
|
"eval_samples_per_second": 52.544, |
|
"eval_steps_per_second": 2.695, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.988228381446553e-05, |
|
"loss": 0.5557, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 0.7416422963142395, |
|
"eval_runtime": 2.9747, |
|
"eval_samples_per_second": 52.442, |
|
"eval_steps_per_second": 2.689, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.987136265072988e-05, |
|
"loss": 0.4506, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_loss": 0.7449523210525513, |
|
"eval_runtime": 2.9679, |
|
"eval_samples_per_second": 52.563, |
|
"eval_steps_per_second": 2.696, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.985996037070505e-05, |
|
"loss": 0.6386, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 0.7423770427703857, |
|
"eval_runtime": 2.9762, |
|
"eval_samples_per_second": 52.416, |
|
"eval_steps_per_second": 2.688, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.5889, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.7404635548591614, |
|
"eval_runtime": 2.9716, |
|
"eval_samples_per_second": 52.497, |
|
"eval_steps_per_second": 2.692, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.983571470813386e-05, |
|
"loss": 0.5561, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 0.7413467764854431, |
|
"eval_runtime": 2.9834, |
|
"eval_samples_per_second": 52.29, |
|
"eval_steps_per_second": 2.682, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.982287250728689e-05, |
|
"loss": 0.5307, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_loss": 0.7405444383621216, |
|
"eval_runtime": 2.9797, |
|
"eval_samples_per_second": 52.355, |
|
"eval_steps_per_second": 2.685, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9809551553491918e-05, |
|
"loss": 0.5464, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 0.742436945438385, |
|
"eval_runtime": 2.973, |
|
"eval_samples_per_second": 52.471, |
|
"eval_steps_per_second": 2.691, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.979575249599344e-05, |
|
"loss": 0.5726, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_loss": 0.7410677671432495, |
|
"eval_runtime": 2.968, |
|
"eval_samples_per_second": 52.561, |
|
"eval_steps_per_second": 2.695, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.6039, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.7446211576461792, |
|
"eval_runtime": 2.9788, |
|
"eval_samples_per_second": 52.37, |
|
"eval_steps_per_second": 2.686, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.9766722783341682e-05, |
|
"loss": 0.5236, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"eval_loss": 0.7396934628486633, |
|
"eval_runtime": 2.9783, |
|
"eval_samples_per_second": 52.378, |
|
"eval_steps_per_second": 2.686, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.9751493543055634e-05, |
|
"loss": 0.5176, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 0.7380808591842651, |
|
"eval_runtime": 2.9661, |
|
"eval_samples_per_second": 52.595, |
|
"eval_steps_per_second": 2.697, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.9735789028731603e-05, |
|
"loss": 0.5991, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_loss": 0.7352851629257202, |
|
"eval_runtime": 2.9801, |
|
"eval_samples_per_second": 52.348, |
|
"eval_steps_per_second": 2.685, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.9719610005785466e-05, |
|
"loss": 0.5451, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 0.7372366786003113, |
|
"eval_runtime": 2.9742, |
|
"eval_samples_per_second": 52.452, |
|
"eval_steps_per_second": 2.69, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9702957262759964e-05, |
|
"loss": 0.4484, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.7352163791656494, |
|
"eval_runtime": 2.9703, |
|
"eval_samples_per_second": 52.52, |
|
"eval_steps_per_second": 2.693, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.9685831611286312e-05, |
|
"loss": 0.5568, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"eval_loss": 0.7367239594459534, |
|
"eval_runtime": 2.9748, |
|
"eval_samples_per_second": 52.441, |
|
"eval_steps_per_second": 2.689, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.9668233886044597e-05, |
|
"loss": 0.5643, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_loss": 0.7365850806236267, |
|
"eval_runtime": 2.9727, |
|
"eval_samples_per_second": 52.477, |
|
"eval_steps_per_second": 2.691, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 1.9650164944723116e-05, |
|
"loss": 0.5341, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.7373999357223511, |
|
"eval_runtime": 2.9825, |
|
"eval_samples_per_second": 52.305, |
|
"eval_steps_per_second": 2.682, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.9631625667976584e-05, |
|
"loss": 0.5255, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"eval_loss": 0.7373459935188293, |
|
"eval_runtime": 2.9729, |
|
"eval_samples_per_second": 52.474, |
|
"eval_steps_per_second": 2.691, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.961261695938319e-05, |
|
"loss": 0.5351, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.7352133393287659, |
|
"eval_runtime": 2.9749, |
|
"eval_samples_per_second": 52.439, |
|
"eval_steps_per_second": 2.689, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.9593139745400575e-05, |
|
"loss": 0.6188, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"eval_loss": 0.7367925643920898, |
|
"eval_runtime": 2.9685, |
|
"eval_samples_per_second": 52.552, |
|
"eval_steps_per_second": 2.695, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9573194975320672e-05, |
|
"loss": 0.4968, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_loss": 0.7363703846931458, |
|
"eval_runtime": 2.9748, |
|
"eval_samples_per_second": 52.441, |
|
"eval_steps_per_second": 2.689, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.9552783621223437e-05, |
|
"loss": 0.543, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"eval_loss": 0.7352641224861145, |
|
"eval_runtime": 2.9728, |
|
"eval_samples_per_second": 52.476, |
|
"eval_steps_per_second": 2.691, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.9531906677929472e-05, |
|
"loss": 0.6299, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 0.7344778776168823, |
|
"eval_runtime": 2.994, |
|
"eval_samples_per_second": 52.104, |
|
"eval_steps_per_second": 2.672, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.6208, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.7322725057601929, |
|
"eval_runtime": 2.9757, |
|
"eval_samples_per_second": 52.425, |
|
"eval_steps_per_second": 2.688, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.9488760116444966e-05, |
|
"loss": 0.5667, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.7319491505622864, |
|
"eval_runtime": 2.9677, |
|
"eval_samples_per_second": 52.565, |
|
"eval_steps_per_second": 2.696, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9466492601156964e-05, |
|
"loss": 0.5686, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 0.7332378625869751, |
|
"eval_runtime": 2.9718, |
|
"eval_samples_per_second": 52.494, |
|
"eval_steps_per_second": 2.692, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.944376370237481e-05, |
|
"loss": 0.5745, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 0.7319948077201843, |
|
"eval_runtime": 2.9667, |
|
"eval_samples_per_second": 52.584, |
|
"eval_steps_per_second": 2.697, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.942057452787297e-05, |
|
"loss": 0.5632, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_loss": 0.7325617074966431, |
|
"eval_runtime": 2.9757, |
|
"eval_samples_per_second": 52.425, |
|
"eval_steps_per_second": 2.688, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.5558, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7342491149902344, |
|
"eval_runtime": 2.9766, |
|
"eval_samples_per_second": 52.41, |
|
"eval_steps_per_second": 2.688, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 5.34460687754199e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|