|
{ |
|
"best_metric": 0.23246362805366516, |
|
"best_model_checkpoint": "./fine-tuned/checkpoint-2000", |
|
"epoch": 3.99667497921862, |
|
"eval_steps": 100, |
|
"global_step": 2404, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0831255195344971, |
|
"grad_norm": 36186.3984375, |
|
"learning_rate": 2.937603993344426e-05, |
|
"loss": 1.1504, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1662510390689942, |
|
"grad_norm": 26857.228515625, |
|
"learning_rate": 2.8752079866888522e-05, |
|
"loss": 0.4993, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1662510390689942, |
|
"eval_loss": 0.35283052921295166, |
|
"eval_runtime": 38.0523, |
|
"eval_samples_per_second": 12.982, |
|
"eval_steps_per_second": 1.629, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24937655860349128, |
|
"grad_norm": 53959.0078125, |
|
"learning_rate": 2.812811980033278e-05, |
|
"loss": 0.4236, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3325020781379884, |
|
"grad_norm": 32498.453125, |
|
"learning_rate": 2.7504159733777037e-05, |
|
"loss": 0.376, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3325020781379884, |
|
"eval_loss": 0.30865946412086487, |
|
"eval_runtime": 38.2162, |
|
"eval_samples_per_second": 12.926, |
|
"eval_steps_per_second": 1.622, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.41562759767248547, |
|
"grad_norm": 32577.630859375, |
|
"learning_rate": 2.6880199667221298e-05, |
|
"loss": 0.3822, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.49875311720698257, |
|
"grad_norm": 28815.4609375, |
|
"learning_rate": 2.625623960066556e-05, |
|
"loss": 0.3569, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.49875311720698257, |
|
"eval_loss": 0.2891499996185303, |
|
"eval_runtime": 38.1858, |
|
"eval_samples_per_second": 12.937, |
|
"eval_steps_per_second": 1.624, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5818786367414797, |
|
"grad_norm": 33334.5546875, |
|
"learning_rate": 2.563227953410982e-05, |
|
"loss": 0.3414, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6650041562759768, |
|
"grad_norm": 23824.673828125, |
|
"learning_rate": 2.5008319467554077e-05, |
|
"loss": 0.3671, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6650041562759768, |
|
"eval_loss": 0.2757515609264374, |
|
"eval_runtime": 38.2247, |
|
"eval_samples_per_second": 12.924, |
|
"eval_steps_per_second": 1.622, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7481296758104738, |
|
"grad_norm": 24386.048828125, |
|
"learning_rate": 2.4384359400998338e-05, |
|
"loss": 0.3234, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8312551953449709, |
|
"grad_norm": 19840.703125, |
|
"learning_rate": 2.3760399334442595e-05, |
|
"loss": 0.3299, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8312551953449709, |
|
"eval_loss": 0.2673098146915436, |
|
"eval_runtime": 38.0207, |
|
"eval_samples_per_second": 12.993, |
|
"eval_steps_per_second": 1.631, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.914380714879468, |
|
"grad_norm": 37282.65625, |
|
"learning_rate": 2.3136439267886856e-05, |
|
"loss": 0.3215, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9975062344139651, |
|
"grad_norm": 21197.52734375, |
|
"learning_rate": 2.2512479201331116e-05, |
|
"loss": 0.3407, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9975062344139651, |
|
"eval_loss": 0.26070085167884827, |
|
"eval_runtime": 37.7694, |
|
"eval_samples_per_second": 13.079, |
|
"eval_steps_per_second": 1.642, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.0806317539484622, |
|
"grad_norm": 21656.6328125, |
|
"learning_rate": 2.1888519134775374e-05, |
|
"loss": 0.3191, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.1637572734829593, |
|
"grad_norm": 22402.8125, |
|
"learning_rate": 2.1264559068219635e-05, |
|
"loss": 0.299, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1637572734829593, |
|
"eval_loss": 0.256939560174942, |
|
"eval_runtime": 38.0385, |
|
"eval_samples_per_second": 12.987, |
|
"eval_steps_per_second": 1.63, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.2468827930174564, |
|
"grad_norm": 49383.11328125, |
|
"learning_rate": 2.0640599001663895e-05, |
|
"loss": 0.3131, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3300083125519535, |
|
"grad_norm": 27295.173828125, |
|
"learning_rate": 2.0016638935108153e-05, |
|
"loss": 0.3149, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.3300083125519535, |
|
"eval_loss": 0.2525966763496399, |
|
"eval_runtime": 38.1831, |
|
"eval_samples_per_second": 12.938, |
|
"eval_steps_per_second": 1.624, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4131338320864506, |
|
"grad_norm": 15751.75390625, |
|
"learning_rate": 1.9392678868552414e-05, |
|
"loss": 0.2845, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.4962593516209477, |
|
"grad_norm": 25327.384765625, |
|
"learning_rate": 1.8768718801996674e-05, |
|
"loss": 0.2945, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4962593516209477, |
|
"eval_loss": 0.24994711577892303, |
|
"eval_runtime": 37.4678, |
|
"eval_samples_per_second": 13.185, |
|
"eval_steps_per_second": 1.655, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.5793848711554448, |
|
"grad_norm": 25815.150390625, |
|
"learning_rate": 1.8144758735440932e-05, |
|
"loss": 0.3005, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.6625103906899419, |
|
"grad_norm": 24868.796875, |
|
"learning_rate": 1.7520798668885192e-05, |
|
"loss": 0.3129, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6625103906899419, |
|
"eval_loss": 0.246443971991539, |
|
"eval_runtime": 37.8324, |
|
"eval_samples_per_second": 13.058, |
|
"eval_steps_per_second": 1.639, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.745635910224439, |
|
"grad_norm": 32112.76171875, |
|
"learning_rate": 1.6896838602329453e-05, |
|
"loss": 0.2959, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.828761429758936, |
|
"grad_norm": 22870.244140625, |
|
"learning_rate": 1.627287853577371e-05, |
|
"loss": 0.2896, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.828761429758936, |
|
"eval_loss": 0.24409395456314087, |
|
"eval_runtime": 37.7821, |
|
"eval_samples_per_second": 13.075, |
|
"eval_steps_per_second": 1.641, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.9118869492934332, |
|
"grad_norm": 30326.173828125, |
|
"learning_rate": 1.564891846921797e-05, |
|
"loss": 0.287, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.9950124688279303, |
|
"grad_norm": 20373.96875, |
|
"learning_rate": 1.502495840266223e-05, |
|
"loss": 0.2847, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.9950124688279303, |
|
"eval_loss": 0.2411041557788849, |
|
"eval_runtime": 37.3119, |
|
"eval_samples_per_second": 13.24, |
|
"eval_steps_per_second": 1.662, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.0781379883624274, |
|
"grad_norm": 13517.228515625, |
|
"learning_rate": 1.440099833610649e-05, |
|
"loss": 0.2754, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.1612635078969245, |
|
"grad_norm": 37159.10546875, |
|
"learning_rate": 1.3777038269550749e-05, |
|
"loss": 0.2841, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.1612635078969245, |
|
"eval_loss": 0.23963774740695953, |
|
"eval_runtime": 37.7867, |
|
"eval_samples_per_second": 13.073, |
|
"eval_steps_per_second": 1.641, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.2443890274314215, |
|
"grad_norm": 13807.1201171875, |
|
"learning_rate": 1.315307820299501e-05, |
|
"loss": 0.2831, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.3275145469659186, |
|
"grad_norm": 70717.4296875, |
|
"learning_rate": 1.2529118136439268e-05, |
|
"loss": 0.2926, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.3275145469659186, |
|
"eval_loss": 0.23829442262649536, |
|
"eval_runtime": 37.7411, |
|
"eval_samples_per_second": 13.089, |
|
"eval_steps_per_second": 1.643, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.4106400665004157, |
|
"grad_norm": 20201.111328125, |
|
"learning_rate": 1.1905158069883528e-05, |
|
"loss": 0.285, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.493765586034913, |
|
"grad_norm": 28600.62109375, |
|
"learning_rate": 1.1281198003327787e-05, |
|
"loss": 0.2593, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.493765586034913, |
|
"eval_loss": 0.2369847148656845, |
|
"eval_runtime": 38.1397, |
|
"eval_samples_per_second": 12.952, |
|
"eval_steps_per_second": 1.626, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.57689110556941, |
|
"grad_norm": 22271.5, |
|
"learning_rate": 1.0657237936772047e-05, |
|
"loss": 0.2684, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.660016625103907, |
|
"grad_norm": 17982.9140625, |
|
"learning_rate": 1.0033277870216307e-05, |
|
"loss": 0.2753, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.660016625103907, |
|
"eval_loss": 0.23503336310386658, |
|
"eval_runtime": 37.7531, |
|
"eval_samples_per_second": 13.085, |
|
"eval_steps_per_second": 1.642, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.743142144638404, |
|
"grad_norm": 20275.65625, |
|
"learning_rate": 9.409317803660566e-06, |
|
"loss": 0.282, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.826267664172901, |
|
"grad_norm": 21899.2109375, |
|
"learning_rate": 8.785357737104826e-06, |
|
"loss": 0.2699, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.826267664172901, |
|
"eval_loss": 0.23422521352767944, |
|
"eval_runtime": 37.8729, |
|
"eval_samples_per_second": 13.044, |
|
"eval_steps_per_second": 1.637, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.9093931837073983, |
|
"grad_norm": 17812.615234375, |
|
"learning_rate": 8.161397670549084e-06, |
|
"loss": 0.277, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.9925187032418954, |
|
"grad_norm": 18110.896484375, |
|
"learning_rate": 7.5374376039933445e-06, |
|
"loss": 0.2673, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.9925187032418954, |
|
"eval_loss": 0.23330263793468475, |
|
"eval_runtime": 37.8678, |
|
"eval_samples_per_second": 13.045, |
|
"eval_steps_per_second": 1.637, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.0756442227763925, |
|
"grad_norm": 64865.62109375, |
|
"learning_rate": 6.913477537437604e-06, |
|
"loss": 0.2742, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.1587697423108896, |
|
"grad_norm": 22536.302734375, |
|
"learning_rate": 6.289517470881864e-06, |
|
"loss": 0.2723, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.1587697423108896, |
|
"eval_loss": 0.23302872478961945, |
|
"eval_runtime": 38.1436, |
|
"eval_samples_per_second": 12.951, |
|
"eval_steps_per_second": 1.625, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.2418952618453867, |
|
"grad_norm": 26661.78125, |
|
"learning_rate": 5.6655574043261234e-06, |
|
"loss": 0.273, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.3250207813798838, |
|
"grad_norm": 39719.59765625, |
|
"learning_rate": 5.0415973377703825e-06, |
|
"loss": 0.2746, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.3250207813798838, |
|
"eval_loss": 0.23246362805366516, |
|
"eval_runtime": 37.97, |
|
"eval_samples_per_second": 13.01, |
|
"eval_steps_per_second": 1.633, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.408146300914381, |
|
"grad_norm": 19064.091796875, |
|
"learning_rate": 4.4176372712146424e-06, |
|
"loss": 0.2531, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.491271820448878, |
|
"grad_norm": 24487.681640625, |
|
"learning_rate": 3.793677204658902e-06, |
|
"loss": 0.2763, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.491271820448878, |
|
"eval_loss": 0.23180559277534485, |
|
"eval_runtime": 37.9308, |
|
"eval_samples_per_second": 13.024, |
|
"eval_steps_per_second": 1.635, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.574397339983375, |
|
"grad_norm": 33160.66015625, |
|
"learning_rate": 3.1697171381031614e-06, |
|
"loss": 0.2706, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.657522859517872, |
|
"grad_norm": 20284.03515625, |
|
"learning_rate": 2.545757071547421e-06, |
|
"loss": 0.2521, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.657522859517872, |
|
"eval_loss": 0.23114623129367828, |
|
"eval_runtime": 38.1017, |
|
"eval_samples_per_second": 12.965, |
|
"eval_steps_per_second": 1.627, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.7406483790523692, |
|
"grad_norm": 55974.03125, |
|
"learning_rate": 1.9217970049916804e-06, |
|
"loss": 0.2542, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.8237738985868663, |
|
"grad_norm": 18724.478515625, |
|
"learning_rate": 1.2978369384359402e-06, |
|
"loss": 0.2684, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.8237738985868663, |
|
"eval_loss": 0.23083852231502533, |
|
"eval_runtime": 38.0527, |
|
"eval_samples_per_second": 12.982, |
|
"eval_steps_per_second": 1.629, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.9068994181213634, |
|
"grad_norm": 26152.619140625, |
|
"learning_rate": 6.738768718801997e-07, |
|
"loss": 0.2582, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.9900249376558605, |
|
"grad_norm": 20345.572265625, |
|
"learning_rate": 4.9916805324459236e-08, |
|
"loss": 0.2529, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.9900249376558605, |
|
"eval_loss": 0.23079748451709747, |
|
"eval_runtime": 37.7401, |
|
"eval_samples_per_second": 13.09, |
|
"eval_steps_per_second": 1.643, |
|
"step": 2400 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2404, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.342112942882816e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|