Rakhman16's picture
Training in progress, step 2404, checkpoint
abb5cfb verified
{
"best_metric": 0.23246362805366516,
"best_model_checkpoint": "./fine-tuned/checkpoint-2000",
"epoch": 3.99667497921862,
"eval_steps": 100,
"global_step": 2404,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0831255195344971,
"grad_norm": 36186.3984375,
"learning_rate": 2.937603993344426e-05,
"loss": 1.1504,
"step": 50
},
{
"epoch": 0.1662510390689942,
"grad_norm": 26857.228515625,
"learning_rate": 2.8752079866888522e-05,
"loss": 0.4993,
"step": 100
},
{
"epoch": 0.1662510390689942,
"eval_loss": 0.35283052921295166,
"eval_runtime": 38.0523,
"eval_samples_per_second": 12.982,
"eval_steps_per_second": 1.629,
"step": 100
},
{
"epoch": 0.24937655860349128,
"grad_norm": 53959.0078125,
"learning_rate": 2.812811980033278e-05,
"loss": 0.4236,
"step": 150
},
{
"epoch": 0.3325020781379884,
"grad_norm": 32498.453125,
"learning_rate": 2.7504159733777037e-05,
"loss": 0.376,
"step": 200
},
{
"epoch": 0.3325020781379884,
"eval_loss": 0.30865946412086487,
"eval_runtime": 38.2162,
"eval_samples_per_second": 12.926,
"eval_steps_per_second": 1.622,
"step": 200
},
{
"epoch": 0.41562759767248547,
"grad_norm": 32577.630859375,
"learning_rate": 2.6880199667221298e-05,
"loss": 0.3822,
"step": 250
},
{
"epoch": 0.49875311720698257,
"grad_norm": 28815.4609375,
"learning_rate": 2.625623960066556e-05,
"loss": 0.3569,
"step": 300
},
{
"epoch": 0.49875311720698257,
"eval_loss": 0.2891499996185303,
"eval_runtime": 38.1858,
"eval_samples_per_second": 12.937,
"eval_steps_per_second": 1.624,
"step": 300
},
{
"epoch": 0.5818786367414797,
"grad_norm": 33334.5546875,
"learning_rate": 2.563227953410982e-05,
"loss": 0.3414,
"step": 350
},
{
"epoch": 0.6650041562759768,
"grad_norm": 23824.673828125,
"learning_rate": 2.5008319467554077e-05,
"loss": 0.3671,
"step": 400
},
{
"epoch": 0.6650041562759768,
"eval_loss": 0.2757515609264374,
"eval_runtime": 38.2247,
"eval_samples_per_second": 12.924,
"eval_steps_per_second": 1.622,
"step": 400
},
{
"epoch": 0.7481296758104738,
"grad_norm": 24386.048828125,
"learning_rate": 2.4384359400998338e-05,
"loss": 0.3234,
"step": 450
},
{
"epoch": 0.8312551953449709,
"grad_norm": 19840.703125,
"learning_rate": 2.3760399334442595e-05,
"loss": 0.3299,
"step": 500
},
{
"epoch": 0.8312551953449709,
"eval_loss": 0.2673098146915436,
"eval_runtime": 38.0207,
"eval_samples_per_second": 12.993,
"eval_steps_per_second": 1.631,
"step": 500
},
{
"epoch": 0.914380714879468,
"grad_norm": 37282.65625,
"learning_rate": 2.3136439267886856e-05,
"loss": 0.3215,
"step": 550
},
{
"epoch": 0.9975062344139651,
"grad_norm": 21197.52734375,
"learning_rate": 2.2512479201331116e-05,
"loss": 0.3407,
"step": 600
},
{
"epoch": 0.9975062344139651,
"eval_loss": 0.26070085167884827,
"eval_runtime": 37.7694,
"eval_samples_per_second": 13.079,
"eval_steps_per_second": 1.642,
"step": 600
},
{
"epoch": 1.0806317539484622,
"grad_norm": 21656.6328125,
"learning_rate": 2.1888519134775374e-05,
"loss": 0.3191,
"step": 650
},
{
"epoch": 1.1637572734829593,
"grad_norm": 22402.8125,
"learning_rate": 2.1264559068219635e-05,
"loss": 0.299,
"step": 700
},
{
"epoch": 1.1637572734829593,
"eval_loss": 0.256939560174942,
"eval_runtime": 38.0385,
"eval_samples_per_second": 12.987,
"eval_steps_per_second": 1.63,
"step": 700
},
{
"epoch": 1.2468827930174564,
"grad_norm": 49383.11328125,
"learning_rate": 2.0640599001663895e-05,
"loss": 0.3131,
"step": 750
},
{
"epoch": 1.3300083125519535,
"grad_norm": 27295.173828125,
"learning_rate": 2.0016638935108153e-05,
"loss": 0.3149,
"step": 800
},
{
"epoch": 1.3300083125519535,
"eval_loss": 0.2525966763496399,
"eval_runtime": 38.1831,
"eval_samples_per_second": 12.938,
"eval_steps_per_second": 1.624,
"step": 800
},
{
"epoch": 1.4131338320864506,
"grad_norm": 15751.75390625,
"learning_rate": 1.9392678868552414e-05,
"loss": 0.2845,
"step": 850
},
{
"epoch": 1.4962593516209477,
"grad_norm": 25327.384765625,
"learning_rate": 1.8768718801996674e-05,
"loss": 0.2945,
"step": 900
},
{
"epoch": 1.4962593516209477,
"eval_loss": 0.24994711577892303,
"eval_runtime": 37.4678,
"eval_samples_per_second": 13.185,
"eval_steps_per_second": 1.655,
"step": 900
},
{
"epoch": 1.5793848711554448,
"grad_norm": 25815.150390625,
"learning_rate": 1.8144758735440932e-05,
"loss": 0.3005,
"step": 950
},
{
"epoch": 1.6625103906899419,
"grad_norm": 24868.796875,
"learning_rate": 1.7520798668885192e-05,
"loss": 0.3129,
"step": 1000
},
{
"epoch": 1.6625103906899419,
"eval_loss": 0.246443971991539,
"eval_runtime": 37.8324,
"eval_samples_per_second": 13.058,
"eval_steps_per_second": 1.639,
"step": 1000
},
{
"epoch": 1.745635910224439,
"grad_norm": 32112.76171875,
"learning_rate": 1.6896838602329453e-05,
"loss": 0.2959,
"step": 1050
},
{
"epoch": 1.828761429758936,
"grad_norm": 22870.244140625,
"learning_rate": 1.627287853577371e-05,
"loss": 0.2896,
"step": 1100
},
{
"epoch": 1.828761429758936,
"eval_loss": 0.24409395456314087,
"eval_runtime": 37.7821,
"eval_samples_per_second": 13.075,
"eval_steps_per_second": 1.641,
"step": 1100
},
{
"epoch": 1.9118869492934332,
"grad_norm": 30326.173828125,
"learning_rate": 1.564891846921797e-05,
"loss": 0.287,
"step": 1150
},
{
"epoch": 1.9950124688279303,
"grad_norm": 20373.96875,
"learning_rate": 1.502495840266223e-05,
"loss": 0.2847,
"step": 1200
},
{
"epoch": 1.9950124688279303,
"eval_loss": 0.2411041557788849,
"eval_runtime": 37.3119,
"eval_samples_per_second": 13.24,
"eval_steps_per_second": 1.662,
"step": 1200
},
{
"epoch": 2.0781379883624274,
"grad_norm": 13517.228515625,
"learning_rate": 1.440099833610649e-05,
"loss": 0.2754,
"step": 1250
},
{
"epoch": 2.1612635078969245,
"grad_norm": 37159.10546875,
"learning_rate": 1.3777038269550749e-05,
"loss": 0.2841,
"step": 1300
},
{
"epoch": 2.1612635078969245,
"eval_loss": 0.23963774740695953,
"eval_runtime": 37.7867,
"eval_samples_per_second": 13.073,
"eval_steps_per_second": 1.641,
"step": 1300
},
{
"epoch": 2.2443890274314215,
"grad_norm": 13807.1201171875,
"learning_rate": 1.315307820299501e-05,
"loss": 0.2831,
"step": 1350
},
{
"epoch": 2.3275145469659186,
"grad_norm": 70717.4296875,
"learning_rate": 1.2529118136439268e-05,
"loss": 0.2926,
"step": 1400
},
{
"epoch": 2.3275145469659186,
"eval_loss": 0.23829442262649536,
"eval_runtime": 37.7411,
"eval_samples_per_second": 13.089,
"eval_steps_per_second": 1.643,
"step": 1400
},
{
"epoch": 2.4106400665004157,
"grad_norm": 20201.111328125,
"learning_rate": 1.1905158069883528e-05,
"loss": 0.285,
"step": 1450
},
{
"epoch": 2.493765586034913,
"grad_norm": 28600.62109375,
"learning_rate": 1.1281198003327787e-05,
"loss": 0.2593,
"step": 1500
},
{
"epoch": 2.493765586034913,
"eval_loss": 0.2369847148656845,
"eval_runtime": 38.1397,
"eval_samples_per_second": 12.952,
"eval_steps_per_second": 1.626,
"step": 1500
},
{
"epoch": 2.57689110556941,
"grad_norm": 22271.5,
"learning_rate": 1.0657237936772047e-05,
"loss": 0.2684,
"step": 1550
},
{
"epoch": 2.660016625103907,
"grad_norm": 17982.9140625,
"learning_rate": 1.0033277870216307e-05,
"loss": 0.2753,
"step": 1600
},
{
"epoch": 2.660016625103907,
"eval_loss": 0.23503336310386658,
"eval_runtime": 37.7531,
"eval_samples_per_second": 13.085,
"eval_steps_per_second": 1.642,
"step": 1600
},
{
"epoch": 2.743142144638404,
"grad_norm": 20275.65625,
"learning_rate": 9.409317803660566e-06,
"loss": 0.282,
"step": 1650
},
{
"epoch": 2.826267664172901,
"grad_norm": 21899.2109375,
"learning_rate": 8.785357737104826e-06,
"loss": 0.2699,
"step": 1700
},
{
"epoch": 2.826267664172901,
"eval_loss": 0.23422521352767944,
"eval_runtime": 37.8729,
"eval_samples_per_second": 13.044,
"eval_steps_per_second": 1.637,
"step": 1700
},
{
"epoch": 2.9093931837073983,
"grad_norm": 17812.615234375,
"learning_rate": 8.161397670549084e-06,
"loss": 0.277,
"step": 1750
},
{
"epoch": 2.9925187032418954,
"grad_norm": 18110.896484375,
"learning_rate": 7.5374376039933445e-06,
"loss": 0.2673,
"step": 1800
},
{
"epoch": 2.9925187032418954,
"eval_loss": 0.23330263793468475,
"eval_runtime": 37.8678,
"eval_samples_per_second": 13.045,
"eval_steps_per_second": 1.637,
"step": 1800
},
{
"epoch": 3.0756442227763925,
"grad_norm": 64865.62109375,
"learning_rate": 6.913477537437604e-06,
"loss": 0.2742,
"step": 1850
},
{
"epoch": 3.1587697423108896,
"grad_norm": 22536.302734375,
"learning_rate": 6.289517470881864e-06,
"loss": 0.2723,
"step": 1900
},
{
"epoch": 3.1587697423108896,
"eval_loss": 0.23302872478961945,
"eval_runtime": 38.1436,
"eval_samples_per_second": 12.951,
"eval_steps_per_second": 1.625,
"step": 1900
},
{
"epoch": 3.2418952618453867,
"grad_norm": 26661.78125,
"learning_rate": 5.6655574043261234e-06,
"loss": 0.273,
"step": 1950
},
{
"epoch": 3.3250207813798838,
"grad_norm": 39719.59765625,
"learning_rate": 5.0415973377703825e-06,
"loss": 0.2746,
"step": 2000
},
{
"epoch": 3.3250207813798838,
"eval_loss": 0.23246362805366516,
"eval_runtime": 37.97,
"eval_samples_per_second": 13.01,
"eval_steps_per_second": 1.633,
"step": 2000
},
{
"epoch": 3.408146300914381,
"grad_norm": 19064.091796875,
"learning_rate": 4.4176372712146424e-06,
"loss": 0.2531,
"step": 2050
},
{
"epoch": 3.491271820448878,
"grad_norm": 24487.681640625,
"learning_rate": 3.793677204658902e-06,
"loss": 0.2763,
"step": 2100
},
{
"epoch": 3.491271820448878,
"eval_loss": 0.23180559277534485,
"eval_runtime": 37.9308,
"eval_samples_per_second": 13.024,
"eval_steps_per_second": 1.635,
"step": 2100
},
{
"epoch": 3.574397339983375,
"grad_norm": 33160.66015625,
"learning_rate": 3.1697171381031614e-06,
"loss": 0.2706,
"step": 2150
},
{
"epoch": 3.657522859517872,
"grad_norm": 20284.03515625,
"learning_rate": 2.545757071547421e-06,
"loss": 0.2521,
"step": 2200
},
{
"epoch": 3.657522859517872,
"eval_loss": 0.23114623129367828,
"eval_runtime": 38.1017,
"eval_samples_per_second": 12.965,
"eval_steps_per_second": 1.627,
"step": 2200
},
{
"epoch": 3.7406483790523692,
"grad_norm": 55974.03125,
"learning_rate": 1.9217970049916804e-06,
"loss": 0.2542,
"step": 2250
},
{
"epoch": 3.8237738985868663,
"grad_norm": 18724.478515625,
"learning_rate": 1.2978369384359402e-06,
"loss": 0.2684,
"step": 2300
},
{
"epoch": 3.8237738985868663,
"eval_loss": 0.23083852231502533,
"eval_runtime": 38.0527,
"eval_samples_per_second": 12.982,
"eval_steps_per_second": 1.629,
"step": 2300
},
{
"epoch": 3.9068994181213634,
"grad_norm": 26152.619140625,
"learning_rate": 6.738768718801997e-07,
"loss": 0.2582,
"step": 2350
},
{
"epoch": 3.9900249376558605,
"grad_norm": 20345.572265625,
"learning_rate": 4.9916805324459236e-08,
"loss": 0.2529,
"step": 2400
},
{
"epoch": 3.9900249376558605,
"eval_loss": 0.23079748451709747,
"eval_runtime": 37.7401,
"eval_samples_per_second": 13.09,
"eval_steps_per_second": 1.643,
"step": 2400
}
],
"logging_steps": 50,
"max_steps": 2404,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.342112942882816e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}