tyzhu's picture
End of training
dca9d48 verified
raw
history blame contribute delete
No virus
11.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.946666666666665,
"eval_steps": 500,
"global_step": 3740,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.5333333333333333,
"grad_norm": 0.2082328200340271,
"learning_rate": 0.0003,
"loss": 1.7516,
"step": 100
},
{
"epoch": 0.9973333333333333,
"eval_accuracy": 0.608609865470852,
"eval_loss": 1.6714181900024414,
"eval_runtime": 6.6041,
"eval_samples_per_second": 75.71,
"eval_steps_per_second": 9.54,
"step": 187
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.2450818121433258,
"learning_rate": 0.0003,
"loss": 1.6997,
"step": 200
},
{
"epoch": 1.6,
"grad_norm": 0.36302751302719116,
"learning_rate": 0.0003,
"loss": 1.5219,
"step": 300
},
{
"epoch": 2.0,
"eval_accuracy": 0.6104035874439462,
"eval_loss": 1.6736148595809937,
"eval_runtime": 7.0545,
"eval_samples_per_second": 70.877,
"eval_steps_per_second": 8.93,
"step": 375
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.4031146764755249,
"learning_rate": 0.0003,
"loss": 1.4473,
"step": 400
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.4989413917064667,
"learning_rate": 0.0003,
"loss": 1.2037,
"step": 500
},
{
"epoch": 2.997333333333333,
"eval_accuracy": 0.6081076233183856,
"eval_loss": 1.756110429763794,
"eval_runtime": 6.5632,
"eval_samples_per_second": 76.183,
"eval_steps_per_second": 9.599,
"step": 562
},
{
"epoch": 3.2,
"grad_norm": 0.5431676506996155,
"learning_rate": 0.0003,
"loss": 1.0868,
"step": 600
},
{
"epoch": 3.7333333333333334,
"grad_norm": 0.5269047021865845,
"learning_rate": 0.0003,
"loss": 0.8815,
"step": 700
},
{
"epoch": 4.0,
"eval_accuracy": 0.6032645739910314,
"eval_loss": 1.8874716758728027,
"eval_runtime": 6.1584,
"eval_samples_per_second": 81.19,
"eval_steps_per_second": 10.23,
"step": 750
},
{
"epoch": 4.266666666666667,
"grad_norm": 0.681224524974823,
"learning_rate": 0.0003,
"loss": 0.7337,
"step": 800
},
{
"epoch": 4.8,
"grad_norm": 0.7092007994651794,
"learning_rate": 0.0003,
"loss": 0.6016,
"step": 900
},
{
"epoch": 4.997333333333334,
"eval_accuracy": 0.5979730941704036,
"eval_loss": 2.076803684234619,
"eval_runtime": 6.7069,
"eval_samples_per_second": 74.55,
"eval_steps_per_second": 9.393,
"step": 937
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.6530160903930664,
"learning_rate": 0.0003,
"loss": 0.4625,
"step": 1000
},
{
"epoch": 5.866666666666667,
"grad_norm": 0.7848784923553467,
"learning_rate": 0.0003,
"loss": 0.3979,
"step": 1100
},
{
"epoch": 6.0,
"eval_accuracy": 0.5953094170403588,
"eval_loss": 2.260585069656372,
"eval_runtime": 7.0817,
"eval_samples_per_second": 70.605,
"eval_steps_per_second": 8.896,
"step": 1125
},
{
"epoch": 6.4,
"grad_norm": 0.8100391030311584,
"learning_rate": 0.0003,
"loss": 0.2797,
"step": 1200
},
{
"epoch": 6.933333333333334,
"grad_norm": 0.723619282245636,
"learning_rate": 0.0003,
"loss": 0.2591,
"step": 1300
},
{
"epoch": 6.997333333333334,
"eval_accuracy": 0.5932645739910314,
"eval_loss": 2.4669973850250244,
"eval_runtime": 7.1955,
"eval_samples_per_second": 69.488,
"eval_steps_per_second": 8.755,
"step": 1312
},
{
"epoch": 7.466666666666667,
"grad_norm": 0.6629557013511658,
"learning_rate": 0.0003,
"loss": 0.1822,
"step": 1400
},
{
"epoch": 8.0,
"grad_norm": 0.6765617728233337,
"learning_rate": 0.0003,
"loss": 0.1821,
"step": 1500
},
{
"epoch": 8.0,
"eval_accuracy": 0.592152466367713,
"eval_loss": 2.6145341396331787,
"eval_runtime": 7.0099,
"eval_samples_per_second": 71.327,
"eval_steps_per_second": 8.987,
"step": 1500
},
{
"epoch": 8.533333333333333,
"grad_norm": 0.5296387672424316,
"learning_rate": 0.0003,
"loss": 0.1338,
"step": 1600
},
{
"epoch": 8.997333333333334,
"eval_accuracy": 0.5910582959641255,
"eval_loss": 2.739866256713867,
"eval_runtime": 7.1202,
"eval_samples_per_second": 70.223,
"eval_steps_per_second": 8.848,
"step": 1687
},
{
"epoch": 9.066666666666666,
"grad_norm": 0.402245432138443,
"learning_rate": 0.0003,
"loss": 0.1407,
"step": 1700
},
{
"epoch": 9.6,
"grad_norm": 0.44858765602111816,
"learning_rate": 0.0003,
"loss": 0.1172,
"step": 1800
},
{
"epoch": 10.0,
"eval_accuracy": 0.5914618834080717,
"eval_loss": 2.833005666732788,
"eval_runtime": 7.0274,
"eval_samples_per_second": 71.15,
"eval_steps_per_second": 8.965,
"step": 1875
},
{
"epoch": 10.133333333333333,
"grad_norm": 0.4508216381072998,
"learning_rate": 0.0003,
"loss": 0.1195,
"step": 1900
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.37036266922950745,
"learning_rate": 0.0003,
"loss": 0.1102,
"step": 2000
},
{
"epoch": 10.997333333333334,
"eval_accuracy": 0.5914349775784753,
"eval_loss": 2.8673934936523438,
"eval_runtime": 6.5903,
"eval_samples_per_second": 75.869,
"eval_steps_per_second": 9.56,
"step": 2062
},
{
"epoch": 11.2,
"grad_norm": 0.3041936457157135,
"learning_rate": 0.0003,
"loss": 0.1131,
"step": 2100
},
{
"epoch": 11.733333333333333,
"grad_norm": 0.5073165893554688,
"learning_rate": 0.0003,
"loss": 0.1079,
"step": 2200
},
{
"epoch": 12.0,
"eval_accuracy": 0.5903497757847533,
"eval_loss": 2.894710063934326,
"eval_runtime": 7.0771,
"eval_samples_per_second": 70.65,
"eval_steps_per_second": 8.902,
"step": 2250
},
{
"epoch": 12.266666666666667,
"grad_norm": 0.4313170909881592,
"learning_rate": 0.0003,
"loss": 0.1083,
"step": 2300
},
{
"epoch": 12.8,
"grad_norm": 0.4307994544506073,
"learning_rate": 0.0003,
"loss": 0.11,
"step": 2400
},
{
"epoch": 12.997333333333334,
"eval_accuracy": 0.589354260089686,
"eval_loss": 2.9230430126190186,
"eval_runtime": 7.1832,
"eval_samples_per_second": 69.607,
"eval_steps_per_second": 8.771,
"step": 2437
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.38372334837913513,
"learning_rate": 0.0003,
"loss": 0.1082,
"step": 2500
},
{
"epoch": 13.866666666666667,
"grad_norm": 0.49263113737106323,
"learning_rate": 0.0003,
"loss": 0.1136,
"step": 2600
},
{
"epoch": 14.0,
"eval_accuracy": 0.5888340807174888,
"eval_loss": 2.9049084186553955,
"eval_runtime": 6.7163,
"eval_samples_per_second": 74.446,
"eval_steps_per_second": 9.38,
"step": 2625
},
{
"epoch": 14.4,
"grad_norm": 0.42505690455436707,
"learning_rate": 0.0003,
"loss": 0.1086,
"step": 2700
},
{
"epoch": 14.933333333333334,
"grad_norm": 0.6179661750793457,
"learning_rate": 0.0003,
"loss": 0.1173,
"step": 2800
},
{
"epoch": 14.997333333333334,
"eval_accuracy": 0.5882511210762332,
"eval_loss": 2.8788018226623535,
"eval_runtime": 6.217,
"eval_samples_per_second": 80.425,
"eval_steps_per_second": 10.133,
"step": 2812
},
{
"epoch": 15.466666666666667,
"grad_norm": 0.45964017510414124,
"learning_rate": 0.0003,
"loss": 0.109,
"step": 2900
},
{
"epoch": 16.0,
"grad_norm": 0.5801168084144592,
"learning_rate": 0.0003,
"loss": 0.1163,
"step": 3000
},
{
"epoch": 16.0,
"eval_accuracy": 0.589237668161435,
"eval_loss": 2.9582040309906006,
"eval_runtime": 7.1226,
"eval_samples_per_second": 70.199,
"eval_steps_per_second": 8.845,
"step": 3000
},
{
"epoch": 16.533333333333335,
"grad_norm": 0.4587346315383911,
"learning_rate": 0.0003,
"loss": 0.1047,
"step": 3100
},
{
"epoch": 16.997333333333334,
"eval_accuracy": 0.5885650224215246,
"eval_loss": 2.9484808444976807,
"eval_runtime": 6.6834,
"eval_samples_per_second": 74.812,
"eval_steps_per_second": 9.426,
"step": 3187
},
{
"epoch": 17.066666666666666,
"grad_norm": 0.4009888470172882,
"learning_rate": 0.0003,
"loss": 0.1145,
"step": 3200
},
{
"epoch": 17.6,
"grad_norm": 1.4840149879455566,
"learning_rate": 0.0003,
"loss": 0.1044,
"step": 3300
},
{
"epoch": 18.0,
"eval_accuracy": 0.5894080717488789,
"eval_loss": 2.9815316200256348,
"eval_runtime": 7.2254,
"eval_samples_per_second": 69.2,
"eval_steps_per_second": 8.719,
"step": 3375
},
{
"epoch": 18.133333333333333,
"grad_norm": 0.4150511920452118,
"learning_rate": 0.0003,
"loss": 0.1111,
"step": 3400
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.47382423281669617,
"learning_rate": 0.0003,
"loss": 0.105,
"step": 3500
},
{
"epoch": 18.997333333333334,
"eval_accuracy": 0.5881076233183856,
"eval_loss": 2.987971544265747,
"eval_runtime": 7.3831,
"eval_samples_per_second": 67.722,
"eval_steps_per_second": 8.533,
"step": 3562
},
{
"epoch": 19.2,
"grad_norm": 0.4355124533176422,
"learning_rate": 0.0003,
"loss": 0.1068,
"step": 3600
},
{
"epoch": 19.733333333333334,
"grad_norm": 0.43823131918907166,
"learning_rate": 0.0003,
"loss": 0.1036,
"step": 3700
},
{
"epoch": 19.946666666666665,
"eval_accuracy": 0.5885829596412556,
"eval_loss": 3.0184407234191895,
"eval_runtime": 6.2011,
"eval_samples_per_second": 80.631,
"eval_steps_per_second": 10.159,
"step": 3740
},
{
"epoch": 19.946666666666665,
"step": 3740,
"total_flos": 3.767212755417825e+17,
"train_loss": 0.406913380316872,
"train_runtime": 8744.2637,
"train_samples_per_second": 13.723,
"train_steps_per_second": 0.428
}
],
"logging_steps": 100,
"max_steps": 3740,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 3.767212755417825e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}