sedrickkeh's picture
End of training
3850af7 verified
raw
history blame
9.66 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.990490124359912,
"eval_steps": 500,
"global_step": 510,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.058522311631309436,
"grad_norm": 1.5647390529813434,
"learning_rate": 5e-06,
"loss": 0.8022,
"step": 10
},
{
"epoch": 0.11704462326261887,
"grad_norm": 2.4298906740936874,
"learning_rate": 5e-06,
"loss": 0.7306,
"step": 20
},
{
"epoch": 0.1755669348939283,
"grad_norm": 1.1278550883615037,
"learning_rate": 5e-06,
"loss": 0.7122,
"step": 30
},
{
"epoch": 0.23408924652523774,
"grad_norm": 1.1246574687423805,
"learning_rate": 5e-06,
"loss": 0.6975,
"step": 40
},
{
"epoch": 0.29261155815654716,
"grad_norm": 1.0811775432928663,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 50
},
{
"epoch": 0.3511338697878566,
"grad_norm": 0.8345121386846462,
"learning_rate": 5e-06,
"loss": 0.6822,
"step": 60
},
{
"epoch": 0.40965618141916604,
"grad_norm": 0.502423577542533,
"learning_rate": 5e-06,
"loss": 0.6631,
"step": 70
},
{
"epoch": 0.4681784930504755,
"grad_norm": 0.3206403744702351,
"learning_rate": 5e-06,
"loss": 0.6566,
"step": 80
},
{
"epoch": 0.5267008046817849,
"grad_norm": 0.3373586439028653,
"learning_rate": 5e-06,
"loss": 0.6613,
"step": 90
},
{
"epoch": 0.5852231163130943,
"grad_norm": 0.27440465078078524,
"learning_rate": 5e-06,
"loss": 0.6497,
"step": 100
},
{
"epoch": 0.6437454279444038,
"grad_norm": 0.25729298157504654,
"learning_rate": 5e-06,
"loss": 0.6506,
"step": 110
},
{
"epoch": 0.7022677395757132,
"grad_norm": 0.2774645357576214,
"learning_rate": 5e-06,
"loss": 0.6479,
"step": 120
},
{
"epoch": 0.7607900512070227,
"grad_norm": 0.2750786001903561,
"learning_rate": 5e-06,
"loss": 0.6509,
"step": 130
},
{
"epoch": 0.8193123628383321,
"grad_norm": 0.3018000353503424,
"learning_rate": 5e-06,
"loss": 0.6532,
"step": 140
},
{
"epoch": 0.8778346744696416,
"grad_norm": 0.2651836266343764,
"learning_rate": 5e-06,
"loss": 0.6407,
"step": 150
},
{
"epoch": 0.936356986100951,
"grad_norm": 0.2621590800809169,
"learning_rate": 5e-06,
"loss": 0.646,
"step": 160
},
{
"epoch": 0.9948792977322605,
"grad_norm": 0.274425449696112,
"learning_rate": 5e-06,
"loss": 0.6429,
"step": 170
},
{
"epoch": 0.9948792977322605,
"eval_loss": 0.6455708742141724,
"eval_runtime": 172.3102,
"eval_samples_per_second": 53.444,
"eval_steps_per_second": 0.418,
"step": 170
},
{
"epoch": 1.0563277249451353,
"grad_norm": 0.32659804954217203,
"learning_rate": 5e-06,
"loss": 0.6596,
"step": 180
},
{
"epoch": 1.1148500365764447,
"grad_norm": 0.29235225467356285,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 190
},
{
"epoch": 1.1733723482077543,
"grad_norm": 0.2686911846272285,
"learning_rate": 5e-06,
"loss": 0.6246,
"step": 200
},
{
"epoch": 1.2318946598390637,
"grad_norm": 0.2689856133611371,
"learning_rate": 5e-06,
"loss": 0.618,
"step": 210
},
{
"epoch": 1.290416971470373,
"grad_norm": 0.26872283242131406,
"learning_rate": 5e-06,
"loss": 0.6202,
"step": 220
},
{
"epoch": 1.3489392831016827,
"grad_norm": 0.301091252809549,
"learning_rate": 5e-06,
"loss": 0.618,
"step": 230
},
{
"epoch": 1.4074615947329918,
"grad_norm": 0.2920775430394786,
"learning_rate": 5e-06,
"loss": 0.6142,
"step": 240
},
{
"epoch": 1.4659839063643014,
"grad_norm": 0.2456820075171799,
"learning_rate": 5e-06,
"loss": 0.6155,
"step": 250
},
{
"epoch": 1.5245062179956108,
"grad_norm": 0.2938378044663654,
"learning_rate": 5e-06,
"loss": 0.6187,
"step": 260
},
{
"epoch": 1.5830285296269202,
"grad_norm": 0.32438651891226156,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 270
},
{
"epoch": 1.6415508412582298,
"grad_norm": 0.25545801371272864,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 280
},
{
"epoch": 1.700073152889539,
"grad_norm": 0.26294073057220163,
"learning_rate": 5e-06,
"loss": 0.6127,
"step": 290
},
{
"epoch": 1.7585954645208486,
"grad_norm": 0.26462245389002803,
"learning_rate": 5e-06,
"loss": 0.6168,
"step": 300
},
{
"epoch": 1.817117776152158,
"grad_norm": 0.2847262707293318,
"learning_rate": 5e-06,
"loss": 0.6172,
"step": 310
},
{
"epoch": 1.8756400877834674,
"grad_norm": 0.2669714428041422,
"learning_rate": 5e-06,
"loss": 0.614,
"step": 320
},
{
"epoch": 1.934162399414777,
"grad_norm": 0.25457144598514,
"learning_rate": 5e-06,
"loss": 0.6166,
"step": 330
},
{
"epoch": 1.9926847110460864,
"grad_norm": 0.2608967910015083,
"learning_rate": 5e-06,
"loss": 0.6126,
"step": 340
},
{
"epoch": 1.9926847110460864,
"eval_loss": 0.6363422274589539,
"eval_runtime": 172.306,
"eval_samples_per_second": 53.446,
"eval_steps_per_second": 0.418,
"step": 340
},
{
"epoch": 2.0541331382589614,
"grad_norm": 0.271883921683299,
"learning_rate": 5e-06,
"loss": 0.6297,
"step": 350
},
{
"epoch": 2.1126554498902705,
"grad_norm": 0.24729272080119263,
"learning_rate": 5e-06,
"loss": 0.5897,
"step": 360
},
{
"epoch": 2.17117776152158,
"grad_norm": 0.27092891797600144,
"learning_rate": 5e-06,
"loss": 0.5946,
"step": 370
},
{
"epoch": 2.2297000731528893,
"grad_norm": 0.3032127102208398,
"learning_rate": 5e-06,
"loss": 0.594,
"step": 380
},
{
"epoch": 2.288222384784199,
"grad_norm": 0.25853126440367846,
"learning_rate": 5e-06,
"loss": 0.588,
"step": 390
},
{
"epoch": 2.3467446964155085,
"grad_norm": 0.3077689025344159,
"learning_rate": 5e-06,
"loss": 0.5943,
"step": 400
},
{
"epoch": 2.4052670080468177,
"grad_norm": 0.2827487146132787,
"learning_rate": 5e-06,
"loss": 0.5933,
"step": 410
},
{
"epoch": 2.4637893196781273,
"grad_norm": 0.2519214403191199,
"learning_rate": 5e-06,
"loss": 0.5898,
"step": 420
},
{
"epoch": 2.522311631309437,
"grad_norm": 0.2751668540595721,
"learning_rate": 5e-06,
"loss": 0.588,
"step": 430
},
{
"epoch": 2.580833942940746,
"grad_norm": 0.2530698336402752,
"learning_rate": 5e-06,
"loss": 0.5883,
"step": 440
},
{
"epoch": 2.6393562545720557,
"grad_norm": 0.25471213766207895,
"learning_rate": 5e-06,
"loss": 0.5951,
"step": 450
},
{
"epoch": 2.6978785662033653,
"grad_norm": 0.29077003251470107,
"learning_rate": 5e-06,
"loss": 0.5914,
"step": 460
},
{
"epoch": 2.7564008778346745,
"grad_norm": 0.30152118910674564,
"learning_rate": 5e-06,
"loss": 0.5917,
"step": 470
},
{
"epoch": 2.8149231894659836,
"grad_norm": 0.26709177034419973,
"learning_rate": 5e-06,
"loss": 0.5923,
"step": 480
},
{
"epoch": 2.8734455010972932,
"grad_norm": 0.23395614388611538,
"learning_rate": 5e-06,
"loss": 0.5888,
"step": 490
},
{
"epoch": 2.931967812728603,
"grad_norm": 0.28669880402317394,
"learning_rate": 5e-06,
"loss": 0.5865,
"step": 500
},
{
"epoch": 2.990490124359912,
"grad_norm": 0.2531977873715163,
"learning_rate": 5e-06,
"loss": 0.5898,
"step": 510
},
{
"epoch": 2.990490124359912,
"eval_loss": 0.63369220495224,
"eval_runtime": 171.9297,
"eval_samples_per_second": 53.563,
"eval_steps_per_second": 0.419,
"step": 510
},
{
"epoch": 2.990490124359912,
"step": 510,
"total_flos": 2138433883471872.0,
"train_loss": 0.6292863135244332,
"train_runtime": 27713.9731,
"train_samples_per_second": 18.939,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 510,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2138433883471872.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}