{
  "best_metric": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 924,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.032467532467532464,
"grad_norm": 6.265174130511246,
"learning_rate": 5e-06,
"loss": 0.9365,
"step": 10
},
{
"epoch": 0.06493506493506493,
"grad_norm": 1.3770366162111063,
"learning_rate": 5e-06,
"loss": 0.8189,
"step": 20
},
{
"epoch": 0.09740259740259741,
"grad_norm": 1.0978187926506935,
"learning_rate": 5e-06,
"loss": 0.7833,
"step": 30
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.981210956561077,
"learning_rate": 5e-06,
"loss": 0.7556,
"step": 40
},
{
"epoch": 0.16233766233766234,
"grad_norm": 0.9693356495883646,
"learning_rate": 5e-06,
"loss": 0.7414,
"step": 50
},
{
"epoch": 0.19480519480519481,
"grad_norm": 1.0030466632996962,
"learning_rate": 5e-06,
"loss": 0.7236,
"step": 60
},
{
"epoch": 0.22727272727272727,
"grad_norm": 1.1260479581636729,
"learning_rate": 5e-06,
"loss": 0.7169,
"step": 70
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.7460895709589158,
"learning_rate": 5e-06,
"loss": 0.7168,
"step": 80
},
{
"epoch": 0.2922077922077922,
"grad_norm": 0.9486986195334304,
"learning_rate": 5e-06,
"loss": 0.7031,
"step": 90
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.7051606863668234,
"learning_rate": 5e-06,
"loss": 0.6974,
"step": 100
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.6756560441314118,
"learning_rate": 5e-06,
"loss": 0.692,
"step": 110
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.5240739369726283,
"learning_rate": 5e-06,
"loss": 0.693,
"step": 120
},
{
"epoch": 0.42207792207792205,
"grad_norm": 0.5785376996044719,
"learning_rate": 5e-06,
"loss": 0.689,
"step": 130
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.5933112722295861,
"learning_rate": 5e-06,
"loss": 0.691,
"step": 140
},
{
"epoch": 0.487012987012987,
"grad_norm": 0.6077588775577111,
"learning_rate": 5e-06,
"loss": 0.6806,
"step": 150
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.6238054182160374,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 160
},
{
"epoch": 0.551948051948052,
"grad_norm": 0.6081002667408969,
"learning_rate": 5e-06,
"loss": 0.6807,
"step": 170
},
{
"epoch": 0.5844155844155844,
"grad_norm": 0.6219764792866612,
"learning_rate": 5e-06,
"loss": 0.6773,
"step": 180
},
{
"epoch": 0.6168831168831169,
"grad_norm": 0.739529394087955,
"learning_rate": 5e-06,
"loss": 0.6795,
"step": 190
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.7524681424985254,
"learning_rate": 5e-06,
"loss": 0.6723,
"step": 200
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.6010281827966147,
"learning_rate": 5e-06,
"loss": 0.6709,
"step": 210
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.6611165599870378,
"learning_rate": 5e-06,
"loss": 0.6692,
"step": 220
},
{
"epoch": 0.7467532467532467,
"grad_norm": 0.8344801352021102,
"learning_rate": 5e-06,
"loss": 0.6738,
"step": 230
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.7851764850319622,
"learning_rate": 5e-06,
"loss": 0.6692,
"step": 240
},
{
"epoch": 0.8116883116883117,
"grad_norm": 0.6068138322416587,
"learning_rate": 5e-06,
"loss": 0.6693,
"step": 250
},
{
"epoch": 0.8441558441558441,
"grad_norm": 0.5781959225993195,
"learning_rate": 5e-06,
"loss": 0.6698,
"step": 260
},
{
"epoch": 0.8766233766233766,
"grad_norm": 0.7049586430934481,
"learning_rate": 5e-06,
"loss": 0.672,
"step": 270
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.6323170370591866,
"learning_rate": 5e-06,
"loss": 0.6668,
"step": 280
},
{
"epoch": 0.9415584415584416,
"grad_norm": 0.881618301887001,
"learning_rate": 5e-06,
"loss": 0.6706,
"step": 290
},
{
"epoch": 0.974025974025974,
"grad_norm": 0.5219254149696031,
"learning_rate": 5e-06,
"loss": 0.6679,
"step": 300
},
{
"epoch": 1.0,
"eval_loss": 0.6691647171974182,
"eval_runtime": 30.5714,
"eval_samples_per_second": 270.972,
"eval_steps_per_second": 1.079,
"step": 308
},
{
"epoch": 1.0064935064935066,
"grad_norm": 0.7028333992981749,
"learning_rate": 5e-06,
"loss": 0.6597,
"step": 310
},
{
"epoch": 1.0389610389610389,
"grad_norm": 0.620216384870711,
"learning_rate": 5e-06,
"loss": 0.6178,
"step": 320
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.6279544966110486,
"learning_rate": 5e-06,
"loss": 0.6196,
"step": 330
},
{
"epoch": 1.103896103896104,
"grad_norm": 0.47691022078448675,
"learning_rate": 5e-06,
"loss": 0.6213,
"step": 340
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.6169659732755709,
"learning_rate": 5e-06,
"loss": 0.6189,
"step": 350
},
{
"epoch": 1.1688311688311688,
"grad_norm": 0.6930896730291389,
"learning_rate": 5e-06,
"loss": 0.6179,
"step": 360
},
{
"epoch": 1.2012987012987013,
"grad_norm": 0.5888468229519391,
"learning_rate": 5e-06,
"loss": 0.6193,
"step": 370
},
{
"epoch": 1.2337662337662338,
"grad_norm": 0.5114807666495347,
"learning_rate": 5e-06,
"loss": 0.6205,
"step": 380
},
{
"epoch": 1.2662337662337662,
"grad_norm": 0.576480885597218,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 390
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.9781557440302872,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 400
},
{
"epoch": 1.3311688311688312,
"grad_norm": 0.5493968761484528,
"learning_rate": 5e-06,
"loss": 0.6181,
"step": 410
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.8450188883114491,
"learning_rate": 5e-06,
"loss": 0.6186,
"step": 420
},
{
"epoch": 1.396103896103896,
"grad_norm": 0.6672141224772778,
"learning_rate": 5e-06,
"loss": 0.6182,
"step": 430
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.5436445484738832,
"learning_rate": 5e-06,
"loss": 0.6147,
"step": 440
},
{
"epoch": 1.4610389610389611,
"grad_norm": 0.5830504026660146,
"learning_rate": 5e-06,
"loss": 0.6179,
"step": 450
},
{
"epoch": 1.4935064935064934,
"grad_norm": 0.6473257236943104,
"learning_rate": 5e-06,
"loss": 0.6199,
"step": 460
},
{
"epoch": 1.525974025974026,
"grad_norm": 0.5427880278607804,
"learning_rate": 5e-06,
"loss": 0.6168,
"step": 470
},
{
"epoch": 1.5584415584415585,
"grad_norm": 0.5689580314401272,
"learning_rate": 5e-06,
"loss": 0.6159,
"step": 480
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.597927845953086,
"learning_rate": 5e-06,
"loss": 0.6175,
"step": 490
},
{
"epoch": 1.6233766233766234,
"grad_norm": 0.6128642707216239,
"learning_rate": 5e-06,
"loss": 0.6163,
"step": 500
},
{
"epoch": 1.655844155844156,
"grad_norm": 0.5455974938431143,
"learning_rate": 5e-06,
"loss": 0.6168,
"step": 510
},
{
"epoch": 1.6883116883116882,
"grad_norm": 0.5153120159264221,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 520
},
{
"epoch": 1.7207792207792207,
"grad_norm": 0.5767601324955324,
"learning_rate": 5e-06,
"loss": 0.619,
"step": 530
},
{
"epoch": 1.7532467532467533,
"grad_norm": 0.5856685996311523,
"learning_rate": 5e-06,
"loss": 0.6195,
"step": 540
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.5318505472371191,
"learning_rate": 5e-06,
"loss": 0.6142,
"step": 550
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.5867934386348821,
"learning_rate": 5e-06,
"loss": 0.6163,
"step": 560
},
{
"epoch": 1.8506493506493507,
"grad_norm": 0.5460100351131841,
"learning_rate": 5e-06,
"loss": 0.6209,
"step": 570
},
{
"epoch": 1.883116883116883,
"grad_norm": 0.5930045996717794,
"learning_rate": 5e-06,
"loss": 0.6173,
"step": 580
},
{
"epoch": 1.9155844155844157,
"grad_norm": 0.5210440244932204,
"learning_rate": 5e-06,
"loss": 0.6172,
"step": 590
},
{
"epoch": 1.948051948051948,
"grad_norm": 0.678263024145128,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 600
},
{
"epoch": 1.9805194805194806,
"grad_norm": 0.5193570456262979,
"learning_rate": 5e-06,
"loss": 0.6119,
"step": 610
},
{
"epoch": 2.0,
"eval_loss": 0.6595985293388367,
"eval_runtime": 30.7002,
"eval_samples_per_second": 269.835,
"eval_steps_per_second": 1.075,
"step": 616
},
{
"epoch": 2.012987012987013,
"grad_norm": 0.997715568070811,
"learning_rate": 5e-06,
"loss": 0.5894,
"step": 620
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.7592737794541236,
"learning_rate": 5e-06,
"loss": 0.5677,
"step": 630
},
{
"epoch": 2.0779220779220777,
"grad_norm": 0.6056584838864539,
"learning_rate": 5e-06,
"loss": 0.5669,
"step": 640
},
{
"epoch": 2.1103896103896105,
"grad_norm": 0.6211039916928865,
"learning_rate": 5e-06,
"loss": 0.5697,
"step": 650
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.615106862254971,
"learning_rate": 5e-06,
"loss": 0.5693,
"step": 660
},
{
"epoch": 2.175324675324675,
"grad_norm": 0.566417720964845,
"learning_rate": 5e-06,
"loss": 0.577,
"step": 670
},
{
"epoch": 2.207792207792208,
"grad_norm": 0.5480748428783726,
"learning_rate": 5e-06,
"loss": 0.5724,
"step": 680
},
{
"epoch": 2.24025974025974,
"grad_norm": 0.6883572551516758,
"learning_rate": 5e-06,
"loss": 0.5663,
"step": 690
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.7624758724871575,
"learning_rate": 5e-06,
"loss": 0.5721,
"step": 700
},
{
"epoch": 2.3051948051948052,
"grad_norm": 0.5925041865618843,
"learning_rate": 5e-06,
"loss": 0.5718,
"step": 710
},
{
"epoch": 2.3376623376623376,
"grad_norm": 0.5423034645452969,
"learning_rate": 5e-06,
"loss": 0.5681,
"step": 720
},
{
"epoch": 2.3701298701298703,
"grad_norm": 0.5480316834860852,
"learning_rate": 5e-06,
"loss": 0.5722,
"step": 730
},
{
"epoch": 2.4025974025974026,
"grad_norm": 0.5169062030347897,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 740
},
{
"epoch": 2.435064935064935,
"grad_norm": 0.5457808079840645,
"learning_rate": 5e-06,
"loss": 0.57,
"step": 750
},
{
"epoch": 2.4675324675324677,
"grad_norm": 0.5470205045138103,
"learning_rate": 5e-06,
"loss": 0.5726,
"step": 760
},
{
"epoch": 2.5,
"grad_norm": 0.5125136364795218,
"learning_rate": 5e-06,
"loss": 0.5693,
"step": 770
},
{
"epoch": 2.5324675324675323,
"grad_norm": 0.5945664415971015,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 780
},
{
"epoch": 2.564935064935065,
"grad_norm": 0.5702694037641614,
"learning_rate": 5e-06,
"loss": 0.5689,
"step": 790
},
{
"epoch": 2.5974025974025974,
"grad_norm": 0.5441374726350022,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 800
},
{
"epoch": 2.62987012987013,
"grad_norm": 0.5674621294447999,
"learning_rate": 5e-06,
"loss": 0.5687,
"step": 810
},
{
"epoch": 2.6623376623376624,
"grad_norm": 0.5997098488587294,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 820
},
{
"epoch": 2.6948051948051948,
"grad_norm": 0.6199757649220302,
"learning_rate": 5e-06,
"loss": 0.5747,
"step": 830
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.6911213249901123,
"learning_rate": 5e-06,
"loss": 0.5711,
"step": 840
},
{
"epoch": 2.75974025974026,
"grad_norm": 0.5709123176208969,
"learning_rate": 5e-06,
"loss": 0.5701,
"step": 850
},
{
"epoch": 2.792207792207792,
"grad_norm": 0.6304517541226137,
"learning_rate": 5e-06,
"loss": 0.5673,
"step": 860
},
{
"epoch": 2.824675324675325,
"grad_norm": 0.6030037959776535,
"learning_rate": 5e-06,
"loss": 0.5713,
"step": 870
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.5603204730571357,
"learning_rate": 5e-06,
"loss": 0.5749,
"step": 880
},
{
"epoch": 2.8896103896103895,
"grad_norm": 0.5148606934943276,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 890
},
{
"epoch": 2.9220779220779223,
"grad_norm": 0.770823574891512,
"learning_rate": 5e-06,
"loss": 0.5694,
"step": 900
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.6707592403791355,
"learning_rate": 5e-06,
"loss": 0.5691,
"step": 910
},
{
"epoch": 2.987012987012987,
"grad_norm": 0.7817460976590817,
"learning_rate": 5e-06,
"loss": 0.568,
"step": 920
},
{
"epoch": 3.0,
"eval_loss": 0.6653555631637573,
"eval_runtime": 29.9562,
"eval_samples_per_second": 276.537,
"eval_steps_per_second": 1.102,
"step": 924
},
{
"epoch": 3.0,
"step": 924,
"total_flos": 1547734414786560.0,
"train_loss": 0.6307248511871735,
"train_runtime": 5890.0632,
"train_samples_per_second": 80.163,
"train_steps_per_second": 0.157
}
],
"logging_steps": 10,
"max_steps": 924,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1547734414786560.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}