{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9982905982905983,
"eval_steps": 500,
"global_step": 2631,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011396011396011397,
"grad_norm": 1.1062365284901086,
"learning_rate": 5e-06,
"loss": 0.7552,
"step": 10
},
{
"epoch": 0.022792022792022793,
"grad_norm": 1.023752202077829,
"learning_rate": 5e-06,
"loss": 0.7051,
"step": 20
},
{
"epoch": 0.03418803418803419,
"grad_norm": 1.0176219936883037,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 30
},
{
"epoch": 0.045584045584045586,
"grad_norm": 0.7858042820189418,
"learning_rate": 5e-06,
"loss": 0.6856,
"step": 40
},
{
"epoch": 0.05698005698005698,
"grad_norm": 0.7933715158385674,
"learning_rate": 5e-06,
"loss": 0.6637,
"step": 50
},
{
"epoch": 0.06837606837606838,
"grad_norm": 0.8447246301495516,
"learning_rate": 5e-06,
"loss": 0.6622,
"step": 60
},
{
"epoch": 0.07977207977207977,
"grad_norm": 0.5473425796189046,
"learning_rate": 5e-06,
"loss": 0.6663,
"step": 70
},
{
"epoch": 0.09116809116809117,
"grad_norm": 0.4937146235526688,
"learning_rate": 5e-06,
"loss": 0.6571,
"step": 80
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.44446776727990156,
"learning_rate": 5e-06,
"loss": 0.6638,
"step": 90
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.41849443527281166,
"learning_rate": 5e-06,
"loss": 0.6579,
"step": 100
},
{
"epoch": 0.12535612535612536,
"grad_norm": 0.45940085033829986,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 110
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.45931809899313636,
"learning_rate": 5e-06,
"loss": 0.6369,
"step": 120
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.40591863341923856,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 130
},
{
"epoch": 0.15954415954415954,
"grad_norm": 0.42649491292164343,
"learning_rate": 5e-06,
"loss": 0.6573,
"step": 140
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.42419739622977437,
"learning_rate": 5e-06,
"loss": 0.6449,
"step": 150
},
{
"epoch": 0.18233618233618235,
"grad_norm": 0.41999442196069786,
"learning_rate": 5e-06,
"loss": 0.6558,
"step": 160
},
{
"epoch": 0.19373219373219372,
"grad_norm": 0.4637200181201795,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 170
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.41547705456707573,
"learning_rate": 5e-06,
"loss": 0.6439,
"step": 180
},
{
"epoch": 0.21652421652421652,
"grad_norm": 0.43068607713697277,
"learning_rate": 5e-06,
"loss": 0.6396,
"step": 190
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.4469322057262852,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 200
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.4680911563203023,
"learning_rate": 5e-06,
"loss": 0.6306,
"step": 210
},
{
"epoch": 0.25071225071225073,
"grad_norm": 0.4095294859092795,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 220
},
{
"epoch": 0.2621082621082621,
"grad_norm": 0.45759740926828324,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 230
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.4140379707131278,
"learning_rate": 5e-06,
"loss": 0.641,
"step": 240
},
{
"epoch": 0.2849002849002849,
"grad_norm": 0.4150377896672994,
"learning_rate": 5e-06,
"loss": 0.6372,
"step": 250
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.402341599576737,
"learning_rate": 5e-06,
"loss": 0.6403,
"step": 260
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.42585340932157245,
"learning_rate": 5e-06,
"loss": 0.6415,
"step": 270
},
{
"epoch": 0.3190883190883191,
"grad_norm": 0.45653778556147656,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 280
},
{
"epoch": 0.33048433048433046,
"grad_norm": 0.4195393934267986,
"learning_rate": 5e-06,
"loss": 0.6336,
"step": 290
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.46802670579447797,
"learning_rate": 5e-06,
"loss": 0.6337,
"step": 300
},
{
"epoch": 0.35327635327635326,
"grad_norm": 0.4203687337846972,
"learning_rate": 5e-06,
"loss": 0.637,
"step": 310
},
{
"epoch": 0.3646723646723647,
"grad_norm": 0.421822849143681,
"learning_rate": 5e-06,
"loss": 0.64,
"step": 320
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.4283596513144174,
"learning_rate": 5e-06,
"loss": 0.6252,
"step": 330
},
{
"epoch": 0.38746438746438744,
"grad_norm": 0.42324206057009117,
"learning_rate": 5e-06,
"loss": 0.636,
"step": 340
},
{
"epoch": 0.39886039886039887,
"grad_norm": 0.4062746526152,
"learning_rate": 5e-06,
"loss": 0.6387,
"step": 350
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.43787430045384385,
"learning_rate": 5e-06,
"loss": 0.6338,
"step": 360
},
{
"epoch": 0.42165242165242167,
"grad_norm": 0.4067600081663935,
"learning_rate": 5e-06,
"loss": 0.6337,
"step": 370
},
{
"epoch": 0.43304843304843305,
"grad_norm": 0.405651618692542,
"learning_rate": 5e-06,
"loss": 0.6235,
"step": 380
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.39893273449497857,
"learning_rate": 5e-06,
"loss": 0.6305,
"step": 390
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.4181843535226299,
"learning_rate": 5e-06,
"loss": 0.6424,
"step": 400
},
{
"epoch": 0.4672364672364672,
"grad_norm": 0.42805555885189545,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 410
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.44895050300003103,
"learning_rate": 5e-06,
"loss": 0.6332,
"step": 420
},
{
"epoch": 0.49002849002849,
"grad_norm": 0.3919946319959885,
"learning_rate": 5e-06,
"loss": 0.6328,
"step": 430
},
{
"epoch": 0.5014245014245015,
"grad_norm": 0.415410598131448,
"learning_rate": 5e-06,
"loss": 0.6402,
"step": 440
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.4393590856709396,
"learning_rate": 5e-06,
"loss": 0.6306,
"step": 450
},
{
"epoch": 0.5242165242165242,
"grad_norm": 0.43301735874135633,
"learning_rate": 5e-06,
"loss": 0.6344,
"step": 460
},
{
"epoch": 0.5356125356125356,
"grad_norm": 0.44449051507968934,
"learning_rate": 5e-06,
"loss": 0.6231,
"step": 470
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.41409814672813067,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 480
},
{
"epoch": 0.5584045584045584,
"grad_norm": 0.404549684025027,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 490
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.4180598818867765,
"learning_rate": 5e-06,
"loss": 0.6309,
"step": 500
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.40656312646305987,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 510
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.40247877044565616,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 520
},
{
"epoch": 0.603988603988604,
"grad_norm": 0.4186724709073127,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 530
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.3956067792496914,
"learning_rate": 5e-06,
"loss": 0.629,
"step": 540
},
{
"epoch": 0.6267806267806267,
"grad_norm": 0.43010520803632213,
"learning_rate": 5e-06,
"loss": 0.6418,
"step": 550
},
{
"epoch": 0.6381766381766382,
"grad_norm": 0.4242582783709579,
"learning_rate": 5e-06,
"loss": 0.6365,
"step": 560
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.4574479642511814,
"learning_rate": 5e-06,
"loss": 0.6298,
"step": 570
},
{
"epoch": 0.6609686609686609,
"grad_norm": 0.3999462091117723,
"learning_rate": 5e-06,
"loss": 0.6264,
"step": 580
},
{
"epoch": 0.6723646723646723,
"grad_norm": 0.43650664891174007,
"learning_rate": 5e-06,
"loss": 0.6338,
"step": 590
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.4209881207979195,
"learning_rate": 5e-06,
"loss": 0.6185,
"step": 600
},
{
"epoch": 0.6951566951566952,
"grad_norm": 0.4356837089917804,
"learning_rate": 5e-06,
"loss": 0.6285,
"step": 610
},
{
"epoch": 0.7065527065527065,
"grad_norm": 0.4267755900128707,
"learning_rate": 5e-06,
"loss": 0.6249,
"step": 620
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.4252749404036598,
"learning_rate": 5e-06,
"loss": 0.6297,
"step": 630
},
{
"epoch": 0.7293447293447294,
"grad_norm": 0.43616986641525424,
"learning_rate": 5e-06,
"loss": 0.624,
"step": 640
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.4164486549654651,
"learning_rate": 5e-06,
"loss": 0.629,
"step": 650
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.476343190261518,
"learning_rate": 5e-06,
"loss": 0.6177,
"step": 660
},
{
"epoch": 0.7635327635327636,
"grad_norm": 0.40486827396324065,
"learning_rate": 5e-06,
"loss": 0.6261,
"step": 670
},
{
"epoch": 0.7749287749287749,
"grad_norm": 0.4212351136466915,
"learning_rate": 5e-06,
"loss": 0.6304,
"step": 680
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.41575901401793347,
"learning_rate": 5e-06,
"loss": 0.6398,
"step": 690
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.4285454155969582,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 700
},
{
"epoch": 0.8091168091168092,
"grad_norm": 0.40726171067131095,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 710
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.41168149111216795,
"learning_rate": 5e-06,
"loss": 0.6243,
"step": 720
},
{
"epoch": 0.8319088319088319,
"grad_norm": 0.435567753751087,
"learning_rate": 5e-06,
"loss": 0.6226,
"step": 730
},
{
"epoch": 0.8433048433048433,
"grad_norm": 0.43940850789677355,
"learning_rate": 5e-06,
"loss": 0.6208,
"step": 740
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.4188384621992378,
"learning_rate": 5e-06,
"loss": 0.6338,
"step": 750
},
{
"epoch": 0.8660968660968661,
"grad_norm": 0.3960108041735021,
"learning_rate": 5e-06,
"loss": 0.6337,
"step": 760
},
{
"epoch": 0.8774928774928775,
"grad_norm": 0.40675640823017994,
"learning_rate": 5e-06,
"loss": 0.6296,
"step": 770
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.43353876595216656,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 780
},
{
"epoch": 0.9002849002849003,
"grad_norm": 0.43992543662793077,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 790
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.41627535741522503,
"learning_rate": 5e-06,
"loss": 0.6384,
"step": 800
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.4274496512159185,
"learning_rate": 5e-06,
"loss": 0.6309,
"step": 810
},
{
"epoch": 0.9344729344729344,
"grad_norm": 0.5000942948514508,
"learning_rate": 5e-06,
"loss": 0.6323,
"step": 820
},
{
"epoch": 0.9458689458689459,
"grad_norm": 0.39649163621370453,
"learning_rate": 5e-06,
"loss": 0.6117,
"step": 830
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.45128894713654466,
"learning_rate": 5e-06,
"loss": 0.6258,
"step": 840
},
{
"epoch": 0.9686609686609686,
"grad_norm": 0.4053334632337957,
"learning_rate": 5e-06,
"loss": 0.6334,
"step": 850
},
{
"epoch": 0.98005698005698,
"grad_norm": 0.4570308695791834,
"learning_rate": 5e-06,
"loss": 0.6299,
"step": 860
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.4142729888175128,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 870
},
{
"epoch": 0.9994301994301994,
"eval_loss": 0.622437059879303,
"eval_runtime": 442.3461,
"eval_samples_per_second": 26.728,
"eval_steps_per_second": 0.418,
"step": 877
},
{
"epoch": 1.002849002849003,
"grad_norm": 0.467206811021719,
"learning_rate": 5e-06,
"loss": 0.6384,
"step": 880
},
{
"epoch": 1.0142450142450143,
"grad_norm": 0.4575873633037112,
"learning_rate": 5e-06,
"loss": 0.5855,
"step": 890
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.4094192073196508,
"learning_rate": 5e-06,
"loss": 0.5924,
"step": 900
},
{
"epoch": 1.037037037037037,
"grad_norm": 0.41727147235729756,
"learning_rate": 5e-06,
"loss": 0.5882,
"step": 910
},
{
"epoch": 1.0484330484330484,
"grad_norm": 0.40097390374474684,
"learning_rate": 5e-06,
"loss": 0.5834,
"step": 920
},
{
"epoch": 1.0598290598290598,
"grad_norm": 0.3988722663272877,
"learning_rate": 5e-06,
"loss": 0.5875,
"step": 930
},
{
"epoch": 1.0712250712250713,
"grad_norm": 0.409835543782938,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 940
},
{
"epoch": 1.0826210826210827,
"grad_norm": 0.4348656181993297,
"learning_rate": 5e-06,
"loss": 0.5945,
"step": 950
},
{
"epoch": 1.0940170940170941,
"grad_norm": 0.4560769367527893,
"learning_rate": 5e-06,
"loss": 0.591,
"step": 960
},
{
"epoch": 1.1054131054131053,
"grad_norm": 0.3987301391233058,
"learning_rate": 5e-06,
"loss": 0.5947,
"step": 970
},
{
"epoch": 1.1168091168091168,
"grad_norm": 0.4310263093448157,
"learning_rate": 5e-06,
"loss": 0.5989,
"step": 980
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.3988555704488419,
"learning_rate": 5e-06,
"loss": 0.5883,
"step": 990
},
{
"epoch": 1.1396011396011396,
"grad_norm": 0.41694498325264395,
"learning_rate": 5e-06,
"loss": 0.5857,
"step": 1000
},
{
"epoch": 1.150997150997151,
"grad_norm": 0.4261280155159663,
"learning_rate": 5e-06,
"loss": 0.5846,
"step": 1010
},
{
"epoch": 1.1623931623931625,
"grad_norm": 0.4090258551630524,
"learning_rate": 5e-06,
"loss": 0.5862,
"step": 1020
},
{
"epoch": 1.173789173789174,
"grad_norm": 0.39703392125897946,
"learning_rate": 5e-06,
"loss": 0.5828,
"step": 1030
},
{
"epoch": 1.1851851851851851,
"grad_norm": 0.4171232168803472,
"learning_rate": 5e-06,
"loss": 0.5969,
"step": 1040
},
{
"epoch": 1.1965811965811965,
"grad_norm": 0.3986677142839061,
"learning_rate": 5e-06,
"loss": 0.5849,
"step": 1050
},
{
"epoch": 1.207977207977208,
"grad_norm": 0.4210046425391405,
"learning_rate": 5e-06,
"loss": 0.5866,
"step": 1060
},
{
"epoch": 1.2193732193732194,
"grad_norm": 0.4497366233089093,
"learning_rate": 5e-06,
"loss": 0.5963,
"step": 1070
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.43086405644231185,
"learning_rate": 5e-06,
"loss": 0.5872,
"step": 1080
},
{
"epoch": 1.242165242165242,
"grad_norm": 0.4519037391850927,
"learning_rate": 5e-06,
"loss": 0.5952,
"step": 1090
},
{
"epoch": 1.2535612535612537,
"grad_norm": 0.41349582244683747,
"learning_rate": 5e-06,
"loss": 0.5903,
"step": 1100
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.3837938001947666,
"learning_rate": 5e-06,
"loss": 0.5989,
"step": 1110
},
{
"epoch": 1.2763532763532763,
"grad_norm": 0.38645298038964926,
"learning_rate": 5e-06,
"loss": 0.583,
"step": 1120
},
{
"epoch": 1.2877492877492878,
"grad_norm": 0.39026828874261793,
"learning_rate": 5e-06,
"loss": 0.5938,
"step": 1130
},
{
"epoch": 1.2991452991452992,
"grad_norm": 0.48601873116831096,
"learning_rate": 5e-06,
"loss": 0.5805,
"step": 1140
},
{
"epoch": 1.3105413105413106,
"grad_norm": 0.4496341989317277,
"learning_rate": 5e-06,
"loss": 0.5862,
"step": 1150
},
{
"epoch": 1.3219373219373218,
"grad_norm": 0.43314588815183497,
"learning_rate": 5e-06,
"loss": 0.5883,
"step": 1160
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.4373497446033339,
"learning_rate": 5e-06,
"loss": 0.5833,
"step": 1170
},
{
"epoch": 1.3447293447293447,
"grad_norm": 0.4061985333964508,
"learning_rate": 5e-06,
"loss": 0.5824,
"step": 1180
},
{
"epoch": 1.3561253561253561,
"grad_norm": 0.4144234110159319,
"learning_rate": 5e-06,
"loss": 0.6004,
"step": 1190
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.4373239103878606,
"learning_rate": 5e-06,
"loss": 0.5818,
"step": 1200
},
{
"epoch": 1.378917378917379,
"grad_norm": 0.4210723366091624,
"learning_rate": 5e-06,
"loss": 0.5859,
"step": 1210
},
{
"epoch": 1.3903133903133904,
"grad_norm": 0.4052006957338942,
"learning_rate": 5e-06,
"loss": 0.5906,
"step": 1220
},
{
"epoch": 1.4017094017094016,
"grad_norm": 0.4292623892695985,
"learning_rate": 5e-06,
"loss": 0.5927,
"step": 1230
},
{
"epoch": 1.413105413105413,
"grad_norm": 0.4232783608596394,
"learning_rate": 5e-06,
"loss": 0.5956,
"step": 1240
},
{
"epoch": 1.4245014245014245,
"grad_norm": 0.43895695326546535,
"learning_rate": 5e-06,
"loss": 0.6033,
"step": 1250
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.4349281709940867,
"learning_rate": 5e-06,
"loss": 0.5825,
"step": 1260
},
{
"epoch": 1.4472934472934473,
"grad_norm": 0.4124297881341476,
"learning_rate": 5e-06,
"loss": 0.5842,
"step": 1270
},
{
"epoch": 1.4586894586894588,
"grad_norm": 0.4103899829789082,
"learning_rate": 5e-06,
"loss": 0.5851,
"step": 1280
},
{
"epoch": 1.4700854700854702,
"grad_norm": 0.4187405725906187,
"learning_rate": 5e-06,
"loss": 0.5815,
"step": 1290
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.4335514785672904,
"learning_rate": 5e-06,
"loss": 0.5893,
"step": 1300
},
{
"epoch": 1.4928774928774928,
"grad_norm": 0.4095416189258966,
"learning_rate": 5e-06,
"loss": 0.5812,
"step": 1310
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.4327159045023668,
"learning_rate": 5e-06,
"loss": 0.5872,
"step": 1320
},
{
"epoch": 1.5156695156695157,
"grad_norm": 0.42071355477765043,
"learning_rate": 5e-06,
"loss": 0.5894,
"step": 1330
},
{
"epoch": 1.5270655270655271,
"grad_norm": 0.41206489314047035,
"learning_rate": 5e-06,
"loss": 0.5948,
"step": 1340
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.4743925797235051,
"learning_rate": 5e-06,
"loss": 0.5755,
"step": 1350
},
{
"epoch": 1.54985754985755,
"grad_norm": 0.40595707601991954,
"learning_rate": 5e-06,
"loss": 0.5892,
"step": 1360
},
{
"epoch": 1.5612535612535612,
"grad_norm": 0.3947481991815675,
"learning_rate": 5e-06,
"loss": 0.5784,
"step": 1370
},
{
"epoch": 1.5726495726495726,
"grad_norm": 0.38785299206305784,
"learning_rate": 5e-06,
"loss": 0.5864,
"step": 1380
},
{
"epoch": 1.584045584045584,
"grad_norm": 0.4149251097325744,
"learning_rate": 5e-06,
"loss": 0.6075,
"step": 1390
},
{
"epoch": 1.5954415954415955,
"grad_norm": 0.43637706913229096,
"learning_rate": 5e-06,
"loss": 0.5944,
"step": 1400
},
{
"epoch": 1.606837606837607,
"grad_norm": 0.4169030325172147,
"learning_rate": 5e-06,
"loss": 0.5915,
"step": 1410
},
{
"epoch": 1.618233618233618,
"grad_norm": 0.40313516644365976,
"learning_rate": 5e-06,
"loss": 0.5817,
"step": 1420
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.39503546583616356,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 1430
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.3985527253164314,
"learning_rate": 5e-06,
"loss": 0.5826,
"step": 1440
},
{
"epoch": 1.6524216524216524,
"grad_norm": 0.4244488951950044,
"learning_rate": 5e-06,
"loss": 0.5889,
"step": 1450
},
{
"epoch": 1.6638176638176638,
"grad_norm": 0.39693307887587553,
"learning_rate": 5e-06,
"loss": 0.5859,
"step": 1460
},
{
"epoch": 1.6752136752136753,
"grad_norm": 0.3849679459506633,
"learning_rate": 5e-06,
"loss": 0.5806,
"step": 1470
},
{
"epoch": 1.6866096866096867,
"grad_norm": 0.4201985578364686,
"learning_rate": 5e-06,
"loss": 0.5866,
"step": 1480
},
{
"epoch": 1.698005698005698,
"grad_norm": 0.42432125023319545,
"learning_rate": 5e-06,
"loss": 0.5803,
"step": 1490
},
{
"epoch": 1.7094017094017095,
"grad_norm": 0.40730542273295467,
"learning_rate": 5e-06,
"loss": 0.5913,
"step": 1500
},
{
"epoch": 1.7207977207977208,
"grad_norm": 0.42837414750466624,
"learning_rate": 5e-06,
"loss": 0.5795,
"step": 1510
},
{
"epoch": 1.7321937321937322,
"grad_norm": 0.44083872834956234,
"learning_rate": 5e-06,
"loss": 0.5888,
"step": 1520
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.41133942102181764,
"learning_rate": 5e-06,
"loss": 0.5865,
"step": 1530
},
{
"epoch": 1.7549857549857548,
"grad_norm": 0.40260223356507924,
"learning_rate": 5e-06,
"loss": 0.5816,
"step": 1540
},
{
"epoch": 1.7663817663817665,
"grad_norm": 0.4054088563875919,
"learning_rate": 5e-06,
"loss": 0.5952,
"step": 1550
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.4218451424068199,
"learning_rate": 5e-06,
"loss": 0.5879,
"step": 1560
},
{
"epoch": 1.7891737891737893,
"grad_norm": 0.4423529568236007,
"learning_rate": 5e-06,
"loss": 0.5907,
"step": 1570
},
{
"epoch": 1.8005698005698005,
"grad_norm": 0.41215301182035746,
"learning_rate": 5e-06,
"loss": 0.5841,
"step": 1580
},
{
"epoch": 1.811965811965812,
"grad_norm": 0.4555696841177031,
"learning_rate": 5e-06,
"loss": 0.5849,
"step": 1590
},
{
"epoch": 1.8233618233618234,
"grad_norm": 0.41997083905529,
"learning_rate": 5e-06,
"loss": 0.5712,
"step": 1600
},
{
"epoch": 1.8347578347578346,
"grad_norm": 0.40350765403827904,
"learning_rate": 5e-06,
"loss": 0.5773,
"step": 1610
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.41505233462990104,
"learning_rate": 5e-06,
"loss": 0.5828,
"step": 1620
},
{
"epoch": 1.8575498575498575,
"grad_norm": 0.4094044224106121,
"learning_rate": 5e-06,
"loss": 0.577,
"step": 1630
},
{
"epoch": 1.868945868945869,
"grad_norm": 0.3989458077194491,
"learning_rate": 5e-06,
"loss": 0.5852,
"step": 1640
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.3968449176678109,
"learning_rate": 5e-06,
"loss": 0.5765,
"step": 1650
},
{
"epoch": 1.8917378917378918,
"grad_norm": 0.3975827713442406,
"learning_rate": 5e-06,
"loss": 0.5941,
"step": 1660
},
{
"epoch": 1.9031339031339032,
"grad_norm": 0.4591167052806216,
"learning_rate": 5e-06,
"loss": 0.5958,
"step": 1670
},
{
"epoch": 1.9145299145299144,
"grad_norm": 0.4763985809192953,
"learning_rate": 5e-06,
"loss": 0.5822,
"step": 1680
},
{
"epoch": 1.925925925925926,
"grad_norm": 0.40816873290685,
"learning_rate": 5e-06,
"loss": 0.591,
"step": 1690
},
{
"epoch": 1.9373219373219372,
"grad_norm": 0.43451011164507114,
"learning_rate": 5e-06,
"loss": 0.5866,
"step": 1700
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.42502005410583105,
"learning_rate": 5e-06,
"loss": 0.5812,
"step": 1710
},
{
"epoch": 1.96011396011396,
"grad_norm": 0.3868140358085357,
"learning_rate": 5e-06,
"loss": 0.5952,
"step": 1720
},
{
"epoch": 1.9715099715099715,
"grad_norm": 0.4233434645527226,
"learning_rate": 5e-06,
"loss": 0.5905,
"step": 1730
},
{
"epoch": 1.982905982905983,
"grad_norm": 0.46128367957303146,
"learning_rate": 5e-06,
"loss": 0.5835,
"step": 1740
},
{
"epoch": 1.9943019943019942,
"grad_norm": 0.41962900843595113,
"learning_rate": 5e-06,
"loss": 0.5823,
"step": 1750
},
{
"epoch": 2.0,
"eval_loss": 0.6174917817115784,
"eval_runtime": 442.5819,
"eval_samples_per_second": 26.714,
"eval_steps_per_second": 0.418,
"step": 1755
},
{
"epoch": 2.005698005698006,
"grad_norm": 0.43638289381677664,
"learning_rate": 5e-06,
"loss": 0.6003,
"step": 1760
},
{
"epoch": 2.017094017094017,
"grad_norm": 0.4032954694771035,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 1770
},
{
"epoch": 2.0284900284900287,
"grad_norm": 0.3978342138531873,
"learning_rate": 5e-06,
"loss": 0.5396,
"step": 1780
},
{
"epoch": 2.03988603988604,
"grad_norm": 0.3941941742542143,
"learning_rate": 5e-06,
"loss": 0.5498,
"step": 1790
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.40614413388153375,
"learning_rate": 5e-06,
"loss": 0.5485,
"step": 1800
},
{
"epoch": 2.0626780626780628,
"grad_norm": 0.4062005374187212,
"learning_rate": 5e-06,
"loss": 0.5443,
"step": 1810
},
{
"epoch": 2.074074074074074,
"grad_norm": 0.38632662394247547,
"learning_rate": 5e-06,
"loss": 0.5499,
"step": 1820
},
{
"epoch": 2.0854700854700856,
"grad_norm": 0.3877796238652637,
"learning_rate": 5e-06,
"loss": 0.5472,
"step": 1830
},
{
"epoch": 2.096866096866097,
"grad_norm": 0.4031396151639763,
"learning_rate": 5e-06,
"loss": 0.5557,
"step": 1840
},
{
"epoch": 2.1082621082621085,
"grad_norm": 0.3758020305089208,
"learning_rate": 5e-06,
"loss": 0.5423,
"step": 1850
},
{
"epoch": 2.1196581196581197,
"grad_norm": 0.46333515136342907,
"learning_rate": 5e-06,
"loss": 0.5556,
"step": 1860
},
{
"epoch": 2.131054131054131,
"grad_norm": 0.3990721210469113,
"learning_rate": 5e-06,
"loss": 0.5464,
"step": 1870
},
{
"epoch": 2.1424501424501425,
"grad_norm": 0.41896529664740606,
"learning_rate": 5e-06,
"loss": 0.5459,
"step": 1880
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.40224292638674486,
"learning_rate": 5e-06,
"loss": 0.5432,
"step": 1890
},
{
"epoch": 2.1652421652421654,
"grad_norm": 0.3703829495333715,
"learning_rate": 5e-06,
"loss": 0.5434,
"step": 1900
},
{
"epoch": 2.1766381766381766,
"grad_norm": 0.4195807512147461,
"learning_rate": 5e-06,
"loss": 0.548,
"step": 1910
},
{
"epoch": 2.1880341880341883,
"grad_norm": 0.42078014349068604,
"learning_rate": 5e-06,
"loss": 0.551,
"step": 1920
},
{
"epoch": 2.1994301994301995,
"grad_norm": 0.39550870444336733,
"learning_rate": 5e-06,
"loss": 0.5487,
"step": 1930
},
{
"epoch": 2.2108262108262107,
"grad_norm": 0.403560752581769,
"learning_rate": 5e-06,
"loss": 0.5518,
"step": 1940
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.4148295164570796,
"learning_rate": 5e-06,
"loss": 0.5455,
"step": 1950
},
{
"epoch": 2.2336182336182335,
"grad_norm": 0.37681071283125916,
"learning_rate": 5e-06,
"loss": 0.5371,
"step": 1960
},
{
"epoch": 2.245014245014245,
"grad_norm": 0.4085602540294654,
"learning_rate": 5e-06,
"loss": 0.548,
"step": 1970
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.42666262080387535,
"learning_rate": 5e-06,
"loss": 0.5489,
"step": 1980
},
{
"epoch": 2.267806267806268,
"grad_norm": 0.413370914720578,
"learning_rate": 5e-06,
"loss": 0.5452,
"step": 1990
},
{
"epoch": 2.2792022792022792,
"grad_norm": 0.3924258676572947,
"learning_rate": 5e-06,
"loss": 0.5506,
"step": 2000
},
{
"epoch": 2.2905982905982905,
"grad_norm": 0.4347195110430224,
"learning_rate": 5e-06,
"loss": 0.5495,
"step": 2010
},
{
"epoch": 2.301994301994302,
"grad_norm": 0.40213883875930767,
"learning_rate": 5e-06,
"loss": 0.5447,
"step": 2020
},
{
"epoch": 2.3133903133903133,
"grad_norm": 0.42546941310471453,
"learning_rate": 5e-06,
"loss": 0.5533,
"step": 2030
},
{
"epoch": 2.324786324786325,
"grad_norm": 0.40042388002057316,
"learning_rate": 5e-06,
"loss": 0.5493,
"step": 2040
},
{
"epoch": 2.336182336182336,
"grad_norm": 0.40985989196559397,
"learning_rate": 5e-06,
"loss": 0.5484,
"step": 2050
},
{
"epoch": 2.347578347578348,
"grad_norm": 0.4262197347046128,
"learning_rate": 5e-06,
"loss": 0.5561,
"step": 2060
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.4079997903297647,
"learning_rate": 5e-06,
"loss": 0.5479,
"step": 2070
},
{
"epoch": 2.3703703703703702,
"grad_norm": 0.4171995256710412,
"learning_rate": 5e-06,
"loss": 0.5482,
"step": 2080
},
{
"epoch": 2.381766381766382,
"grad_norm": 0.4067288627883757,
"learning_rate": 5e-06,
"loss": 0.5495,
"step": 2090
},
{
"epoch": 2.393162393162393,
"grad_norm": 0.39812759486187826,
"learning_rate": 5e-06,
"loss": 0.5475,
"step": 2100
},
{
"epoch": 2.4045584045584047,
"grad_norm": 0.4252046487226247,
"learning_rate": 5e-06,
"loss": 0.564,
"step": 2110
},
{
"epoch": 2.415954415954416,
"grad_norm": 0.385246050290494,
"learning_rate": 5e-06,
"loss": 0.5495,
"step": 2120
},
{
"epoch": 2.427350427350427,
"grad_norm": 0.4086146276427414,
"learning_rate": 5e-06,
"loss": 0.56,
"step": 2130
},
{
"epoch": 2.438746438746439,
"grad_norm": 0.40396684063143223,
"learning_rate": 5e-06,
"loss": 0.5592,
"step": 2140
},
{
"epoch": 2.45014245014245,
"grad_norm": 0.40575491064321195,
"learning_rate": 5e-06,
"loss": 0.5633,
"step": 2150
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.4073296395669543,
"learning_rate": 5e-06,
"loss": 0.5488,
"step": 2160
},
{
"epoch": 2.472934472934473,
"grad_norm": 0.43882905338245753,
"learning_rate": 5e-06,
"loss": 0.5513,
"step": 2170
},
{
"epoch": 2.484330484330484,
"grad_norm": 0.4031322481681622,
"learning_rate": 5e-06,
"loss": 0.5554,
"step": 2180
},
{
"epoch": 2.4957264957264957,
"grad_norm": 0.42227630442588826,
"learning_rate": 5e-06,
"loss": 0.5574,
"step": 2190
},
{
"epoch": 2.5071225071225074,
"grad_norm": 0.4277624308363176,
"learning_rate": 5e-06,
"loss": 0.5629,
"step": 2200
},
{
"epoch": 2.5185185185185186,
"grad_norm": 0.40043500855114567,
"learning_rate": 5e-06,
"loss": 0.5444,
"step": 2210
},
{
"epoch": 2.52991452991453,
"grad_norm": 0.427445344908136,
"learning_rate": 5e-06,
"loss": 0.5591,
"step": 2220
},
{
"epoch": 2.5413105413105415,
"grad_norm": 0.4197028690010052,
"learning_rate": 5e-06,
"loss": 0.5513,
"step": 2230
},
{
"epoch": 2.5527065527065527,
"grad_norm": 0.41806376493939207,
"learning_rate": 5e-06,
"loss": 0.5501,
"step": 2240
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.4080384204790527,
"learning_rate": 5e-06,
"loss": 0.5488,
"step": 2250
},
{
"epoch": 2.5754985754985755,
"grad_norm": 0.4339972064470789,
"learning_rate": 5e-06,
"loss": 0.5534,
"step": 2260
},
{
"epoch": 2.5868945868945867,
"grad_norm": 0.4139873128656014,
"learning_rate": 5e-06,
"loss": 0.5509,
"step": 2270
},
{
"epoch": 2.5982905982905984,
"grad_norm": 0.39593523779791756,
"learning_rate": 5e-06,
"loss": 0.5515,
"step": 2280
},
{
"epoch": 2.6096866096866096,
"grad_norm": 0.3887745966959367,
"learning_rate": 5e-06,
"loss": 0.5567,
"step": 2290
},
{
"epoch": 2.6210826210826212,
"grad_norm": 0.3899940114191536,
"learning_rate": 5e-06,
"loss": 0.5429,
"step": 2300
},
{
"epoch": 2.6324786324786325,
"grad_norm": 0.4176311832860518,
"learning_rate": 5e-06,
"loss": 0.553,
"step": 2310
},
{
"epoch": 2.6438746438746437,
"grad_norm": 0.46727727994302587,
"learning_rate": 5e-06,
"loss": 0.5524,
"step": 2320
},
{
"epoch": 2.6552706552706553,
"grad_norm": 0.4368321834367039,
"learning_rate": 5e-06,
"loss": 0.5552,
"step": 2330
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.4479324367839254,
"learning_rate": 5e-06,
"loss": 0.5534,
"step": 2340
},
{
"epoch": 2.678062678062678,
"grad_norm": 0.41411545835899133,
"learning_rate": 5e-06,
"loss": 0.5467,
"step": 2350
},
{
"epoch": 2.6894586894586894,
"grad_norm": 0.4201299885965421,
"learning_rate": 5e-06,
"loss": 0.5565,
"step": 2360
},
{
"epoch": 2.700854700854701,
"grad_norm": 0.40978702073303064,
"learning_rate": 5e-06,
"loss": 0.5444,
"step": 2370
},
{
"epoch": 2.7122507122507122,
"grad_norm": 0.4233459449335634,
"learning_rate": 5e-06,
"loss": 0.5563,
"step": 2380
},
{
"epoch": 2.7236467236467234,
"grad_norm": 0.4159458912952842,
"learning_rate": 5e-06,
"loss": 0.5551,
"step": 2390
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.41425606346483057,
"learning_rate": 5e-06,
"loss": 0.5539,
"step": 2400
},
{
"epoch": 2.7464387464387463,
"grad_norm": 0.4166133827092343,
"learning_rate": 5e-06,
"loss": 0.5588,
"step": 2410
},
{
"epoch": 2.757834757834758,
"grad_norm": 0.4263688845736852,
"learning_rate": 5e-06,
"loss": 0.5575,
"step": 2420
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.41269883049053624,
"learning_rate": 5e-06,
"loss": 0.5471,
"step": 2430
},
{
"epoch": 2.780626780626781,
"grad_norm": 0.3894335667283599,
"learning_rate": 5e-06,
"loss": 0.5468,
"step": 2440
},
{
"epoch": 2.792022792022792,
"grad_norm": 0.40933546113606567,
"learning_rate": 5e-06,
"loss": 0.5501,
"step": 2450
},
{
"epoch": 2.8034188034188032,
"grad_norm": 0.39714648665213204,
"learning_rate": 5e-06,
"loss": 0.5444,
"step": 2460
},
{
"epoch": 2.814814814814815,
"grad_norm": 0.40517136322070096,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 2470
},
{
"epoch": 2.826210826210826,
"grad_norm": 0.44447910033491683,
"learning_rate": 5e-06,
"loss": 0.5623,
"step": 2480
},
{
"epoch": 2.8376068376068377,
"grad_norm": 0.388103652560322,
"learning_rate": 5e-06,
"loss": 0.5543,
"step": 2490
},
{
"epoch": 2.849002849002849,
"grad_norm": 0.40171877838716236,
"learning_rate": 5e-06,
"loss": 0.562,
"step": 2500
},
{
"epoch": 2.8603988603988606,
"grad_norm": 0.41856657884436094,
"learning_rate": 5e-06,
"loss": 0.5536,
"step": 2510
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.4173395435456696,
"learning_rate": 5e-06,
"loss": 0.5539,
"step": 2520
},
{
"epoch": 2.883190883190883,
"grad_norm": 0.39093712576995243,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 2530
},
{
"epoch": 2.8945868945868947,
"grad_norm": 0.4255070470787294,
"learning_rate": 5e-06,
"loss": 0.5557,
"step": 2540
},
{
"epoch": 2.905982905982906,
"grad_norm": 0.45247644117965885,
"learning_rate": 5e-06,
"loss": 0.5528,
"step": 2550
},
{
"epoch": 2.9173789173789175,
"grad_norm": 0.41729192613775734,
"learning_rate": 5e-06,
"loss": 0.5416,
"step": 2560
},
{
"epoch": 2.9287749287749287,
"grad_norm": 0.3959874387272076,
"learning_rate": 5e-06,
"loss": 0.5471,
"step": 2570
},
{
"epoch": 2.9401709401709404,
"grad_norm": 0.40279780924522723,
"learning_rate": 5e-06,
"loss": 0.5438,
"step": 2580
},
{
"epoch": 2.9515669515669516,
"grad_norm": 0.41492112649690777,
"learning_rate": 5e-06,
"loss": 0.5533,
"step": 2590
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.4072236941032463,
"learning_rate": 5e-06,
"loss": 0.5446,
"step": 2600
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.3967690970697916,
"learning_rate": 5e-06,
"loss": 0.556,
"step": 2610
},
{
"epoch": 2.9857549857549857,
"grad_norm": 0.4004788690287786,
"learning_rate": 5e-06,
"loss": 0.5571,
"step": 2620
},
{
"epoch": 2.9971509971509973,
"grad_norm": 0.39905352277311656,
"learning_rate": 5e-06,
"loss": 0.5443,
"step": 2630
},
{
"epoch": 2.9982905982905983,
"eval_loss": 0.6213015913963318,
"eval_runtime": 442.3632,
"eval_samples_per_second": 26.727,
"eval_steps_per_second": 0.418,
"step": 2631
},
{
"epoch": 2.9982905982905983,
"step": 2631,
"total_flos": 2758364765356032.0,
"train_loss": 0.5923774672614808,
"train_runtime": 70850.8498,
"train_samples_per_second": 9.511,
"train_steps_per_second": 0.037
}
],
"logging_steps": 10,
"max_steps": 2631,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2758364765356032.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}