Action_model / trainer_state.json
Raihan004's picture
🍻 cheers
5b5e6cd verified
{
"best_metric": 0.4589254856109619,
"best_model_checkpoint": "Action_model/checkpoint-1500",
"epoch": 10.0,
"eval_steps": 100,
"global_step": 2680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 1.7369908094406128,
"learning_rate": 9.96268656716418e-05,
"loss": 2.2759,
"step": 10
},
{
"epoch": 0.07,
"grad_norm": 1.753720998764038,
"learning_rate": 9.925373134328359e-05,
"loss": 2.1743,
"step": 20
},
{
"epoch": 0.11,
"grad_norm": 1.8532754182815552,
"learning_rate": 9.888059701492539e-05,
"loss": 2.0233,
"step": 30
},
{
"epoch": 0.15,
"grad_norm": 2.195688486099243,
"learning_rate": 9.850746268656717e-05,
"loss": 1.8293,
"step": 40
},
{
"epoch": 0.19,
"grad_norm": 2.392077684402466,
"learning_rate": 9.813432835820896e-05,
"loss": 1.7307,
"step": 50
},
{
"epoch": 0.22,
"grad_norm": 2.851775646209717,
"learning_rate": 9.776119402985075e-05,
"loss": 1.5716,
"step": 60
},
{
"epoch": 0.26,
"grad_norm": 2.2557411193847656,
"learning_rate": 9.738805970149254e-05,
"loss": 1.4694,
"step": 70
},
{
"epoch": 0.3,
"grad_norm": 2.4612302780151367,
"learning_rate": 9.701492537313434e-05,
"loss": 1.3609,
"step": 80
},
{
"epoch": 0.34,
"grad_norm": 2.7514560222625732,
"learning_rate": 9.664179104477612e-05,
"loss": 1.2871,
"step": 90
},
{
"epoch": 0.37,
"grad_norm": 3.6256659030914307,
"learning_rate": 9.626865671641792e-05,
"loss": 1.2754,
"step": 100
},
{
"epoch": 0.37,
"eval_accuracy": 0.7328646748681898,
"eval_loss": 1.1163370609283447,
"eval_runtime": 12.5514,
"eval_samples_per_second": 45.333,
"eval_steps_per_second": 5.736,
"step": 100
},
{
"epoch": 0.41,
"grad_norm": 2.642601728439331,
"learning_rate": 9.58955223880597e-05,
"loss": 1.2354,
"step": 110
},
{
"epoch": 0.45,
"grad_norm": 2.4862725734710693,
"learning_rate": 9.552238805970149e-05,
"loss": 1.169,
"step": 120
},
{
"epoch": 0.49,
"grad_norm": 3.962764263153076,
"learning_rate": 9.514925373134329e-05,
"loss": 1.2546,
"step": 130
},
{
"epoch": 0.52,
"grad_norm": 2.9388816356658936,
"learning_rate": 9.477611940298507e-05,
"loss": 1.1702,
"step": 140
},
{
"epoch": 0.56,
"grad_norm": 4.958592414855957,
"learning_rate": 9.440298507462687e-05,
"loss": 1.0865,
"step": 150
},
{
"epoch": 0.6,
"grad_norm": 3.4470815658569336,
"learning_rate": 9.402985074626867e-05,
"loss": 1.0097,
"step": 160
},
{
"epoch": 0.63,
"grad_norm": 4.423004627227783,
"learning_rate": 9.365671641791045e-05,
"loss": 1.0749,
"step": 170
},
{
"epoch": 0.67,
"grad_norm": 2.808164358139038,
"learning_rate": 9.328358208955224e-05,
"loss": 0.9732,
"step": 180
},
{
"epoch": 0.71,
"grad_norm": 6.00456428527832,
"learning_rate": 9.291044776119402e-05,
"loss": 1.0009,
"step": 190
},
{
"epoch": 0.75,
"grad_norm": 5.091552734375,
"learning_rate": 9.253731343283582e-05,
"loss": 0.9345,
"step": 200
},
{
"epoch": 0.75,
"eval_accuracy": 0.7996485061511424,
"eval_loss": 0.8296495079994202,
"eval_runtime": 7.8912,
"eval_samples_per_second": 72.105,
"eval_steps_per_second": 9.124,
"step": 200
},
{
"epoch": 0.78,
"grad_norm": 3.2533326148986816,
"learning_rate": 9.216417910447762e-05,
"loss": 0.793,
"step": 210
},
{
"epoch": 0.82,
"grad_norm": 6.073918342590332,
"learning_rate": 9.17910447761194e-05,
"loss": 0.9835,
"step": 220
},
{
"epoch": 0.86,
"grad_norm": 3.6311192512512207,
"learning_rate": 9.14179104477612e-05,
"loss": 0.8801,
"step": 230
},
{
"epoch": 0.9,
"grad_norm": 4.446895599365234,
"learning_rate": 9.104477611940299e-05,
"loss": 1.0534,
"step": 240
},
{
"epoch": 0.93,
"grad_norm": 4.668705463409424,
"learning_rate": 9.067164179104479e-05,
"loss": 0.9396,
"step": 250
},
{
"epoch": 0.97,
"grad_norm": 6.191302299499512,
"learning_rate": 9.029850746268657e-05,
"loss": 0.9275,
"step": 260
},
{
"epoch": 1.01,
"grad_norm": 3.170959711074829,
"learning_rate": 8.992537313432836e-05,
"loss": 0.8595,
"step": 270
},
{
"epoch": 1.04,
"grad_norm": 3.690964460372925,
"learning_rate": 8.955223880597016e-05,
"loss": 0.733,
"step": 280
},
{
"epoch": 1.08,
"grad_norm": 4.871851444244385,
"learning_rate": 8.917910447761194e-05,
"loss": 0.7623,
"step": 290
},
{
"epoch": 1.12,
"grad_norm": 3.3851799964904785,
"learning_rate": 8.880597014925374e-05,
"loss": 0.8816,
"step": 300
},
{
"epoch": 1.12,
"eval_accuracy": 0.8101933216168717,
"eval_loss": 0.7156229615211487,
"eval_runtime": 7.8519,
"eval_samples_per_second": 72.467,
"eval_steps_per_second": 9.17,
"step": 300
},
{
"epoch": 1.16,
"grad_norm": 3.334380865097046,
"learning_rate": 8.843283582089554e-05,
"loss": 0.8567,
"step": 310
},
{
"epoch": 1.19,
"grad_norm": 4.673859596252441,
"learning_rate": 8.805970149253732e-05,
"loss": 0.7926,
"step": 320
},
{
"epoch": 1.23,
"grad_norm": 3.3042550086975098,
"learning_rate": 8.76865671641791e-05,
"loss": 0.6847,
"step": 330
},
{
"epoch": 1.27,
"grad_norm": 5.4356513023376465,
"learning_rate": 8.731343283582089e-05,
"loss": 0.7656,
"step": 340
},
{
"epoch": 1.31,
"grad_norm": 7.050413131713867,
"learning_rate": 8.694029850746269e-05,
"loss": 0.6658,
"step": 350
},
{
"epoch": 1.34,
"grad_norm": 5.980592727661133,
"learning_rate": 8.656716417910447e-05,
"loss": 0.7948,
"step": 360
},
{
"epoch": 1.38,
"grad_norm": 3.894716739654541,
"learning_rate": 8.619402985074627e-05,
"loss": 0.8381,
"step": 370
},
{
"epoch": 1.42,
"grad_norm": 7.189664363861084,
"learning_rate": 8.582089552238807e-05,
"loss": 0.6532,
"step": 380
},
{
"epoch": 1.46,
"grad_norm": 4.317276477813721,
"learning_rate": 8.544776119402986e-05,
"loss": 0.7763,
"step": 390
},
{
"epoch": 1.49,
"grad_norm": 4.480589866638184,
"learning_rate": 8.511194029850747e-05,
"loss": 0.7425,
"step": 400
},
{
"epoch": 1.49,
"eval_accuracy": 0.8066783831282952,
"eval_loss": 0.6529447436332703,
"eval_runtime": 7.793,
"eval_samples_per_second": 73.014,
"eval_steps_per_second": 9.239,
"step": 400
},
{
"epoch": 1.53,
"grad_norm": 4.1799163818359375,
"learning_rate": 8.473880597014926e-05,
"loss": 0.6928,
"step": 410
},
{
"epoch": 1.57,
"grad_norm": 4.81996488571167,
"learning_rate": 8.436567164179105e-05,
"loss": 0.7769,
"step": 420
},
{
"epoch": 1.6,
"grad_norm": 7.18645715713501,
"learning_rate": 8.399253731343283e-05,
"loss": 0.6848,
"step": 430
},
{
"epoch": 1.64,
"grad_norm": 3.888197660446167,
"learning_rate": 8.361940298507463e-05,
"loss": 0.5977,
"step": 440
},
{
"epoch": 1.68,
"grad_norm": 7.374312877655029,
"learning_rate": 8.324626865671642e-05,
"loss": 0.6001,
"step": 450
},
{
"epoch": 1.72,
"grad_norm": 6.553064823150635,
"learning_rate": 8.287313432835821e-05,
"loss": 0.6683,
"step": 460
},
{
"epoch": 1.75,
"grad_norm": 3.466761589050293,
"learning_rate": 8.25e-05,
"loss": 0.6484,
"step": 470
},
{
"epoch": 1.79,
"grad_norm": 3.534076690673828,
"learning_rate": 8.21268656716418e-05,
"loss": 0.6589,
"step": 480
},
{
"epoch": 1.83,
"grad_norm": 3.581280469894409,
"learning_rate": 8.17537313432836e-05,
"loss": 0.6173,
"step": 490
},
{
"epoch": 1.87,
"grad_norm": 6.162041664123535,
"learning_rate": 8.138059701492538e-05,
"loss": 0.6883,
"step": 500
},
{
"epoch": 1.87,
"eval_accuracy": 0.8242530755711776,
"eval_loss": 0.6078779697418213,
"eval_runtime": 7.6716,
"eval_samples_per_second": 74.169,
"eval_steps_per_second": 9.385,
"step": 500
},
{
"epoch": 1.9,
"grad_norm": 5.477086067199707,
"learning_rate": 8.100746268656717e-05,
"loss": 0.5952,
"step": 510
},
{
"epoch": 1.94,
"grad_norm": 2.389667510986328,
"learning_rate": 8.063432835820895e-05,
"loss": 0.5193,
"step": 520
},
{
"epoch": 1.98,
"grad_norm": 5.730781555175781,
"learning_rate": 8.026119402985075e-05,
"loss": 0.6818,
"step": 530
},
{
"epoch": 2.01,
"grad_norm": 6.305990219116211,
"learning_rate": 7.992537313432836e-05,
"loss": 0.5738,
"step": 540
},
{
"epoch": 2.05,
"grad_norm": 3.507434368133545,
"learning_rate": 7.955223880597016e-05,
"loss": 0.5685,
"step": 550
},
{
"epoch": 2.09,
"grad_norm": 12.683993339538574,
"learning_rate": 7.917910447761194e-05,
"loss": 0.6684,
"step": 560
},
{
"epoch": 2.13,
"grad_norm": 5.5166916847229,
"learning_rate": 7.880597014925374e-05,
"loss": 0.4787,
"step": 570
},
{
"epoch": 2.16,
"grad_norm": 6.427499294281006,
"learning_rate": 7.843283582089552e-05,
"loss": 0.5818,
"step": 580
},
{
"epoch": 2.2,
"grad_norm": 5.062973976135254,
"learning_rate": 7.805970149253732e-05,
"loss": 0.4766,
"step": 590
},
{
"epoch": 2.24,
"grad_norm": 5.720675945281982,
"learning_rate": 7.768656716417911e-05,
"loss": 0.5454,
"step": 600
},
{
"epoch": 2.24,
"eval_accuracy": 0.8347978910369068,
"eval_loss": 0.5604887008666992,
"eval_runtime": 7.7133,
"eval_samples_per_second": 73.769,
"eval_steps_per_second": 9.335,
"step": 600
},
{
"epoch": 2.28,
"grad_norm": 7.875051021575928,
"learning_rate": 7.731343283582089e-05,
"loss": 0.5935,
"step": 610
},
{
"epoch": 2.31,
"grad_norm": 4.378401756286621,
"learning_rate": 7.694029850746269e-05,
"loss": 0.4639,
"step": 620
},
{
"epoch": 2.35,
"grad_norm": 7.522930145263672,
"learning_rate": 7.656716417910448e-05,
"loss": 0.4867,
"step": 630
},
{
"epoch": 2.39,
"grad_norm": 6.3615288734436035,
"learning_rate": 7.619402985074627e-05,
"loss": 0.5302,
"step": 640
},
{
"epoch": 2.43,
"grad_norm": 3.8204784393310547,
"learning_rate": 7.582089552238806e-05,
"loss": 0.3864,
"step": 650
},
{
"epoch": 2.46,
"grad_norm": 2.3520662784576416,
"learning_rate": 7.544776119402986e-05,
"loss": 0.6458,
"step": 660
},
{
"epoch": 2.5,
"grad_norm": 3.9832942485809326,
"learning_rate": 7.507462686567166e-05,
"loss": 0.494,
"step": 670
},
{
"epoch": 2.54,
"grad_norm": 3.6783320903778076,
"learning_rate": 7.470149253731343e-05,
"loss": 0.6213,
"step": 680
},
{
"epoch": 2.57,
"grad_norm": 4.528789520263672,
"learning_rate": 7.432835820895523e-05,
"loss": 0.615,
"step": 690
},
{
"epoch": 2.61,
"grad_norm": 5.556227207183838,
"learning_rate": 7.395522388059701e-05,
"loss": 0.5383,
"step": 700
},
{
"epoch": 2.61,
"eval_accuracy": 0.8295254833040422,
"eval_loss": 0.5571200251579285,
"eval_runtime": 7.8934,
"eval_samples_per_second": 72.085,
"eval_steps_per_second": 9.122,
"step": 700
},
{
"epoch": 2.65,
"grad_norm": 4.617480754852295,
"learning_rate": 7.358208955223881e-05,
"loss": 0.4987,
"step": 710
},
{
"epoch": 2.69,
"grad_norm": 4.6940412521362305,
"learning_rate": 7.32089552238806e-05,
"loss": 0.5466,
"step": 720
},
{
"epoch": 2.72,
"grad_norm": 3.8839175701141357,
"learning_rate": 7.283582089552239e-05,
"loss": 0.5409,
"step": 730
},
{
"epoch": 2.76,
"grad_norm": 6.855696201324463,
"learning_rate": 7.246268656716419e-05,
"loss": 0.3972,
"step": 740
},
{
"epoch": 2.8,
"grad_norm": 3.9779269695281982,
"learning_rate": 7.208955223880597e-05,
"loss": 0.4719,
"step": 750
},
{
"epoch": 2.84,
"grad_norm": 10.327420234680176,
"learning_rate": 7.171641791044776e-05,
"loss": 0.668,
"step": 760
},
{
"epoch": 2.87,
"grad_norm": 5.06951904296875,
"learning_rate": 7.134328358208956e-05,
"loss": 0.5899,
"step": 770
},
{
"epoch": 2.91,
"grad_norm": 5.539373397827148,
"learning_rate": 7.097014925373134e-05,
"loss": 0.5813,
"step": 780
},
{
"epoch": 2.95,
"grad_norm": 4.622121334075928,
"learning_rate": 7.059701492537314e-05,
"loss": 0.5294,
"step": 790
},
{
"epoch": 2.99,
"grad_norm": 2.6457552909851074,
"learning_rate": 7.022388059701493e-05,
"loss": 0.5442,
"step": 800
},
{
"epoch": 2.99,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.5864126682281494,
"eval_runtime": 7.8507,
"eval_samples_per_second": 72.478,
"eval_steps_per_second": 9.171,
"step": 800
},
{
"epoch": 3.02,
"grad_norm": 3.373798370361328,
"learning_rate": 6.985074626865672e-05,
"loss": 0.4183,
"step": 810
},
{
"epoch": 3.06,
"grad_norm": 4.0179667472839355,
"learning_rate": 6.947761194029851e-05,
"loss": 0.3611,
"step": 820
},
{
"epoch": 3.1,
"grad_norm": 7.72437858581543,
"learning_rate": 6.91044776119403e-05,
"loss": 0.4543,
"step": 830
},
{
"epoch": 3.13,
"grad_norm": 3.1097893714904785,
"learning_rate": 6.873134328358209e-05,
"loss": 0.5194,
"step": 840
},
{
"epoch": 3.17,
"grad_norm": 6.581250190734863,
"learning_rate": 6.835820895522388e-05,
"loss": 0.3839,
"step": 850
},
{
"epoch": 3.21,
"grad_norm": 5.605171203613281,
"learning_rate": 6.798507462686568e-05,
"loss": 0.4499,
"step": 860
},
{
"epoch": 3.25,
"grad_norm": 2.834651231765747,
"learning_rate": 6.761194029850747e-05,
"loss": 0.5067,
"step": 870
},
{
"epoch": 3.28,
"grad_norm": 4.615099906921387,
"learning_rate": 6.723880597014926e-05,
"loss": 0.4869,
"step": 880
},
{
"epoch": 3.32,
"grad_norm": 6.115981101989746,
"learning_rate": 6.686567164179106e-05,
"loss": 0.4793,
"step": 890
},
{
"epoch": 3.36,
"grad_norm": 1.1021697521209717,
"learning_rate": 6.649253731343283e-05,
"loss": 0.3986,
"step": 900
},
{
"epoch": 3.36,
"eval_accuracy": 0.8312829525483304,
"eval_loss": 0.5632173418998718,
"eval_runtime": 7.731,
"eval_samples_per_second": 73.6,
"eval_steps_per_second": 9.313,
"step": 900
},
{
"epoch": 3.4,
"grad_norm": 7.019008159637451,
"learning_rate": 6.611940298507463e-05,
"loss": 0.383,
"step": 910
},
{
"epoch": 3.43,
"grad_norm": 2.586031913757324,
"learning_rate": 6.574626865671642e-05,
"loss": 0.2752,
"step": 920
},
{
"epoch": 3.47,
"grad_norm": 2.5189669132232666,
"learning_rate": 6.537313432835821e-05,
"loss": 0.2944,
"step": 930
},
{
"epoch": 3.51,
"grad_norm": 10.028382301330566,
"learning_rate": 6.500000000000001e-05,
"loss": 0.4378,
"step": 940
},
{
"epoch": 3.54,
"grad_norm": 1.8697803020477295,
"learning_rate": 6.462686567164179e-05,
"loss": 0.3956,
"step": 950
},
{
"epoch": 3.58,
"grad_norm": 5.872415065765381,
"learning_rate": 6.425373134328359e-05,
"loss": 0.338,
"step": 960
},
{
"epoch": 3.62,
"grad_norm": 8.272451400756836,
"learning_rate": 6.388059701492538e-05,
"loss": 0.4264,
"step": 970
},
{
"epoch": 3.66,
"grad_norm": 9.422249794006348,
"learning_rate": 6.350746268656716e-05,
"loss": 0.4258,
"step": 980
},
{
"epoch": 3.69,
"grad_norm": 8.768738746643066,
"learning_rate": 6.313432835820896e-05,
"loss": 0.3308,
"step": 990
},
{
"epoch": 3.73,
"grad_norm": 6.355968475341797,
"learning_rate": 6.276119402985074e-05,
"loss": 0.3438,
"step": 1000
},
{
"epoch": 3.73,
"eval_accuracy": 0.836555360281195,
"eval_loss": 0.5606371760368347,
"eval_runtime": 7.818,
"eval_samples_per_second": 72.781,
"eval_steps_per_second": 9.21,
"step": 1000
},
{
"epoch": 3.77,
"grad_norm": 3.973480463027954,
"learning_rate": 6.238805970149254e-05,
"loss": 0.5042,
"step": 1010
},
{
"epoch": 3.81,
"grad_norm": 5.739313125610352,
"learning_rate": 6.201492537313434e-05,
"loss": 0.4515,
"step": 1020
},
{
"epoch": 3.84,
"grad_norm": 4.196649074554443,
"learning_rate": 6.164179104477613e-05,
"loss": 0.4404,
"step": 1030
},
{
"epoch": 3.88,
"grad_norm": 4.671971321105957,
"learning_rate": 6.126865671641791e-05,
"loss": 0.4746,
"step": 1040
},
{
"epoch": 3.92,
"grad_norm": 6.87581205368042,
"learning_rate": 6.08955223880597e-05,
"loss": 0.4637,
"step": 1050
},
{
"epoch": 3.96,
"grad_norm": 7.224815368652344,
"learning_rate": 6.052238805970149e-05,
"loss": 0.4754,
"step": 1060
},
{
"epoch": 3.99,
"grad_norm": 4.4340314865112305,
"learning_rate": 6.014925373134329e-05,
"loss": 0.4165,
"step": 1070
},
{
"epoch": 4.03,
"grad_norm": 1.151932716369629,
"learning_rate": 5.9776119402985076e-05,
"loss": 0.3498,
"step": 1080
},
{
"epoch": 4.07,
"grad_norm": 6.31879997253418,
"learning_rate": 5.940298507462687e-05,
"loss": 0.3505,
"step": 1090
},
{
"epoch": 4.1,
"grad_norm": 4.674696445465088,
"learning_rate": 5.902985074626865e-05,
"loss": 0.4345,
"step": 1100
},
{
"epoch": 4.1,
"eval_accuracy": 0.836555360281195,
"eval_loss": 0.5353797674179077,
"eval_runtime": 7.9559,
"eval_samples_per_second": 71.519,
"eval_steps_per_second": 9.05,
"step": 1100
},
{
"epoch": 4.14,
"grad_norm": 6.790203094482422,
"learning_rate": 5.865671641791045e-05,
"loss": 0.3189,
"step": 1110
},
{
"epoch": 4.18,
"grad_norm": 5.554905414581299,
"learning_rate": 5.828358208955225e-05,
"loss": 0.3255,
"step": 1120
},
{
"epoch": 4.22,
"grad_norm": 1.87189781665802,
"learning_rate": 5.7910447761194034e-05,
"loss": 0.2613,
"step": 1130
},
{
"epoch": 4.25,
"grad_norm": 3.4729249477386475,
"learning_rate": 5.7537313432835826e-05,
"loss": 0.4037,
"step": 1140
},
{
"epoch": 4.29,
"grad_norm": 3.2373063564300537,
"learning_rate": 5.716417910447761e-05,
"loss": 0.384,
"step": 1150
},
{
"epoch": 4.33,
"grad_norm": 1.8042526245117188,
"learning_rate": 5.679104477611941e-05,
"loss": 0.4024,
"step": 1160
},
{
"epoch": 4.37,
"grad_norm": 0.9592193365097046,
"learning_rate": 5.64179104477612e-05,
"loss": 0.3646,
"step": 1170
},
{
"epoch": 4.4,
"grad_norm": 4.0469584465026855,
"learning_rate": 5.6044776119402986e-05,
"loss": 0.3622,
"step": 1180
},
{
"epoch": 4.44,
"grad_norm": 4.470405578613281,
"learning_rate": 5.5671641791044784e-05,
"loss": 0.2996,
"step": 1190
},
{
"epoch": 4.48,
"grad_norm": 6.086768627166748,
"learning_rate": 5.529850746268657e-05,
"loss": 0.4523,
"step": 1200
},
{
"epoch": 4.48,
"eval_accuracy": 0.8576449912126538,
"eval_loss": 0.49876561760902405,
"eval_runtime": 7.8527,
"eval_samples_per_second": 72.459,
"eval_steps_per_second": 9.169,
"step": 1200
},
{
"epoch": 4.51,
"grad_norm": 3.478428363800049,
"learning_rate": 5.492537313432836e-05,
"loss": 0.4198,
"step": 1210
},
{
"epoch": 4.55,
"grad_norm": 4.539990425109863,
"learning_rate": 5.455223880597016e-05,
"loss": 0.3125,
"step": 1220
},
{
"epoch": 4.59,
"grad_norm": 3.971435070037842,
"learning_rate": 5.4179104477611943e-05,
"loss": 0.2773,
"step": 1230
},
{
"epoch": 4.63,
"grad_norm": 7.168191909790039,
"learning_rate": 5.3805970149253735e-05,
"loss": 0.4852,
"step": 1240
},
{
"epoch": 4.66,
"grad_norm": 2.896576166152954,
"learning_rate": 5.343283582089552e-05,
"loss": 0.3425,
"step": 1250
},
{
"epoch": 4.7,
"grad_norm": 1.4190607070922852,
"learning_rate": 5.305970149253732e-05,
"loss": 0.2219,
"step": 1260
},
{
"epoch": 4.74,
"grad_norm": 5.066045761108398,
"learning_rate": 5.268656716417911e-05,
"loss": 0.3447,
"step": 1270
},
{
"epoch": 4.78,
"grad_norm": 4.2649126052856445,
"learning_rate": 5.2313432835820895e-05,
"loss": 0.3931,
"step": 1280
},
{
"epoch": 4.81,
"grad_norm": 5.704684734344482,
"learning_rate": 5.197761194029851e-05,
"loss": 0.4274,
"step": 1290
},
{
"epoch": 4.85,
"grad_norm": 6.395939350128174,
"learning_rate": 5.16044776119403e-05,
"loss": 0.3162,
"step": 1300
},
{
"epoch": 4.85,
"eval_accuracy": 0.8541300527240774,
"eval_loss": 0.5099390745162964,
"eval_runtime": 7.9919,
"eval_samples_per_second": 71.197,
"eval_steps_per_second": 9.009,
"step": 1300
},
{
"epoch": 4.89,
"grad_norm": 2.4717729091644287,
"learning_rate": 5.123134328358209e-05,
"loss": 0.3442,
"step": 1310
},
{
"epoch": 4.93,
"grad_norm": 0.6504545211791992,
"learning_rate": 5.0858208955223885e-05,
"loss": 0.3313,
"step": 1320
},
{
"epoch": 4.96,
"grad_norm": 4.316141128540039,
"learning_rate": 5.048507462686567e-05,
"loss": 0.3787,
"step": 1330
},
{
"epoch": 5.0,
"grad_norm": 4.9243998527526855,
"learning_rate": 5.011194029850746e-05,
"loss": 0.38,
"step": 1340
},
{
"epoch": 5.04,
"grad_norm": 5.312038421630859,
"learning_rate": 4.973880597014925e-05,
"loss": 0.3268,
"step": 1350
},
{
"epoch": 5.07,
"grad_norm": 3.5483176708221436,
"learning_rate": 4.9365671641791045e-05,
"loss": 0.3423,
"step": 1360
},
{
"epoch": 5.11,
"grad_norm": 4.414547920227051,
"learning_rate": 4.899253731343284e-05,
"loss": 0.2421,
"step": 1370
},
{
"epoch": 5.15,
"grad_norm": 5.7323689460754395,
"learning_rate": 4.861940298507463e-05,
"loss": 0.2795,
"step": 1380
},
{
"epoch": 5.19,
"grad_norm": 4.2763471603393555,
"learning_rate": 4.824626865671642e-05,
"loss": 0.2402,
"step": 1390
},
{
"epoch": 5.22,
"grad_norm": 9.259199142456055,
"learning_rate": 4.787313432835821e-05,
"loss": 0.3793,
"step": 1400
},
{
"epoch": 5.22,
"eval_accuracy": 0.843585237258348,
"eval_loss": 0.5190387964248657,
"eval_runtime": 7.7562,
"eval_samples_per_second": 73.361,
"eval_steps_per_second": 9.283,
"step": 1400
},
{
"epoch": 5.26,
"grad_norm": 4.773892402648926,
"learning_rate": 4.75e-05,
"loss": 0.3476,
"step": 1410
},
{
"epoch": 5.3,
"grad_norm": 1.1271159648895264,
"learning_rate": 4.7126865671641794e-05,
"loss": 0.1949,
"step": 1420
},
{
"epoch": 5.34,
"grad_norm": 2.823958158493042,
"learning_rate": 4.6753731343283586e-05,
"loss": 0.3009,
"step": 1430
},
{
"epoch": 5.37,
"grad_norm": 0.35977163910865784,
"learning_rate": 4.638059701492538e-05,
"loss": 0.1821,
"step": 1440
},
{
"epoch": 5.41,
"grad_norm": 3.380308151245117,
"learning_rate": 4.600746268656716e-05,
"loss": 0.323,
"step": 1450
},
{
"epoch": 5.45,
"grad_norm": 5.946179389953613,
"learning_rate": 4.5634328358208954e-05,
"loss": 0.5344,
"step": 1460
},
{
"epoch": 5.49,
"grad_norm": 8.254781723022461,
"learning_rate": 4.526119402985075e-05,
"loss": 0.2799,
"step": 1470
},
{
"epoch": 5.52,
"grad_norm": 6.808130741119385,
"learning_rate": 4.4888059701492544e-05,
"loss": 0.3173,
"step": 1480
},
{
"epoch": 5.56,
"grad_norm": 17.452037811279297,
"learning_rate": 4.451492537313433e-05,
"loss": 0.3251,
"step": 1490
},
{
"epoch": 5.6,
"grad_norm": 2.3097095489501953,
"learning_rate": 4.414179104477612e-05,
"loss": 0.3228,
"step": 1500
},
{
"epoch": 5.6,
"eval_accuracy": 0.8576449912126538,
"eval_loss": 0.4589254856109619,
"eval_runtime": 8.0547,
"eval_samples_per_second": 70.642,
"eval_steps_per_second": 8.939,
"step": 1500
},
{
"epoch": 5.63,
"grad_norm": 3.337970018386841,
"learning_rate": 4.376865671641791e-05,
"loss": 0.2528,
"step": 1510
},
{
"epoch": 5.67,
"grad_norm": 0.5921415090560913,
"learning_rate": 4.33955223880597e-05,
"loss": 0.2459,
"step": 1520
},
{
"epoch": 5.71,
"grad_norm": 4.148998260498047,
"learning_rate": 4.3022388059701495e-05,
"loss": 0.2927,
"step": 1530
},
{
"epoch": 5.75,
"grad_norm": 5.740537166595459,
"learning_rate": 4.2649253731343286e-05,
"loss": 0.423,
"step": 1540
},
{
"epoch": 5.78,
"grad_norm": 5.316250324249268,
"learning_rate": 4.227611940298508e-05,
"loss": 0.3735,
"step": 1550
},
{
"epoch": 5.82,
"grad_norm": 5.52378511428833,
"learning_rate": 4.190298507462686e-05,
"loss": 0.3613,
"step": 1560
},
{
"epoch": 5.86,
"grad_norm": 2.1002511978149414,
"learning_rate": 4.152985074626866e-05,
"loss": 0.259,
"step": 1570
},
{
"epoch": 5.9,
"grad_norm": 5.339119911193848,
"learning_rate": 4.115671641791045e-05,
"loss": 0.3355,
"step": 1580
},
{
"epoch": 5.93,
"grad_norm": 3.0551536083221436,
"learning_rate": 4.0783582089552244e-05,
"loss": 0.4342,
"step": 1590
},
{
"epoch": 5.97,
"grad_norm": 6.549235820770264,
"learning_rate": 4.041044776119403e-05,
"loss": 0.1795,
"step": 1600
},
{
"epoch": 5.97,
"eval_accuracy": 0.8488576449912126,
"eval_loss": 0.5095508694648743,
"eval_runtime": 7.7872,
"eval_samples_per_second": 73.068,
"eval_steps_per_second": 9.246,
"step": 1600
},
{
"epoch": 6.01,
"grad_norm": 11.5170316696167,
"learning_rate": 4.003731343283582e-05,
"loss": 0.3778,
"step": 1610
},
{
"epoch": 6.04,
"grad_norm": 6.004143238067627,
"learning_rate": 3.966417910447761e-05,
"loss": 0.3624,
"step": 1620
},
{
"epoch": 6.08,
"grad_norm": 4.328847885131836,
"learning_rate": 3.9291044776119404e-05,
"loss": 0.3478,
"step": 1630
},
{
"epoch": 6.12,
"grad_norm": 3.5757558345794678,
"learning_rate": 3.8917910447761195e-05,
"loss": 0.2208,
"step": 1640
},
{
"epoch": 6.16,
"grad_norm": 8.37783432006836,
"learning_rate": 3.854477611940299e-05,
"loss": 0.3614,
"step": 1650
},
{
"epoch": 6.19,
"grad_norm": 2.4890713691711426,
"learning_rate": 3.817164179104478e-05,
"loss": 0.2514,
"step": 1660
},
{
"epoch": 6.23,
"grad_norm": 8.873276710510254,
"learning_rate": 3.7798507462686563e-05,
"loss": 0.2233,
"step": 1670
},
{
"epoch": 6.27,
"grad_norm": 0.29393309354782104,
"learning_rate": 3.742537313432836e-05,
"loss": 0.2474,
"step": 1680
},
{
"epoch": 6.31,
"grad_norm": 3.810150384902954,
"learning_rate": 3.7052238805970153e-05,
"loss": 0.2481,
"step": 1690
},
{
"epoch": 6.34,
"grad_norm": 1.989057183265686,
"learning_rate": 3.6679104477611945e-05,
"loss": 0.2626,
"step": 1700
},
{
"epoch": 6.34,
"eval_accuracy": 0.8488576449912126,
"eval_loss": 0.5402765274047852,
"eval_runtime": 7.9293,
"eval_samples_per_second": 71.759,
"eval_steps_per_second": 9.08,
"step": 1700
},
{
"epoch": 6.38,
"grad_norm": 8.488819122314453,
"learning_rate": 3.630597014925373e-05,
"loss": 0.2826,
"step": 1710
},
{
"epoch": 6.42,
"grad_norm": 5.542993068695068,
"learning_rate": 3.593283582089552e-05,
"loss": 0.3552,
"step": 1720
},
{
"epoch": 6.46,
"grad_norm": 6.646905422210693,
"learning_rate": 3.555970149253732e-05,
"loss": 0.4405,
"step": 1730
},
{
"epoch": 6.49,
"grad_norm": 4.022976398468018,
"learning_rate": 3.5186567164179105e-05,
"loss": 0.2738,
"step": 1740
},
{
"epoch": 6.53,
"grad_norm": 3.5472657680511475,
"learning_rate": 3.4813432835820896e-05,
"loss": 0.2807,
"step": 1750
},
{
"epoch": 6.57,
"grad_norm": 12.070052146911621,
"learning_rate": 3.444029850746269e-05,
"loss": 0.3634,
"step": 1760
},
{
"epoch": 6.6,
"grad_norm": 5.368374347686768,
"learning_rate": 3.406716417910448e-05,
"loss": 0.3252,
"step": 1770
},
{
"epoch": 6.64,
"grad_norm": 5.566130638122559,
"learning_rate": 3.369402985074627e-05,
"loss": 0.3034,
"step": 1780
},
{
"epoch": 6.68,
"grad_norm": 5.875336170196533,
"learning_rate": 3.332089552238806e-05,
"loss": 0.3406,
"step": 1790
},
{
"epoch": 6.72,
"grad_norm": 2.4168920516967773,
"learning_rate": 3.2947761194029854e-05,
"loss": 0.3041,
"step": 1800
},
{
"epoch": 6.72,
"eval_accuracy": 0.8488576449912126,
"eval_loss": 0.4907586872577667,
"eval_runtime": 7.8209,
"eval_samples_per_second": 72.754,
"eval_steps_per_second": 9.206,
"step": 1800
},
{
"epoch": 6.75,
"grad_norm": 3.1040282249450684,
"learning_rate": 3.2574626865671646e-05,
"loss": 0.3167,
"step": 1810
},
{
"epoch": 6.79,
"grad_norm": 1.8458846807479858,
"learning_rate": 3.220149253731343e-05,
"loss": 0.2061,
"step": 1820
},
{
"epoch": 6.83,
"grad_norm": 0.4053177833557129,
"learning_rate": 3.182835820895523e-05,
"loss": 0.3113,
"step": 1830
},
{
"epoch": 6.87,
"grad_norm": 0.23064230382442474,
"learning_rate": 3.145522388059702e-05,
"loss": 0.2368,
"step": 1840
},
{
"epoch": 6.9,
"grad_norm": 1.006479263305664,
"learning_rate": 3.1082089552238805e-05,
"loss": 0.2265,
"step": 1850
},
{
"epoch": 6.94,
"grad_norm": 4.072957992553711,
"learning_rate": 3.07089552238806e-05,
"loss": 0.2976,
"step": 1860
},
{
"epoch": 6.98,
"grad_norm": 16.575963973999023,
"learning_rate": 3.033582089552239e-05,
"loss": 0.1504,
"step": 1870
},
{
"epoch": 7.01,
"grad_norm": 2.9144656658172607,
"learning_rate": 2.9962686567164183e-05,
"loss": 0.2156,
"step": 1880
},
{
"epoch": 7.05,
"grad_norm": 4.547207832336426,
"learning_rate": 2.958955223880597e-05,
"loss": 0.2693,
"step": 1890
},
{
"epoch": 7.09,
"grad_norm": 0.5566532611846924,
"learning_rate": 2.9216417910447763e-05,
"loss": 0.1831,
"step": 1900
},
{
"epoch": 7.09,
"eval_accuracy": 0.8383128295254832,
"eval_loss": 0.5721341967582703,
"eval_runtime": 7.7377,
"eval_samples_per_second": 73.536,
"eval_steps_per_second": 9.305,
"step": 1900
},
{
"epoch": 7.13,
"grad_norm": 7.9241838455200195,
"learning_rate": 2.8843283582089555e-05,
"loss": 0.3037,
"step": 1910
},
{
"epoch": 7.16,
"grad_norm": 4.847833156585693,
"learning_rate": 2.8470149253731343e-05,
"loss": 0.2744,
"step": 1920
},
{
"epoch": 7.2,
"grad_norm": 4.368974208831787,
"learning_rate": 2.8097014925373134e-05,
"loss": 0.1603,
"step": 1930
},
{
"epoch": 7.24,
"grad_norm": 5.848027229309082,
"learning_rate": 2.772388059701493e-05,
"loss": 0.3318,
"step": 1940
},
{
"epoch": 7.28,
"grad_norm": 5.53363037109375,
"learning_rate": 2.7350746268656718e-05,
"loss": 0.2568,
"step": 1950
},
{
"epoch": 7.31,
"grad_norm": 1.3791863918304443,
"learning_rate": 2.697761194029851e-05,
"loss": 0.2186,
"step": 1960
},
{
"epoch": 7.35,
"grad_norm": 13.533841133117676,
"learning_rate": 2.6604477611940297e-05,
"loss": 0.2772,
"step": 1970
},
{
"epoch": 7.39,
"grad_norm": 1.113595962524414,
"learning_rate": 2.623134328358209e-05,
"loss": 0.3396,
"step": 1980
},
{
"epoch": 7.43,
"grad_norm": 3.193376064300537,
"learning_rate": 2.5858208955223884e-05,
"loss": 0.2171,
"step": 1990
},
{
"epoch": 7.46,
"grad_norm": 2.8687243461608887,
"learning_rate": 2.5485074626865672e-05,
"loss": 0.2275,
"step": 2000
},
{
"epoch": 7.46,
"eval_accuracy": 0.8312829525483304,
"eval_loss": 0.5349107980728149,
"eval_runtime": 8.0113,
"eval_samples_per_second": 71.025,
"eval_steps_per_second": 8.987,
"step": 2000
},
{
"epoch": 7.5,
"grad_norm": 6.330258846282959,
"learning_rate": 2.5111940298507464e-05,
"loss": 0.2165,
"step": 2010
},
{
"epoch": 7.54,
"grad_norm": 2.457519769668579,
"learning_rate": 2.4738805970149252e-05,
"loss": 0.3275,
"step": 2020
},
{
"epoch": 7.57,
"grad_norm": 1.468772053718567,
"learning_rate": 2.4365671641791047e-05,
"loss": 0.186,
"step": 2030
},
{
"epoch": 7.61,
"grad_norm": 4.308888912200928,
"learning_rate": 2.3992537313432835e-05,
"loss": 0.3182,
"step": 2040
},
{
"epoch": 7.65,
"grad_norm": 1.8849867582321167,
"learning_rate": 2.361940298507463e-05,
"loss": 0.2631,
"step": 2050
},
{
"epoch": 7.69,
"grad_norm": 2.6795170307159424,
"learning_rate": 2.3246268656716418e-05,
"loss": 0.1724,
"step": 2060
},
{
"epoch": 7.72,
"grad_norm": 0.22702960669994354,
"learning_rate": 2.287313432835821e-05,
"loss": 0.2542,
"step": 2070
},
{
"epoch": 7.76,
"grad_norm": 4.6633429527282715,
"learning_rate": 2.25e-05,
"loss": 0.259,
"step": 2080
},
{
"epoch": 7.8,
"grad_norm": 6.543178558349609,
"learning_rate": 2.2126865671641793e-05,
"loss": 0.3752,
"step": 2090
},
{
"epoch": 7.84,
"grad_norm": 7.109080791473389,
"learning_rate": 2.1753731343283585e-05,
"loss": 0.1762,
"step": 2100
},
{
"epoch": 7.84,
"eval_accuracy": 0.8541300527240774,
"eval_loss": 0.5203543901443481,
"eval_runtime": 7.8922,
"eval_samples_per_second": 72.096,
"eval_steps_per_second": 9.123,
"step": 2100
},
{
"epoch": 7.87,
"grad_norm": 3.3965115547180176,
"learning_rate": 2.1380597014925373e-05,
"loss": 0.1965,
"step": 2110
},
{
"epoch": 7.91,
"grad_norm": 0.1386798918247223,
"learning_rate": 2.1007462686567164e-05,
"loss": 0.1448,
"step": 2120
},
{
"epoch": 7.95,
"grad_norm": 8.268773078918457,
"learning_rate": 2.0634328358208956e-05,
"loss": 0.2203,
"step": 2130
},
{
"epoch": 7.99,
"grad_norm": 2.712890625,
"learning_rate": 2.0261194029850748e-05,
"loss": 0.2104,
"step": 2140
},
{
"epoch": 8.02,
"grad_norm": 2.0390050411224365,
"learning_rate": 1.988805970149254e-05,
"loss": 0.2063,
"step": 2150
},
{
"epoch": 8.06,
"grad_norm": 4.355598449707031,
"learning_rate": 1.951492537313433e-05,
"loss": 0.1356,
"step": 2160
},
{
"epoch": 8.1,
"grad_norm": 9.854630470275879,
"learning_rate": 1.914179104477612e-05,
"loss": 0.1686,
"step": 2170
},
{
"epoch": 8.13,
"grad_norm": 4.178330421447754,
"learning_rate": 1.8768656716417914e-05,
"loss": 0.2578,
"step": 2180
},
{
"epoch": 8.17,
"grad_norm": 5.019784450531006,
"learning_rate": 1.8395522388059702e-05,
"loss": 0.1923,
"step": 2190
},
{
"epoch": 8.21,
"grad_norm": 3.8136210441589355,
"learning_rate": 1.8022388059701494e-05,
"loss": 0.2112,
"step": 2200
},
{
"epoch": 8.21,
"eval_accuracy": 0.8629173989455184,
"eval_loss": 0.5188840627670288,
"eval_runtime": 8.1412,
"eval_samples_per_second": 69.891,
"eval_steps_per_second": 8.844,
"step": 2200
},
{
"epoch": 8.25,
"grad_norm": 2.7035305500030518,
"learning_rate": 1.7649253731343285e-05,
"loss": 0.2501,
"step": 2210
},
{
"epoch": 8.28,
"grad_norm": 6.736306190490723,
"learning_rate": 1.7276119402985073e-05,
"loss": 0.2213,
"step": 2220
},
{
"epoch": 8.32,
"grad_norm": 3.0436556339263916,
"learning_rate": 1.690298507462687e-05,
"loss": 0.1285,
"step": 2230
},
{
"epoch": 8.36,
"grad_norm": 4.729572772979736,
"learning_rate": 1.6529850746268657e-05,
"loss": 0.2984,
"step": 2240
},
{
"epoch": 8.4,
"grad_norm": 3.6665098667144775,
"learning_rate": 1.6156716417910448e-05,
"loss": 0.1796,
"step": 2250
},
{
"epoch": 8.43,
"grad_norm": 8.485068321228027,
"learning_rate": 1.578358208955224e-05,
"loss": 0.2137,
"step": 2260
},
{
"epoch": 8.47,
"grad_norm": 4.643974304199219,
"learning_rate": 1.541044776119403e-05,
"loss": 0.3009,
"step": 2270
},
{
"epoch": 8.51,
"grad_norm": 2.91859769821167,
"learning_rate": 1.5037313432835823e-05,
"loss": 0.1855,
"step": 2280
},
{
"epoch": 8.54,
"grad_norm": 9.799684524536133,
"learning_rate": 1.4664179104477613e-05,
"loss": 0.2186,
"step": 2290
},
{
"epoch": 8.58,
"grad_norm": 4.92659330368042,
"learning_rate": 1.4291044776119403e-05,
"loss": 0.1242,
"step": 2300
},
{
"epoch": 8.58,
"eval_accuracy": 0.8471001757469244,
"eval_loss": 0.5376706123352051,
"eval_runtime": 7.8653,
"eval_samples_per_second": 72.343,
"eval_steps_per_second": 9.154,
"step": 2300
},
{
"epoch": 8.62,
"grad_norm": 0.7728621363639832,
"learning_rate": 1.3917910447761196e-05,
"loss": 0.2769,
"step": 2310
},
{
"epoch": 8.66,
"grad_norm": 3.757192373275757,
"learning_rate": 1.3544776119402986e-05,
"loss": 0.31,
"step": 2320
},
{
"epoch": 8.69,
"grad_norm": 5.901330471038818,
"learning_rate": 1.3171641791044777e-05,
"loss": 0.2488,
"step": 2330
},
{
"epoch": 8.73,
"grad_norm": 0.1360226422548294,
"learning_rate": 1.2798507462686567e-05,
"loss": 0.2359,
"step": 2340
},
{
"epoch": 8.77,
"grad_norm": 5.801501750946045,
"learning_rate": 1.2425373134328359e-05,
"loss": 0.23,
"step": 2350
},
{
"epoch": 8.81,
"grad_norm": 3.3060359954833984,
"learning_rate": 1.2052238805970149e-05,
"loss": 0.1114,
"step": 2360
},
{
"epoch": 8.84,
"grad_norm": 2.0813100337982178,
"learning_rate": 1.167910447761194e-05,
"loss": 0.1569,
"step": 2370
},
{
"epoch": 8.88,
"grad_norm": 0.42951256036758423,
"learning_rate": 1.1305970149253732e-05,
"loss": 0.2636,
"step": 2380
},
{
"epoch": 8.92,
"grad_norm": 3.2714788913726807,
"learning_rate": 1.0932835820895524e-05,
"loss": 0.2197,
"step": 2390
},
{
"epoch": 8.96,
"grad_norm": 4.24855375289917,
"learning_rate": 1.0559701492537313e-05,
"loss": 0.1207,
"step": 2400
},
{
"epoch": 8.96,
"eval_accuracy": 0.8558875219683656,
"eval_loss": 0.5324714779853821,
"eval_runtime": 7.9022,
"eval_samples_per_second": 72.006,
"eval_steps_per_second": 9.111,
"step": 2400
},
{
"epoch": 8.99,
"grad_norm": 3.989713430404663,
"learning_rate": 1.0186567164179105e-05,
"loss": 0.2336,
"step": 2410
},
{
"epoch": 9.03,
"grad_norm": 5.590869903564453,
"learning_rate": 9.813432835820897e-06,
"loss": 0.2292,
"step": 2420
},
{
"epoch": 9.07,
"grad_norm": 3.405966281890869,
"learning_rate": 9.440298507462688e-06,
"loss": 0.1654,
"step": 2430
},
{
"epoch": 9.1,
"grad_norm": 3.733381986618042,
"learning_rate": 9.067164179104478e-06,
"loss": 0.2104,
"step": 2440
},
{
"epoch": 9.14,
"grad_norm": 0.1994183361530304,
"learning_rate": 8.694029850746268e-06,
"loss": 0.0789,
"step": 2450
},
{
"epoch": 9.18,
"grad_norm": 7.948019504547119,
"learning_rate": 8.32089552238806e-06,
"loss": 0.3335,
"step": 2460
},
{
"epoch": 9.22,
"grad_norm": 3.020522117614746,
"learning_rate": 7.947761194029851e-06,
"loss": 0.1838,
"step": 2470
},
{
"epoch": 9.25,
"grad_norm": 2.4797592163085938,
"learning_rate": 7.574626865671643e-06,
"loss": 0.1573,
"step": 2480
},
{
"epoch": 9.29,
"grad_norm": 0.7854322195053101,
"learning_rate": 7.201492537313433e-06,
"loss": 0.1868,
"step": 2490
},
{
"epoch": 9.33,
"grad_norm": 8.424530982971191,
"learning_rate": 6.828358208955224e-06,
"loss": 0.1806,
"step": 2500
},
{
"epoch": 9.33,
"eval_accuracy": 0.8646748681898067,
"eval_loss": 0.5149648785591125,
"eval_runtime": 7.8422,
"eval_samples_per_second": 72.556,
"eval_steps_per_second": 9.181,
"step": 2500
},
{
"epoch": 9.37,
"grad_norm": 2.9176523685455322,
"learning_rate": 6.455223880597015e-06,
"loss": 0.1977,
"step": 2510
},
{
"epoch": 9.4,
"grad_norm": 4.15384578704834,
"learning_rate": 6.082089552238806e-06,
"loss": 0.2007,
"step": 2520
},
{
"epoch": 9.44,
"grad_norm": 2.4758641719818115,
"learning_rate": 5.708955223880597e-06,
"loss": 0.2,
"step": 2530
},
{
"epoch": 9.48,
"grad_norm": 4.053123950958252,
"learning_rate": 5.335820895522389e-06,
"loss": 0.2514,
"step": 2540
},
{
"epoch": 9.51,
"grad_norm": 2.3916337490081787,
"learning_rate": 4.9626865671641796e-06,
"loss": 0.2104,
"step": 2550
},
{
"epoch": 9.55,
"grad_norm": 4.113661766052246,
"learning_rate": 4.58955223880597e-06,
"loss": 0.1998,
"step": 2560
},
{
"epoch": 9.59,
"grad_norm": 3.558722972869873,
"learning_rate": 4.216417910447761e-06,
"loss": 0.144,
"step": 2570
},
{
"epoch": 9.63,
"grad_norm": 2.689765691757202,
"learning_rate": 3.843283582089553e-06,
"loss": 0.1691,
"step": 2580
},
{
"epoch": 9.66,
"grad_norm": 4.95484733581543,
"learning_rate": 3.4701492537313434e-06,
"loss": 0.1875,
"step": 2590
},
{
"epoch": 9.7,
"grad_norm": 6.025635242462158,
"learning_rate": 3.0970149253731345e-06,
"loss": 0.1793,
"step": 2600
},
{
"epoch": 9.7,
"eval_accuracy": 0.8664323374340949,
"eval_loss": 0.5153330564498901,
"eval_runtime": 7.9144,
"eval_samples_per_second": 71.894,
"eval_steps_per_second": 9.097,
"step": 2600
},
{
"epoch": 9.74,
"grad_norm": 0.3092793822288513,
"learning_rate": 2.7238805970149257e-06,
"loss": 0.1385,
"step": 2610
},
{
"epoch": 9.78,
"grad_norm": 1.1317028999328613,
"learning_rate": 2.3507462686567164e-06,
"loss": 0.1628,
"step": 2620
},
{
"epoch": 9.81,
"grad_norm": 7.642726898193359,
"learning_rate": 1.9776119402985076e-06,
"loss": 0.2142,
"step": 2630
},
{
"epoch": 9.85,
"grad_norm": 4.3891191482543945,
"learning_rate": 1.6044776119402985e-06,
"loss": 0.2115,
"step": 2640
},
{
"epoch": 9.89,
"grad_norm": 5.876834869384766,
"learning_rate": 1.2313432835820897e-06,
"loss": 0.2859,
"step": 2650
},
{
"epoch": 9.93,
"grad_norm": 1.6104581356048584,
"learning_rate": 8.582089552238806e-07,
"loss": 0.2752,
"step": 2660
},
{
"epoch": 9.96,
"grad_norm": 5.835386276245117,
"learning_rate": 4.850746268656717e-07,
"loss": 0.2057,
"step": 2670
},
{
"epoch": 10.0,
"grad_norm": 7.006475925445557,
"learning_rate": 1.119402985074627e-07,
"loss": 0.2098,
"step": 2680
},
{
"epoch": 10.0,
"step": 2680,
"total_flos": 3.3230947683690086e+18,
"train_loss": 0.45543073504718384,
"train_runtime": 1353.2313,
"train_samples_per_second": 31.687,
"train_steps_per_second": 1.98
}
],
"logging_steps": 10,
"max_steps": 2680,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 3.3230947683690086e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}