Action_model / trainer_state.json
Raihan004's picture
🍻 cheers
3838257 verified
raw
history blame
No virus
49.7 kB
{
"best_metric": 0.6129801869392395,
"best_model_checkpoint": "Action_model/checkpoint-300",
"epoch": 10.0,
"eval_steps": 100,
"global_step": 2680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 2.570383071899414,
"learning_rate": 9.96268656716418e-05,
"loss": 0.1841,
"step": 10
},
{
"epoch": 0.07,
"grad_norm": 6.266295433044434,
"learning_rate": 9.925373134328359e-05,
"loss": 0.2301,
"step": 20
},
{
"epoch": 0.11,
"grad_norm": 8.001986503601074,
"learning_rate": 9.888059701492539e-05,
"loss": 0.2533,
"step": 30
},
{
"epoch": 0.15,
"grad_norm": 5.319194316864014,
"learning_rate": 9.850746268656717e-05,
"loss": 0.2436,
"step": 40
},
{
"epoch": 0.19,
"grad_norm": 0.9653372764587402,
"learning_rate": 9.813432835820896e-05,
"loss": 0.3712,
"step": 50
},
{
"epoch": 0.22,
"grad_norm": 7.348043441772461,
"learning_rate": 9.776119402985075e-05,
"loss": 0.3645,
"step": 60
},
{
"epoch": 0.26,
"grad_norm": 2.1969542503356934,
"learning_rate": 9.738805970149254e-05,
"loss": 0.4609,
"step": 70
},
{
"epoch": 0.3,
"grad_norm": 6.397550106048584,
"learning_rate": 9.701492537313434e-05,
"loss": 0.4755,
"step": 80
},
{
"epoch": 0.34,
"grad_norm": 6.923007488250732,
"learning_rate": 9.664179104477612e-05,
"loss": 0.3901,
"step": 90
},
{
"epoch": 0.37,
"grad_norm": 4.786198616027832,
"learning_rate": 9.626865671641792e-05,
"loss": 0.255,
"step": 100
},
{
"epoch": 0.37,
"eval_accuracy": 0.7926186291739895,
"eval_loss": 0.7616190314292908,
"eval_runtime": 8.7209,
"eval_samples_per_second": 65.245,
"eval_steps_per_second": 8.256,
"step": 100
},
{
"epoch": 0.41,
"grad_norm": 8.368223190307617,
"learning_rate": 9.58955223880597e-05,
"loss": 0.3784,
"step": 110
},
{
"epoch": 0.45,
"grad_norm": 4.078306198120117,
"learning_rate": 9.552238805970149e-05,
"loss": 0.4148,
"step": 120
},
{
"epoch": 0.49,
"grad_norm": 7.815361022949219,
"learning_rate": 9.514925373134329e-05,
"loss": 0.3621,
"step": 130
},
{
"epoch": 0.52,
"grad_norm": 11.498431205749512,
"learning_rate": 9.477611940298507e-05,
"loss": 0.3974,
"step": 140
},
{
"epoch": 0.56,
"grad_norm": 7.946558952331543,
"learning_rate": 9.440298507462687e-05,
"loss": 0.3856,
"step": 150
},
{
"epoch": 0.6,
"grad_norm": 0.3486919403076172,
"learning_rate": 9.402985074626867e-05,
"loss": 0.2435,
"step": 160
},
{
"epoch": 0.63,
"grad_norm": 4.267444133758545,
"learning_rate": 9.365671641791045e-05,
"loss": 0.3736,
"step": 170
},
{
"epoch": 0.67,
"grad_norm": 3.022345542907715,
"learning_rate": 9.328358208955224e-05,
"loss": 0.439,
"step": 180
},
{
"epoch": 0.71,
"grad_norm": 5.57196044921875,
"learning_rate": 9.291044776119402e-05,
"loss": 0.2996,
"step": 190
},
{
"epoch": 0.75,
"grad_norm": 2.636216640472412,
"learning_rate": 9.253731343283582e-05,
"loss": 0.2048,
"step": 200
},
{
"epoch": 0.75,
"eval_accuracy": 0.8084358523725835,
"eval_loss": 0.724670946598053,
"eval_runtime": 8.4461,
"eval_samples_per_second": 67.368,
"eval_steps_per_second": 8.525,
"step": 200
},
{
"epoch": 0.78,
"grad_norm": 1.615098237991333,
"learning_rate": 9.216417910447762e-05,
"loss": 0.3594,
"step": 210
},
{
"epoch": 0.82,
"grad_norm": 9.315821647644043,
"learning_rate": 9.17910447761194e-05,
"loss": 0.3046,
"step": 220
},
{
"epoch": 0.86,
"grad_norm": 3.669430732727051,
"learning_rate": 9.14179104477612e-05,
"loss": 0.4158,
"step": 230
},
{
"epoch": 0.9,
"grad_norm": 7.0882978439331055,
"learning_rate": 9.104477611940299e-05,
"loss": 0.3477,
"step": 240
},
{
"epoch": 0.93,
"grad_norm": 1.1667325496673584,
"learning_rate": 9.067164179104479e-05,
"loss": 0.316,
"step": 250
},
{
"epoch": 0.97,
"grad_norm": 1.482625961303711,
"learning_rate": 9.029850746268657e-05,
"loss": 0.3922,
"step": 260
},
{
"epoch": 1.01,
"grad_norm": 0.20793116092681885,
"learning_rate": 8.992537313432836e-05,
"loss": 0.3751,
"step": 270
},
{
"epoch": 1.04,
"grad_norm": 6.772298812866211,
"learning_rate": 8.955223880597016e-05,
"loss": 0.3269,
"step": 280
},
{
"epoch": 1.08,
"grad_norm": 5.833349227905273,
"learning_rate": 8.917910447761194e-05,
"loss": 0.3026,
"step": 290
},
{
"epoch": 1.12,
"grad_norm": 6.349458694458008,
"learning_rate": 8.880597014925374e-05,
"loss": 0.3763,
"step": 300
},
{
"epoch": 1.12,
"eval_accuracy": 0.8330404217926186,
"eval_loss": 0.6129801869392395,
"eval_runtime": 8.4095,
"eval_samples_per_second": 67.661,
"eval_steps_per_second": 8.562,
"step": 300
},
{
"epoch": 1.16,
"grad_norm": 4.767229080200195,
"learning_rate": 8.843283582089554e-05,
"loss": 0.3808,
"step": 310
},
{
"epoch": 1.19,
"grad_norm": 12.675297737121582,
"learning_rate": 8.805970149253732e-05,
"loss": 0.3766,
"step": 320
},
{
"epoch": 1.23,
"grad_norm": 3.8118245601654053,
"learning_rate": 8.76865671641791e-05,
"loss": 0.2642,
"step": 330
},
{
"epoch": 1.27,
"grad_norm": 8.736045837402344,
"learning_rate": 8.731343283582089e-05,
"loss": 0.3041,
"step": 340
},
{
"epoch": 1.31,
"grad_norm": 6.683359146118164,
"learning_rate": 8.694029850746269e-05,
"loss": 0.1352,
"step": 350
},
{
"epoch": 1.34,
"grad_norm": 4.780521392822266,
"learning_rate": 8.656716417910447e-05,
"loss": 0.4005,
"step": 360
},
{
"epoch": 1.38,
"grad_norm": 9.654714584350586,
"learning_rate": 8.619402985074627e-05,
"loss": 0.3646,
"step": 370
},
{
"epoch": 1.42,
"grad_norm": 4.174666881561279,
"learning_rate": 8.582089552238807e-05,
"loss": 0.2353,
"step": 380
},
{
"epoch": 1.46,
"grad_norm": 7.596667289733887,
"learning_rate": 8.548507462686568e-05,
"loss": 0.3991,
"step": 390
},
{
"epoch": 1.49,
"grad_norm": 5.592709064483643,
"learning_rate": 8.511194029850747e-05,
"loss": 0.307,
"step": 400
},
{
"epoch": 1.49,
"eval_accuracy": 0.789103690685413,
"eval_loss": 0.8137023448944092,
"eval_runtime": 8.3292,
"eval_samples_per_second": 68.314,
"eval_steps_per_second": 8.644,
"step": 400
},
{
"epoch": 1.53,
"grad_norm": 2.232590675354004,
"learning_rate": 8.473880597014926e-05,
"loss": 0.4669,
"step": 410
},
{
"epoch": 1.57,
"grad_norm": 4.276609897613525,
"learning_rate": 8.436567164179105e-05,
"loss": 0.3831,
"step": 420
},
{
"epoch": 1.6,
"grad_norm": 7.262507915496826,
"learning_rate": 8.399253731343283e-05,
"loss": 0.3472,
"step": 430
},
{
"epoch": 1.64,
"grad_norm": 7.258556365966797,
"learning_rate": 8.361940298507463e-05,
"loss": 0.2396,
"step": 440
},
{
"epoch": 1.68,
"grad_norm": 4.945961952209473,
"learning_rate": 8.324626865671642e-05,
"loss": 0.2433,
"step": 450
},
{
"epoch": 1.72,
"grad_norm": 5.138702392578125,
"learning_rate": 8.287313432835821e-05,
"loss": 0.2947,
"step": 460
},
{
"epoch": 1.75,
"grad_norm": 1.1640909910202026,
"learning_rate": 8.25e-05,
"loss": 0.4791,
"step": 470
},
{
"epoch": 1.79,
"grad_norm": 4.626485824584961,
"learning_rate": 8.21268656716418e-05,
"loss": 0.286,
"step": 480
},
{
"epoch": 1.83,
"grad_norm": 5.178492069244385,
"learning_rate": 8.17537313432836e-05,
"loss": 0.3202,
"step": 490
},
{
"epoch": 1.87,
"grad_norm": 7.854339122772217,
"learning_rate": 8.138059701492538e-05,
"loss": 0.3542,
"step": 500
},
{
"epoch": 1.87,
"eval_accuracy": 0.8014059753954306,
"eval_loss": 0.6611581444740295,
"eval_runtime": 8.5853,
"eval_samples_per_second": 66.276,
"eval_steps_per_second": 8.386,
"step": 500
},
{
"epoch": 1.9,
"grad_norm": 1.429740071296692,
"learning_rate": 8.100746268656717e-05,
"loss": 0.3039,
"step": 510
},
{
"epoch": 1.94,
"grad_norm": 2.9776551723480225,
"learning_rate": 8.063432835820895e-05,
"loss": 0.3825,
"step": 520
},
{
"epoch": 1.98,
"grad_norm": 10.557899475097656,
"learning_rate": 8.026119402985075e-05,
"loss": 0.5109,
"step": 530
},
{
"epoch": 2.01,
"grad_norm": 1.448002815246582,
"learning_rate": 7.988805970149255e-05,
"loss": 0.3421,
"step": 540
},
{
"epoch": 2.05,
"grad_norm": 4.500860691070557,
"learning_rate": 7.951492537313433e-05,
"loss": 0.3008,
"step": 550
},
{
"epoch": 2.09,
"grad_norm": 8.077374458312988,
"learning_rate": 7.914179104477613e-05,
"loss": 0.27,
"step": 560
},
{
"epoch": 2.13,
"grad_norm": 0.16809479892253876,
"learning_rate": 7.876865671641792e-05,
"loss": 0.2184,
"step": 570
},
{
"epoch": 2.16,
"grad_norm": 4.892763137817383,
"learning_rate": 7.83955223880597e-05,
"loss": 0.1479,
"step": 580
},
{
"epoch": 2.2,
"grad_norm": 8.35221004486084,
"learning_rate": 7.80223880597015e-05,
"loss": 0.3498,
"step": 590
},
{
"epoch": 2.24,
"grad_norm": 12.043429374694824,
"learning_rate": 7.764925373134328e-05,
"loss": 0.3518,
"step": 600
},
{
"epoch": 2.24,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.6964564919471741,
"eval_runtime": 8.3878,
"eval_samples_per_second": 67.837,
"eval_steps_per_second": 8.584,
"step": 600
},
{
"epoch": 2.28,
"grad_norm": 3.7737715244293213,
"learning_rate": 7.727611940298508e-05,
"loss": 0.3532,
"step": 610
},
{
"epoch": 2.31,
"grad_norm": 4.282881736755371,
"learning_rate": 7.690298507462687e-05,
"loss": 0.2214,
"step": 620
},
{
"epoch": 2.35,
"grad_norm": 6.733531475067139,
"learning_rate": 7.652985074626866e-05,
"loss": 0.2709,
"step": 630
},
{
"epoch": 2.39,
"grad_norm": 2.567267417907715,
"learning_rate": 7.615671641791045e-05,
"loss": 0.3725,
"step": 640
},
{
"epoch": 2.43,
"grad_norm": 3.120966911315918,
"learning_rate": 7.578358208955223e-05,
"loss": 0.3036,
"step": 650
},
{
"epoch": 2.46,
"grad_norm": 6.505622386932373,
"learning_rate": 7.541044776119403e-05,
"loss": 0.2426,
"step": 660
},
{
"epoch": 2.5,
"grad_norm": 4.887637615203857,
"learning_rate": 7.503731343283582e-05,
"loss": 0.281,
"step": 670
},
{
"epoch": 2.54,
"grad_norm": 9.790969848632812,
"learning_rate": 7.466417910447762e-05,
"loss": 0.4504,
"step": 680
},
{
"epoch": 2.57,
"grad_norm": 4.354789733886719,
"learning_rate": 7.429104477611941e-05,
"loss": 0.4094,
"step": 690
},
{
"epoch": 2.61,
"grad_norm": 5.015912055969238,
"learning_rate": 7.39179104477612e-05,
"loss": 0.3706,
"step": 700
},
{
"epoch": 2.61,
"eval_accuracy": 0.804920913884007,
"eval_loss": 0.7254143357276917,
"eval_runtime": 8.3242,
"eval_samples_per_second": 68.355,
"eval_steps_per_second": 8.649,
"step": 700
},
{
"epoch": 2.65,
"grad_norm": 5.382541656494141,
"learning_rate": 7.3544776119403e-05,
"loss": 0.1722,
"step": 710
},
{
"epoch": 2.69,
"grad_norm": 5.573971748352051,
"learning_rate": 7.317164179104478e-05,
"loss": 0.327,
"step": 720
},
{
"epoch": 2.72,
"grad_norm": 3.5606117248535156,
"learning_rate": 7.279850746268657e-05,
"loss": 0.2702,
"step": 730
},
{
"epoch": 2.76,
"grad_norm": 1.7398028373718262,
"learning_rate": 7.242537313432837e-05,
"loss": 0.238,
"step": 740
},
{
"epoch": 2.8,
"grad_norm": 2.7511751651763916,
"learning_rate": 7.205223880597015e-05,
"loss": 0.1848,
"step": 750
},
{
"epoch": 2.84,
"grad_norm": 3.381510019302368,
"learning_rate": 7.167910447761195e-05,
"loss": 0.2261,
"step": 760
},
{
"epoch": 2.87,
"grad_norm": 4.65634298324585,
"learning_rate": 7.130597014925373e-05,
"loss": 0.237,
"step": 770
},
{
"epoch": 2.91,
"grad_norm": 10.35020923614502,
"learning_rate": 7.093283582089553e-05,
"loss": 0.3012,
"step": 780
},
{
"epoch": 2.95,
"grad_norm": 8.878485679626465,
"learning_rate": 7.055970149253732e-05,
"loss": 0.4094,
"step": 790
},
{
"epoch": 2.99,
"grad_norm": 2.9728074073791504,
"learning_rate": 7.01865671641791e-05,
"loss": 0.4084,
"step": 800
},
{
"epoch": 2.99,
"eval_accuracy": 0.8101933216168717,
"eval_loss": 0.6746156811714172,
"eval_runtime": 8.2718,
"eval_samples_per_second": 68.788,
"eval_steps_per_second": 8.704,
"step": 800
},
{
"epoch": 3.02,
"grad_norm": 4.835368633270264,
"learning_rate": 6.98134328358209e-05,
"loss": 0.3152,
"step": 810
},
{
"epoch": 3.06,
"grad_norm": 2.9197049140930176,
"learning_rate": 6.944029850746268e-05,
"loss": 0.3433,
"step": 820
},
{
"epoch": 3.1,
"grad_norm": 5.646128177642822,
"learning_rate": 6.906716417910448e-05,
"loss": 0.2604,
"step": 830
},
{
"epoch": 3.13,
"grad_norm": 3.860607862472534,
"learning_rate": 6.869402985074627e-05,
"loss": 0.2831,
"step": 840
},
{
"epoch": 3.17,
"grad_norm": 0.1358175426721573,
"learning_rate": 6.832089552238807e-05,
"loss": 0.242,
"step": 850
},
{
"epoch": 3.21,
"grad_norm": 1.1011104583740234,
"learning_rate": 6.794776119402985e-05,
"loss": 0.2621,
"step": 860
},
{
"epoch": 3.25,
"grad_norm": 7.837879180908203,
"learning_rate": 6.757462686567164e-05,
"loss": 0.249,
"step": 870
},
{
"epoch": 3.28,
"grad_norm": 6.8647613525390625,
"learning_rate": 6.720149253731343e-05,
"loss": 0.3398,
"step": 880
},
{
"epoch": 3.32,
"grad_norm": 2.8186678886413574,
"learning_rate": 6.682835820895522e-05,
"loss": 0.3092,
"step": 890
},
{
"epoch": 3.36,
"grad_norm": 4.623282432556152,
"learning_rate": 6.645522388059702e-05,
"loss": 0.2533,
"step": 900
},
{
"epoch": 3.36,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.6866591572761536,
"eval_runtime": 8.3143,
"eval_samples_per_second": 68.436,
"eval_steps_per_second": 8.66,
"step": 900
},
{
"epoch": 3.4,
"grad_norm": 4.85120964050293,
"learning_rate": 6.608208955223882e-05,
"loss": 0.2279,
"step": 910
},
{
"epoch": 3.43,
"grad_norm": 0.7263774275779724,
"learning_rate": 6.57089552238806e-05,
"loss": 0.1725,
"step": 920
},
{
"epoch": 3.47,
"grad_norm": 6.813180923461914,
"learning_rate": 6.53358208955224e-05,
"loss": 0.3304,
"step": 930
},
{
"epoch": 3.51,
"grad_norm": 8.58501148223877,
"learning_rate": 6.496268656716418e-05,
"loss": 0.1864,
"step": 940
},
{
"epoch": 3.54,
"grad_norm": 2.814436435699463,
"learning_rate": 6.458955223880597e-05,
"loss": 0.1496,
"step": 950
},
{
"epoch": 3.58,
"grad_norm": 8.36603832244873,
"learning_rate": 6.421641791044777e-05,
"loss": 0.208,
"step": 960
},
{
"epoch": 3.62,
"grad_norm": 3.5715956687927246,
"learning_rate": 6.384328358208955e-05,
"loss": 0.2429,
"step": 970
},
{
"epoch": 3.66,
"grad_norm": 4.983556270599365,
"learning_rate": 6.347014925373135e-05,
"loss": 0.4053,
"step": 980
},
{
"epoch": 3.69,
"grad_norm": 4.936723232269287,
"learning_rate": 6.309701492537313e-05,
"loss": 0.1545,
"step": 990
},
{
"epoch": 3.73,
"grad_norm": 6.59185791015625,
"learning_rate": 6.272388059701493e-05,
"loss": 0.3147,
"step": 1000
},
{
"epoch": 3.73,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.7077136635780334,
"eval_runtime": 8.3117,
"eval_samples_per_second": 68.457,
"eval_steps_per_second": 8.662,
"step": 1000
},
{
"epoch": 3.77,
"grad_norm": 9.348366737365723,
"learning_rate": 6.235074626865672e-05,
"loss": 0.3634,
"step": 1010
},
{
"epoch": 3.81,
"grad_norm": 9.918521881103516,
"learning_rate": 6.19776119402985e-05,
"loss": 0.3151,
"step": 1020
},
{
"epoch": 3.84,
"grad_norm": 5.687044143676758,
"learning_rate": 6.16044776119403e-05,
"loss": 0.3088,
"step": 1030
},
{
"epoch": 3.88,
"grad_norm": 3.8347887992858887,
"learning_rate": 6.123134328358209e-05,
"loss": 0.2128,
"step": 1040
},
{
"epoch": 3.92,
"grad_norm": 5.380050182342529,
"learning_rate": 6.0858208955223884e-05,
"loss": 0.255,
"step": 1050
},
{
"epoch": 3.96,
"grad_norm": 8.848828315734863,
"learning_rate": 6.0485074626865676e-05,
"loss": 0.2794,
"step": 1060
},
{
"epoch": 3.99,
"grad_norm": 3.9666404724121094,
"learning_rate": 6.011194029850746e-05,
"loss": 0.1954,
"step": 1070
},
{
"epoch": 4.03,
"grad_norm": 0.3369455635547638,
"learning_rate": 5.973880597014926e-05,
"loss": 0.2298,
"step": 1080
},
{
"epoch": 4.07,
"grad_norm": 16.327823638916016,
"learning_rate": 5.9365671641791044e-05,
"loss": 0.2504,
"step": 1090
},
{
"epoch": 4.1,
"grad_norm": 7.070168495178223,
"learning_rate": 5.8992537313432835e-05,
"loss": 0.3182,
"step": 1100
},
{
"epoch": 4.1,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.6661401987075806,
"eval_runtime": 8.2263,
"eval_samples_per_second": 69.169,
"eval_steps_per_second": 8.752,
"step": 1100
},
{
"epoch": 4.14,
"grad_norm": 2.853975534439087,
"learning_rate": 5.8619402985074634e-05,
"loss": 0.201,
"step": 1110
},
{
"epoch": 4.18,
"grad_norm": 0.958690881729126,
"learning_rate": 5.824626865671642e-05,
"loss": 0.1833,
"step": 1120
},
{
"epoch": 4.22,
"grad_norm": 3.4794461727142334,
"learning_rate": 5.787313432835822e-05,
"loss": 0.2796,
"step": 1130
},
{
"epoch": 4.25,
"grad_norm": 4.793296813964844,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.2281,
"step": 1140
},
{
"epoch": 4.29,
"grad_norm": 6.200154781341553,
"learning_rate": 5.712686567164179e-05,
"loss": 0.2814,
"step": 1150
},
{
"epoch": 4.33,
"grad_norm": 5.616389274597168,
"learning_rate": 5.675373134328359e-05,
"loss": 0.1656,
"step": 1160
},
{
"epoch": 4.37,
"grad_norm": 9.382554054260254,
"learning_rate": 5.6380597014925376e-05,
"loss": 0.19,
"step": 1170
},
{
"epoch": 4.4,
"grad_norm": 3.526240587234497,
"learning_rate": 5.600746268656717e-05,
"loss": 0.2063,
"step": 1180
},
{
"epoch": 4.44,
"grad_norm": 3.494896650314331,
"learning_rate": 5.563432835820895e-05,
"loss": 0.1681,
"step": 1190
},
{
"epoch": 4.48,
"grad_norm": 5.764057636260986,
"learning_rate": 5.526119402985075e-05,
"loss": 0.2248,
"step": 1200
},
{
"epoch": 4.48,
"eval_accuracy": 0.8418277680140598,
"eval_loss": 0.6632041335105896,
"eval_runtime": 8.1661,
"eval_samples_per_second": 69.679,
"eval_steps_per_second": 8.817,
"step": 1200
},
{
"epoch": 4.51,
"grad_norm": 4.680635452270508,
"learning_rate": 5.488805970149254e-05,
"loss": 0.2179,
"step": 1210
},
{
"epoch": 4.55,
"grad_norm": 10.24306869506836,
"learning_rate": 5.451492537313433e-05,
"loss": 0.2187,
"step": 1220
},
{
"epoch": 4.59,
"grad_norm": 3.054690361022949,
"learning_rate": 5.4141791044776126e-05,
"loss": 0.1729,
"step": 1230
},
{
"epoch": 4.63,
"grad_norm": 4.907272815704346,
"learning_rate": 5.376865671641791e-05,
"loss": 0.2762,
"step": 1240
},
{
"epoch": 4.66,
"grad_norm": 4.774748802185059,
"learning_rate": 5.33955223880597e-05,
"loss": 0.1965,
"step": 1250
},
{
"epoch": 4.7,
"grad_norm": 5.757875919342041,
"learning_rate": 5.30223880597015e-05,
"loss": 0.1564,
"step": 1260
},
{
"epoch": 4.74,
"grad_norm": 0.3608088791370392,
"learning_rate": 5.2649253731343286e-05,
"loss": 0.0946,
"step": 1270
},
{
"epoch": 4.78,
"grad_norm": 3.6289939880371094,
"learning_rate": 5.227611940298508e-05,
"loss": 0.3364,
"step": 1280
},
{
"epoch": 4.81,
"grad_norm": 5.132009029388428,
"learning_rate": 5.190298507462686e-05,
"loss": 0.231,
"step": 1290
},
{
"epoch": 4.85,
"grad_norm": 1.0347099304199219,
"learning_rate": 5.152985074626866e-05,
"loss": 0.1617,
"step": 1300
},
{
"epoch": 4.85,
"eval_accuracy": 0.8172231985940246,
"eval_loss": 0.7277125716209412,
"eval_runtime": 8.4693,
"eval_samples_per_second": 67.184,
"eval_steps_per_second": 8.501,
"step": 1300
},
{
"epoch": 4.89,
"grad_norm": 2.5996298789978027,
"learning_rate": 5.115671641791045e-05,
"loss": 0.385,
"step": 1310
},
{
"epoch": 4.93,
"grad_norm": 3.724181890487671,
"learning_rate": 5.078358208955224e-05,
"loss": 0.1786,
"step": 1320
},
{
"epoch": 4.96,
"grad_norm": 2.150557518005371,
"learning_rate": 5.0410447761194035e-05,
"loss": 0.2122,
"step": 1330
},
{
"epoch": 5.0,
"grad_norm": 3.8813323974609375,
"learning_rate": 5.003731343283582e-05,
"loss": 0.2425,
"step": 1340
},
{
"epoch": 5.04,
"grad_norm": 0.896369457244873,
"learning_rate": 4.966417910447762e-05,
"loss": 0.2208,
"step": 1350
},
{
"epoch": 5.07,
"grad_norm": 9.002110481262207,
"learning_rate": 4.92910447761194e-05,
"loss": 0.1432,
"step": 1360
},
{
"epoch": 5.11,
"grad_norm": 9.619662284851074,
"learning_rate": 4.8917910447761195e-05,
"loss": 0.1347,
"step": 1370
},
{
"epoch": 5.15,
"grad_norm": 3.5148773193359375,
"learning_rate": 4.8544776119402986e-05,
"loss": 0.2837,
"step": 1380
},
{
"epoch": 5.19,
"grad_norm": 7.631669044494629,
"learning_rate": 4.817164179104478e-05,
"loss": 0.1887,
"step": 1390
},
{
"epoch": 5.22,
"grad_norm": 11.738872528076172,
"learning_rate": 4.779850746268657e-05,
"loss": 0.2578,
"step": 1400
},
{
"epoch": 5.22,
"eval_accuracy": 0.8189806678383128,
"eval_loss": 0.7114442586898804,
"eval_runtime": 8.2672,
"eval_samples_per_second": 68.826,
"eval_steps_per_second": 8.709,
"step": 1400
},
{
"epoch": 5.26,
"grad_norm": 6.67802095413208,
"learning_rate": 4.742537313432836e-05,
"loss": 0.2527,
"step": 1410
},
{
"epoch": 5.3,
"grad_norm": 4.491325378417969,
"learning_rate": 4.705223880597015e-05,
"loss": 0.2386,
"step": 1420
},
{
"epoch": 5.34,
"grad_norm": 1.1810379028320312,
"learning_rate": 4.667910447761194e-05,
"loss": 0.1693,
"step": 1430
},
{
"epoch": 5.37,
"grad_norm": 6.075868129730225,
"learning_rate": 4.6305970149253736e-05,
"loss": 0.167,
"step": 1440
},
{
"epoch": 5.41,
"grad_norm": 2.315635919570923,
"learning_rate": 4.593283582089553e-05,
"loss": 0.2243,
"step": 1450
},
{
"epoch": 5.45,
"grad_norm": 10.839255332946777,
"learning_rate": 4.555970149253732e-05,
"loss": 0.2414,
"step": 1460
},
{
"epoch": 5.49,
"grad_norm": 4.562304496765137,
"learning_rate": 4.5186567164179104e-05,
"loss": 0.264,
"step": 1470
},
{
"epoch": 5.52,
"grad_norm": 1.8821789026260376,
"learning_rate": 4.4813432835820895e-05,
"loss": 0.1407,
"step": 1480
},
{
"epoch": 5.56,
"grad_norm": 8.406396865844727,
"learning_rate": 4.4440298507462694e-05,
"loss": 0.1454,
"step": 1490
},
{
"epoch": 5.6,
"grad_norm": 0.2816010117530823,
"learning_rate": 4.406716417910448e-05,
"loss": 0.1864,
"step": 1500
},
{
"epoch": 5.6,
"eval_accuracy": 0.8172231985940246,
"eval_loss": 0.755394458770752,
"eval_runtime": 8.2598,
"eval_samples_per_second": 68.888,
"eval_steps_per_second": 8.717,
"step": 1500
},
{
"epoch": 5.63,
"grad_norm": 6.619854927062988,
"learning_rate": 4.369402985074627e-05,
"loss": 0.2806,
"step": 1510
},
{
"epoch": 5.67,
"grad_norm": 2.056018829345703,
"learning_rate": 4.332089552238806e-05,
"loss": 0.2583,
"step": 1520
},
{
"epoch": 5.71,
"grad_norm": 0.966521680355072,
"learning_rate": 4.294776119402985e-05,
"loss": 0.0997,
"step": 1530
},
{
"epoch": 5.75,
"grad_norm": 2.8261241912841797,
"learning_rate": 4.2574626865671645e-05,
"loss": 0.1604,
"step": 1540
},
{
"epoch": 5.78,
"grad_norm": 3.089912176132202,
"learning_rate": 4.2201492537313436e-05,
"loss": 0.2775,
"step": 1550
},
{
"epoch": 5.82,
"grad_norm": 7.935690879821777,
"learning_rate": 4.182835820895523e-05,
"loss": 0.2522,
"step": 1560
},
{
"epoch": 5.86,
"grad_norm": 0.7999266982078552,
"learning_rate": 4.145522388059702e-05,
"loss": 0.0752,
"step": 1570
},
{
"epoch": 5.9,
"grad_norm": 6.0712480545043945,
"learning_rate": 4.1082089552238804e-05,
"loss": 0.1933,
"step": 1580
},
{
"epoch": 5.93,
"grad_norm": 10.768308639526367,
"learning_rate": 4.07089552238806e-05,
"loss": 0.1664,
"step": 1590
},
{
"epoch": 5.97,
"grad_norm": 9.641716003417969,
"learning_rate": 4.0335820895522394e-05,
"loss": 0.3134,
"step": 1600
},
{
"epoch": 5.97,
"eval_accuracy": 0.8154657293497364,
"eval_loss": 0.7593356966972351,
"eval_runtime": 8.4455,
"eval_samples_per_second": 67.373,
"eval_steps_per_second": 8.525,
"step": 1600
},
{
"epoch": 6.01,
"grad_norm": 6.7538838386535645,
"learning_rate": 3.996268656716418e-05,
"loss": 0.1747,
"step": 1610
},
{
"epoch": 6.04,
"grad_norm": 6.237377166748047,
"learning_rate": 3.958955223880597e-05,
"loss": 0.2406,
"step": 1620
},
{
"epoch": 6.08,
"grad_norm": 7.950930118560791,
"learning_rate": 3.921641791044776e-05,
"loss": 0.1884,
"step": 1630
},
{
"epoch": 6.12,
"grad_norm": 4.41484260559082,
"learning_rate": 3.8843283582089554e-05,
"loss": 0.1445,
"step": 1640
},
{
"epoch": 6.16,
"grad_norm": 6.339887619018555,
"learning_rate": 3.8470149253731345e-05,
"loss": 0.2906,
"step": 1650
},
{
"epoch": 6.19,
"grad_norm": 7.597599983215332,
"learning_rate": 3.809701492537314e-05,
"loss": 0.1576,
"step": 1660
},
{
"epoch": 6.23,
"grad_norm": 2.379629373550415,
"learning_rate": 3.772388059701493e-05,
"loss": 0.2016,
"step": 1670
},
{
"epoch": 6.27,
"grad_norm": 2.7694478034973145,
"learning_rate": 3.735074626865671e-05,
"loss": 0.1188,
"step": 1680
},
{
"epoch": 6.31,
"grad_norm": 2.1837210655212402,
"learning_rate": 3.6977611940298505e-05,
"loss": 0.1908,
"step": 1690
},
{
"epoch": 6.34,
"grad_norm": 4.4665350914001465,
"learning_rate": 3.66044776119403e-05,
"loss": 0.24,
"step": 1700
},
{
"epoch": 6.34,
"eval_accuracy": 0.8260105448154658,
"eval_loss": 0.7510848641395569,
"eval_runtime": 8.2044,
"eval_samples_per_second": 69.353,
"eval_steps_per_second": 8.776,
"step": 1700
},
{
"epoch": 6.38,
"grad_norm": 4.6551995277404785,
"learning_rate": 3.6231343283582095e-05,
"loss": 0.1631,
"step": 1710
},
{
"epoch": 6.42,
"grad_norm": 1.098407506942749,
"learning_rate": 3.585820895522388e-05,
"loss": 0.0912,
"step": 1720
},
{
"epoch": 6.46,
"grad_norm": 0.37138649821281433,
"learning_rate": 3.548507462686567e-05,
"loss": 0.2621,
"step": 1730
},
{
"epoch": 6.49,
"grad_norm": 7.4571757316589355,
"learning_rate": 3.511194029850746e-05,
"loss": 0.268,
"step": 1740
},
{
"epoch": 6.53,
"grad_norm": 0.5180323123931885,
"learning_rate": 3.4738805970149254e-05,
"loss": 0.2135,
"step": 1750
},
{
"epoch": 6.57,
"grad_norm": 1.0866820812225342,
"learning_rate": 3.4365671641791046e-05,
"loss": 0.1489,
"step": 1760
},
{
"epoch": 6.6,
"grad_norm": 8.90451717376709,
"learning_rate": 3.399253731343284e-05,
"loss": 0.288,
"step": 1770
},
{
"epoch": 6.64,
"grad_norm": 1.1608803272247314,
"learning_rate": 3.361940298507463e-05,
"loss": 0.18,
"step": 1780
},
{
"epoch": 6.68,
"grad_norm": 2.9207170009613037,
"learning_rate": 3.3246268656716414e-05,
"loss": 0.2414,
"step": 1790
},
{
"epoch": 6.72,
"grad_norm": 0.2674783170223236,
"learning_rate": 3.287313432835821e-05,
"loss": 0.2359,
"step": 1800
},
{
"epoch": 6.72,
"eval_accuracy": 0.8137082601054482,
"eval_loss": 0.7501537203788757,
"eval_runtime": 8.1528,
"eval_samples_per_second": 69.792,
"eval_steps_per_second": 8.831,
"step": 1800
},
{
"epoch": 6.75,
"grad_norm": 8.241676330566406,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.1975,
"step": 1810
},
{
"epoch": 6.79,
"grad_norm": 2.0347325801849365,
"learning_rate": 3.2126865671641796e-05,
"loss": 0.218,
"step": 1820
},
{
"epoch": 6.83,
"grad_norm": 1.0338706970214844,
"learning_rate": 3.175373134328358e-05,
"loss": 0.1437,
"step": 1830
},
{
"epoch": 6.87,
"grad_norm": 0.34902578592300415,
"learning_rate": 3.138059701492537e-05,
"loss": 0.1883,
"step": 1840
},
{
"epoch": 6.9,
"grad_norm": 6.642534255981445,
"learning_rate": 3.100746268656717e-05,
"loss": 0.2513,
"step": 1850
},
{
"epoch": 6.94,
"grad_norm": 4.432920455932617,
"learning_rate": 3.0634328358208955e-05,
"loss": 0.1058,
"step": 1860
},
{
"epoch": 6.98,
"grad_norm": 4.381640434265137,
"learning_rate": 3.0261194029850747e-05,
"loss": 0.2114,
"step": 1870
},
{
"epoch": 7.01,
"grad_norm": 7.730411529541016,
"learning_rate": 2.9888059701492538e-05,
"loss": 0.2542,
"step": 1880
},
{
"epoch": 7.05,
"grad_norm": 7.122923851013184,
"learning_rate": 2.9514925373134326e-05,
"loss": 0.2594,
"step": 1890
},
{
"epoch": 7.09,
"grad_norm": 1.411278486251831,
"learning_rate": 2.9141791044776125e-05,
"loss": 0.2322,
"step": 1900
},
{
"epoch": 7.09,
"eval_accuracy": 0.8347978910369068,
"eval_loss": 0.6952534317970276,
"eval_runtime": 8.3769,
"eval_samples_per_second": 67.925,
"eval_steps_per_second": 8.595,
"step": 1900
},
{
"epoch": 7.13,
"grad_norm": 2.219285011291504,
"learning_rate": 2.8768656716417913e-05,
"loss": 0.1344,
"step": 1910
},
{
"epoch": 7.16,
"grad_norm": 6.302455902099609,
"learning_rate": 2.8395522388059705e-05,
"loss": 0.2098,
"step": 1920
},
{
"epoch": 7.2,
"grad_norm": 1.2837783098220825,
"learning_rate": 2.8022388059701493e-05,
"loss": 0.0906,
"step": 1930
},
{
"epoch": 7.24,
"grad_norm": 6.604355335235596,
"learning_rate": 2.7649253731343284e-05,
"loss": 0.2352,
"step": 1940
},
{
"epoch": 7.28,
"grad_norm": 9.916419982910156,
"learning_rate": 2.727611940298508e-05,
"loss": 0.1422,
"step": 1950
},
{
"epoch": 7.31,
"grad_norm": 2.7665014266967773,
"learning_rate": 2.6902985074626868e-05,
"loss": 0.1722,
"step": 1960
},
{
"epoch": 7.35,
"grad_norm": 0.24231348931789398,
"learning_rate": 2.652985074626866e-05,
"loss": 0.2935,
"step": 1970
},
{
"epoch": 7.39,
"grad_norm": 0.8025885224342346,
"learning_rate": 2.6156716417910447e-05,
"loss": 0.157,
"step": 1980
},
{
"epoch": 7.43,
"grad_norm": 1.6752264499664307,
"learning_rate": 2.578358208955224e-05,
"loss": 0.1256,
"step": 1990
},
{
"epoch": 7.46,
"grad_norm": 2.404883861541748,
"learning_rate": 2.5410447761194027e-05,
"loss": 0.1514,
"step": 2000
},
{
"epoch": 7.46,
"eval_accuracy": 0.8260105448154658,
"eval_loss": 0.7120960354804993,
"eval_runtime": 8.1425,
"eval_samples_per_second": 69.88,
"eval_steps_per_second": 8.842,
"step": 2000
},
{
"epoch": 7.5,
"grad_norm": 5.409728050231934,
"learning_rate": 2.5037313432835825e-05,
"loss": 0.222,
"step": 2010
},
{
"epoch": 7.54,
"grad_norm": 3.949014663696289,
"learning_rate": 2.4664179104477614e-05,
"loss": 0.245,
"step": 2020
},
{
"epoch": 7.57,
"grad_norm": 8.40086555480957,
"learning_rate": 2.4291044776119405e-05,
"loss": 0.1408,
"step": 2030
},
{
"epoch": 7.61,
"grad_norm": 7.694955348968506,
"learning_rate": 2.3917910447761197e-05,
"loss": 0.2072,
"step": 2040
},
{
"epoch": 7.65,
"grad_norm": 1.9109055995941162,
"learning_rate": 2.3544776119402985e-05,
"loss": 0.145,
"step": 2050
},
{
"epoch": 7.69,
"grad_norm": 12.803776741027832,
"learning_rate": 2.3171641791044777e-05,
"loss": 0.1274,
"step": 2060
},
{
"epoch": 7.72,
"grad_norm": 3.3325235843658447,
"learning_rate": 2.2798507462686568e-05,
"loss": 0.1564,
"step": 2070
},
{
"epoch": 7.76,
"grad_norm": 1.105327844619751,
"learning_rate": 2.242537313432836e-05,
"loss": 0.2008,
"step": 2080
},
{
"epoch": 7.8,
"grad_norm": 1.7592620849609375,
"learning_rate": 2.2052238805970148e-05,
"loss": 0.203,
"step": 2090
},
{
"epoch": 7.84,
"grad_norm": 0.13264060020446777,
"learning_rate": 2.1679104477611943e-05,
"loss": 0.2089,
"step": 2100
},
{
"epoch": 7.84,
"eval_accuracy": 0.827768014059754,
"eval_loss": 0.693087637424469,
"eval_runtime": 8.2375,
"eval_samples_per_second": 69.074,
"eval_steps_per_second": 8.741,
"step": 2100
},
{
"epoch": 7.87,
"grad_norm": 5.904381275177002,
"learning_rate": 2.130597014925373e-05,
"loss": 0.1754,
"step": 2110
},
{
"epoch": 7.91,
"grad_norm": 1.7469266653060913,
"learning_rate": 2.0932835820895526e-05,
"loss": 0.1322,
"step": 2120
},
{
"epoch": 7.95,
"grad_norm": 4.313326835632324,
"learning_rate": 2.0559701492537314e-05,
"loss": 0.1418,
"step": 2130
},
{
"epoch": 7.99,
"grad_norm": 0.14211903512477875,
"learning_rate": 2.0186567164179106e-05,
"loss": 0.1534,
"step": 2140
},
{
"epoch": 8.02,
"grad_norm": 5.527184009552002,
"learning_rate": 1.9813432835820897e-05,
"loss": 0.2122,
"step": 2150
},
{
"epoch": 8.06,
"grad_norm": 0.2312430739402771,
"learning_rate": 1.9440298507462686e-05,
"loss": 0.1617,
"step": 2160
},
{
"epoch": 8.1,
"grad_norm": 0.23949085175991058,
"learning_rate": 1.906716417910448e-05,
"loss": 0.1286,
"step": 2170
},
{
"epoch": 8.13,
"grad_norm": 0.1903185099363327,
"learning_rate": 1.869402985074627e-05,
"loss": 0.0846,
"step": 2180
},
{
"epoch": 8.17,
"grad_norm": 0.08518023788928986,
"learning_rate": 1.832089552238806e-05,
"loss": 0.0801,
"step": 2190
},
{
"epoch": 8.21,
"grad_norm": 4.424215793609619,
"learning_rate": 1.7947761194029852e-05,
"loss": 0.2245,
"step": 2200
},
{
"epoch": 8.21,
"eval_accuracy": 0.8330404217926186,
"eval_loss": 0.7087014317512512,
"eval_runtime": 8.1117,
"eval_samples_per_second": 70.145,
"eval_steps_per_second": 8.876,
"step": 2200
},
{
"epoch": 8.25,
"grad_norm": 7.247931480407715,
"learning_rate": 1.7574626865671644e-05,
"loss": 0.0722,
"step": 2210
},
{
"epoch": 8.28,
"grad_norm": 4.80264949798584,
"learning_rate": 1.7201492537313435e-05,
"loss": 0.0844,
"step": 2220
},
{
"epoch": 8.32,
"grad_norm": 8.001790046691895,
"learning_rate": 1.6828358208955223e-05,
"loss": 0.1077,
"step": 2230
},
{
"epoch": 8.36,
"grad_norm": 5.419641017913818,
"learning_rate": 1.6455223880597015e-05,
"loss": 0.1627,
"step": 2240
},
{
"epoch": 8.4,
"grad_norm": 0.031686268746852875,
"learning_rate": 1.6082089552238806e-05,
"loss": 0.0984,
"step": 2250
},
{
"epoch": 8.43,
"grad_norm": 6.095193862915039,
"learning_rate": 1.5708955223880598e-05,
"loss": 0.1756,
"step": 2260
},
{
"epoch": 8.47,
"grad_norm": 5.179446220397949,
"learning_rate": 1.5335820895522386e-05,
"loss": 0.1708,
"step": 2270
},
{
"epoch": 8.51,
"grad_norm": 4.06497049331665,
"learning_rate": 1.496268656716418e-05,
"loss": 0.1493,
"step": 2280
},
{
"epoch": 8.54,
"grad_norm": 1.4721342325210571,
"learning_rate": 1.458955223880597e-05,
"loss": 0.2587,
"step": 2290
},
{
"epoch": 8.58,
"grad_norm": 4.418783664703369,
"learning_rate": 1.4216417910447763e-05,
"loss": 0.1328,
"step": 2300
},
{
"epoch": 8.58,
"eval_accuracy": 0.8312829525483304,
"eval_loss": 0.700339674949646,
"eval_runtime": 8.481,
"eval_samples_per_second": 67.091,
"eval_steps_per_second": 8.49,
"step": 2300
},
{
"epoch": 8.62,
"grad_norm": 1.5734038352966309,
"learning_rate": 1.3843283582089553e-05,
"loss": 0.165,
"step": 2310
},
{
"epoch": 8.66,
"grad_norm": 2.624784231185913,
"learning_rate": 1.3470149253731342e-05,
"loss": 0.0837,
"step": 2320
},
{
"epoch": 8.69,
"grad_norm": 2.7039573192596436,
"learning_rate": 1.3097014925373136e-05,
"loss": 0.2098,
"step": 2330
},
{
"epoch": 8.73,
"grad_norm": 6.542816638946533,
"learning_rate": 1.2723880597014926e-05,
"loss": 0.129,
"step": 2340
},
{
"epoch": 8.77,
"grad_norm": 2.9511120319366455,
"learning_rate": 1.2350746268656717e-05,
"loss": 0.1762,
"step": 2350
},
{
"epoch": 8.81,
"grad_norm": 3.435502529144287,
"learning_rate": 1.1977611940298509e-05,
"loss": 0.1345,
"step": 2360
},
{
"epoch": 8.84,
"grad_norm": 2.1689364910125732,
"learning_rate": 1.1604477611940299e-05,
"loss": 0.1011,
"step": 2370
},
{
"epoch": 8.88,
"grad_norm": 2.3366479873657227,
"learning_rate": 1.123134328358209e-05,
"loss": 0.1733,
"step": 2380
},
{
"epoch": 8.92,
"grad_norm": 5.928171634674072,
"learning_rate": 1.085820895522388e-05,
"loss": 0.1089,
"step": 2390
},
{
"epoch": 8.96,
"grad_norm": 0.08636012673377991,
"learning_rate": 1.0485074626865672e-05,
"loss": 0.1304,
"step": 2400
},
{
"epoch": 8.96,
"eval_accuracy": 0.8224956063268892,
"eval_loss": 0.7306046485900879,
"eval_runtime": 8.4262,
"eval_samples_per_second": 67.528,
"eval_steps_per_second": 8.545,
"step": 2400
},
{
"epoch": 8.99,
"grad_norm": 0.14256200194358826,
"learning_rate": 1.0111940298507463e-05,
"loss": 0.1506,
"step": 2410
},
{
"epoch": 9.03,
"grad_norm": 0.4166848659515381,
"learning_rate": 9.738805970149255e-06,
"loss": 0.2058,
"step": 2420
},
{
"epoch": 9.07,
"grad_norm": 0.3997032344341278,
"learning_rate": 9.365671641791045e-06,
"loss": 0.0482,
"step": 2430
},
{
"epoch": 9.1,
"grad_norm": 9.076058387756348,
"learning_rate": 8.992537313432836e-06,
"loss": 0.2201,
"step": 2440
},
{
"epoch": 9.14,
"grad_norm": 4.368849277496338,
"learning_rate": 8.619402985074628e-06,
"loss": 0.1288,
"step": 2450
},
{
"epoch": 9.18,
"grad_norm": 4.311466693878174,
"learning_rate": 8.24626865671642e-06,
"loss": 0.3058,
"step": 2460
},
{
"epoch": 9.22,
"grad_norm": 0.2911408543586731,
"learning_rate": 7.87313432835821e-06,
"loss": 0.1303,
"step": 2470
},
{
"epoch": 9.25,
"grad_norm": 5.493233680725098,
"learning_rate": 7.5e-06,
"loss": 0.0915,
"step": 2480
},
{
"epoch": 9.29,
"grad_norm": 0.09431172162294388,
"learning_rate": 7.126865671641792e-06,
"loss": 0.0954,
"step": 2490
},
{
"epoch": 9.33,
"grad_norm": 1.8603869676589966,
"learning_rate": 6.7537313432835825e-06,
"loss": 0.1514,
"step": 2500
},
{
"epoch": 9.33,
"eval_accuracy": 0.8260105448154658,
"eval_loss": 0.7162156701087952,
"eval_runtime": 8.3201,
"eval_samples_per_second": 68.389,
"eval_steps_per_second": 8.654,
"step": 2500
},
{
"epoch": 9.37,
"grad_norm": 4.870584964752197,
"learning_rate": 6.380597014925374e-06,
"loss": 0.1354,
"step": 2510
},
{
"epoch": 9.4,
"grad_norm": 2.316840410232544,
"learning_rate": 6.007462686567165e-06,
"loss": 0.1348,
"step": 2520
},
{
"epoch": 9.44,
"grad_norm": 1.9005101919174194,
"learning_rate": 5.6343283582089556e-06,
"loss": 0.1755,
"step": 2530
},
{
"epoch": 9.48,
"grad_norm": 0.1674620360136032,
"learning_rate": 5.261194029850746e-06,
"loss": 0.0878,
"step": 2540
},
{
"epoch": 9.51,
"grad_norm": 5.729959011077881,
"learning_rate": 4.888059701492537e-06,
"loss": 0.1637,
"step": 2550
},
{
"epoch": 9.55,
"grad_norm": 0.02724504843354225,
"learning_rate": 4.514925373134329e-06,
"loss": 0.1603,
"step": 2560
},
{
"epoch": 9.59,
"grad_norm": 2.728663921356201,
"learning_rate": 4.141791044776119e-06,
"loss": 0.1152,
"step": 2570
},
{
"epoch": 9.63,
"grad_norm": 8.920695304870605,
"learning_rate": 3.7686567164179105e-06,
"loss": 0.1964,
"step": 2580
},
{
"epoch": 9.66,
"grad_norm": 2.3974239826202393,
"learning_rate": 3.3955223880597013e-06,
"loss": 0.0842,
"step": 2590
},
{
"epoch": 9.7,
"grad_norm": 1.6431355476379395,
"learning_rate": 3.022388059701493e-06,
"loss": 0.2571,
"step": 2600
},
{
"epoch": 9.7,
"eval_accuracy": 0.8347978910369068,
"eval_loss": 0.7012546062469482,
"eval_runtime": 8.3265,
"eval_samples_per_second": 68.336,
"eval_steps_per_second": 8.647,
"step": 2600
},
{
"epoch": 9.74,
"grad_norm": 0.10621854662895203,
"learning_rate": 2.6492537313432836e-06,
"loss": 0.2632,
"step": 2610
},
{
"epoch": 9.78,
"grad_norm": 4.150152206420898,
"learning_rate": 2.2761194029850747e-06,
"loss": 0.2804,
"step": 2620
},
{
"epoch": 9.81,
"grad_norm": 4.01139497756958,
"learning_rate": 1.9029850746268657e-06,
"loss": 0.1696,
"step": 2630
},
{
"epoch": 9.85,
"grad_norm": 4.7402262687683105,
"learning_rate": 1.5298507462686568e-06,
"loss": 0.1891,
"step": 2640
},
{
"epoch": 9.89,
"grad_norm": 4.460111141204834,
"learning_rate": 1.1567164179104478e-06,
"loss": 0.1178,
"step": 2650
},
{
"epoch": 9.93,
"grad_norm": 5.822507858276367,
"learning_rate": 7.835820895522387e-07,
"loss": 0.089,
"step": 2660
},
{
"epoch": 9.96,
"grad_norm": 2.4408085346221924,
"learning_rate": 4.1044776119402984e-07,
"loss": 0.158,
"step": 2670
},
{
"epoch": 10.0,
"grad_norm": 10.792135238647461,
"learning_rate": 3.7313432835820895e-08,
"loss": 0.2038,
"step": 2680
},
{
"epoch": 10.0,
"step": 2680,
"total_flos": 3.3230947683690086e+18,
"train_loss": 0.23535207314277762,
"train_runtime": 1371.8304,
"train_samples_per_second": 31.258,
"train_steps_per_second": 1.954
}
],
"logging_steps": 10,
"max_steps": 2680,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 3.3230947683690086e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}