{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999929740743343,
"eval_steps": 100,
"global_step": 1779,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 30.728321433017737,
"learning_rate": 2.9915682967959526e-07,
"loss": 0.7843,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 36.77551634614557,
"learning_rate": 2.9831365935919053e-07,
"loss": 0.7146,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 35.07871745362434,
"learning_rate": 2.9747048903878585e-07,
"loss": 0.6938,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 22.884080905805792,
"learning_rate": 2.9662731871838107e-07,
"loss": 0.6587,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 20.00517219363236,
"learning_rate": 2.957841483979764e-07,
"loss": 0.653,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 18.36397890422735,
"learning_rate": 2.9494097807757167e-07,
"loss": 0.6644,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 25.380353671575364,
"learning_rate": 2.9409780775716694e-07,
"loss": 0.6591,
"step": 70
},
{
"epoch": 0.04,
"grad_norm": 22.96117971805064,
"learning_rate": 2.932546374367622e-07,
"loss": 0.6649,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 18.570972155544734,
"learning_rate": 2.924114671163575e-07,
"loss": 0.6461,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 31.463777178706607,
"learning_rate": 2.915682967959528e-07,
"loss": 0.6402,
"step": 100
},
{
"epoch": 0.06,
"eval_accuracy": 0.71716621253406,
"eval_loss": 0.6716727614402771,
"eval_runtime": 81.6137,
"eval_samples_per_second": 22.484,
"eval_steps_per_second": 0.711,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 16.52215127003667,
"learning_rate": 2.90725126475548e-07,
"loss": 0.6428,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 19.497709198702577,
"learning_rate": 2.8988195615514335e-07,
"loss": 0.6215,
"step": 120
},
{
"epoch": 0.07,
"grad_norm": 18.350616329827186,
"learning_rate": 2.890387858347386e-07,
"loss": 0.6408,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 21.31149136639458,
"learning_rate": 2.881956155143339e-07,
"loss": 0.6435,
"step": 140
},
{
"epoch": 0.08,
"grad_norm": 18.652897776189004,
"learning_rate": 2.8735244519392916e-07,
"loss": 0.6429,
"step": 150
},
{
"epoch": 0.09,
"grad_norm": 15.905534553134705,
"learning_rate": 2.8650927487352443e-07,
"loss": 0.6406,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 16.811536357450365,
"learning_rate": 2.856661045531197e-07,
"loss": 0.6433,
"step": 170
},
{
"epoch": 0.1,
"grad_norm": 18.105305932357698,
"learning_rate": 2.84822934232715e-07,
"loss": 0.6305,
"step": 180
},
{
"epoch": 0.11,
"grad_norm": 17.99993276337699,
"learning_rate": 2.839797639123103e-07,
"loss": 0.6339,
"step": 190
},
{
"epoch": 0.11,
"grad_norm": 19.99711675781162,
"learning_rate": 2.8313659359190557e-07,
"loss": 0.6293,
"step": 200
},
{
"epoch": 0.11,
"eval_accuracy": 0.740599455040872,
"eval_loss": 0.6711810231208801,
"eval_runtime": 81.6564,
"eval_samples_per_second": 22.472,
"eval_steps_per_second": 0.71,
"step": 200
},
{
"epoch": 0.12,
"grad_norm": 16.424824162884995,
"learning_rate": 2.8229342327150084e-07,
"loss": 0.6053,
"step": 210
},
{
"epoch": 0.12,
"grad_norm": 27.512486316457252,
"learning_rate": 2.814502529510961e-07,
"loss": 0.6109,
"step": 220
},
{
"epoch": 0.13,
"grad_norm": 22.320535945754912,
"learning_rate": 2.806070826306914e-07,
"loss": 0.6358,
"step": 230
},
{
"epoch": 0.13,
"grad_norm": 13.83846516782303,
"learning_rate": 2.7976391231028666e-07,
"loss": 0.6261,
"step": 240
},
{
"epoch": 0.14,
"grad_norm": 16.27927043935017,
"learning_rate": 2.7892074198988193e-07,
"loss": 0.6395,
"step": 250
},
{
"epoch": 0.15,
"grad_norm": 21.72712116315618,
"learning_rate": 2.780775716694772e-07,
"loss": 0.6273,
"step": 260
},
{
"epoch": 0.15,
"grad_norm": 19.022167698615288,
"learning_rate": 2.7723440134907247e-07,
"loss": 0.6206,
"step": 270
},
{
"epoch": 0.16,
"grad_norm": 16.64363952350825,
"learning_rate": 2.763912310286678e-07,
"loss": 0.6304,
"step": 280
},
{
"epoch": 0.16,
"grad_norm": 20.294764983153055,
"learning_rate": 2.7554806070826307e-07,
"loss": 0.6225,
"step": 290
},
{
"epoch": 0.17,
"grad_norm": 11.467691935838706,
"learning_rate": 2.7470489038785834e-07,
"loss": 0.6297,
"step": 300
},
{
"epoch": 0.17,
"eval_accuracy": 0.7444141689373297,
"eval_loss": 0.658985435962677,
"eval_runtime": 81.6887,
"eval_samples_per_second": 22.463,
"eval_steps_per_second": 0.71,
"step": 300
},
{
"epoch": 0.17,
"grad_norm": 21.910401668214213,
"learning_rate": 2.738617200674536e-07,
"loss": 0.6182,
"step": 310
},
{
"epoch": 0.18,
"grad_norm": 24.007473629091503,
"learning_rate": 2.730185497470489e-07,
"loss": 0.6038,
"step": 320
},
{
"epoch": 0.19,
"grad_norm": 21.290646545312395,
"learning_rate": 2.7217537942664415e-07,
"loss": 0.6195,
"step": 330
},
{
"epoch": 0.19,
"grad_norm": 23.68987516909537,
"learning_rate": 2.713322091062394e-07,
"loss": 0.6308,
"step": 340
},
{
"epoch": 0.2,
"grad_norm": 30.80192565983983,
"learning_rate": 2.704890387858347e-07,
"loss": 0.6341,
"step": 350
},
{
"epoch": 0.2,
"grad_norm": 21.8133450286374,
"learning_rate": 2.6964586846543e-07,
"loss": 0.6106,
"step": 360
},
{
"epoch": 0.21,
"grad_norm": 20.299330239483883,
"learning_rate": 2.688026981450253e-07,
"loss": 0.64,
"step": 370
},
{
"epoch": 0.21,
"grad_norm": 20.648521192709566,
"learning_rate": 2.6795952782462056e-07,
"loss": 0.6169,
"step": 380
},
{
"epoch": 0.22,
"grad_norm": 30.447858291866996,
"learning_rate": 2.6711635750421584e-07,
"loss": 0.6416,
"step": 390
},
{
"epoch": 0.22,
"grad_norm": 15.933419590444712,
"learning_rate": 2.662731871838111e-07,
"loss": 0.6112,
"step": 400
},
{
"epoch": 0.22,
"eval_accuracy": 0.7667574931880109,
"eval_loss": 0.6638922095298767,
"eval_runtime": 81.6313,
"eval_samples_per_second": 22.479,
"eval_steps_per_second": 0.711,
"step": 400
},
{
"epoch": 0.23,
"grad_norm": 23.46830344261005,
"learning_rate": 2.654300168634064e-07,
"loss": 0.6277,
"step": 410
},
{
"epoch": 0.24,
"grad_norm": 27.935995699348798,
"learning_rate": 2.6458684654300165e-07,
"loss": 0.6221,
"step": 420
},
{
"epoch": 0.24,
"grad_norm": 17.973218257874134,
"learning_rate": 2.63743676222597e-07,
"loss": 0.6477,
"step": 430
},
{
"epoch": 0.25,
"grad_norm": 18.376434311461416,
"learning_rate": 2.629005059021922e-07,
"loss": 0.6309,
"step": 440
},
{
"epoch": 0.25,
"grad_norm": 18.526800694282237,
"learning_rate": 2.620573355817875e-07,
"loss": 0.6316,
"step": 450
},
{
"epoch": 0.26,
"grad_norm": 15.15241254088161,
"learning_rate": 2.612141652613828e-07,
"loss": 0.6201,
"step": 460
},
{
"epoch": 0.26,
"grad_norm": 14.410194781468793,
"learning_rate": 2.6037099494097806e-07,
"loss": 0.6218,
"step": 470
},
{
"epoch": 0.27,
"grad_norm": 40.740369918242074,
"learning_rate": 2.5952782462057333e-07,
"loss": 0.609,
"step": 480
},
{
"epoch": 0.28,
"grad_norm": 17.442343289782034,
"learning_rate": 2.586846543001686e-07,
"loss": 0.6031,
"step": 490
},
{
"epoch": 0.28,
"grad_norm": 14.525640800311784,
"learning_rate": 2.5784148397976393e-07,
"loss": 0.6065,
"step": 500
},
{
"epoch": 0.28,
"eval_accuracy": 0.7623978201634878,
"eval_loss": 0.6653993725776672,
"eval_runtime": 81.6511,
"eval_samples_per_second": 22.474,
"eval_steps_per_second": 0.71,
"step": 500
},
{
"epoch": 0.29,
"grad_norm": 16.37698400765353,
"learning_rate": 2.5699831365935915e-07,
"loss": 0.6044,
"step": 510
},
{
"epoch": 0.29,
"grad_norm": 18.46191451753819,
"learning_rate": 2.5615514333895447e-07,
"loss": 0.6026,
"step": 520
},
{
"epoch": 0.3,
"grad_norm": 16.60110193637955,
"learning_rate": 2.5531197301854974e-07,
"loss": 0.6117,
"step": 530
},
{
"epoch": 0.3,
"grad_norm": 12.90641142874894,
"learning_rate": 2.54468802698145e-07,
"loss": 0.6218,
"step": 540
},
{
"epoch": 0.31,
"grad_norm": 12.782297037944778,
"learning_rate": 2.536256323777403e-07,
"loss": 0.6094,
"step": 550
},
{
"epoch": 0.31,
"grad_norm": 13.132450106152808,
"learning_rate": 2.5278246205733556e-07,
"loss": 0.6065,
"step": 560
},
{
"epoch": 0.32,
"grad_norm": 15.505668610650245,
"learning_rate": 2.519392917369309e-07,
"loss": 0.6141,
"step": 570
},
{
"epoch": 0.33,
"grad_norm": 10.877242291946278,
"learning_rate": 2.510961214165261e-07,
"loss": 0.6132,
"step": 580
},
{
"epoch": 0.33,
"grad_norm": 16.260577656983788,
"learning_rate": 2.502529510961214e-07,
"loss": 0.618,
"step": 590
},
{
"epoch": 0.34,
"grad_norm": 17.74551723827062,
"learning_rate": 2.494097807757167e-07,
"loss": 0.6011,
"step": 600
},
{
"epoch": 0.34,
"eval_accuracy": 0.7656675749318801,
"eval_loss": 0.6725944876670837,
"eval_runtime": 81.6007,
"eval_samples_per_second": 22.488,
"eval_steps_per_second": 0.711,
"step": 600
},
{
"epoch": 0.34,
"grad_norm": 13.489006632275235,
"learning_rate": 2.4856661045531197e-07,
"loss": 0.5972,
"step": 610
},
{
"epoch": 0.35,
"grad_norm": 11.884271769605707,
"learning_rate": 2.4772344013490724e-07,
"loss": 0.62,
"step": 620
},
{
"epoch": 0.35,
"grad_norm": 13.261788153937319,
"learning_rate": 2.468802698145025e-07,
"loss": 0.6167,
"step": 630
},
{
"epoch": 0.36,
"grad_norm": 14.172098159046573,
"learning_rate": 2.4603709949409783e-07,
"loss": 0.6153,
"step": 640
},
{
"epoch": 0.37,
"grad_norm": 21.378776702080216,
"learning_rate": 2.4519392917369305e-07,
"loss": 0.619,
"step": 650
},
{
"epoch": 0.37,
"grad_norm": 12.09294701146547,
"learning_rate": 2.443507588532884e-07,
"loss": 0.6282,
"step": 660
},
{
"epoch": 0.38,
"grad_norm": 14.012823513290494,
"learning_rate": 2.4350758853288365e-07,
"loss": 0.6329,
"step": 670
},
{
"epoch": 0.38,
"grad_norm": 12.10834170031786,
"learning_rate": 2.426644182124789e-07,
"loss": 0.6302,
"step": 680
},
{
"epoch": 0.39,
"grad_norm": 18.577060775724657,
"learning_rate": 2.418212478920742e-07,
"loss": 0.6311,
"step": 690
},
{
"epoch": 0.39,
"grad_norm": 13.914957035063992,
"learning_rate": 2.4097807757166946e-07,
"loss": 0.6188,
"step": 700
},
{
"epoch": 0.39,
"eval_accuracy": 0.7716621253405994,
"eval_loss": 0.6559094190597534,
"eval_runtime": 81.6864,
"eval_samples_per_second": 22.464,
"eval_steps_per_second": 0.71,
"step": 700
},
{
"epoch": 0.4,
"grad_norm": 18.235947830015988,
"learning_rate": 2.4013490725126473e-07,
"loss": 0.6333,
"step": 710
},
{
"epoch": 0.4,
"grad_norm": 18.663534626617114,
"learning_rate": 2.3929173693086e-07,
"loss": 0.6277,
"step": 720
},
{
"epoch": 0.41,
"grad_norm": 15.048399094387358,
"learning_rate": 2.3844856661045533e-07,
"loss": 0.6209,
"step": 730
},
{
"epoch": 0.42,
"grad_norm": 14.114525067135927,
"learning_rate": 2.3760539629005057e-07,
"loss": 0.6192,
"step": 740
},
{
"epoch": 0.42,
"grad_norm": 19.517752622102307,
"learning_rate": 2.3676222596964587e-07,
"loss": 0.6016,
"step": 750
},
{
"epoch": 0.43,
"grad_norm": 11.012158965376576,
"learning_rate": 2.3591905564924112e-07,
"loss": 0.619,
"step": 760
},
{
"epoch": 0.43,
"grad_norm": 12.289099272322433,
"learning_rate": 2.3507588532883641e-07,
"loss": 0.625,
"step": 770
},
{
"epoch": 0.44,
"grad_norm": 11.949697745076817,
"learning_rate": 2.3423271500843169e-07,
"loss": 0.6147,
"step": 780
},
{
"epoch": 0.44,
"grad_norm": 20.873971042525024,
"learning_rate": 2.3338954468802698e-07,
"loss": 0.6059,
"step": 790
},
{
"epoch": 0.45,
"grad_norm": 23.749605034362084,
"learning_rate": 2.3254637436762223e-07,
"loss": 0.5964,
"step": 800
},
{
"epoch": 0.45,
"eval_accuracy": 0.7623978201634878,
"eval_loss": 0.6648799180984497,
"eval_runtime": 81.6698,
"eval_samples_per_second": 22.469,
"eval_steps_per_second": 0.71,
"step": 800
},
{
"epoch": 0.46,
"grad_norm": 17.378412987521198,
"learning_rate": 2.3170320404721753e-07,
"loss": 0.5999,
"step": 810
},
{
"epoch": 0.46,
"grad_norm": 14.09219874944554,
"learning_rate": 2.3086003372681282e-07,
"loss": 0.6123,
"step": 820
},
{
"epoch": 0.47,
"grad_norm": 15.751831833203504,
"learning_rate": 2.3001686340640807e-07,
"loss": 0.6173,
"step": 830
},
{
"epoch": 0.47,
"grad_norm": 10.58472297190868,
"learning_rate": 2.2917369308600337e-07,
"loss": 0.6107,
"step": 840
},
{
"epoch": 0.48,
"grad_norm": 16.766833417242065,
"learning_rate": 2.2833052276559864e-07,
"loss": 0.5977,
"step": 850
},
{
"epoch": 0.48,
"grad_norm": 10.191079428211454,
"learning_rate": 2.2748735244519394e-07,
"loss": 0.6249,
"step": 860
},
{
"epoch": 0.49,
"grad_norm": 10.892435339015812,
"learning_rate": 2.2664418212478918e-07,
"loss": 0.6071,
"step": 870
},
{
"epoch": 0.49,
"grad_norm": 14.857284393951572,
"learning_rate": 2.2580101180438448e-07,
"loss": 0.6277,
"step": 880
},
{
"epoch": 0.5,
"grad_norm": 16.280040840533015,
"learning_rate": 2.2495784148397975e-07,
"loss": 0.6117,
"step": 890
},
{
"epoch": 0.51,
"grad_norm": 15.402943403313595,
"learning_rate": 2.2411467116357502e-07,
"loss": 0.6263,
"step": 900
},
{
"epoch": 0.51,
"eval_accuracy": 0.7596730245231608,
"eval_loss": 0.6583617329597473,
"eval_runtime": 81.6999,
"eval_samples_per_second": 22.46,
"eval_steps_per_second": 0.71,
"step": 900
},
{
"epoch": 0.51,
"grad_norm": 16.324857723183708,
"learning_rate": 2.2327150084317032e-07,
"loss": 0.6189,
"step": 910
},
{
"epoch": 0.52,
"grad_norm": 10.98662853747513,
"learning_rate": 2.224283305227656e-07,
"loss": 0.6097,
"step": 920
},
{
"epoch": 0.52,
"grad_norm": 17.67105325082076,
"learning_rate": 2.215851602023609e-07,
"loss": 0.6122,
"step": 930
},
{
"epoch": 0.53,
"grad_norm": 20.41264564438468,
"learning_rate": 2.2074198988195613e-07,
"loss": 0.6133,
"step": 940
},
{
"epoch": 0.53,
"grad_norm": 17.79087613152158,
"learning_rate": 2.1989881956155143e-07,
"loss": 0.6211,
"step": 950
},
{
"epoch": 0.54,
"grad_norm": 14.606633495853552,
"learning_rate": 2.1905564924114668e-07,
"loss": 0.6127,
"step": 960
},
{
"epoch": 0.55,
"grad_norm": 15.658044929958105,
"learning_rate": 2.1821247892074197e-07,
"loss": 0.613,
"step": 970
},
{
"epoch": 0.55,
"grad_norm": 10.939022037806527,
"learning_rate": 2.1736930860033725e-07,
"loss": 0.611,
"step": 980
},
{
"epoch": 0.56,
"grad_norm": 17.074626515502477,
"learning_rate": 2.1652613827993254e-07,
"loss": 0.6235,
"step": 990
},
{
"epoch": 0.56,
"grad_norm": 9.873729304084089,
"learning_rate": 2.1568296795952782e-07,
"loss": 0.6173,
"step": 1000
},
{
"epoch": 0.56,
"eval_accuracy": 0.7787465940054495,
"eval_loss": 0.6586682796478271,
"eval_runtime": 81.6576,
"eval_samples_per_second": 22.472,
"eval_steps_per_second": 0.71,
"step": 1000
},
{
"epoch": 0.57,
"grad_norm": 17.977983053245335,
"learning_rate": 2.148397976391231e-07,
"loss": 0.6085,
"step": 1010
},
{
"epoch": 0.57,
"grad_norm": 22.720038161057953,
"learning_rate": 2.1399662731871838e-07,
"loss": 0.6077,
"step": 1020
},
{
"epoch": 0.58,
"grad_norm": 14.934140128371691,
"learning_rate": 2.1315345699831363e-07,
"loss": 0.5936,
"step": 1030
},
{
"epoch": 0.58,
"grad_norm": 17.025807021499876,
"learning_rate": 2.1231028667790893e-07,
"loss": 0.6077,
"step": 1040
},
{
"epoch": 0.59,
"grad_norm": 13.012674899332776,
"learning_rate": 2.114671163575042e-07,
"loss": 0.6143,
"step": 1050
},
{
"epoch": 0.6,
"grad_norm": 12.997561334592964,
"learning_rate": 2.106239460370995e-07,
"loss": 0.6079,
"step": 1060
},
{
"epoch": 0.6,
"grad_norm": 12.442902629191648,
"learning_rate": 2.0978077571669474e-07,
"loss": 0.6027,
"step": 1070
},
{
"epoch": 0.61,
"grad_norm": 16.45898926071221,
"learning_rate": 2.0893760539629004e-07,
"loss": 0.6027,
"step": 1080
},
{
"epoch": 0.61,
"grad_norm": 21.037037529928906,
"learning_rate": 2.0809443507588534e-07,
"loss": 0.6048,
"step": 1090
},
{
"epoch": 0.62,
"grad_norm": 9.922931220178954,
"learning_rate": 2.0725126475548058e-07,
"loss": 0.6133,
"step": 1100
},
{
"epoch": 0.62,
"eval_accuracy": 0.7754768392370572,
"eval_loss": 0.6589598655700684,
"eval_runtime": 81.5819,
"eval_samples_per_second": 22.493,
"eval_steps_per_second": 0.711,
"step": 1100
},
{
"epoch": 0.62,
"grad_norm": 16.07724841249849,
"learning_rate": 2.0640809443507588e-07,
"loss": 0.6202,
"step": 1110
},
{
"epoch": 0.63,
"grad_norm": 12.964684643299604,
"learning_rate": 2.0556492411467115e-07,
"loss": 0.6114,
"step": 1120
},
{
"epoch": 0.64,
"grad_norm": 11.819826038425457,
"learning_rate": 2.0472175379426645e-07,
"loss": 0.6042,
"step": 1130
},
{
"epoch": 0.64,
"grad_norm": 10.985202176776713,
"learning_rate": 2.038785834738617e-07,
"loss": 0.5989,
"step": 1140
},
{
"epoch": 0.65,
"grad_norm": 15.914126285773381,
"learning_rate": 2.03035413153457e-07,
"loss": 0.6111,
"step": 1150
},
{
"epoch": 0.65,
"grad_norm": 20.198502391005317,
"learning_rate": 2.0219224283305226e-07,
"loss": 0.6193,
"step": 1160
},
{
"epoch": 0.66,
"grad_norm": 16.375353496861376,
"learning_rate": 2.0134907251264754e-07,
"loss": 0.6124,
"step": 1170
},
{
"epoch": 0.66,
"grad_norm": 14.424027706264562,
"learning_rate": 2.0050590219224283e-07,
"loss": 0.6095,
"step": 1180
},
{
"epoch": 0.67,
"grad_norm": 17.448118657270804,
"learning_rate": 1.996627318718381e-07,
"loss": 0.594,
"step": 1190
},
{
"epoch": 0.67,
"grad_norm": 15.835051950720354,
"learning_rate": 1.988195615514334e-07,
"loss": 0.5902,
"step": 1200
},
{
"epoch": 0.67,
"eval_accuracy": 0.7673024523160763,
"eval_loss": 0.6717323660850525,
"eval_runtime": 81.1264,
"eval_samples_per_second": 22.619,
"eval_steps_per_second": 0.715,
"step": 1200
},
{
"epoch": 0.68,
"grad_norm": 10.104802089341579,
"learning_rate": 1.9797639123102865e-07,
"loss": 0.6083,
"step": 1210
},
{
"epoch": 0.69,
"grad_norm": 16.88055370044505,
"learning_rate": 1.9713322091062395e-07,
"loss": 0.6208,
"step": 1220
},
{
"epoch": 0.69,
"grad_norm": 13.25377180665782,
"learning_rate": 1.962900505902192e-07,
"loss": 0.6195,
"step": 1230
},
{
"epoch": 0.7,
"grad_norm": 12.03529826522156,
"learning_rate": 1.954468802698145e-07,
"loss": 0.6013,
"step": 1240
},
{
"epoch": 0.7,
"grad_norm": 21.966805810887724,
"learning_rate": 1.9460370994940976e-07,
"loss": 0.5955,
"step": 1250
},
{
"epoch": 0.71,
"grad_norm": 25.150403183144306,
"learning_rate": 1.9376053962900506e-07,
"loss": 0.6123,
"step": 1260
},
{
"epoch": 0.71,
"grad_norm": 28.00330027046741,
"learning_rate": 1.9291736930860033e-07,
"loss": 0.6242,
"step": 1270
},
{
"epoch": 0.72,
"grad_norm": 10.71380749264463,
"learning_rate": 1.920741989881956e-07,
"loss": 0.6288,
"step": 1280
},
{
"epoch": 0.73,
"grad_norm": 14.346837709212238,
"learning_rate": 1.912310286677909e-07,
"loss": 0.6212,
"step": 1290
},
{
"epoch": 0.73,
"grad_norm": 11.295633948457253,
"learning_rate": 1.9038785834738614e-07,
"loss": 0.6027,
"step": 1300
},
{
"epoch": 0.73,
"eval_accuracy": 0.7787465940054495,
"eval_loss": 0.6629257202148438,
"eval_runtime": 81.1053,
"eval_samples_per_second": 22.625,
"eval_steps_per_second": 0.715,
"step": 1300
},
{
"epoch": 0.74,
"grad_norm": 11.81270801886205,
"learning_rate": 1.8954468802698144e-07,
"loss": 0.6127,
"step": 1310
},
{
"epoch": 0.74,
"grad_norm": 11.458987921580261,
"learning_rate": 1.887015177065767e-07,
"loss": 0.6027,
"step": 1320
},
{
"epoch": 0.75,
"grad_norm": 14.114816619174277,
"learning_rate": 1.87858347386172e-07,
"loss": 0.617,
"step": 1330
},
{
"epoch": 0.75,
"grad_norm": 9.445103600368194,
"learning_rate": 1.8701517706576726e-07,
"loss": 0.6016,
"step": 1340
},
{
"epoch": 0.76,
"grad_norm": 16.378432231814056,
"learning_rate": 1.8617200674536255e-07,
"loss": 0.607,
"step": 1350
},
{
"epoch": 0.76,
"grad_norm": 18.105605603170623,
"learning_rate": 1.8532883642495785e-07,
"loss": 0.6059,
"step": 1360
},
{
"epoch": 0.77,
"grad_norm": 25.98754135538099,
"learning_rate": 1.844856661045531e-07,
"loss": 0.6079,
"step": 1370
},
{
"epoch": 0.78,
"grad_norm": 13.387903869057888,
"learning_rate": 1.836424957841484e-07,
"loss": 0.6248,
"step": 1380
},
{
"epoch": 0.78,
"grad_norm": 15.072500301887933,
"learning_rate": 1.8279932546374367e-07,
"loss": 0.6059,
"step": 1390
},
{
"epoch": 0.79,
"grad_norm": 28.195360534555086,
"learning_rate": 1.8195615514333896e-07,
"loss": 0.6094,
"step": 1400
},
{
"epoch": 0.79,
"eval_accuracy": 0.7825613079019074,
"eval_loss": 0.6670619249343872,
"eval_runtime": 81.1088,
"eval_samples_per_second": 22.624,
"eval_steps_per_second": 0.715,
"step": 1400
},
{
"epoch": 0.79,
"grad_norm": 16.78097475585066,
"learning_rate": 1.811129848229342e-07,
"loss": 0.6026,
"step": 1410
},
{
"epoch": 0.8,
"grad_norm": 9.27565083160915,
"learning_rate": 1.802698145025295e-07,
"loss": 0.6061,
"step": 1420
},
{
"epoch": 0.8,
"grad_norm": 15.151917102254139,
"learning_rate": 1.7942664418212478e-07,
"loss": 0.6121,
"step": 1430
},
{
"epoch": 0.81,
"grad_norm": 15.628107133180718,
"learning_rate": 1.7858347386172005e-07,
"loss": 0.6136,
"step": 1440
},
{
"epoch": 0.82,
"grad_norm": 11.41747344754936,
"learning_rate": 1.7774030354131535e-07,
"loss": 0.6013,
"step": 1450
},
{
"epoch": 0.82,
"grad_norm": 17.94793616613114,
"learning_rate": 1.7689713322091062e-07,
"loss": 0.6094,
"step": 1460
},
{
"epoch": 0.83,
"grad_norm": 16.847404718639655,
"learning_rate": 1.7605396290050592e-07,
"loss": 0.6006,
"step": 1470
},
{
"epoch": 0.83,
"grad_norm": 19.943006958068334,
"learning_rate": 1.7521079258010116e-07,
"loss": 0.6203,
"step": 1480
},
{
"epoch": 0.84,
"grad_norm": 17.544121262898,
"learning_rate": 1.7436762225969646e-07,
"loss": 0.6118,
"step": 1490
},
{
"epoch": 0.84,
"grad_norm": 16.348862133230483,
"learning_rate": 1.735244519392917e-07,
"loss": 0.606,
"step": 1500
},
{
"epoch": 0.84,
"eval_accuracy": 0.7771117166212534,
"eval_loss": 0.6631556749343872,
"eval_runtime": 81.1021,
"eval_samples_per_second": 22.626,
"eval_steps_per_second": 0.715,
"step": 1500
},
{
"epoch": 0.85,
"grad_norm": 15.065882473200872,
"learning_rate": 1.72681281618887e-07,
"loss": 0.6105,
"step": 1510
},
{
"epoch": 0.85,
"grad_norm": 10.334423134734958,
"learning_rate": 1.718381112984823e-07,
"loss": 0.603,
"step": 1520
},
{
"epoch": 0.86,
"grad_norm": 13.630372232978868,
"learning_rate": 1.7099494097807757e-07,
"loss": 0.6177,
"step": 1530
},
{
"epoch": 0.87,
"grad_norm": 21.60021911475766,
"learning_rate": 1.7015177065767284e-07,
"loss": 0.6138,
"step": 1540
},
{
"epoch": 0.87,
"grad_norm": 27.660710692587415,
"learning_rate": 1.6930860033726811e-07,
"loss": 0.6058,
"step": 1550
},
{
"epoch": 0.88,
"grad_norm": 15.6224127860944,
"learning_rate": 1.684654300168634e-07,
"loss": 0.6275,
"step": 1560
},
{
"epoch": 0.88,
"grad_norm": 9.768689869942213,
"learning_rate": 1.6762225969645866e-07,
"loss": 0.6181,
"step": 1570
},
{
"epoch": 0.89,
"grad_norm": 29.86529261497021,
"learning_rate": 1.6677908937605395e-07,
"loss": 0.6175,
"step": 1580
},
{
"epoch": 0.89,
"grad_norm": 12.971455009975848,
"learning_rate": 1.6593591905564923e-07,
"loss": 0.6177,
"step": 1590
},
{
"epoch": 0.9,
"grad_norm": 15.387843128502691,
"learning_rate": 1.6509274873524452e-07,
"loss": 0.6119,
"step": 1600
},
{
"epoch": 0.9,
"eval_accuracy": 0.7754768392370572,
"eval_loss": 0.6606820225715637,
"eval_runtime": 81.096,
"eval_samples_per_second": 22.627,
"eval_steps_per_second": 0.715,
"step": 1600
},
{
"epoch": 0.9,
"grad_norm": 14.683073977357795,
"learning_rate": 1.642495784148398e-07,
"loss": 0.6132,
"step": 1610
},
{
"epoch": 0.91,
"grad_norm": 20.101674008953353,
"learning_rate": 1.6340640809443507e-07,
"loss": 0.6129,
"step": 1620
},
{
"epoch": 0.92,
"grad_norm": 9.565771214700746,
"learning_rate": 1.6256323777403036e-07,
"loss": 0.6032,
"step": 1630
},
{
"epoch": 0.92,
"grad_norm": 19.63652279943903,
"learning_rate": 1.617200674536256e-07,
"loss": 0.6044,
"step": 1640
},
{
"epoch": 0.93,
"grad_norm": 11.798398386935885,
"learning_rate": 1.608768971332209e-07,
"loss": 0.6104,
"step": 1650
},
{
"epoch": 0.93,
"grad_norm": 8.45308349951647,
"learning_rate": 1.6003372681281618e-07,
"loss": 0.5946,
"step": 1660
},
{
"epoch": 0.94,
"grad_norm": 6.748385507096111,
"learning_rate": 1.5919055649241148e-07,
"loss": 0.6124,
"step": 1670
},
{
"epoch": 0.94,
"grad_norm": 11.792848227458215,
"learning_rate": 1.5834738617200672e-07,
"loss": 0.6118,
"step": 1680
},
{
"epoch": 0.95,
"grad_norm": 15.049508549188333,
"learning_rate": 1.5750421585160202e-07,
"loss": 0.6085,
"step": 1690
},
{
"epoch": 0.96,
"grad_norm": 10.01303081102771,
"learning_rate": 1.5666104553119732e-07,
"loss": 0.5992,
"step": 1700
},
{
"epoch": 0.96,
"eval_accuracy": 0.7798365122615804,
"eval_loss": 0.6598241925239563,
"eval_runtime": 81.0778,
"eval_samples_per_second": 22.633,
"eval_steps_per_second": 0.715,
"step": 1700
},
{
"epoch": 0.96,
"grad_norm": 13.816105839597155,
"learning_rate": 1.5581787521079256e-07,
"loss": 0.614,
"step": 1710
},
{
"epoch": 0.97,
"grad_norm": 15.916990016500973,
"learning_rate": 1.5497470489038786e-07,
"loss": 0.6071,
"step": 1720
},
{
"epoch": 0.97,
"grad_norm": 14.639137907706433,
"learning_rate": 1.5413153456998313e-07,
"loss": 0.6069,
"step": 1730
},
{
"epoch": 0.98,
"grad_norm": 15.234658834038223,
"learning_rate": 1.5328836424957843e-07,
"loss": 0.5919,
"step": 1740
},
{
"epoch": 0.98,
"grad_norm": 13.325766641648226,
"learning_rate": 1.5244519392917367e-07,
"loss": 0.6179,
"step": 1750
},
{
"epoch": 0.99,
"grad_norm": 9.90933302447297,
"learning_rate": 1.5160202360876897e-07,
"loss": 0.6017,
"step": 1760
},
{
"epoch": 0.99,
"grad_norm": 18.624449453795865,
"learning_rate": 1.5075885328836422e-07,
"loss": 0.6055,
"step": 1770
}
],
"logging_steps": 10,
"max_steps": 3558,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}