enlm-roberta-130 / trainer_state.json
manirai91's picture
Training in progress, step 9320
d38cd35
raw
history blame contribute delete
No virus
126 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.346941507161024,
"global_step": 9320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 9.989377682403434e-06,
"loss": 1.5134,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 9.978755364806867e-06,
"loss": 1.518,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 9.968133047210302e-06,
"loss": 1.5192,
"step": 30
},
{
"epoch": 0.09,
"learning_rate": 9.957510729613735e-06,
"loss": 1.5191,
"step": 40
},
{
"epoch": 0.11,
"learning_rate": 9.946888412017168e-06,
"loss": 1.5168,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 9.936266094420601e-06,
"loss": 1.5148,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 9.925643776824036e-06,
"loss": 1.518,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 9.915021459227469e-06,
"loss": 1.5194,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 9.904399141630902e-06,
"loss": 1.5159,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 9.893776824034335e-06,
"loss": 1.5157,
"step": 100
},
{
"epoch": 0.24,
"learning_rate": 9.88315450643777e-06,
"loss": 1.5205,
"step": 110
},
{
"epoch": 0.26,
"learning_rate": 9.872532188841201e-06,
"loss": 1.5164,
"step": 120
},
{
"epoch": 0.28,
"learning_rate": 9.861909871244636e-06,
"loss": 1.5171,
"step": 130
},
{
"epoch": 0.3,
"learning_rate": 9.85128755364807e-06,
"loss": 1.5206,
"step": 140
},
{
"epoch": 0.32,
"learning_rate": 9.840665236051502e-06,
"loss": 1.516,
"step": 150
},
{
"epoch": 0.34,
"learning_rate": 9.830042918454937e-06,
"loss": 1.5183,
"step": 160
},
{
"epoch": 0.34,
"eval_loss": 1.4158966541290283,
"eval_runtime": 75.2493,
"eval_samples_per_second": 531.566,
"eval_steps_per_second": 8.306,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 9.81942060085837e-06,
"loss": 1.521,
"step": 170
},
{
"epoch": 0.39,
"learning_rate": 9.808798283261803e-06,
"loss": 1.5193,
"step": 180
},
{
"epoch": 0.41,
"learning_rate": 9.798175965665236e-06,
"loss": 1.52,
"step": 190
},
{
"epoch": 0.43,
"learning_rate": 9.787553648068671e-06,
"loss": 1.5226,
"step": 200
},
{
"epoch": 0.45,
"learning_rate": 9.776931330472104e-06,
"loss": 1.5168,
"step": 210
},
{
"epoch": 0.47,
"learning_rate": 9.766309012875537e-06,
"loss": 1.5187,
"step": 220
},
{
"epoch": 0.49,
"learning_rate": 9.75568669527897e-06,
"loss": 1.5169,
"step": 230
},
{
"epoch": 0.51,
"learning_rate": 9.745064377682405e-06,
"loss": 1.5178,
"step": 240
},
{
"epoch": 0.54,
"learning_rate": 9.734442060085837e-06,
"loss": 1.5172,
"step": 250
},
{
"epoch": 0.56,
"learning_rate": 9.723819742489271e-06,
"loss": 1.518,
"step": 260
},
{
"epoch": 0.58,
"learning_rate": 9.713197424892705e-06,
"loss": 1.5193,
"step": 270
},
{
"epoch": 0.6,
"learning_rate": 9.702575107296138e-06,
"loss": 1.5177,
"step": 280
},
{
"epoch": 0.62,
"learning_rate": 9.69195278969957e-06,
"loss": 1.5213,
"step": 290
},
{
"epoch": 0.64,
"learning_rate": 9.681330472103006e-06,
"loss": 1.5195,
"step": 300
},
{
"epoch": 0.66,
"learning_rate": 9.670708154506439e-06,
"loss": 1.5245,
"step": 310
},
{
"epoch": 0.69,
"learning_rate": 9.660085836909872e-06,
"loss": 1.5188,
"step": 320
},
{
"epoch": 0.69,
"eval_loss": 1.4158122539520264,
"eval_runtime": 75.1458,
"eval_samples_per_second": 532.298,
"eval_steps_per_second": 8.317,
"step": 320
},
{
"epoch": 0.71,
"learning_rate": 9.649463519313305e-06,
"loss": 1.5178,
"step": 330
},
{
"epoch": 0.73,
"learning_rate": 9.63884120171674e-06,
"loss": 1.5199,
"step": 340
},
{
"epoch": 0.75,
"learning_rate": 9.628218884120173e-06,
"loss": 1.5212,
"step": 350
},
{
"epoch": 0.77,
"learning_rate": 9.617596566523606e-06,
"loss": 1.5176,
"step": 360
},
{
"epoch": 0.79,
"learning_rate": 9.60697424892704e-06,
"loss": 1.5199,
"step": 370
},
{
"epoch": 0.81,
"learning_rate": 9.596351931330472e-06,
"loss": 1.5209,
"step": 380
},
{
"epoch": 0.84,
"learning_rate": 9.585729613733907e-06,
"loss": 1.5191,
"step": 390
},
{
"epoch": 0.86,
"learning_rate": 9.57510729613734e-06,
"loss": 1.5182,
"step": 400
},
{
"epoch": 0.88,
"learning_rate": 9.564484978540773e-06,
"loss": 1.5212,
"step": 410
},
{
"epoch": 0.9,
"learning_rate": 9.553862660944206e-06,
"loss": 1.5203,
"step": 420
},
{
"epoch": 0.92,
"learning_rate": 9.54324034334764e-06,
"loss": 1.52,
"step": 430
},
{
"epoch": 0.94,
"learning_rate": 9.532618025751074e-06,
"loss": 1.5187,
"step": 440
},
{
"epoch": 0.96,
"learning_rate": 9.521995708154507e-06,
"loss": 1.5214,
"step": 450
},
{
"epoch": 0.99,
"learning_rate": 9.51137339055794e-06,
"loss": 1.5199,
"step": 460
},
{
"epoch": 1.01,
"learning_rate": 9.500751072961375e-06,
"loss": 1.6614,
"step": 470
},
{
"epoch": 1.03,
"learning_rate": 9.490128755364808e-06,
"loss": 1.5205,
"step": 480
},
{
"epoch": 1.03,
"eval_loss": 1.4153233766555786,
"eval_runtime": 75.1691,
"eval_samples_per_second": 532.133,
"eval_steps_per_second": 8.315,
"step": 480
},
{
"epoch": 1.05,
"learning_rate": 9.479506437768241e-06,
"loss": 1.5163,
"step": 490
},
{
"epoch": 1.07,
"learning_rate": 9.468884120171674e-06,
"loss": 1.5208,
"step": 500
},
{
"epoch": 1.09,
"learning_rate": 9.458261802575107e-06,
"loss": 1.5188,
"step": 510
},
{
"epoch": 1.12,
"learning_rate": 9.447639484978542e-06,
"loss": 1.5187,
"step": 520
},
{
"epoch": 1.14,
"learning_rate": 9.437017167381975e-06,
"loss": 1.518,
"step": 530
},
{
"epoch": 1.16,
"learning_rate": 9.426394849785408e-06,
"loss": 1.5191,
"step": 540
},
{
"epoch": 1.18,
"learning_rate": 9.415772532188841e-06,
"loss": 1.5184,
"step": 550
},
{
"epoch": 1.2,
"learning_rate": 9.405150214592276e-06,
"loss": 1.518,
"step": 560
},
{
"epoch": 1.22,
"learning_rate": 9.39452789699571e-06,
"loss": 1.5166,
"step": 570
},
{
"epoch": 1.24,
"learning_rate": 9.383905579399142e-06,
"loss": 1.5191,
"step": 580
},
{
"epoch": 1.27,
"learning_rate": 9.373283261802575e-06,
"loss": 1.5176,
"step": 590
},
{
"epoch": 1.29,
"learning_rate": 9.36266094420601e-06,
"loss": 1.5216,
"step": 600
},
{
"epoch": 1.31,
"learning_rate": 9.352038626609442e-06,
"loss": 1.5194,
"step": 610
},
{
"epoch": 1.33,
"learning_rate": 9.341416309012876e-06,
"loss": 1.5153,
"step": 620
},
{
"epoch": 1.35,
"learning_rate": 9.33079399141631e-06,
"loss": 1.52,
"step": 630
},
{
"epoch": 1.37,
"learning_rate": 9.320171673819743e-06,
"loss": 1.5213,
"step": 640
},
{
"epoch": 1.37,
"eval_loss": 1.4161808490753174,
"eval_runtime": 75.313,
"eval_samples_per_second": 531.117,
"eval_steps_per_second": 8.299,
"step": 640
},
{
"epoch": 1.39,
"learning_rate": 9.309549356223177e-06,
"loss": 1.5178,
"step": 650
},
{
"epoch": 1.42,
"learning_rate": 9.29892703862661e-06,
"loss": 1.5195,
"step": 660
},
{
"epoch": 1.44,
"learning_rate": 9.288304721030044e-06,
"loss": 1.5157,
"step": 670
},
{
"epoch": 1.46,
"learning_rate": 9.277682403433477e-06,
"loss": 1.5179,
"step": 680
},
{
"epoch": 1.48,
"learning_rate": 9.267060085836911e-06,
"loss": 1.5172,
"step": 690
},
{
"epoch": 1.5,
"learning_rate": 9.256437768240345e-06,
"loss": 1.5173,
"step": 700
},
{
"epoch": 1.52,
"learning_rate": 9.245815450643778e-06,
"loss": 1.5156,
"step": 710
},
{
"epoch": 1.54,
"learning_rate": 9.23519313304721e-06,
"loss": 1.5144,
"step": 720
},
{
"epoch": 1.57,
"learning_rate": 9.224570815450646e-06,
"loss": 1.5199,
"step": 730
},
{
"epoch": 1.59,
"learning_rate": 9.213948497854079e-06,
"loss": 1.5204,
"step": 740
},
{
"epoch": 1.61,
"learning_rate": 9.203326180257512e-06,
"loss": 1.516,
"step": 750
},
{
"epoch": 1.63,
"learning_rate": 9.192703862660945e-06,
"loss": 1.5206,
"step": 760
},
{
"epoch": 1.65,
"learning_rate": 9.18208154506438e-06,
"loss": 1.5188,
"step": 770
},
{
"epoch": 1.67,
"learning_rate": 9.171459227467811e-06,
"loss": 1.5211,
"step": 780
},
{
"epoch": 1.69,
"learning_rate": 9.160836909871246e-06,
"loss": 1.5184,
"step": 790
},
{
"epoch": 1.72,
"learning_rate": 9.150214592274679e-06,
"loss": 1.5195,
"step": 800
},
{
"epoch": 1.72,
"eval_loss": 1.4168281555175781,
"eval_runtime": 75.9377,
"eval_samples_per_second": 526.747,
"eval_steps_per_second": 8.23,
"step": 800
},
{
"epoch": 1.74,
"learning_rate": 9.139592274678112e-06,
"loss": 1.5188,
"step": 810
},
{
"epoch": 1.76,
"learning_rate": 9.128969957081545e-06,
"loss": 1.5167,
"step": 820
},
{
"epoch": 1.78,
"learning_rate": 9.11834763948498e-06,
"loss": 1.5207,
"step": 830
},
{
"epoch": 1.8,
"learning_rate": 9.107725321888413e-06,
"loss": 1.52,
"step": 840
},
{
"epoch": 1.82,
"learning_rate": 9.097103004291846e-06,
"loss": 1.5186,
"step": 850
},
{
"epoch": 1.84,
"learning_rate": 9.08648068669528e-06,
"loss": 1.5206,
"step": 860
},
{
"epoch": 1.87,
"learning_rate": 9.075858369098712e-06,
"loss": 1.5193,
"step": 870
},
{
"epoch": 1.89,
"learning_rate": 9.065236051502147e-06,
"loss": 1.5194,
"step": 880
},
{
"epoch": 1.91,
"learning_rate": 9.05461373390558e-06,
"loss": 1.522,
"step": 890
},
{
"epoch": 1.93,
"learning_rate": 9.043991416309015e-06,
"loss": 1.5169,
"step": 900
},
{
"epoch": 1.95,
"learning_rate": 9.033369098712446e-06,
"loss": 1.5164,
"step": 910
},
{
"epoch": 1.97,
"learning_rate": 9.022746781115881e-06,
"loss": 1.5207,
"step": 920
},
{
"epoch": 1.99,
"learning_rate": 9.012124463519314e-06,
"loss": 1.5193,
"step": 930
},
{
"epoch": 2.02,
"learning_rate": 9.001502145922747e-06,
"loss": 1.6613,
"step": 940
},
{
"epoch": 2.04,
"learning_rate": 8.99087982832618e-06,
"loss": 1.518,
"step": 950
},
{
"epoch": 2.06,
"learning_rate": 8.980257510729615e-06,
"loss": 1.5194,
"step": 960
},
{
"epoch": 2.06,
"eval_loss": 1.4150291681289673,
"eval_runtime": 75.9845,
"eval_samples_per_second": 526.423,
"eval_steps_per_second": 8.225,
"step": 960
},
{
"epoch": 2.08,
"learning_rate": 8.969635193133048e-06,
"loss": 1.5201,
"step": 970
},
{
"epoch": 2.1,
"learning_rate": 8.959012875536481e-06,
"loss": 1.5199,
"step": 980
},
{
"epoch": 2.12,
"learning_rate": 8.948390557939914e-06,
"loss": 1.5185,
"step": 990
},
{
"epoch": 2.15,
"learning_rate": 8.93776824034335e-06,
"loss": 1.5191,
"step": 1000
},
{
"epoch": 2.17,
"learning_rate": 8.927145922746782e-06,
"loss": 1.5187,
"step": 1010
},
{
"epoch": 2.19,
"learning_rate": 8.916523605150215e-06,
"loss": 1.5172,
"step": 1020
},
{
"epoch": 2.21,
"learning_rate": 8.905901287553649e-06,
"loss": 1.5166,
"step": 1030
},
{
"epoch": 2.23,
"learning_rate": 8.895278969957082e-06,
"loss": 1.5216,
"step": 1040
},
{
"epoch": 2.25,
"learning_rate": 8.884656652360516e-06,
"loss": 1.5186,
"step": 1050
},
{
"epoch": 2.27,
"learning_rate": 8.87403433476395e-06,
"loss": 1.5182,
"step": 1060
},
{
"epoch": 2.3,
"learning_rate": 8.863412017167383e-06,
"loss": 1.5194,
"step": 1070
},
{
"epoch": 2.32,
"learning_rate": 8.852789699570816e-06,
"loss": 1.515,
"step": 1080
},
{
"epoch": 2.34,
"learning_rate": 8.84216738197425e-06,
"loss": 1.5183,
"step": 1090
},
{
"epoch": 2.36,
"learning_rate": 8.831545064377682e-06,
"loss": 1.5166,
"step": 1100
},
{
"epoch": 2.38,
"learning_rate": 8.820922746781117e-06,
"loss": 1.5186,
"step": 1110
},
{
"epoch": 2.4,
"learning_rate": 8.81030042918455e-06,
"loss": 1.5182,
"step": 1120
},
{
"epoch": 2.4,
"eval_loss": 1.4141547679901123,
"eval_runtime": 75.9015,
"eval_samples_per_second": 526.999,
"eval_steps_per_second": 8.234,
"step": 1120
},
{
"epoch": 2.42,
"learning_rate": 8.799678111587985e-06,
"loss": 1.5164,
"step": 1130
},
{
"epoch": 2.45,
"learning_rate": 8.789055793991418e-06,
"loss": 1.5165,
"step": 1140
},
{
"epoch": 2.47,
"learning_rate": 8.77843347639485e-06,
"loss": 1.5216,
"step": 1150
},
{
"epoch": 2.49,
"learning_rate": 8.767811158798284e-06,
"loss": 1.5161,
"step": 1160
},
{
"epoch": 2.51,
"learning_rate": 8.757188841201717e-06,
"loss": 1.5189,
"step": 1170
},
{
"epoch": 2.53,
"learning_rate": 8.746566523605152e-06,
"loss": 1.5192,
"step": 1180
},
{
"epoch": 2.55,
"learning_rate": 8.735944206008585e-06,
"loss": 1.515,
"step": 1190
},
{
"epoch": 2.57,
"learning_rate": 8.725321888412018e-06,
"loss": 1.5185,
"step": 1200
},
{
"epoch": 2.6,
"learning_rate": 8.714699570815451e-06,
"loss": 1.5156,
"step": 1210
},
{
"epoch": 2.62,
"learning_rate": 8.704077253218886e-06,
"loss": 1.5191,
"step": 1220
},
{
"epoch": 2.64,
"learning_rate": 8.693454935622319e-06,
"loss": 1.5164,
"step": 1230
},
{
"epoch": 2.66,
"learning_rate": 8.682832618025752e-06,
"loss": 1.5216,
"step": 1240
},
{
"epoch": 2.68,
"learning_rate": 8.672210300429185e-06,
"loss": 1.5187,
"step": 1250
},
{
"epoch": 2.7,
"learning_rate": 8.66158798283262e-06,
"loss": 1.5172,
"step": 1260
},
{
"epoch": 2.72,
"learning_rate": 8.650965665236051e-06,
"loss": 1.5202,
"step": 1270
},
{
"epoch": 2.75,
"learning_rate": 8.640343347639486e-06,
"loss": 1.5182,
"step": 1280
},
{
"epoch": 2.75,
"eval_loss": 1.4131401777267456,
"eval_runtime": 76.5921,
"eval_samples_per_second": 522.247,
"eval_steps_per_second": 8.16,
"step": 1280
},
{
"epoch": 2.77,
"learning_rate": 8.629721030042919e-06,
"loss": 1.5177,
"step": 1290
},
{
"epoch": 2.79,
"learning_rate": 8.619098712446352e-06,
"loss": 1.5204,
"step": 1300
},
{
"epoch": 2.81,
"learning_rate": 8.608476394849785e-06,
"loss": 1.5183,
"step": 1310
},
{
"epoch": 2.83,
"learning_rate": 8.59785407725322e-06,
"loss": 1.52,
"step": 1320
},
{
"epoch": 2.85,
"learning_rate": 8.587231759656653e-06,
"loss": 1.5214,
"step": 1330
},
{
"epoch": 2.87,
"learning_rate": 8.576609442060086e-06,
"loss": 1.5193,
"step": 1340
},
{
"epoch": 2.9,
"learning_rate": 8.565987124463521e-06,
"loss": 1.5205,
"step": 1350
},
{
"epoch": 2.92,
"learning_rate": 8.555364806866953e-06,
"loss": 1.5196,
"step": 1360
},
{
"epoch": 2.94,
"learning_rate": 8.544742489270387e-06,
"loss": 1.5187,
"step": 1370
},
{
"epoch": 2.96,
"learning_rate": 8.53412017167382e-06,
"loss": 1.5198,
"step": 1380
},
{
"epoch": 2.98,
"learning_rate": 8.523497854077255e-06,
"loss": 1.5212,
"step": 1390
},
{
"epoch": 3.0,
"learning_rate": 8.512875536480687e-06,
"loss": 1.6612,
"step": 1400
},
{
"epoch": 3.03,
"learning_rate": 8.502253218884121e-06,
"loss": 1.519,
"step": 1410
},
{
"epoch": 3.05,
"learning_rate": 8.491630901287554e-06,
"loss": 1.5206,
"step": 1420
},
{
"epoch": 3.07,
"learning_rate": 8.48100858369099e-06,
"loss": 1.5206,
"step": 1430
},
{
"epoch": 3.09,
"learning_rate": 8.47038626609442e-06,
"loss": 1.5177,
"step": 1440
},
{
"epoch": 3.09,
"eval_loss": 1.4166858196258545,
"eval_runtime": 77.5531,
"eval_samples_per_second": 515.776,
"eval_steps_per_second": 8.059,
"step": 1440
},
{
"epoch": 3.11,
"learning_rate": 8.459763948497855e-06,
"loss": 1.5197,
"step": 1450
},
{
"epoch": 3.13,
"learning_rate": 8.449141630901289e-06,
"loss": 1.5183,
"step": 1460
},
{
"epoch": 3.15,
"learning_rate": 8.438519313304722e-06,
"loss": 1.5188,
"step": 1470
},
{
"epoch": 3.18,
"learning_rate": 8.427896995708155e-06,
"loss": 1.5197,
"step": 1480
},
{
"epoch": 3.2,
"learning_rate": 8.41727467811159e-06,
"loss": 1.5174,
"step": 1490
},
{
"epoch": 3.22,
"learning_rate": 8.406652360515023e-06,
"loss": 1.5163,
"step": 1500
},
{
"epoch": 3.24,
"learning_rate": 8.396030042918456e-06,
"loss": 1.5189,
"step": 1510
},
{
"epoch": 3.26,
"learning_rate": 8.385407725321889e-06,
"loss": 1.521,
"step": 1520
},
{
"epoch": 3.28,
"learning_rate": 8.374785407725322e-06,
"loss": 1.5168,
"step": 1530
},
{
"epoch": 3.3,
"learning_rate": 8.364163090128757e-06,
"loss": 1.5162,
"step": 1540
},
{
"epoch": 3.33,
"learning_rate": 8.35354077253219e-06,
"loss": 1.5172,
"step": 1550
},
{
"epoch": 3.35,
"learning_rate": 8.342918454935623e-06,
"loss": 1.5211,
"step": 1560
},
{
"epoch": 3.37,
"learning_rate": 8.332296137339056e-06,
"loss": 1.5183,
"step": 1570
},
{
"epoch": 3.39,
"learning_rate": 8.32167381974249e-06,
"loss": 1.5188,
"step": 1580
},
{
"epoch": 3.41,
"learning_rate": 8.311051502145922e-06,
"loss": 1.5199,
"step": 1590
},
{
"epoch": 3.43,
"learning_rate": 8.300429184549357e-06,
"loss": 1.5201,
"step": 1600
},
{
"epoch": 3.43,
"eval_loss": 1.415603756904602,
"eval_runtime": 78.5248,
"eval_samples_per_second": 509.393,
"eval_steps_per_second": 7.959,
"step": 1600
},
{
"epoch": 3.45,
"learning_rate": 8.28980686695279e-06,
"loss": 1.5193,
"step": 1610
},
{
"epoch": 3.48,
"learning_rate": 8.279184549356225e-06,
"loss": 1.5183,
"step": 1620
},
{
"epoch": 3.5,
"learning_rate": 8.268562231759658e-06,
"loss": 1.515,
"step": 1630
},
{
"epoch": 3.52,
"learning_rate": 8.257939914163091e-06,
"loss": 1.5145,
"step": 1640
},
{
"epoch": 3.54,
"learning_rate": 8.247317596566524e-06,
"loss": 1.5166,
"step": 1650
},
{
"epoch": 3.56,
"learning_rate": 8.236695278969959e-06,
"loss": 1.5163,
"step": 1660
},
{
"epoch": 3.58,
"learning_rate": 8.226072961373392e-06,
"loss": 1.5206,
"step": 1670
},
{
"epoch": 3.6,
"learning_rate": 8.215450643776825e-06,
"loss": 1.5164,
"step": 1680
},
{
"epoch": 3.63,
"learning_rate": 8.204828326180258e-06,
"loss": 1.5185,
"step": 1690
},
{
"epoch": 3.65,
"learning_rate": 8.194206008583691e-06,
"loss": 1.5182,
"step": 1700
},
{
"epoch": 3.67,
"learning_rate": 8.183583690987126e-06,
"loss": 1.5219,
"step": 1710
},
{
"epoch": 3.69,
"learning_rate": 8.17296137339056e-06,
"loss": 1.519,
"step": 1720
},
{
"epoch": 3.71,
"learning_rate": 8.162339055793992e-06,
"loss": 1.5169,
"step": 1730
},
{
"epoch": 3.73,
"learning_rate": 8.151716738197425e-06,
"loss": 1.5203,
"step": 1740
},
{
"epoch": 3.75,
"learning_rate": 8.14109442060086e-06,
"loss": 1.5184,
"step": 1750
},
{
"epoch": 3.78,
"learning_rate": 8.130472103004292e-06,
"loss": 1.5173,
"step": 1760
},
{
"epoch": 3.78,
"eval_loss": 1.4110851287841797,
"eval_runtime": 79.1992,
"eval_samples_per_second": 505.055,
"eval_steps_per_second": 7.891,
"step": 1760
},
{
"epoch": 3.8,
"learning_rate": 8.119849785407726e-06,
"loss": 1.5198,
"step": 1770
},
{
"epoch": 3.82,
"learning_rate": 8.10922746781116e-06,
"loss": 1.5187,
"step": 1780
},
{
"epoch": 3.84,
"learning_rate": 8.098605150214593e-06,
"loss": 1.5205,
"step": 1790
},
{
"epoch": 3.86,
"learning_rate": 8.087982832618026e-06,
"loss": 1.5195,
"step": 1800
},
{
"epoch": 3.88,
"learning_rate": 8.07736051502146e-06,
"loss": 1.5187,
"step": 1810
},
{
"epoch": 3.9,
"learning_rate": 8.066738197424893e-06,
"loss": 1.5187,
"step": 1820
},
{
"epoch": 3.93,
"learning_rate": 8.056115879828327e-06,
"loss": 1.5196,
"step": 1830
},
{
"epoch": 3.95,
"learning_rate": 8.045493562231761e-06,
"loss": 1.5214,
"step": 1840
},
{
"epoch": 3.97,
"learning_rate": 8.034871244635194e-06,
"loss": 1.5195,
"step": 1850
},
{
"epoch": 3.99,
"learning_rate": 8.024248927038628e-06,
"loss": 1.5206,
"step": 1860
},
{
"epoch": 4.01,
"learning_rate": 8.01362660944206e-06,
"loss": 1.66,
"step": 1870
},
{
"epoch": 4.03,
"learning_rate": 8.003004291845495e-06,
"loss": 1.5193,
"step": 1880
},
{
"epoch": 4.06,
"learning_rate": 7.992381974248929e-06,
"loss": 1.5179,
"step": 1890
},
{
"epoch": 4.08,
"learning_rate": 7.981759656652362e-06,
"loss": 1.517,
"step": 1900
},
{
"epoch": 4.1,
"learning_rate": 7.971137339055795e-06,
"loss": 1.5177,
"step": 1910
},
{
"epoch": 4.12,
"learning_rate": 7.96051502145923e-06,
"loss": 1.52,
"step": 1920
},
{
"epoch": 4.12,
"eval_loss": 1.4117424488067627,
"eval_runtime": 79.9929,
"eval_samples_per_second": 500.044,
"eval_steps_per_second": 7.813,
"step": 1920
},
{
"epoch": 4.14,
"learning_rate": 7.949892703862661e-06,
"loss": 1.5204,
"step": 1930
},
{
"epoch": 4.16,
"learning_rate": 7.939270386266096e-06,
"loss": 1.5208,
"step": 1940
},
{
"epoch": 4.18,
"learning_rate": 7.928648068669529e-06,
"loss": 1.5167,
"step": 1950
},
{
"epoch": 4.21,
"learning_rate": 7.918025751072962e-06,
"loss": 1.5161,
"step": 1960
},
{
"epoch": 4.23,
"learning_rate": 7.907403433476395e-06,
"loss": 1.5168,
"step": 1970
},
{
"epoch": 4.25,
"learning_rate": 7.89678111587983e-06,
"loss": 1.5211,
"step": 1980
},
{
"epoch": 4.27,
"learning_rate": 7.886158798283263e-06,
"loss": 1.5203,
"step": 1990
},
{
"epoch": 4.29,
"learning_rate": 7.875536480686696e-06,
"loss": 1.5189,
"step": 2000
},
{
"epoch": 4.31,
"learning_rate": 7.864914163090129e-06,
"loss": 1.518,
"step": 2010
},
{
"epoch": 4.33,
"learning_rate": 7.854291845493562e-06,
"loss": 1.5172,
"step": 2020
},
{
"epoch": 4.36,
"learning_rate": 7.843669527896997e-06,
"loss": 1.5191,
"step": 2030
},
{
"epoch": 4.38,
"learning_rate": 7.83304721030043e-06,
"loss": 1.5206,
"step": 2040
},
{
"epoch": 4.4,
"learning_rate": 7.822424892703863e-06,
"loss": 1.5184,
"step": 2050
},
{
"epoch": 4.42,
"learning_rate": 7.811802575107296e-06,
"loss": 1.5221,
"step": 2060
},
{
"epoch": 4.44,
"learning_rate": 7.801180257510731e-06,
"loss": 1.5175,
"step": 2070
},
{
"epoch": 4.46,
"learning_rate": 7.790557939914162e-06,
"loss": 1.5184,
"step": 2080
},
{
"epoch": 4.46,
"eval_loss": 1.4151064157485962,
"eval_runtime": 81.878,
"eval_samples_per_second": 488.532,
"eval_steps_per_second": 7.633,
"step": 2080
},
{
"epoch": 4.48,
"learning_rate": 7.779935622317597e-06,
"loss": 1.5166,
"step": 2090
},
{
"epoch": 4.51,
"learning_rate": 7.76931330472103e-06,
"loss": 1.5172,
"step": 2100
},
{
"epoch": 4.53,
"learning_rate": 7.758690987124465e-06,
"loss": 1.5164,
"step": 2110
},
{
"epoch": 4.55,
"learning_rate": 7.748068669527898e-06,
"loss": 1.5145,
"step": 2120
},
{
"epoch": 4.57,
"learning_rate": 7.737446351931331e-06,
"loss": 1.5202,
"step": 2130
},
{
"epoch": 4.59,
"learning_rate": 7.726824034334764e-06,
"loss": 1.517,
"step": 2140
},
{
"epoch": 4.61,
"learning_rate": 7.7162017167382e-06,
"loss": 1.5185,
"step": 2150
},
{
"epoch": 4.63,
"learning_rate": 7.70557939914163e-06,
"loss": 1.5155,
"step": 2160
},
{
"epoch": 4.66,
"learning_rate": 7.694957081545065e-06,
"loss": 1.5193,
"step": 2170
},
{
"epoch": 4.68,
"learning_rate": 7.684334763948498e-06,
"loss": 1.5214,
"step": 2180
},
{
"epoch": 4.7,
"learning_rate": 7.673712446351932e-06,
"loss": 1.5178,
"step": 2190
},
{
"epoch": 4.72,
"learning_rate": 7.663090128755365e-06,
"loss": 1.518,
"step": 2200
},
{
"epoch": 4.74,
"learning_rate": 7.6524678111588e-06,
"loss": 1.5197,
"step": 2210
},
{
"epoch": 4.76,
"learning_rate": 7.641845493562233e-06,
"loss": 1.5167,
"step": 2220
},
{
"epoch": 4.78,
"learning_rate": 7.631223175965666e-06,
"loss": 1.5202,
"step": 2230
},
{
"epoch": 4.81,
"learning_rate": 7.620600858369098e-06,
"loss": 1.5198,
"step": 2240
},
{
"epoch": 4.81,
"eval_loss": 1.4096823930740356,
"eval_runtime": 80.6519,
"eval_samples_per_second": 495.959,
"eval_steps_per_second": 7.749,
"step": 2240
},
{
"epoch": 4.83,
"learning_rate": 7.609978540772531e-06,
"loss": 1.5202,
"step": 2250
},
{
"epoch": 4.85,
"learning_rate": 7.599356223175966e-06,
"loss": 1.5164,
"step": 2260
},
{
"epoch": 4.87,
"learning_rate": 7.5887339055794e-06,
"loss": 1.5208,
"step": 2270
},
{
"epoch": 4.89,
"learning_rate": 7.578111587982832e-06,
"loss": 1.5176,
"step": 2280
},
{
"epoch": 4.91,
"learning_rate": 7.567489270386266e-06,
"loss": 1.5201,
"step": 2290
},
{
"epoch": 4.93,
"learning_rate": 7.5568669527897e-06,
"loss": 1.5181,
"step": 2300
},
{
"epoch": 4.96,
"learning_rate": 7.546244635193132e-06,
"loss": 1.519,
"step": 2310
},
{
"epoch": 4.98,
"learning_rate": 7.535622317596566e-06,
"loss": 1.5198,
"step": 2320
},
{
"epoch": 5.0,
"learning_rate": 7.525e-06,
"loss": 1.5171,
"step": 2330
},
{
"epoch": 5.02,
"learning_rate": 7.514377682403433e-06,
"loss": 1.6592,
"step": 2340
},
{
"epoch": 5.04,
"learning_rate": 7.503755364806867e-06,
"loss": 1.5173,
"step": 2350
},
{
"epoch": 5.06,
"learning_rate": 7.4931330472103e-06,
"loss": 1.5182,
"step": 2360
},
{
"epoch": 5.09,
"learning_rate": 7.482510729613735e-06,
"loss": 1.5192,
"step": 2370
},
{
"epoch": 5.11,
"learning_rate": 7.471888412017167e-06,
"loss": 1.5198,
"step": 2380
},
{
"epoch": 5.13,
"learning_rate": 7.4612660944206e-06,
"loss": 1.5163,
"step": 2390
},
{
"epoch": 5.15,
"learning_rate": 7.450643776824033e-06,
"loss": 1.5202,
"step": 2400
},
{
"epoch": 5.15,
"eval_loss": 1.4161678552627563,
"eval_runtime": 75.8006,
"eval_samples_per_second": 527.7,
"eval_steps_per_second": 8.245,
"step": 2400
},
{
"epoch": 5.17,
"learning_rate": 7.440021459227468e-06,
"loss": 1.5173,
"step": 2410
},
{
"epoch": 5.19,
"learning_rate": 7.429399141630901e-06,
"loss": 1.5165,
"step": 2420
},
{
"epoch": 5.21,
"learning_rate": 7.418776824034335e-06,
"loss": 1.5169,
"step": 2430
},
{
"epoch": 5.24,
"learning_rate": 7.408154506437768e-06,
"loss": 1.5196,
"step": 2440
},
{
"epoch": 5.26,
"learning_rate": 7.397532188841201e-06,
"loss": 1.5176,
"step": 2450
},
{
"epoch": 5.28,
"learning_rate": 7.386909871244635e-06,
"loss": 1.5205,
"step": 2460
},
{
"epoch": 5.3,
"learning_rate": 7.376287553648069e-06,
"loss": 1.518,
"step": 2470
},
{
"epoch": 5.32,
"learning_rate": 7.3656652360515015e-06,
"loss": 1.5171,
"step": 2480
},
{
"epoch": 5.34,
"learning_rate": 7.355042918454935e-06,
"loss": 1.5211,
"step": 2490
},
{
"epoch": 5.36,
"learning_rate": 7.344420600858369e-06,
"loss": 1.5201,
"step": 2500
},
{
"epoch": 5.39,
"learning_rate": 7.333798283261802e-06,
"loss": 1.5177,
"step": 2510
},
{
"epoch": 5.41,
"learning_rate": 7.3231759656652355e-06,
"loss": 1.5194,
"step": 2520
},
{
"epoch": 5.43,
"learning_rate": 7.3125536480686695e-06,
"loss": 1.5175,
"step": 2530
},
{
"epoch": 5.45,
"learning_rate": 7.3019313304721026e-06,
"loss": 1.5177,
"step": 2540
},
{
"epoch": 5.47,
"learning_rate": 7.291309012875537e-06,
"loss": 1.5184,
"step": 2550
},
{
"epoch": 5.49,
"learning_rate": 7.2806866952789704e-06,
"loss": 1.5166,
"step": 2560
},
{
"epoch": 5.49,
"eval_loss": 1.4130014181137085,
"eval_runtime": 75.8444,
"eval_samples_per_second": 527.396,
"eval_steps_per_second": 8.241,
"step": 2560
},
{
"epoch": 5.51,
"learning_rate": 7.2700643776824035e-06,
"loss": 1.5158,
"step": 2570
},
{
"epoch": 5.54,
"learning_rate": 7.2594420600858375e-06,
"loss": 1.5163,
"step": 2580
},
{
"epoch": 5.56,
"learning_rate": 7.2488197424892706e-06,
"loss": 1.5173,
"step": 2590
},
{
"epoch": 5.58,
"learning_rate": 7.238197424892704e-06,
"loss": 1.5206,
"step": 2600
},
{
"epoch": 5.6,
"learning_rate": 7.227575107296138e-06,
"loss": 1.515,
"step": 2610
},
{
"epoch": 5.62,
"learning_rate": 7.216952789699571e-06,
"loss": 1.5196,
"step": 2620
},
{
"epoch": 5.64,
"learning_rate": 7.206330472103005e-06,
"loss": 1.5179,
"step": 2630
},
{
"epoch": 5.66,
"learning_rate": 7.195708154506439e-06,
"loss": 1.5226,
"step": 2640
},
{
"epoch": 5.69,
"learning_rate": 7.185085836909871e-06,
"loss": 1.5206,
"step": 2650
},
{
"epoch": 5.71,
"learning_rate": 7.174463519313305e-06,
"loss": 1.5173,
"step": 2660
},
{
"epoch": 5.73,
"learning_rate": 7.163841201716739e-06,
"loss": 1.5169,
"step": 2670
},
{
"epoch": 5.75,
"learning_rate": 7.153218884120171e-06,
"loss": 1.5199,
"step": 2680
},
{
"epoch": 5.77,
"learning_rate": 7.142596566523605e-06,
"loss": 1.5179,
"step": 2690
},
{
"epoch": 5.79,
"learning_rate": 7.13197424892704e-06,
"loss": 1.5202,
"step": 2700
},
{
"epoch": 5.81,
"learning_rate": 7.121351931330473e-06,
"loss": 1.5202,
"step": 2710
},
{
"epoch": 5.84,
"learning_rate": 7.110729613733906e-06,
"loss": 1.5184,
"step": 2720
},
{
"epoch": 5.84,
"eval_loss": 1.4138652086257935,
"eval_runtime": 75.865,
"eval_samples_per_second": 527.252,
"eval_steps_per_second": 8.238,
"step": 2720
},
{
"epoch": 5.86,
"learning_rate": 7.100107296137338e-06,
"loss": 1.5209,
"step": 2730
},
{
"epoch": 5.88,
"learning_rate": 7.089484978540771e-06,
"loss": 1.5183,
"step": 2740
},
{
"epoch": 5.9,
"learning_rate": 7.078862660944206e-06,
"loss": 1.5196,
"step": 2750
},
{
"epoch": 5.92,
"learning_rate": 7.06824034334764e-06,
"loss": 1.5169,
"step": 2760
},
{
"epoch": 5.94,
"learning_rate": 7.057618025751072e-06,
"loss": 1.5194,
"step": 2770
},
{
"epoch": 5.96,
"learning_rate": 7.046995708154507e-06,
"loss": 1.5211,
"step": 2780
},
{
"epoch": 5.99,
"learning_rate": 7.03637339055794e-06,
"loss": 1.5191,
"step": 2790
},
{
"epoch": 6.01,
"learning_rate": 7.025751072961372e-06,
"loss": 1.6597,
"step": 2800
},
{
"epoch": 6.03,
"learning_rate": 7.015128755364807e-06,
"loss": 1.5205,
"step": 2810
},
{
"epoch": 6.05,
"learning_rate": 7.00450643776824e-06,
"loss": 1.5186,
"step": 2820
},
{
"epoch": 6.07,
"learning_rate": 6.993884120171674e-06,
"loss": 1.517,
"step": 2830
},
{
"epoch": 6.09,
"learning_rate": 6.983261802575107e-06,
"loss": 1.5152,
"step": 2840
},
{
"epoch": 6.12,
"learning_rate": 6.97263948497854e-06,
"loss": 1.5173,
"step": 2850
},
{
"epoch": 6.14,
"learning_rate": 6.962017167381975e-06,
"loss": 1.5206,
"step": 2860
},
{
"epoch": 6.16,
"learning_rate": 6.951394849785407e-06,
"loss": 1.5178,
"step": 2870
},
{
"epoch": 6.18,
"learning_rate": 6.9407725321888405e-06,
"loss": 1.5174,
"step": 2880
},
{
"epoch": 6.18,
"eval_loss": 1.4127614498138428,
"eval_runtime": 75.8874,
"eval_samples_per_second": 527.097,
"eval_steps_per_second": 8.236,
"step": 2880
},
{
"epoch": 6.2,
"learning_rate": 6.930150214592274e-06,
"loss": 1.5191,
"step": 2890
},
{
"epoch": 6.22,
"learning_rate": 6.919527896995708e-06,
"loss": 1.5183,
"step": 2900
},
{
"epoch": 6.24,
"learning_rate": 6.9089055793991415e-06,
"loss": 1.5223,
"step": 2910
},
{
"epoch": 6.27,
"learning_rate": 6.898283261802575e-06,
"loss": 1.5183,
"step": 2920
},
{
"epoch": 6.29,
"learning_rate": 6.8876609442060085e-06,
"loss": 1.5192,
"step": 2930
},
{
"epoch": 6.31,
"learning_rate": 6.877038626609442e-06,
"loss": 1.5175,
"step": 2940
},
{
"epoch": 6.33,
"learning_rate": 6.8664163090128755e-06,
"loss": 1.5176,
"step": 2950
},
{
"epoch": 6.35,
"learning_rate": 6.8557939914163095e-06,
"loss": 1.5202,
"step": 2960
},
{
"epoch": 6.37,
"learning_rate": 6.845171673819742e-06,
"loss": 1.5187,
"step": 2970
},
{
"epoch": 6.39,
"learning_rate": 6.834549356223176e-06,
"loss": 1.5184,
"step": 2980
},
{
"epoch": 6.42,
"learning_rate": 6.82392703862661e-06,
"loss": 1.5202,
"step": 2990
},
{
"epoch": 6.44,
"learning_rate": 6.813304721030042e-06,
"loss": 1.5158,
"step": 3000
},
{
"epoch": 6.46,
"learning_rate": 6.8026824034334775e-06,
"loss": 1.5215,
"step": 3010
},
{
"epoch": 6.48,
"learning_rate": 6.79206008583691e-06,
"loss": 1.5158,
"step": 3020
},
{
"epoch": 6.5,
"learning_rate": 6.781437768240343e-06,
"loss": 1.5161,
"step": 3030
},
{
"epoch": 6.52,
"learning_rate": 6.770815450643778e-06,
"loss": 1.5161,
"step": 3040
},
{
"epoch": 6.52,
"eval_loss": 1.412627100944519,
"eval_runtime": 75.8175,
"eval_samples_per_second": 527.583,
"eval_steps_per_second": 8.243,
"step": 3040
},
{
"epoch": 6.54,
"learning_rate": 6.760193133047211e-06,
"loss": 1.5151,
"step": 3050
},
{
"epoch": 6.57,
"learning_rate": 6.749570815450644e-06,
"loss": 1.5175,
"step": 3060
},
{
"epoch": 6.59,
"learning_rate": 6.738948497854078e-06,
"loss": 1.5197,
"step": 3070
},
{
"epoch": 6.61,
"learning_rate": 6.728326180257511e-06,
"loss": 1.5172,
"step": 3080
},
{
"epoch": 6.63,
"learning_rate": 6.717703862660944e-06,
"loss": 1.5193,
"step": 3090
},
{
"epoch": 6.65,
"learning_rate": 6.707081545064378e-06,
"loss": 1.5191,
"step": 3100
},
{
"epoch": 6.67,
"learning_rate": 6.696459227467811e-06,
"loss": 1.5202,
"step": 3110
},
{
"epoch": 6.69,
"learning_rate": 6.685836909871245e-06,
"loss": 1.5184,
"step": 3120
},
{
"epoch": 6.72,
"learning_rate": 6.675214592274679e-06,
"loss": 1.5173,
"step": 3130
},
{
"epoch": 6.74,
"learning_rate": 6.664592274678111e-06,
"loss": 1.5202,
"step": 3140
},
{
"epoch": 6.76,
"learning_rate": 6.653969957081545e-06,
"loss": 1.517,
"step": 3150
},
{
"epoch": 6.78,
"learning_rate": 6.643347639484979e-06,
"loss": 1.5171,
"step": 3160
},
{
"epoch": 6.8,
"learning_rate": 6.632725321888411e-06,
"loss": 1.52,
"step": 3170
},
{
"epoch": 6.82,
"learning_rate": 6.622103004291845e-06,
"loss": 1.5191,
"step": 3180
},
{
"epoch": 6.84,
"learning_rate": 6.61148068669528e-06,
"loss": 1.5169,
"step": 3190
},
{
"epoch": 6.87,
"learning_rate": 6.600858369098713e-06,
"loss": 1.5175,
"step": 3200
},
{
"epoch": 6.87,
"eval_loss": 1.4094613790512085,
"eval_runtime": 75.8182,
"eval_samples_per_second": 527.578,
"eval_steps_per_second": 8.243,
"step": 3200
},
{
"epoch": 6.89,
"learning_rate": 6.590236051502146e-06,
"loss": 1.5178,
"step": 3210
},
{
"epoch": 6.91,
"learning_rate": 6.579613733905578e-06,
"loss": 1.5206,
"step": 3220
},
{
"epoch": 6.93,
"learning_rate": 6.5689914163090115e-06,
"loss": 1.519,
"step": 3230
},
{
"epoch": 6.95,
"learning_rate": 6.558369098712447e-06,
"loss": 1.5162,
"step": 3240
},
{
"epoch": 6.97,
"learning_rate": 6.54774678111588e-06,
"loss": 1.5203,
"step": 3250
},
{
"epoch": 6.99,
"learning_rate": 6.5371244635193125e-06,
"loss": 1.5193,
"step": 3260
},
{
"epoch": 7.02,
"learning_rate": 6.526502145922747e-06,
"loss": 1.6616,
"step": 3270
},
{
"epoch": 7.04,
"learning_rate": 6.51587982832618e-06,
"loss": 1.5154,
"step": 3280
},
{
"epoch": 7.06,
"learning_rate": 6.505257510729613e-06,
"loss": 1.5169,
"step": 3290
},
{
"epoch": 7.08,
"learning_rate": 6.494635193133047e-06,
"loss": 1.5204,
"step": 3300
},
{
"epoch": 7.1,
"learning_rate": 6.4840128755364805e-06,
"loss": 1.5175,
"step": 3310
},
{
"epoch": 7.12,
"learning_rate": 6.4733905579399144e-06,
"loss": 1.5195,
"step": 3320
},
{
"epoch": 7.15,
"learning_rate": 6.4627682403433475e-06,
"loss": 1.5165,
"step": 3330
},
{
"epoch": 7.17,
"learning_rate": 6.452145922746781e-06,
"loss": 1.5161,
"step": 3340
},
{
"epoch": 7.19,
"learning_rate": 6.441523605150215e-06,
"loss": 1.5182,
"step": 3350
},
{
"epoch": 7.21,
"learning_rate": 6.430901287553648e-06,
"loss": 1.5169,
"step": 3360
},
{
"epoch": 7.21,
"eval_loss": 1.4117693901062012,
"eval_runtime": 75.551,
"eval_samples_per_second": 529.443,
"eval_steps_per_second": 8.273,
"step": 3360
},
{
"epoch": 7.23,
"learning_rate": 6.420278969957081e-06,
"loss": 1.5192,
"step": 3370
},
{
"epoch": 7.25,
"learning_rate": 6.409656652360514e-06,
"loss": 1.5188,
"step": 3380
},
{
"epoch": 7.27,
"learning_rate": 6.399034334763949e-06,
"loss": 1.5219,
"step": 3390
},
{
"epoch": 7.3,
"learning_rate": 6.388412017167382e-06,
"loss": 1.5202,
"step": 3400
},
{
"epoch": 7.32,
"learning_rate": 6.377789699570816e-06,
"loss": 1.5176,
"step": 3410
},
{
"epoch": 7.34,
"learning_rate": 6.367167381974249e-06,
"loss": 1.5178,
"step": 3420
},
{
"epoch": 7.36,
"learning_rate": 6.356545064377682e-06,
"loss": 1.5185,
"step": 3430
},
{
"epoch": 7.38,
"learning_rate": 6.345922746781116e-06,
"loss": 1.52,
"step": 3440
},
{
"epoch": 7.4,
"learning_rate": 6.33530042918455e-06,
"loss": 1.5186,
"step": 3450
},
{
"epoch": 7.42,
"learning_rate": 6.324678111587982e-06,
"loss": 1.5197,
"step": 3460
},
{
"epoch": 7.45,
"learning_rate": 6.314055793991417e-06,
"loss": 1.5171,
"step": 3470
},
{
"epoch": 7.47,
"learning_rate": 6.30343347639485e-06,
"loss": 1.5193,
"step": 3480
},
{
"epoch": 7.49,
"learning_rate": 6.292811158798282e-06,
"loss": 1.5145,
"step": 3490
},
{
"epoch": 7.51,
"learning_rate": 6.282188841201718e-06,
"loss": 1.5151,
"step": 3500
},
{
"epoch": 7.53,
"learning_rate": 6.27156652360515e-06,
"loss": 1.5172,
"step": 3510
},
{
"epoch": 7.55,
"learning_rate": 6.260944206008583e-06,
"loss": 1.516,
"step": 3520
},
{
"epoch": 7.55,
"eval_loss": 1.411309003829956,
"eval_runtime": 75.5679,
"eval_samples_per_second": 529.326,
"eval_steps_per_second": 8.271,
"step": 3520
},
{
"epoch": 7.57,
"learning_rate": 6.250321888412018e-06,
"loss": 1.5177,
"step": 3530
},
{
"epoch": 7.6,
"learning_rate": 6.239699570815451e-06,
"loss": 1.5179,
"step": 3540
},
{
"epoch": 7.62,
"learning_rate": 6.229077253218884e-06,
"loss": 1.5194,
"step": 3550
},
{
"epoch": 7.64,
"learning_rate": 6.218454935622318e-06,
"loss": 1.5171,
"step": 3560
},
{
"epoch": 7.66,
"learning_rate": 6.207832618025751e-06,
"loss": 1.5225,
"step": 3570
},
{
"epoch": 7.68,
"learning_rate": 6.197210300429184e-06,
"loss": 1.5184,
"step": 3580
},
{
"epoch": 7.7,
"learning_rate": 6.186587982832618e-06,
"loss": 1.5148,
"step": 3590
},
{
"epoch": 7.72,
"learning_rate": 6.175965665236051e-06,
"loss": 1.5159,
"step": 3600
},
{
"epoch": 7.75,
"learning_rate": 6.165343347639485e-06,
"loss": 1.5202,
"step": 3610
},
{
"epoch": 7.77,
"learning_rate": 6.154721030042919e-06,
"loss": 1.5146,
"step": 3620
},
{
"epoch": 7.79,
"learning_rate": 6.144098712446351e-06,
"loss": 1.5195,
"step": 3630
},
{
"epoch": 7.81,
"learning_rate": 6.133476394849785e-06,
"loss": 1.5179,
"step": 3640
},
{
"epoch": 7.83,
"learning_rate": 6.122854077253219e-06,
"loss": 1.5217,
"step": 3650
},
{
"epoch": 7.85,
"learning_rate": 6.1122317596566515e-06,
"loss": 1.5221,
"step": 3660
},
{
"epoch": 7.87,
"learning_rate": 6.1016094420600854e-06,
"loss": 1.518,
"step": 3670
},
{
"epoch": 7.9,
"learning_rate": 6.09098712446352e-06,
"loss": 1.5182,
"step": 3680
},
{
"epoch": 7.9,
"eval_loss": 1.4096906185150146,
"eval_runtime": 75.6834,
"eval_samples_per_second": 528.517,
"eval_steps_per_second": 8.258,
"step": 3680
},
{
"epoch": 7.92,
"learning_rate": 6.080364806866953e-06,
"loss": 1.5166,
"step": 3690
},
{
"epoch": 7.94,
"learning_rate": 6.069742489270387e-06,
"loss": 1.5203,
"step": 3700
},
{
"epoch": 7.96,
"learning_rate": 6.059120171673819e-06,
"loss": 1.5183,
"step": 3710
},
{
"epoch": 7.98,
"learning_rate": 6.048497854077252e-06,
"loss": 1.5186,
"step": 3720
},
{
"epoch": 8.0,
"learning_rate": 6.037875536480687e-06,
"loss": 1.6603,
"step": 3730
},
{
"epoch": 8.03,
"learning_rate": 6.02725321888412e-06,
"loss": 1.5197,
"step": 3740
},
{
"epoch": 8.05,
"learning_rate": 6.016630901287553e-06,
"loss": 1.521,
"step": 3750
},
{
"epoch": 8.07,
"learning_rate": 6.0060085836909875e-06,
"loss": 1.5189,
"step": 3760
},
{
"epoch": 8.09,
"learning_rate": 5.995386266094421e-06,
"loss": 1.5204,
"step": 3770
},
{
"epoch": 8.11,
"learning_rate": 5.9847639484978546e-06,
"loss": 1.5182,
"step": 3780
},
{
"epoch": 8.13,
"learning_rate": 5.974141630901288e-06,
"loss": 1.5174,
"step": 3790
},
{
"epoch": 8.15,
"learning_rate": 5.963519313304721e-06,
"loss": 1.5173,
"step": 3800
},
{
"epoch": 8.18,
"learning_rate": 5.952896995708155e-06,
"loss": 1.5165,
"step": 3810
},
{
"epoch": 8.2,
"learning_rate": 5.942274678111588e-06,
"loss": 1.5148,
"step": 3820
},
{
"epoch": 8.22,
"learning_rate": 5.931652360515021e-06,
"loss": 1.5177,
"step": 3830
},
{
"epoch": 8.24,
"learning_rate": 5.921030042918456e-06,
"loss": 1.5195,
"step": 3840
},
{
"epoch": 8.24,
"eval_loss": 1.4117556810379028,
"eval_runtime": 75.1101,
"eval_samples_per_second": 532.551,
"eval_steps_per_second": 8.321,
"step": 3840
},
{
"epoch": 8.26,
"learning_rate": 5.910407725321888e-06,
"loss": 1.5196,
"step": 3850
},
{
"epoch": 8.28,
"learning_rate": 5.899785407725321e-06,
"loss": 1.5199,
"step": 3860
},
{
"epoch": 8.3,
"learning_rate": 5.889163090128754e-06,
"loss": 1.5181,
"step": 3870
},
{
"epoch": 8.33,
"learning_rate": 5.878540772532189e-06,
"loss": 1.5141,
"step": 3880
},
{
"epoch": 8.02,
"learning_rate": 5.867918454935622e-06,
"loss": 1.5185,
"step": 3890
},
{
"epoch": 8.04,
"learning_rate": 5.857296137339056e-06,
"loss": 1.5208,
"step": 3900
},
{
"epoch": 8.06,
"learning_rate": 5.846673819742489e-06,
"loss": 1.5179,
"step": 3910
},
{
"epoch": 8.09,
"learning_rate": 5.836051502145922e-06,
"loss": 1.5178,
"step": 3920
},
{
"epoch": 8.11,
"learning_rate": 5.825429184549356e-06,
"loss": 1.5186,
"step": 3930
},
{
"epoch": 8.13,
"learning_rate": 5.81480686695279e-06,
"loss": 1.5197,
"step": 3940
},
{
"epoch": 8.15,
"learning_rate": 5.804184549356222e-06,
"loss": 1.5179,
"step": 3950
},
{
"epoch": 8.17,
"learning_rate": 5.793562231759657e-06,
"loss": 1.5192,
"step": 3960
},
{
"epoch": 8.19,
"learning_rate": 5.78293991416309e-06,
"loss": 1.5174,
"step": 3970
},
{
"epoch": 8.21,
"learning_rate": 5.772317596566522e-06,
"loss": 1.5159,
"step": 3980
},
{
"epoch": 8.24,
"learning_rate": 5.761695278969958e-06,
"loss": 1.5194,
"step": 3990
},
{
"epoch": 8.26,
"learning_rate": 5.75107296137339e-06,
"loss": 1.5187,
"step": 4000
},
{
"epoch": 8.26,
"eval_loss": 1.4119081497192383,
"eval_runtime": 75.3322,
"eval_samples_per_second": 530.981,
"eval_steps_per_second": 8.297,
"step": 4000
},
{
"epoch": 8.28,
"learning_rate": 5.740450643776823e-06,
"loss": 1.5213,
"step": 4010
},
{
"epoch": 8.3,
"learning_rate": 5.729828326180258e-06,
"loss": 1.5207,
"step": 4020
},
{
"epoch": 8.32,
"learning_rate": 5.719206008583691e-06,
"loss": 1.5175,
"step": 4030
},
{
"epoch": 8.34,
"learning_rate": 5.708583690987124e-06,
"loss": 1.5199,
"step": 4040
},
{
"epoch": 8.36,
"learning_rate": 5.697961373390558e-06,
"loss": 1.5199,
"step": 4050
},
{
"epoch": 8.39,
"learning_rate": 5.687339055793991e-06,
"loss": 1.5171,
"step": 4060
},
{
"epoch": 8.41,
"learning_rate": 5.6767167381974245e-06,
"loss": 1.5184,
"step": 4070
},
{
"epoch": 8.43,
"learning_rate": 5.666094420600858e-06,
"loss": 1.5198,
"step": 4080
},
{
"epoch": 8.45,
"learning_rate": 5.6554721030042915e-06,
"loss": 1.5176,
"step": 4090
},
{
"epoch": 8.47,
"learning_rate": 5.6448497854077255e-06,
"loss": 1.5203,
"step": 4100
},
{
"epoch": 8.49,
"learning_rate": 5.634227467811159e-06,
"loss": 1.5168,
"step": 4110
},
{
"epoch": 8.51,
"learning_rate": 5.623605150214592e-06,
"loss": 1.5165,
"step": 4120
},
{
"epoch": 8.54,
"learning_rate": 5.612982832618026e-06,
"loss": 1.5179,
"step": 4130
},
{
"epoch": 8.56,
"learning_rate": 5.6023605150214595e-06,
"loss": 1.5199,
"step": 4140
},
{
"epoch": 8.58,
"learning_rate": 5.591738197424892e-06,
"loss": 1.5183,
"step": 4150
},
{
"epoch": 8.6,
"learning_rate": 5.581115879828326e-06,
"loss": 1.5149,
"step": 4160
},
{
"epoch": 8.6,
"eval_loss": 1.4132717847824097,
"eval_runtime": 75.3167,
"eval_samples_per_second": 531.09,
"eval_steps_per_second": 8.298,
"step": 4160
},
{
"epoch": 8.62,
"learning_rate": 5.5704935622317605e-06,
"loss": 1.5219,
"step": 4170
},
{
"epoch": 8.64,
"learning_rate": 5.559871244635194e-06,
"loss": 1.5165,
"step": 4180
},
{
"epoch": 8.66,
"learning_rate": 5.5492489270386275e-06,
"loss": 1.5201,
"step": 4190
},
{
"epoch": 8.69,
"learning_rate": 5.538626609442059e-06,
"loss": 1.5194,
"step": 4200
},
{
"epoch": 8.71,
"learning_rate": 5.528004291845492e-06,
"loss": 1.5175,
"step": 4210
},
{
"epoch": 8.73,
"learning_rate": 5.517381974248928e-06,
"loss": 1.5168,
"step": 4220
},
{
"epoch": 8.75,
"learning_rate": 5.50675965665236e-06,
"loss": 1.5181,
"step": 4230
},
{
"epoch": 8.77,
"learning_rate": 5.496137339055794e-06,
"loss": 1.5192,
"step": 4240
},
{
"epoch": 8.79,
"learning_rate": 5.485515021459228e-06,
"loss": 1.5203,
"step": 4250
},
{
"epoch": 8.81,
"learning_rate": 5.474892703862661e-06,
"loss": 1.5199,
"step": 4260
},
{
"epoch": 8.84,
"learning_rate": 5.464270386266095e-06,
"loss": 1.5191,
"step": 4270
},
{
"epoch": 8.86,
"learning_rate": 5.453648068669528e-06,
"loss": 1.5191,
"step": 4280
},
{
"epoch": 8.88,
"learning_rate": 5.443025751072961e-06,
"loss": 1.519,
"step": 4290
},
{
"epoch": 8.9,
"learning_rate": 5.432403433476395e-06,
"loss": 1.5198,
"step": 4300
},
{
"epoch": 8.92,
"learning_rate": 5.421781115879828e-06,
"loss": 1.5212,
"step": 4310
},
{
"epoch": 8.94,
"learning_rate": 5.411158798283261e-06,
"loss": 1.5183,
"step": 4320
},
{
"epoch": 8.94,
"eval_loss": 1.4096869230270386,
"eval_runtime": 75.5469,
"eval_samples_per_second": 529.472,
"eval_steps_per_second": 8.273,
"step": 4320
},
{
"epoch": 8.96,
"learning_rate": 5.400536480686696e-06,
"loss": 1.5217,
"step": 4330
},
{
"epoch": 8.99,
"learning_rate": 5.389914163090128e-06,
"loss": 1.5194,
"step": 4340
},
{
"epoch": 9.01,
"learning_rate": 5.379291845493561e-06,
"loss": 1.6613,
"step": 4350
},
{
"epoch": 9.03,
"learning_rate": 5.368669527896994e-06,
"loss": 1.5163,
"step": 4360
},
{
"epoch": 9.05,
"learning_rate": 5.358047210300429e-06,
"loss": 1.5187,
"step": 4370
},
{
"epoch": 9.07,
"learning_rate": 5.347424892703862e-06,
"loss": 1.5219,
"step": 4380
},
{
"epoch": 9.09,
"learning_rate": 5.336802575107296e-06,
"loss": 1.5155,
"step": 4390
},
{
"epoch": 9.12,
"learning_rate": 5.326180257510729e-06,
"loss": 1.5201,
"step": 4400
},
{
"epoch": 9.14,
"learning_rate": 5.315557939914162e-06,
"loss": 1.5186,
"step": 4410
},
{
"epoch": 9.16,
"learning_rate": 5.304935622317597e-06,
"loss": 1.519,
"step": 4420
},
{
"epoch": 9.18,
"learning_rate": 5.29431330472103e-06,
"loss": 1.5198,
"step": 4430
},
{
"epoch": 9.2,
"learning_rate": 5.2836909871244625e-06,
"loss": 1.5179,
"step": 4440
},
{
"epoch": 9.22,
"learning_rate": 5.273068669527897e-06,
"loss": 1.518,
"step": 4450
},
{
"epoch": 9.24,
"learning_rate": 5.26244635193133e-06,
"loss": 1.5194,
"step": 4460
},
{
"epoch": 9.27,
"learning_rate": 5.251824034334765e-06,
"loss": 1.5188,
"step": 4470
},
{
"epoch": 9.29,
"learning_rate": 5.241201716738198e-06,
"loss": 1.5192,
"step": 4480
},
{
"epoch": 9.29,
"eval_loss": 1.4101494550704956,
"eval_runtime": 75.3568,
"eval_samples_per_second": 530.808,
"eval_steps_per_second": 8.294,
"step": 4480
},
{
"epoch": 9.31,
"learning_rate": 5.2305793991416305e-06,
"loss": 1.515,
"step": 4490
},
{
"epoch": 9.33,
"learning_rate": 5.219957081545064e-06,
"loss": 1.5177,
"step": 4500
},
{
"epoch": 9.35,
"learning_rate": 5.2093347639484984e-06,
"loss": 1.5199,
"step": 4510
},
{
"epoch": 9.37,
"learning_rate": 5.1987124463519315e-06,
"loss": 1.5191,
"step": 4520
},
{
"epoch": 9.39,
"learning_rate": 5.188090128755365e-06,
"loss": 1.518,
"step": 4530
},
{
"epoch": 9.42,
"learning_rate": 5.1774678111587986e-06,
"loss": 1.5198,
"step": 4540
},
{
"epoch": 9.44,
"learning_rate": 5.166845493562232e-06,
"loss": 1.519,
"step": 4550
},
{
"epoch": 9.46,
"learning_rate": 5.156223175965665e-06,
"loss": 1.519,
"step": 4560
},
{
"epoch": 9.48,
"learning_rate": 5.145600858369099e-06,
"loss": 1.5173,
"step": 4570
},
{
"epoch": 9.5,
"learning_rate": 5.134978540772532e-06,
"loss": 1.5176,
"step": 4580
},
{
"epoch": 9.52,
"learning_rate": 5.124356223175966e-06,
"loss": 1.5165,
"step": 4590
},
{
"epoch": 9.54,
"learning_rate": 5.1137339055794e-06,
"loss": 1.5119,
"step": 4600
},
{
"epoch": 9.57,
"learning_rate": 5.103111587982832e-06,
"loss": 1.518,
"step": 4610
},
{
"epoch": 9.59,
"learning_rate": 5.092489270386266e-06,
"loss": 1.5174,
"step": 4620
},
{
"epoch": 9.61,
"learning_rate": 5.0818669527897e-06,
"loss": 1.5171,
"step": 4630
},
{
"epoch": 9.63,
"learning_rate": 5.071244635193132e-06,
"loss": 1.5191,
"step": 4640
},
{
"epoch": 9.63,
"eval_loss": 1.414576768875122,
"eval_runtime": 75.373,
"eval_samples_per_second": 530.694,
"eval_steps_per_second": 8.292,
"step": 4640
},
{
"epoch": 9.65,
"learning_rate": 5.060622317596566e-06,
"loss": 1.5188,
"step": 4650
},
{
"epoch": 9.67,
"learning_rate": 5.050000000000001e-06,
"loss": 1.5213,
"step": 4660
},
{
"epoch": 9.69,
"learning_rate": 5.039377682403434e-06,
"loss": 1.5196,
"step": 4670
},
{
"epoch": 9.72,
"learning_rate": 5.028755364806867e-06,
"loss": 1.5159,
"step": 4680
},
{
"epoch": 9.74,
"learning_rate": 5.018133047210299e-06,
"loss": 1.5182,
"step": 4690
},
{
"epoch": 9.76,
"learning_rate": 5.007510729613734e-06,
"loss": 1.5166,
"step": 4700
},
{
"epoch": 9.78,
"learning_rate": 4.996888412017167e-06,
"loss": 1.5171,
"step": 4710
},
{
"epoch": 9.8,
"learning_rate": 4.9862660944206e-06,
"loss": 1.5191,
"step": 4720
},
{
"epoch": 9.82,
"learning_rate": 4.975643776824034e-06,
"loss": 1.5206,
"step": 4730
},
{
"epoch": 9.84,
"learning_rate": 4.965021459227468e-06,
"loss": 1.5189,
"step": 4740
},
{
"epoch": 9.87,
"learning_rate": 4.954399141630901e-06,
"loss": 1.5213,
"step": 4750
},
{
"epoch": 9.89,
"learning_rate": 4.943776824034335e-06,
"loss": 1.5168,
"step": 4760
},
{
"epoch": 9.91,
"learning_rate": 4.933154506437768e-06,
"loss": 1.5192,
"step": 4770
},
{
"epoch": 9.93,
"learning_rate": 4.922532188841201e-06,
"loss": 1.5167,
"step": 4780
},
{
"epoch": 9.95,
"learning_rate": 4.911909871244635e-06,
"loss": 1.5174,
"step": 4790
},
{
"epoch": 9.97,
"learning_rate": 4.901287553648069e-06,
"loss": 1.5192,
"step": 4800
},
{
"epoch": 9.97,
"eval_loss": 1.4164894819259644,
"eval_runtime": 75.1669,
"eval_samples_per_second": 532.149,
"eval_steps_per_second": 8.315,
"step": 4800
},
{
"epoch": 9.99,
"learning_rate": 4.8906652360515014e-06,
"loss": 1.5199,
"step": 4810
},
{
"epoch": 10.02,
"learning_rate": 4.880042918454935e-06,
"loss": 1.6615,
"step": 4820
},
{
"epoch": 10.04,
"learning_rate": 4.869420600858369e-06,
"loss": 1.5158,
"step": 4830
},
{
"epoch": 10.06,
"learning_rate": 4.858798283261803e-06,
"loss": 1.5177,
"step": 4840
},
{
"epoch": 10.08,
"learning_rate": 4.8481759656652355e-06,
"loss": 1.5187,
"step": 4850
},
{
"epoch": 10.1,
"learning_rate": 4.8375536480686694e-06,
"loss": 1.5207,
"step": 4860
},
{
"epoch": 10.12,
"learning_rate": 4.826931330472103e-06,
"loss": 1.5193,
"step": 4870
},
{
"epoch": 10.15,
"learning_rate": 4.8163090128755365e-06,
"loss": 1.5177,
"step": 4880
},
{
"epoch": 10.17,
"learning_rate": 4.8056866952789696e-06,
"loss": 1.5195,
"step": 4890
},
{
"epoch": 10.19,
"learning_rate": 4.7950643776824035e-06,
"loss": 1.5171,
"step": 4900
},
{
"epoch": 10.21,
"learning_rate": 4.784442060085837e-06,
"loss": 1.5149,
"step": 4910
},
{
"epoch": 10.23,
"learning_rate": 4.7738197424892706e-06,
"loss": 1.5177,
"step": 4920
},
{
"epoch": 10.25,
"learning_rate": 4.763197424892704e-06,
"loss": 1.5178,
"step": 4930
},
{
"epoch": 10.27,
"learning_rate": 4.752575107296137e-06,
"loss": 1.5182,
"step": 4940
},
{
"epoch": 10.3,
"learning_rate": 4.741952789699571e-06,
"loss": 1.5177,
"step": 4950
},
{
"epoch": 10.32,
"learning_rate": 4.731330472103005e-06,
"loss": 1.5164,
"step": 4960
},
{
"epoch": 10.32,
"eval_loss": 1.411863923072815,
"eval_runtime": 75.1375,
"eval_samples_per_second": 532.358,
"eval_steps_per_second": 8.318,
"step": 4960
},
{
"epoch": 10.34,
"learning_rate": 4.7207081545064386e-06,
"loss": 1.5154,
"step": 4970
},
{
"epoch": 10.36,
"learning_rate": 4.710085836909871e-06,
"loss": 1.5185,
"step": 4980
},
{
"epoch": 10.38,
"learning_rate": 4.699463519313305e-06,
"loss": 1.5185,
"step": 4990
},
{
"epoch": 10.4,
"learning_rate": 4.688841201716739e-06,
"loss": 1.5181,
"step": 5000
},
{
"epoch": 10.42,
"learning_rate": 4.678218884120171e-06,
"loss": 1.5178,
"step": 5010
},
{
"epoch": 10.45,
"learning_rate": 4.667596566523605e-06,
"loss": 1.5163,
"step": 5020
},
{
"epoch": 10.47,
"learning_rate": 4.656974248927039e-06,
"loss": 1.5222,
"step": 5030
},
{
"epoch": 10.49,
"learning_rate": 4.646351931330472e-06,
"loss": 1.5118,
"step": 5040
},
{
"epoch": 10.51,
"learning_rate": 4.635729613733905e-06,
"loss": 1.5157,
"step": 5050
},
{
"epoch": 10.53,
"learning_rate": 4.625107296137339e-06,
"loss": 1.5164,
"step": 5060
},
{
"epoch": 10.55,
"learning_rate": 4.614484978540773e-06,
"loss": 1.5154,
"step": 5070
},
{
"epoch": 10.57,
"learning_rate": 4.603862660944206e-06,
"loss": 1.519,
"step": 5080
},
{
"epoch": 10.6,
"learning_rate": 4.59324034334764e-06,
"loss": 1.5173,
"step": 5090
},
{
"epoch": 10.62,
"learning_rate": 4.582618025751073e-06,
"loss": 1.5173,
"step": 5100
},
{
"epoch": 10.64,
"learning_rate": 4.571995708154506e-06,
"loss": 1.5178,
"step": 5110
},
{
"epoch": 10.66,
"learning_rate": 4.56137339055794e-06,
"loss": 1.5235,
"step": 5120
},
{
"epoch": 10.66,
"eval_loss": 1.4089233875274658,
"eval_runtime": 75.36,
"eval_samples_per_second": 530.785,
"eval_steps_per_second": 8.294,
"step": 5120
},
{
"epoch": 10.68,
"learning_rate": 4.550751072961374e-06,
"loss": 1.5191,
"step": 5130
},
{
"epoch": 10.7,
"learning_rate": 4.540128755364806e-06,
"loss": 1.5176,
"step": 5140
},
{
"epoch": 10.72,
"learning_rate": 4.52950643776824e-06,
"loss": 1.5161,
"step": 5150
},
{
"epoch": 10.75,
"learning_rate": 4.518884120171674e-06,
"loss": 1.521,
"step": 5160
},
{
"epoch": 10.77,
"learning_rate": 4.508261802575106e-06,
"loss": 1.5174,
"step": 5170
},
{
"epoch": 10.79,
"learning_rate": 4.49763948497854e-06,
"loss": 1.5192,
"step": 5180
},
{
"epoch": 10.81,
"learning_rate": 4.487017167381974e-06,
"loss": 1.5172,
"step": 5190
},
{
"epoch": 10.83,
"learning_rate": 4.476394849785408e-06,
"loss": 1.5201,
"step": 5200
},
{
"epoch": 10.85,
"learning_rate": 4.4657725321888405e-06,
"loss": 1.5213,
"step": 5210
},
{
"epoch": 10.87,
"learning_rate": 4.455150214592274e-06,
"loss": 1.5197,
"step": 5220
},
{
"epoch": 10.9,
"learning_rate": 4.444527896995708e-06,
"loss": 1.517,
"step": 5230
},
{
"epoch": 10.92,
"learning_rate": 4.4339055793991414e-06,
"loss": 1.5144,
"step": 5240
},
{
"epoch": 10.94,
"learning_rate": 4.423283261802575e-06,
"loss": 1.5191,
"step": 5250
},
{
"epoch": 10.96,
"learning_rate": 4.4126609442060085e-06,
"loss": 1.5206,
"step": 5260
},
{
"epoch": 10.98,
"learning_rate": 4.4020386266094416e-06,
"loss": 1.5202,
"step": 5270
},
{
"epoch": 11.0,
"learning_rate": 4.3914163090128755e-06,
"loss": 1.6571,
"step": 5280
},
{
"epoch": 11.0,
"eval_loss": 1.4120959043502808,
"eval_runtime": 75.422,
"eval_samples_per_second": 530.349,
"eval_steps_per_second": 8.287,
"step": 5280
},
{
"epoch": 11.03,
"learning_rate": 4.3807939914163095e-06,
"loss": 1.5169,
"step": 5290
},
{
"epoch": 11.05,
"learning_rate": 4.370171673819742e-06,
"loss": 1.5192,
"step": 5300
},
{
"epoch": 11.07,
"learning_rate": 4.359549356223176e-06,
"loss": 1.5176,
"step": 5310
},
{
"epoch": 11.09,
"learning_rate": 4.34892703862661e-06,
"loss": 1.5213,
"step": 5320
},
{
"epoch": 11.11,
"learning_rate": 4.3383047210300435e-06,
"loss": 1.5185,
"step": 5330
},
{
"epoch": 11.13,
"learning_rate": 4.327682403433476e-06,
"loss": 1.5159,
"step": 5340
},
{
"epoch": 11.15,
"learning_rate": 4.31706008583691e-06,
"loss": 1.5148,
"step": 5350
},
{
"epoch": 11.18,
"learning_rate": 4.306437768240344e-06,
"loss": 1.5167,
"step": 5360
},
{
"epoch": 11.2,
"learning_rate": 4.295815450643777e-06,
"loss": 1.5164,
"step": 5370
},
{
"epoch": 11.22,
"learning_rate": 4.28519313304721e-06,
"loss": 1.5171,
"step": 5380
},
{
"epoch": 11.24,
"learning_rate": 4.274570815450644e-06,
"loss": 1.5181,
"step": 5390
},
{
"epoch": 11.26,
"learning_rate": 4.263948497854077e-06,
"loss": 1.5188,
"step": 5400
},
{
"epoch": 11.28,
"learning_rate": 4.253326180257511e-06,
"loss": 1.5194,
"step": 5410
},
{
"epoch": 11.3,
"learning_rate": 4.242703862660944e-06,
"loss": 1.5175,
"step": 5420
},
{
"epoch": 11.33,
"learning_rate": 4.232081545064378e-06,
"loss": 1.5138,
"step": 5430
},
{
"epoch": 11.35,
"learning_rate": 4.221459227467811e-06,
"loss": 1.5184,
"step": 5440
},
{
"epoch": 11.35,
"eval_loss": 1.4102325439453125,
"eval_runtime": 75.0945,
"eval_samples_per_second": 532.662,
"eval_steps_per_second": 8.323,
"step": 5440
},
{
"epoch": 11.37,
"learning_rate": 4.210836909871245e-06,
"loss": 1.5191,
"step": 5450
},
{
"epoch": 11.39,
"learning_rate": 4.200214592274679e-06,
"loss": 1.5189,
"step": 5460
},
{
"epoch": 11.41,
"learning_rate": 4.189592274678111e-06,
"loss": 1.517,
"step": 5470
},
{
"epoch": 11.43,
"learning_rate": 4.178969957081545e-06,
"loss": 1.5159,
"step": 5480
},
{
"epoch": 11.45,
"learning_rate": 4.168347639484979e-06,
"loss": 1.5183,
"step": 5490
},
{
"epoch": 11.48,
"learning_rate": 4.157725321888411e-06,
"loss": 1.5185,
"step": 5500
},
{
"epoch": 11.5,
"learning_rate": 4.147103004291845e-06,
"loss": 1.5148,
"step": 5510
},
{
"epoch": 11.52,
"learning_rate": 4.136480686695279e-06,
"loss": 1.5146,
"step": 5520
},
{
"epoch": 11.54,
"learning_rate": 4.125858369098712e-06,
"loss": 1.5176,
"step": 5530
},
{
"epoch": 11.56,
"learning_rate": 4.115236051502145e-06,
"loss": 1.5153,
"step": 5540
},
{
"epoch": 11.58,
"learning_rate": 4.104613733905579e-06,
"loss": 1.5208,
"step": 5550
},
{
"epoch": 11.6,
"learning_rate": 4.093991416309013e-06,
"loss": 1.5162,
"step": 5560
},
{
"epoch": 11.63,
"learning_rate": 4.083369098712446e-06,
"loss": 1.5196,
"step": 5570
},
{
"epoch": 11.65,
"learning_rate": 4.07274678111588e-06,
"loss": 1.517,
"step": 5580
},
{
"epoch": 11.67,
"learning_rate": 4.062124463519313e-06,
"loss": 1.5217,
"step": 5590
},
{
"epoch": 11.69,
"learning_rate": 4.051502145922746e-06,
"loss": 1.5185,
"step": 5600
},
{
"epoch": 11.69,
"eval_loss": 1.4111074209213257,
"eval_runtime": 74.9661,
"eval_samples_per_second": 533.575,
"eval_steps_per_second": 8.337,
"step": 5600
},
{
"epoch": 11.71,
"learning_rate": 4.04087982832618e-06,
"loss": 1.5178,
"step": 5610
},
{
"epoch": 11.73,
"learning_rate": 4.030257510729614e-06,
"loss": 1.5198,
"step": 5620
},
{
"epoch": 11.75,
"learning_rate": 4.0196351931330465e-06,
"loss": 1.5173,
"step": 5630
},
{
"epoch": 11.78,
"learning_rate": 4.0090128755364805e-06,
"loss": 1.5193,
"step": 5640
},
{
"epoch": 11.8,
"learning_rate": 3.998390557939914e-06,
"loss": 1.5197,
"step": 5650
},
{
"epoch": 11.82,
"learning_rate": 3.987768240343348e-06,
"loss": 1.516,
"step": 5660
},
{
"epoch": 11.84,
"learning_rate": 3.977145922746781e-06,
"loss": 1.5179,
"step": 5670
},
{
"epoch": 11.86,
"learning_rate": 3.9665236051502145e-06,
"loss": 1.516,
"step": 5680
},
{
"epoch": 11.88,
"learning_rate": 3.9559012875536485e-06,
"loss": 1.5155,
"step": 5690
},
{
"epoch": 11.9,
"learning_rate": 3.945278969957081e-06,
"loss": 1.5188,
"step": 5700
},
{
"epoch": 11.93,
"learning_rate": 3.934656652360515e-06,
"loss": 1.5197,
"step": 5710
},
{
"epoch": 11.95,
"learning_rate": 3.924034334763949e-06,
"loss": 1.5186,
"step": 5720
},
{
"epoch": 11.97,
"learning_rate": 3.913412017167382e-06,
"loss": 1.5194,
"step": 5730
},
{
"epoch": 11.99,
"learning_rate": 3.902789699570816e-06,
"loss": 1.519,
"step": 5740
},
{
"epoch": 12.01,
"learning_rate": 3.89216738197425e-06,
"loss": 1.6582,
"step": 5750
},
{
"epoch": 12.03,
"learning_rate": 3.881545064377682e-06,
"loss": 1.5172,
"step": 5760
},
{
"epoch": 12.03,
"eval_loss": 1.414238452911377,
"eval_runtime": 75.0838,
"eval_samples_per_second": 532.738,
"eval_steps_per_second": 8.324,
"step": 5760
},
{
"epoch": 12.06,
"learning_rate": 3.870922746781116e-06,
"loss": 1.5144,
"step": 5770
},
{
"epoch": 12.08,
"learning_rate": 3.86030042918455e-06,
"loss": 1.5208,
"step": 5780
},
{
"epoch": 12.1,
"learning_rate": 3.849678111587984e-06,
"loss": 1.5146,
"step": 5790
},
{
"epoch": 12.12,
"learning_rate": 3.839055793991416e-06,
"loss": 1.5187,
"step": 5800
},
{
"epoch": 12.14,
"learning_rate": 3.82843347639485e-06,
"loss": 1.5155,
"step": 5810
},
{
"epoch": 12.16,
"learning_rate": 3.817811158798284e-06,
"loss": 1.5178,
"step": 5820
},
{
"epoch": 12.18,
"learning_rate": 3.8071888412017165e-06,
"loss": 1.5184,
"step": 5830
},
{
"epoch": 12.21,
"learning_rate": 3.7965665236051504e-06,
"loss": 1.5136,
"step": 5840
},
{
"epoch": 12.23,
"learning_rate": 3.785944206008584e-06,
"loss": 1.5176,
"step": 5850
},
{
"epoch": 12.25,
"learning_rate": 3.775321888412017e-06,
"loss": 1.5168,
"step": 5860
},
{
"epoch": 12.27,
"learning_rate": 3.7646995708154505e-06,
"loss": 1.5187,
"step": 5870
},
{
"epoch": 12.29,
"learning_rate": 3.754077253218884e-06,
"loss": 1.5168,
"step": 5880
},
{
"epoch": 12.31,
"learning_rate": 3.743454935622318e-06,
"loss": 1.5164,
"step": 5890
},
{
"epoch": 12.33,
"learning_rate": 3.7328326180257507e-06,
"loss": 1.5146,
"step": 5900
},
{
"epoch": 12.36,
"learning_rate": 3.722210300429185e-06,
"loss": 1.5185,
"step": 5910
},
{
"epoch": 12.38,
"learning_rate": 3.7115879828326186e-06,
"loss": 1.5189,
"step": 5920
},
{
"epoch": 12.38,
"eval_loss": 1.4129457473754883,
"eval_runtime": 75.5813,
"eval_samples_per_second": 529.231,
"eval_steps_per_second": 8.269,
"step": 5920
},
{
"epoch": 12.4,
"learning_rate": 3.7009656652360516e-06,
"loss": 1.5164,
"step": 5930
},
{
"epoch": 12.42,
"learning_rate": 3.6903433476394847e-06,
"loss": 1.5186,
"step": 5940
},
{
"epoch": 12.44,
"learning_rate": 3.6797210300429187e-06,
"loss": 1.5153,
"step": 5950
},
{
"epoch": 12.46,
"learning_rate": 3.6690987124463514e-06,
"loss": 1.5184,
"step": 5960
},
{
"epoch": 12.48,
"learning_rate": 3.6584763948497853e-06,
"loss": 1.5147,
"step": 5970
},
{
"epoch": 12.51,
"learning_rate": 3.6478540772532197e-06,
"loss": 1.5169,
"step": 5980
},
{
"epoch": 12.53,
"learning_rate": 3.637231759656652e-06,
"loss": 1.516,
"step": 5990
},
{
"epoch": 12.55,
"learning_rate": 3.6266094420600854e-06,
"loss": 1.5143,
"step": 6000
},
{
"epoch": 12.57,
"learning_rate": 3.6159871244635194e-06,
"loss": 1.5171,
"step": 6010
},
{
"epoch": 12.59,
"learning_rate": 3.6053648068669533e-06,
"loss": 1.5169,
"step": 6020
},
{
"epoch": 12.61,
"learning_rate": 3.594742489270386e-06,
"loss": 1.5168,
"step": 6030
},
{
"epoch": 12.63,
"learning_rate": 3.58412017167382e-06,
"loss": 1.5174,
"step": 6040
},
{
"epoch": 12.66,
"learning_rate": 3.5734978540772534e-06,
"loss": 1.5191,
"step": 6050
},
{
"epoch": 12.68,
"learning_rate": 3.562875536480686e-06,
"loss": 1.5196,
"step": 6060
},
{
"epoch": 12.7,
"learning_rate": 3.55225321888412e-06,
"loss": 1.5149,
"step": 6070
},
{
"epoch": 12.72,
"learning_rate": 3.541630901287554e-06,
"loss": 1.5147,
"step": 6080
},
{
"epoch": 12.72,
"eval_loss": 1.4089046716690063,
"eval_runtime": 75.931,
"eval_samples_per_second": 526.794,
"eval_steps_per_second": 8.231,
"step": 6080
},
{
"epoch": 12.74,
"learning_rate": 3.5310085836909867e-06,
"loss": 1.5196,
"step": 6090
},
{
"epoch": 12.76,
"learning_rate": 3.5203862660944198e-06,
"loss": 1.5154,
"step": 6100
},
{
"epoch": 12.78,
"learning_rate": 3.509763948497854e-06,
"loss": 1.5179,
"step": 6110
},
{
"epoch": 12.81,
"learning_rate": 3.499141630901288e-06,
"loss": 1.5162,
"step": 6120
},
{
"epoch": 12.83,
"learning_rate": 3.4885193133047207e-06,
"loss": 1.5202,
"step": 6130
},
{
"epoch": 12.85,
"learning_rate": 3.4778969957081547e-06,
"loss": 1.5191,
"step": 6140
},
{
"epoch": 12.87,
"learning_rate": 3.4672746781115886e-06,
"loss": 1.5194,
"step": 6150
},
{
"epoch": 12.89,
"learning_rate": 3.456652360515021e-06,
"loss": 1.5155,
"step": 6160
},
{
"epoch": 12.91,
"learning_rate": 3.4460300429184544e-06,
"loss": 1.5197,
"step": 6170
},
{
"epoch": 12.93,
"learning_rate": 3.4354077253218888e-06,
"loss": 1.5175,
"step": 6180
},
{
"epoch": 12.96,
"learning_rate": 3.424785407725321e-06,
"loss": 1.5163,
"step": 6190
},
{
"epoch": 12.98,
"learning_rate": 3.4141630901287554e-06,
"loss": 1.5197,
"step": 6200
},
{
"epoch": 13.0,
"learning_rate": 3.403540772532189e-06,
"loss": 1.5171,
"step": 6210
},
{
"epoch": 13.02,
"learning_rate": 3.3929184549356216e-06,
"loss": 1.6585,
"step": 6220
},
{
"epoch": 13.04,
"learning_rate": 3.3822961373390555e-06,
"loss": 1.5188,
"step": 6230
},
{
"epoch": 13.06,
"learning_rate": 3.371673819742489e-06,
"loss": 1.5177,
"step": 6240
},
{
"epoch": 13.06,
"eval_loss": 1.4098169803619385,
"eval_runtime": 75.6507,
"eval_samples_per_second": 528.746,
"eval_steps_per_second": 8.262,
"step": 6240
},
{
"epoch": 13.09,
"learning_rate": 3.361051502145923e-06,
"loss": 1.5155,
"step": 6250
},
{
"epoch": 13.11,
"learning_rate": 3.3504291845493556e-06,
"loss": 1.5182,
"step": 6260
},
{
"epoch": 13.13,
"learning_rate": 3.3398068669527896e-06,
"loss": 1.5186,
"step": 6270
},
{
"epoch": 13.15,
"learning_rate": 3.3291845493562235e-06,
"loss": 1.5174,
"step": 6280
},
{
"epoch": 13.17,
"learning_rate": 3.318562231759656e-06,
"loss": 1.5167,
"step": 6290
},
{
"epoch": 13.19,
"learning_rate": 3.30793991416309e-06,
"loss": 1.516,
"step": 6300
},
{
"epoch": 13.21,
"learning_rate": 3.2973175965665236e-06,
"loss": 1.517,
"step": 6310
},
{
"epoch": 13.24,
"learning_rate": 3.2866952789699567e-06,
"loss": 1.5188,
"step": 6320
},
{
"epoch": 13.26,
"learning_rate": 3.2760729613733907e-06,
"loss": 1.5159,
"step": 6330
},
{
"epoch": 13.28,
"learning_rate": 3.265450643776824e-06,
"loss": 1.5168,
"step": 6340
},
{
"epoch": 13.3,
"learning_rate": 3.254828326180258e-06,
"loss": 1.516,
"step": 6350
},
{
"epoch": 13.32,
"learning_rate": 3.244206008583691e-06,
"loss": 1.5141,
"step": 6360
},
{
"epoch": 13.34,
"learning_rate": 3.2335836909871243e-06,
"loss": 1.5172,
"step": 6370
},
{
"epoch": 13.36,
"learning_rate": 3.2229613733905583e-06,
"loss": 1.5169,
"step": 6380
},
{
"epoch": 13.39,
"learning_rate": 3.212339055793991e-06,
"loss": 1.5169,
"step": 6390
},
{
"epoch": 13.41,
"learning_rate": 3.2017167381974253e-06,
"loss": 1.5164,
"step": 6400
},
{
"epoch": 13.41,
"eval_loss": 1.40966796875,
"eval_runtime": 75.1896,
"eval_samples_per_second": 531.988,
"eval_steps_per_second": 8.312,
"step": 6400
},
{
"epoch": 13.43,
"learning_rate": 3.191094420600859e-06,
"loss": 1.5162,
"step": 6410
},
{
"epoch": 13.45,
"learning_rate": 3.180472103004292e-06,
"loss": 1.5146,
"step": 6420
},
{
"epoch": 13.47,
"learning_rate": 3.169849785407725e-06,
"loss": 1.5186,
"step": 6430
},
{
"epoch": 13.49,
"learning_rate": 3.159227467811159e-06,
"loss": 1.5142,
"step": 6440
},
{
"epoch": 13.51,
"learning_rate": 3.1486051502145916e-06,
"loss": 1.5143,
"step": 6450
},
{
"epoch": 13.54,
"learning_rate": 3.1379828326180256e-06,
"loss": 1.5159,
"step": 6460
},
{
"epoch": 13.56,
"learning_rate": 3.12736051502146e-06,
"loss": 1.5143,
"step": 6470
},
{
"epoch": 13.58,
"learning_rate": 3.1167381974248926e-06,
"loss": 1.5186,
"step": 6480
},
{
"epoch": 13.6,
"learning_rate": 3.1061158798283257e-06,
"loss": 1.5153,
"step": 6490
},
{
"epoch": 13.62,
"learning_rate": 3.0954935622317596e-06,
"loss": 1.5184,
"step": 6500
},
{
"epoch": 13.64,
"learning_rate": 3.0848712446351936e-06,
"loss": 1.5186,
"step": 6510
},
{
"epoch": 13.66,
"learning_rate": 3.0742489270386263e-06,
"loss": 1.5228,
"step": 6520
},
{
"epoch": 13.69,
"learning_rate": 3.0636266094420598e-06,
"loss": 1.5177,
"step": 6530
},
{
"epoch": 13.71,
"learning_rate": 3.0530042918454937e-06,
"loss": 1.5168,
"step": 6540
},
{
"epoch": 13.73,
"learning_rate": 3.0423819742489264e-06,
"loss": 1.5179,
"step": 6550
},
{
"epoch": 13.75,
"learning_rate": 3.0317596566523603e-06,
"loss": 1.5188,
"step": 6560
},
{
"epoch": 13.75,
"eval_loss": 1.4108576774597168,
"eval_runtime": 74.9353,
"eval_samples_per_second": 533.794,
"eval_steps_per_second": 8.341,
"step": 6560
},
{
"epoch": 13.77,
"learning_rate": 3.0211373390557943e-06,
"loss": 1.5163,
"step": 6570
},
{
"epoch": 13.79,
"learning_rate": 3.010515021459227e-06,
"loss": 1.5186,
"step": 6580
},
{
"epoch": 13.81,
"learning_rate": 2.99989270386266e-06,
"loss": 1.5162,
"step": 6590
},
{
"epoch": 13.84,
"learning_rate": 2.9892703862660944e-06,
"loss": 1.5188,
"step": 6600
},
{
"epoch": 13.86,
"learning_rate": 2.9786480686695283e-06,
"loss": 1.5164,
"step": 6610
},
{
"epoch": 13.88,
"learning_rate": 2.968025751072961e-06,
"loss": 1.5167,
"step": 6620
},
{
"epoch": 13.9,
"learning_rate": 2.957403433476395e-06,
"loss": 1.5175,
"step": 6630
},
{
"epoch": 13.92,
"learning_rate": 2.946781115879829e-06,
"loss": 1.5165,
"step": 6640
},
{
"epoch": 13.94,
"learning_rate": 2.936158798283261e-06,
"loss": 1.5203,
"step": 6650
},
{
"epoch": 13.96,
"learning_rate": 2.9255364806866947e-06,
"loss": 1.5196,
"step": 6660
},
{
"epoch": 13.99,
"learning_rate": 2.914914163090129e-06,
"loss": 1.5157,
"step": 6670
},
{
"epoch": 14.01,
"learning_rate": 2.9042918454935613e-06,
"loss": 1.6583,
"step": 6680
},
{
"epoch": 14.03,
"learning_rate": 2.8936695278969956e-06,
"loss": 1.518,
"step": 6690
},
{
"epoch": 14.05,
"learning_rate": 2.883047210300429e-06,
"loss": 1.5165,
"step": 6700
},
{
"epoch": 14.07,
"learning_rate": 2.8724248927038627e-06,
"loss": 1.5166,
"step": 6710
},
{
"epoch": 14.09,
"learning_rate": 2.8618025751072958e-06,
"loss": 1.5158,
"step": 6720
},
{
"epoch": 14.09,
"eval_loss": 1.4134007692337036,
"eval_runtime": 74.8707,
"eval_samples_per_second": 534.254,
"eval_steps_per_second": 8.348,
"step": 6720
},
{
"epoch": 14.12,
"learning_rate": 2.8511802575107293e-06,
"loss": 1.5155,
"step": 6730
},
{
"epoch": 14.14,
"learning_rate": 2.8405579399141632e-06,
"loss": 1.5165,
"step": 6740
},
{
"epoch": 14.16,
"learning_rate": 2.829935622317596e-06,
"loss": 1.5155,
"step": 6750
},
{
"epoch": 14.18,
"learning_rate": 2.81931330472103e-06,
"loss": 1.5167,
"step": 6760
},
{
"epoch": 14.2,
"learning_rate": 2.8086909871244638e-06,
"loss": 1.5166,
"step": 6770
},
{
"epoch": 14.22,
"learning_rate": 2.7980686695278965e-06,
"loss": 1.5153,
"step": 6780
},
{
"epoch": 14.24,
"learning_rate": 2.7874463519313304e-06,
"loss": 1.5183,
"step": 6790
},
{
"epoch": 14.27,
"learning_rate": 2.776824034334764e-06,
"loss": 1.5178,
"step": 6800
},
{
"epoch": 14.29,
"learning_rate": 2.766201716738197e-06,
"loss": 1.5166,
"step": 6810
},
{
"epoch": 14.31,
"learning_rate": 2.755579399141631e-06,
"loss": 1.5171,
"step": 6820
},
{
"epoch": 14.33,
"learning_rate": 2.7449570815450645e-06,
"loss": 1.5144,
"step": 6830
},
{
"epoch": 14.35,
"learning_rate": 2.7343347639484984e-06,
"loss": 1.5171,
"step": 6840
},
{
"epoch": 14.37,
"learning_rate": 2.723712446351931e-06,
"loss": 1.5167,
"step": 6850
},
{
"epoch": 14.39,
"learning_rate": 2.7130901287553646e-06,
"loss": 1.5165,
"step": 6860
},
{
"epoch": 14.42,
"learning_rate": 2.7024678111587985e-06,
"loss": 1.5195,
"step": 6870
},
{
"epoch": 14.44,
"learning_rate": 2.691845493562231e-06,
"loss": 1.5134,
"step": 6880
},
{
"epoch": 14.44,
"eval_loss": 1.4091237783432007,
"eval_runtime": 74.8243,
"eval_samples_per_second": 534.586,
"eval_steps_per_second": 8.353,
"step": 6880
},
{
"epoch": 14.46,
"learning_rate": 2.6812231759656656e-06,
"loss": 1.519,
"step": 6890
},
{
"epoch": 14.48,
"learning_rate": 2.670600858369099e-06,
"loss": 1.5172,
"step": 6900
},
{
"epoch": 14.5,
"learning_rate": 2.659978540772532e-06,
"loss": 1.5146,
"step": 6910
},
{
"epoch": 14.52,
"learning_rate": 2.6493562231759653e-06,
"loss": 1.5169,
"step": 6920
},
{
"epoch": 14.54,
"learning_rate": 2.6387339055793992e-06,
"loss": 1.5138,
"step": 6930
},
{
"epoch": 14.57,
"learning_rate": 2.6281115879828336e-06,
"loss": 1.5179,
"step": 6940
},
{
"epoch": 14.59,
"learning_rate": 2.617489270386266e-06,
"loss": 1.5163,
"step": 6950
},
{
"epoch": 14.61,
"learning_rate": 2.6068669527897e-06,
"loss": 1.5131,
"step": 6960
},
{
"epoch": 14.63,
"learning_rate": 2.596244635193133e-06,
"loss": 1.5183,
"step": 6970
},
{
"epoch": 14.65,
"learning_rate": 2.585622317596566e-06,
"loss": 1.5165,
"step": 6980
},
{
"epoch": 14.67,
"learning_rate": 2.575e-06,
"loss": 1.5195,
"step": 6990
},
{
"epoch": 14.69,
"learning_rate": 2.564377682403434e-06,
"loss": 1.517,
"step": 7000
},
{
"epoch": 14.72,
"learning_rate": 2.5537553648068665e-06,
"loss": 1.5183,
"step": 7010
},
{
"epoch": 14.74,
"learning_rate": 2.5431330472103e-06,
"loss": 1.5186,
"step": 7020
},
{
"epoch": 14.76,
"learning_rate": 2.532510729613734e-06,
"loss": 1.5175,
"step": 7030
},
{
"epoch": 14.78,
"learning_rate": 2.5218884120171667e-06,
"loss": 1.5167,
"step": 7040
},
{
"epoch": 14.78,
"eval_loss": 1.4089406728744507,
"eval_runtime": 74.9005,
"eval_samples_per_second": 534.042,
"eval_steps_per_second": 8.344,
"step": 7040
},
{
"epoch": 14.8,
"learning_rate": 2.5112660944206006e-06,
"loss": 1.5191,
"step": 7050
},
{
"epoch": 14.82,
"learning_rate": 2.5006437768240345e-06,
"loss": 1.5178,
"step": 7060
},
{
"epoch": 14.84,
"learning_rate": 2.490021459227468e-06,
"loss": 1.517,
"step": 7070
},
{
"epoch": 14.87,
"learning_rate": 2.4793991416309007e-06,
"loss": 1.5222,
"step": 7080
},
{
"epoch": 14.89,
"learning_rate": 2.4687768240343347e-06,
"loss": 1.5156,
"step": 7090
},
{
"epoch": 14.91,
"learning_rate": 2.4581545064377686e-06,
"loss": 1.5181,
"step": 7100
},
{
"epoch": 14.93,
"learning_rate": 2.4475321888412013e-06,
"loss": 1.5154,
"step": 7110
},
{
"epoch": 14.95,
"learning_rate": 2.4369098712446352e-06,
"loss": 1.5175,
"step": 7120
},
{
"epoch": 14.97,
"learning_rate": 2.4262875536480687e-06,
"loss": 1.5198,
"step": 7130
},
{
"epoch": 14.99,
"learning_rate": 2.4156652360515014e-06,
"loss": 1.5161,
"step": 7140
},
{
"epoch": 15.02,
"learning_rate": 2.4050429184549354e-06,
"loss": 1.6582,
"step": 7150
},
{
"epoch": 15.04,
"learning_rate": 2.3944206008583693e-06,
"loss": 1.516,
"step": 7160
},
{
"epoch": 15.06,
"learning_rate": 2.383798283261803e-06,
"loss": 1.5172,
"step": 7170
},
{
"epoch": 15.08,
"learning_rate": 2.373175965665236e-06,
"loss": 1.5129,
"step": 7180
},
{
"epoch": 15.1,
"learning_rate": 2.3625536480686694e-06,
"loss": 1.5181,
"step": 7190
},
{
"epoch": 15.12,
"learning_rate": 2.3519313304721034e-06,
"loss": 1.5163,
"step": 7200
},
{
"epoch": 15.12,
"eval_loss": 1.4139931201934814,
"eval_runtime": 74.8677,
"eval_samples_per_second": 534.276,
"eval_steps_per_second": 8.348,
"step": 7200
},
{
"epoch": 15.15,
"learning_rate": 2.341309012875536e-06,
"loss": 1.5156,
"step": 7210
},
{
"epoch": 15.17,
"learning_rate": 2.33068669527897e-06,
"loss": 1.5178,
"step": 7220
},
{
"epoch": 15.19,
"learning_rate": 2.3200643776824035e-06,
"loss": 1.5163,
"step": 7230
},
{
"epoch": 15.21,
"learning_rate": 2.3094420600858366e-06,
"loss": 1.5156,
"step": 7240
},
{
"epoch": 15.23,
"learning_rate": 2.29881974248927e-06,
"loss": 1.5184,
"step": 7250
},
{
"epoch": 15.25,
"learning_rate": 2.288197424892704e-06,
"loss": 1.5173,
"step": 7260
},
{
"epoch": 15.27,
"learning_rate": 2.2775751072961367e-06,
"loss": 1.5169,
"step": 7270
},
{
"epoch": 15.3,
"learning_rate": 2.2669527896995707e-06,
"loss": 1.5184,
"step": 7280
},
{
"epoch": 15.32,
"learning_rate": 2.256330472103004e-06,
"loss": 1.5125,
"step": 7290
},
{
"epoch": 15.34,
"learning_rate": 2.245708154506438e-06,
"loss": 1.5161,
"step": 7300
},
{
"epoch": 15.36,
"learning_rate": 2.235085836909871e-06,
"loss": 1.5154,
"step": 7310
},
{
"epoch": 15.38,
"learning_rate": 2.2244635193133047e-06,
"loss": 1.518,
"step": 7320
},
{
"epoch": 15.4,
"learning_rate": 2.2138412017167387e-06,
"loss": 1.5171,
"step": 7330
},
{
"epoch": 15.42,
"learning_rate": 2.2032188841201713e-06,
"loss": 1.5158,
"step": 7340
},
{
"epoch": 15.45,
"learning_rate": 2.192596566523605e-06,
"loss": 1.5165,
"step": 7350
},
{
"epoch": 15.47,
"learning_rate": 2.181974248927039e-06,
"loss": 1.5172,
"step": 7360
},
{
"epoch": 15.47,
"eval_loss": 1.4083021879196167,
"eval_runtime": 74.9039,
"eval_samples_per_second": 534.018,
"eval_steps_per_second": 8.344,
"step": 7360
},
{
"epoch": 15.49,
"learning_rate": 2.1713519313304715e-06,
"loss": 1.5105,
"step": 7370
},
{
"epoch": 15.51,
"learning_rate": 2.1607296137339054e-06,
"loss": 1.5149,
"step": 7380
},
{
"epoch": 15.53,
"learning_rate": 2.1501072961373394e-06,
"loss": 1.5168,
"step": 7390
},
{
"epoch": 15.55,
"learning_rate": 2.139484978540773e-06,
"loss": 1.5151,
"step": 7400
},
{
"epoch": 15.57,
"learning_rate": 2.1288626609442056e-06,
"loss": 1.5163,
"step": 7410
},
{
"epoch": 15.6,
"learning_rate": 2.1182403433476395e-06,
"loss": 1.5149,
"step": 7420
},
{
"epoch": 15.62,
"learning_rate": 2.1076180257510734e-06,
"loss": 1.516,
"step": 7430
},
{
"epoch": 15.64,
"learning_rate": 2.096995708154506e-06,
"loss": 1.5134,
"step": 7440
},
{
"epoch": 15.66,
"learning_rate": 2.08637339055794e-06,
"loss": 1.5204,
"step": 7450
},
{
"epoch": 15.68,
"learning_rate": 2.0757510729613736e-06,
"loss": 1.5173,
"step": 7460
},
{
"epoch": 15.7,
"learning_rate": 2.0651287553648062e-06,
"loss": 1.5172,
"step": 7470
},
{
"epoch": 15.72,
"learning_rate": 2.05450643776824e-06,
"loss": 1.5164,
"step": 7480
},
{
"epoch": 15.75,
"learning_rate": 2.043884120171674e-06,
"loss": 1.5144,
"step": 7490
},
{
"epoch": 15.77,
"learning_rate": 2.033261802575107e-06,
"loss": 1.515,
"step": 7500
},
{
"epoch": 15.79,
"learning_rate": 2.0226394849785403e-06,
"loss": 1.5178,
"step": 7510
},
{
"epoch": 15.81,
"learning_rate": 2.0120171673819743e-06,
"loss": 1.5153,
"step": 7520
},
{
"epoch": 15.81,
"eval_loss": 1.4108531475067139,
"eval_runtime": 74.8832,
"eval_samples_per_second": 534.165,
"eval_steps_per_second": 8.346,
"step": 7520
},
{
"epoch": 15.83,
"learning_rate": 2.001394849785408e-06,
"loss": 1.5179,
"step": 7530
},
{
"epoch": 15.85,
"learning_rate": 1.990772532188841e-06,
"loss": 1.5193,
"step": 7540
},
{
"epoch": 15.87,
"learning_rate": 1.980150214592275e-06,
"loss": 1.516,
"step": 7550
},
{
"epoch": 15.9,
"learning_rate": 1.9695278969957087e-06,
"loss": 1.5181,
"step": 7560
},
{
"epoch": 15.92,
"learning_rate": 1.9589055793991414e-06,
"loss": 1.5148,
"step": 7570
},
{
"epoch": 15.94,
"learning_rate": 1.948283261802575e-06,
"loss": 1.516,
"step": 7580
},
{
"epoch": 15.96,
"learning_rate": 1.937660944206009e-06,
"loss": 1.5202,
"step": 7590
},
{
"epoch": 15.98,
"learning_rate": 1.9270386266094416e-06,
"loss": 1.5186,
"step": 7600
},
{
"epoch": 16.0,
"learning_rate": 1.9164163090128755e-06,
"loss": 1.6598,
"step": 7610
},
{
"epoch": 16.03,
"learning_rate": 1.9057939914163096e-06,
"loss": 1.5168,
"step": 7620
},
{
"epoch": 16.05,
"learning_rate": 1.895171673819742e-06,
"loss": 1.5168,
"step": 7630
},
{
"epoch": 16.07,
"learning_rate": 1.8845493562231756e-06,
"loss": 1.5153,
"step": 7640
},
{
"epoch": 16.09,
"learning_rate": 1.8739270386266096e-06,
"loss": 1.5147,
"step": 7650
},
{
"epoch": 16.11,
"learning_rate": 1.8633047210300433e-06,
"loss": 1.5162,
"step": 7660
},
{
"epoch": 16.13,
"learning_rate": 1.8526824034334762e-06,
"loss": 1.5151,
"step": 7670
},
{
"epoch": 16.15,
"learning_rate": 1.8420600858369101e-06,
"loss": 1.5164,
"step": 7680
},
{
"epoch": 16.15,
"eval_loss": 1.4092837572097778,
"eval_runtime": 74.9552,
"eval_samples_per_second": 533.652,
"eval_steps_per_second": 8.338,
"step": 7680
},
{
"epoch": 16.18,
"learning_rate": 1.8314377682403436e-06,
"loss": 1.5168,
"step": 7690
},
{
"epoch": 16.2,
"learning_rate": 1.8208154506437763e-06,
"loss": 1.5179,
"step": 7700
},
{
"epoch": 16.22,
"learning_rate": 1.8101931330472102e-06,
"loss": 1.515,
"step": 7710
},
{
"epoch": 16.24,
"learning_rate": 1.7995708154506442e-06,
"loss": 1.5175,
"step": 7720
},
{
"epoch": 16.26,
"learning_rate": 1.7889484978540769e-06,
"loss": 1.5177,
"step": 7730
},
{
"epoch": 16.28,
"learning_rate": 1.7783261802575102e-06,
"loss": 1.5157,
"step": 7740
},
{
"epoch": 16.3,
"learning_rate": 1.7677038626609443e-06,
"loss": 1.5161,
"step": 7750
},
{
"epoch": 16.33,
"learning_rate": 1.7570815450643783e-06,
"loss": 1.5166,
"step": 7760
},
{
"epoch": 16.02,
"learning_rate": 1.7464592274678111e-06,
"loss": 1.5157,
"step": 7770
},
{
"epoch": 16.04,
"learning_rate": 1.7358369098712449e-06,
"loss": 1.5163,
"step": 7780
},
{
"epoch": 16.06,
"learning_rate": 1.7252145922746786e-06,
"loss": 1.5171,
"step": 7790
},
{
"epoch": 16.09,
"learning_rate": 1.714592274678111e-06,
"loss": 1.516,
"step": 7800
},
{
"epoch": 16.11,
"learning_rate": 1.7039699570815448e-06,
"loss": 1.5139,
"step": 7810
},
{
"epoch": 16.13,
"learning_rate": 1.693347639484979e-06,
"loss": 1.5152,
"step": 7820
},
{
"epoch": 16.15,
"learning_rate": 1.6827253218884116e-06,
"loss": 1.5139,
"step": 7830
},
{
"epoch": 16.17,
"learning_rate": 1.6721030042918458e-06,
"loss": 1.5164,
"step": 7840
},
{
"epoch": 16.17,
"eval_loss": 1.4107787609100342,
"eval_runtime": 90.0318,
"eval_samples_per_second": 444.288,
"eval_steps_per_second": 6.942,
"step": 7840
},
{
"epoch": 16.19,
"learning_rate": 1.661480686695279e-06,
"loss": 1.5149,
"step": 7850
},
{
"epoch": 16.21,
"learning_rate": 1.6508583690987118e-06,
"loss": 1.5136,
"step": 7860
},
{
"epoch": 16.24,
"learning_rate": 1.6402360515021457e-06,
"loss": 1.5168,
"step": 7870
},
{
"epoch": 16.26,
"learning_rate": 1.6296137339055796e-06,
"loss": 1.5128,
"step": 7880
},
{
"epoch": 16.28,
"learning_rate": 1.6189914163090132e-06,
"loss": 1.5193,
"step": 7890
},
{
"epoch": 16.3,
"learning_rate": 1.6083690987124462e-06,
"loss": 1.5132,
"step": 7900
},
{
"epoch": 16.32,
"learning_rate": 1.5977467811158798e-06,
"loss": 1.5139,
"step": 7910
},
{
"epoch": 16.34,
"learning_rate": 1.5871244635193137e-06,
"loss": 1.5131,
"step": 7920
},
{
"epoch": 16.36,
"learning_rate": 1.5765021459227464e-06,
"loss": 1.5167,
"step": 7930
},
{
"epoch": 16.39,
"learning_rate": 1.5658798283261803e-06,
"loss": 1.5147,
"step": 7940
},
{
"epoch": 16.41,
"learning_rate": 1.5552575107296138e-06,
"loss": 1.5131,
"step": 7950
},
{
"epoch": 16.43,
"learning_rate": 1.544635193133047e-06,
"loss": 1.5183,
"step": 7960
},
{
"epoch": 16.45,
"learning_rate": 1.5340128755364804e-06,
"loss": 1.5162,
"step": 7970
},
{
"epoch": 16.47,
"learning_rate": 1.5233905579399144e-06,
"loss": 1.5194,
"step": 7980
},
{
"epoch": 16.49,
"learning_rate": 1.5127682403433483e-06,
"loss": 1.5126,
"step": 7990
},
{
"epoch": 16.51,
"learning_rate": 1.502145922746781e-06,
"loss": 1.515,
"step": 8000
},
{
"epoch": 16.51,
"eval_loss": 1.410232424736023,
"eval_runtime": 88.626,
"eval_samples_per_second": 451.335,
"eval_steps_per_second": 7.052,
"step": 8000
},
{
"epoch": 16.54,
"learning_rate": 1.4915236051502147e-06,
"loss": 1.5139,
"step": 8010
},
{
"epoch": 16.56,
"learning_rate": 1.4809012875536485e-06,
"loss": 1.5157,
"step": 8020
},
{
"epoch": 16.58,
"learning_rate": 1.470278969957081e-06,
"loss": 1.5185,
"step": 8030
},
{
"epoch": 16.6,
"learning_rate": 1.459656652360515e-06,
"loss": 1.5148,
"step": 8040
},
{
"epoch": 16.62,
"learning_rate": 1.449034334763949e-06,
"loss": 1.5171,
"step": 8050
},
{
"epoch": 16.64,
"learning_rate": 1.438412017167382e-06,
"loss": 1.5161,
"step": 8060
},
{
"epoch": 16.66,
"learning_rate": 1.4277896995708152e-06,
"loss": 1.5214,
"step": 8070
},
{
"epoch": 16.69,
"learning_rate": 1.417167381974249e-06,
"loss": 1.5153,
"step": 8080
},
{
"epoch": 16.71,
"learning_rate": 1.4065450643776818e-06,
"loss": 1.5146,
"step": 8090
},
{
"epoch": 16.73,
"learning_rate": 1.3959227467811158e-06,
"loss": 1.5181,
"step": 8100
},
{
"epoch": 16.75,
"learning_rate": 1.38530042918455e-06,
"loss": 1.519,
"step": 8110
},
{
"epoch": 16.77,
"learning_rate": 1.3746781115879832e-06,
"loss": 1.515,
"step": 8120
},
{
"epoch": 16.79,
"learning_rate": 1.3640557939914159e-06,
"loss": 1.5184,
"step": 8130
},
{
"epoch": 16.81,
"learning_rate": 1.3534334763948498e-06,
"loss": 1.5149,
"step": 8140
},
{
"epoch": 16.84,
"learning_rate": 1.3428111587982836e-06,
"loss": 1.5181,
"step": 8150
},
{
"epoch": 16.86,
"learning_rate": 1.3321888412017164e-06,
"loss": 1.5164,
"step": 8160
},
{
"epoch": 16.86,
"eval_loss": 1.4089733362197876,
"eval_runtime": 75.6986,
"eval_samples_per_second": 528.411,
"eval_steps_per_second": 8.256,
"step": 8160
},
{
"epoch": 16.88,
"learning_rate": 1.32156652360515e-06,
"loss": 1.5162,
"step": 8170
},
{
"epoch": 16.9,
"learning_rate": 1.310944206008584e-06,
"loss": 1.5192,
"step": 8180
},
{
"epoch": 16.92,
"learning_rate": 1.3003218884120166e-06,
"loss": 1.5175,
"step": 8190
},
{
"epoch": 16.94,
"learning_rate": 1.2896995708154505e-06,
"loss": 1.5167,
"step": 8200
},
{
"epoch": 16.96,
"learning_rate": 1.2790772532188845e-06,
"loss": 1.5187,
"step": 8210
},
{
"epoch": 16.99,
"learning_rate": 1.268454935622318e-06,
"loss": 1.5179,
"step": 8220
},
{
"epoch": 17.01,
"learning_rate": 1.2578326180257504e-06,
"loss": 1.6602,
"step": 8230
},
{
"epoch": 17.03,
"learning_rate": 1.2472103004291846e-06,
"loss": 1.5216,
"step": 8240
},
{
"epoch": 17.05,
"learning_rate": 1.2365879828326185e-06,
"loss": 1.5173,
"step": 8250
},
{
"epoch": 17.07,
"learning_rate": 1.2259656652360512e-06,
"loss": 1.5189,
"step": 8260
},
{
"epoch": 17.09,
"learning_rate": 1.2153433476394851e-06,
"loss": 1.5138,
"step": 8270
},
{
"epoch": 17.12,
"learning_rate": 1.2047210300429187e-06,
"loss": 1.5169,
"step": 8280
},
{
"epoch": 17.14,
"learning_rate": 1.1940987124463513e-06,
"loss": 1.5151,
"step": 8290
},
{
"epoch": 17.16,
"learning_rate": 1.1834763948497853e-06,
"loss": 1.5167,
"step": 8300
},
{
"epoch": 17.18,
"learning_rate": 1.1728540772532192e-06,
"loss": 1.519,
"step": 8310
},
{
"epoch": 17.2,
"learning_rate": 1.1622317596566519e-06,
"loss": 1.5163,
"step": 8320
},
{
"epoch": 17.2,
"eval_loss": 1.4109740257263184,
"eval_runtime": 75.6126,
"eval_samples_per_second": 529.012,
"eval_steps_per_second": 8.266,
"step": 8320
},
{
"epoch": 17.22,
"learning_rate": 1.1516094420600858e-06,
"loss": 1.5149,
"step": 8330
},
{
"epoch": 17.24,
"learning_rate": 1.1409871244635193e-06,
"loss": 1.5184,
"step": 8340
},
{
"epoch": 17.27,
"learning_rate": 1.1303648068669533e-06,
"loss": 1.5157,
"step": 8350
},
{
"epoch": 17.29,
"learning_rate": 1.119742489270386e-06,
"loss": 1.5178,
"step": 8360
},
{
"epoch": 17.31,
"learning_rate": 1.10912017167382e-06,
"loss": 1.5144,
"step": 8370
},
{
"epoch": 17.33,
"learning_rate": 1.0984978540772534e-06,
"loss": 1.5142,
"step": 8380
},
{
"epoch": 17.35,
"learning_rate": 1.0878755364806865e-06,
"loss": 1.5178,
"step": 8390
},
{
"epoch": 17.37,
"learning_rate": 1.07725321888412e-06,
"loss": 1.5145,
"step": 8400
},
{
"epoch": 17.39,
"learning_rate": 1.066630901287554e-06,
"loss": 1.5142,
"step": 8410
},
{
"epoch": 17.42,
"learning_rate": 1.0560085836909866e-06,
"loss": 1.5178,
"step": 8420
},
{
"epoch": 17.44,
"learning_rate": 1.0453862660944206e-06,
"loss": 1.5136,
"step": 8430
},
{
"epoch": 17.46,
"learning_rate": 1.0347639484978543e-06,
"loss": 1.5158,
"step": 8440
},
{
"epoch": 17.48,
"learning_rate": 1.024141630901288e-06,
"loss": 1.5147,
"step": 8450
},
{
"epoch": 17.5,
"learning_rate": 1.0135193133047207e-06,
"loss": 1.5132,
"step": 8460
},
{
"epoch": 17.52,
"learning_rate": 1.0028969957081547e-06,
"loss": 1.5158,
"step": 8470
},
{
"epoch": 17.54,
"learning_rate": 9.922746781115884e-07,
"loss": 1.5142,
"step": 8480
},
{
"epoch": 17.54,
"eval_loss": 1.4122473001480103,
"eval_runtime": 75.6407,
"eval_samples_per_second": 528.816,
"eval_steps_per_second": 8.263,
"step": 8480
},
{
"epoch": 17.57,
"learning_rate": 9.816523605150213e-07,
"loss": 1.5191,
"step": 8490
},
{
"epoch": 17.59,
"learning_rate": 9.71030042918455e-07,
"loss": 1.5172,
"step": 8500
},
{
"epoch": 17.61,
"learning_rate": 9.604077253218887e-07,
"loss": 1.5157,
"step": 8510
},
{
"epoch": 17.63,
"learning_rate": 9.497854077253216e-07,
"loss": 1.5165,
"step": 8520
},
{
"epoch": 17.65,
"learning_rate": 9.391630901287551e-07,
"loss": 1.5156,
"step": 8530
},
{
"epoch": 17.67,
"learning_rate": 9.285407725321891e-07,
"loss": 1.5191,
"step": 8540
},
{
"epoch": 17.69,
"learning_rate": 9.179184549356219e-07,
"loss": 1.5147,
"step": 8550
},
{
"epoch": 17.72,
"learning_rate": 9.072961373390556e-07,
"loss": 1.5154,
"step": 8560
},
{
"epoch": 17.74,
"learning_rate": 8.966738197424894e-07,
"loss": 1.5166,
"step": 8570
},
{
"epoch": 17.76,
"learning_rate": 8.860515021459234e-07,
"loss": 1.5149,
"step": 8580
},
{
"epoch": 17.78,
"learning_rate": 8.754291845493559e-07,
"loss": 1.5154,
"step": 8590
},
{
"epoch": 17.8,
"learning_rate": 8.648068669527898e-07,
"loss": 1.5185,
"step": 8600
},
{
"epoch": 17.82,
"learning_rate": 8.541845493562236e-07,
"loss": 1.517,
"step": 8610
},
{
"epoch": 17.84,
"learning_rate": 8.435622317596563e-07,
"loss": 1.5174,
"step": 8620
},
{
"epoch": 17.87,
"learning_rate": 8.329399141630901e-07,
"loss": 1.5211,
"step": 8630
},
{
"epoch": 17.89,
"learning_rate": 8.223175965665239e-07,
"loss": 1.5166,
"step": 8640
},
{
"epoch": 17.89,
"eval_loss": 1.409213662147522,
"eval_runtime": 75.6047,
"eval_samples_per_second": 529.068,
"eval_steps_per_second": 8.267,
"step": 8640
},
{
"epoch": 17.91,
"learning_rate": 8.116952789699566e-07,
"loss": 1.5189,
"step": 8650
},
{
"epoch": 17.93,
"learning_rate": 8.010729613733904e-07,
"loss": 1.5177,
"step": 8660
},
{
"epoch": 17.95,
"learning_rate": 7.904506437768243e-07,
"loss": 1.5154,
"step": 8670
},
{
"epoch": 17.97,
"learning_rate": 7.79828326180258e-07,
"loss": 1.5186,
"step": 8680
},
{
"epoch": 17.99,
"learning_rate": 7.692060085836908e-07,
"loss": 1.5185,
"step": 8690
},
{
"epoch": 18.02,
"learning_rate": 7.585836909871245e-07,
"loss": 1.6583,
"step": 8700
},
{
"epoch": 18.04,
"learning_rate": 7.479613733905582e-07,
"loss": 1.517,
"step": 8710
},
{
"epoch": 18.06,
"learning_rate": 7.37339055793991e-07,
"loss": 1.5154,
"step": 8720
},
{
"epoch": 18.08,
"learning_rate": 7.26716738197425e-07,
"loss": 1.5173,
"step": 8730
},
{
"epoch": 18.1,
"learning_rate": 7.160944206008587e-07,
"loss": 1.5173,
"step": 8740
},
{
"epoch": 18.12,
"learning_rate": 7.054721030042915e-07,
"loss": 1.5159,
"step": 8750
},
{
"epoch": 18.15,
"learning_rate": 6.948497854077252e-07,
"loss": 1.5156,
"step": 8760
},
{
"epoch": 18.17,
"learning_rate": 6.84227467811159e-07,
"loss": 1.5149,
"step": 8770
},
{
"epoch": 18.19,
"learning_rate": 6.736051502145918e-07,
"loss": 1.5168,
"step": 8780
},
{
"epoch": 18.21,
"learning_rate": 6.629828326180255e-07,
"loss": 1.5156,
"step": 8790
},
{
"epoch": 18.23,
"learning_rate": 6.523605150214595e-07,
"loss": 1.5172,
"step": 8800
},
{
"epoch": 18.23,
"eval_loss": 1.405761480331421,
"eval_runtime": 75.6165,
"eval_samples_per_second": 528.985,
"eval_steps_per_second": 8.265,
"step": 8800
},
{
"epoch": 18.25,
"learning_rate": 6.417381974248932e-07,
"loss": 1.5152,
"step": 8810
},
{
"epoch": 18.27,
"learning_rate": 6.31115879828326e-07,
"loss": 1.5184,
"step": 8820
},
{
"epoch": 18.3,
"learning_rate": 6.204935622317597e-07,
"loss": 1.518,
"step": 8830
},
{
"epoch": 18.32,
"learning_rate": 6.098712446351936e-07,
"loss": 1.5147,
"step": 8840
},
{
"epoch": 18.34,
"learning_rate": 5.992489270386262e-07,
"loss": 1.5164,
"step": 8850
},
{
"epoch": 18.36,
"learning_rate": 5.886266094420601e-07,
"loss": 1.5162,
"step": 8860
},
{
"epoch": 18.38,
"learning_rate": 5.780042918454939e-07,
"loss": 1.5188,
"step": 8870
},
{
"epoch": 18.4,
"learning_rate": 5.673819742489266e-07,
"loss": 1.5148,
"step": 8880
},
{
"epoch": 18.42,
"learning_rate": 5.567596566523604e-07,
"loss": 1.5172,
"step": 8890
},
{
"epoch": 18.45,
"learning_rate": 5.461373390557942e-07,
"loss": 1.5137,
"step": 8900
},
{
"epoch": 18.47,
"learning_rate": 5.355150214592269e-07,
"loss": 1.5179,
"step": 8910
},
{
"epoch": 18.49,
"learning_rate": 5.248927038626608e-07,
"loss": 1.5154,
"step": 8920
},
{
"epoch": 18.51,
"learning_rate": 5.142703862660946e-07,
"loss": 1.5135,
"step": 8930
},
{
"epoch": 18.53,
"learning_rate": 5.036480686695283e-07,
"loss": 1.5171,
"step": 8940
},
{
"epoch": 18.55,
"learning_rate": 4.930257510729611e-07,
"loss": 1.5136,
"step": 8950
},
{
"epoch": 18.57,
"learning_rate": 4.824034334763949e-07,
"loss": 1.5153,
"step": 8960
},
{
"epoch": 18.57,
"eval_loss": 1.411158561706543,
"eval_runtime": 75.4883,
"eval_samples_per_second": 529.884,
"eval_steps_per_second": 8.279,
"step": 8960
},
{
"epoch": 18.6,
"learning_rate": 4.717811158798287e-07,
"loss": 1.5157,
"step": 8970
},
{
"epoch": 18.62,
"learning_rate": 4.611587982832614e-07,
"loss": 1.5166,
"step": 8980
},
{
"epoch": 18.64,
"learning_rate": 4.505364806866952e-07,
"loss": 1.5136,
"step": 8990
},
{
"epoch": 18.66,
"learning_rate": 4.39914163090129e-07,
"loss": 1.5197,
"step": 9000
},
{
"epoch": 18.68,
"learning_rate": 4.2929184549356173e-07,
"loss": 1.5174,
"step": 9010
},
{
"epoch": 18.7,
"learning_rate": 4.1866952789699556e-07,
"loss": 1.5153,
"step": 9020
},
{
"epoch": 18.72,
"learning_rate": 4.080472103004293e-07,
"loss": 1.5167,
"step": 9030
},
{
"epoch": 18.75,
"learning_rate": 3.974248927038632e-07,
"loss": 1.5149,
"step": 9040
},
{
"epoch": 18.77,
"learning_rate": 3.868025751072959e-07,
"loss": 1.5155,
"step": 9050
},
{
"epoch": 18.79,
"learning_rate": 3.761802575107297e-07,
"loss": 1.5176,
"step": 9060
},
{
"epoch": 18.81,
"learning_rate": 3.655579399141635e-07,
"loss": 1.5194,
"step": 9070
},
{
"epoch": 18.83,
"learning_rate": 3.549356223175962e-07,
"loss": 1.5176,
"step": 9080
},
{
"epoch": 18.85,
"learning_rate": 3.4431330472103003e-07,
"loss": 1.5181,
"step": 9090
},
{
"epoch": 18.87,
"learning_rate": 3.3369098712446387e-07,
"loss": 1.5143,
"step": 9100
},
{
"epoch": 18.9,
"learning_rate": 3.2306866952789654e-07,
"loss": 1.5157,
"step": 9110
},
{
"epoch": 18.92,
"learning_rate": 3.124463519313304e-07,
"loss": 1.517,
"step": 9120
},
{
"epoch": 18.92,
"eval_loss": 1.4098219871520996,
"eval_runtime": 75.6146,
"eval_samples_per_second": 528.998,
"eval_steps_per_second": 8.266,
"step": 9120
},
{
"epoch": 18.94,
"learning_rate": 3.018240343347642e-07,
"loss": 1.5181,
"step": 9130
},
{
"epoch": 18.96,
"learning_rate": 2.912017167381969e-07,
"loss": 1.5167,
"step": 9140
},
{
"epoch": 18.98,
"learning_rate": 2.805793991416307e-07,
"loss": 1.519,
"step": 9150
},
{
"epoch": 19.0,
"learning_rate": 2.699570815450645e-07,
"loss": 1.66,
"step": 9160
},
{
"epoch": 19.03,
"learning_rate": 2.5933476394849834e-07,
"loss": 1.5183,
"step": 9170
},
{
"epoch": 19.05,
"learning_rate": 2.4871244635193106e-07,
"loss": 1.5174,
"step": 9180
},
{
"epoch": 19.07,
"learning_rate": 2.3809012875536484e-07,
"loss": 1.5166,
"step": 9190
},
{
"epoch": 19.09,
"learning_rate": 2.2746781115879863e-07,
"loss": 1.5178,
"step": 9200
},
{
"epoch": 19.11,
"learning_rate": 2.1684549356223138e-07,
"loss": 1.5152,
"step": 9210
},
{
"epoch": 19.13,
"learning_rate": 2.0622317596566518e-07,
"loss": 1.5159,
"step": 9220
},
{
"epoch": 19.15,
"learning_rate": 1.95600858369099e-07,
"loss": 1.5153,
"step": 9230
},
{
"epoch": 19.18,
"learning_rate": 1.849785407725317e-07,
"loss": 1.5167,
"step": 9240
},
{
"epoch": 19.2,
"learning_rate": 1.7435622317596553e-07,
"loss": 1.5139,
"step": 9250
},
{
"epoch": 19.22,
"learning_rate": 1.637339055793993e-07,
"loss": 1.5161,
"step": 9260
},
{
"epoch": 19.24,
"learning_rate": 1.5311158798283315e-07,
"loss": 1.5158,
"step": 9270
},
{
"epoch": 19.26,
"learning_rate": 1.4248927038626584e-07,
"loss": 1.5163,
"step": 9280
},
{
"epoch": 19.26,
"eval_loss": 1.4113389253616333,
"eval_runtime": 75.6638,
"eval_samples_per_second": 528.654,
"eval_steps_per_second": 8.26,
"step": 9280
},
{
"epoch": 19.28,
"learning_rate": 1.3186695278969965e-07,
"loss": 1.5164,
"step": 9290
},
{
"epoch": 19.3,
"learning_rate": 1.2124463519313346e-07,
"loss": 1.5164,
"step": 9300
},
{
"epoch": 19.33,
"learning_rate": 1.1062231759656619e-07,
"loss": 1.5135,
"step": 9310
},
{
"epoch": 19.35,
"learning_rate": 1e-07,
"loss": 1.5163,
"step": 9320
},
{
"before_init_mem_cpu": 1174028288,
"before_init_mem_gpu": 0,
"epoch": 19.35,
"init_mem_cpu_alloc_delta": 2783887360,
"init_mem_cpu_peaked_delta": 414412800,
"init_mem_gpu_alloc_delta": 497994240,
"init_mem_gpu_peaked_delta": 512,
"step": 9320,
"total_flos": 2.013804511704069e+19,
"train_loss": 0.2542688299146333,
"train_mem_cpu_alloc_delta": 8443502592,
"train_mem_cpu_peaked_delta": 383119360,
"train_mem_gpu_alloc_delta": 2006973440,
"train_mem_gpu_peaked_delta": 15468135424,
"train_runtime": 73469.8819,
"train_samples_per_second": 1041.258,
"train_steps_per_second": 0.127
}
],
"max_steps": 9320,
"num_train_epochs": 20,
"total_flos": 2.013804511704069e+19,
"trial_name": null,
"trial_params": null
}