deepshard-13B-ft / trainer_state.json
srikanthsrnvs's picture
Upload with huggingface_hub
ed93222
raw
history blame
No virus
59.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"global_step": 4800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 6.944444444444446e-07,
"loss": 1.4352,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.4072,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 3.4722222222222224e-06,
"loss": 1.0503,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 4.861111111111111e-06,
"loss": 0.8994,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 6.25e-06,
"loss": 0.7676,
"step": 50
},
{
"epoch": 0.04,
"learning_rate": 7.638888888888888e-06,
"loss": 0.7975,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 9.027777777777779e-06,
"loss": 0.7504,
"step": 70
},
{
"epoch": 0.05,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.7608,
"step": 80
},
{
"epoch": 0.06,
"learning_rate": 1.1805555555555557e-05,
"loss": 0.7681,
"step": 90
},
{
"epoch": 0.06,
"learning_rate": 1.3194444444444446e-05,
"loss": 0.7687,
"step": 100
},
{
"epoch": 0.07,
"learning_rate": 1.4583333333333333e-05,
"loss": 0.7127,
"step": 110
},
{
"epoch": 0.07,
"learning_rate": 1.5972222222222224e-05,
"loss": 0.7564,
"step": 120
},
{
"epoch": 0.08,
"learning_rate": 1.7361111111111114e-05,
"loss": 0.7366,
"step": 130
},
{
"epoch": 0.09,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.7586,
"step": 140
},
{
"epoch": 0.09,
"learning_rate": 1.9999997723625092e-05,
"loss": 0.7792,
"step": 150
},
{
"epoch": 0.1,
"learning_rate": 1.999972455989006e-05,
"loss": 0.7505,
"step": 160
},
{
"epoch": 0.11,
"learning_rate": 1.9998996135423355e-05,
"loss": 0.7458,
"step": 170
},
{
"epoch": 0.11,
"learning_rate": 1.9997812483388194e-05,
"loss": 0.7705,
"step": 180
},
{
"epoch": 0.12,
"learning_rate": 1.9996173657673094e-05,
"loss": 0.7879,
"step": 190
},
{
"epoch": 0.12,
"learning_rate": 1.9994079732889404e-05,
"loss": 0.7624,
"step": 200
},
{
"epoch": 0.13,
"learning_rate": 1.999153080436793e-05,
"loss": 0.7882,
"step": 210
},
{
"epoch": 0.14,
"learning_rate": 1.998852698815457e-05,
"loss": 0.744,
"step": 220
},
{
"epoch": 0.14,
"learning_rate": 1.998506842100505e-05,
"loss": 0.7527,
"step": 230
},
{
"epoch": 0.15,
"learning_rate": 1.9981155260378685e-05,
"loss": 0.7637,
"step": 240
},
{
"epoch": 0.16,
"learning_rate": 1.9976787684431223e-05,
"loss": 0.7317,
"step": 250
},
{
"epoch": 0.16,
"learning_rate": 1.997196589200672e-05,
"loss": 0.7602,
"step": 260
},
{
"epoch": 0.17,
"learning_rate": 1.9966690102628496e-05,
"loss": 0.7491,
"step": 270
},
{
"epoch": 0.17,
"learning_rate": 1.9960960556489145e-05,
"loss": 0.7414,
"step": 280
},
{
"epoch": 0.18,
"learning_rate": 1.995477751443959e-05,
"loss": 0.7513,
"step": 290
},
{
"epoch": 0.19,
"learning_rate": 1.9948141257977198e-05,
"loss": 0.7192,
"step": 300
},
{
"epoch": 0.19,
"learning_rate": 1.9941052089233e-05,
"loss": 0.7267,
"step": 310
},
{
"epoch": 0.2,
"learning_rate": 1.9933510330957896e-05,
"loss": 0.7066,
"step": 320
},
{
"epoch": 0.21,
"learning_rate": 1.9925516326507984e-05,
"loss": 0.6888,
"step": 330
},
{
"epoch": 0.21,
"learning_rate": 1.9917070439828916e-05,
"loss": 0.7601,
"step": 340
},
{
"epoch": 0.22,
"learning_rate": 1.9908173055439343e-05,
"loss": 0.7598,
"step": 350
},
{
"epoch": 0.23,
"learning_rate": 1.989882457841339e-05,
"loss": 0.7245,
"step": 360
},
{
"epoch": 0.23,
"learning_rate": 1.9889025434362236e-05,
"loss": 0.6829,
"step": 370
},
{
"epoch": 0.24,
"learning_rate": 1.9878776069414714e-05,
"loss": 0.7353,
"step": 380
},
{
"epoch": 0.24,
"learning_rate": 1.986807695019701e-05,
"loss": 0.7357,
"step": 390
},
{
"epoch": 0.25,
"learning_rate": 1.9856928563811438e-05,
"loss": 0.7185,
"step": 400
},
{
"epoch": 0.26,
"learning_rate": 1.9845331417814223e-05,
"loss": 0.7281,
"step": 410
},
{
"epoch": 0.26,
"learning_rate": 1.983328604019243e-05,
"loss": 0.6729,
"step": 420
},
{
"epoch": 0.27,
"learning_rate": 1.9820792979339897e-05,
"loss": 0.7019,
"step": 430
},
{
"epoch": 0.28,
"learning_rate": 1.9807852804032306e-05,
"loss": 0.6777,
"step": 440
},
{
"epoch": 0.28,
"learning_rate": 1.9794466103401244e-05,
"loss": 0.661,
"step": 450
},
{
"epoch": 0.29,
"learning_rate": 1.978063348690741e-05,
"loss": 0.679,
"step": 460
},
{
"epoch": 0.29,
"learning_rate": 1.9766355584312866e-05,
"loss": 0.7216,
"step": 470
},
{
"epoch": 0.3,
"learning_rate": 1.975163304565235e-05,
"loss": 0.6873,
"step": 480
},
{
"epoch": 0.31,
"learning_rate": 1.97364665412037e-05,
"loss": 0.7134,
"step": 490
},
{
"epoch": 0.31,
"learning_rate": 1.9720856761457326e-05,
"loss": 0.6959,
"step": 500
},
{
"epoch": 0.32,
"learning_rate": 1.9704804417084772e-05,
"loss": 0.7185,
"step": 510
},
{
"epoch": 0.33,
"learning_rate": 1.9688310238906378e-05,
"loss": 0.7001,
"step": 520
},
{
"epoch": 0.33,
"learning_rate": 1.9671374977857987e-05,
"loss": 0.6652,
"step": 530
},
{
"epoch": 0.34,
"learning_rate": 1.9653999404956775e-05,
"loss": 0.6838,
"step": 540
},
{
"epoch": 0.34,
"learning_rate": 1.9636184311266126e-05,
"loss": 0.7071,
"step": 550
},
{
"epoch": 0.35,
"learning_rate": 1.9617930507859643e-05,
"loss": 0.6807,
"step": 560
},
{
"epoch": 0.36,
"learning_rate": 1.9599238825784212e-05,
"loss": 0.6743,
"step": 570
},
{
"epoch": 0.36,
"learning_rate": 1.9580110116022145e-05,
"loss": 0.6475,
"step": 580
},
{
"epoch": 0.37,
"learning_rate": 1.9560545249452477e-05,
"loss": 0.7439,
"step": 590
},
{
"epoch": 0.38,
"learning_rate": 1.9540545116811295e-05,
"loss": 0.6432,
"step": 600
},
{
"epoch": 0.38,
"learning_rate": 1.9520110628651174e-05,
"loss": 0.6582,
"step": 610
},
{
"epoch": 0.39,
"learning_rate": 1.9499242715299743e-05,
"loss": 0.6583,
"step": 620
},
{
"epoch": 0.39,
"learning_rate": 1.9477942326817322e-05,
"loss": 0.6494,
"step": 630
},
{
"epoch": 0.4,
"learning_rate": 1.9456210432953663e-05,
"loss": 0.6734,
"step": 640
},
{
"epoch": 0.41,
"learning_rate": 1.943404802310381e-05,
"loss": 0.6404,
"step": 650
},
{
"epoch": 0.41,
"learning_rate": 1.9411456106263053e-05,
"loss": 0.7011,
"step": 660
},
{
"epoch": 0.42,
"learning_rate": 1.9388435710980975e-05,
"loss": 0.6624,
"step": 670
},
{
"epoch": 0.42,
"learning_rate": 1.9364987885314645e-05,
"loss": 0.6611,
"step": 680
},
{
"epoch": 0.43,
"learning_rate": 1.9341113696780892e-05,
"loss": 0.6348,
"step": 690
},
{
"epoch": 0.44,
"learning_rate": 1.931681423230771e-05,
"loss": 0.6237,
"step": 700
},
{
"epoch": 0.44,
"learning_rate": 1.9292090598184768e-05,
"loss": 0.6512,
"step": 710
},
{
"epoch": 0.45,
"learning_rate": 1.926694392001305e-05,
"loss": 0.6436,
"step": 720
},
{
"epoch": 0.46,
"learning_rate": 1.924137534265359e-05,
"loss": 0.6544,
"step": 730
},
{
"epoch": 0.46,
"learning_rate": 1.9215386030175383e-05,
"loss": 0.6266,
"step": 740
},
{
"epoch": 0.47,
"learning_rate": 1.9188977165802362e-05,
"loss": 0.6249,
"step": 750
},
{
"epoch": 0.47,
"learning_rate": 1.9162149951859527e-05,
"loss": 0.6258,
"step": 760
},
{
"epoch": 0.48,
"learning_rate": 1.9134905609718235e-05,
"loss": 0.6292,
"step": 770
},
{
"epoch": 0.49,
"learning_rate": 1.910724537974056e-05,
"loss": 0.6194,
"step": 780
},
{
"epoch": 0.49,
"learning_rate": 1.9079170521222835e-05,
"loss": 0.6591,
"step": 790
},
{
"epoch": 0.5,
"learning_rate": 1.905068231233834e-05,
"loss": 0.6126,
"step": 800
},
{
"epoch": 0.51,
"learning_rate": 1.9021782050079074e-05,
"loss": 0.6783,
"step": 810
},
{
"epoch": 0.51,
"learning_rate": 1.899247105019674e-05,
"loss": 0.6676,
"step": 820
},
{
"epoch": 0.52,
"learning_rate": 1.8962750647142808e-05,
"loss": 0.6596,
"step": 830
},
{
"epoch": 0.53,
"learning_rate": 1.89326221940078e-05,
"loss": 0.6616,
"step": 840
},
{
"epoch": 0.53,
"learning_rate": 1.8902087062459658e-05,
"loss": 0.6219,
"step": 850
},
{
"epoch": 0.54,
"learning_rate": 1.8871146642681304e-05,
"loss": 0.6721,
"step": 860
},
{
"epoch": 0.54,
"learning_rate": 1.8839802343307354e-05,
"loss": 0.6224,
"step": 870
},
{
"epoch": 0.55,
"learning_rate": 1.880805559135998e-05,
"loss": 0.6115,
"step": 880
},
{
"epoch": 0.56,
"learning_rate": 1.8775907832183945e-05,
"loss": 0.6151,
"step": 890
},
{
"epoch": 0.56,
"learning_rate": 1.8743360529380794e-05,
"loss": 0.6023,
"step": 900
},
{
"epoch": 0.57,
"learning_rate": 1.871041516474224e-05,
"loss": 0.5917,
"step": 910
},
{
"epoch": 0.57,
"learning_rate": 1.8677073238182667e-05,
"loss": 0.6032,
"step": 920
},
{
"epoch": 0.58,
"learning_rate": 1.8643336267670885e-05,
"loss": 0.5802,
"step": 930
},
{
"epoch": 0.59,
"learning_rate": 1.860920578916098e-05,
"loss": 0.5687,
"step": 940
},
{
"epoch": 0.59,
"learning_rate": 1.8574683356522416e-05,
"loss": 0.5965,
"step": 950
},
{
"epoch": 0.6,
"learning_rate": 1.853977054146928e-05,
"loss": 0.6706,
"step": 960
},
{
"epoch": 0.61,
"learning_rate": 1.850446893348872e-05,
"loss": 0.5983,
"step": 970
},
{
"epoch": 0.61,
"learning_rate": 1.8468780139768602e-05,
"loss": 0.5642,
"step": 980
},
{
"epoch": 0.62,
"learning_rate": 1.8432705785124306e-05,
"loss": 0.5775,
"step": 990
},
{
"epoch": 0.62,
"learning_rate": 1.8396247511924782e-05,
"loss": 0.6504,
"step": 1000
},
{
"epoch": 0.62,
"eval_loss": NaN,
"eval_runtime": 16.558,
"eval_samples_per_second": 61.843,
"eval_steps_per_second": 3.865,
"step": 1000
},
{
"epoch": 0.63,
"learning_rate": 1.8359406980017763e-05,
"loss": 0.5675,
"step": 1010
},
{
"epoch": 0.64,
"learning_rate": 1.8322185866654192e-05,
"loss": 0.5895,
"step": 1020
},
{
"epoch": 0.64,
"learning_rate": 1.8284585866411882e-05,
"loss": 0.5816,
"step": 1030
},
{
"epoch": 0.65,
"learning_rate": 1.8246608691118343e-05,
"loss": 0.5847,
"step": 1040
},
{
"epoch": 0.66,
"learning_rate": 1.820825606977286e-05,
"loss": 0.6139,
"step": 1050
},
{
"epoch": 0.66,
"learning_rate": 1.816952974846777e-05,
"loss": 0.6137,
"step": 1060
},
{
"epoch": 0.67,
"learning_rate": 1.813043149030898e-05,
"loss": 0.5607,
"step": 1070
},
{
"epoch": 0.68,
"learning_rate": 1.809096307533567e-05,
"loss": 0.6062,
"step": 1080
},
{
"epoch": 0.68,
"learning_rate": 1.8051126300439295e-05,
"loss": 0.5783,
"step": 1090
},
{
"epoch": 0.69,
"learning_rate": 1.801092297928173e-05,
"loss": 0.5812,
"step": 1100
},
{
"epoch": 0.69,
"learning_rate": 1.7970354942212736e-05,
"loss": 0.5843,
"step": 1110
},
{
"epoch": 0.7,
"learning_rate": 1.7929424036186603e-05,
"loss": 0.5849,
"step": 1120
},
{
"epoch": 0.71,
"learning_rate": 1.788813212467809e-05,
"loss": 0.6044,
"step": 1130
},
{
"epoch": 0.71,
"learning_rate": 1.7846481087597553e-05,
"loss": 0.5326,
"step": 1140
},
{
"epoch": 0.72,
"learning_rate": 1.7804472821205387e-05,
"loss": 0.5767,
"step": 1150
},
{
"epoch": 0.72,
"learning_rate": 1.7762109238025682e-05,
"loss": 0.5525,
"step": 1160
},
{
"epoch": 0.73,
"learning_rate": 1.771939226675915e-05,
"loss": 0.5936,
"step": 1170
},
{
"epoch": 0.74,
"learning_rate": 1.7676323852195313e-05,
"loss": 0.5523,
"step": 1180
},
{
"epoch": 0.74,
"learning_rate": 1.763290595512398e-05,
"loss": 0.5267,
"step": 1190
},
{
"epoch": 0.75,
"learning_rate": 1.7589140552245946e-05,
"loss": 0.5738,
"step": 1200
},
{
"epoch": 0.76,
"learning_rate": 1.7545029636083036e-05,
"loss": 0.5236,
"step": 1210
},
{
"epoch": 0.76,
"learning_rate": 1.7500575214887354e-05,
"loss": 0.5457,
"step": 1220
},
{
"epoch": 0.77,
"learning_rate": 1.7455779312549885e-05,
"loss": 0.5873,
"step": 1230
},
{
"epoch": 0.78,
"learning_rate": 1.7410643968508324e-05,
"loss": 0.5426,
"step": 1240
},
{
"epoch": 0.78,
"learning_rate": 1.736517123765425e-05,
"loss": 0.5286,
"step": 1250
},
{
"epoch": 0.79,
"learning_rate": 1.731936319023956e-05,
"loss": 0.5222,
"step": 1260
},
{
"epoch": 0.79,
"learning_rate": 1.727322191178221e-05,
"loss": 0.5428,
"step": 1270
},
{
"epoch": 0.8,
"learning_rate": 1.7226749502971288e-05,
"loss": 0.528,
"step": 1280
},
{
"epoch": 0.81,
"learning_rate": 1.717994807957135e-05,
"loss": 0.5526,
"step": 1290
},
{
"epoch": 0.81,
"learning_rate": 1.7132819772326107e-05,
"loss": 0.5461,
"step": 1300
},
{
"epoch": 0.82,
"learning_rate": 1.708536672686143e-05,
"loss": 0.5559,
"step": 1310
},
{
"epoch": 0.82,
"learning_rate": 1.7037591103587643e-05,
"loss": 0.5404,
"step": 1320
},
{
"epoch": 0.83,
"learning_rate": 1.6989495077601174e-05,
"loss": 0.5629,
"step": 1330
},
{
"epoch": 0.84,
"learning_rate": 1.6941080838585537e-05,
"loss": 0.529,
"step": 1340
},
{
"epoch": 0.84,
"learning_rate": 1.6892350590711634e-05,
"loss": 0.5623,
"step": 1350
},
{
"epoch": 0.85,
"learning_rate": 1.6843306552537394e-05,
"loss": 0.5111,
"step": 1360
},
{
"epoch": 0.86,
"learning_rate": 1.67939509569068e-05,
"loss": 0.5376,
"step": 1370
},
{
"epoch": 0.86,
"learning_rate": 1.6744286050848194e-05,
"loss": 0.5547,
"step": 1380
},
{
"epoch": 0.87,
"learning_rate": 1.6694314095472006e-05,
"loss": 0.5552,
"step": 1390
},
{
"epoch": 0.88,
"learning_rate": 1.66440373658678e-05,
"loss": 0.5061,
"step": 1400
},
{
"epoch": 0.88,
"learning_rate": 1.659345815100069e-05,
"loss": 0.5242,
"step": 1410
},
{
"epoch": 0.89,
"learning_rate": 1.654257875360714e-05,
"loss": 0.5066,
"step": 1420
},
{
"epoch": 0.89,
"learning_rate": 1.6491401490090125e-05,
"loss": 0.5358,
"step": 1430
},
{
"epoch": 0.9,
"learning_rate": 1.6439928690413663e-05,
"loss": 0.5006,
"step": 1440
},
{
"epoch": 0.91,
"learning_rate": 1.638816269799674e-05,
"loss": 0.5012,
"step": 1450
},
{
"epoch": 0.91,
"learning_rate": 1.633610586960664e-05,
"loss": 0.5365,
"step": 1460
},
{
"epoch": 0.92,
"learning_rate": 1.6283760575251613e-05,
"loss": 0.513,
"step": 1470
},
{
"epoch": 0.93,
"learning_rate": 1.6231129198073e-05,
"loss": 0.5552,
"step": 1480
},
{
"epoch": 0.93,
"learning_rate": 1.6178214134236733e-05,
"loss": 0.5281,
"step": 1490
},
{
"epoch": 0.94,
"learning_rate": 1.6125017792824237e-05,
"loss": 0.5118,
"step": 1500
},
{
"epoch": 0.94,
"learning_rate": 1.607154259572275e-05,
"loss": 0.4905,
"step": 1510
},
{
"epoch": 0.95,
"learning_rate": 1.6017790977515063e-05,
"loss": 0.5183,
"step": 1520
},
{
"epoch": 0.96,
"learning_rate": 1.5963765385368687e-05,
"loss": 0.5085,
"step": 1530
},
{
"epoch": 0.96,
"learning_rate": 1.5909468278924433e-05,
"loss": 0.5221,
"step": 1540
},
{
"epoch": 0.97,
"learning_rate": 1.5854902130184426e-05,
"loss": 0.5477,
"step": 1550
},
{
"epoch": 0.97,
"learning_rate": 1.5800069423399576e-05,
"loss": 0.5533,
"step": 1560
},
{
"epoch": 0.98,
"learning_rate": 1.5744972654956466e-05,
"loss": 0.5319,
"step": 1570
},
{
"epoch": 0.99,
"learning_rate": 1.568961433326369e-05,
"loss": 0.5048,
"step": 1580
},
{
"epoch": 0.99,
"learning_rate": 1.5633996978637685e-05,
"loss": 0.4891,
"step": 1590
},
{
"epoch": 1.0,
"learning_rate": 1.5578123123187944e-05,
"loss": 0.531,
"step": 1600
},
{
"epoch": 1.01,
"learning_rate": 1.5521995310701762e-05,
"loss": 0.4531,
"step": 1610
},
{
"epoch": 1.01,
"learning_rate": 1.5465616096528427e-05,
"loss": 0.4815,
"step": 1620
},
{
"epoch": 1.02,
"learning_rate": 1.5408988047462866e-05,
"loss": 0.5315,
"step": 1630
},
{
"epoch": 1.02,
"learning_rate": 1.5352113741628795e-05,
"loss": 0.4574,
"step": 1640
},
{
"epoch": 1.03,
"learning_rate": 1.5294995768361342e-05,
"loss": 0.477,
"step": 1650
},
{
"epoch": 1.04,
"learning_rate": 1.5237636728089169e-05,
"loss": 0.4898,
"step": 1660
},
{
"epoch": 1.04,
"learning_rate": 1.5180039232216062e-05,
"loss": 0.4988,
"step": 1670
},
{
"epoch": 1.05,
"learning_rate": 1.5122205903002068e-05,
"loss": 0.4846,
"step": 1680
},
{
"epoch": 1.06,
"learning_rate": 1.5064139373444077e-05,
"loss": 0.4404,
"step": 1690
},
{
"epoch": 1.06,
"learning_rate": 1.500584228715599e-05,
"loss": 0.4469,
"step": 1700
},
{
"epoch": 1.07,
"learning_rate": 1.494731729824832e-05,
"loss": 0.4866,
"step": 1710
},
{
"epoch": 1.07,
"learning_rate": 1.4888567071207383e-05,
"loss": 0.4554,
"step": 1720
},
{
"epoch": 1.08,
"learning_rate": 1.4829594280773993e-05,
"loss": 0.4532,
"step": 1730
},
{
"epoch": 1.09,
"learning_rate": 1.4770401611821672e-05,
"loss": 0.4777,
"step": 1740
},
{
"epoch": 1.09,
"learning_rate": 1.4710991759234425e-05,
"loss": 0.4504,
"step": 1750
},
{
"epoch": 1.1,
"learning_rate": 1.4651367427784049e-05,
"loss": 0.4489,
"step": 1760
},
{
"epoch": 1.11,
"learning_rate": 1.4591531332006987e-05,
"loss": 0.496,
"step": 1770
},
{
"epoch": 1.11,
"learning_rate": 1.4531486196080753e-05,
"loss": 0.4507,
"step": 1780
},
{
"epoch": 1.12,
"learning_rate": 1.4471234753699887e-05,
"loss": 0.4795,
"step": 1790
},
{
"epoch": 1.12,
"learning_rate": 1.4410779747951526e-05,
"loss": 0.4479,
"step": 1800
},
{
"epoch": 1.13,
"learning_rate": 1.4350123931190498e-05,
"loss": 0.3965,
"step": 1810
},
{
"epoch": 1.14,
"learning_rate": 1.4289270064914012e-05,
"loss": 0.5061,
"step": 1820
},
{
"epoch": 1.14,
"learning_rate": 1.4228220919635946e-05,
"loss": 0.3941,
"step": 1830
},
{
"epoch": 1.15,
"learning_rate": 1.4166979274760718e-05,
"loss": 0.4335,
"step": 1840
},
{
"epoch": 1.16,
"learning_rate": 1.4105547918456726e-05,
"loss": 0.4662,
"step": 1850
},
{
"epoch": 1.16,
"learning_rate": 1.4043929647529424e-05,
"loss": 0.4777,
"step": 1860
},
{
"epoch": 1.17,
"learning_rate": 1.3982127267294e-05,
"loss": 0.4501,
"step": 1870
},
{
"epoch": 1.18,
"learning_rate": 1.3920143591447635e-05,
"loss": 0.4332,
"step": 1880
},
{
"epoch": 1.18,
"learning_rate": 1.3857981441941419e-05,
"loss": 0.4321,
"step": 1890
},
{
"epoch": 1.19,
"learning_rate": 1.3795643648851869e-05,
"loss": 0.4029,
"step": 1900
},
{
"epoch": 1.19,
"learning_rate": 1.3733133050252087e-05,
"loss": 0.4371,
"step": 1910
},
{
"epoch": 1.2,
"learning_rate": 1.3670452492082552e-05,
"loss": 0.4139,
"step": 1920
},
{
"epoch": 1.21,
"learning_rate": 1.3607604828021534e-05,
"loss": 0.4569,
"step": 1930
},
{
"epoch": 1.21,
"learning_rate": 1.3544592919355203e-05,
"loss": 0.4088,
"step": 1940
},
{
"epoch": 1.22,
"learning_rate": 1.348141963484734e-05,
"loss": 0.4386,
"step": 1950
},
{
"epoch": 1.23,
"learning_rate": 1.3418087850608735e-05,
"loss": 0.4337,
"step": 1960
},
{
"epoch": 1.23,
"learning_rate": 1.335460044996625e-05,
"loss": 0.4365,
"step": 1970
},
{
"epoch": 1.24,
"learning_rate": 1.329096032333155e-05,
"loss": 0.4581,
"step": 1980
},
{
"epoch": 1.24,
"learning_rate": 1.3227170368069496e-05,
"loss": 0.4316,
"step": 1990
},
{
"epoch": 1.25,
"learning_rate": 1.3163233488366254e-05,
"loss": 0.3938,
"step": 2000
},
{
"epoch": 1.25,
"eval_loss": NaN,
"eval_runtime": 17.5701,
"eval_samples_per_second": 58.281,
"eval_steps_per_second": 3.643,
"step": 2000
},
{
"epoch": 1.26,
"learning_rate": 1.309915259509706e-05,
"loss": 0.4294,
"step": 2010
},
{
"epoch": 1.26,
"learning_rate": 1.3034930605693716e-05,
"loss": 0.4351,
"step": 2020
},
{
"epoch": 1.27,
"learning_rate": 1.2970570444011739e-05,
"loss": 0.4263,
"step": 2030
},
{
"epoch": 1.27,
"learning_rate": 1.290607504019727e-05,
"loss": 0.412,
"step": 2040
},
{
"epoch": 1.28,
"learning_rate": 1.2841447330553658e-05,
"loss": 0.4085,
"step": 2050
},
{
"epoch": 1.29,
"learning_rate": 1.2776690257407782e-05,
"loss": 0.4281,
"step": 2060
},
{
"epoch": 1.29,
"learning_rate": 1.2711806768976102e-05,
"loss": 0.4265,
"step": 2070
},
{
"epoch": 1.3,
"learning_rate": 1.264679981923042e-05,
"loss": 0.4109,
"step": 2080
},
{
"epoch": 1.31,
"learning_rate": 1.2581672367763408e-05,
"loss": 0.4149,
"step": 2090
},
{
"epoch": 1.31,
"learning_rate": 1.251642737965386e-05,
"loss": 0.4164,
"step": 2100
},
{
"epoch": 1.32,
"learning_rate": 1.245106782533169e-05,
"loss": 0.4153,
"step": 2110
},
{
"epoch": 1.32,
"learning_rate": 1.2385596680442715e-05,
"loss": 0.4099,
"step": 2120
},
{
"epoch": 1.33,
"learning_rate": 1.2320016925713168e-05,
"loss": 0.4614,
"step": 2130
},
{
"epoch": 1.34,
"learning_rate": 1.2254331546813995e-05,
"loss": 0.421,
"step": 2140
},
{
"epoch": 1.34,
"learning_rate": 1.218854353422494e-05,
"loss": 0.4066,
"step": 2150
},
{
"epoch": 1.35,
"learning_rate": 1.2122655883098369e-05,
"loss": 0.3837,
"step": 2160
},
{
"epoch": 1.36,
"learning_rate": 1.205667159312293e-05,
"loss": 0.3882,
"step": 2170
},
{
"epoch": 1.36,
"learning_rate": 1.199059366838699e-05,
"loss": 0.3884,
"step": 2180
},
{
"epoch": 1.37,
"learning_rate": 1.1924425117241837e-05,
"loss": 0.4101,
"step": 2190
},
{
"epoch": 1.38,
"learning_rate": 1.1858168952164757e-05,
"loss": 0.3678,
"step": 2200
},
{
"epoch": 1.38,
"learning_rate": 1.1791828189621848e-05,
"loss": 0.4123,
"step": 2210
},
{
"epoch": 1.39,
"learning_rate": 1.1725405849930713e-05,
"loss": 0.4034,
"step": 2220
},
{
"epoch": 1.39,
"learning_rate": 1.1658904957122941e-05,
"loss": 0.3858,
"step": 2230
},
{
"epoch": 1.4,
"learning_rate": 1.1592328538806439e-05,
"loss": 0.4078,
"step": 2240
},
{
"epoch": 1.41,
"learning_rate": 1.1525679626027585e-05,
"loss": 0.404,
"step": 2250
},
{
"epoch": 1.41,
"learning_rate": 1.1458961253133238e-05,
"loss": 0.3735,
"step": 2260
},
{
"epoch": 1.42,
"learning_rate": 1.1392176457632586e-05,
"loss": 0.3782,
"step": 2270
},
{
"epoch": 1.43,
"learning_rate": 1.1325328280058867e-05,
"loss": 0.3717,
"step": 2280
},
{
"epoch": 1.43,
"learning_rate": 1.125841976383093e-05,
"loss": 0.3843,
"step": 2290
},
{
"epoch": 1.44,
"learning_rate": 1.1191453955114681e-05,
"loss": 0.368,
"step": 2300
},
{
"epoch": 1.44,
"learning_rate": 1.1124433902684411e-05,
"loss": 0.3863,
"step": 2310
},
{
"epoch": 1.45,
"learning_rate": 1.1057362657783967e-05,
"loss": 0.3432,
"step": 2320
},
{
"epoch": 1.46,
"learning_rate": 1.0990243273987863e-05,
"loss": 0.3875,
"step": 2330
},
{
"epoch": 1.46,
"learning_rate": 1.0923078807062245e-05,
"loss": 0.3518,
"step": 2340
},
{
"epoch": 1.47,
"learning_rate": 1.0855872314825772e-05,
"loss": 0.3623,
"step": 2350
},
{
"epoch": 1.48,
"learning_rate": 1.0788626857010404e-05,
"loss": 0.3577,
"step": 2360
},
{
"epoch": 1.48,
"learning_rate": 1.0721345495122098e-05,
"loss": 0.3666,
"step": 2370
},
{
"epoch": 1.49,
"learning_rate": 1.0654031292301432e-05,
"loss": 0.3714,
"step": 2380
},
{
"epoch": 1.49,
"learning_rate": 1.0586687313184141e-05,
"loss": 0.3783,
"step": 2390
},
{
"epoch": 1.5,
"learning_rate": 1.0519316623761593e-05,
"loss": 0.3453,
"step": 2400
},
{
"epoch": 1.51,
"learning_rate": 1.045192229124121e-05,
"loss": 0.385,
"step": 2410
},
{
"epoch": 1.51,
"learning_rate": 1.0384507383906819e-05,
"loss": 0.3752,
"step": 2420
},
{
"epoch": 1.52,
"learning_rate": 1.0317074970978957e-05,
"loss": 0.3714,
"step": 2430
},
{
"epoch": 1.52,
"learning_rate": 1.0249628122475154e-05,
"loss": 0.3871,
"step": 2440
},
{
"epoch": 1.53,
"learning_rate": 1.0188916154305646e-05,
"loss": 0.3366,
"step": 2450
},
{
"epoch": 1.54,
"learning_rate": 1.012145033834699e-05,
"loss": 0.3724,
"step": 2460
},
{
"epoch": 1.54,
"learning_rate": 1.0053978993079046e-05,
"loss": 0.3878,
"step": 2470
},
{
"epoch": 1.55,
"learning_rate": 9.986505190291822e-06,
"loss": 0.351,
"step": 2480
},
{
"epoch": 1.56,
"learning_rate": 9.919032001887215e-06,
"loss": 0.3289,
"step": 2490
},
{
"epoch": 1.56,
"learning_rate": 9.85156249973915e-06,
"loss": 0.351,
"step": 2500
},
{
"epoch": 1.57,
"learning_rate": 9.784099755553723e-06,
"loss": 0.3428,
"step": 2510
},
{
"epoch": 1.57,
"learning_rate": 9.71664684072936e-06,
"loss": 0.3405,
"step": 2520
},
{
"epoch": 1.58,
"learning_rate": 9.649206826216988e-06,
"loss": 0.3786,
"step": 2530
},
{
"epoch": 1.59,
"learning_rate": 9.581782782380208e-06,
"loss": 0.3498,
"step": 2540
},
{
"epoch": 1.59,
"learning_rate": 9.514377778855521e-06,
"loss": 0.3247,
"step": 2550
},
{
"epoch": 1.6,
"learning_rate": 9.446994884412575e-06,
"loss": 0.3601,
"step": 2560
},
{
"epoch": 1.61,
"learning_rate": 9.37963716681446e-06,
"loss": 0.3569,
"step": 2570
},
{
"epoch": 1.61,
"learning_rate": 9.312307692678016e-06,
"loss": 0.3534,
"step": 2580
},
{
"epoch": 1.62,
"learning_rate": 9.245009527334243e-06,
"loss": 0.3435,
"step": 2590
},
{
"epoch": 1.62,
"learning_rate": 9.17774573468873e-06,
"loss": 0.3236,
"step": 2600
},
{
"epoch": 1.63,
"learning_rate": 9.110519377082174e-06,
"loss": 0.3543,
"step": 2610
},
{
"epoch": 1.64,
"learning_rate": 9.04333351515095e-06,
"loss": 0.3193,
"step": 2620
},
{
"epoch": 1.64,
"learning_rate": 8.976191207687775e-06,
"loss": 0.3259,
"step": 2630
},
{
"epoch": 1.65,
"learning_rate": 8.909095511502452e-06,
"loss": 0.3576,
"step": 2640
},
{
"epoch": 1.66,
"learning_rate": 8.842049481282691e-06,
"loss": 0.3422,
"step": 2650
},
{
"epoch": 1.66,
"learning_rate": 8.775056169455045e-06,
"loss": 0.3463,
"step": 2660
},
{
"epoch": 1.67,
"learning_rate": 8.708118626045939e-06,
"loss": 0.3219,
"step": 2670
},
{
"epoch": 1.68,
"learning_rate": 8.641239898542814e-06,
"loss": 0.3258,
"step": 2680
},
{
"epoch": 1.68,
"learning_rate": 8.574423031755377e-06,
"loss": 0.3325,
"step": 2690
},
{
"epoch": 1.69,
"learning_rate": 8.50767106767698e-06,
"loss": 0.3271,
"step": 2700
},
{
"epoch": 1.69,
"learning_rate": 8.440987045346135e-06,
"loss": 0.3171,
"step": 2710
},
{
"epoch": 1.7,
"learning_rate": 8.374374000708146e-06,
"loss": 0.3464,
"step": 2720
},
{
"epoch": 1.71,
"learning_rate": 8.307834966476885e-06,
"loss": 0.3445,
"step": 2730
},
{
"epoch": 1.71,
"learning_rate": 8.241372971996735e-06,
"loss": 0.3334,
"step": 2740
},
{
"epoch": 1.72,
"learning_rate": 8.174991043104662e-06,
"loss": 0.3,
"step": 2750
},
{
"epoch": 1.73,
"learning_rate": 8.108692201992466e-06,
"loss": 0.3452,
"step": 2760
},
{
"epoch": 1.73,
"learning_rate": 8.04247946706917e-06,
"loss": 0.3307,
"step": 2770
},
{
"epoch": 1.74,
"learning_rate": 7.976355852823628e-06,
"loss": 0.3035,
"step": 2780
},
{
"epoch": 1.74,
"learning_rate": 7.91032436968725e-06,
"loss": 0.3533,
"step": 2790
},
{
"epoch": 1.75,
"learning_rate": 7.844388023896981e-06,
"loss": 0.3318,
"step": 2800
},
{
"epoch": 1.76,
"learning_rate": 7.778549817358404e-06,
"loss": 0.3277,
"step": 2810
},
{
"epoch": 1.76,
"learning_rate": 7.712812747509091e-06,
"loss": 0.3024,
"step": 2820
},
{
"epoch": 1.77,
"learning_rate": 7.647179807182125e-06,
"loss": 0.3337,
"step": 2830
},
{
"epoch": 1.77,
"learning_rate": 7.5816539844698565e-06,
"loss": 0.3133,
"step": 2840
},
{
"epoch": 1.78,
"learning_rate": 7.516238262587851e-06,
"loss": 0.3381,
"step": 2850
},
{
"epoch": 1.79,
"learning_rate": 7.450935619739087e-06,
"loss": 0.3025,
"step": 2860
},
{
"epoch": 1.79,
"learning_rate": 7.385749028978347e-06,
"loss": 0.3152,
"step": 2870
},
{
"epoch": 1.8,
"learning_rate": 7.320681458076871e-06,
"loss": 0.3325,
"step": 2880
},
{
"epoch": 1.81,
"learning_rate": 7.255735869387257e-06,
"loss": 0.2968,
"step": 2890
},
{
"epoch": 1.81,
"learning_rate": 7.190915219708564e-06,
"loss": 0.3332,
"step": 2900
},
{
"epoch": 1.82,
"learning_rate": 7.126222460151719e-06,
"loss": 0.3049,
"step": 2910
},
{
"epoch": 1.82,
"learning_rate": 7.061660536005151e-06,
"loss": 0.3051,
"step": 2920
},
{
"epoch": 1.83,
"learning_rate": 6.997232386600706e-06,
"loss": 0.3294,
"step": 2930
},
{
"epoch": 1.84,
"learning_rate": 6.932940945179818e-06,
"loss": 0.3166,
"step": 2940
},
{
"epoch": 1.84,
"learning_rate": 6.868789138759977e-06,
"loss": 0.3045,
"step": 2950
},
{
"epoch": 1.85,
"learning_rate": 6.804779888001461e-06,
"loss": 0.2994,
"step": 2960
},
{
"epoch": 1.86,
"learning_rate": 6.740916107074372e-06,
"loss": 0.3216,
"step": 2970
},
{
"epoch": 1.86,
"learning_rate": 6.677200703525959e-06,
"loss": 0.2938,
"step": 2980
},
{
"epoch": 1.87,
"learning_rate": 6.613636578148242e-06,
"loss": 0.3407,
"step": 2990
},
{
"epoch": 1.88,
"learning_rate": 6.550226624845961e-06,
"loss": 0.343,
"step": 3000
},
{
"epoch": 1.88,
"eval_loss": NaN,
"eval_runtime": 16.6263,
"eval_samples_per_second": 61.589,
"eval_steps_per_second": 3.849,
"step": 3000
},
{
"epoch": 1.88,
"learning_rate": 6.4869737305047996e-06,
"loss": 0.3083,
"step": 3010
},
{
"epoch": 1.89,
"learning_rate": 6.423880774859978e-06,
"loss": 0.2817,
"step": 3020
},
{
"epoch": 1.89,
"learning_rate": 6.360950630365126e-06,
"loss": 0.2781,
"step": 3030
},
{
"epoch": 1.9,
"learning_rate": 6.298186162061521e-06,
"loss": 0.2762,
"step": 3040
},
{
"epoch": 1.91,
"learning_rate": 6.241842155612241e-06,
"loss": 0.2512,
"step": 3050
},
{
"epoch": 1.91,
"learning_rate": 6.179400338141371e-06,
"loss": 0.2593,
"step": 3060
},
{
"epoch": 1.92,
"learning_rate": 6.1171324623613016e-06,
"loss": 0.2904,
"step": 3070
},
{
"epoch": 1.93,
"learning_rate": 6.055041363161986e-06,
"loss": 0.2917,
"step": 3080
},
{
"epoch": 1.93,
"learning_rate": 5.99312986738521e-06,
"loss": 0.3004,
"step": 3090
},
{
"epoch": 1.94,
"learning_rate": 5.9314007936959006e-06,
"loss": 0.2892,
"step": 3100
},
{
"epoch": 1.94,
"learning_rate": 5.869856952453792e-06,
"loss": 0.2986,
"step": 3110
},
{
"epoch": 1.95,
"learning_rate": 5.8085011455854766e-06,
"loss": 0.2674,
"step": 3120
},
{
"epoch": 1.96,
"learning_rate": 5.747336166456849e-06,
"loss": 0.2898,
"step": 3130
},
{
"epoch": 1.96,
"learning_rate": 5.686364799745923e-06,
"loss": 0.2871,
"step": 3140
},
{
"epoch": 1.97,
"learning_rate": 5.625589821316065e-06,
"loss": 0.2682,
"step": 3150
},
{
"epoch": 1.98,
"learning_rate": 5.5650139980895985e-06,
"loss": 0.3133,
"step": 3160
},
{
"epoch": 1.98,
"learning_rate": 5.504640087921847e-06,
"loss": 0.2823,
"step": 3170
},
{
"epoch": 1.99,
"learning_rate": 5.444470839475571e-06,
"loss": 0.3103,
"step": 3180
},
{
"epoch": 1.99,
"learning_rate": 5.38450899209583e-06,
"loss": 0.2896,
"step": 3190
},
{
"epoch": 2.0,
"learning_rate": 5.324757275685269e-06,
"loss": 0.2672,
"step": 3200
},
{
"epoch": 2.01,
"learning_rate": 5.265218410579827e-06,
"loss": 0.2438,
"step": 3210
},
{
"epoch": 2.01,
"learning_rate": 5.2058951074248985e-06,
"loss": 0.2523,
"step": 3220
},
{
"epoch": 2.02,
"learning_rate": 5.146790067051912e-06,
"loss": 0.233,
"step": 3230
},
{
"epoch": 2.02,
"learning_rate": 5.087905980355381e-06,
"loss": 0.2697,
"step": 3240
},
{
"epoch": 2.03,
"learning_rate": 5.029245528170383e-06,
"loss": 0.2586,
"step": 3250
},
{
"epoch": 2.04,
"learning_rate": 4.97081138115052e-06,
"loss": 0.2368,
"step": 3260
},
{
"epoch": 2.04,
"learning_rate": 4.9126061996463195e-06,
"loss": 0.2319,
"step": 3270
},
{
"epoch": 2.05,
"learning_rate": 4.854632633584118e-06,
"loss": 0.2774,
"step": 3280
},
{
"epoch": 2.06,
"learning_rate": 4.796893322345435e-06,
"loss": 0.2738,
"step": 3290
},
{
"epoch": 2.06,
"learning_rate": 4.739390894646779e-06,
"loss": 0.242,
"step": 3300
},
{
"epoch": 2.07,
"learning_rate": 4.68212796841999e-06,
"loss": 0.2434,
"step": 3310
},
{
"epoch": 2.08,
"learning_rate": 4.625107150693042e-06,
"loss": 0.284,
"step": 3320
},
{
"epoch": 2.08,
"learning_rate": 4.568331037471364e-06,
"loss": 0.2368,
"step": 3330
},
{
"epoch": 2.09,
"learning_rate": 4.511802213619635e-06,
"loss": 0.2386,
"step": 3340
},
{
"epoch": 2.09,
"learning_rate": 4.455523252744114e-06,
"loss": 0.2971,
"step": 3350
},
{
"epoch": 2.1,
"learning_rate": 4.399496717075465e-06,
"loss": 0.2351,
"step": 3360
},
{
"epoch": 2.11,
"learning_rate": 4.34372515735211e-06,
"loss": 0.2279,
"step": 3370
},
{
"epoch": 2.11,
"learning_rate": 4.288211112704092e-06,
"loss": 0.2699,
"step": 3380
},
{
"epoch": 2.12,
"learning_rate": 4.232957110537488e-06,
"loss": 0.2333,
"step": 3390
},
{
"epoch": 2.12,
"learning_rate": 4.17796566641933e-06,
"loss": 0.2472,
"step": 3400
},
{
"epoch": 2.13,
"learning_rate": 4.123239283963087e-06,
"loss": 0.248,
"step": 3410
},
{
"epoch": 2.14,
"learning_rate": 4.068780454714677e-06,
"loss": 0.2294,
"step": 3420
},
{
"epoch": 2.14,
"learning_rate": 4.0145916580390335e-06,
"loss": 0.2326,
"step": 3430
},
{
"epoch": 2.15,
"learning_rate": 3.96067536100724e-06,
"loss": 0.2489,
"step": 3440
},
{
"epoch": 2.16,
"learning_rate": 3.907034018284186e-06,
"loss": 0.2375,
"step": 3450
},
{
"epoch": 2.16,
"learning_rate": 3.853670072016833e-06,
"loss": 0.255,
"step": 3460
},
{
"epoch": 2.17,
"learning_rate": 3.8005859517230214e-06,
"loss": 0.258,
"step": 3470
},
{
"epoch": 2.17,
"learning_rate": 3.747784074180865e-06,
"loss": 0.2595,
"step": 3480
},
{
"epoch": 2.18,
"learning_rate": 3.6952668433187145e-06,
"loss": 0.2647,
"step": 3490
},
{
"epoch": 2.19,
"learning_rate": 3.643036650105725e-06,
"loss": 0.2052,
"step": 3500
},
{
"epoch": 2.19,
"learning_rate": 3.591095872442989e-06,
"loss": 0.26,
"step": 3510
},
{
"epoch": 2.2,
"learning_rate": 3.539446875055287e-06,
"loss": 0.2436,
"step": 3520
},
{
"epoch": 2.21,
"learning_rate": 3.488092009383418e-06,
"loss": 0.2306,
"step": 3530
},
{
"epoch": 2.21,
"learning_rate": 3.4370336134771567e-06,
"loss": 0.1932,
"step": 3540
},
{
"epoch": 2.22,
"learning_rate": 3.386274011888796e-06,
"loss": 0.1953,
"step": 3550
},
{
"epoch": 2.23,
"learning_rate": 3.3358155155673257e-06,
"loss": 0.2648,
"step": 3560
},
{
"epoch": 2.23,
"learning_rate": 3.285660421753216e-06,
"loss": 0.2133,
"step": 3570
},
{
"epoch": 2.24,
"learning_rate": 3.2358110138738297e-06,
"loss": 0.2209,
"step": 3580
},
{
"epoch": 2.24,
"learning_rate": 3.1862695614394745e-06,
"loss": 0.2401,
"step": 3590
},
{
"epoch": 2.25,
"learning_rate": 3.1370383199400613e-06,
"loss": 0.214,
"step": 3600
},
{
"epoch": 2.26,
"learning_rate": 3.0881195307424282e-06,
"loss": 0.23,
"step": 3610
},
{
"epoch": 2.26,
"learning_rate": 3.039515420988297e-06,
"loss": 0.2272,
"step": 3620
},
{
"epoch": 2.27,
"learning_rate": 2.991228203492873e-06,
"loss": 0.2326,
"step": 3630
},
{
"epoch": 2.27,
"learning_rate": 2.9432600766441066e-06,
"loss": 0.2299,
"step": 3640
},
{
"epoch": 2.28,
"learning_rate": 2.8956132243026025e-06,
"loss": 0.2481,
"step": 3650
},
{
"epoch": 2.29,
"learning_rate": 2.8482898157021945e-06,
"loss": 0.2266,
"step": 3660
},
{
"epoch": 2.29,
"learning_rate": 2.8012920053511916e-06,
"loss": 0.2223,
"step": 3670
},
{
"epoch": 2.3,
"learning_rate": 2.7546219329342792e-06,
"loss": 0.2181,
"step": 3680
},
{
"epoch": 2.31,
"learning_rate": 2.7082817232151128e-06,
"loss": 0.2287,
"step": 3690
},
{
"epoch": 2.31,
"learning_rate": 2.662273485939586e-06,
"loss": 0.2314,
"step": 3700
},
{
"epoch": 2.32,
"learning_rate": 2.616599315739766e-06,
"loss": 0.2275,
"step": 3710
},
{
"epoch": 2.33,
"learning_rate": 2.571261292038546e-06,
"loss": 0.213,
"step": 3720
},
{
"epoch": 2.33,
"learning_rate": 2.5262614789549624e-06,
"loss": 0.2008,
"step": 3730
},
{
"epoch": 2.34,
"learning_rate": 2.4816019252102274e-06,
"loss": 0.2189,
"step": 3740
},
{
"epoch": 2.34,
"learning_rate": 2.437284664034457e-06,
"loss": 0.2167,
"step": 3750
},
{
"epoch": 2.35,
"learning_rate": 2.3933117130741e-06,
"loss": 0.2059,
"step": 3760
},
{
"epoch": 2.36,
"learning_rate": 2.3496850743000775e-06,
"loss": 0.2265,
"step": 3770
},
{
"epoch": 2.36,
"learning_rate": 2.306406733916654e-06,
"loss": 0.1929,
"step": 3780
},
{
"epoch": 2.37,
"learning_rate": 2.263478662270987e-06,
"loss": 0.2038,
"step": 3790
},
{
"epoch": 2.38,
"learning_rate": 2.2209028137634402e-06,
"loss": 0.1998,
"step": 3800
},
{
"epoch": 2.38,
"learning_rate": 2.1786811267585984e-06,
"loss": 0.2111,
"step": 3810
},
{
"epoch": 2.39,
"learning_rate": 2.136815523497019e-06,
"loss": 0.1965,
"step": 3820
},
{
"epoch": 2.39,
"learning_rate": 2.09530791000772e-06,
"loss": 0.1969,
"step": 3830
},
{
"epoch": 2.4,
"learning_rate": 2.054160176021399e-06,
"loss": 0.2224,
"step": 3840
},
{
"epoch": 2.41,
"learning_rate": 2.0133741948844056e-06,
"loss": 0.2081,
"step": 3850
},
{
"epoch": 2.41,
"learning_rate": 1.9729518234734455e-06,
"loss": 0.2033,
"step": 3860
},
{
"epoch": 2.42,
"learning_rate": 1.9328949021110453e-06,
"loss": 0.1862,
"step": 3870
},
{
"epoch": 2.42,
"learning_rate": 1.8932052544817747e-06,
"loss": 0.2318,
"step": 3880
},
{
"epoch": 2.43,
"learning_rate": 1.8538846875492034e-06,
"loss": 0.2283,
"step": 3890
},
{
"epoch": 2.44,
"learning_rate": 1.814934991473647e-06,
"loss": 0.1989,
"step": 3900
},
{
"epoch": 2.44,
"learning_rate": 1.776357939530663e-06,
"loss": 0.1953,
"step": 3910
},
{
"epoch": 2.45,
"learning_rate": 1.7381552880303155e-06,
"loss": 0.2307,
"step": 3920
},
{
"epoch": 2.46,
"learning_rate": 1.7003287762372178e-06,
"loss": 0.2179,
"step": 3930
},
{
"epoch": 2.46,
"learning_rate": 1.6628801262913485e-06,
"loss": 0.2061,
"step": 3940
},
{
"epoch": 2.47,
"learning_rate": 1.625811043129647e-06,
"loss": 0.1814,
"step": 3950
},
{
"epoch": 2.48,
"learning_rate": 1.589123214408388e-06,
"loss": 0.2217,
"step": 3960
},
{
"epoch": 2.48,
"learning_rate": 1.552818310426356e-06,
"loss": 0.2369,
"step": 3970
},
{
"epoch": 2.49,
"learning_rate": 1.5168979840487897e-06,
"loss": 0.2065,
"step": 3980
},
{
"epoch": 2.49,
"learning_rate": 1.481363870632142e-06,
"loss": 0.2191,
"step": 3990
},
{
"epoch": 2.5,
"learning_rate": 1.4462175879496198e-06,
"loss": 0.2158,
"step": 4000
},
{
"epoch": 2.5,
"eval_loss": NaN,
"eval_runtime": 16.1092,
"eval_samples_per_second": 63.566,
"eval_steps_per_second": 3.973,
"step": 4000
},
{
"epoch": 2.51,
"learning_rate": 1.4114607361175314e-06,
"loss": 0.2249,
"step": 4010
},
{
"epoch": 2.51,
"learning_rate": 1.3770948975224462e-06,
"loss": 0.2279,
"step": 4020
},
{
"epoch": 2.52,
"learning_rate": 1.3431216367491384e-06,
"loss": 0.1812,
"step": 4030
},
{
"epoch": 2.52,
"learning_rate": 1.3095425005093676e-06,
"loss": 0.2139,
"step": 4040
},
{
"epoch": 2.53,
"learning_rate": 1.2763590175714547e-06,
"loss": 0.2395,
"step": 4050
},
{
"epoch": 2.54,
"learning_rate": 1.243572698690685e-06,
"loss": 0.1955,
"step": 4060
},
{
"epoch": 2.54,
"learning_rate": 1.2111850365405242e-06,
"loss": 0.2058,
"step": 4070
},
{
"epoch": 2.55,
"learning_rate": 1.1791975056446626e-06,
"loss": 0.2055,
"step": 4080
},
{
"epoch": 2.56,
"learning_rate": 1.147611562309887e-06,
"loss": 0.202,
"step": 4090
},
{
"epoch": 2.56,
"learning_rate": 1.1164286445597716e-06,
"loss": 0.2077,
"step": 4100
},
{
"epoch": 2.57,
"learning_rate": 1.0856501720692159e-06,
"loss": 0.1978,
"step": 4110
},
{
"epoch": 2.58,
"learning_rate": 1.0552775460998067e-06,
"loss": 0.1776,
"step": 4120
},
{
"epoch": 2.58,
"learning_rate": 1.025312149436024e-06,
"loss": 0.1917,
"step": 4130
},
{
"epoch": 2.59,
"learning_rate": 9.957553463222847e-07,
"loss": 0.2137,
"step": 4140
},
{
"epoch": 2.59,
"learning_rate": 9.666084824008349e-07,
"loss": 0.1914,
"step": 4150
},
{
"epoch": 2.6,
"learning_rate": 9.378728846504825e-07,
"loss": 0.2026,
"step": 4160
},
{
"epoch": 2.61,
"learning_rate": 9.095498613261911e-07,
"loss": 0.2462,
"step": 4170
},
{
"epoch": 2.61,
"learning_rate": 8.816407018995088e-07,
"loss": 0.1953,
"step": 4180
},
{
"epoch": 2.62,
"learning_rate": 8.541466769998685e-07,
"loss": 0.2095,
"step": 4190
},
{
"epoch": 2.62,
"learning_rate": 8.270690383567359e-07,
"loss": 0.1931,
"step": 4200
},
{
"epoch": 2.63,
"learning_rate": 8.004090187426238e-07,
"loss": 0.2122,
"step": 4210
},
{
"epoch": 2.64,
"learning_rate": 7.741678319169698e-07,
"loss": 0.2336,
"step": 4220
},
{
"epoch": 2.64,
"learning_rate": 7.48346672570871e-07,
"loss": 0.1691,
"step": 4230
},
{
"epoch": 2.65,
"learning_rate": 7.229467162726966e-07,
"loss": 0.1789,
"step": 4240
},
{
"epoch": 2.66,
"learning_rate": 6.979691194145677e-07,
"loss": 0.1936,
"step": 4250
},
{
"epoch": 2.66,
"learning_rate": 6.734150191597078e-07,
"loss": 0.1923,
"step": 4260
},
{
"epoch": 2.67,
"learning_rate": 6.492855333906733e-07,
"loss": 0.2141,
"step": 4270
},
{
"epoch": 2.67,
"learning_rate": 6.255817606584558e-07,
"loss": 0.2156,
"step": 4280
},
{
"epoch": 2.68,
"learning_rate": 6.023047801324744e-07,
"loss": 0.1892,
"step": 4290
},
{
"epoch": 2.69,
"learning_rate": 5.794556515514327e-07,
"loss": 0.2173,
"step": 4300
},
{
"epoch": 2.69,
"learning_rate": 5.570354151750878e-07,
"loss": 0.1992,
"step": 4310
},
{
"epoch": 2.7,
"learning_rate": 5.350450917368755e-07,
"loss": 0.1994,
"step": 4320
},
{
"epoch": 2.71,
"learning_rate": 5.134856823974444e-07,
"loss": 0.2123,
"step": 4330
},
{
"epoch": 2.71,
"learning_rate": 4.923581686990775e-07,
"loss": 0.1867,
"step": 4340
},
{
"epoch": 2.72,
"learning_rate": 4.716635125210034e-07,
"loss": 0.1895,
"step": 4350
},
{
"epoch": 2.73,
"learning_rate": 4.514026560356044e-07,
"loss": 0.2056,
"step": 4360
},
{
"epoch": 2.73,
"learning_rate": 4.315765216655232e-07,
"loss": 0.1885,
"step": 4370
},
{
"epoch": 2.74,
"learning_rate": 4.121860120416654e-07,
"loss": 0.1791,
"step": 4380
},
{
"epoch": 2.74,
"learning_rate": 3.9323200996210673e-07,
"loss": 0.201,
"step": 4390
},
{
"epoch": 2.75,
"learning_rate": 3.7471537835190087e-07,
"loss": 0.2065,
"step": 4400
},
{
"epoch": 2.76,
"learning_rate": 3.566369602237929e-07,
"loss": 0.2152,
"step": 4410
},
{
"epoch": 2.76,
"learning_rate": 3.389975786398403e-07,
"loss": 0.1918,
"step": 4420
},
{
"epoch": 2.77,
"learning_rate": 3.217980366739404e-07,
"loss": 0.2076,
"step": 4430
},
{
"epoch": 2.77,
"learning_rate": 3.0503911737526536e-07,
"loss": 0.2107,
"step": 4440
},
{
"epoch": 2.78,
"learning_rate": 2.8872158373261847e-07,
"loss": 0.1817,
"step": 4450
},
{
"epoch": 2.79,
"learning_rate": 2.7284617863969296e-07,
"loss": 0.1849,
"step": 4460
},
{
"epoch": 2.79,
"learning_rate": 2.5741362486125023e-07,
"loss": 0.2002,
"step": 4470
},
{
"epoch": 2.8,
"learning_rate": 2.424246250002138e-07,
"loss": 0.2012,
"step": 4480
},
{
"epoch": 2.81,
"learning_rate": 2.278798614656863e-07,
"loss": 0.1792,
"step": 4490
},
{
"epoch": 2.81,
"learning_rate": 2.1377999644187518e-07,
"loss": 0.1965,
"step": 4500
},
{
"epoch": 2.82,
"learning_rate": 2.0012567185794808e-07,
"loss": 0.2079,
"step": 4510
},
{
"epoch": 2.83,
"learning_rate": 1.8691750935880826e-07,
"loss": 0.1776,
"step": 4520
},
{
"epoch": 2.83,
"learning_rate": 1.741561102767908e-07,
"loss": 0.1856,
"step": 4530
},
{
"epoch": 2.84,
"learning_rate": 1.6184205560428655e-07,
"loss": 0.1927,
"step": 4540
},
{
"epoch": 2.84,
"learning_rate": 1.4997590596729227e-07,
"loss": 0.1844,
"step": 4550
},
{
"epoch": 2.85,
"learning_rate": 1.385582015998832e-07,
"loss": 0.2104,
"step": 4560
},
{
"epoch": 2.86,
"learning_rate": 1.2758946231962389e-07,
"loss": 0.192,
"step": 4570
},
{
"epoch": 2.86,
"learning_rate": 1.1707018750389376e-07,
"loss": 0.2106,
"step": 4580
},
{
"epoch": 2.87,
"learning_rate": 1.0700085606716092e-07,
"loss": 0.1914,
"step": 4590
},
{
"epoch": 2.88,
"learning_rate": 9.738192643917066e-08,
"loss": 0.1844,
"step": 4600
},
{
"epoch": 2.88,
"learning_rate": 8.8213836544081e-08,
"loss": 0.1783,
"step": 4610
},
{
"epoch": 2.89,
"learning_rate": 7.949700378051872e-08,
"loss": 0.1934,
"step": 4620
},
{
"epoch": 2.89,
"learning_rate": 7.123182500258119e-08,
"loss": 0.1946,
"step": 4630
},
{
"epoch": 2.9,
"learning_rate": 6.341867650176637e-08,
"loss": 0.2097,
"step": 4640
},
{
"epoch": 2.91,
"learning_rate": 5.605791398984317e-08,
"loss": 0.1757,
"step": 4650
},
{
"epoch": 2.91,
"learning_rate": 4.914987258265558e-08,
"loss": 0.1656,
"step": 4660
},
{
"epoch": 2.92,
"learning_rate": 4.2694866784862565e-08,
"loss": 0.1923,
"step": 4670
},
{
"epoch": 2.92,
"learning_rate": 3.669319047562514e-08,
"loss": 0.2078,
"step": 4680
},
{
"epoch": 2.93,
"learning_rate": 3.114511689522592e-08,
"loss": 0.2223,
"step": 4690
},
{
"epoch": 2.94,
"learning_rate": 2.6050898632625777e-08,
"loss": 0.2062,
"step": 4700
},
{
"epoch": 2.94,
"learning_rate": 2.1410767613965212e-08,
"loss": 0.1708,
"step": 4710
},
{
"epoch": 2.95,
"learning_rate": 1.722493509200729e-08,
"loss": 0.2104,
"step": 4720
},
{
"epoch": 2.96,
"learning_rate": 1.3493591636519753e-08,
"loss": 0.1751,
"step": 4730
},
{
"epoch": 2.96,
"learning_rate": 1.021690712559642e-08,
"loss": 0.2018,
"step": 4740
},
{
"epoch": 2.97,
"learning_rate": 7.395030737924469e-09,
"loss": 0.1806,
"step": 4750
},
{
"epoch": 2.98,
"learning_rate": 5.028090945993214e-09,
"loss": 0.1807,
"step": 4760
},
{
"epoch": 2.98,
"learning_rate": 3.1161955102465558e-09,
"loss": 0.19,
"step": 4770
},
{
"epoch": 2.99,
"learning_rate": 1.6594314741724682e-09,
"loss": 0.1996,
"step": 4780
},
{
"epoch": 2.99,
"learning_rate": 6.578651603439401e-10,
"loss": 0.2078,
"step": 4790
},
{
"epoch": 3.0,
"learning_rate": 1.115421673980599e-10,
"loss": 0.2101,
"step": 4800
},
{
"epoch": 3.0,
"step": 4800,
"total_flos": 9.142542195721503e+17,
"train_loss": 0.4117240861803293,
"train_runtime": 65160.1252,
"train_samples_per_second": 4.715,
"train_steps_per_second": 0.074
}
],
"max_steps": 4800,
"num_train_epochs": 3,
"total_flos": 9.142542195721503e+17,
"trial_name": null,
"trial_params": null
}