vit-base-beans-demo-v5 / trainer_state.json
rshrott's picture
🍻 cheers
fbc1625 verified
{
"best_metric": 0.8459659218788147,
"best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-900",
"epoch": 4.0,
"eval_steps": 100,
"global_step": 2348,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 1.6497292518615723,
"learning_rate": 0.00019914821124361162,
"loss": 1.6003,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 1.307145357131958,
"learning_rate": 0.00019829642248722317,
"loss": 1.2767,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 2.9354941844940186,
"learning_rate": 0.00019744463373083478,
"loss": 1.2612,
"step": 30
},
{
"epoch": 0.07,
"grad_norm": 1.3261815309524536,
"learning_rate": 0.00019659284497444633,
"loss": 1.2354,
"step": 40
},
{
"epoch": 0.09,
"grad_norm": 1.5586915016174316,
"learning_rate": 0.00019574105621805794,
"loss": 1.0959,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 1.490173578262329,
"learning_rate": 0.00019488926746166952,
"loss": 1.0438,
"step": 60
},
{
"epoch": 0.12,
"grad_norm": 2.0831446647644043,
"learning_rate": 0.0001940374787052811,
"loss": 1.0841,
"step": 70
},
{
"epoch": 0.14,
"grad_norm": 2.6207799911499023,
"learning_rate": 0.00019318568994889268,
"loss": 1.0983,
"step": 80
},
{
"epoch": 0.15,
"grad_norm": 1.7383110523223877,
"learning_rate": 0.00019233390119250426,
"loss": 1.1775,
"step": 90
},
{
"epoch": 0.17,
"grad_norm": 2.1954941749572754,
"learning_rate": 0.00019148211243611585,
"loss": 1.0616,
"step": 100
},
{
"epoch": 0.17,
"eval_accuracy": 0.5817717206132879,
"eval_loss": 1.0267014503479004,
"eval_runtime": 39.3874,
"eval_samples_per_second": 29.806,
"eval_steps_per_second": 3.732,
"step": 100
},
{
"epoch": 0.19,
"grad_norm": 1.597124695777893,
"learning_rate": 0.00019063032367972745,
"loss": 1.007,
"step": 110
},
{
"epoch": 0.2,
"grad_norm": 1.289490818977356,
"learning_rate": 0.000189778534923339,
"loss": 1.0065,
"step": 120
},
{
"epoch": 0.22,
"grad_norm": 1.7088607549667358,
"learning_rate": 0.00018892674616695061,
"loss": 1.0204,
"step": 130
},
{
"epoch": 0.24,
"grad_norm": 2.730241537094116,
"learning_rate": 0.00018807495741056217,
"loss": 0.8969,
"step": 140
},
{
"epoch": 0.26,
"grad_norm": 2.9691402912139893,
"learning_rate": 0.00018722316865417378,
"loss": 0.953,
"step": 150
},
{
"epoch": 0.27,
"grad_norm": 2.2519712448120117,
"learning_rate": 0.00018637137989778536,
"loss": 0.9269,
"step": 160
},
{
"epoch": 0.29,
"grad_norm": 1.8000602722167969,
"learning_rate": 0.00018551959114139694,
"loss": 1.1314,
"step": 170
},
{
"epoch": 0.31,
"grad_norm": 1.5348334312438965,
"learning_rate": 0.00018466780238500855,
"loss": 0.9615,
"step": 180
},
{
"epoch": 0.32,
"grad_norm": 1.599938988685608,
"learning_rate": 0.0001838160136286201,
"loss": 0.8033,
"step": 190
},
{
"epoch": 0.34,
"grad_norm": 1.50412917137146,
"learning_rate": 0.0001829642248722317,
"loss": 0.9594,
"step": 200
},
{
"epoch": 0.34,
"eval_accuracy": 0.6073253833049403,
"eval_loss": 0.9467767477035522,
"eval_runtime": 38.8829,
"eval_samples_per_second": 30.193,
"eval_steps_per_second": 3.781,
"step": 200
},
{
"epoch": 0.36,
"grad_norm": 2.1896722316741943,
"learning_rate": 0.0001821124361158433,
"loss": 0.9217,
"step": 210
},
{
"epoch": 0.37,
"grad_norm": 1.9687891006469727,
"learning_rate": 0.00018126064735945487,
"loss": 1.0296,
"step": 220
},
{
"epoch": 0.39,
"grad_norm": 1.9628914594650269,
"learning_rate": 0.00018040885860306645,
"loss": 0.8122,
"step": 230
},
{
"epoch": 0.41,
"grad_norm": 2.598545789718628,
"learning_rate": 0.00017955706984667803,
"loss": 0.8393,
"step": 240
},
{
"epoch": 0.43,
"grad_norm": 2.2483532428741455,
"learning_rate": 0.0001787052810902896,
"loss": 0.9047,
"step": 250
},
{
"epoch": 0.44,
"grad_norm": 2.1274337768554688,
"learning_rate": 0.0001778534923339012,
"loss": 0.91,
"step": 260
},
{
"epoch": 0.46,
"grad_norm": 2.436018466949463,
"learning_rate": 0.00017700170357751277,
"loss": 1.0615,
"step": 270
},
{
"epoch": 0.48,
"grad_norm": 2.069586992263794,
"learning_rate": 0.00017614991482112438,
"loss": 1.0799,
"step": 280
},
{
"epoch": 0.49,
"grad_norm": 1.7266385555267334,
"learning_rate": 0.00017529812606473594,
"loss": 0.9465,
"step": 290
},
{
"epoch": 0.51,
"grad_norm": 2.0491390228271484,
"learning_rate": 0.00017444633730834754,
"loss": 1.1785,
"step": 300
},
{
"epoch": 0.51,
"eval_accuracy": 0.5868824531516184,
"eval_loss": 0.997596025466919,
"eval_runtime": 39.3421,
"eval_samples_per_second": 29.841,
"eval_steps_per_second": 3.736,
"step": 300
},
{
"epoch": 0.53,
"grad_norm": 1.4697805643081665,
"learning_rate": 0.00017359454855195912,
"loss": 1.094,
"step": 310
},
{
"epoch": 0.55,
"grad_norm": 2.369339942932129,
"learning_rate": 0.0001727427597955707,
"loss": 0.9398,
"step": 320
},
{
"epoch": 0.56,
"grad_norm": 2.325148344039917,
"learning_rate": 0.00017189097103918229,
"loss": 0.9718,
"step": 330
},
{
"epoch": 0.58,
"grad_norm": 1.9404678344726562,
"learning_rate": 0.00017103918228279387,
"loss": 0.9091,
"step": 340
},
{
"epoch": 0.6,
"grad_norm": 2.4493370056152344,
"learning_rate": 0.00017018739352640547,
"loss": 0.9295,
"step": 350
},
{
"epoch": 0.61,
"grad_norm": 1.6286579370498657,
"learning_rate": 0.00016933560477001706,
"loss": 1.1049,
"step": 360
},
{
"epoch": 0.63,
"grad_norm": 3.559056043624878,
"learning_rate": 0.00016848381601362864,
"loss": 0.9566,
"step": 370
},
{
"epoch": 0.65,
"grad_norm": 1.4250924587249756,
"learning_rate": 0.00016763202725724022,
"loss": 0.7772,
"step": 380
},
{
"epoch": 0.66,
"grad_norm": 1.5668089389801025,
"learning_rate": 0.0001667802385008518,
"loss": 0.8869,
"step": 390
},
{
"epoch": 0.68,
"grad_norm": 2.725231885910034,
"learning_rate": 0.00016592844974446338,
"loss": 0.865,
"step": 400
},
{
"epoch": 0.68,
"eval_accuracy": 0.6388415672913118,
"eval_loss": 0.9287859201431274,
"eval_runtime": 38.5489,
"eval_samples_per_second": 30.455,
"eval_steps_per_second": 3.813,
"step": 400
},
{
"epoch": 0.7,
"grad_norm": 2.6907713413238525,
"learning_rate": 0.00016507666098807496,
"loss": 0.899,
"step": 410
},
{
"epoch": 0.72,
"grad_norm": 2.402860164642334,
"learning_rate": 0.00016422487223168654,
"loss": 0.9506,
"step": 420
},
{
"epoch": 0.73,
"grad_norm": 2.749433994293213,
"learning_rate": 0.00016337308347529815,
"loss": 0.8529,
"step": 430
},
{
"epoch": 0.75,
"grad_norm": 1.92979097366333,
"learning_rate": 0.0001625212947189097,
"loss": 0.8695,
"step": 440
},
{
"epoch": 0.77,
"grad_norm": 2.793747901916504,
"learning_rate": 0.0001616695059625213,
"loss": 0.8614,
"step": 450
},
{
"epoch": 0.78,
"grad_norm": 2.483780860900879,
"learning_rate": 0.0001608177172061329,
"loss": 0.9176,
"step": 460
},
{
"epoch": 0.8,
"grad_norm": 1.7278929948806763,
"learning_rate": 0.00015996592844974447,
"loss": 0.9656,
"step": 470
},
{
"epoch": 0.82,
"grad_norm": 2.649017810821533,
"learning_rate": 0.00015911413969335605,
"loss": 0.8653,
"step": 480
},
{
"epoch": 0.83,
"grad_norm": 1.8457053899765015,
"learning_rate": 0.00015826235093696763,
"loss": 0.7707,
"step": 490
},
{
"epoch": 0.85,
"grad_norm": 2.824699640274048,
"learning_rate": 0.00015741056218057921,
"loss": 0.8494,
"step": 500
},
{
"epoch": 0.85,
"eval_accuracy": 0.651618398637138,
"eval_loss": 0.8572959303855896,
"eval_runtime": 38.0883,
"eval_samples_per_second": 30.823,
"eval_steps_per_second": 3.859,
"step": 500
},
{
"epoch": 0.87,
"grad_norm": 1.9104124307632446,
"learning_rate": 0.0001565587734241908,
"loss": 0.8113,
"step": 510
},
{
"epoch": 0.89,
"grad_norm": 2.2717394828796387,
"learning_rate": 0.0001557069846678024,
"loss": 0.9194,
"step": 520
},
{
"epoch": 0.9,
"grad_norm": 1.891735553741455,
"learning_rate": 0.00015485519591141398,
"loss": 0.9337,
"step": 530
},
{
"epoch": 0.92,
"grad_norm": 4.8229146003723145,
"learning_rate": 0.00015400340715502557,
"loss": 0.9033,
"step": 540
},
{
"epoch": 0.94,
"grad_norm": 2.656970977783203,
"learning_rate": 0.00015315161839863715,
"loss": 0.8985,
"step": 550
},
{
"epoch": 0.95,
"grad_norm": 2.2908411026000977,
"learning_rate": 0.00015229982964224873,
"loss": 0.8708,
"step": 560
},
{
"epoch": 0.97,
"grad_norm": 2.141950845718384,
"learning_rate": 0.0001514480408858603,
"loss": 0.9298,
"step": 570
},
{
"epoch": 0.99,
"grad_norm": 2.5572831630706787,
"learning_rate": 0.00015059625212947192,
"loss": 0.9101,
"step": 580
},
{
"epoch": 1.01,
"grad_norm": 2.2453222274780273,
"learning_rate": 0.00014982964224872234,
"loss": 0.8034,
"step": 590
},
{
"epoch": 1.02,
"grad_norm": 2.2874865531921387,
"learning_rate": 0.00014897785349233392,
"loss": 0.8151,
"step": 600
},
{
"epoch": 1.02,
"eval_accuracy": 0.6396933560477002,
"eval_loss": 0.87294602394104,
"eval_runtime": 38.7251,
"eval_samples_per_second": 30.316,
"eval_steps_per_second": 3.796,
"step": 600
},
{
"epoch": 1.04,
"grad_norm": 2.950303554534912,
"learning_rate": 0.0001481260647359455,
"loss": 0.7484,
"step": 610
},
{
"epoch": 1.06,
"grad_norm": 1.9773017168045044,
"learning_rate": 0.00014727427597955708,
"loss": 0.6572,
"step": 620
},
{
"epoch": 1.07,
"grad_norm": 2.9777700901031494,
"learning_rate": 0.00014642248722316866,
"loss": 0.6927,
"step": 630
},
{
"epoch": 1.09,
"grad_norm": 3.323662519454956,
"learning_rate": 0.00014557069846678024,
"loss": 0.5812,
"step": 640
},
{
"epoch": 1.11,
"grad_norm": 1.9647018909454346,
"learning_rate": 0.00014471890971039185,
"loss": 0.6166,
"step": 650
},
{
"epoch": 1.12,
"grad_norm": 3.215794563293457,
"learning_rate": 0.0001438671209540034,
"loss": 0.602,
"step": 660
},
{
"epoch": 1.14,
"grad_norm": 2.8758130073547363,
"learning_rate": 0.000143015332197615,
"loss": 0.5224,
"step": 670
},
{
"epoch": 1.16,
"grad_norm": 2.142829179763794,
"learning_rate": 0.00014216354344122656,
"loss": 0.5663,
"step": 680
},
{
"epoch": 1.18,
"grad_norm": 6.860159397125244,
"learning_rate": 0.00014131175468483817,
"loss": 0.6479,
"step": 690
},
{
"epoch": 1.19,
"grad_norm": 3.3176701068878174,
"learning_rate": 0.00014045996592844975,
"loss": 0.5787,
"step": 700
},
{
"epoch": 1.19,
"eval_accuracy": 0.6448040885860307,
"eval_loss": 0.9067147970199585,
"eval_runtime": 38.2427,
"eval_samples_per_second": 30.699,
"eval_steps_per_second": 3.844,
"step": 700
},
{
"epoch": 1.21,
"grad_norm": 2.322371482849121,
"learning_rate": 0.00013960817717206133,
"loss": 0.6849,
"step": 710
},
{
"epoch": 1.23,
"grad_norm": 1.875775933265686,
"learning_rate": 0.00013875638841567291,
"loss": 0.6399,
"step": 720
},
{
"epoch": 1.24,
"grad_norm": 2.0012145042419434,
"learning_rate": 0.0001379045996592845,
"loss": 0.725,
"step": 730
},
{
"epoch": 1.26,
"grad_norm": 2.5320353507995605,
"learning_rate": 0.00013705281090289608,
"loss": 0.5306,
"step": 740
},
{
"epoch": 1.28,
"grad_norm": 2.29856538772583,
"learning_rate": 0.00013620102214650768,
"loss": 0.5731,
"step": 750
},
{
"epoch": 1.29,
"grad_norm": 1.8604925870895386,
"learning_rate": 0.00013534923339011926,
"loss": 0.6806,
"step": 760
},
{
"epoch": 1.31,
"grad_norm": 2.6868739128112793,
"learning_rate": 0.00013449744463373084,
"loss": 0.5944,
"step": 770
},
{
"epoch": 1.33,
"grad_norm": 3.3680803775787354,
"learning_rate": 0.00013364565587734243,
"loss": 0.6412,
"step": 780
},
{
"epoch": 1.35,
"grad_norm": 2.798149824142456,
"learning_rate": 0.000132793867120954,
"loss": 0.5235,
"step": 790
},
{
"epoch": 1.36,
"grad_norm": 2.4862072467803955,
"learning_rate": 0.00013194207836456561,
"loss": 0.7768,
"step": 800
},
{
"epoch": 1.36,
"eval_accuracy": 0.6533219761499148,
"eval_loss": 0.8995758295059204,
"eval_runtime": 38.4107,
"eval_samples_per_second": 30.564,
"eval_steps_per_second": 3.827,
"step": 800
},
{
"epoch": 1.38,
"grad_norm": 3.291276216506958,
"learning_rate": 0.00013109028960817717,
"loss": 0.669,
"step": 810
},
{
"epoch": 1.4,
"grad_norm": 2.814397096633911,
"learning_rate": 0.00013023850085178878,
"loss": 0.5539,
"step": 820
},
{
"epoch": 1.41,
"grad_norm": 2.5982093811035156,
"learning_rate": 0.00012938671209540033,
"loss": 0.6565,
"step": 830
},
{
"epoch": 1.43,
"grad_norm": 3.1191565990448,
"learning_rate": 0.00012853492333901194,
"loss": 0.533,
"step": 840
},
{
"epoch": 1.45,
"grad_norm": 5.229197025299072,
"learning_rate": 0.00012768313458262352,
"loss": 0.6123,
"step": 850
},
{
"epoch": 1.47,
"grad_norm": 2.259110689163208,
"learning_rate": 0.0001268313458262351,
"loss": 0.5183,
"step": 860
},
{
"epoch": 1.48,
"grad_norm": 3.099496364593506,
"learning_rate": 0.00012597955706984668,
"loss": 0.6911,
"step": 870
},
{
"epoch": 1.5,
"grad_norm": 2.9909987449645996,
"learning_rate": 0.00012512776831345826,
"loss": 0.6671,
"step": 880
},
{
"epoch": 1.52,
"grad_norm": 3.1856462955474854,
"learning_rate": 0.00012427597955706984,
"loss": 0.6652,
"step": 890
},
{
"epoch": 1.53,
"grad_norm": 3.9080755710601807,
"learning_rate": 0.00012342419080068145,
"loss": 0.6098,
"step": 900
},
{
"epoch": 1.53,
"eval_accuracy": 0.6695059625212947,
"eval_loss": 0.8459659218788147,
"eval_runtime": 37.8733,
"eval_samples_per_second": 30.998,
"eval_steps_per_second": 3.881,
"step": 900
},
{
"epoch": 1.55,
"grad_norm": 1.7587580680847168,
"learning_rate": 0.000122572402044293,
"loss": 0.7362,
"step": 910
},
{
"epoch": 1.57,
"grad_norm": 2.7327494621276855,
"learning_rate": 0.00012172061328790461,
"loss": 0.5863,
"step": 920
},
{
"epoch": 1.58,
"grad_norm": 4.113401889801025,
"learning_rate": 0.0001208688245315162,
"loss": 0.8205,
"step": 930
},
{
"epoch": 1.6,
"grad_norm": 4.598094940185547,
"learning_rate": 0.00012001703577512777,
"loss": 0.7198,
"step": 940
},
{
"epoch": 1.62,
"grad_norm": 2.8792037963867188,
"learning_rate": 0.00011916524701873937,
"loss": 0.6532,
"step": 950
},
{
"epoch": 1.64,
"grad_norm": 2.949414014816284,
"learning_rate": 0.00011831345826235094,
"loss": 0.6783,
"step": 960
},
{
"epoch": 1.65,
"grad_norm": 2.300352096557617,
"learning_rate": 0.00011746166950596253,
"loss": 0.69,
"step": 970
},
{
"epoch": 1.67,
"grad_norm": 2.5100274085998535,
"learning_rate": 0.00011660988074957411,
"loss": 0.7028,
"step": 980
},
{
"epoch": 1.69,
"grad_norm": 2.372359275817871,
"learning_rate": 0.0001157580919931857,
"loss": 0.5673,
"step": 990
},
{
"epoch": 1.7,
"grad_norm": 4.268792152404785,
"learning_rate": 0.00011490630323679727,
"loss": 0.6251,
"step": 1000
},
{
"epoch": 1.7,
"eval_accuracy": 0.6703577512776832,
"eval_loss": 0.8609783053398132,
"eval_runtime": 37.811,
"eval_samples_per_second": 31.049,
"eval_steps_per_second": 3.888,
"step": 1000
},
{
"epoch": 1.72,
"grad_norm": 3.081153154373169,
"learning_rate": 0.00011405451448040887,
"loss": 0.7021,
"step": 1010
},
{
"epoch": 1.74,
"grad_norm": 2.9631364345550537,
"learning_rate": 0.0001132879045996593,
"loss": 0.5469,
"step": 1020
},
{
"epoch": 1.75,
"grad_norm": 3.2896649837493896,
"learning_rate": 0.00011243611584327087,
"loss": 0.5593,
"step": 1030
},
{
"epoch": 1.77,
"grad_norm": 3.8375134468078613,
"learning_rate": 0.00011158432708688246,
"loss": 0.5499,
"step": 1040
},
{
"epoch": 1.79,
"grad_norm": 1.5597748756408691,
"learning_rate": 0.00011073253833049404,
"loss": 0.5529,
"step": 1050
},
{
"epoch": 1.81,
"grad_norm": 4.54299783706665,
"learning_rate": 0.00010988074957410564,
"loss": 0.6211,
"step": 1060
},
{
"epoch": 1.82,
"grad_norm": 3.2734501361846924,
"learning_rate": 0.0001090289608177172,
"loss": 0.7002,
"step": 1070
},
{
"epoch": 1.84,
"grad_norm": 3.7582859992980957,
"learning_rate": 0.0001081771720613288,
"loss": 0.7465,
"step": 1080
},
{
"epoch": 1.86,
"grad_norm": 2.190544605255127,
"learning_rate": 0.00010732538330494038,
"loss": 0.6662,
"step": 1090
},
{
"epoch": 1.87,
"grad_norm": 1.7477951049804688,
"learning_rate": 0.00010647359454855197,
"loss": 0.7863,
"step": 1100
},
{
"epoch": 1.87,
"eval_accuracy": 0.6431005110732538,
"eval_loss": 0.8668282628059387,
"eval_runtime": 37.5178,
"eval_samples_per_second": 31.292,
"eval_steps_per_second": 3.918,
"step": 1100
},
{
"epoch": 1.89,
"grad_norm": 1.9970145225524902,
"learning_rate": 0.00010562180579216354,
"loss": 0.5988,
"step": 1110
},
{
"epoch": 1.91,
"grad_norm": 3.718055248260498,
"learning_rate": 0.00010477001703577514,
"loss": 0.5973,
"step": 1120
},
{
"epoch": 1.93,
"grad_norm": 1.6347967386245728,
"learning_rate": 0.0001039182282793867,
"loss": 0.5818,
"step": 1130
},
{
"epoch": 1.94,
"grad_norm": 2.3118577003479004,
"learning_rate": 0.0001030664395229983,
"loss": 0.5136,
"step": 1140
},
{
"epoch": 1.96,
"grad_norm": 2.806833267211914,
"learning_rate": 0.00010221465076660988,
"loss": 0.5353,
"step": 1150
},
{
"epoch": 1.98,
"grad_norm": 2.699890375137329,
"learning_rate": 0.00010136286201022147,
"loss": 0.5498,
"step": 1160
},
{
"epoch": 1.99,
"grad_norm": 2.3461856842041016,
"learning_rate": 0.00010051107325383304,
"loss": 0.7181,
"step": 1170
},
{
"epoch": 2.01,
"grad_norm": 2.483959436416626,
"learning_rate": 9.965928449744463e-05,
"loss": 0.3872,
"step": 1180
},
{
"epoch": 2.03,
"grad_norm": 2.1393377780914307,
"learning_rate": 9.880749574105622e-05,
"loss": 0.292,
"step": 1190
},
{
"epoch": 2.04,
"grad_norm": 1.6828927993774414,
"learning_rate": 9.795570698466781e-05,
"loss": 0.2595,
"step": 1200
},
{
"epoch": 2.04,
"eval_accuracy": 0.6839863713798978,
"eval_loss": 0.8725138902664185,
"eval_runtime": 37.2408,
"eval_samples_per_second": 31.525,
"eval_steps_per_second": 3.947,
"step": 1200
},
{
"epoch": 2.06,
"grad_norm": 1.973240852355957,
"learning_rate": 9.710391822827939e-05,
"loss": 0.2644,
"step": 1210
},
{
"epoch": 2.08,
"grad_norm": 2.932751417160034,
"learning_rate": 9.625212947189097e-05,
"loss": 0.2925,
"step": 1220
},
{
"epoch": 2.1,
"grad_norm": 3.356760025024414,
"learning_rate": 9.540034071550255e-05,
"loss": 0.312,
"step": 1230
},
{
"epoch": 2.11,
"grad_norm": 1.7125446796417236,
"learning_rate": 9.454855195911415e-05,
"loss": 0.2277,
"step": 1240
},
{
"epoch": 2.13,
"grad_norm": 1.714805006980896,
"learning_rate": 9.369676320272573e-05,
"loss": 0.3301,
"step": 1250
},
{
"epoch": 2.15,
"grad_norm": 2.301734685897827,
"learning_rate": 9.284497444633732e-05,
"loss": 0.2668,
"step": 1260
},
{
"epoch": 2.16,
"grad_norm": 2.4843878746032715,
"learning_rate": 9.19931856899489e-05,
"loss": 0.2333,
"step": 1270
},
{
"epoch": 2.18,
"grad_norm": 2.9054977893829346,
"learning_rate": 9.114139693356048e-05,
"loss": 0.3492,
"step": 1280
},
{
"epoch": 2.2,
"grad_norm": 4.664933681488037,
"learning_rate": 9.028960817717206e-05,
"loss": 0.3754,
"step": 1290
},
{
"epoch": 2.21,
"grad_norm": 2.1164679527282715,
"learning_rate": 8.943781942078366e-05,
"loss": 0.2735,
"step": 1300
},
{
"epoch": 2.21,
"eval_accuracy": 0.6746166950596252,
"eval_loss": 0.9306557178497314,
"eval_runtime": 37.0944,
"eval_samples_per_second": 31.649,
"eval_steps_per_second": 3.963,
"step": 1300
},
{
"epoch": 2.23,
"grad_norm": 4.541740894317627,
"learning_rate": 8.858603066439524e-05,
"loss": 0.3835,
"step": 1310
},
{
"epoch": 2.25,
"grad_norm": 3.0828359127044678,
"learning_rate": 8.773424190800682e-05,
"loss": 0.3189,
"step": 1320
},
{
"epoch": 2.27,
"grad_norm": 2.398512363433838,
"learning_rate": 8.68824531516184e-05,
"loss": 0.29,
"step": 1330
},
{
"epoch": 2.28,
"grad_norm": 3.069840908050537,
"learning_rate": 8.603066439522998e-05,
"loss": 0.288,
"step": 1340
},
{
"epoch": 2.3,
"grad_norm": 5.078506946563721,
"learning_rate": 8.517887563884158e-05,
"loss": 0.2772,
"step": 1350
},
{
"epoch": 2.32,
"grad_norm": 2.812199354171753,
"learning_rate": 8.432708688245316e-05,
"loss": 0.2951,
"step": 1360
},
{
"epoch": 2.33,
"grad_norm": 4.542017936706543,
"learning_rate": 8.347529812606474e-05,
"loss": 0.2142,
"step": 1370
},
{
"epoch": 2.35,
"grad_norm": 3.7486696243286133,
"learning_rate": 8.262350936967632e-05,
"loss": 0.257,
"step": 1380
},
{
"epoch": 2.37,
"grad_norm": 3.5566983222961426,
"learning_rate": 8.17717206132879e-05,
"loss": 0.2816,
"step": 1390
},
{
"epoch": 2.39,
"grad_norm": 1.3465384244918823,
"learning_rate": 8.09199318568995e-05,
"loss": 0.2429,
"step": 1400
},
{
"epoch": 2.39,
"eval_accuracy": 0.6354344122657581,
"eval_loss": 1.0957823991775513,
"eval_runtime": 37.2033,
"eval_samples_per_second": 31.556,
"eval_steps_per_second": 3.951,
"step": 1400
},
{
"epoch": 2.4,
"grad_norm": 2.310131788253784,
"learning_rate": 8.006814310051108e-05,
"loss": 0.306,
"step": 1410
},
{
"epoch": 2.42,
"grad_norm": 3.1297261714935303,
"learning_rate": 7.921635434412266e-05,
"loss": 0.3257,
"step": 1420
},
{
"epoch": 2.44,
"grad_norm": 1.8082480430603027,
"learning_rate": 7.836456558773425e-05,
"loss": 0.2001,
"step": 1430
},
{
"epoch": 2.45,
"grad_norm": 1.7700148820877075,
"learning_rate": 7.751277683134583e-05,
"loss": 0.3476,
"step": 1440
},
{
"epoch": 2.47,
"grad_norm": 4.247625350952148,
"learning_rate": 7.666098807495741e-05,
"loss": 0.2323,
"step": 1450
},
{
"epoch": 2.49,
"grad_norm": 4.059571743011475,
"learning_rate": 7.5809199318569e-05,
"loss": 0.3089,
"step": 1460
},
{
"epoch": 2.5,
"grad_norm": 3.2417612075805664,
"learning_rate": 7.495741056218059e-05,
"loss": 0.1964,
"step": 1470
},
{
"epoch": 2.52,
"grad_norm": 5.7817463874816895,
"learning_rate": 7.410562180579217e-05,
"loss": 0.3549,
"step": 1480
},
{
"epoch": 2.54,
"grad_norm": 5.440825939178467,
"learning_rate": 7.325383304940375e-05,
"loss": 0.3085,
"step": 1490
},
{
"epoch": 2.56,
"grad_norm": 4.482067108154297,
"learning_rate": 7.240204429301533e-05,
"loss": 0.3224,
"step": 1500
},
{
"epoch": 2.56,
"eval_accuracy": 0.6686541737649063,
"eval_loss": 1.0305246114730835,
"eval_runtime": 37.1181,
"eval_samples_per_second": 31.629,
"eval_steps_per_second": 3.96,
"step": 1500
},
{
"epoch": 2.57,
"grad_norm": 2.1568057537078857,
"learning_rate": 7.155025553662692e-05,
"loss": 0.1612,
"step": 1510
},
{
"epoch": 2.59,
"grad_norm": 1.293427586555481,
"learning_rate": 7.06984667802385e-05,
"loss": 0.3217,
"step": 1520
},
{
"epoch": 2.61,
"grad_norm": 4.301244258880615,
"learning_rate": 6.984667802385009e-05,
"loss": 0.2378,
"step": 1530
},
{
"epoch": 2.62,
"grad_norm": 1.6040468215942383,
"learning_rate": 6.899488926746167e-05,
"loss": 0.2801,
"step": 1540
},
{
"epoch": 2.64,
"grad_norm": 0.7993047833442688,
"learning_rate": 6.814310051107326e-05,
"loss": 0.2637,
"step": 1550
},
{
"epoch": 2.66,
"grad_norm": 4.865533828735352,
"learning_rate": 6.729131175468484e-05,
"loss": 0.3441,
"step": 1560
},
{
"epoch": 2.67,
"grad_norm": 1.7501546144485474,
"learning_rate": 6.643952299829642e-05,
"loss": 0.2523,
"step": 1570
},
{
"epoch": 2.69,
"grad_norm": 1.331475019454956,
"learning_rate": 6.5587734241908e-05,
"loss": 0.2127,
"step": 1580
},
{
"epoch": 2.71,
"grad_norm": 3.352147102355957,
"learning_rate": 6.473594548551958e-05,
"loss": 0.3432,
"step": 1590
},
{
"epoch": 2.73,
"grad_norm": 0.3470512330532074,
"learning_rate": 6.388415672913118e-05,
"loss": 0.1602,
"step": 1600
},
{
"epoch": 2.73,
"eval_accuracy": 0.6746166950596252,
"eval_loss": 1.0072139501571655,
"eval_runtime": 37.0019,
"eval_samples_per_second": 31.728,
"eval_steps_per_second": 3.973,
"step": 1600
},
{
"epoch": 2.74,
"grad_norm": 3.1594250202178955,
"learning_rate": 6.303236797274277e-05,
"loss": 0.1929,
"step": 1610
},
{
"epoch": 2.76,
"grad_norm": 4.477923393249512,
"learning_rate": 6.218057921635435e-05,
"loss": 0.2696,
"step": 1620
},
{
"epoch": 2.78,
"grad_norm": 3.042938232421875,
"learning_rate": 6.132879045996594e-05,
"loss": 0.2527,
"step": 1630
},
{
"epoch": 2.79,
"grad_norm": 0.8534514904022217,
"learning_rate": 6.0477001703577516e-05,
"loss": 0.1727,
"step": 1640
},
{
"epoch": 2.81,
"grad_norm": 2.2307116985321045,
"learning_rate": 5.9625212947189104e-05,
"loss": 0.3178,
"step": 1650
},
{
"epoch": 2.83,
"grad_norm": 3.302003860473633,
"learning_rate": 5.8773424190800684e-05,
"loss": 0.2973,
"step": 1660
},
{
"epoch": 2.84,
"grad_norm": 5.320656776428223,
"learning_rate": 5.792163543441227e-05,
"loss": 0.2715,
"step": 1670
},
{
"epoch": 2.86,
"grad_norm": 3.923163414001465,
"learning_rate": 5.706984667802385e-05,
"loss": 0.1991,
"step": 1680
},
{
"epoch": 2.88,
"grad_norm": 7.479254245758057,
"learning_rate": 5.6218057921635434e-05,
"loss": 0.321,
"step": 1690
},
{
"epoch": 2.9,
"grad_norm": 2.2710225582122803,
"learning_rate": 5.536626916524702e-05,
"loss": 0.2042,
"step": 1700
},
{
"epoch": 2.9,
"eval_accuracy": 0.6788756388415673,
"eval_loss": 1.0971218347549438,
"eval_runtime": 36.9173,
"eval_samples_per_second": 31.801,
"eval_steps_per_second": 3.982,
"step": 1700
},
{
"epoch": 2.91,
"grad_norm": 2.7610058784484863,
"learning_rate": 5.45144804088586e-05,
"loss": 0.3396,
"step": 1710
},
{
"epoch": 2.93,
"grad_norm": 2.2475104331970215,
"learning_rate": 5.366269165247019e-05,
"loss": 0.266,
"step": 1720
},
{
"epoch": 2.95,
"grad_norm": 4.55673885345459,
"learning_rate": 5.281090289608177e-05,
"loss": 0.341,
"step": 1730
},
{
"epoch": 2.96,
"grad_norm": 4.0248260498046875,
"learning_rate": 5.195911413969335e-05,
"loss": 0.2005,
"step": 1740
},
{
"epoch": 2.98,
"grad_norm": 4.798257827758789,
"learning_rate": 5.110732538330494e-05,
"loss": 0.2615,
"step": 1750
},
{
"epoch": 3.0,
"grad_norm": 3.2967402935028076,
"learning_rate": 5.025553662691652e-05,
"loss": 0.1966,
"step": 1760
},
{
"epoch": 3.02,
"grad_norm": 5.774517059326172,
"learning_rate": 4.940374787052811e-05,
"loss": 0.1141,
"step": 1770
},
{
"epoch": 3.03,
"grad_norm": 1.7739803791046143,
"learning_rate": 4.8551959114139695e-05,
"loss": 0.0671,
"step": 1780
},
{
"epoch": 3.05,
"grad_norm": 0.8837150931358337,
"learning_rate": 4.7700170357751276e-05,
"loss": 0.0835,
"step": 1790
},
{
"epoch": 3.07,
"grad_norm": 1.7833037376403809,
"learning_rate": 4.6848381601362864e-05,
"loss": 0.0604,
"step": 1800
},
{
"epoch": 3.07,
"eval_accuracy": 0.6916524701873935,
"eval_loss": 1.0816737413406372,
"eval_runtime": 36.8222,
"eval_samples_per_second": 31.883,
"eval_steps_per_second": 3.992,
"step": 1800
},
{
"epoch": 3.08,
"grad_norm": 0.34585830569267273,
"learning_rate": 4.599659284497445e-05,
"loss": 0.092,
"step": 1810
},
{
"epoch": 3.1,
"grad_norm": 0.7962571382522583,
"learning_rate": 4.514480408858603e-05,
"loss": 0.0587,
"step": 1820
},
{
"epoch": 3.12,
"grad_norm": 0.16402888298034668,
"learning_rate": 4.429301533219762e-05,
"loss": 0.0547,
"step": 1830
},
{
"epoch": 3.13,
"grad_norm": 0.624047040939331,
"learning_rate": 4.34412265758092e-05,
"loss": 0.0954,
"step": 1840
},
{
"epoch": 3.15,
"grad_norm": 0.4253842532634735,
"learning_rate": 4.258943781942079e-05,
"loss": 0.0567,
"step": 1850
},
{
"epoch": 3.17,
"grad_norm": 0.1523701399564743,
"learning_rate": 4.173764906303237e-05,
"loss": 0.0413,
"step": 1860
},
{
"epoch": 3.19,
"grad_norm": 4.592818260192871,
"learning_rate": 4.088586030664395e-05,
"loss": 0.0968,
"step": 1870
},
{
"epoch": 3.2,
"grad_norm": 1.4066344499588013,
"learning_rate": 4.003407155025554e-05,
"loss": 0.1454,
"step": 1880
},
{
"epoch": 3.22,
"grad_norm": 2.1996095180511475,
"learning_rate": 3.9182282793867125e-05,
"loss": 0.1128,
"step": 1890
},
{
"epoch": 3.24,
"grad_norm": 0.102027028799057,
"learning_rate": 3.8330494037478706e-05,
"loss": 0.0716,
"step": 1900
},
{
"epoch": 3.24,
"eval_accuracy": 0.692504258943782,
"eval_loss": 1.1307132244110107,
"eval_runtime": 37.0403,
"eval_samples_per_second": 31.695,
"eval_steps_per_second": 3.969,
"step": 1900
},
{
"epoch": 3.25,
"grad_norm": 1.6857343912124634,
"learning_rate": 3.7478705281090294e-05,
"loss": 0.04,
"step": 1910
},
{
"epoch": 3.27,
"grad_norm": 1.2973403930664062,
"learning_rate": 3.6626916524701875e-05,
"loss": 0.0403,
"step": 1920
},
{
"epoch": 3.29,
"grad_norm": 0.41860514879226685,
"learning_rate": 3.577512776831346e-05,
"loss": 0.0642,
"step": 1930
},
{
"epoch": 3.3,
"grad_norm": 0.5436795353889465,
"learning_rate": 3.492333901192504e-05,
"loss": 0.0836,
"step": 1940
},
{
"epoch": 3.32,
"grad_norm": 0.21996204555034637,
"learning_rate": 3.407155025553663e-05,
"loss": 0.0406,
"step": 1950
},
{
"epoch": 3.34,
"grad_norm": 0.14845231175422668,
"learning_rate": 3.321976149914821e-05,
"loss": 0.0385,
"step": 1960
},
{
"epoch": 3.36,
"grad_norm": 3.531405448913574,
"learning_rate": 3.236797274275979e-05,
"loss": 0.0824,
"step": 1970
},
{
"epoch": 3.37,
"grad_norm": 0.07682117819786072,
"learning_rate": 3.151618398637139e-05,
"loss": 0.0717,
"step": 1980
},
{
"epoch": 3.39,
"grad_norm": 0.07611515372991562,
"learning_rate": 3.066439522998297e-05,
"loss": 0.0572,
"step": 1990
},
{
"epoch": 3.41,
"grad_norm": 0.6266534328460693,
"learning_rate": 2.9812606473594552e-05,
"loss": 0.0822,
"step": 2000
},
{
"epoch": 3.41,
"eval_accuracy": 0.692504258943782,
"eval_loss": 1.1826940774917603,
"eval_runtime": 37.1369,
"eval_samples_per_second": 31.613,
"eval_steps_per_second": 3.958,
"step": 2000
},
{
"epoch": 3.42,
"grad_norm": 0.1280030608177185,
"learning_rate": 2.8960817717206136e-05,
"loss": 0.0244,
"step": 2010
},
{
"epoch": 3.44,
"grad_norm": 0.07406999170780182,
"learning_rate": 2.8109028960817717e-05,
"loss": 0.0574,
"step": 2020
},
{
"epoch": 3.46,
"grad_norm": 5.587332248687744,
"learning_rate": 2.72572402044293e-05,
"loss": 0.0352,
"step": 2030
},
{
"epoch": 3.48,
"grad_norm": 2.2010979652404785,
"learning_rate": 2.6405451448040885e-05,
"loss": 0.0789,
"step": 2040
},
{
"epoch": 3.49,
"grad_norm": 2.9271368980407715,
"learning_rate": 2.555366269165247e-05,
"loss": 0.082,
"step": 2050
},
{
"epoch": 3.51,
"grad_norm": 0.05890679359436035,
"learning_rate": 2.4701873935264054e-05,
"loss": 0.0769,
"step": 2060
},
{
"epoch": 3.53,
"grad_norm": 0.7043523192405701,
"learning_rate": 2.3850085178875638e-05,
"loss": 0.0819,
"step": 2070
},
{
"epoch": 3.54,
"grad_norm": 0.12047506123781204,
"learning_rate": 2.2998296422487226e-05,
"loss": 0.0195,
"step": 2080
},
{
"epoch": 3.56,
"grad_norm": 0.1116802990436554,
"learning_rate": 2.214650766609881e-05,
"loss": 0.0159,
"step": 2090
},
{
"epoch": 3.58,
"grad_norm": 0.09187493473291397,
"learning_rate": 2.1294718909710394e-05,
"loss": 0.0889,
"step": 2100
},
{
"epoch": 3.58,
"eval_accuracy": 0.6933560477001703,
"eval_loss": 1.2423571348190308,
"eval_runtime": 37.3059,
"eval_samples_per_second": 31.47,
"eval_steps_per_second": 3.94,
"step": 2100
},
{
"epoch": 3.59,
"grad_norm": 4.332376956939697,
"learning_rate": 2.0442930153321975e-05,
"loss": 0.0939,
"step": 2110
},
{
"epoch": 3.61,
"grad_norm": 0.13916102051734924,
"learning_rate": 1.9591141396933563e-05,
"loss": 0.0933,
"step": 2120
},
{
"epoch": 3.63,
"grad_norm": 7.690703392028809,
"learning_rate": 1.8739352640545147e-05,
"loss": 0.0496,
"step": 2130
},
{
"epoch": 3.65,
"grad_norm": 2.5700595378875732,
"learning_rate": 1.788756388415673e-05,
"loss": 0.0782,
"step": 2140
},
{
"epoch": 3.66,
"grad_norm": 0.20934216678142548,
"learning_rate": 1.7035775127768315e-05,
"loss": 0.0606,
"step": 2150
},
{
"epoch": 3.68,
"grad_norm": 1.2959486246109009,
"learning_rate": 1.6183986371379896e-05,
"loss": 0.0601,
"step": 2160
},
{
"epoch": 3.7,
"grad_norm": 0.2652721405029297,
"learning_rate": 1.5332197614991484e-05,
"loss": 0.062,
"step": 2170
},
{
"epoch": 3.71,
"grad_norm": 0.48360127210617065,
"learning_rate": 1.4480408858603068e-05,
"loss": 0.054,
"step": 2180
},
{
"epoch": 3.73,
"grad_norm": 3.1118693351745605,
"learning_rate": 1.362862010221465e-05,
"loss": 0.0989,
"step": 2190
},
{
"epoch": 3.75,
"grad_norm": 0.9077383279800415,
"learning_rate": 1.2776831345826235e-05,
"loss": 0.0855,
"step": 2200
},
{
"epoch": 3.75,
"eval_accuracy": 0.6899488926746167,
"eval_loss": 1.2667156457901,
"eval_runtime": 36.8511,
"eval_samples_per_second": 31.858,
"eval_steps_per_second": 3.989,
"step": 2200
},
{
"epoch": 3.76,
"grad_norm": 0.13304296135902405,
"learning_rate": 1.1925042589437819e-05,
"loss": 0.0675,
"step": 2210
},
{
"epoch": 3.78,
"grad_norm": 1.3241567611694336,
"learning_rate": 1.1073253833049405e-05,
"loss": 0.0753,
"step": 2220
},
{
"epoch": 3.8,
"grad_norm": 0.2818525731563568,
"learning_rate": 1.0221465076660987e-05,
"loss": 0.0998,
"step": 2230
},
{
"epoch": 3.82,
"grad_norm": 7.136697292327881,
"learning_rate": 9.369676320272573e-06,
"loss": 0.0314,
"step": 2240
},
{
"epoch": 3.83,
"grad_norm": 1.372044324874878,
"learning_rate": 8.517887563884158e-06,
"loss": 0.0768,
"step": 2250
},
{
"epoch": 3.85,
"grad_norm": 6.264348983764648,
"learning_rate": 7.666098807495742e-06,
"loss": 0.1516,
"step": 2260
},
{
"epoch": 3.87,
"grad_norm": 0.1342085599899292,
"learning_rate": 6.814310051107325e-06,
"loss": 0.0812,
"step": 2270
},
{
"epoch": 3.88,
"grad_norm": 0.7664629220962524,
"learning_rate": 5.9625212947189095e-06,
"loss": 0.0474,
"step": 2280
},
{
"epoch": 3.9,
"grad_norm": 4.264090538024902,
"learning_rate": 5.110732538330494e-06,
"loss": 0.0903,
"step": 2290
},
{
"epoch": 3.92,
"grad_norm": 0.07316776365041733,
"learning_rate": 4.258943781942079e-06,
"loss": 0.0682,
"step": 2300
},
{
"epoch": 3.92,
"eval_accuracy": 0.6950596252129472,
"eval_loss": 1.2470241785049438,
"eval_runtime": 37.0027,
"eval_samples_per_second": 31.727,
"eval_steps_per_second": 3.973,
"step": 2300
},
{
"epoch": 3.94,
"grad_norm": 1.477973222732544,
"learning_rate": 3.4071550255536626e-06,
"loss": 0.0587,
"step": 2310
},
{
"epoch": 3.95,
"grad_norm": 1.249779224395752,
"learning_rate": 2.555366269165247e-06,
"loss": 0.0546,
"step": 2320
},
{
"epoch": 3.97,
"grad_norm": 1.9763495922088623,
"learning_rate": 1.7035775127768313e-06,
"loss": 0.0539,
"step": 2330
},
{
"epoch": 3.99,
"grad_norm": 0.11824575811624527,
"learning_rate": 8.517887563884157e-07,
"loss": 0.0322,
"step": 2340
},
{
"epoch": 4.0,
"step": 2348,
"total_flos": 2.910419581971751e+18,
"train_loss": 0.4888155373286145,
"train_runtime": 2894.9609,
"train_samples_per_second": 12.973,
"train_steps_per_second": 0.811
}
],
"logging_steps": 10,
"max_steps": 2348,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"total_flos": 2.910419581971751e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}