{
"best_metric": 0.3794967830181122,
"best_model_checkpoint": "videomae-large-finetuned-deepfake-subset/checkpoint-2235",
"epoch": 9.1,
"eval_steps": 500,
"global_step": 4470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022371364653243847,
"grad_norm": 8.840668678283691,
"learning_rate": 1.1185682326621925e-06,
"loss": 0.7314,
"step": 10
},
{
"epoch": 0.0044742729306487695,
"grad_norm": 14.85615348815918,
"learning_rate": 2.237136465324385e-06,
"loss": 0.7059,
"step": 20
},
{
"epoch": 0.006711409395973154,
"grad_norm": 13.392386436462402,
"learning_rate": 3.3557046979865773e-06,
"loss": 0.677,
"step": 30
},
{
"epoch": 0.008948545861297539,
"grad_norm": 9.807293891906738,
"learning_rate": 4.47427293064877e-06,
"loss": 0.7136,
"step": 40
},
{
"epoch": 0.011185682326621925,
"grad_norm": 7.858952045440674,
"learning_rate": 5.592841163310962e-06,
"loss": 0.679,
"step": 50
},
{
"epoch": 0.013422818791946308,
"grad_norm": 11.944358825683594,
"learning_rate": 6.7114093959731546e-06,
"loss": 0.6539,
"step": 60
},
{
"epoch": 0.015659955257270694,
"grad_norm": 11.665265083312988,
"learning_rate": 7.829977628635348e-06,
"loss": 0.7147,
"step": 70
},
{
"epoch": 0.017897091722595078,
"grad_norm": 17.402263641357422,
"learning_rate": 8.94854586129754e-06,
"loss": 0.6939,
"step": 80
},
{
"epoch": 0.020134228187919462,
"grad_norm": 13.367256164550781,
"learning_rate": 1.006711409395973e-05,
"loss": 0.6801,
"step": 90
},
{
"epoch": 0.02237136465324385,
"grad_norm": 11.276527404785156,
"learning_rate": 1.1185682326621925e-05,
"loss": 0.6904,
"step": 100
},
{
"epoch": 0.024608501118568233,
"grad_norm": 8.762359619140625,
"learning_rate": 1.2304250559284117e-05,
"loss": 0.6742,
"step": 110
},
{
"epoch": 0.026845637583892617,
"grad_norm": 10.632108688354492,
"learning_rate": 1.3422818791946309e-05,
"loss": 0.6967,
"step": 120
},
{
"epoch": 0.029082774049217,
"grad_norm": 8.825544357299805,
"learning_rate": 1.4541387024608501e-05,
"loss": 0.631,
"step": 130
},
{
"epoch": 0.03131991051454139,
"grad_norm": 10.079858779907227,
"learning_rate": 1.5659955257270695e-05,
"loss": 0.698,
"step": 140
},
{
"epoch": 0.03355704697986577,
"grad_norm": 12.647045135498047,
"learning_rate": 1.6778523489932888e-05,
"loss": 0.6871,
"step": 150
},
{
"epoch": 0.035794183445190156,
"grad_norm": 7.6588311195373535,
"learning_rate": 1.789709172259508e-05,
"loss": 0.7482,
"step": 160
},
{
"epoch": 0.03803131991051454,
"grad_norm": 8.198958396911621,
"learning_rate": 1.9015659955257272e-05,
"loss": 0.641,
"step": 170
},
{
"epoch": 0.040268456375838924,
"grad_norm": 8.8464937210083,
"learning_rate": 2.013422818791946e-05,
"loss": 0.7909,
"step": 180
},
{
"epoch": 0.042505592841163314,
"grad_norm": 4.950939178466797,
"learning_rate": 2.1252796420581657e-05,
"loss": 0.7335,
"step": 190
},
{
"epoch": 0.0447427293064877,
"grad_norm": 5.206540584564209,
"learning_rate": 2.237136465324385e-05,
"loss": 0.744,
"step": 200
},
{
"epoch": 0.04697986577181208,
"grad_norm": 6.270852088928223,
"learning_rate": 2.348993288590604e-05,
"loss": 0.691,
"step": 210
},
{
"epoch": 0.049217002237136466,
"grad_norm": 6.907114028930664,
"learning_rate": 2.4608501118568234e-05,
"loss": 0.6868,
"step": 220
},
{
"epoch": 0.05145413870246085,
"grad_norm": 6.208651065826416,
"learning_rate": 2.5727069351230426e-05,
"loss": 0.7205,
"step": 230
},
{
"epoch": 0.053691275167785234,
"grad_norm": 4.508482456207275,
"learning_rate": 2.6845637583892618e-05,
"loss": 0.6688,
"step": 240
},
{
"epoch": 0.05592841163310962,
"grad_norm": 8.969482421875,
"learning_rate": 2.796420581655481e-05,
"loss": 0.6811,
"step": 250
},
{
"epoch": 0.058165548098434,
"grad_norm": 6.105631351470947,
"learning_rate": 2.9082774049217003e-05,
"loss": 0.7777,
"step": 260
},
{
"epoch": 0.06040268456375839,
"grad_norm": 4.278919696807861,
"learning_rate": 3.02013422818792e-05,
"loss": 0.7046,
"step": 270
},
{
"epoch": 0.06263982102908278,
"grad_norm": 5.356738567352295,
"learning_rate": 3.131991051454139e-05,
"loss": 0.6642,
"step": 280
},
{
"epoch": 0.06487695749440715,
"grad_norm": 5.4614691734313965,
"learning_rate": 3.243847874720358e-05,
"loss": 0.6817,
"step": 290
},
{
"epoch": 0.06711409395973154,
"grad_norm": 5.7568278312683105,
"learning_rate": 3.3557046979865775e-05,
"loss": 0.6382,
"step": 300
},
{
"epoch": 0.06935123042505593,
"grad_norm": 3.24556565284729,
"learning_rate": 3.4675615212527964e-05,
"loss": 0.679,
"step": 310
},
{
"epoch": 0.07158836689038031,
"grad_norm": 3.6615562438964844,
"learning_rate": 3.579418344519016e-05,
"loss": 0.6796,
"step": 320
},
{
"epoch": 0.0738255033557047,
"grad_norm": 7.1600494384765625,
"learning_rate": 3.6912751677852356e-05,
"loss": 0.675,
"step": 330
},
{
"epoch": 0.07606263982102908,
"grad_norm": 3.104412317276001,
"learning_rate": 3.8031319910514545e-05,
"loss": 0.6024,
"step": 340
},
{
"epoch": 0.07829977628635347,
"grad_norm": 4.062417030334473,
"learning_rate": 3.914988814317674e-05,
"loss": 0.7211,
"step": 350
},
{
"epoch": 0.08053691275167785,
"grad_norm": 16.75772476196289,
"learning_rate": 4.026845637583892e-05,
"loss": 0.6736,
"step": 360
},
{
"epoch": 0.08277404921700224,
"grad_norm": 6.7340168952941895,
"learning_rate": 4.138702460850112e-05,
"loss": 0.6964,
"step": 370
},
{
"epoch": 0.08501118568232663,
"grad_norm": 4.515945911407471,
"learning_rate": 4.2505592841163314e-05,
"loss": 0.7276,
"step": 380
},
{
"epoch": 0.087248322147651,
"grad_norm": 2.2911009788513184,
"learning_rate": 4.36241610738255e-05,
"loss": 0.5794,
"step": 390
},
{
"epoch": 0.0894854586129754,
"grad_norm": 5.8774027824401855,
"learning_rate": 4.47427293064877e-05,
"loss": 0.5691,
"step": 400
},
{
"epoch": 0.09172259507829977,
"grad_norm": 4.3128743171691895,
"learning_rate": 4.586129753914989e-05,
"loss": 0.7606,
"step": 410
},
{
"epoch": 0.09395973154362416,
"grad_norm": 9.608476638793945,
"learning_rate": 4.697986577181208e-05,
"loss": 0.775,
"step": 420
},
{
"epoch": 0.09619686800894854,
"grad_norm": 7.786107540130615,
"learning_rate": 4.809843400447427e-05,
"loss": 0.6394,
"step": 430
},
{
"epoch": 0.09843400447427293,
"grad_norm": 8.117825508117676,
"learning_rate": 4.921700223713647e-05,
"loss": 0.6361,
"step": 440
},
{
"epoch": 0.1,
"eval_loss": 0.6478366255760193,
"eval_runtime": 887.1172,
"eval_samples_per_second": 8.36,
"eval_steps_per_second": 1.045,
"step": 447
},
{
"epoch": 1.0006711409395974,
"grad_norm": 4.550715446472168,
"learning_rate": 4.99627143922446e-05,
"loss": 0.6042,
"step": 450
},
{
"epoch": 1.0029082774049216,
"grad_norm": 6.292792797088623,
"learning_rate": 4.9838429033059906e-05,
"loss": 0.6412,
"step": 460
},
{
"epoch": 1.005145413870246,
"grad_norm": 4.233644485473633,
"learning_rate": 4.971414367387522e-05,
"loss": 0.6035,
"step": 470
},
{
"epoch": 1.0073825503355704,
"grad_norm": 5.84645938873291,
"learning_rate": 4.958985831469053e-05,
"loss": 0.5547,
"step": 480
},
{
"epoch": 1.0096196868008949,
"grad_norm": 5.941657066345215,
"learning_rate": 4.946557295550584e-05,
"loss": 0.6611,
"step": 490
},
{
"epoch": 1.0118568232662193,
"grad_norm": 2.766065835952759,
"learning_rate": 4.9341287596321155e-05,
"loss": 0.7369,
"step": 500
},
{
"epoch": 1.0140939597315437,
"grad_norm": 7.194911003112793,
"learning_rate": 4.921700223713647e-05,
"loss": 0.7586,
"step": 510
},
{
"epoch": 1.016331096196868,
"grad_norm": 5.610304832458496,
"learning_rate": 4.909271687795178e-05,
"loss": 0.6815,
"step": 520
},
{
"epoch": 1.0185682326621923,
"grad_norm": 5.830178260803223,
"learning_rate": 4.896843151876709e-05,
"loss": 0.5949,
"step": 530
},
{
"epoch": 1.0208053691275167,
"grad_norm": 7.435003280639648,
"learning_rate": 4.8844146159582404e-05,
"loss": 0.6054,
"step": 540
},
{
"epoch": 1.0230425055928412,
"grad_norm": 3.300534248352051,
"learning_rate": 4.871986080039772e-05,
"loss": 0.644,
"step": 550
},
{
"epoch": 1.0252796420581656,
"grad_norm": 5.10628080368042,
"learning_rate": 4.859557544121303e-05,
"loss": 0.7789,
"step": 560
},
{
"epoch": 1.02751677852349,
"grad_norm": 3.5758488178253174,
"learning_rate": 4.8471290082028335e-05,
"loss": 0.6373,
"step": 570
},
{
"epoch": 1.0297539149888144,
"grad_norm": 2.3225715160369873,
"learning_rate": 4.8347004722843654e-05,
"loss": 0.6763,
"step": 580
},
{
"epoch": 1.0319910514541386,
"grad_norm": 2.920625686645508,
"learning_rate": 4.8222719363658966e-05,
"loss": 0.6733,
"step": 590
},
{
"epoch": 1.034228187919463,
"grad_norm": 5.328765869140625,
"learning_rate": 4.809843400447427e-05,
"loss": 0.6051,
"step": 600
},
{
"epoch": 1.0364653243847874,
"grad_norm": 5.5468878746032715,
"learning_rate": 4.7974148645289584e-05,
"loss": 0.6439,
"step": 610
},
{
"epoch": 1.0387024608501119,
"grad_norm": 3.9731194972991943,
"learning_rate": 4.78498632861049e-05,
"loss": 0.6416,
"step": 620
},
{
"epoch": 1.0409395973154363,
"grad_norm": 5.598540782928467,
"learning_rate": 4.772557792692021e-05,
"loss": 0.6182,
"step": 630
},
{
"epoch": 1.0431767337807607,
"grad_norm": 3.5205001831054688,
"learning_rate": 4.760129256773552e-05,
"loss": 0.6307,
"step": 640
},
{
"epoch": 1.045413870246085,
"grad_norm": 8.040474891662598,
"learning_rate": 4.7477007208550834e-05,
"loss": 0.6298,
"step": 650
},
{
"epoch": 1.0476510067114093,
"grad_norm": 7.812734127044678,
"learning_rate": 4.735272184936615e-05,
"loss": 0.5933,
"step": 660
},
{
"epoch": 1.0498881431767337,
"grad_norm": 8.276246070861816,
"learning_rate": 4.722843649018146e-05,
"loss": 0.68,
"step": 670
},
{
"epoch": 1.0521252796420582,
"grad_norm": 9.207062721252441,
"learning_rate": 4.710415113099677e-05,
"loss": 0.6591,
"step": 680
},
{
"epoch": 1.0543624161073826,
"grad_norm": 2.244326591491699,
"learning_rate": 4.697986577181208e-05,
"loss": 0.5858,
"step": 690
},
{
"epoch": 1.056599552572707,
"grad_norm": 4.339087963104248,
"learning_rate": 4.6855580412627395e-05,
"loss": 0.6173,
"step": 700
},
{
"epoch": 1.0588366890380314,
"grad_norm": 4.587048053741455,
"learning_rate": 4.673129505344271e-05,
"loss": 0.5405,
"step": 710
},
{
"epoch": 1.0610738255033556,
"grad_norm": 6.943038463592529,
"learning_rate": 4.660700969425802e-05,
"loss": 0.5336,
"step": 720
},
{
"epoch": 1.06331096196868,
"grad_norm": 3.565220832824707,
"learning_rate": 4.648272433507333e-05,
"loss": 0.6332,
"step": 730
},
{
"epoch": 1.0655480984340044,
"grad_norm": 4.71682596206665,
"learning_rate": 4.635843897588864e-05,
"loss": 0.6398,
"step": 740
},
{
"epoch": 1.0677852348993289,
"grad_norm": 8.337652206420898,
"learning_rate": 4.623415361670396e-05,
"loss": 0.6597,
"step": 750
},
{
"epoch": 1.0700223713646533,
"grad_norm": 6.056807994842529,
"learning_rate": 4.610986825751927e-05,
"loss": 0.68,
"step": 760
},
{
"epoch": 1.0722595078299777,
"grad_norm": 3.2448365688323975,
"learning_rate": 4.5985582898334575e-05,
"loss": 0.6275,
"step": 770
},
{
"epoch": 1.0744966442953021,
"grad_norm": 7.669735431671143,
"learning_rate": 4.586129753914989e-05,
"loss": 0.6847,
"step": 780
},
{
"epoch": 1.0767337807606263,
"grad_norm": 3.858572483062744,
"learning_rate": 4.57370121799652e-05,
"loss": 0.5387,
"step": 790
},
{
"epoch": 1.0789709172259507,
"grad_norm": 3.0774307250976562,
"learning_rate": 4.561272682078052e-05,
"loss": 0.6463,
"step": 800
},
{
"epoch": 1.0812080536912752,
"grad_norm": 5.94879674911499,
"learning_rate": 4.5488441461595824e-05,
"loss": 0.5771,
"step": 810
},
{
"epoch": 1.0834451901565996,
"grad_norm": 7.283985137939453,
"learning_rate": 4.5364156102411137e-05,
"loss": 0.4675,
"step": 820
},
{
"epoch": 1.085682326621924,
"grad_norm": 5.343263149261475,
"learning_rate": 4.523987074322645e-05,
"loss": 0.619,
"step": 830
},
{
"epoch": 1.0879194630872484,
"grad_norm": 2.982252597808838,
"learning_rate": 4.511558538404176e-05,
"loss": 0.6595,
"step": 840
},
{
"epoch": 1.0901565995525728,
"grad_norm": 3.7833807468414307,
"learning_rate": 4.4991300024857074e-05,
"loss": 0.4992,
"step": 850
},
{
"epoch": 1.092393736017897,
"grad_norm": 4.744262218475342,
"learning_rate": 4.4867014665672386e-05,
"loss": 0.5481,
"step": 860
},
{
"epoch": 1.0946308724832214,
"grad_norm": 4.21218204498291,
"learning_rate": 4.47427293064877e-05,
"loss": 0.4745,
"step": 870
},
{
"epoch": 1.0968680089485459,
"grad_norm": 6.225035667419434,
"learning_rate": 4.461844394730301e-05,
"loss": 0.6044,
"step": 880
},
{
"epoch": 1.0991051454138703,
"grad_norm": 6.521196365356445,
"learning_rate": 4.449415858811832e-05,
"loss": 0.6774,
"step": 890
},
{
"epoch": 1.1,
"eval_loss": 0.6047455072402954,
"eval_runtime": 892.7591,
"eval_samples_per_second": 8.307,
"eval_steps_per_second": 1.038,
"step": 894
},
{
"epoch": 2.001342281879195,
"grad_norm": 4.137167930603027,
"learning_rate": 4.4369873228933635e-05,
"loss": 0.6044,
"step": 900
},
{
"epoch": 2.003579418344519,
"grad_norm": 5.822400093078613,
"learning_rate": 4.424558786974895e-05,
"loss": 0.4379,
"step": 910
},
{
"epoch": 2.005816554809843,
"grad_norm": 5.85886812210083,
"learning_rate": 4.412130251056425e-05,
"loss": 0.6157,
"step": 920
},
{
"epoch": 2.0080536912751676,
"grad_norm": 3.880828857421875,
"learning_rate": 4.399701715137957e-05,
"loss": 0.457,
"step": 930
},
{
"epoch": 2.010290827740492,
"grad_norm": 6.341732025146484,
"learning_rate": 4.3872731792194885e-05,
"loss": 0.522,
"step": 940
},
{
"epoch": 2.0125279642058165,
"grad_norm": 10.094740867614746,
"learning_rate": 4.374844643301019e-05,
"loss": 0.6731,
"step": 950
},
{
"epoch": 2.014765100671141,
"grad_norm": 5.733267784118652,
"learning_rate": 4.36241610738255e-05,
"loss": 0.4796,
"step": 960
},
{
"epoch": 2.0170022371364653,
"grad_norm": 3.963778257369995,
"learning_rate": 4.349987571464082e-05,
"loss": 0.5343,
"step": 970
},
{
"epoch": 2.0192393736017897,
"grad_norm": 4.41682243347168,
"learning_rate": 4.337559035545613e-05,
"loss": 0.5492,
"step": 980
},
{
"epoch": 2.021476510067114,
"grad_norm": 1.9658536911010742,
"learning_rate": 4.325130499627144e-05,
"loss": 0.4044,
"step": 990
},
{
"epoch": 2.0237136465324386,
"grad_norm": 8.907441139221191,
"learning_rate": 4.312701963708675e-05,
"loss": 0.486,
"step": 1000
},
{
"epoch": 2.025950782997763,
"grad_norm": 4.796280860900879,
"learning_rate": 4.3002734277902064e-05,
"loss": 0.5449,
"step": 1010
},
{
"epoch": 2.0281879194630874,
"grad_norm": 13.880756378173828,
"learning_rate": 4.287844891871738e-05,
"loss": 0.5926,
"step": 1020
},
{
"epoch": 2.030425055928412,
"grad_norm": 5.1638312339782715,
"learning_rate": 4.275416355953269e-05,
"loss": 0.7223,
"step": 1030
},
{
"epoch": 2.032662192393736,
"grad_norm": 2.8141660690307617,
"learning_rate": 4.2629878200348e-05,
"loss": 0.5354,
"step": 1040
},
{
"epoch": 2.03489932885906,
"grad_norm": 5.402659893035889,
"learning_rate": 4.2505592841163314e-05,
"loss": 0.5388,
"step": 1050
},
{
"epoch": 2.0371364653243846,
"grad_norm": 9.897317886352539,
"learning_rate": 4.2381307481978626e-05,
"loss": 0.5581,
"step": 1060
},
{
"epoch": 2.039373601789709,
"grad_norm": 5.52318811416626,
"learning_rate": 4.225702212279394e-05,
"loss": 0.4331,
"step": 1070
},
{
"epoch": 2.0416107382550335,
"grad_norm": 6.323409080505371,
"learning_rate": 4.213273676360925e-05,
"loss": 0.6491,
"step": 1080
},
{
"epoch": 2.043847874720358,
"grad_norm": 5.7435503005981445,
"learning_rate": 4.2008451404424556e-05,
"loss": 0.494,
"step": 1090
},
{
"epoch": 2.0460850111856823,
"grad_norm": 2.975238084793091,
"learning_rate": 4.1884166045239875e-05,
"loss": 0.5629,
"step": 1100
},
{
"epoch": 2.0483221476510067,
"grad_norm": 7.936377048492432,
"learning_rate": 4.175988068605519e-05,
"loss": 0.5487,
"step": 1110
},
{
"epoch": 2.050559284116331,
"grad_norm": 9.979763984680176,
"learning_rate": 4.16355953268705e-05,
"loss": 0.6606,
"step": 1120
},
{
"epoch": 2.0527964205816556,
"grad_norm": 8.900005340576172,
"learning_rate": 4.1511309967685806e-05,
"loss": 0.537,
"step": 1130
},
{
"epoch": 2.05503355704698,
"grad_norm": 5.619589328765869,
"learning_rate": 4.138702460850112e-05,
"loss": 0.5785,
"step": 1140
},
{
"epoch": 2.0572706935123044,
"grad_norm": 4.075002193450928,
"learning_rate": 4.126273924931644e-05,
"loss": 0.3999,
"step": 1150
},
{
"epoch": 2.059507829977629,
"grad_norm": 5.566099166870117,
"learning_rate": 4.113845389013174e-05,
"loss": 0.4934,
"step": 1160
},
{
"epoch": 2.0617449664429532,
"grad_norm": 7.996336936950684,
"learning_rate": 4.1014168530947055e-05,
"loss": 0.3563,
"step": 1170
},
{
"epoch": 2.063982102908277,
"grad_norm": 8.547967910766602,
"learning_rate": 4.088988317176237e-05,
"loss": 0.6231,
"step": 1180
},
{
"epoch": 2.0662192393736016,
"grad_norm": 4.0213236808776855,
"learning_rate": 4.076559781257768e-05,
"loss": 0.7292,
"step": 1190
},
{
"epoch": 2.068456375838926,
"grad_norm": 4.820833206176758,
"learning_rate": 4.064131245339299e-05,
"loss": 0.4553,
"step": 1200
},
{
"epoch": 2.0706935123042505,
"grad_norm": 9.791057586669922,
"learning_rate": 4.0517027094208304e-05,
"loss": 0.5081,
"step": 1210
},
{
"epoch": 2.072930648769575,
"grad_norm": 2.710472345352173,
"learning_rate": 4.039274173502362e-05,
"loss": 0.4875,
"step": 1220
},
{
"epoch": 2.0751677852348993,
"grad_norm": 4.432290554046631,
"learning_rate": 4.026845637583892e-05,
"loss": 0.4762,
"step": 1230
},
{
"epoch": 2.0774049217002237,
"grad_norm": 12.281310081481934,
"learning_rate": 4.014417101665424e-05,
"loss": 0.5381,
"step": 1240
},
{
"epoch": 2.079642058165548,
"grad_norm": 12.222536087036133,
"learning_rate": 4.0019885657469554e-05,
"loss": 0.4633,
"step": 1250
},
{
"epoch": 2.0818791946308726,
"grad_norm": 10.08840274810791,
"learning_rate": 3.9895600298284866e-05,
"loss": 0.4012,
"step": 1260
},
{
"epoch": 2.084116331096197,
"grad_norm": 12.933877944946289,
"learning_rate": 3.977131493910017e-05,
"loss": 0.4828,
"step": 1270
},
{
"epoch": 2.0863534675615214,
"grad_norm": 8.704557418823242,
"learning_rate": 3.964702957991549e-05,
"loss": 0.4824,
"step": 1280
},
{
"epoch": 2.088590604026846,
"grad_norm": 5.015852451324463,
"learning_rate": 3.95227442207308e-05,
"loss": 0.5165,
"step": 1290
},
{
"epoch": 2.09082774049217,
"grad_norm": 11.11552906036377,
"learning_rate": 3.939845886154611e-05,
"loss": 0.5092,
"step": 1300
},
{
"epoch": 2.093064876957494,
"grad_norm": 10.659283638000488,
"learning_rate": 3.927417350236142e-05,
"loss": 0.5005,
"step": 1310
},
{
"epoch": 2.0953020134228186,
"grad_norm": 4.979311466217041,
"learning_rate": 3.914988814317674e-05,
"loss": 0.7376,
"step": 1320
},
{
"epoch": 2.097539149888143,
"grad_norm": 2.4167163372039795,
"learning_rate": 3.9025602783992046e-05,
"loss": 0.4397,
"step": 1330
},
{
"epoch": 2.0997762863534675,
"grad_norm": 7.5874857902526855,
"learning_rate": 3.890131742480736e-05,
"loss": 0.4168,
"step": 1340
},
{
"epoch": 2.1,
"eval_loss": 0.48516377806663513,
"eval_runtime": 891.9205,
"eval_samples_per_second": 8.315,
"eval_steps_per_second": 1.039,
"step": 1341
},
{
"epoch": 3.002013422818792,
"grad_norm": 9.898842811584473,
"learning_rate": 3.877703206562267e-05,
"loss": 0.4532,
"step": 1350
},
{
"epoch": 3.004250559284116,
"grad_norm": 10.05817699432373,
"learning_rate": 3.865274670643798e-05,
"loss": 0.7569,
"step": 1360
},
{
"epoch": 3.0064876957494406,
"grad_norm": 1.2511606216430664,
"learning_rate": 3.8528461347253295e-05,
"loss": 0.4412,
"step": 1370
},
{
"epoch": 3.008724832214765,
"grad_norm": 11.480599403381348,
"learning_rate": 3.840417598806861e-05,
"loss": 0.3935,
"step": 1380
},
{
"epoch": 3.0109619686800895,
"grad_norm": 11.63521671295166,
"learning_rate": 3.827989062888392e-05,
"loss": 0.5626,
"step": 1390
},
{
"epoch": 3.013199105145414,
"grad_norm": 9.114398956298828,
"learning_rate": 3.815560526969923e-05,
"loss": 0.6732,
"step": 1400
},
{
"epoch": 3.0154362416107383,
"grad_norm": 7.543931484222412,
"learning_rate": 3.8031319910514545e-05,
"loss": 0.4275,
"step": 1410
},
{
"epoch": 3.0176733780760627,
"grad_norm": 10.644927024841309,
"learning_rate": 3.790703455132986e-05,
"loss": 0.4325,
"step": 1420
},
{
"epoch": 3.019910514541387,
"grad_norm": 3.5276317596435547,
"learning_rate": 3.778274919214517e-05,
"loss": 0.3605,
"step": 1430
},
{
"epoch": 3.0221476510067116,
"grad_norm": 5.080909729003906,
"learning_rate": 3.7658463832960475e-05,
"loss": 0.5921,
"step": 1440
},
{
"epoch": 3.024384787472036,
"grad_norm": 1.2971785068511963,
"learning_rate": 3.753417847377579e-05,
"loss": 0.3565,
"step": 1450
},
{
"epoch": 3.02662192393736,
"grad_norm": 3.7884743213653564,
"learning_rate": 3.7409893114591106e-05,
"loss": 0.287,
"step": 1460
},
{
"epoch": 3.0288590604026844,
"grad_norm": 5.833460807800293,
"learning_rate": 3.728560775540642e-05,
"loss": 0.4596,
"step": 1470
},
{
"epoch": 3.031096196868009,
"grad_norm": 10.393218994140625,
"learning_rate": 3.7161322396221724e-05,
"loss": 0.3767,
"step": 1480
},
{
"epoch": 3.033333333333333,
"grad_norm": 12.434408187866211,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.5202,
"step": 1490
},
{
"epoch": 3.0355704697986576,
"grad_norm": 7.507827281951904,
"learning_rate": 3.6912751677852356e-05,
"loss": 0.4699,
"step": 1500
},
{
"epoch": 3.037807606263982,
"grad_norm": 4.108563423156738,
"learning_rate": 3.678846631866766e-05,
"loss": 0.4836,
"step": 1510
},
{
"epoch": 3.0400447427293065,
"grad_norm": 6.502699851989746,
"learning_rate": 3.6664180959482974e-05,
"loss": 0.2626,
"step": 1520
},
{
"epoch": 3.042281879194631,
"grad_norm": 11.717517852783203,
"learning_rate": 3.6539895600298286e-05,
"loss": 0.6785,
"step": 1530
},
{
"epoch": 3.0445190156599553,
"grad_norm": 9.127665519714355,
"learning_rate": 3.64156102411136e-05,
"loss": 0.5052,
"step": 1540
},
{
"epoch": 3.0467561521252797,
"grad_norm": 6.234489917755127,
"learning_rate": 3.629132488192891e-05,
"loss": 0.4078,
"step": 1550
},
{
"epoch": 3.048993288590604,
"grad_norm": 13.028934478759766,
"learning_rate": 3.616703952274422e-05,
"loss": 0.5171,
"step": 1560
},
{
"epoch": 3.0512304250559286,
"grad_norm": 9.663383483886719,
"learning_rate": 3.6042754163559535e-05,
"loss": 0.6043,
"step": 1570
},
{
"epoch": 3.053467561521253,
"grad_norm": 3.3989367485046387,
"learning_rate": 3.591846880437484e-05,
"loss": 0.4231,
"step": 1580
},
{
"epoch": 3.0557046979865774,
"grad_norm": 6.579158782958984,
"learning_rate": 3.579418344519016e-05,
"loss": 0.515,
"step": 1590
},
{
"epoch": 3.0579418344519014,
"grad_norm": 5.151082515716553,
"learning_rate": 3.566989808600547e-05,
"loss": 0.3919,
"step": 1600
},
{
"epoch": 3.060178970917226,
"grad_norm": 2.145969867706299,
"learning_rate": 3.5545612726820785e-05,
"loss": 0.4733,
"step": 1610
},
{
"epoch": 3.06241610738255,
"grad_norm": 5.741364002227783,
"learning_rate": 3.542132736763609e-05,
"loss": 0.4354,
"step": 1620
},
{
"epoch": 3.0646532438478746,
"grad_norm": 3.9511780738830566,
"learning_rate": 3.529704200845141e-05,
"loss": 0.443,
"step": 1630
},
{
"epoch": 3.066890380313199,
"grad_norm": 6.973093509674072,
"learning_rate": 3.517275664926672e-05,
"loss": 0.4201,
"step": 1640
},
{
"epoch": 3.0691275167785235,
"grad_norm": 1.0698981285095215,
"learning_rate": 3.504847129008203e-05,
"loss": 0.485,
"step": 1650
},
{
"epoch": 3.071364653243848,
"grad_norm": 6.2486701011657715,
"learning_rate": 3.492418593089734e-05,
"loss": 0.4176,
"step": 1660
},
{
"epoch": 3.0736017897091723,
"grad_norm": 2.134953022003174,
"learning_rate": 3.479990057171265e-05,
"loss": 0.3947,
"step": 1670
},
{
"epoch": 3.0758389261744967,
"grad_norm": 1.0479804277420044,
"learning_rate": 3.4675615212527964e-05,
"loss": 0.3886,
"step": 1680
},
{
"epoch": 3.078076062639821,
"grad_norm": 1.2134567499160767,
"learning_rate": 3.455132985334328e-05,
"loss": 0.3022,
"step": 1690
},
{
"epoch": 3.0803131991051456,
"grad_norm": 10.898287773132324,
"learning_rate": 3.442704449415859e-05,
"loss": 0.3078,
"step": 1700
},
{
"epoch": 3.08255033557047,
"grad_norm": 18.389766693115234,
"learning_rate": 3.43027591349739e-05,
"loss": 0.663,
"step": 1710
},
{
"epoch": 3.0847874720357944,
"grad_norm": 2.9712672233581543,
"learning_rate": 3.4178473775789214e-05,
"loss": 0.4882,
"step": 1720
},
{
"epoch": 3.0870246085011184,
"grad_norm": 4.190480709075928,
"learning_rate": 3.4054188416604526e-05,
"loss": 0.4099,
"step": 1730
},
{
"epoch": 3.089261744966443,
"grad_norm": 5.036893367767334,
"learning_rate": 3.392990305741984e-05,
"loss": 0.3227,
"step": 1740
},
{
"epoch": 3.091498881431767,
"grad_norm": 3.94989013671875,
"learning_rate": 3.380561769823515e-05,
"loss": 0.3345,
"step": 1750
},
{
"epoch": 3.0937360178970916,
"grad_norm": 10.000751495361328,
"learning_rate": 3.3681332339050456e-05,
"loss": 0.3655,
"step": 1760
},
{
"epoch": 3.095973154362416,
"grad_norm": 6.677926063537598,
"learning_rate": 3.3557046979865775e-05,
"loss": 0.2954,
"step": 1770
},
{
"epoch": 3.0982102908277405,
"grad_norm": 10.632355690002441,
"learning_rate": 3.343276162068109e-05,
"loss": 0.4427,
"step": 1780
},
{
"epoch": 3.1,
"eval_loss": 0.8546826243400574,
"eval_runtime": 891.5992,
"eval_samples_per_second": 8.318,
"eval_steps_per_second": 1.04,
"step": 1788
},
{
"epoch": 4.000447427293065,
"grad_norm": 16.46874237060547,
"learning_rate": 3.330847626149639e-05,
"loss": 0.6448,
"step": 1790
},
{
"epoch": 4.00268456375839,
"grad_norm": 7.57173490524292,
"learning_rate": 3.3184190902311706e-05,
"loss": 0.4691,
"step": 1800
},
{
"epoch": 4.004921700223714,
"grad_norm": 6.603734493255615,
"learning_rate": 3.3059905543127025e-05,
"loss": 0.4373,
"step": 1810
},
{
"epoch": 4.007158836689038,
"grad_norm": 5.733815670013428,
"learning_rate": 3.293562018394234e-05,
"loss": 0.4219,
"step": 1820
},
{
"epoch": 4.009395973154362,
"grad_norm": 0.5362582206726074,
"learning_rate": 3.281133482475764e-05,
"loss": 0.2901,
"step": 1830
},
{
"epoch": 4.011633109619686,
"grad_norm": 9.654644012451172,
"learning_rate": 3.2687049465572955e-05,
"loss": 0.4001,
"step": 1840
},
{
"epoch": 4.013870246085011,
"grad_norm": 5.657355785369873,
"learning_rate": 3.2562764106388274e-05,
"loss": 0.3882,
"step": 1850
},
{
"epoch": 4.016107382550335,
"grad_norm": 4.895392417907715,
"learning_rate": 3.243847874720358e-05,
"loss": 0.402,
"step": 1860
},
{
"epoch": 4.01834451901566,
"grad_norm": 7.476536750793457,
"learning_rate": 3.231419338801889e-05,
"loss": 0.2948,
"step": 1870
},
{
"epoch": 4.020581655480984,
"grad_norm": 15.446544647216797,
"learning_rate": 3.2189908028834204e-05,
"loss": 0.3745,
"step": 1880
},
{
"epoch": 4.0228187919463085,
"grad_norm": 9.441873550415039,
"learning_rate": 3.206562266964952e-05,
"loss": 0.4731,
"step": 1890
},
{
"epoch": 4.025055928411633,
"grad_norm": 2.744432210922241,
"learning_rate": 3.194133731046483e-05,
"loss": 0.5575,
"step": 1900
},
{
"epoch": 4.027293064876957,
"grad_norm": 7.594290733337402,
"learning_rate": 3.181705195128014e-05,
"loss": 0.463,
"step": 1910
},
{
"epoch": 4.029530201342282,
"grad_norm": 9.001227378845215,
"learning_rate": 3.1692766592095454e-05,
"loss": 0.3622,
"step": 1920
},
{
"epoch": 4.031767337807606,
"grad_norm": 12.734862327575684,
"learning_rate": 3.156848123291076e-05,
"loss": 0.3435,
"step": 1930
},
{
"epoch": 4.034004474272931,
"grad_norm": 1.6699249744415283,
"learning_rate": 3.144419587372608e-05,
"loss": 0.2984,
"step": 1940
},
{
"epoch": 4.036241610738255,
"grad_norm": 14.737456321716309,
"learning_rate": 3.131991051454139e-05,
"loss": 0.4041,
"step": 1950
},
{
"epoch": 4.0384787472035795,
"grad_norm": 5.71207857131958,
"learning_rate": 3.11956251553567e-05,
"loss": 0.3152,
"step": 1960
},
{
"epoch": 4.040715883668904,
"grad_norm": 12.913744926452637,
"learning_rate": 3.107133979617201e-05,
"loss": 0.4892,
"step": 1970
},
{
"epoch": 4.042953020134228,
"grad_norm": 12.614986419677734,
"learning_rate": 3.094705443698732e-05,
"loss": 0.4399,
"step": 1980
},
{
"epoch": 4.045190156599553,
"grad_norm": 10.273175239562988,
"learning_rate": 3.082276907780264e-05,
"loss": 0.5362,
"step": 1990
},
{
"epoch": 4.047427293064877,
"grad_norm": 1.00068199634552,
"learning_rate": 3.0698483718617946e-05,
"loss": 0.4019,
"step": 2000
},
{
"epoch": 4.049664429530202,
"grad_norm": 3.2468674182891846,
"learning_rate": 3.057419835943326e-05,
"loss": 0.2908,
"step": 2010
},
{
"epoch": 4.051901565995526,
"grad_norm": 13.30661392211914,
"learning_rate": 3.044991300024857e-05,
"loss": 0.4924,
"step": 2020
},
{
"epoch": 4.05413870246085,
"grad_norm": 7.9545063972473145,
"learning_rate": 3.0325627641063886e-05,
"loss": 0.3895,
"step": 2030
},
{
"epoch": 4.056375838926175,
"grad_norm": 9.699666023254395,
"learning_rate": 3.02013422818792e-05,
"loss": 0.2904,
"step": 2040
},
{
"epoch": 4.058612975391499,
"grad_norm": 6.4541707038879395,
"learning_rate": 3.0077056922694508e-05,
"loss": 0.4361,
"step": 2050
},
{
"epoch": 4.060850111856824,
"grad_norm": 22.470691680908203,
"learning_rate": 2.995277156350982e-05,
"loss": 0.3353,
"step": 2060
},
{
"epoch": 4.063087248322148,
"grad_norm": 8.081043243408203,
"learning_rate": 2.9828486204325136e-05,
"loss": 0.492,
"step": 2070
},
{
"epoch": 4.065324384787472,
"grad_norm": 1.5061018466949463,
"learning_rate": 2.9704200845140445e-05,
"loss": 0.3086,
"step": 2080
},
{
"epoch": 4.067561521252796,
"grad_norm": 0.6852089762687683,
"learning_rate": 2.9579915485955757e-05,
"loss": 0.389,
"step": 2090
},
{
"epoch": 4.06979865771812,
"grad_norm": 12.416427612304688,
"learning_rate": 2.9455630126771066e-05,
"loss": 0.3055,
"step": 2100
},
{
"epoch": 4.072035794183445,
"grad_norm": 7.986660957336426,
"learning_rate": 2.9331344767586378e-05,
"loss": 0.5869,
"step": 2110
},
{
"epoch": 4.074272930648769,
"grad_norm": 14.885111808776855,
"learning_rate": 2.9207059408401694e-05,
"loss": 0.3224,
"step": 2120
},
{
"epoch": 4.076510067114094,
"grad_norm": 10.167362213134766,
"learning_rate": 2.9082774049217003e-05,
"loss": 0.3044,
"step": 2130
},
{
"epoch": 4.078747203579418,
"grad_norm": 6.552039623260498,
"learning_rate": 2.8958488690032315e-05,
"loss": 0.2917,
"step": 2140
},
{
"epoch": 4.0809843400447425,
"grad_norm": 7.694530487060547,
"learning_rate": 2.8834203330847624e-05,
"loss": 0.4193,
"step": 2150
},
{
"epoch": 4.083221476510067,
"grad_norm": 7.818331241607666,
"learning_rate": 2.8709917971662943e-05,
"loss": 0.3355,
"step": 2160
},
{
"epoch": 4.085458612975391,
"grad_norm": 18.491985321044922,
"learning_rate": 2.8585632612478252e-05,
"loss": 0.2317,
"step": 2170
},
{
"epoch": 4.087695749440716,
"grad_norm": 6.1848602294921875,
"learning_rate": 2.8461347253293565e-05,
"loss": 0.2758,
"step": 2180
},
{
"epoch": 4.08993288590604,
"grad_norm": 13.520557403564453,
"learning_rate": 2.8337061894108874e-05,
"loss": 0.439,
"step": 2190
},
{
"epoch": 4.092170022371365,
"grad_norm": 9.846938133239746,
"learning_rate": 2.8212776534924186e-05,
"loss": 0.4711,
"step": 2200
},
{
"epoch": 4.094407158836689,
"grad_norm": 5.90399694442749,
"learning_rate": 2.80884911757395e-05,
"loss": 0.085,
"step": 2210
},
{
"epoch": 4.0966442953020135,
"grad_norm": 16.939096450805664,
"learning_rate": 2.796420581655481e-05,
"loss": 0.4099,
"step": 2220
},
{
"epoch": 4.098881431767338,
"grad_norm": 1.1104991436004639,
"learning_rate": 2.7839920457370123e-05,
"loss": 0.4496,
"step": 2230
},
{
"epoch": 4.1,
"eval_loss": 0.3794967830181122,
"eval_runtime": 891.9924,
"eval_samples_per_second": 8.314,
"eval_steps_per_second": 1.039,
"step": 2235
},
{
"epoch": 5.001118568232662,
"grad_norm": 4.833787441253662,
"learning_rate": 2.7715635098185432e-05,
"loss": 0.2992,
"step": 2240
},
{
"epoch": 5.003355704697986,
"grad_norm": 5.847362518310547,
"learning_rate": 2.7591349739000748e-05,
"loss": 0.2596,
"step": 2250
},
{
"epoch": 5.005592841163311,
"grad_norm": 18.474834442138672,
"learning_rate": 2.746706437981606e-05,
"loss": 0.4849,
"step": 2260
},
{
"epoch": 5.007829977628635,
"grad_norm": 12.474526405334473,
"learning_rate": 2.734277902063137e-05,
"loss": 0.3275,
"step": 2270
},
{
"epoch": 5.010067114093959,
"grad_norm": 1.1569851636886597,
"learning_rate": 2.721849366144668e-05,
"loss": 0.3104,
"step": 2280
},
{
"epoch": 5.012304250559284,
"grad_norm": 10.059696197509766,
"learning_rate": 2.7094208302261997e-05,
"loss": 0.1573,
"step": 2290
},
{
"epoch": 5.014541387024608,
"grad_norm": 4.665347099304199,
"learning_rate": 2.696992294307731e-05,
"loss": 0.4128,
"step": 2300
},
{
"epoch": 5.016778523489933,
"grad_norm": 13.133956909179688,
"learning_rate": 2.6845637583892618e-05,
"loss": 0.5156,
"step": 2310
},
{
"epoch": 5.019015659955257,
"grad_norm": 1.0956755876541138,
"learning_rate": 2.672135222470793e-05,
"loss": 0.3676,
"step": 2320
},
{
"epoch": 5.0212527964205815,
"grad_norm": 8.179460525512695,
"learning_rate": 2.659706686552324e-05,
"loss": 0.3082,
"step": 2330
},
{
"epoch": 5.023489932885906,
"grad_norm": 1.9563068151474,
"learning_rate": 2.6472781506338555e-05,
"loss": 0.2365,
"step": 2340
},
{
"epoch": 5.02572706935123,
"grad_norm": 9.728827476501465,
"learning_rate": 2.6348496147153868e-05,
"loss": 0.6143,
"step": 2350
},
{
"epoch": 5.027964205816555,
"grad_norm": 0.4499684274196625,
"learning_rate": 2.6224210787969177e-05,
"loss": 0.1527,
"step": 2360
},
{
"epoch": 5.030201342281879,
"grad_norm": 4.692073345184326,
"learning_rate": 2.609992542878449e-05,
"loss": 0.4215,
"step": 2370
},
{
"epoch": 5.032438478747204,
"grad_norm": 0.4127592146396637,
"learning_rate": 2.5975640069599805e-05,
"loss": 0.3132,
"step": 2380
},
{
"epoch": 5.034675615212528,
"grad_norm": 5.006928443908691,
"learning_rate": 2.5851354710415117e-05,
"loss": 0.243,
"step": 2390
},
{
"epoch": 5.0369127516778525,
"grad_norm": 9.491246223449707,
"learning_rate": 2.5727069351230426e-05,
"loss": 0.4203,
"step": 2400
},
{
"epoch": 5.039149888143177,
"grad_norm": 1.1350034475326538,
"learning_rate": 2.560278399204574e-05,
"loss": 0.3657,
"step": 2410
},
{
"epoch": 5.041387024608501,
"grad_norm": 17.347612380981445,
"learning_rate": 2.5478498632861047e-05,
"loss": 0.4972,
"step": 2420
},
{
"epoch": 5.043624161073826,
"grad_norm": 2.2265026569366455,
"learning_rate": 2.5354213273676363e-05,
"loss": 0.3345,
"step": 2430
},
{
"epoch": 5.04586129753915,
"grad_norm": 4.642486572265625,
"learning_rate": 2.5229927914491675e-05,
"loss": 0.2684,
"step": 2440
},
{
"epoch": 5.0480984340044746,
"grad_norm": 12.171128273010254,
"learning_rate": 2.5105642555306984e-05,
"loss": 0.4424,
"step": 2450
},
{
"epoch": 5.050335570469799,
"grad_norm": 11.319775581359863,
"learning_rate": 2.49813571961223e-05,
"loss": 0.2878,
"step": 2460
},
{
"epoch": 5.052572706935123,
"grad_norm": 7.691583156585693,
"learning_rate": 2.485707183693761e-05,
"loss": 0.3732,
"step": 2470
},
{
"epoch": 5.054809843400448,
"grad_norm": 11.818854331970215,
"learning_rate": 2.473278647775292e-05,
"loss": 0.3303,
"step": 2480
},
{
"epoch": 5.057046979865772,
"grad_norm": 10.7247314453125,
"learning_rate": 2.4608501118568234e-05,
"loss": 0.2182,
"step": 2490
},
{
"epoch": 5.059284116331096,
"grad_norm": 11.010503768920898,
"learning_rate": 2.4484215759383546e-05,
"loss": 0.3295,
"step": 2500
},
{
"epoch": 5.06152125279642,
"grad_norm": 8.999567985534668,
"learning_rate": 2.435993040019886e-05,
"loss": 0.2326,
"step": 2510
},
{
"epoch": 5.063758389261745,
"grad_norm": 7.073877334594727,
"learning_rate": 2.4235645041014167e-05,
"loss": 0.2705,
"step": 2520
},
{
"epoch": 5.065995525727069,
"grad_norm": 11.623409271240234,
"learning_rate": 2.4111359681829483e-05,
"loss": 0.3234,
"step": 2530
},
{
"epoch": 5.068232662192393,
"grad_norm": 4.587973594665527,
"learning_rate": 2.3987074322644792e-05,
"loss": 0.2505,
"step": 2540
},
{
"epoch": 5.070469798657718,
"grad_norm": 11.883222579956055,
"learning_rate": 2.3862788963460104e-05,
"loss": 0.456,
"step": 2550
},
{
"epoch": 5.072706935123042,
"grad_norm": 21.06523323059082,
"learning_rate": 2.3738503604275417e-05,
"loss": 0.2691,
"step": 2560
},
{
"epoch": 5.074944071588367,
"grad_norm": 12.439352989196777,
"learning_rate": 2.361421824509073e-05,
"loss": 0.2216,
"step": 2570
},
{
"epoch": 5.077181208053691,
"grad_norm": 0.9154367446899414,
"learning_rate": 2.348993288590604e-05,
"loss": 0.3547,
"step": 2580
},
{
"epoch": 5.0794183445190155,
"grad_norm": 1.807629942893982,
"learning_rate": 2.3365647526721354e-05,
"loss": 0.2784,
"step": 2590
},
{
"epoch": 5.08165548098434,
"grad_norm": 14.866148948669434,
"learning_rate": 2.3241362167536666e-05,
"loss": 0.4041,
"step": 2600
},
{
"epoch": 5.083892617449664,
"grad_norm": 9.991622924804688,
"learning_rate": 2.311707680835198e-05,
"loss": 0.3884,
"step": 2610
},
{
"epoch": 5.086129753914989,
"grad_norm": 11.396065711975098,
"learning_rate": 2.2992791449167287e-05,
"loss": 0.1861,
"step": 2620
},
{
"epoch": 5.088366890380313,
"grad_norm": 5.028550624847412,
"learning_rate": 2.28685060899826e-05,
"loss": 0.1773,
"step": 2630
},
{
"epoch": 5.090604026845638,
"grad_norm": 7.7053022384643555,
"learning_rate": 2.2744220730797912e-05,
"loss": 0.34,
"step": 2640
},
{
"epoch": 5.092841163310962,
"grad_norm": 11.335506439208984,
"learning_rate": 2.2619935371613224e-05,
"loss": 0.2359,
"step": 2650
},
{
"epoch": 5.0950782997762865,
"grad_norm": 9.261770248413086,
"learning_rate": 2.2495650012428537e-05,
"loss": 0.3856,
"step": 2660
},
{
"epoch": 5.097315436241611,
"grad_norm": 8.961913108825684,
"learning_rate": 2.237136465324385e-05,
"loss": 0.4009,
"step": 2670
},
{
"epoch": 5.099552572706935,
"grad_norm": 3.3359217643737793,
"learning_rate": 2.224707929405916e-05,
"loss": 0.3433,
"step": 2680
},
{
"epoch": 5.1,
"eval_loss": 0.4118688702583313,
"eval_runtime": 892.4454,
"eval_samples_per_second": 8.31,
"eval_steps_per_second": 1.039,
"step": 2682
},
{
"epoch": 6.001789709172259,
"grad_norm": 8.580872535705566,
"learning_rate": 2.2122793934874474e-05,
"loss": 0.1915,
"step": 2690
},
{
"epoch": 6.004026845637584,
"grad_norm": 0.8801985383033752,
"learning_rate": 2.1998508575689786e-05,
"loss": 0.182,
"step": 2700
},
{
"epoch": 6.006263982102908,
"grad_norm": 15.736348152160645,
"learning_rate": 2.1874223216505095e-05,
"loss": 0.4147,
"step": 2710
},
{
"epoch": 6.008501118568232,
"grad_norm": 1.222266674041748,
"learning_rate": 2.174993785732041e-05,
"loss": 0.2243,
"step": 2720
},
{
"epoch": 6.010738255033557,
"grad_norm": 9.234151840209961,
"learning_rate": 2.162565249813572e-05,
"loss": 0.2065,
"step": 2730
},
{
"epoch": 6.012975391498881,
"grad_norm": 17.84362030029297,
"learning_rate": 2.1501367138951032e-05,
"loss": 0.2676,
"step": 2740
},
{
"epoch": 6.015212527964206,
"grad_norm": 7.234649658203125,
"learning_rate": 2.1377081779766345e-05,
"loss": 0.3512,
"step": 2750
},
{
"epoch": 6.01744966442953,
"grad_norm": 5.13038969039917,
"learning_rate": 2.1252796420581657e-05,
"loss": 0.3029,
"step": 2760
},
{
"epoch": 6.0196868008948545,
"grad_norm": 12.59158706665039,
"learning_rate": 2.112851106139697e-05,
"loss": 0.2794,
"step": 2770
},
{
"epoch": 6.021923937360179,
"grad_norm": 14.324825286865234,
"learning_rate": 2.1004225702212278e-05,
"loss": 0.2144,
"step": 2780
},
{
"epoch": 6.024161073825503,
"grad_norm": 6.485671043395996,
"learning_rate": 2.0879940343027594e-05,
"loss": 0.249,
"step": 2790
},
{
"epoch": 6.026398210290828,
"grad_norm": 1.8053562641143799,
"learning_rate": 2.0755654983842903e-05,
"loss": 0.2008,
"step": 2800
},
{
"epoch": 6.028635346756152,
"grad_norm": 5.798049449920654,
"learning_rate": 2.063136962465822e-05,
"loss": 0.1985,
"step": 2810
},
{
"epoch": 6.030872483221477,
"grad_norm": 11.126876831054688,
"learning_rate": 2.0507084265473528e-05,
"loss": 0.3294,
"step": 2820
},
{
"epoch": 6.033109619686801,
"grad_norm": 20.707542419433594,
"learning_rate": 2.038279890628884e-05,
"loss": 0.4472,
"step": 2830
},
{
"epoch": 6.0353467561521255,
"grad_norm": 7.778163909912109,
"learning_rate": 2.0258513547104152e-05,
"loss": 0.2675,
"step": 2840
},
{
"epoch": 6.03758389261745,
"grad_norm": 8.803659439086914,
"learning_rate": 2.013422818791946e-05,
"loss": 0.244,
"step": 2850
},
{
"epoch": 6.039821029082774,
"grad_norm": 0.29662925004959106,
"learning_rate": 2.0009942828734777e-05,
"loss": 0.1816,
"step": 2860
},
{
"epoch": 6.042058165548099,
"grad_norm": 0.928829550743103,
"learning_rate": 1.9885657469550086e-05,
"loss": 0.3033,
"step": 2870
},
{
"epoch": 6.044295302013423,
"grad_norm": 0.4924483299255371,
"learning_rate": 1.97613721103654e-05,
"loss": 0.3565,
"step": 2880
},
{
"epoch": 6.0465324384787476,
"grad_norm": 10.867852210998535,
"learning_rate": 1.963708675118071e-05,
"loss": 0.1718,
"step": 2890
},
{
"epoch": 6.048769574944072,
"grad_norm": 0.17046819627285004,
"learning_rate": 1.9512801391996023e-05,
"loss": 0.3193,
"step": 2900
},
{
"epoch": 6.051006711409396,
"grad_norm": 21.9335994720459,
"learning_rate": 1.9388516032811335e-05,
"loss": 0.2725,
"step": 2910
},
{
"epoch": 6.05324384787472,
"grad_norm": 7.8351030349731445,
"learning_rate": 1.9264230673626648e-05,
"loss": 0.2877,
"step": 2920
},
{
"epoch": 6.055480984340044,
"grad_norm": 1.0813746452331543,
"learning_rate": 1.913994531444196e-05,
"loss": 0.2619,
"step": 2930
},
{
"epoch": 6.057718120805369,
"grad_norm": 4.7920241355896,
"learning_rate": 1.9015659955257272e-05,
"loss": 0.182,
"step": 2940
},
{
"epoch": 6.059955257270693,
"grad_norm": 18.19488525390625,
"learning_rate": 1.8891374596072585e-05,
"loss": 0.2848,
"step": 2950
},
{
"epoch": 6.062192393736018,
"grad_norm": 2.5479190349578857,
"learning_rate": 1.8767089236887894e-05,
"loss": 0.3099,
"step": 2960
},
{
"epoch": 6.064429530201342,
"grad_norm": 21.13016128540039,
"learning_rate": 1.864280387770321e-05,
"loss": 0.4713,
"step": 2970
},
{
"epoch": 6.066666666666666,
"grad_norm": 0.09484589099884033,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.2758,
"step": 2980
},
{
"epoch": 6.068903803131991,
"grad_norm": 11.212635040283203,
"learning_rate": 1.839423315933383e-05,
"loss": 0.2305,
"step": 2990
},
{
"epoch": 6.071140939597315,
"grad_norm": 8.96382999420166,
"learning_rate": 1.8269947800149143e-05,
"loss": 0.2464,
"step": 3000
},
{
"epoch": 6.07337807606264,
"grad_norm": 8.752419471740723,
"learning_rate": 1.8145662440964455e-05,
"loss": 0.1472,
"step": 3010
},
{
"epoch": 6.075615212527964,
"grad_norm": 9.717801094055176,
"learning_rate": 1.8021377081779768e-05,
"loss": 0.3221,
"step": 3020
},
{
"epoch": 6.0778523489932885,
"grad_norm": 5.287158966064453,
"learning_rate": 1.789709172259508e-05,
"loss": 0.2747,
"step": 3030
},
{
"epoch": 6.080089485458613,
"grad_norm": 1.6286725997924805,
"learning_rate": 1.7772806363410392e-05,
"loss": 0.1127,
"step": 3040
},
{
"epoch": 6.082326621923937,
"grad_norm": 12.31570053100586,
"learning_rate": 1.7648521004225705e-05,
"loss": 0.2879,
"step": 3050
},
{
"epoch": 6.084563758389262,
"grad_norm": 3.0768423080444336,
"learning_rate": 1.7524235645041014e-05,
"loss": 0.3264,
"step": 3060
},
{
"epoch": 6.086800894854586,
"grad_norm": 6.542660713195801,
"learning_rate": 1.7399950285856326e-05,
"loss": 0.2626,
"step": 3070
},
{
"epoch": 6.089038031319911,
"grad_norm": 22.778274536132812,
"learning_rate": 1.727566492667164e-05,
"loss": 0.2602,
"step": 3080
},
{
"epoch": 6.091275167785235,
"grad_norm": 14.418547630310059,
"learning_rate": 1.715137956748695e-05,
"loss": 0.2574,
"step": 3090
},
{
"epoch": 6.0935123042505595,
"grad_norm": 1.627580165863037,
"learning_rate": 1.7027094208302263e-05,
"loss": 0.2347,
"step": 3100
},
{
"epoch": 6.095749440715884,
"grad_norm": 2.667323350906372,
"learning_rate": 1.6902808849117575e-05,
"loss": 0.4904,
"step": 3110
},
{
"epoch": 6.097986577181208,
"grad_norm": 26.79123878479004,
"learning_rate": 1.6778523489932888e-05,
"loss": 0.2287,
"step": 3120
},
{
"epoch": 6.1,
"eval_loss": 0.4823199510574341,
"eval_runtime": 898.1668,
"eval_samples_per_second": 8.257,
"eval_steps_per_second": 1.032,
"step": 3129
},
{
"epoch": 7.000223713646532,
"grad_norm": 10.718249320983887,
"learning_rate": 1.6654238130748197e-05,
"loss": 0.2243,
"step": 3130
},
{
"epoch": 7.002460850111857,
"grad_norm": 3.2877581119537354,
"learning_rate": 1.6529952771563512e-05,
"loss": 0.3877,
"step": 3140
},
{
"epoch": 7.004697986577181,
"grad_norm": 2.2482826709747314,
"learning_rate": 1.640566741237882e-05,
"loss": 0.1283,
"step": 3150
},
{
"epoch": 7.006935123042505,
"grad_norm": 11.641459465026855,
"learning_rate": 1.6281382053194137e-05,
"loss": 0.2999,
"step": 3160
},
{
"epoch": 7.00917225950783,
"grad_norm": 6.360275745391846,
"learning_rate": 1.6157096694009446e-05,
"loss": 0.2999,
"step": 3170
},
{
"epoch": 7.011409395973154,
"grad_norm": 0.33911630511283875,
"learning_rate": 1.603281133482476e-05,
"loss": 0.3532,
"step": 3180
},
{
"epoch": 7.013646532438479,
"grad_norm": 6.708487033843994,
"learning_rate": 1.590852597564007e-05,
"loss": 0.3293,
"step": 3190
},
{
"epoch": 7.015883668903803,
"grad_norm": 7.4095025062561035,
"learning_rate": 1.578424061645538e-05,
"loss": 0.1617,
"step": 3200
},
{
"epoch": 7.0181208053691275,
"grad_norm": 23.493078231811523,
"learning_rate": 1.5659955257270695e-05,
"loss": 0.1497,
"step": 3210
},
{
"epoch": 7.020357941834452,
"grad_norm": 7.098881244659424,
"learning_rate": 1.5535669898086004e-05,
"loss": 0.2247,
"step": 3220
},
{
"epoch": 7.022595078299776,
"grad_norm": 8.431655883789062,
"learning_rate": 1.541138453890132e-05,
"loss": 0.2487,
"step": 3230
},
{
"epoch": 7.024832214765101,
"grad_norm": 15.323188781738281,
"learning_rate": 1.528709917971663e-05,
"loss": 0.1421,
"step": 3240
},
{
"epoch": 7.027069351230425,
"grad_norm": 7.675280570983887,
"learning_rate": 1.5162813820531943e-05,
"loss": 0.4883,
"step": 3250
},
{
"epoch": 7.02930648769575,
"grad_norm": 3.4720630645751953,
"learning_rate": 1.5038528461347254e-05,
"loss": 0.2715,
"step": 3260
},
{
"epoch": 7.031543624161074,
"grad_norm": 1.7522681951522827,
"learning_rate": 1.4914243102162568e-05,
"loss": 0.1637,
"step": 3270
},
{
"epoch": 7.0337807606263985,
"grad_norm": 0.16172055900096893,
"learning_rate": 1.4789957742977878e-05,
"loss": 0.1717,
"step": 3280
},
{
"epoch": 7.036017897091723,
"grad_norm": 12.743756294250488,
"learning_rate": 1.4665672383793189e-05,
"loss": 0.3252,
"step": 3290
},
{
"epoch": 7.038255033557047,
"grad_norm": 10.548824310302734,
"learning_rate": 1.4541387024608501e-05,
"loss": 0.0774,
"step": 3300
},
{
"epoch": 7.040492170022372,
"grad_norm": 11.808302879333496,
"learning_rate": 1.4417101665423812e-05,
"loss": 0.3318,
"step": 3310
},
{
"epoch": 7.042729306487696,
"grad_norm": 11.149969100952148,
"learning_rate": 1.4292816306239126e-05,
"loss": 0.3435,
"step": 3320
},
{
"epoch": 7.0449664429530205,
"grad_norm": 16.549835205078125,
"learning_rate": 1.4168530947054437e-05,
"loss": 0.245,
"step": 3330
},
{
"epoch": 7.047203579418344,
"grad_norm": 0.44824355840682983,
"learning_rate": 1.404424558786975e-05,
"loss": 0.2654,
"step": 3340
},
{
"epoch": 7.0494407158836685,
"grad_norm": 10.627558708190918,
"learning_rate": 1.3919960228685061e-05,
"loss": 0.2706,
"step": 3350
},
{
"epoch": 7.051677852348993,
"grad_norm": 4.539977073669434,
"learning_rate": 1.3795674869500374e-05,
"loss": 0.2529,
"step": 3360
},
{
"epoch": 7.053914988814317,
"grad_norm": 25.88998031616211,
"learning_rate": 1.3671389510315684e-05,
"loss": 0.1102,
"step": 3370
},
{
"epoch": 7.056152125279642,
"grad_norm": 0.056022170931100845,
"learning_rate": 1.3547104151130999e-05,
"loss": 0.0937,
"step": 3380
},
{
"epoch": 7.058389261744966,
"grad_norm": 27.33989715576172,
"learning_rate": 1.3422818791946309e-05,
"loss": 0.3445,
"step": 3390
},
{
"epoch": 7.060626398210291,
"grad_norm": 27.60544204711914,
"learning_rate": 1.329853343276162e-05,
"loss": 0.3671,
"step": 3400
},
{
"epoch": 7.062863534675615,
"grad_norm": 0.10744742304086685,
"learning_rate": 1.3174248073576934e-05,
"loss": 0.2315,
"step": 3410
},
{
"epoch": 7.065100671140939,
"grad_norm": 19.817075729370117,
"learning_rate": 1.3049962714392244e-05,
"loss": 0.2397,
"step": 3420
},
{
"epoch": 7.067337807606264,
"grad_norm": 20.652551651000977,
"learning_rate": 1.2925677355207559e-05,
"loss": 0.1723,
"step": 3430
},
{
"epoch": 7.069574944071588,
"grad_norm": 3.9074671268463135,
"learning_rate": 1.280139199602287e-05,
"loss": 0.2729,
"step": 3440
},
{
"epoch": 7.071812080536913,
"grad_norm": 0.27141252160072327,
"learning_rate": 1.2677106636838182e-05,
"loss": 0.3176,
"step": 3450
},
{
"epoch": 7.074049217002237,
"grad_norm": 11.46578598022461,
"learning_rate": 1.2552821277653492e-05,
"loss": 0.2252,
"step": 3460
},
{
"epoch": 7.0762863534675615,
"grad_norm": 16.85219383239746,
"learning_rate": 1.2428535918468805e-05,
"loss": 0.1388,
"step": 3470
},
{
"epoch": 7.078523489932886,
"grad_norm": 10.73118782043457,
"learning_rate": 1.2304250559284117e-05,
"loss": 0.2334,
"step": 3480
},
{
"epoch": 7.08076062639821,
"grad_norm": 0.460288941860199,
"learning_rate": 1.217996520009943e-05,
"loss": 0.0608,
"step": 3490
},
{
"epoch": 7.082997762863535,
"grad_norm": 1.64030921459198,
"learning_rate": 1.2055679840914742e-05,
"loss": 0.1595,
"step": 3500
},
{
"epoch": 7.085234899328859,
"grad_norm": 14.426569938659668,
"learning_rate": 1.1931394481730052e-05,
"loss": 0.2432,
"step": 3510
},
{
"epoch": 7.087472035794184,
"grad_norm": 13.951542854309082,
"learning_rate": 1.1807109122545365e-05,
"loss": 0.2007,
"step": 3520
},
{
"epoch": 7.089709172259508,
"grad_norm": 25.381717681884766,
"learning_rate": 1.1682823763360677e-05,
"loss": 0.2316,
"step": 3530
},
{
"epoch": 7.0919463087248324,
"grad_norm": 0.6340412497520447,
"learning_rate": 1.155853840417599e-05,
"loss": 0.1972,
"step": 3540
},
{
"epoch": 7.094183445190157,
"grad_norm": 11.732870101928711,
"learning_rate": 1.14342530449913e-05,
"loss": 0.1388,
"step": 3550
},
{
"epoch": 7.096420581655481,
"grad_norm": 0.08352160453796387,
"learning_rate": 1.1309967685806612e-05,
"loss": 0.2276,
"step": 3560
},
{
"epoch": 7.098657718120806,
"grad_norm": 0.6394329071044922,
"learning_rate": 1.1185682326621925e-05,
"loss": 0.1297,
"step": 3570
},
{
"epoch": 7.1,
"eval_loss": 0.4294538199901581,
"eval_runtime": 892.5544,
"eval_samples_per_second": 8.309,
"eval_steps_per_second": 1.039,
"step": 3576
},
{
"epoch": 8.00089485458613,
"grad_norm": 12.145805358886719,
"learning_rate": 1.1061396967437237e-05,
"loss": 0.4217,
"step": 3580
},
{
"epoch": 8.003131991051454,
"grad_norm": 16.92803382873535,
"learning_rate": 1.0937111608252548e-05,
"loss": 0.1421,
"step": 3590
},
{
"epoch": 8.00536912751678,
"grad_norm": 6.773504257202148,
"learning_rate": 1.081282624906786e-05,
"loss": 0.1891,
"step": 3600
},
{
"epoch": 8.007606263982103,
"grad_norm": 22.504379272460938,
"learning_rate": 1.0688540889883172e-05,
"loss": 0.1844,
"step": 3610
},
{
"epoch": 8.009843400447428,
"grad_norm": 19.881702423095703,
"learning_rate": 1.0564255530698485e-05,
"loss": 0.2223,
"step": 3620
},
{
"epoch": 8.012080536912752,
"grad_norm": 1.9907094240188599,
"learning_rate": 1.0439970171513797e-05,
"loss": 0.2041,
"step": 3630
},
{
"epoch": 8.014317673378075,
"grad_norm": 0.3308212161064148,
"learning_rate": 1.031568481232911e-05,
"loss": 0.0968,
"step": 3640
},
{
"epoch": 8.0165548098434,
"grad_norm": 1.7726011276245117,
"learning_rate": 1.019139945314442e-05,
"loss": 0.0571,
"step": 3650
},
{
"epoch": 8.018791946308724,
"grad_norm": 29.911670684814453,
"learning_rate": 1.006711409395973e-05,
"loss": 0.1089,
"step": 3660
},
{
"epoch": 8.02102908277405,
"grad_norm": 5.603979587554932,
"learning_rate": 9.942828734775043e-06,
"loss": 0.0697,
"step": 3670
},
{
"epoch": 8.023266219239373,
"grad_norm": 0.07145686447620392,
"learning_rate": 9.818543375590355e-06,
"loss": 0.0768,
"step": 3680
},
{
"epoch": 8.025503355704698,
"grad_norm": 1.745406985282898,
"learning_rate": 9.694258016405668e-06,
"loss": 0.0412,
"step": 3690
},
{
"epoch": 8.027740492170022,
"grad_norm": 0.02010565809905529,
"learning_rate": 9.56997265722098e-06,
"loss": 0.2096,
"step": 3700
},
{
"epoch": 8.029977628635347,
"grad_norm": 8.72205638885498,
"learning_rate": 9.445687298036292e-06,
"loss": 0.1555,
"step": 3710
},
{
"epoch": 8.03221476510067,
"grad_norm": 0.01665619947016239,
"learning_rate": 9.321401938851605e-06,
"loss": 0.2793,
"step": 3720
},
{
"epoch": 8.034451901565996,
"grad_norm": 0.004611628130078316,
"learning_rate": 9.197116579666915e-06,
"loss": 0.2735,
"step": 3730
},
{
"epoch": 8.03668903803132,
"grad_norm": 1.931004524230957,
"learning_rate": 9.072831220482228e-06,
"loss": 0.2636,
"step": 3740
},
{
"epoch": 8.038926174496645,
"grad_norm": 0.014065139926970005,
"learning_rate": 8.94854586129754e-06,
"loss": 0.2653,
"step": 3750
},
{
"epoch": 8.041163310961968,
"grad_norm": 0.5851568579673767,
"learning_rate": 8.824260502112852e-06,
"loss": 0.092,
"step": 3760
},
{
"epoch": 8.043400447427294,
"grad_norm": 0.051146071404218674,
"learning_rate": 8.699975142928163e-06,
"loss": 0.2241,
"step": 3770
},
{
"epoch": 8.045637583892617,
"grad_norm": 0.1566874086856842,
"learning_rate": 8.575689783743475e-06,
"loss": 0.1845,
"step": 3780
},
{
"epoch": 8.047874720357942,
"grad_norm": 35.4766960144043,
"learning_rate": 8.451404424558788e-06,
"loss": 0.1556,
"step": 3790
},
{
"epoch": 8.050111856823266,
"grad_norm": 44.98518753051758,
"learning_rate": 8.327119065374098e-06,
"loss": 0.2961,
"step": 3800
},
{
"epoch": 8.052348993288591,
"grad_norm": 1.2694602012634277,
"learning_rate": 8.20283370618941e-06,
"loss": 0.0785,
"step": 3810
},
{
"epoch": 8.054586129753915,
"grad_norm": 0.6759599447250366,
"learning_rate": 8.078548347004723e-06,
"loss": 0.31,
"step": 3820
},
{
"epoch": 8.05682326621924,
"grad_norm": 5.972934246063232,
"learning_rate": 7.954262987820035e-06,
"loss": 0.0335,
"step": 3830
},
{
"epoch": 8.059060402684564,
"grad_norm": 0.8728901743888855,
"learning_rate": 7.829977628635348e-06,
"loss": 0.2005,
"step": 3840
},
{
"epoch": 8.061297539149889,
"grad_norm": 10.340746879577637,
"learning_rate": 7.70569226945066e-06,
"loss": 0.1724,
"step": 3850
},
{
"epoch": 8.063534675615212,
"grad_norm": 0.060672469437122345,
"learning_rate": 7.5814069102659716e-06,
"loss": 0.174,
"step": 3860
},
{
"epoch": 8.065771812080538,
"grad_norm": 4.864527702331543,
"learning_rate": 7.457121551081284e-06,
"loss": 0.3543,
"step": 3870
},
{
"epoch": 8.068008948545861,
"grad_norm": 4.48012638092041,
"learning_rate": 7.3328361918965945e-06,
"loss": 0.122,
"step": 3880
},
{
"epoch": 8.070246085011185,
"grad_norm": 0.07548043876886368,
"learning_rate": 7.208550832711906e-06,
"loss": 0.2838,
"step": 3890
},
{
"epoch": 8.07248322147651,
"grad_norm": 18.938758850097656,
"learning_rate": 7.084265473527218e-06,
"loss": 0.1681,
"step": 3900
},
{
"epoch": 8.074720357941834,
"grad_norm": 1.3047031164169312,
"learning_rate": 6.959980114342531e-06,
"loss": 0.2001,
"step": 3910
},
{
"epoch": 8.076957494407159,
"grad_norm": 12.20274829864502,
"learning_rate": 6.835694755157842e-06,
"loss": 0.2776,
"step": 3920
},
{
"epoch": 8.079194630872482,
"grad_norm": 0.47563186287879944,
"learning_rate": 6.7114093959731546e-06,
"loss": 0.2005,
"step": 3930
},
{
"epoch": 8.081431767337808,
"grad_norm": 5.735065460205078,
"learning_rate": 6.587124036788467e-06,
"loss": 0.1499,
"step": 3940
},
{
"epoch": 8.083668903803131,
"grad_norm": 20.469074249267578,
"learning_rate": 6.462838677603779e-06,
"loss": 0.2258,
"step": 3950
},
{
"epoch": 8.085906040268457,
"grad_norm": 3.2545881271362305,
"learning_rate": 6.338553318419091e-06,
"loss": 0.2216,
"step": 3960
},
{
"epoch": 8.08814317673378,
"grad_norm": 1.8031901121139526,
"learning_rate": 6.214267959234402e-06,
"loss": 0.1462,
"step": 3970
},
{
"epoch": 8.090380313199105,
"grad_norm": 11.510761260986328,
"learning_rate": 6.089982600049715e-06,
"loss": 0.2225,
"step": 3980
},
{
"epoch": 8.092617449664429,
"grad_norm": 1.421615481376648,
"learning_rate": 5.965697240865026e-06,
"loss": 0.1203,
"step": 3990
},
{
"epoch": 8.094854586129754,
"grad_norm": 0.10247212648391724,
"learning_rate": 5.8414118816803384e-06,
"loss": 0.0633,
"step": 4000
},
{
"epoch": 8.097091722595078,
"grad_norm": 0.038117095828056335,
"learning_rate": 5.71712652249565e-06,
"loss": 0.1246,
"step": 4010
},
{
"epoch": 8.099328859060403,
"grad_norm": 14.49740982055664,
"learning_rate": 5.592841163310962e-06,
"loss": 0.3104,
"step": 4020
},
{
"epoch": 8.1,
"eval_loss": 0.40958455204963684,
"eval_runtime": 894.7196,
"eval_samples_per_second": 8.289,
"eval_steps_per_second": 1.036,
"step": 4023
},
{
"epoch": 9.001565995525727,
"grad_norm": 0.1057172641158104,
"learning_rate": 5.468555804126274e-06,
"loss": 0.0831,
"step": 4030
},
{
"epoch": 9.003803131991052,
"grad_norm": 1.2070332765579224,
"learning_rate": 5.344270444941586e-06,
"loss": 0.0479,
"step": 4040
},
{
"epoch": 9.006040268456376,
"grad_norm": 0.6095995903015137,
"learning_rate": 5.2199850857568985e-06,
"loss": 0.2161,
"step": 4050
},
{
"epoch": 9.0082774049217,
"grad_norm": 6.29830265045166,
"learning_rate": 5.09569972657221e-06,
"loss": 0.1455,
"step": 4060
},
{
"epoch": 9.010514541387025,
"grad_norm": 0.01811257004737854,
"learning_rate": 4.9714143673875215e-06,
"loss": 0.0465,
"step": 4070
},
{
"epoch": 9.012751677852348,
"grad_norm": 17.725488662719727,
"learning_rate": 4.847129008202834e-06,
"loss": 0.3159,
"step": 4080
},
{
"epoch": 9.014988814317674,
"grad_norm": 27.672937393188477,
"learning_rate": 4.722843649018146e-06,
"loss": 0.0538,
"step": 4090
},
{
"epoch": 9.017225950782997,
"grad_norm": 0.12884105741977692,
"learning_rate": 4.598558289833458e-06,
"loss": 0.2565,
"step": 4100
},
{
"epoch": 9.019463087248322,
"grad_norm": 20.479238510131836,
"learning_rate": 4.47427293064877e-06,
"loss": 0.1162,
"step": 4110
},
{
"epoch": 9.021700223713646,
"grad_norm": 23.51799774169922,
"learning_rate": 4.3499875714640815e-06,
"loss": 0.181,
"step": 4120
},
{
"epoch": 9.023937360178971,
"grad_norm": 14.024354934692383,
"learning_rate": 4.225702212279394e-06,
"loss": 0.2026,
"step": 4130
},
{
"epoch": 9.026174496644295,
"grad_norm": 0.20324963331222534,
"learning_rate": 4.101416853094705e-06,
"loss": 0.0238,
"step": 4140
},
{
"epoch": 9.02841163310962,
"grad_norm": 0.17176759243011475,
"learning_rate": 3.977131493910018e-06,
"loss": 0.036,
"step": 4150
},
{
"epoch": 9.030648769574944,
"grad_norm": 13.05589485168457,
"learning_rate": 3.85284613472533e-06,
"loss": 0.3049,
"step": 4160
},
{
"epoch": 9.032885906040269,
"grad_norm": 0.03032175451517105,
"learning_rate": 3.728560775540642e-06,
"loss": 0.043,
"step": 4170
},
{
"epoch": 9.035123042505592,
"grad_norm": 2.0046262741088867,
"learning_rate": 3.604275416355953e-06,
"loss": 0.2365,
"step": 4180
},
{
"epoch": 9.037360178970918,
"grad_norm": 0.10407610237598419,
"learning_rate": 3.4799900571712654e-06,
"loss": 0.0472,
"step": 4190
},
{
"epoch": 9.039597315436241,
"grad_norm": 0.7708104252815247,
"learning_rate": 3.3557046979865773e-06,
"loss": 0.2119,
"step": 4200
},
{
"epoch": 9.041834451901567,
"grad_norm": 0.13973136246204376,
"learning_rate": 3.2314193388018896e-06,
"loss": 0.1322,
"step": 4210
},
{
"epoch": 9.04407158836689,
"grad_norm": 4.554214000701904,
"learning_rate": 3.107133979617201e-06,
"loss": 0.0044,
"step": 4220
},
{
"epoch": 9.046308724832215,
"grad_norm": 1.251420259475708,
"learning_rate": 2.982848620432513e-06,
"loss": 0.1559,
"step": 4230
},
{
"epoch": 9.048545861297539,
"grad_norm": 0.027974896132946014,
"learning_rate": 2.858563261247825e-06,
"loss": 0.1958,
"step": 4240
},
{
"epoch": 9.050782997762864,
"grad_norm": 17.233055114746094,
"learning_rate": 2.734277902063137e-06,
"loss": 0.1606,
"step": 4250
},
{
"epoch": 9.053020134228188,
"grad_norm": 0.4418050944805145,
"learning_rate": 2.6099925428784492e-06,
"loss": 0.19,
"step": 4260
},
{
"epoch": 9.055257270693513,
"grad_norm": 7.108306884765625,
"learning_rate": 2.4857071836937607e-06,
"loss": 0.1778,
"step": 4270
},
{
"epoch": 9.057494407158837,
"grad_norm": 0.11051614582538605,
"learning_rate": 2.361421824509073e-06,
"loss": 0.1074,
"step": 4280
},
{
"epoch": 9.059731543624162,
"grad_norm": 0.011775967665016651,
"learning_rate": 2.237136465324385e-06,
"loss": 0.1294,
"step": 4290
},
{
"epoch": 9.061968680089485,
"grad_norm": 0.011663487181067467,
"learning_rate": 2.112851106139697e-06,
"loss": 0.0744,
"step": 4300
},
{
"epoch": 9.06420581655481,
"grad_norm": 23.934040069580078,
"learning_rate": 1.988565746955009e-06,
"loss": 0.1064,
"step": 4310
},
{
"epoch": 9.066442953020134,
"grad_norm": 0.42981475591659546,
"learning_rate": 1.864280387770321e-06,
"loss": 0.0546,
"step": 4320
},
{
"epoch": 9.068680089485458,
"grad_norm": 19.337921142578125,
"learning_rate": 1.7399950285856327e-06,
"loss": 0.2816,
"step": 4330
},
{
"epoch": 9.070917225950783,
"grad_norm": 0.4612930119037628,
"learning_rate": 1.6157096694009448e-06,
"loss": 0.136,
"step": 4340
},
{
"epoch": 9.073154362416107,
"grad_norm": 0.09723786264657974,
"learning_rate": 1.4914243102162565e-06,
"loss": 0.1669,
"step": 4350
},
{
"epoch": 9.075391498881432,
"grad_norm": 0.028177455067634583,
"learning_rate": 1.3671389510315684e-06,
"loss": 0.0808,
"step": 4360
},
{
"epoch": 9.077628635346755,
"grad_norm": 13.670249938964844,
"learning_rate": 1.2428535918468804e-06,
"loss": 0.1512,
"step": 4370
},
{
"epoch": 9.07986577181208,
"grad_norm": 0.34152135252952576,
"learning_rate": 1.1185682326621925e-06,
"loss": 0.1886,
"step": 4380
},
{
"epoch": 9.082102908277404,
"grad_norm": 0.24342969059944153,
"learning_rate": 9.942828734775044e-07,
"loss": 0.2693,
"step": 4390
},
{
"epoch": 9.08434004474273,
"grad_norm": 0.17166166007518768,
"learning_rate": 8.699975142928163e-07,
"loss": 0.1354,
"step": 4400
},
{
"epoch": 9.086577181208053,
"grad_norm": 0.027880514040589333,
"learning_rate": 7.457121551081283e-07,
"loss": 0.1474,
"step": 4410
},
{
"epoch": 9.088814317673378,
"grad_norm": 0.05772515758872032,
"learning_rate": 6.214267959234402e-07,
"loss": 0.0538,
"step": 4420
},
{
"epoch": 9.091051454138702,
"grad_norm": 0.07866553962230682,
"learning_rate": 4.971414367387522e-07,
"loss": 0.1434,
"step": 4430
},
{
"epoch": 9.093288590604027,
"grad_norm": 16.45406723022461,
"learning_rate": 3.7285607755406413e-07,
"loss": 0.1083,
"step": 4440
},
{
"epoch": 9.09552572706935,
"grad_norm": 0.050580114126205444,
"learning_rate": 2.485707183693761e-07,
"loss": 0.3213,
"step": 4450
},
{
"epoch": 9.097762863534676,
"grad_norm": 0.4725286364555359,
"learning_rate": 1.2428535918468805e-07,
"loss": 0.1468,
"step": 4460
},
{
"epoch": 9.1,
"grad_norm": 0.048940546810626984,
"learning_rate": 0.0,
"loss": 0.0525,
"step": 4470
},
{
"epoch": 9.1,
"eval_loss": 0.41542962193489075,
"eval_runtime": 893.9438,
"eval_samples_per_second": 8.296,
"eval_steps_per_second": 1.037,
"step": 4470
},
{
"epoch": 9.1,
"step": 4470,
"total_flos": 1.5702310103501242e+20,
"train_loss": 0.38359242084309025,
"train_runtime": 28356.0667,
"train_samples_per_second": 1.261,
"train_steps_per_second": 0.158
},
{
"epoch": 9.1,
"eval_loss": 0.39315128326416016,
"eval_runtime": 423.7958,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.326,
"step": 4470
},
{
"epoch": 9.1,
"eval_loss": 0.39315125346183777,
"eval_runtime": 426.8234,
"eval_samples_per_second": 2.577,
"eval_steps_per_second": 0.323,
"step": 4470
}
],
"logging_steps": 10,
"max_steps": 4470,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5702310103501242e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}