ceb_b64_le4_s8000 / last-checkpoint / trainer_state.json
Commit a084b69: "Training in progress, step 8000, checkpoint"
{
"best_metric": 0.39598318934440613,
"best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s8000/checkpoint-3500",
"epoch": 316.83168316831683,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.9801980198019802,
"grad_norm": 1.725104808807373,
"learning_rate": 2.5e-06,
"loss": 0.7809,
"step": 50
},
{
"epoch": 3.9603960396039604,
"grad_norm": 1.6038875579833984,
"learning_rate": 5e-06,
"loss": 0.7005,
"step": 100
},
{
"epoch": 5.9405940594059405,
"grad_norm": 1.6793955564498901,
"learning_rate": 7.5e-06,
"loss": 0.6246,
"step": 150
},
{
"epoch": 7.920792079207921,
"grad_norm": 1.5051871538162231,
"learning_rate": 1e-05,
"loss": 0.5278,
"step": 200
},
{
"epoch": 9.900990099009901,
"grad_norm": 1.404683232307434,
"learning_rate": 1.25e-05,
"loss": 0.5095,
"step": 250
},
{
"epoch": 11.881188118811881,
"grad_norm": 1.248382568359375,
"learning_rate": 1.5e-05,
"loss": 0.4814,
"step": 300
},
{
"epoch": 13.861386138613861,
"grad_norm": 0.995944082736969,
"learning_rate": 1.75e-05,
"loss": 0.4743,
"step": 350
},
{
"epoch": 15.841584158415841,
"grad_norm": 1.472835659980774,
"learning_rate": 2e-05,
"loss": 0.4631,
"step": 400
},
{
"epoch": 17.821782178217823,
"grad_norm": 2.442906618118286,
"learning_rate": 2.245e-05,
"loss": 0.4622,
"step": 450
},
{
"epoch": 19.801980198019802,
"grad_norm": 1.071074366569519,
"learning_rate": 2.495e-05,
"loss": 0.4561,
"step": 500
},
{
"epoch": 19.801980198019802,
"eval_loss": 0.41511112451553345,
"eval_runtime": 7.4967,
"eval_samples_per_second": 24.011,
"eval_steps_per_second": 3.068,
"step": 500
},
{
"epoch": 21.782178217821784,
"grad_norm": 1.0462185144424438,
"learning_rate": 2.7450000000000003e-05,
"loss": 0.4465,
"step": 550
},
{
"epoch": 23.762376237623762,
"grad_norm": 1.103574275970459,
"learning_rate": 2.995e-05,
"loss": 0.4453,
"step": 600
},
{
"epoch": 25.742574257425744,
"grad_norm": 3.00575590133667,
"learning_rate": 3.245e-05,
"loss": 0.447,
"step": 650
},
{
"epoch": 27.722772277227723,
"grad_norm": 1.786911129951477,
"learning_rate": 3.495e-05,
"loss": 0.4351,
"step": 700
},
{
"epoch": 29.702970297029704,
"grad_norm": 1.236941933631897,
"learning_rate": 3.745e-05,
"loss": 0.4347,
"step": 750
},
{
"epoch": 31.683168316831683,
"grad_norm": 1.3743062019348145,
"learning_rate": 3.995e-05,
"loss": 0.4319,
"step": 800
},
{
"epoch": 33.663366336633665,
"grad_norm": 2.7615420818328857,
"learning_rate": 4.245e-05,
"loss": 0.4358,
"step": 850
},
{
"epoch": 35.64356435643565,
"grad_norm": 1.662369966506958,
"learning_rate": 4.495e-05,
"loss": 0.4276,
"step": 900
},
{
"epoch": 37.62376237623762,
"grad_norm": 1.0967382192611694,
"learning_rate": 4.745e-05,
"loss": 0.4267,
"step": 950
},
{
"epoch": 39.603960396039604,
"grad_norm": 2.530874252319336,
"learning_rate": 4.995e-05,
"loss": 0.4179,
"step": 1000
},
{
"epoch": 39.603960396039604,
"eval_loss": 0.39941468834877014,
"eval_runtime": 7.4617,
"eval_samples_per_second": 24.123,
"eval_steps_per_second": 3.082,
"step": 1000
},
{
"epoch": 41.584158415841586,
"grad_norm": 2.8653476238250732,
"learning_rate": 5.245e-05,
"loss": 0.4268,
"step": 1050
},
{
"epoch": 43.56435643564357,
"grad_norm": 1.5550223588943481,
"learning_rate": 5.495e-05,
"loss": 0.4265,
"step": 1100
},
{
"epoch": 45.54455445544554,
"grad_norm": 1.804150104522705,
"learning_rate": 5.745e-05,
"loss": 0.4192,
"step": 1150
},
{
"epoch": 47.524752475247524,
"grad_norm": 1.9916889667510986,
"learning_rate": 5.995000000000001e-05,
"loss": 0.4149,
"step": 1200
},
{
"epoch": 49.504950495049506,
"grad_norm": 2.1027019023895264,
"learning_rate": 6.245000000000001e-05,
"loss": 0.4203,
"step": 1250
},
{
"epoch": 51.48514851485149,
"grad_norm": 1.1542466878890991,
"learning_rate": 6.494999999999999e-05,
"loss": 0.4127,
"step": 1300
},
{
"epoch": 53.46534653465346,
"grad_norm": 1.8733513355255127,
"learning_rate": 6.745e-05,
"loss": 0.4165,
"step": 1350
},
{
"epoch": 55.445544554455445,
"grad_norm": 2.544435739517212,
"learning_rate": 6.995e-05,
"loss": 0.4156,
"step": 1400
},
{
"epoch": 57.42574257425743,
"grad_norm": 2.9764773845672607,
"learning_rate": 7.245000000000001e-05,
"loss": 0.4045,
"step": 1450
},
{
"epoch": 59.40594059405941,
"grad_norm": 1.334035038948059,
"learning_rate": 7.495e-05,
"loss": 0.4075,
"step": 1500
},
{
"epoch": 59.40594059405941,
"eval_loss": 0.40177610516548157,
"eval_runtime": 6.8721,
"eval_samples_per_second": 26.193,
"eval_steps_per_second": 3.347,
"step": 1500
},
{
"epoch": 61.386138613861384,
"grad_norm": 2.3007051944732666,
"learning_rate": 7.745e-05,
"loss": 0.4067,
"step": 1550
},
{
"epoch": 63.366336633663366,
"grad_norm": 0.9966986179351807,
"learning_rate": 7.995e-05,
"loss": 0.4042,
"step": 1600
},
{
"epoch": 65.34653465346534,
"grad_norm": 1.4066482782363892,
"learning_rate": 8.245e-05,
"loss": 0.4079,
"step": 1650
},
{
"epoch": 67.32673267326733,
"grad_norm": 3.3195865154266357,
"learning_rate": 8.495e-05,
"loss": 0.4061,
"step": 1700
},
{
"epoch": 69.3069306930693,
"grad_norm": 2.83154559135437,
"learning_rate": 8.745000000000001e-05,
"loss": 0.4028,
"step": 1750
},
{
"epoch": 71.2871287128713,
"grad_norm": 1.5752816200256348,
"learning_rate": 8.995e-05,
"loss": 0.3977,
"step": 1800
},
{
"epoch": 73.26732673267327,
"grad_norm": 1.8909986019134521,
"learning_rate": 9.245e-05,
"loss": 0.4013,
"step": 1850
},
{
"epoch": 75.24752475247524,
"grad_norm": 4.082262992858887,
"learning_rate": 9.495e-05,
"loss": 0.3991,
"step": 1900
},
{
"epoch": 77.22772277227723,
"grad_norm": 1.8281221389770508,
"learning_rate": 9.745000000000001e-05,
"loss": 0.4011,
"step": 1950
},
{
"epoch": 79.20792079207921,
"grad_norm": 2.2827675342559814,
"learning_rate": 9.995e-05,
"loss": 0.3981,
"step": 2000
},
{
"epoch": 79.20792079207921,
"eval_loss": 0.40288153290748596,
"eval_runtime": 7.8052,
"eval_samples_per_second": 23.062,
"eval_steps_per_second": 2.947,
"step": 2000
},
{
"epoch": 81.18811881188118,
"grad_norm": 2.6100072860717773,
"learning_rate": 9.918333333333334e-05,
"loss": 0.3996,
"step": 2050
},
{
"epoch": 83.16831683168317,
"grad_norm": 1.1003444194793701,
"learning_rate": 9.835e-05,
"loss": 0.3999,
"step": 2100
},
{
"epoch": 85.14851485148515,
"grad_norm": 1.4783449172973633,
"learning_rate": 9.751666666666666e-05,
"loss": 0.3951,
"step": 2150
},
{
"epoch": 87.12871287128714,
"grad_norm": 2.3728928565979004,
"learning_rate": 9.668333333333334e-05,
"loss": 0.3831,
"step": 2200
},
{
"epoch": 89.10891089108911,
"grad_norm": 1.2834324836730957,
"learning_rate": 9.585000000000001e-05,
"loss": 0.3869,
"step": 2250
},
{
"epoch": 91.08910891089108,
"grad_norm": 1.771146535873413,
"learning_rate": 9.501666666666668e-05,
"loss": 0.3896,
"step": 2300
},
{
"epoch": 93.06930693069307,
"grad_norm": 2.136204481124878,
"learning_rate": 9.418333333333334e-05,
"loss": 0.3898,
"step": 2350
},
{
"epoch": 95.04950495049505,
"grad_norm": 0.8848810791969299,
"learning_rate": 9.335e-05,
"loss": 0.3875,
"step": 2400
},
{
"epoch": 97.02970297029702,
"grad_norm": 1.2002694606781006,
"learning_rate": 9.251666666666667e-05,
"loss": 0.3808,
"step": 2450
},
{
"epoch": 99.00990099009901,
"grad_norm": 1.392091155052185,
"learning_rate": 9.168333333333333e-05,
"loss": 0.3862,
"step": 2500
},
{
"epoch": 99.00990099009901,
"eval_loss": 0.39783453941345215,
"eval_runtime": 7.729,
"eval_samples_per_second": 23.289,
"eval_steps_per_second": 2.976,
"step": 2500
},
{
"epoch": 100.99009900990099,
"grad_norm": 1.1166267395019531,
"learning_rate": 9.085e-05,
"loss": 0.3824,
"step": 2550
},
{
"epoch": 102.97029702970298,
"grad_norm": 1.4629709720611572,
"learning_rate": 9.001666666666667e-05,
"loss": 0.3829,
"step": 2600
},
{
"epoch": 104.95049504950495,
"grad_norm": 2.9931211471557617,
"learning_rate": 8.918333333333334e-05,
"loss": 0.3756,
"step": 2650
},
{
"epoch": 106.93069306930693,
"grad_norm": 1.6760491132736206,
"learning_rate": 8.834999999999999e-05,
"loss": 0.3815,
"step": 2700
},
{
"epoch": 108.91089108910892,
"grad_norm": 1.8942713737487793,
"learning_rate": 8.751666666666668e-05,
"loss": 0.3773,
"step": 2750
},
{
"epoch": 110.89108910891089,
"grad_norm": 1.110032081604004,
"learning_rate": 8.668333333333334e-05,
"loss": 0.3747,
"step": 2800
},
{
"epoch": 112.87128712871286,
"grad_norm": 1.3915964365005493,
"learning_rate": 8.585000000000001e-05,
"loss": 0.3796,
"step": 2850
},
{
"epoch": 114.85148514851485,
"grad_norm": 2.8676748275756836,
"learning_rate": 8.501666666666667e-05,
"loss": 0.3731,
"step": 2900
},
{
"epoch": 116.83168316831683,
"grad_norm": 1.0008431673049927,
"learning_rate": 8.418333333333334e-05,
"loss": 0.3747,
"step": 2950
},
{
"epoch": 118.81188118811882,
"grad_norm": 2.071352243423462,
"learning_rate": 8.335e-05,
"loss": 0.3726,
"step": 3000
},
{
"epoch": 118.81188118811882,
"eval_loss": 0.3978251516819,
"eval_runtime": 8.1696,
"eval_samples_per_second": 22.033,
"eval_steps_per_second": 2.815,
"step": 3000
},
{
"epoch": 120.79207920792079,
"grad_norm": 0.8712412118911743,
"learning_rate": 8.251666666666668e-05,
"loss": 0.3675,
"step": 3050
},
{
"epoch": 122.77227722772277,
"grad_norm": 4.452208042144775,
"learning_rate": 8.168333333333333e-05,
"loss": 0.3687,
"step": 3100
},
{
"epoch": 124.75247524752476,
"grad_norm": 2.735180377960205,
"learning_rate": 8.085e-05,
"loss": 0.3749,
"step": 3150
},
{
"epoch": 126.73267326732673,
"grad_norm": 2.1853744983673096,
"learning_rate": 8.001666666666667e-05,
"loss": 0.3733,
"step": 3200
},
{
"epoch": 128.7128712871287,
"grad_norm": 3.216191530227661,
"learning_rate": 7.918333333333334e-05,
"loss": 0.369,
"step": 3250
},
{
"epoch": 130.69306930693068,
"grad_norm": 1.2702809572219849,
"learning_rate": 7.835000000000001e-05,
"loss": 0.3673,
"step": 3300
},
{
"epoch": 132.67326732673268,
"grad_norm": 2.0314784049987793,
"learning_rate": 7.751666666666668e-05,
"loss": 0.3671,
"step": 3350
},
{
"epoch": 134.65346534653466,
"grad_norm": 2.0706610679626465,
"learning_rate": 7.668333333333335e-05,
"loss": 0.3625,
"step": 3400
},
{
"epoch": 136.63366336633663,
"grad_norm": 1.2799315452575684,
"learning_rate": 7.585e-05,
"loss": 0.3646,
"step": 3450
},
{
"epoch": 138.6138613861386,
"grad_norm": 1.2347270250320435,
"learning_rate": 7.501666666666667e-05,
"loss": 0.365,
"step": 3500
},
{
"epoch": 138.6138613861386,
"eval_loss": 0.39598318934440613,
"eval_runtime": 7.6328,
"eval_samples_per_second": 23.582,
"eval_steps_per_second": 3.013,
"step": 3500
},
{
"epoch": 140.59405940594058,
"grad_norm": 2.1505396366119385,
"learning_rate": 7.418333333333334e-05,
"loss": 0.367,
"step": 3550
},
{
"epoch": 142.5742574257426,
"grad_norm": 1.6036536693572998,
"learning_rate": 7.335000000000001e-05,
"loss": 0.3622,
"step": 3600
},
{
"epoch": 144.55445544554456,
"grad_norm": 1.1357529163360596,
"learning_rate": 7.251666666666666e-05,
"loss": 0.3589,
"step": 3650
},
{
"epoch": 146.53465346534654,
"grad_norm": 1.5478957891464233,
"learning_rate": 7.168333333333333e-05,
"loss": 0.3577,
"step": 3700
},
{
"epoch": 148.5148514851485,
"grad_norm": 1.0070338249206543,
"learning_rate": 7.085e-05,
"loss": 0.3582,
"step": 3750
},
{
"epoch": 150.4950495049505,
"grad_norm": 0.9300253987312317,
"learning_rate": 7.001666666666667e-05,
"loss": 0.3563,
"step": 3800
},
{
"epoch": 152.47524752475246,
"grad_norm": 0.9197555184364319,
"learning_rate": 6.918333333333334e-05,
"loss": 0.3514,
"step": 3850
},
{
"epoch": 154.45544554455446,
"grad_norm": 0.6059859991073608,
"learning_rate": 6.835000000000001e-05,
"loss": 0.3575,
"step": 3900
},
{
"epoch": 156.43564356435644,
"grad_norm": 0.7884564399719238,
"learning_rate": 6.751666666666668e-05,
"loss": 0.3613,
"step": 3950
},
{
"epoch": 158.41584158415841,
"grad_norm": 0.7471904754638672,
"learning_rate": 6.668333333333333e-05,
"loss": 0.3525,
"step": 4000
},
{
"epoch": 158.41584158415841,
"eval_loss": 0.39685142040252686,
"eval_runtime": 7.1693,
"eval_samples_per_second": 25.107,
"eval_steps_per_second": 3.208,
"step": 4000
},
{
"epoch": 160.3960396039604,
"grad_norm": 0.9373750686645508,
"learning_rate": 6.585e-05,
"loss": 0.3537,
"step": 4050
},
{
"epoch": 162.37623762376236,
"grad_norm": 1.3369851112365723,
"learning_rate": 6.501666666666667e-05,
"loss": 0.3585,
"step": 4100
},
{
"epoch": 164.35643564356437,
"grad_norm": 0.6891220211982727,
"learning_rate": 6.418333333333334e-05,
"loss": 0.3519,
"step": 4150
},
{
"epoch": 166.33663366336634,
"grad_norm": 0.8272483944892883,
"learning_rate": 6.335e-05,
"loss": 0.3542,
"step": 4200
},
{
"epoch": 168.31683168316832,
"grad_norm": 0.9853746891021729,
"learning_rate": 6.251666666666666e-05,
"loss": 0.3553,
"step": 4250
},
{
"epoch": 170.2970297029703,
"grad_norm": 1.0020989179611206,
"learning_rate": 6.168333333333333e-05,
"loss": 0.3558,
"step": 4300
},
{
"epoch": 172.27722772277227,
"grad_norm": 1.4780181646347046,
"learning_rate": 6.085000000000001e-05,
"loss": 0.3505,
"step": 4350
},
{
"epoch": 174.25742574257427,
"grad_norm": 0.9966872334480286,
"learning_rate": 6.0016666666666664e-05,
"loss": 0.3513,
"step": 4400
},
{
"epoch": 176.23762376237624,
"grad_norm": 1.2055169343948364,
"learning_rate": 5.918333333333333e-05,
"loss": 0.3509,
"step": 4450
},
{
"epoch": 178.21782178217822,
"grad_norm": 1.075426697731018,
"learning_rate": 5.835e-05,
"loss": 0.3545,
"step": 4500
},
{
"epoch": 178.21782178217822,
"eval_loss": 0.3981594443321228,
"eval_runtime": 6.8387,
"eval_samples_per_second": 26.321,
"eval_steps_per_second": 3.363,
"step": 4500
},
{
"epoch": 180.1980198019802,
"grad_norm": 1.0541815757751465,
"learning_rate": 5.751666666666667e-05,
"loss": 0.3473,
"step": 4550
},
{
"epoch": 182.17821782178217,
"grad_norm": 2.1192638874053955,
"learning_rate": 5.668333333333333e-05,
"loss": 0.348,
"step": 4600
},
{
"epoch": 184.15841584158414,
"grad_norm": 1.2069100141525269,
"learning_rate": 5.585e-05,
"loss": 0.3463,
"step": 4650
},
{
"epoch": 186.13861386138615,
"grad_norm": 0.9461864233016968,
"learning_rate": 5.501666666666667e-05,
"loss": 0.3471,
"step": 4700
},
{
"epoch": 188.11881188118812,
"grad_norm": 1.0580745935440063,
"learning_rate": 5.4183333333333334e-05,
"loss": 0.3485,
"step": 4750
},
{
"epoch": 190.0990099009901,
"grad_norm": 0.7629022002220154,
"learning_rate": 5.335e-05,
"loss": 0.346,
"step": 4800
},
{
"epoch": 192.07920792079207,
"grad_norm": 0.7628908753395081,
"learning_rate": 5.251666666666667e-05,
"loss": 0.3487,
"step": 4850
},
{
"epoch": 194.05940594059405,
"grad_norm": 1.024609088897705,
"learning_rate": 5.168333333333334e-05,
"loss": 0.3486,
"step": 4900
},
{
"epoch": 196.03960396039605,
"grad_norm": 0.8158652186393738,
"learning_rate": 5.0849999999999996e-05,
"loss": 0.3456,
"step": 4950
},
{
"epoch": 198.01980198019803,
"grad_norm": 1.0953030586242676,
"learning_rate": 5.0016666666666665e-05,
"loss": 0.3473,
"step": 5000
},
{
"epoch": 198.01980198019803,
"eval_loss": 0.40393778681755066,
"eval_runtime": 6.9458,
"eval_samples_per_second": 25.915,
"eval_steps_per_second": 3.311,
"step": 5000
},
{
"epoch": 200.0,
"grad_norm": 1.864687442779541,
"learning_rate": 4.9183333333333334e-05,
"loss": 0.3484,
"step": 5050
},
{
"epoch": 201.98019801980197,
"grad_norm": 1.406449556350708,
"learning_rate": 4.835e-05,
"loss": 0.345,
"step": 5100
},
{
"epoch": 203.96039603960395,
"grad_norm": 0.7522682547569275,
"learning_rate": 4.751666666666667e-05,
"loss": 0.3468,
"step": 5150
},
{
"epoch": 205.94059405940595,
"grad_norm": 0.5859296321868896,
"learning_rate": 4.6683333333333334e-05,
"loss": 0.3432,
"step": 5200
},
{
"epoch": 207.92079207920793,
"grad_norm": 0.6594001054763794,
"learning_rate": 4.585e-05,
"loss": 0.3417,
"step": 5250
},
{
"epoch": 209.9009900990099,
"grad_norm": 1.0125696659088135,
"learning_rate": 4.5016666666666665e-05,
"loss": 0.3428,
"step": 5300
},
{
"epoch": 211.88118811881188,
"grad_norm": 0.8519133925437927,
"learning_rate": 4.4183333333333334e-05,
"loss": 0.3424,
"step": 5350
},
{
"epoch": 213.86138613861385,
"grad_norm": 0.8138070106506348,
"learning_rate": 4.335e-05,
"loss": 0.3411,
"step": 5400
},
{
"epoch": 215.84158415841586,
"grad_norm": 1.7046844959259033,
"learning_rate": 4.251666666666667e-05,
"loss": 0.3418,
"step": 5450
},
{
"epoch": 217.82178217821783,
"grad_norm": 0.8346728682518005,
"learning_rate": 4.1683333333333335e-05,
"loss": 0.3439,
"step": 5500
},
{
"epoch": 217.82178217821783,
"eval_loss": 0.40201354026794434,
"eval_runtime": 7.7869,
"eval_samples_per_second": 23.116,
"eval_steps_per_second": 2.954,
"step": 5500
},
{
"epoch": 219.8019801980198,
"grad_norm": 0.7159820199012756,
"learning_rate": 4.085e-05,
"loss": 0.3419,
"step": 5550
},
{
"epoch": 221.78217821782178,
"grad_norm": 1.4013868570327759,
"learning_rate": 4.0016666666666666e-05,
"loss": 0.3358,
"step": 5600
},
{
"epoch": 223.76237623762376,
"grad_norm": 1.4386184215545654,
"learning_rate": 3.9183333333333335e-05,
"loss": 0.3457,
"step": 5650
},
{
"epoch": 225.74257425742573,
"grad_norm": 1.1353213787078857,
"learning_rate": 3.8350000000000004e-05,
"loss": 0.3405,
"step": 5700
},
{
"epoch": 227.72277227722773,
"grad_norm": 1.091909646987915,
"learning_rate": 3.7516666666666666e-05,
"loss": 0.3403,
"step": 5750
},
{
"epoch": 229.7029702970297,
"grad_norm": 0.8275148272514343,
"learning_rate": 3.6683333333333335e-05,
"loss": 0.3404,
"step": 5800
},
{
"epoch": 231.68316831683168,
"grad_norm": 0.6606130599975586,
"learning_rate": 3.585e-05,
"loss": 0.3416,
"step": 5850
},
{
"epoch": 233.66336633663366,
"grad_norm": 1.0569533109664917,
"learning_rate": 3.501666666666667e-05,
"loss": 0.3404,
"step": 5900
},
{
"epoch": 235.64356435643563,
"grad_norm": 0.8686895370483398,
"learning_rate": 3.4183333333333335e-05,
"loss": 0.3397,
"step": 5950
},
{
"epoch": 237.62376237623764,
"grad_norm": 0.8039170503616333,
"learning_rate": 3.3350000000000004e-05,
"loss": 0.3371,
"step": 6000
},
{
"epoch": 237.62376237623764,
"eval_loss": 0.4044432044029236,
"eval_runtime": 7.7329,
"eval_samples_per_second": 23.277,
"eval_steps_per_second": 2.974,
"step": 6000
},
{
"epoch": 239.6039603960396,
"grad_norm": 0.5451411604881287,
"learning_rate": 3.2516666666666666e-05,
"loss": 0.3394,
"step": 6050
},
{
"epoch": 241.58415841584159,
"grad_norm": 0.6792750954627991,
"learning_rate": 3.1683333333333335e-05,
"loss": 0.3379,
"step": 6100
},
{
"epoch": 243.56435643564356,
"grad_norm": 0.6445412635803223,
"learning_rate": 3.0850000000000004e-05,
"loss": 0.3389,
"step": 6150
},
{
"epoch": 245.54455445544554,
"grad_norm": 0.9960897564888,
"learning_rate": 3.001666666666667e-05,
"loss": 0.3352,
"step": 6200
},
{
"epoch": 247.52475247524754,
"grad_norm": 0.7753505110740662,
"learning_rate": 2.9183333333333336e-05,
"loss": 0.3375,
"step": 6250
},
{
"epoch": 249.5049504950495,
"grad_norm": 0.5568383932113647,
"learning_rate": 2.8349999999999998e-05,
"loss": 0.3386,
"step": 6300
},
{
"epoch": 251.4851485148515,
"grad_norm": 0.6036835312843323,
"learning_rate": 2.7516666666666667e-05,
"loss": 0.3356,
"step": 6350
},
{
"epoch": 253.46534653465346,
"grad_norm": 1.170256495475769,
"learning_rate": 2.6683333333333333e-05,
"loss": 0.3327,
"step": 6400
},
{
"epoch": 255.44554455445544,
"grad_norm": 0.6887166500091553,
"learning_rate": 2.585e-05,
"loss": 0.3373,
"step": 6450
},
{
"epoch": 257.4257425742574,
"grad_norm": 0.6323124170303345,
"learning_rate": 2.5016666666666667e-05,
"loss": 0.3362,
"step": 6500
},
{
"epoch": 257.4257425742574,
"eval_loss": 0.40408840775489807,
"eval_runtime": 7.7698,
"eval_samples_per_second": 23.167,
"eval_steps_per_second": 2.96,
"step": 6500
},
{
"epoch": 259.4059405940594,
"grad_norm": 0.7631197571754456,
"learning_rate": 2.4183333333333336e-05,
"loss": 0.3325,
"step": 6550
},
{
"epoch": 261.38613861386136,
"grad_norm": 0.6006826162338257,
"learning_rate": 2.3350000000000002e-05,
"loss": 0.3348,
"step": 6600
},
{
"epoch": 263.36633663366337,
"grad_norm": 0.7407628297805786,
"learning_rate": 2.2516666666666667e-05,
"loss": 0.3317,
"step": 6650
},
{
"epoch": 265.34653465346537,
"grad_norm": 0.5582762956619263,
"learning_rate": 2.1683333333333333e-05,
"loss": 0.3334,
"step": 6700
},
{
"epoch": 267.3267326732673,
"grad_norm": 0.4441429674625397,
"learning_rate": 2.085e-05,
"loss": 0.3308,
"step": 6750
},
{
"epoch": 269.3069306930693,
"grad_norm": 0.6358359456062317,
"learning_rate": 2.0016666666666668e-05,
"loss": 0.3302,
"step": 6800
},
{
"epoch": 271.28712871287127,
"grad_norm": 0.5992699861526489,
"learning_rate": 1.9183333333333333e-05,
"loss": 0.3335,
"step": 6850
},
{
"epoch": 273.26732673267327,
"grad_norm": 0.49822068214416504,
"learning_rate": 1.8350000000000002e-05,
"loss": 0.3325,
"step": 6900
},
{
"epoch": 275.2475247524753,
"grad_norm": 0.6612289547920227,
"learning_rate": 1.7516666666666668e-05,
"loss": 0.3373,
"step": 6950
},
{
"epoch": 277.2277227722772,
"grad_norm": 0.5066806674003601,
"learning_rate": 1.6683333333333333e-05,
"loss": 0.3311,
"step": 7000
},
{
"epoch": 277.2277227722772,
"eval_loss": 0.4022347033023834,
"eval_runtime": 6.5767,
"eval_samples_per_second": 27.369,
"eval_steps_per_second": 3.497,
"step": 7000
},
{
"epoch": 279.2079207920792,
"grad_norm": 0.5922915935516357,
"learning_rate": 1.5850000000000002e-05,
"loss": 0.331,
"step": 7050
},
{
"epoch": 281.18811881188117,
"grad_norm": 0.49854084849357605,
"learning_rate": 1.5016666666666668e-05,
"loss": 0.3292,
"step": 7100
},
{
"epoch": 283.16831683168317,
"grad_norm": 0.534227192401886,
"learning_rate": 1.4183333333333335e-05,
"loss": 0.3295,
"step": 7150
},
{
"epoch": 285.1485148514852,
"grad_norm": 0.4879334568977356,
"learning_rate": 1.3350000000000001e-05,
"loss": 0.3295,
"step": 7200
},
{
"epoch": 287.1287128712871,
"grad_norm": 0.4761298596858978,
"learning_rate": 1.2516666666666668e-05,
"loss": 0.333,
"step": 7250
},
{
"epoch": 289.1089108910891,
"grad_norm": 0.5835270881652832,
"learning_rate": 1.1683333333333334e-05,
"loss": 0.3311,
"step": 7300
},
{
"epoch": 291.08910891089107,
"grad_norm": 0.5297247767448425,
"learning_rate": 1.0866666666666667e-05,
"loss": 0.333,
"step": 7350
},
{
"epoch": 293.0693069306931,
"grad_norm": 0.44668009877204895,
"learning_rate": 1.0033333333333333e-05,
"loss": 0.3282,
"step": 7400
},
{
"epoch": 295.0495049504951,
"grad_norm": 0.47231703996658325,
"learning_rate": 9.2e-06,
"loss": 0.3309,
"step": 7450
},
{
"epoch": 297.029702970297,
"grad_norm": 0.5559085011482239,
"learning_rate": 8.366666666666667e-06,
"loss": 0.3345,
"step": 7500
},
{
"epoch": 297.029702970297,
"eval_loss": 0.40512633323669434,
"eval_runtime": 6.9059,
"eval_samples_per_second": 26.065,
"eval_steps_per_second": 3.33,
"step": 7500
},
{
"epoch": 299.009900990099,
"grad_norm": 0.5674709677696228,
"learning_rate": 7.533333333333334e-06,
"loss": 0.3317,
"step": 7550
},
{
"epoch": 300.990099009901,
"grad_norm": 0.5428618788719177,
"learning_rate": 6.700000000000001e-06,
"loss": 0.3322,
"step": 7600
},
{
"epoch": 302.970297029703,
"grad_norm": 0.6271554827690125,
"learning_rate": 5.866666666666667e-06,
"loss": 0.3337,
"step": 7650
},
{
"epoch": 304.9504950495049,
"grad_norm": 0.41911429166793823,
"learning_rate": 5.033333333333334e-06,
"loss": 0.329,
"step": 7700
},
{
"epoch": 306.9306930693069,
"grad_norm": 0.4316006600856781,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.3338,
"step": 7750
},
{
"epoch": 308.91089108910893,
"grad_norm": 0.5471222400665283,
"learning_rate": 3.3666666666666665e-06,
"loss": 0.3316,
"step": 7800
},
{
"epoch": 310.8910891089109,
"grad_norm": 0.5605342388153076,
"learning_rate": 2.5333333333333334e-06,
"loss": 0.3289,
"step": 7850
},
{
"epoch": 312.8712871287129,
"grad_norm": 0.5504734516143799,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.3303,
"step": 7900
},
{
"epoch": 314.8514851485148,
"grad_norm": 0.5514795780181885,
"learning_rate": 8.666666666666667e-07,
"loss": 0.3282,
"step": 7950
},
{
"epoch": 316.83168316831683,
"grad_norm": 0.5700021982192993,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.3348,
"step": 8000
},
{
"epoch": 316.83168316831683,
"eval_loss": 0.4050144553184509,
"eval_runtime": 6.8387,
"eval_samples_per_second": 26.321,
"eval_steps_per_second": 3.363,
"step": 8000
}
],
"logging_steps": 50,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 320,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.643923525044128e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
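
For reference, a minimal sketch of how one might inspect this checkpoint's training history with plain Python: it parses the log_history array above and prints the best metric plus the periodic eval_loss entries. The local file path is an assumption; point it at wherever the checkpoint directory was downloaded.

import json

# Assumed local path to this file; adjust to your checkpoint location.
STATE_PATH = "last-checkpoint/trainer_state.json"

with open(STATE_PATH, "r", encoding="utf-8") as f:
    state = json.load(f)

print(f"best eval_loss:   {state['best_metric']:.4f}")
print(f"best checkpoint:  {state['best_model_checkpoint']}")

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

for entry in eval_log:
    print(f"step {entry['step']:>5}  eval_loss {entry['eval_loss']:.4f}")

print(f"{len(train_log)} training log entries, {len(eval_log)} eval points")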