{
"best_metric": 0.39598318934440613,
"best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s8000/checkpoint-3500",
"epoch": 217.82178217821783,
"eval_steps": 500,
"global_step": 5500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.9801980198019802,
"grad_norm": 1.725104808807373,
"learning_rate": 2.5e-06,
"loss": 0.7809,
"step": 50
},
{
"epoch": 3.9603960396039604,
"grad_norm": 1.6038875579833984,
"learning_rate": 5e-06,
"loss": 0.7005,
"step": 100
},
{
"epoch": 5.9405940594059405,
"grad_norm": 1.6793955564498901,
"learning_rate": 7.5e-06,
"loss": 0.6246,
"step": 150
},
{
"epoch": 7.920792079207921,
"grad_norm": 1.5051871538162231,
"learning_rate": 1e-05,
"loss": 0.5278,
"step": 200
},
{
"epoch": 9.900990099009901,
"grad_norm": 1.404683232307434,
"learning_rate": 1.25e-05,
"loss": 0.5095,
"step": 250
},
{
"epoch": 11.881188118811881,
"grad_norm": 1.248382568359375,
"learning_rate": 1.5e-05,
"loss": 0.4814,
"step": 300
},
{
"epoch": 13.861386138613861,
"grad_norm": 0.995944082736969,
"learning_rate": 1.75e-05,
"loss": 0.4743,
"step": 350
},
{
"epoch": 15.841584158415841,
"grad_norm": 1.472835659980774,
"learning_rate": 2e-05,
"loss": 0.4631,
"step": 400
},
{
"epoch": 17.821782178217823,
"grad_norm": 2.442906618118286,
"learning_rate": 2.245e-05,
"loss": 0.4622,
"step": 450
},
{
"epoch": 19.801980198019802,
"grad_norm": 1.071074366569519,
"learning_rate": 2.495e-05,
"loss": 0.4561,
"step": 500
},
{
"epoch": 19.801980198019802,
"eval_loss": 0.41511112451553345,
"eval_runtime": 7.4967,
"eval_samples_per_second": 24.011,
"eval_steps_per_second": 3.068,
"step": 500
},
{
"epoch": 21.782178217821784,
"grad_norm": 1.0462185144424438,
"learning_rate": 2.7450000000000003e-05,
"loss": 0.4465,
"step": 550
},
{
"epoch": 23.762376237623762,
"grad_norm": 1.103574275970459,
"learning_rate": 2.995e-05,
"loss": 0.4453,
"step": 600
},
{
"epoch": 25.742574257425744,
"grad_norm": 3.00575590133667,
"learning_rate": 3.245e-05,
"loss": 0.447,
"step": 650
},
{
"epoch": 27.722772277227723,
"grad_norm": 1.786911129951477,
"learning_rate": 3.495e-05,
"loss": 0.4351,
"step": 700
},
{
"epoch": 29.702970297029704,
"grad_norm": 1.236941933631897,
"learning_rate": 3.745e-05,
"loss": 0.4347,
"step": 750
},
{
"epoch": 31.683168316831683,
"grad_norm": 1.3743062019348145,
"learning_rate": 3.995e-05,
"loss": 0.4319,
"step": 800
},
{
"epoch": 33.663366336633665,
"grad_norm": 2.7615420818328857,
"learning_rate": 4.245e-05,
"loss": 0.4358,
"step": 850
},
{
"epoch": 35.64356435643565,
"grad_norm": 1.662369966506958,
"learning_rate": 4.495e-05,
"loss": 0.4276,
"step": 900
},
{
"epoch": 37.62376237623762,
"grad_norm": 1.0967382192611694,
"learning_rate": 4.745e-05,
"loss": 0.4267,
"step": 950
},
{
"epoch": 39.603960396039604,
"grad_norm": 2.530874252319336,
"learning_rate": 4.995e-05,
"loss": 0.4179,
"step": 1000
},
{
"epoch": 39.603960396039604,
"eval_loss": 0.39941468834877014,
"eval_runtime": 7.4617,
"eval_samples_per_second": 24.123,
"eval_steps_per_second": 3.082,
"step": 1000
},
{
"epoch": 41.584158415841586,
"grad_norm": 2.8653476238250732,
"learning_rate": 5.245e-05,
"loss": 0.4268,
"step": 1050
},
{
"epoch": 43.56435643564357,
"grad_norm": 1.5550223588943481,
"learning_rate": 5.495e-05,
"loss": 0.4265,
"step": 1100
},
{
"epoch": 45.54455445544554,
"grad_norm": 1.804150104522705,
"learning_rate": 5.745e-05,
"loss": 0.4192,
"step": 1150
},
{
"epoch": 47.524752475247524,
"grad_norm": 1.9916889667510986,
"learning_rate": 5.995000000000001e-05,
"loss": 0.4149,
"step": 1200
},
{
"epoch": 49.504950495049506,
"grad_norm": 2.1027019023895264,
"learning_rate": 6.245000000000001e-05,
"loss": 0.4203,
"step": 1250
},
{
"epoch": 51.48514851485149,
"grad_norm": 1.1542466878890991,
"learning_rate": 6.494999999999999e-05,
"loss": 0.4127,
"step": 1300
},
{
"epoch": 53.46534653465346,
"grad_norm": 1.8733513355255127,
"learning_rate": 6.745e-05,
"loss": 0.4165,
"step": 1350
},
{
"epoch": 55.445544554455445,
"grad_norm": 2.544435739517212,
"learning_rate": 6.995e-05,
"loss": 0.4156,
"step": 1400
},
{
"epoch": 57.42574257425743,
"grad_norm": 2.9764773845672607,
"learning_rate": 7.245000000000001e-05,
"loss": 0.4045,
"step": 1450
},
{
"epoch": 59.40594059405941,
"grad_norm": 1.334035038948059,
"learning_rate": 7.495e-05,
"loss": 0.4075,
"step": 1500
},
{
"epoch": 59.40594059405941,
"eval_loss": 0.40177610516548157,
"eval_runtime": 6.8721,
"eval_samples_per_second": 26.193,
"eval_steps_per_second": 3.347,
"step": 1500
},
{
"epoch": 61.386138613861384,
"grad_norm": 2.3007051944732666,
"learning_rate": 7.745e-05,
"loss": 0.4067,
"step": 1550
},
{
"epoch": 63.366336633663366,
"grad_norm": 0.9966986179351807,
"learning_rate": 7.995e-05,
"loss": 0.4042,
"step": 1600
},
{
"epoch": 65.34653465346534,
"grad_norm": 1.4066482782363892,
"learning_rate": 8.245e-05,
"loss": 0.4079,
"step": 1650
},
{
"epoch": 67.32673267326733,
"grad_norm": 3.3195865154266357,
"learning_rate": 8.495e-05,
"loss": 0.4061,
"step": 1700
},
{
"epoch": 69.3069306930693,
"grad_norm": 2.83154559135437,
"learning_rate": 8.745000000000001e-05,
"loss": 0.4028,
"step": 1750
},
{
"epoch": 71.2871287128713,
"grad_norm": 1.5752816200256348,
"learning_rate": 8.995e-05,
"loss": 0.3977,
"step": 1800
},
{
"epoch": 73.26732673267327,
"grad_norm": 1.8909986019134521,
"learning_rate": 9.245e-05,
"loss": 0.4013,
"step": 1850
},
{
"epoch": 75.24752475247524,
"grad_norm": 4.082262992858887,
"learning_rate": 9.495e-05,
"loss": 0.3991,
"step": 1900
},
{
"epoch": 77.22772277227723,
"grad_norm": 1.8281221389770508,
"learning_rate": 9.745000000000001e-05,
"loss": 0.4011,
"step": 1950
},
{
"epoch": 79.20792079207921,
"grad_norm": 2.2827675342559814,
"learning_rate": 9.995e-05,
"loss": 0.3981,
"step": 2000
},
{
"epoch": 79.20792079207921,
"eval_loss": 0.40288153290748596,
"eval_runtime": 7.8052,
"eval_samples_per_second": 23.062,
"eval_steps_per_second": 2.947,
"step": 2000
},
{
"epoch": 81.18811881188118,
"grad_norm": 2.6100072860717773,
"learning_rate": 9.918333333333334e-05,
"loss": 0.3996,
"step": 2050
},
{
"epoch": 83.16831683168317,
"grad_norm": 1.1003444194793701,
"learning_rate": 9.835e-05,
"loss": 0.3999,
"step": 2100
},
{
"epoch": 85.14851485148515,
"grad_norm": 1.4783449172973633,
"learning_rate": 9.751666666666666e-05,
"loss": 0.3951,
"step": 2150
},
{
"epoch": 87.12871287128714,
"grad_norm": 2.3728928565979004,
"learning_rate": 9.668333333333334e-05,
"loss": 0.3831,
"step": 2200
},
{
"epoch": 89.10891089108911,
"grad_norm": 1.2834324836730957,
"learning_rate": 9.585000000000001e-05,
"loss": 0.3869,
"step": 2250
},
{
"epoch": 91.08910891089108,
"grad_norm": 1.771146535873413,
"learning_rate": 9.501666666666668e-05,
"loss": 0.3896,
"step": 2300
},
{
"epoch": 93.06930693069307,
"grad_norm": 2.136204481124878,
"learning_rate": 9.418333333333334e-05,
"loss": 0.3898,
"step": 2350
},
{
"epoch": 95.04950495049505,
"grad_norm": 0.8848810791969299,
"learning_rate": 9.335e-05,
"loss": 0.3875,
"step": 2400
},
{
"epoch": 97.02970297029702,
"grad_norm": 1.2002694606781006,
"learning_rate": 9.251666666666667e-05,
"loss": 0.3808,
"step": 2450
},
{
"epoch": 99.00990099009901,
"grad_norm": 1.392091155052185,
"learning_rate": 9.168333333333333e-05,
"loss": 0.3862,
"step": 2500
},
{
"epoch": 99.00990099009901,
"eval_loss": 0.39783453941345215,
"eval_runtime": 7.729,
"eval_samples_per_second": 23.289,
"eval_steps_per_second": 2.976,
"step": 2500
},
{
"epoch": 100.99009900990099,
"grad_norm": 1.1166267395019531,
"learning_rate": 9.085e-05,
"loss": 0.3824,
"step": 2550
},
{
"epoch": 102.97029702970298,
"grad_norm": 1.4629709720611572,
"learning_rate": 9.001666666666667e-05,
"loss": 0.3829,
"step": 2600
},
{
"epoch": 104.95049504950495,
"grad_norm": 2.9931211471557617,
"learning_rate": 8.918333333333334e-05,
"loss": 0.3756,
"step": 2650
},
{
"epoch": 106.93069306930693,
"grad_norm": 1.6760491132736206,
"learning_rate": 8.834999999999999e-05,
"loss": 0.3815,
"step": 2700
},
{
"epoch": 108.91089108910892,
"grad_norm": 1.8942713737487793,
"learning_rate": 8.751666666666668e-05,
"loss": 0.3773,
"step": 2750
},
{
"epoch": 110.89108910891089,
"grad_norm": 1.110032081604004,
"learning_rate": 8.668333333333334e-05,
"loss": 0.3747,
"step": 2800
},
{
"epoch": 112.87128712871286,
"grad_norm": 1.3915964365005493,
"learning_rate": 8.585000000000001e-05,
"loss": 0.3796,
"step": 2850
},
{
"epoch": 114.85148514851485,
"grad_norm": 2.8676748275756836,
"learning_rate": 8.501666666666667e-05,
"loss": 0.3731,
"step": 2900
},
{
"epoch": 116.83168316831683,
"grad_norm": 1.0008431673049927,
"learning_rate": 8.418333333333334e-05,
"loss": 0.3747,
"step": 2950
},
{
"epoch": 118.81188118811882,
"grad_norm": 2.071352243423462,
"learning_rate": 8.335e-05,
"loss": 0.3726,
"step": 3000
},
{
"epoch": 118.81188118811882,
"eval_loss": 0.3978251516819,
"eval_runtime": 8.1696,
"eval_samples_per_second": 22.033,
"eval_steps_per_second": 2.815,
"step": 3000
},
{
"epoch": 120.79207920792079,
"grad_norm": 0.8712412118911743,
"learning_rate": 8.251666666666668e-05,
"loss": 0.3675,
"step": 3050
},
{
"epoch": 122.77227722772277,
"grad_norm": 4.452208042144775,
"learning_rate": 8.168333333333333e-05,
"loss": 0.3687,
"step": 3100
},
{
"epoch": 124.75247524752476,
"grad_norm": 2.735180377960205,
"learning_rate": 8.085e-05,
"loss": 0.3749,
"step": 3150
},
{
"epoch": 126.73267326732673,
"grad_norm": 2.1853744983673096,
"learning_rate": 8.001666666666667e-05,
"loss": 0.3733,
"step": 3200
},
{
"epoch": 128.7128712871287,
"grad_norm": 3.216191530227661,
"learning_rate": 7.918333333333334e-05,
"loss": 0.369,
"step": 3250
},
{
"epoch": 130.69306930693068,
"grad_norm": 1.2702809572219849,
"learning_rate": 7.835000000000001e-05,
"loss": 0.3673,
"step": 3300
},
{
"epoch": 132.67326732673268,
"grad_norm": 2.0314784049987793,
"learning_rate": 7.751666666666668e-05,
"loss": 0.3671,
"step": 3350
},
{
"epoch": 134.65346534653466,
"grad_norm": 2.0706610679626465,
"learning_rate": 7.668333333333335e-05,
"loss": 0.3625,
"step": 3400
},
{
"epoch": 136.63366336633663,
"grad_norm": 1.2799315452575684,
"learning_rate": 7.585e-05,
"loss": 0.3646,
"step": 3450
},
{
"epoch": 138.6138613861386,
"grad_norm": 1.2347270250320435,
"learning_rate": 7.501666666666667e-05,
"loss": 0.365,
"step": 3500
},
{
"epoch": 138.6138613861386,
"eval_loss": 0.39598318934440613,
"eval_runtime": 7.6328,
"eval_samples_per_second": 23.582,
"eval_steps_per_second": 3.013,
"step": 3500
},
{
"epoch": 140.59405940594058,
"grad_norm": 2.1505396366119385,
"learning_rate": 7.418333333333334e-05,
"loss": 0.367,
"step": 3550
},
{
"epoch": 142.5742574257426,
"grad_norm": 1.6036536693572998,
"learning_rate": 7.335000000000001e-05,
"loss": 0.3622,
"step": 3600
},
{
"epoch": 144.55445544554456,
"grad_norm": 1.1357529163360596,
"learning_rate": 7.251666666666666e-05,
"loss": 0.3589,
"step": 3650
},
{
"epoch": 146.53465346534654,
"grad_norm": 1.5478957891464233,
"learning_rate": 7.168333333333333e-05,
"loss": 0.3577,
"step": 3700
},
{
"epoch": 148.5148514851485,
"grad_norm": 1.0070338249206543,
"learning_rate": 7.085e-05,
"loss": 0.3582,
"step": 3750
},
{
"epoch": 150.4950495049505,
"grad_norm": 0.9300253987312317,
"learning_rate": 7.001666666666667e-05,
"loss": 0.3563,
"step": 3800
},
{
"epoch": 152.47524752475246,
"grad_norm": 0.9197555184364319,
"learning_rate": 6.918333333333334e-05,
"loss": 0.3514,
"step": 3850
},
{
"epoch": 154.45544554455446,
"grad_norm": 0.6059859991073608,
"learning_rate": 6.835000000000001e-05,
"loss": 0.3575,
"step": 3900
},
{
"epoch": 156.43564356435644,
"grad_norm": 0.7884564399719238,
"learning_rate": 6.751666666666668e-05,
"loss": 0.3613,
"step": 3950
},
{
"epoch": 158.41584158415841,
"grad_norm": 0.7471904754638672,
"learning_rate": 6.668333333333333e-05,
"loss": 0.3525,
"step": 4000
},
{
"epoch": 158.41584158415841,
"eval_loss": 0.39685142040252686,
"eval_runtime": 7.1693,
"eval_samples_per_second": 25.107,
"eval_steps_per_second": 3.208,
"step": 4000
},
{
"epoch": 160.3960396039604,
"grad_norm": 0.9373750686645508,
"learning_rate": 6.585e-05,
"loss": 0.3537,
"step": 4050
},
{
"epoch": 162.37623762376236,
"grad_norm": 1.3369851112365723,
"learning_rate": 6.501666666666667e-05,
"loss": 0.3585,
"step": 4100
},
{
"epoch": 164.35643564356437,
"grad_norm": 0.6891220211982727,
"learning_rate": 6.418333333333334e-05,
"loss": 0.3519,
"step": 4150
},
{
"epoch": 166.33663366336634,
"grad_norm": 0.8272483944892883,
"learning_rate": 6.335e-05,
"loss": 0.3542,
"step": 4200
},
{
"epoch": 168.31683168316832,
"grad_norm": 0.9853746891021729,
"learning_rate": 6.251666666666666e-05,
"loss": 0.3553,
"step": 4250
},
{
"epoch": 170.2970297029703,
"grad_norm": 1.0020989179611206,
"learning_rate": 6.168333333333333e-05,
"loss": 0.3558,
"step": 4300
},
{
"epoch": 172.27722772277227,
"grad_norm": 1.4780181646347046,
"learning_rate": 6.085000000000001e-05,
"loss": 0.3505,
"step": 4350
},
{
"epoch": 174.25742574257427,
"grad_norm": 0.9966872334480286,
"learning_rate": 6.0016666666666664e-05,
"loss": 0.3513,
"step": 4400
},
{
"epoch": 176.23762376237624,
"grad_norm": 1.2055169343948364,
"learning_rate": 5.918333333333333e-05,
"loss": 0.3509,
"step": 4450
},
{
"epoch": 178.21782178217822,
"grad_norm": 1.075426697731018,
"learning_rate": 5.835e-05,
"loss": 0.3545,
"step": 4500
},
{
"epoch": 178.21782178217822,
"eval_loss": 0.3981594443321228,
"eval_runtime": 6.8387,
"eval_samples_per_second": 26.321,
"eval_steps_per_second": 3.363,
"step": 4500
},
{
"epoch": 180.1980198019802,
"grad_norm": 1.0541815757751465,
"learning_rate": 5.751666666666667e-05,
"loss": 0.3473,
"step": 4550
},
{
"epoch": 182.17821782178217,
"grad_norm": 2.1192638874053955,
"learning_rate": 5.668333333333333e-05,
"loss": 0.348,
"step": 4600
},
{
"epoch": 184.15841584158414,
"grad_norm": 1.2069100141525269,
"learning_rate": 5.585e-05,
"loss": 0.3463,
"step": 4650
},
{
"epoch": 186.13861386138615,
"grad_norm": 0.9461864233016968,
"learning_rate": 5.501666666666667e-05,
"loss": 0.3471,
"step": 4700
},
{
"epoch": 188.11881188118812,
"grad_norm": 1.0580745935440063,
"learning_rate": 5.4183333333333334e-05,
"loss": 0.3485,
"step": 4750
},
{
"epoch": 190.0990099009901,
"grad_norm": 0.7629022002220154,
"learning_rate": 5.335e-05,
"loss": 0.346,
"step": 4800
},
{
"epoch": 192.07920792079207,
"grad_norm": 0.7628908753395081,
"learning_rate": 5.251666666666667e-05,
"loss": 0.3487,
"step": 4850
},
{
"epoch": 194.05940594059405,
"grad_norm": 1.024609088897705,
"learning_rate": 5.168333333333334e-05,
"loss": 0.3486,
"step": 4900
},
{
"epoch": 196.03960396039605,
"grad_norm": 0.8158652186393738,
"learning_rate": 5.0849999999999996e-05,
"loss": 0.3456,
"step": 4950
},
{
"epoch": 198.01980198019803,
"grad_norm": 1.0953030586242676,
"learning_rate": 5.0016666666666665e-05,
"loss": 0.3473,
"step": 5000
},
{
"epoch": 198.01980198019803,
"eval_loss": 0.40393778681755066,
"eval_runtime": 6.9458,
"eval_samples_per_second": 25.915,
"eval_steps_per_second": 3.311,
"step": 5000
},
{
"epoch": 200.0,
"grad_norm": 1.864687442779541,
"learning_rate": 4.9183333333333334e-05,
"loss": 0.3484,
"step": 5050
},
{
"epoch": 201.98019801980197,
"grad_norm": 1.406449556350708,
"learning_rate": 4.835e-05,
"loss": 0.345,
"step": 5100
},
{
"epoch": 203.96039603960395,
"grad_norm": 0.7522682547569275,
"learning_rate": 4.751666666666667e-05,
"loss": 0.3468,
"step": 5150
},
{
"epoch": 205.94059405940595,
"grad_norm": 0.5859296321868896,
"learning_rate": 4.6683333333333334e-05,
"loss": 0.3432,
"step": 5200
},
{
"epoch": 207.92079207920793,
"grad_norm": 0.6594001054763794,
"learning_rate": 4.585e-05,
"loss": 0.3417,
"step": 5250
},
{
"epoch": 209.9009900990099,
"grad_norm": 1.0125696659088135,
"learning_rate": 4.5016666666666665e-05,
"loss": 0.3428,
"step": 5300
},
{
"epoch": 211.88118811881188,
"grad_norm": 0.8519133925437927,
"learning_rate": 4.4183333333333334e-05,
"loss": 0.3424,
"step": 5350
},
{
"epoch": 213.86138613861385,
"grad_norm": 0.8138070106506348,
"learning_rate": 4.335e-05,
"loss": 0.3411,
"step": 5400
},
{
"epoch": 215.84158415841586,
"grad_norm": 1.7046844959259033,
"learning_rate": 4.251666666666667e-05,
"loss": 0.3418,
"step": 5450
},
{
"epoch": 217.82178217821783,
"grad_norm": 0.8346728682518005,
"learning_rate": 4.1683333333333335e-05,
"loss": 0.3439,
"step": 5500
},
{
"epoch": 217.82178217821783,
"eval_loss": 0.40201354026794434,
"eval_runtime": 7.7869,
"eval_samples_per_second": 23.116,
"eval_steps_per_second": 2.954,
"step": 5500
}
],
"logging_steps": 50,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 320,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.943523141225296e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}