{
"best_metric": 0.3662460446357727,
"best_model_checkpoint": "mikhail-panzo/zlm_b128_le5_s8000/checkpoint-8000",
"epoch": 13.403141361256544,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08376963350785341,
"grad_norm": 6.784719467163086,
"learning_rate": 2.4500000000000004e-07,
"loss": 1.1115,
"step": 50
},
{
"epoch": 0.16753926701570682,
"grad_norm": 12.37061595916748,
"learning_rate": 4.95e-07,
"loss": 1.1117,
"step": 100
},
{
"epoch": 0.2513089005235602,
"grad_norm": 2.4181911945343018,
"learning_rate": 7.450000000000001e-07,
"loss": 0.9761,
"step": 150
},
{
"epoch": 0.33507853403141363,
"grad_norm": 5.356767177581787,
"learning_rate": 9.950000000000002e-07,
"loss": 0.8691,
"step": 200
},
{
"epoch": 0.418848167539267,
"grad_norm": 1.991185188293457,
"learning_rate": 1.2450000000000002e-06,
"loss": 0.816,
"step": 250
},
{
"epoch": 0.5026178010471204,
"grad_norm": 1.8864444494247437,
"learning_rate": 1.495e-06,
"loss": 0.7841,
"step": 300
},
{
"epoch": 0.5863874345549738,
"grad_norm": 2.3990137577056885,
"learning_rate": 1.745e-06,
"loss": 0.7553,
"step": 350
},
{
"epoch": 0.6701570680628273,
"grad_norm": 3.0834362506866455,
"learning_rate": 1.9950000000000004e-06,
"loss": 0.7231,
"step": 400
},
{
"epoch": 0.7539267015706806,
"grad_norm": 1.6590628623962402,
"learning_rate": 2.245e-06,
"loss": 0.6887,
"step": 450
},
{
"epoch": 0.837696335078534,
"grad_norm": 2.2680208683013916,
"learning_rate": 2.4950000000000003e-06,
"loss": 0.6645,
"step": 500
},
{
"epoch": 0.837696335078534,
"eval_loss": 0.5697648525238037,
"eval_runtime": 253.6,
"eval_samples_per_second": 33.474,
"eval_steps_per_second": 4.188,
"step": 500
},
{
"epoch": 0.9214659685863874,
"grad_norm": 1.2383062839508057,
"learning_rate": 2.7450000000000004e-06,
"loss": 0.6485,
"step": 550
},
{
"epoch": 1.0052356020942408,
"grad_norm": 1.7865532636642456,
"learning_rate": 2.995e-06,
"loss": 0.6305,
"step": 600
},
{
"epoch": 1.0890052356020943,
"grad_norm": 5.1390790939331055,
"learning_rate": 3.2450000000000003e-06,
"loss": 0.625,
"step": 650
},
{
"epoch": 1.1727748691099475,
"grad_norm": 1.56719172000885,
"learning_rate": 3.495e-06,
"loss": 0.6164,
"step": 700
},
{
"epoch": 1.256544502617801,
"grad_norm": 2.732139825820923,
"learning_rate": 3.745e-06,
"loss": 0.5949,
"step": 750
},
{
"epoch": 1.3403141361256545,
"grad_norm": 2.0998775959014893,
"learning_rate": 3.995000000000001e-06,
"loss": 0.5885,
"step": 800
},
{
"epoch": 1.4240837696335078,
"grad_norm": 9.37508487701416,
"learning_rate": 4.245e-06,
"loss": 0.5763,
"step": 850
},
{
"epoch": 1.5078534031413613,
"grad_norm": 2.467970132827759,
"learning_rate": 4.495e-06,
"loss": 0.5671,
"step": 900
},
{
"epoch": 1.5916230366492146,
"grad_norm": 1.5992038249969482,
"learning_rate": 4.745e-06,
"loss": 0.5569,
"step": 950
},
{
"epoch": 1.675392670157068,
"grad_norm": 2.854398727416992,
"learning_rate": 4.9950000000000005e-06,
"loss": 0.5581,
"step": 1000
},
{
"epoch": 1.675392670157068,
"eval_loss": 0.4793892800807953,
"eval_runtime": 254.3858,
"eval_samples_per_second": 33.371,
"eval_steps_per_second": 4.175,
"step": 1000
},
{
"epoch": 1.7591623036649215,
"grad_norm": 2.9313721656799316,
"learning_rate": 5.245e-06,
"loss": 0.5573,
"step": 1050
},
{
"epoch": 1.8429319371727748,
"grad_norm": 1.984640121459961,
"learning_rate": 5.495000000000001e-06,
"loss": 0.5455,
"step": 1100
},
{
"epoch": 1.9267015706806283,
"grad_norm": 2.0967133045196533,
"learning_rate": 5.745000000000001e-06,
"loss": 0.5301,
"step": 1150
},
{
"epoch": 2.0104712041884816,
"grad_norm": 2.864819049835205,
"learning_rate": 5.995000000000001e-06,
"loss": 0.5249,
"step": 1200
},
{
"epoch": 2.094240837696335,
"grad_norm": 1.5830483436584473,
"learning_rate": 6.245000000000001e-06,
"loss": 0.5265,
"step": 1250
},
{
"epoch": 2.1780104712041886,
"grad_norm": 1.706629753112793,
"learning_rate": 6.4950000000000005e-06,
"loss": 0.5226,
"step": 1300
},
{
"epoch": 2.261780104712042,
"grad_norm": 2.3449299335479736,
"learning_rate": 6.745000000000001e-06,
"loss": 0.5159,
"step": 1350
},
{
"epoch": 2.345549738219895,
"grad_norm": 2.4408295154571533,
"learning_rate": 6.995000000000001e-06,
"loss": 0.517,
"step": 1400
},
{
"epoch": 2.4293193717277486,
"grad_norm": 1.8883627653121948,
"learning_rate": 7.245000000000001e-06,
"loss": 0.5139,
"step": 1450
},
{
"epoch": 2.513089005235602,
"grad_norm": 1.5606210231781006,
"learning_rate": 7.495000000000001e-06,
"loss": 0.5045,
"step": 1500
},
{
"epoch": 2.513089005235602,
"eval_loss": 0.44668668508529663,
"eval_runtime": 252.363,
"eval_samples_per_second": 33.638,
"eval_steps_per_second": 4.208,
"step": 1500
},
{
"epoch": 2.5968586387434556,
"grad_norm": 1.6376454830169678,
"learning_rate": 7.745e-06,
"loss": 0.503,
"step": 1550
},
{
"epoch": 2.680628272251309,
"grad_norm": 1.7104960680007935,
"learning_rate": 7.995e-06,
"loss": 0.4988,
"step": 1600
},
{
"epoch": 2.7643979057591626,
"grad_norm": 2.3372185230255127,
"learning_rate": 8.245000000000002e-06,
"loss": 0.4954,
"step": 1650
},
{
"epoch": 2.8481675392670156,
"grad_norm": 2.170474052429199,
"learning_rate": 8.495e-06,
"loss": 0.4971,
"step": 1700
},
{
"epoch": 2.931937172774869,
"grad_norm": 2.3041465282440186,
"learning_rate": 8.745000000000002e-06,
"loss": 0.4948,
"step": 1750
},
{
"epoch": 3.0157068062827226,
"grad_norm": 2.1798529624938965,
"learning_rate": 8.995000000000001e-06,
"loss": 0.4849,
"step": 1800
},
{
"epoch": 3.099476439790576,
"grad_norm": 1.8813133239746094,
"learning_rate": 9.245e-06,
"loss": 0.4839,
"step": 1850
},
{
"epoch": 3.183246073298429,
"grad_norm": 1.8749632835388184,
"learning_rate": 9.49e-06,
"loss": 0.4871,
"step": 1900
},
{
"epoch": 3.2670157068062826,
"grad_norm": 1.917149305343628,
"learning_rate": 9.74e-06,
"loss": 0.4724,
"step": 1950
},
{
"epoch": 3.350785340314136,
"grad_norm": 1.9401865005493164,
"learning_rate": 9.990000000000001e-06,
"loss": 0.4776,
"step": 2000
},
{
"epoch": 3.350785340314136,
"eval_loss": 0.4236186444759369,
"eval_runtime": 261.4044,
"eval_samples_per_second": 32.475,
"eval_steps_per_second": 4.063,
"step": 2000
},
{
"epoch": 3.4345549738219896,
"grad_norm": 1.67532479763031,
"learning_rate": 9.921666666666667e-06,
"loss": 0.4797,
"step": 2050
},
{
"epoch": 3.518324607329843,
"grad_norm": 2.044435977935791,
"learning_rate": 9.838333333333334e-06,
"loss": 0.475,
"step": 2100
},
{
"epoch": 3.6020942408376966,
"grad_norm": 1.7103309631347656,
"learning_rate": 9.755e-06,
"loss": 0.4729,
"step": 2150
},
{
"epoch": 3.6858638743455496,
"grad_norm": 2.557506799697876,
"learning_rate": 9.671666666666668e-06,
"loss": 0.4671,
"step": 2200
},
{
"epoch": 3.769633507853403,
"grad_norm": 2.5080385208129883,
"learning_rate": 9.588333333333334e-06,
"loss": 0.4613,
"step": 2250
},
{
"epoch": 3.8534031413612566,
"grad_norm": 1.6658800840377808,
"learning_rate": 9.505000000000001e-06,
"loss": 0.4614,
"step": 2300
},
{
"epoch": 3.93717277486911,
"grad_norm": 2.167924404144287,
"learning_rate": 9.421666666666668e-06,
"loss": 0.4685,
"step": 2350
},
{
"epoch": 4.020942408376963,
"grad_norm": 4.781851768493652,
"learning_rate": 9.338333333333333e-06,
"loss": 0.4578,
"step": 2400
},
{
"epoch": 4.104712041884817,
"grad_norm": 2.1949734687805176,
"learning_rate": 9.255e-06,
"loss": 0.4613,
"step": 2450
},
{
"epoch": 4.18848167539267,
"grad_norm": 1.9870787858963013,
"learning_rate": 9.171666666666667e-06,
"loss": 0.4553,
"step": 2500
},
{
"epoch": 4.18848167539267,
"eval_loss": 0.40933898091316223,
"eval_runtime": 252.812,
"eval_samples_per_second": 33.578,
"eval_steps_per_second": 4.201,
"step": 2500
},
{
"epoch": 4.272251308900524,
"grad_norm": 5.594339370727539,
"learning_rate": 9.088333333333334e-06,
"loss": 0.4592,
"step": 2550
},
{
"epoch": 4.356020942408377,
"grad_norm": 2.930290699005127,
"learning_rate": 9.005000000000001e-06,
"loss": 0.4549,
"step": 2600
},
{
"epoch": 4.439790575916231,
"grad_norm": 2.199352741241455,
"learning_rate": 8.921666666666668e-06,
"loss": 0.4512,
"step": 2650
},
{
"epoch": 4.523560209424084,
"grad_norm": 2.435898780822754,
"learning_rate": 8.838333333333335e-06,
"loss": 0.4557,
"step": 2700
},
{
"epoch": 4.607329842931938,
"grad_norm": 3.2397773265838623,
"learning_rate": 8.755e-06,
"loss": 0.449,
"step": 2750
},
{
"epoch": 4.69109947643979,
"grad_norm": 1.9361001253128052,
"learning_rate": 8.671666666666667e-06,
"loss": 0.4525,
"step": 2800
},
{
"epoch": 4.774869109947644,
"grad_norm": 2.1852686405181885,
"learning_rate": 8.588333333333334e-06,
"loss": 0.4486,
"step": 2850
},
{
"epoch": 4.858638743455497,
"grad_norm": 1.8136191368103027,
"learning_rate": 8.505e-06,
"loss": 0.4517,
"step": 2900
},
{
"epoch": 4.942408376963351,
"grad_norm": 2.694406032562256,
"learning_rate": 8.421666666666668e-06,
"loss": 0.4505,
"step": 2950
},
{
"epoch": 5.026178010471204,
"grad_norm": 1.251187801361084,
"learning_rate": 8.338333333333335e-06,
"loss": 0.4489,
"step": 3000
},
{
"epoch": 5.026178010471204,
"eval_loss": 0.39678552746772766,
"eval_runtime": 256.1618,
"eval_samples_per_second": 33.139,
"eval_steps_per_second": 4.146,
"step": 3000
},
{
"epoch": 5.109947643979058,
"grad_norm": 1.1454211473464966,
"learning_rate": 8.255000000000001e-06,
"loss": 0.4422,
"step": 3050
},
{
"epoch": 5.193717277486911,
"grad_norm": 1.8685294389724731,
"learning_rate": 8.171666666666668e-06,
"loss": 0.4426,
"step": 3100
},
{
"epoch": 5.277486910994765,
"grad_norm": 1.6863799095153809,
"learning_rate": 8.088333333333334e-06,
"loss": 0.4398,
"step": 3150
},
{
"epoch": 5.361256544502618,
"grad_norm": 2.249805212020874,
"learning_rate": 8.005e-06,
"loss": 0.4384,
"step": 3200
},
{
"epoch": 5.445026178010472,
"grad_norm": 2.1187326908111572,
"learning_rate": 7.921666666666667e-06,
"loss": 0.4431,
"step": 3250
},
{
"epoch": 5.528795811518324,
"grad_norm": 1.8476357460021973,
"learning_rate": 7.838333333333334e-06,
"loss": 0.4434,
"step": 3300
},
{
"epoch": 5.612565445026178,
"grad_norm": 1.6522760391235352,
"learning_rate": 7.755000000000001e-06,
"loss": 0.4343,
"step": 3350
},
{
"epoch": 5.696335078534031,
"grad_norm": 1.3926664590835571,
"learning_rate": 7.671666666666668e-06,
"loss": 0.4365,
"step": 3400
},
{
"epoch": 5.780104712041885,
"grad_norm": 2.1967947483062744,
"learning_rate": 7.588333333333334e-06,
"loss": 0.4362,
"step": 3450
},
{
"epoch": 5.863874345549738,
"grad_norm": 1.5428054332733154,
"learning_rate": 7.505e-06,
"loss": 0.4337,
"step": 3500
},
{
"epoch": 5.863874345549738,
"eval_loss": 0.39255669713020325,
"eval_runtime": 251.4241,
"eval_samples_per_second": 33.764,
"eval_steps_per_second": 4.224,
"step": 3500
},
{
"epoch": 5.947643979057592,
"grad_norm": 1.7545124292373657,
"learning_rate": 7.421666666666667e-06,
"loss": 0.4292,
"step": 3550
},
{
"epoch": 6.031413612565445,
"grad_norm": 1.1912785768508911,
"learning_rate": 7.338333333333334e-06,
"loss": 0.4293,
"step": 3600
},
{
"epoch": 6.115183246073299,
"grad_norm": 1.549297571182251,
"learning_rate": 7.255000000000001e-06,
"loss": 0.4333,
"step": 3650
},
{
"epoch": 6.198952879581152,
"grad_norm": 1.8822449445724487,
"learning_rate": 7.171666666666667e-06,
"loss": 0.4308,
"step": 3700
},
{
"epoch": 6.282722513089006,
"grad_norm": 1.6894242763519287,
"learning_rate": 7.088333333333334e-06,
"loss": 0.4336,
"step": 3750
},
{
"epoch": 6.366492146596858,
"grad_norm": 1.63300621509552,
"learning_rate": 7.005000000000001e-06,
"loss": 0.432,
"step": 3800
},
{
"epoch": 6.450261780104712,
"grad_norm": 1.4569647312164307,
"learning_rate": 6.921666666666668e-06,
"loss": 0.4302,
"step": 3850
},
{
"epoch": 6.534031413612565,
"grad_norm": 1.3649786710739136,
"learning_rate": 6.838333333333334e-06,
"loss": 0.4277,
"step": 3900
},
{
"epoch": 6.617801047120419,
"grad_norm": 1.6974161863327026,
"learning_rate": 6.7550000000000005e-06,
"loss": 0.4329,
"step": 3950
},
{
"epoch": 6.701570680628272,
"grad_norm": 8.575983047485352,
"learning_rate": 6.6716666666666674e-06,
"loss": 0.4282,
"step": 4000
},
{
"epoch": 6.701570680628272,
"eval_loss": 0.3837336301803589,
"eval_runtime": 252.6194,
"eval_samples_per_second": 33.604,
"eval_steps_per_second": 4.204,
"step": 4000
},
{
"epoch": 6.785340314136126,
"grad_norm": 1.308820366859436,
"learning_rate": 6.588333333333334e-06,
"loss": 0.4233,
"step": 4050
},
{
"epoch": 6.869109947643979,
"grad_norm": 1.6496186256408691,
"learning_rate": 6.505e-06,
"loss": 0.4235,
"step": 4100
},
{
"epoch": 6.952879581151833,
"grad_norm": 2.9797613620758057,
"learning_rate": 6.421666666666667e-06,
"loss": 0.424,
"step": 4150
},
{
"epoch": 7.036649214659686,
"grad_norm": 1.366092324256897,
"learning_rate": 6.338333333333334e-06,
"loss": 0.4241,
"step": 4200
},
{
"epoch": 7.12041884816754,
"grad_norm": 1.2892742156982422,
"learning_rate": 6.255e-06,
"loss": 0.4306,
"step": 4250
},
{
"epoch": 7.204188481675392,
"grad_norm": 1.769739031791687,
"learning_rate": 6.171666666666667e-06,
"loss": 0.4206,
"step": 4300
},
{
"epoch": 7.287958115183246,
"grad_norm": 1.2120298147201538,
"learning_rate": 6.088333333333334e-06,
"loss": 0.4245,
"step": 4350
},
{
"epoch": 7.371727748691099,
"grad_norm": 2.545499086380005,
"learning_rate": 6.005000000000001e-06,
"loss": 0.4234,
"step": 4400
},
{
"epoch": 7.455497382198953,
"grad_norm": 1.57191801071167,
"learning_rate": 5.921666666666667e-06,
"loss": 0.426,
"step": 4450
},
{
"epoch": 7.539267015706806,
"grad_norm": 1.3846454620361328,
"learning_rate": 5.838333333333334e-06,
"loss": 0.4188,
"step": 4500
},
{
"epoch": 7.539267015706806,
"eval_loss": 0.37984684109687805,
"eval_runtime": 250.9224,
"eval_samples_per_second": 33.831,
"eval_steps_per_second": 4.232,
"step": 4500
},
{
"epoch": 7.62303664921466,
"grad_norm": 1.3911954164505005,
"learning_rate": 5.755000000000001e-06,
"loss": 0.42,
"step": 4550
},
{
"epoch": 7.706806282722513,
"grad_norm": 1.3781930208206177,
"learning_rate": 5.671666666666668e-06,
"loss": 0.4234,
"step": 4600
},
{
"epoch": 7.790575916230367,
"grad_norm": 1.5225090980529785,
"learning_rate": 5.588333333333334e-06,
"loss": 0.4242,
"step": 4650
},
{
"epoch": 7.87434554973822,
"grad_norm": 1.4195870161056519,
"learning_rate": 5.505000000000001e-06,
"loss": 0.4243,
"step": 4700
},
{
"epoch": 7.958115183246074,
"grad_norm": 1.9713116884231567,
"learning_rate": 5.4216666666666676e-06,
"loss": 0.4158,
"step": 4750
},
{
"epoch": 8.041884816753926,
"grad_norm": 1.3215075731277466,
"learning_rate": 5.3383333333333345e-06,
"loss": 0.4141,
"step": 4800
},
{
"epoch": 8.12565445026178,
"grad_norm": 4.483395576477051,
"learning_rate": 5.2550000000000005e-06,
"loss": 0.4165,
"step": 4850
},
{
"epoch": 8.209424083769633,
"grad_norm": 1.4759974479675293,
"learning_rate": 5.171666666666667e-06,
"loss": 0.4291,
"step": 4900
},
{
"epoch": 8.293193717277488,
"grad_norm": 1.3336604833602905,
"learning_rate": 5.088333333333334e-06,
"loss": 0.4134,
"step": 4950
},
{
"epoch": 8.37696335078534,
"grad_norm": 1.5618665218353271,
"learning_rate": 5.0049999999999995e-06,
"loss": 0.4222,
"step": 5000
},
{
"epoch": 8.37696335078534,
"eval_loss": 0.37838345766067505,
"eval_runtime": 256.1892,
"eval_samples_per_second": 33.136,
"eval_steps_per_second": 4.145,
"step": 5000
},
{
"epoch": 8.460732984293193,
"grad_norm": 1.079275369644165,
"learning_rate": 4.921666666666666e-06,
"loss": 0.4143,
"step": 5050
},
{
"epoch": 8.544502617801047,
"grad_norm": 1.0949628353118896,
"learning_rate": 4.838333333333334e-06,
"loss": 0.4123,
"step": 5100
},
{
"epoch": 8.6282722513089,
"grad_norm": 1.5005624294281006,
"learning_rate": 4.755e-06,
"loss": 0.4218,
"step": 5150
},
{
"epoch": 8.712041884816754,
"grad_norm": 5.408727169036865,
"learning_rate": 4.671666666666667e-06,
"loss": 0.4123,
"step": 5200
},
{
"epoch": 8.795811518324607,
"grad_norm": 1.0426135063171387,
"learning_rate": 4.588333333333333e-06,
"loss": 0.4159,
"step": 5250
},
{
"epoch": 8.879581151832461,
"grad_norm": 2.312485456466675,
"learning_rate": 4.505e-06,
"loss": 0.4149,
"step": 5300
},
{
"epoch": 8.963350785340314,
"grad_norm": 1.6531749963760376,
"learning_rate": 4.421666666666667e-06,
"loss": 0.4147,
"step": 5350
},
{
"epoch": 9.047120418848168,
"grad_norm": 1.2448443174362183,
"learning_rate": 4.338333333333334e-06,
"loss": 0.4191,
"step": 5400
},
{
"epoch": 9.13089005235602,
"grad_norm": 1.1506507396697998,
"learning_rate": 4.255e-06,
"loss": 0.4094,
"step": 5450
},
{
"epoch": 9.214659685863875,
"grad_norm": 1.2973967790603638,
"learning_rate": 4.171666666666667e-06,
"loss": 0.412,
"step": 5500
},
{
"epoch": 9.214659685863875,
"eval_loss": 0.3729405403137207,
"eval_runtime": 258.7159,
"eval_samples_per_second": 32.812,
"eval_steps_per_second": 4.105,
"step": 5500
},
{
"epoch": 9.298429319371728,
"grad_norm": 1.0570714473724365,
"learning_rate": 4.088333333333334e-06,
"loss": 0.4144,
"step": 5550
},
{
"epoch": 9.38219895287958,
"grad_norm": 1.1278653144836426,
"learning_rate": 4.005000000000001e-06,
"loss": 0.4154,
"step": 5600
},
{
"epoch": 9.465968586387435,
"grad_norm": 5.811945915222168,
"learning_rate": 3.921666666666667e-06,
"loss": 0.4085,
"step": 5650
},
{
"epoch": 9.549738219895287,
"grad_norm": 1.6973522901535034,
"learning_rate": 3.8383333333333336e-06,
"loss": 0.4133,
"step": 5700
},
{
"epoch": 9.633507853403142,
"grad_norm": 1.209333062171936,
"learning_rate": 3.7550000000000005e-06,
"loss": 0.4123,
"step": 5750
},
{
"epoch": 9.717277486910994,
"grad_norm": 3.592991590499878,
"learning_rate": 3.6716666666666665e-06,
"loss": 0.4126,
"step": 5800
},
{
"epoch": 9.801047120418849,
"grad_norm": 1.152239203453064,
"learning_rate": 3.588333333333334e-06,
"loss": 0.4115,
"step": 5850
},
{
"epoch": 9.884816753926701,
"grad_norm": 1.6118751764297485,
"learning_rate": 3.505e-06,
"loss": 0.4036,
"step": 5900
},
{
"epoch": 9.968586387434556,
"grad_norm": 1.4384329319000244,
"learning_rate": 3.4216666666666672e-06,
"loss": 0.4103,
"step": 5950
},
{
"epoch": 10.052356020942408,
"grad_norm": 1.447549819946289,
"learning_rate": 3.3383333333333333e-06,
"loss": 0.4056,
"step": 6000
},
{
"epoch": 10.052356020942408,
"eval_loss": 0.3696598410606384,
"eval_runtime": 256.5857,
"eval_samples_per_second": 33.084,
"eval_steps_per_second": 4.139,
"step": 6000
},
{
"epoch": 10.136125654450261,
"grad_norm": 1.2832478284835815,
"learning_rate": 3.255e-06,
"loss": 0.4086,
"step": 6050
},
{
"epoch": 10.219895287958115,
"grad_norm": 1.0580389499664307,
"learning_rate": 3.1716666666666667e-06,
"loss": 0.4128,
"step": 6100
},
{
"epoch": 10.303664921465968,
"grad_norm": 1.5535204410552979,
"learning_rate": 3.0883333333333336e-06,
"loss": 0.4088,
"step": 6150
},
{
"epoch": 10.387434554973822,
"grad_norm": 0.8630754947662354,
"learning_rate": 3.005e-06,
"loss": 0.4073,
"step": 6200
},
{
"epoch": 10.471204188481675,
"grad_norm": 1.248646855354309,
"learning_rate": 2.921666666666667e-06,
"loss": 0.4074,
"step": 6250
},
{
"epoch": 10.55497382198953,
"grad_norm": 0.9990864396095276,
"learning_rate": 2.8383333333333334e-06,
"loss": 0.4076,
"step": 6300
},
{
"epoch": 10.638743455497382,
"grad_norm": 1.100101351737976,
"learning_rate": 2.7550000000000003e-06,
"loss": 0.4081,
"step": 6350
},
{
"epoch": 10.722513089005236,
"grad_norm": 1.1483354568481445,
"learning_rate": 2.6716666666666668e-06,
"loss": 0.4088,
"step": 6400
},
{
"epoch": 10.806282722513089,
"grad_norm": 0.8169103860855103,
"learning_rate": 2.5883333333333337e-06,
"loss": 0.406,
"step": 6450
},
{
"epoch": 10.890052356020943,
"grad_norm": 1.0567216873168945,
"learning_rate": 2.505e-06,
"loss": 0.4065,
"step": 6500
},
{
"epoch": 10.890052356020943,
"eval_loss": 0.36852771043777466,
"eval_runtime": 255.1911,
"eval_samples_per_second": 33.265,
"eval_steps_per_second": 4.162,
"step": 6500
},
{
"epoch": 10.973821989528796,
"grad_norm": 1.0459740161895752,
"learning_rate": 2.421666666666667e-06,
"loss": 0.4074,
"step": 6550
},
{
"epoch": 11.057591623036648,
"grad_norm": 0.9029247760772705,
"learning_rate": 2.3383333333333335e-06,
"loss": 0.4075,
"step": 6600
},
{
"epoch": 11.141361256544503,
"grad_norm": 1.5372889041900635,
"learning_rate": 2.2550000000000004e-06,
"loss": 0.4088,
"step": 6650
},
{
"epoch": 11.225130890052355,
"grad_norm": 0.9959379434585571,
"learning_rate": 2.171666666666667e-06,
"loss": 0.4044,
"step": 6700
},
{
"epoch": 11.30890052356021,
"grad_norm": 1.3793728351593018,
"learning_rate": 2.088333333333334e-06,
"loss": 0.4034,
"step": 6750
},
{
"epoch": 11.392670157068062,
"grad_norm": 1.2086491584777832,
"learning_rate": 2.0050000000000003e-06,
"loss": 0.4073,
"step": 6800
},
{
"epoch": 11.476439790575917,
"grad_norm": 1.07647705078125,
"learning_rate": 1.9216666666666668e-06,
"loss": 0.405,
"step": 6850
},
{
"epoch": 11.56020942408377,
"grad_norm": 0.9849846363067627,
"learning_rate": 1.8383333333333334e-06,
"loss": 0.4037,
"step": 6900
},
{
"epoch": 11.643979057591624,
"grad_norm": 1.2623456716537476,
"learning_rate": 1.7550000000000001e-06,
"loss": 0.4042,
"step": 6950
},
{
"epoch": 11.727748691099476,
"grad_norm": 0.9488279819488525,
"learning_rate": 1.6716666666666666e-06,
"loss": 0.4069,
"step": 7000
},
{
"epoch": 11.727748691099476,
"eval_loss": 0.3675082325935364,
"eval_runtime": 257.7947,
"eval_samples_per_second": 32.929,
"eval_steps_per_second": 4.12,
"step": 7000
},
{
"epoch": 11.81151832460733,
"grad_norm": 1.7772880792617798,
"learning_rate": 1.5883333333333333e-06,
"loss": 0.4062,
"step": 7050
},
{
"epoch": 11.895287958115183,
"grad_norm": 0.9282962083816528,
"learning_rate": 1.505e-06,
"loss": 0.4097,
"step": 7100
},
{
"epoch": 11.979057591623036,
"grad_norm": 0.9623945355415344,
"learning_rate": 1.4216666666666667e-06,
"loss": 0.4022,
"step": 7150
},
{
"epoch": 12.06282722513089,
"grad_norm": 0.8984498381614685,
"learning_rate": 1.34e-06,
"loss": 0.4136,
"step": 7200
},
{
"epoch": 12.146596858638743,
"grad_norm": 1.1274847984313965,
"learning_rate": 1.2566666666666668e-06,
"loss": 0.4074,
"step": 7250
},
{
"epoch": 12.230366492146597,
"grad_norm": 0.9233041405677795,
"learning_rate": 1.1733333333333335e-06,
"loss": 0.4051,
"step": 7300
},
{
"epoch": 12.31413612565445,
"grad_norm": 1.0804561376571655,
"learning_rate": 1.0900000000000002e-06,
"loss": 0.4045,
"step": 7350
},
{
"epoch": 12.397905759162304,
"grad_norm": 0.9518505334854126,
"learning_rate": 1.0066666666666668e-06,
"loss": 0.4097,
"step": 7400
},
{
"epoch": 12.481675392670157,
"grad_norm": 1.3323400020599365,
"learning_rate": 9.233333333333334e-07,
"loss": 0.4077,
"step": 7450
},
{
"epoch": 12.565445026178011,
"grad_norm": 0.8265945911407471,
"learning_rate": 8.400000000000001e-07,
"loss": 0.4049,
"step": 7500
},
{
"epoch": 12.565445026178011,
"eval_loss": 0.3666408658027649,
"eval_runtime": 263.2918,
"eval_samples_per_second": 32.242,
"eval_steps_per_second": 4.034,
"step": 7500
},
{
"epoch": 12.649214659685864,
"grad_norm": 0.8195668458938599,
"learning_rate": 7.566666666666667e-07,
"loss": 0.4032,
"step": 7550
},
{
"epoch": 12.732984293193716,
"grad_norm": 0.9879769086837769,
"learning_rate": 6.733333333333334e-07,
"loss": 0.4078,
"step": 7600
},
{
"epoch": 12.81675392670157,
"grad_norm": 2.3267557621002197,
"learning_rate": 5.900000000000001e-07,
"loss": 0.4058,
"step": 7650
},
{
"epoch": 12.900523560209423,
"grad_norm": 0.9204362034797668,
"learning_rate": 5.066666666666667e-07,
"loss": 0.4052,
"step": 7700
},
{
"epoch": 12.984293193717278,
"grad_norm": 1.0587321519851685,
"learning_rate": 4.233333333333334e-07,
"loss": 0.4111,
"step": 7750
},
{
"epoch": 13.06806282722513,
"grad_norm": 0.8546725511550903,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.4021,
"step": 7800
},
{
"epoch": 13.151832460732985,
"grad_norm": 0.7696201801300049,
"learning_rate": 2.566666666666667e-07,
"loss": 0.4035,
"step": 7850
},
{
"epoch": 13.235602094240837,
"grad_norm": 0.8487740755081177,
"learning_rate": 1.7333333333333335e-07,
"loss": 0.4077,
"step": 7900
},
{
"epoch": 13.319371727748692,
"grad_norm": 0.9611801505088806,
"learning_rate": 9e-08,
"loss": 0.4058,
"step": 7950
},
{
"epoch": 13.403141361256544,
"grad_norm": 0.7191503047943115,
"learning_rate": 6.666666666666667e-09,
"loss": 0.4044,
"step": 8000
},
{
"epoch": 13.403141361256544,
"eval_loss": 0.3662460446357727,
"eval_runtime": 261.9325,
"eval_samples_per_second": 32.409,
"eval_steps_per_second": 4.054,
"step": 8000
}
],
"logging_steps": 50,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 14,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4332007404692806e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}