{ "best_metric": 0.3662460446357727, "best_model_checkpoint": "mikhail-panzo/zlm_b128_le5_s8000/checkpoint-8000", "epoch": 13.403141361256544, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08376963350785341, "grad_norm": 6.784719467163086, "learning_rate": 2.4500000000000004e-07, "loss": 1.1115, "step": 50 }, { "epoch": 0.16753926701570682, "grad_norm": 12.37061595916748, "learning_rate": 4.95e-07, "loss": 1.1117, "step": 100 }, { "epoch": 0.2513089005235602, "grad_norm": 2.4181911945343018, "learning_rate": 7.450000000000001e-07, "loss": 0.9761, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 5.356767177581787, "learning_rate": 9.950000000000002e-07, "loss": 0.8691, "step": 200 }, { "epoch": 0.418848167539267, "grad_norm": 1.991185188293457, "learning_rate": 1.2450000000000002e-06, "loss": 0.816, "step": 250 }, { "epoch": 0.5026178010471204, "grad_norm": 1.8864444494247437, "learning_rate": 1.495e-06, "loss": 0.7841, "step": 300 }, { "epoch": 0.5863874345549738, "grad_norm": 2.3990137577056885, "learning_rate": 1.745e-06, "loss": 0.7553, "step": 350 }, { "epoch": 0.6701570680628273, "grad_norm": 3.0834362506866455, "learning_rate": 1.9950000000000004e-06, "loss": 0.7231, "step": 400 }, { "epoch": 0.7539267015706806, "grad_norm": 1.6590628623962402, "learning_rate": 2.245e-06, "loss": 0.6887, "step": 450 }, { "epoch": 0.837696335078534, "grad_norm": 2.2680208683013916, "learning_rate": 2.4950000000000003e-06, "loss": 0.6645, "step": 500 }, { "epoch": 0.837696335078534, "eval_loss": 0.5697648525238037, "eval_runtime": 253.6, "eval_samples_per_second": 33.474, "eval_steps_per_second": 4.188, "step": 500 }, { "epoch": 0.9214659685863874, "grad_norm": 1.2383062839508057, "learning_rate": 2.7450000000000004e-06, "loss": 0.6485, "step": 550 }, { "epoch": 1.0052356020942408, "grad_norm": 1.7865532636642456, "learning_rate": 2.995e-06, "loss": 0.6305, "step": 600 }, { "epoch": 1.0890052356020943, "grad_norm": 5.1390790939331055, "learning_rate": 3.2450000000000003e-06, "loss": 0.625, "step": 650 }, { "epoch": 1.1727748691099475, "grad_norm": 1.56719172000885, "learning_rate": 3.495e-06, "loss": 0.6164, "step": 700 }, { "epoch": 1.256544502617801, "grad_norm": 2.732139825820923, "learning_rate": 3.745e-06, "loss": 0.5949, "step": 750 }, { "epoch": 1.3403141361256545, "grad_norm": 2.0998775959014893, "learning_rate": 3.995000000000001e-06, "loss": 0.5885, "step": 800 }, { "epoch": 1.4240837696335078, "grad_norm": 9.37508487701416, "learning_rate": 4.245e-06, "loss": 0.5763, "step": 850 }, { "epoch": 1.5078534031413613, "grad_norm": 2.467970132827759, "learning_rate": 4.495e-06, "loss": 0.5671, "step": 900 }, { "epoch": 1.5916230366492146, "grad_norm": 1.5992038249969482, "learning_rate": 4.745e-06, "loss": 0.5569, "step": 950 }, { "epoch": 1.675392670157068, "grad_norm": 2.854398727416992, "learning_rate": 4.9950000000000005e-06, "loss": 0.5581, "step": 1000 }, { "epoch": 1.675392670157068, "eval_loss": 0.4793892800807953, "eval_runtime": 254.3858, "eval_samples_per_second": 33.371, "eval_steps_per_second": 4.175, "step": 1000 }, { "epoch": 1.7591623036649215, "grad_norm": 2.9313721656799316, "learning_rate": 5.245e-06, "loss": 0.5573, "step": 1050 }, { "epoch": 1.8429319371727748, "grad_norm": 1.984640121459961, "learning_rate": 5.495000000000001e-06, "loss": 0.5455, "step": 1100 }, { "epoch": 1.9267015706806283, "grad_norm": 2.0967133045196533, "learning_rate": 5.745000000000001e-06, "loss": 0.5301, "step": 1150 }, { "epoch": 2.0104712041884816, "grad_norm": 2.864819049835205, "learning_rate": 5.995000000000001e-06, "loss": 0.5249, "step": 1200 }, { "epoch": 2.094240837696335, "grad_norm": 1.5830483436584473, "learning_rate": 6.245000000000001e-06, "loss": 0.5265, "step": 1250 }, { "epoch": 2.1780104712041886, "grad_norm": 1.706629753112793, "learning_rate": 6.4950000000000005e-06, "loss": 0.5226, "step": 1300 }, { "epoch": 2.261780104712042, "grad_norm": 2.3449299335479736, "learning_rate": 6.745000000000001e-06, "loss": 0.5159, "step": 1350 }, { "epoch": 2.345549738219895, "grad_norm": 2.4408295154571533, "learning_rate": 6.995000000000001e-06, "loss": 0.517, "step": 1400 }, { "epoch": 2.4293193717277486, "grad_norm": 1.8883627653121948, "learning_rate": 7.245000000000001e-06, "loss": 0.5139, "step": 1450 }, { "epoch": 2.513089005235602, "grad_norm": 1.5606210231781006, "learning_rate": 7.495000000000001e-06, "loss": 0.5045, "step": 1500 }, { "epoch": 2.513089005235602, "eval_loss": 0.44668668508529663, "eval_runtime": 252.363, "eval_samples_per_second": 33.638, "eval_steps_per_second": 4.208, "step": 1500 }, { "epoch": 2.5968586387434556, "grad_norm": 1.6376454830169678, "learning_rate": 7.745e-06, "loss": 0.503, "step": 1550 }, { "epoch": 2.680628272251309, "grad_norm": 1.7104960680007935, "learning_rate": 7.995e-06, "loss": 0.4988, "step": 1600 }, { "epoch": 2.7643979057591626, "grad_norm": 2.3372185230255127, "learning_rate": 8.245000000000002e-06, "loss": 0.4954, "step": 1650 }, { "epoch": 2.8481675392670156, "grad_norm": 2.170474052429199, "learning_rate": 8.495e-06, "loss": 0.4971, "step": 1700 }, { "epoch": 2.931937172774869, "grad_norm": 2.3041465282440186, "learning_rate": 8.745000000000002e-06, "loss": 0.4948, "step": 1750 }, { "epoch": 3.0157068062827226, "grad_norm": 2.1798529624938965, "learning_rate": 8.995000000000001e-06, "loss": 0.4849, "step": 1800 }, { "epoch": 3.099476439790576, "grad_norm": 1.8813133239746094, "learning_rate": 9.245e-06, "loss": 0.4839, "step": 1850 }, { "epoch": 3.183246073298429, "grad_norm": 1.8749632835388184, "learning_rate": 9.49e-06, "loss": 0.4871, "step": 1900 }, { "epoch": 3.2670157068062826, "grad_norm": 1.917149305343628, "learning_rate": 9.74e-06, "loss": 0.4724, "step": 1950 }, { "epoch": 3.350785340314136, "grad_norm": 1.9401865005493164, "learning_rate": 9.990000000000001e-06, "loss": 0.4776, "step": 2000 }, { "epoch": 3.350785340314136, "eval_loss": 0.4236186444759369, "eval_runtime": 261.4044, "eval_samples_per_second": 32.475, "eval_steps_per_second": 4.063, "step": 2000 }, { "epoch": 3.4345549738219896, "grad_norm": 1.67532479763031, "learning_rate": 9.921666666666667e-06, "loss": 0.4797, "step": 2050 }, { "epoch": 3.518324607329843, "grad_norm": 2.044435977935791, "learning_rate": 9.838333333333334e-06, "loss": 0.475, "step": 2100 }, { "epoch": 3.6020942408376966, "grad_norm": 1.7103309631347656, "learning_rate": 9.755e-06, "loss": 0.4729, "step": 2150 }, { "epoch": 3.6858638743455496, "grad_norm": 2.557506799697876, "learning_rate": 9.671666666666668e-06, "loss": 0.4671, "step": 2200 }, { "epoch": 3.769633507853403, "grad_norm": 2.5080385208129883, "learning_rate": 9.588333333333334e-06, "loss": 0.4613, "step": 2250 }, { "epoch": 3.8534031413612566, "grad_norm": 1.6658800840377808, "learning_rate": 9.505000000000001e-06, "loss": 0.4614, "step": 2300 }, { "epoch": 3.93717277486911, "grad_norm": 2.167924404144287, "learning_rate": 9.421666666666668e-06, "loss": 0.4685, "step": 2350 }, { "epoch": 4.020942408376963, "grad_norm": 4.781851768493652, "learning_rate": 9.338333333333333e-06, "loss": 0.4578, "step": 2400 }, { "epoch": 4.104712041884817, "grad_norm": 2.1949734687805176, "learning_rate": 9.255e-06, "loss": 0.4613, "step": 2450 }, { "epoch": 4.18848167539267, "grad_norm": 1.9870787858963013, "learning_rate": 9.171666666666667e-06, "loss": 0.4553, "step": 2500 }, { "epoch": 4.18848167539267, "eval_loss": 0.40933898091316223, "eval_runtime": 252.812, "eval_samples_per_second": 33.578, "eval_steps_per_second": 4.201, "step": 2500 }, { "epoch": 4.272251308900524, "grad_norm": 5.594339370727539, "learning_rate": 9.088333333333334e-06, "loss": 0.4592, "step": 2550 }, { "epoch": 4.356020942408377, "grad_norm": 2.930290699005127, "learning_rate": 9.005000000000001e-06, "loss": 0.4549, "step": 2600 }, { "epoch": 4.439790575916231, "grad_norm": 2.199352741241455, "learning_rate": 8.921666666666668e-06, "loss": 0.4512, "step": 2650 }, { "epoch": 4.523560209424084, "grad_norm": 2.435898780822754, "learning_rate": 8.838333333333335e-06, "loss": 0.4557, "step": 2700 }, { "epoch": 4.607329842931938, "grad_norm": 3.2397773265838623, "learning_rate": 8.755e-06, "loss": 0.449, "step": 2750 }, { "epoch": 4.69109947643979, "grad_norm": 1.9361001253128052, "learning_rate": 8.671666666666667e-06, "loss": 0.4525, "step": 2800 }, { "epoch": 4.774869109947644, "grad_norm": 2.1852686405181885, "learning_rate": 8.588333333333334e-06, "loss": 0.4486, "step": 2850 }, { "epoch": 4.858638743455497, "grad_norm": 1.8136191368103027, "learning_rate": 8.505e-06, "loss": 0.4517, "step": 2900 }, { "epoch": 4.942408376963351, "grad_norm": 2.694406032562256, "learning_rate": 8.421666666666668e-06, "loss": 0.4505, "step": 2950 }, { "epoch": 5.026178010471204, "grad_norm": 1.251187801361084, "learning_rate": 8.338333333333335e-06, "loss": 0.4489, "step": 3000 }, { "epoch": 5.026178010471204, "eval_loss": 0.39678552746772766, "eval_runtime": 256.1618, "eval_samples_per_second": 33.139, "eval_steps_per_second": 4.146, "step": 3000 }, { "epoch": 5.109947643979058, "grad_norm": 1.1454211473464966, "learning_rate": 8.255000000000001e-06, "loss": 0.4422, "step": 3050 }, { "epoch": 5.193717277486911, "grad_norm": 1.8685294389724731, "learning_rate": 8.171666666666668e-06, "loss": 0.4426, "step": 3100 }, { "epoch": 5.277486910994765, "grad_norm": 1.6863799095153809, "learning_rate": 8.088333333333334e-06, "loss": 0.4398, "step": 3150 }, { "epoch": 5.361256544502618, "grad_norm": 2.249805212020874, "learning_rate": 8.005e-06, "loss": 0.4384, "step": 3200 }, { "epoch": 5.445026178010472, "grad_norm": 2.1187326908111572, "learning_rate": 7.921666666666667e-06, "loss": 0.4431, "step": 3250 }, { "epoch": 5.528795811518324, "grad_norm": 1.8476357460021973, "learning_rate": 7.838333333333334e-06, "loss": 0.4434, "step": 3300 }, { "epoch": 5.612565445026178, "grad_norm": 1.6522760391235352, "learning_rate": 7.755000000000001e-06, "loss": 0.4343, "step": 3350 }, { "epoch": 5.696335078534031, "grad_norm": 1.3926664590835571, "learning_rate": 7.671666666666668e-06, "loss": 0.4365, "step": 3400 }, { "epoch": 5.780104712041885, "grad_norm": 2.1967947483062744, "learning_rate": 7.588333333333334e-06, "loss": 0.4362, "step": 3450 }, { "epoch": 5.863874345549738, "grad_norm": 1.5428054332733154, "learning_rate": 7.505e-06, "loss": 0.4337, "step": 3500 }, { "epoch": 5.863874345549738, "eval_loss": 0.39255669713020325, "eval_runtime": 251.4241, "eval_samples_per_second": 33.764, "eval_steps_per_second": 4.224, "step": 3500 }, { "epoch": 5.947643979057592, "grad_norm": 1.7545124292373657, "learning_rate": 7.421666666666667e-06, "loss": 0.4292, "step": 3550 }, { "epoch": 6.031413612565445, "grad_norm": 1.1912785768508911, "learning_rate": 7.338333333333334e-06, "loss": 0.4293, "step": 3600 }, { "epoch": 6.115183246073299, "grad_norm": 1.549297571182251, "learning_rate": 7.255000000000001e-06, "loss": 0.4333, "step": 3650 }, { "epoch": 6.198952879581152, "grad_norm": 1.8822449445724487, "learning_rate": 7.171666666666667e-06, "loss": 0.4308, "step": 3700 }, { "epoch": 6.282722513089006, "grad_norm": 1.6894242763519287, "learning_rate": 7.088333333333334e-06, "loss": 0.4336, "step": 3750 }, { "epoch": 6.366492146596858, "grad_norm": 1.63300621509552, "learning_rate": 7.005000000000001e-06, "loss": 0.432, "step": 3800 }, { "epoch": 6.450261780104712, "grad_norm": 1.4569647312164307, "learning_rate": 6.921666666666668e-06, "loss": 0.4302, "step": 3850 }, { "epoch": 6.534031413612565, "grad_norm": 1.3649786710739136, "learning_rate": 6.838333333333334e-06, "loss": 0.4277, "step": 3900 }, { "epoch": 6.617801047120419, "grad_norm": 1.6974161863327026, "learning_rate": 6.7550000000000005e-06, "loss": 0.4329, "step": 3950 }, { "epoch": 6.701570680628272, "grad_norm": 8.575983047485352, "learning_rate": 6.6716666666666674e-06, "loss": 0.4282, "step": 4000 }, { "epoch": 6.701570680628272, "eval_loss": 0.3837336301803589, "eval_runtime": 252.6194, "eval_samples_per_second": 33.604, "eval_steps_per_second": 4.204, "step": 4000 }, { "epoch": 6.785340314136126, "grad_norm": 1.308820366859436, "learning_rate": 6.588333333333334e-06, "loss": 0.4233, "step": 4050 }, { "epoch": 6.869109947643979, "grad_norm": 1.6496186256408691, "learning_rate": 6.505e-06, "loss": 0.4235, "step": 4100 }, { "epoch": 6.952879581151833, "grad_norm": 2.9797613620758057, "learning_rate": 6.421666666666667e-06, "loss": 0.424, "step": 4150 }, { "epoch": 7.036649214659686, "grad_norm": 1.366092324256897, "learning_rate": 6.338333333333334e-06, "loss": 0.4241, "step": 4200 }, { "epoch": 7.12041884816754, "grad_norm": 1.2892742156982422, "learning_rate": 6.255e-06, "loss": 0.4306, "step": 4250 }, { "epoch": 7.204188481675392, "grad_norm": 1.769739031791687, "learning_rate": 6.171666666666667e-06, "loss": 0.4206, "step": 4300 }, { "epoch": 7.287958115183246, "grad_norm": 1.2120298147201538, "learning_rate": 6.088333333333334e-06, "loss": 0.4245, "step": 4350 }, { "epoch": 7.371727748691099, "grad_norm": 2.545499086380005, "learning_rate": 6.005000000000001e-06, "loss": 0.4234, "step": 4400 }, { "epoch": 7.455497382198953, "grad_norm": 1.57191801071167, "learning_rate": 5.921666666666667e-06, "loss": 0.426, "step": 4450 }, { "epoch": 7.539267015706806, "grad_norm": 1.3846454620361328, "learning_rate": 5.838333333333334e-06, "loss": 0.4188, "step": 4500 }, { "epoch": 7.539267015706806, "eval_loss": 0.37984684109687805, "eval_runtime": 250.9224, "eval_samples_per_second": 33.831, "eval_steps_per_second": 4.232, "step": 4500 }, { "epoch": 7.62303664921466, "grad_norm": 1.3911954164505005, "learning_rate": 5.755000000000001e-06, "loss": 0.42, "step": 4550 }, { "epoch": 7.706806282722513, "grad_norm": 1.3781930208206177, "learning_rate": 5.671666666666668e-06, "loss": 0.4234, "step": 4600 }, { "epoch": 7.790575916230367, "grad_norm": 1.5225090980529785, "learning_rate": 5.588333333333334e-06, "loss": 0.4242, "step": 4650 }, { "epoch": 7.87434554973822, "grad_norm": 1.4195870161056519, "learning_rate": 5.505000000000001e-06, "loss": 0.4243, "step": 4700 }, { "epoch": 7.958115183246074, "grad_norm": 1.9713116884231567, "learning_rate": 5.4216666666666676e-06, "loss": 0.4158, "step": 4750 }, { "epoch": 8.041884816753926, "grad_norm": 1.3215075731277466, "learning_rate": 5.3383333333333345e-06, "loss": 0.4141, "step": 4800 }, { "epoch": 8.12565445026178, "grad_norm": 4.483395576477051, "learning_rate": 5.2550000000000005e-06, "loss": 0.4165, "step": 4850 }, { "epoch": 8.209424083769633, "grad_norm": 1.4759974479675293, "learning_rate": 5.171666666666667e-06, "loss": 0.4291, "step": 4900 }, { "epoch": 8.293193717277488, "grad_norm": 1.3336604833602905, "learning_rate": 5.088333333333334e-06, "loss": 0.4134, "step": 4950 }, { "epoch": 8.37696335078534, "grad_norm": 1.5618665218353271, "learning_rate": 5.0049999999999995e-06, "loss": 0.4222, "step": 5000 }, { "epoch": 8.37696335078534, "eval_loss": 0.37838345766067505, "eval_runtime": 256.1892, "eval_samples_per_second": 33.136, "eval_steps_per_second": 4.145, "step": 5000 }, { "epoch": 8.460732984293193, "grad_norm": 1.079275369644165, "learning_rate": 4.921666666666666e-06, "loss": 0.4143, "step": 5050 }, { "epoch": 8.544502617801047, "grad_norm": 1.0949628353118896, "learning_rate": 4.838333333333334e-06, "loss": 0.4123, "step": 5100 }, { "epoch": 8.6282722513089, "grad_norm": 1.5005624294281006, "learning_rate": 4.755e-06, "loss": 0.4218, "step": 5150 }, { "epoch": 8.712041884816754, "grad_norm": 5.408727169036865, "learning_rate": 4.671666666666667e-06, "loss": 0.4123, "step": 5200 }, { "epoch": 8.795811518324607, "grad_norm": 1.0426135063171387, "learning_rate": 4.588333333333333e-06, "loss": 0.4159, "step": 5250 }, { "epoch": 8.879581151832461, "grad_norm": 2.312485456466675, "learning_rate": 4.505e-06, "loss": 0.4149, "step": 5300 }, { "epoch": 8.963350785340314, "grad_norm": 1.6531749963760376, "learning_rate": 4.421666666666667e-06, "loss": 0.4147, "step": 5350 }, { "epoch": 9.047120418848168, "grad_norm": 1.2448443174362183, "learning_rate": 4.338333333333334e-06, "loss": 0.4191, "step": 5400 }, { "epoch": 9.13089005235602, "grad_norm": 1.1506507396697998, "learning_rate": 4.255e-06, "loss": 0.4094, "step": 5450 }, { "epoch": 9.214659685863875, "grad_norm": 1.2973967790603638, "learning_rate": 4.171666666666667e-06, "loss": 0.412, "step": 5500 }, { "epoch": 9.214659685863875, "eval_loss": 0.3729405403137207, "eval_runtime": 258.7159, "eval_samples_per_second": 32.812, "eval_steps_per_second": 4.105, "step": 5500 }, { "epoch": 9.298429319371728, "grad_norm": 1.0570714473724365, "learning_rate": 4.088333333333334e-06, "loss": 0.4144, "step": 5550 }, { "epoch": 9.38219895287958, "grad_norm": 1.1278653144836426, "learning_rate": 4.005000000000001e-06, "loss": 0.4154, "step": 5600 }, { "epoch": 9.465968586387435, "grad_norm": 5.811945915222168, "learning_rate": 3.921666666666667e-06, "loss": 0.4085, "step": 5650 }, { "epoch": 9.549738219895287, "grad_norm": 1.6973522901535034, "learning_rate": 3.8383333333333336e-06, "loss": 0.4133, "step": 5700 }, { "epoch": 9.633507853403142, "grad_norm": 1.209333062171936, "learning_rate": 3.7550000000000005e-06, "loss": 0.4123, "step": 5750 }, { "epoch": 9.717277486910994, "grad_norm": 3.592991590499878, "learning_rate": 3.6716666666666665e-06, "loss": 0.4126, "step": 5800 }, { "epoch": 9.801047120418849, "grad_norm": 1.152239203453064, "learning_rate": 3.588333333333334e-06, "loss": 0.4115, "step": 5850 }, { "epoch": 9.884816753926701, "grad_norm": 1.6118751764297485, "learning_rate": 3.505e-06, "loss": 0.4036, "step": 5900 }, { "epoch": 9.968586387434556, "grad_norm": 1.4384329319000244, "learning_rate": 3.4216666666666672e-06, "loss": 0.4103, "step": 5950 }, { "epoch": 10.052356020942408, "grad_norm": 1.447549819946289, "learning_rate": 3.3383333333333333e-06, "loss": 0.4056, "step": 6000 }, { "epoch": 10.052356020942408, "eval_loss": 0.3696598410606384, "eval_runtime": 256.5857, "eval_samples_per_second": 33.084, "eval_steps_per_second": 4.139, "step": 6000 }, { "epoch": 10.136125654450261, "grad_norm": 1.2832478284835815, "learning_rate": 3.255e-06, "loss": 0.4086, "step": 6050 }, { "epoch": 10.219895287958115, "grad_norm": 1.0580389499664307, "learning_rate": 3.1716666666666667e-06, "loss": 0.4128, "step": 6100 }, { "epoch": 10.303664921465968, "grad_norm": 1.5535204410552979, "learning_rate": 3.0883333333333336e-06, "loss": 0.4088, "step": 6150 }, { "epoch": 10.387434554973822, "grad_norm": 0.8630754947662354, "learning_rate": 3.005e-06, "loss": 0.4073, "step": 6200 }, { "epoch": 10.471204188481675, "grad_norm": 1.248646855354309, "learning_rate": 2.921666666666667e-06, "loss": 0.4074, "step": 6250 }, { "epoch": 10.55497382198953, "grad_norm": 0.9990864396095276, "learning_rate": 2.8383333333333334e-06, "loss": 0.4076, "step": 6300 }, { "epoch": 10.638743455497382, "grad_norm": 1.100101351737976, "learning_rate": 2.7550000000000003e-06, "loss": 0.4081, "step": 6350 }, { "epoch": 10.722513089005236, "grad_norm": 1.1483354568481445, "learning_rate": 2.6716666666666668e-06, "loss": 0.4088, "step": 6400 }, { "epoch": 10.806282722513089, "grad_norm": 0.8169103860855103, "learning_rate": 2.5883333333333337e-06, "loss": 0.406, "step": 6450 }, { "epoch": 10.890052356020943, "grad_norm": 1.0567216873168945, "learning_rate": 2.505e-06, "loss": 0.4065, "step": 6500 }, { "epoch": 10.890052356020943, "eval_loss": 0.36852771043777466, "eval_runtime": 255.1911, "eval_samples_per_second": 33.265, "eval_steps_per_second": 4.162, "step": 6500 }, { "epoch": 10.973821989528796, "grad_norm": 1.0459740161895752, "learning_rate": 2.421666666666667e-06, "loss": 0.4074, "step": 6550 }, { "epoch": 11.057591623036648, "grad_norm": 0.9029247760772705, "learning_rate": 2.3383333333333335e-06, "loss": 0.4075, "step": 6600 }, { "epoch": 11.141361256544503, "grad_norm": 1.5372889041900635, "learning_rate": 2.2550000000000004e-06, "loss": 0.4088, "step": 6650 }, { "epoch": 11.225130890052355, "grad_norm": 0.9959379434585571, "learning_rate": 2.171666666666667e-06, "loss": 0.4044, "step": 6700 }, { "epoch": 11.30890052356021, "grad_norm": 1.3793728351593018, "learning_rate": 2.088333333333334e-06, "loss": 0.4034, "step": 6750 }, { "epoch": 11.392670157068062, "grad_norm": 1.2086491584777832, "learning_rate": 2.0050000000000003e-06, "loss": 0.4073, "step": 6800 }, { "epoch": 11.476439790575917, "grad_norm": 1.07647705078125, "learning_rate": 1.9216666666666668e-06, "loss": 0.405, "step": 6850 }, { "epoch": 11.56020942408377, "grad_norm": 0.9849846363067627, "learning_rate": 1.8383333333333334e-06, "loss": 0.4037, "step": 6900 }, { "epoch": 11.643979057591624, "grad_norm": 1.2623456716537476, "learning_rate": 1.7550000000000001e-06, "loss": 0.4042, "step": 6950 }, { "epoch": 11.727748691099476, "grad_norm": 0.9488279819488525, "learning_rate": 1.6716666666666666e-06, "loss": 0.4069, "step": 7000 }, { "epoch": 11.727748691099476, "eval_loss": 0.3675082325935364, "eval_runtime": 257.7947, "eval_samples_per_second": 32.929, "eval_steps_per_second": 4.12, "step": 7000 }, { "epoch": 11.81151832460733, "grad_norm": 1.7772880792617798, "learning_rate": 1.5883333333333333e-06, "loss": 0.4062, "step": 7050 }, { "epoch": 11.895287958115183, "grad_norm": 0.9282962083816528, "learning_rate": 1.505e-06, "loss": 0.4097, "step": 7100 }, { "epoch": 11.979057591623036, "grad_norm": 0.9623945355415344, "learning_rate": 1.4216666666666667e-06, "loss": 0.4022, "step": 7150 }, { "epoch": 12.06282722513089, "grad_norm": 0.8984498381614685, "learning_rate": 1.34e-06, "loss": 0.4136, "step": 7200 }, { "epoch": 12.146596858638743, "grad_norm": 1.1274847984313965, "learning_rate": 1.2566666666666668e-06, "loss": 0.4074, "step": 7250 }, { "epoch": 12.230366492146597, "grad_norm": 0.9233041405677795, "learning_rate": 1.1733333333333335e-06, "loss": 0.4051, "step": 7300 }, { "epoch": 12.31413612565445, "grad_norm": 1.0804561376571655, "learning_rate": 1.0900000000000002e-06, "loss": 0.4045, "step": 7350 }, { "epoch": 12.397905759162304, "grad_norm": 0.9518505334854126, "learning_rate": 1.0066666666666668e-06, "loss": 0.4097, "step": 7400 }, { "epoch": 12.481675392670157, "grad_norm": 1.3323400020599365, "learning_rate": 9.233333333333334e-07, "loss": 0.4077, "step": 7450 }, { "epoch": 12.565445026178011, "grad_norm": 0.8265945911407471, "learning_rate": 8.400000000000001e-07, "loss": 0.4049, "step": 7500 }, { "epoch": 12.565445026178011, "eval_loss": 0.3666408658027649, "eval_runtime": 263.2918, "eval_samples_per_second": 32.242, "eval_steps_per_second": 4.034, "step": 7500 }, { "epoch": 12.649214659685864, "grad_norm": 0.8195668458938599, "learning_rate": 7.566666666666667e-07, "loss": 0.4032, "step": 7550 }, { "epoch": 12.732984293193716, "grad_norm": 0.9879769086837769, "learning_rate": 6.733333333333334e-07, "loss": 0.4078, "step": 7600 }, { "epoch": 12.81675392670157, "grad_norm": 2.3267557621002197, "learning_rate": 5.900000000000001e-07, "loss": 0.4058, "step": 7650 }, { "epoch": 12.900523560209423, "grad_norm": 0.9204362034797668, "learning_rate": 5.066666666666667e-07, "loss": 0.4052, "step": 7700 }, { "epoch": 12.984293193717278, "grad_norm": 1.0587321519851685, "learning_rate": 4.233333333333334e-07, "loss": 0.4111, "step": 7750 }, { "epoch": 13.06806282722513, "grad_norm": 0.8546725511550903, "learning_rate": 3.4000000000000003e-07, "loss": 0.4021, "step": 7800 }, { "epoch": 13.151832460732985, "grad_norm": 0.7696201801300049, "learning_rate": 2.566666666666667e-07, "loss": 0.4035, "step": 7850 }, { "epoch": 13.235602094240837, "grad_norm": 0.8487740755081177, "learning_rate": 1.7333333333333335e-07, "loss": 0.4077, "step": 7900 }, { "epoch": 13.319371727748692, "grad_norm": 0.9611801505088806, "learning_rate": 9e-08, "loss": 0.4058, "step": 7950 }, { "epoch": 13.403141361256544, "grad_norm": 0.7191503047943115, "learning_rate": 6.666666666666667e-09, "loss": 0.4044, "step": 8000 }, { "epoch": 13.403141361256544, "eval_loss": 0.3662460446357727, "eval_runtime": 261.9325, "eval_samples_per_second": 32.409, "eval_steps_per_second": 4.054, "step": 8000 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4332007404692806e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }