|
{ |
|
"best_metric": 0.3102165162563324, |
|
"best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-9500", |
|
"epoch": 15.916230366492147, |
|
"eval_steps": 500, |
|
"global_step": 9500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08376963350785341, |
|
"grad_norm": 2.9717624187469482, |
|
"learning_rate": 2.4500000000000003e-06, |
|
"loss": 1.0424, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16753926701570682, |
|
"grad_norm": 2.9720630645751953, |
|
"learning_rate": 4.950000000000001e-06, |
|
"loss": 0.8474, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2513089005235602, |
|
"grad_norm": 2.445929765701294, |
|
"learning_rate": 7.45e-06, |
|
"loss": 0.7336, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.33507853403141363, |
|
"grad_norm": 5.502955913543701, |
|
"learning_rate": 9.950000000000001e-06, |
|
"loss": 0.6492, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 2.3356130123138428, |
|
"learning_rate": 1.2450000000000001e-05, |
|
"loss": 0.6133, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5026178010471204, |
|
"grad_norm": 1.937270164489746, |
|
"learning_rate": 1.4950000000000001e-05, |
|
"loss": 0.5889, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5863874345549738, |
|
"grad_norm": 2.392244338989258, |
|
"learning_rate": 1.745e-05, |
|
"loss": 0.5694, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6701570680628273, |
|
"grad_norm": 7.3209919929504395, |
|
"learning_rate": 1.995e-05, |
|
"loss": 0.5477, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7539267015706806, |
|
"grad_norm": 3.415917158126831, |
|
"learning_rate": 2.245e-05, |
|
"loss": 0.5329, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 3.0256705284118652, |
|
"learning_rate": 2.495e-05, |
|
"loss": 0.5173, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"eval_loss": 0.4566049873828888, |
|
"eval_runtime": 268.5202, |
|
"eval_samples_per_second": 31.614, |
|
"eval_steps_per_second": 3.955, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9214659685863874, |
|
"grad_norm": 1.9436837434768677, |
|
"learning_rate": 2.7450000000000003e-05, |
|
"loss": 0.5079, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0052356020942408, |
|
"grad_norm": 1.819956660270691, |
|
"learning_rate": 2.995e-05, |
|
"loss": 0.4969, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.0890052356020943, |
|
"grad_norm": 5.457251071929932, |
|
"learning_rate": 3.245e-05, |
|
"loss": 0.4977, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.1727748691099475, |
|
"grad_norm": 3.183980703353882, |
|
"learning_rate": 3.495e-05, |
|
"loss": 0.4923, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.256544502617801, |
|
"grad_norm": 7.1660051345825195, |
|
"learning_rate": 3.745e-05, |
|
"loss": 0.4802, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3403141361256545, |
|
"grad_norm": 5.499026775360107, |
|
"learning_rate": 3.995e-05, |
|
"loss": 0.4754, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4240837696335078, |
|
"grad_norm": 2.8053908348083496, |
|
"learning_rate": 4.245e-05, |
|
"loss": 0.4669, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.5078534031413613, |
|
"grad_norm": 3.017005443572998, |
|
"learning_rate": 4.495e-05, |
|
"loss": 0.4604, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.5916230366492146, |
|
"grad_norm": 2.7971177101135254, |
|
"learning_rate": 4.745e-05, |
|
"loss": 0.4565, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"grad_norm": 3.1588356494903564, |
|
"learning_rate": 4.995e-05, |
|
"loss": 0.455, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"eval_loss": 0.40312233567237854, |
|
"eval_runtime": 271.3585, |
|
"eval_samples_per_second": 31.283, |
|
"eval_steps_per_second": 3.914, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.7591623036649215, |
|
"grad_norm": 2.2053232192993164, |
|
"learning_rate": 5.245e-05, |
|
"loss": 0.4543, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.8429319371727748, |
|
"grad_norm": 2.0562164783477783, |
|
"learning_rate": 5.495e-05, |
|
"loss": 0.4456, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.9267015706806283, |
|
"grad_norm": 2.730119466781616, |
|
"learning_rate": 5.745e-05, |
|
"loss": 0.4355, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.0104712041884816, |
|
"grad_norm": 1.7484283447265625, |
|
"learning_rate": 5.995000000000001e-05, |
|
"loss": 0.4299, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.094240837696335, |
|
"grad_norm": 1.1786061525344849, |
|
"learning_rate": 6.245000000000001e-05, |
|
"loss": 0.4305, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.1780104712041886, |
|
"grad_norm": 1.98978590965271, |
|
"learning_rate": 6.494999999999999e-05, |
|
"loss": 0.4295, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.261780104712042, |
|
"grad_norm": 2.818659782409668, |
|
"learning_rate": 6.745e-05, |
|
"loss": 0.4235, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.345549738219895, |
|
"grad_norm": 2.3864262104034424, |
|
"learning_rate": 6.995e-05, |
|
"loss": 0.4271, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.4293193717277486, |
|
"grad_norm": 1.3647903203964233, |
|
"learning_rate": 7.245000000000001e-05, |
|
"loss": 0.4208, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"grad_norm": 2.2144172191619873, |
|
"learning_rate": 7.495e-05, |
|
"loss": 0.4175, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"eval_loss": 0.3777858018875122, |
|
"eval_runtime": 273.3281, |
|
"eval_samples_per_second": 31.058, |
|
"eval_steps_per_second": 3.885, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.5968586387434556, |
|
"grad_norm": 1.6483193635940552, |
|
"learning_rate": 7.745e-05, |
|
"loss": 0.414, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.680628272251309, |
|
"grad_norm": 1.7688554525375366, |
|
"learning_rate": 7.995e-05, |
|
"loss": 0.4153, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.7643979057591626, |
|
"grad_norm": 1.2314317226409912, |
|
"learning_rate": 8.245e-05, |
|
"loss": 0.4089, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.8481675392670156, |
|
"grad_norm": 1.6623793840408325, |
|
"learning_rate": 8.495e-05, |
|
"loss": 0.4124, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.931937172774869, |
|
"grad_norm": 3.812507390975952, |
|
"learning_rate": 8.745000000000001e-05, |
|
"loss": 0.4112, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.0157068062827226, |
|
"grad_norm": 2.141019821166992, |
|
"learning_rate": 8.995e-05, |
|
"loss": 0.4081, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.099476439790576, |
|
"grad_norm": 1.8928133249282837, |
|
"learning_rate": 9.245e-05, |
|
"loss": 0.4067, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.183246073298429, |
|
"grad_norm": 2.322817087173462, |
|
"learning_rate": 9.495e-05, |
|
"loss": 0.4088, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.2670157068062826, |
|
"grad_norm": 2.1984918117523193, |
|
"learning_rate": 9.745000000000001e-05, |
|
"loss": 0.3976, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.350785340314136, |
|
"grad_norm": 2.0455121994018555, |
|
"learning_rate": 9.995e-05, |
|
"loss": 0.4022, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.350785340314136, |
|
"eval_loss": 0.3677983582019806, |
|
"eval_runtime": 274.4574, |
|
"eval_samples_per_second": 30.93, |
|
"eval_steps_per_second": 3.869, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.4345549738219896, |
|
"grad_norm": 1.2897744178771973, |
|
"learning_rate": 9.951e-05, |
|
"loss": 0.4026, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.518324607329843, |
|
"grad_norm": 1.470860242843628, |
|
"learning_rate": 9.901e-05, |
|
"loss": 0.4008, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.6020942408376966, |
|
"grad_norm": 1.2159388065338135, |
|
"learning_rate": 9.851e-05, |
|
"loss": 0.3971, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.6858638743455496, |
|
"grad_norm": 2.0348379611968994, |
|
"learning_rate": 9.801e-05, |
|
"loss": 0.396, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.769633507853403, |
|
"grad_norm": 1.7535659074783325, |
|
"learning_rate": 9.751e-05, |
|
"loss": 0.3929, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.8534031413612566, |
|
"grad_norm": 1.361984372138977, |
|
"learning_rate": 9.701e-05, |
|
"loss": 0.3905, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.93717277486911, |
|
"grad_norm": 1.7380383014678955, |
|
"learning_rate": 9.651e-05, |
|
"loss": 0.3957, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.020942408376963, |
|
"grad_norm": 1.2679184675216675, |
|
"learning_rate": 9.601e-05, |
|
"loss": 0.388, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.104712041884817, |
|
"grad_norm": 1.274625301361084, |
|
"learning_rate": 9.551e-05, |
|
"loss": 0.3887, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.18848167539267, |
|
"grad_norm": 1.813714861869812, |
|
"learning_rate": 9.501e-05, |
|
"loss": 0.3865, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.18848167539267, |
|
"eval_loss": 0.35398951172828674, |
|
"eval_runtime": 271.385, |
|
"eval_samples_per_second": 31.28, |
|
"eval_steps_per_second": 3.913, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.272251308900524, |
|
"grad_norm": 2.468984842300415, |
|
"learning_rate": 9.451000000000002e-05, |
|
"loss": 0.3902, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.356020942408377, |
|
"grad_norm": 1.2810943126678467, |
|
"learning_rate": 9.401e-05, |
|
"loss": 0.386, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.439790575916231, |
|
"grad_norm": 1.6781765222549438, |
|
"learning_rate": 9.351e-05, |
|
"loss": 0.383, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.523560209424084, |
|
"grad_norm": 1.617163896560669, |
|
"learning_rate": 9.301e-05, |
|
"loss": 0.3849, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.607329842931938, |
|
"grad_norm": 1.4169151782989502, |
|
"learning_rate": 9.251000000000001e-05, |
|
"loss": 0.3807, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.69109947643979, |
|
"grad_norm": 1.1944037675857544, |
|
"learning_rate": 9.201000000000001e-05, |
|
"loss": 0.3838, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.774869109947644, |
|
"grad_norm": 1.7312718629837036, |
|
"learning_rate": 9.151000000000001e-05, |
|
"loss": 0.3808, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.858638743455497, |
|
"grad_norm": 1.357228398323059, |
|
"learning_rate": 9.101000000000001e-05, |
|
"loss": 0.3832, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.942408376963351, |
|
"grad_norm": 1.2495553493499756, |
|
"learning_rate": 9.051000000000001e-05, |
|
"loss": 0.3837, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.026178010471204, |
|
"grad_norm": 1.3688994646072388, |
|
"learning_rate": 9.001e-05, |
|
"loss": 0.3802, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.026178010471204, |
|
"eval_loss": 0.3458922803401947, |
|
"eval_runtime": 277.371, |
|
"eval_samples_per_second": 30.605, |
|
"eval_steps_per_second": 3.829, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.109947643979058, |
|
"grad_norm": 1.0916550159454346, |
|
"learning_rate": 8.951e-05, |
|
"loss": 0.3747, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.193717277486911, |
|
"grad_norm": 1.4605640172958374, |
|
"learning_rate": 8.901e-05, |
|
"loss": 0.3765, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.277486910994765, |
|
"grad_norm": 1.302049994468689, |
|
"learning_rate": 8.851e-05, |
|
"loss": 0.3753, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.361256544502618, |
|
"grad_norm": 1.0380531549453735, |
|
"learning_rate": 8.801e-05, |
|
"loss": 0.3735, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.445026178010472, |
|
"grad_norm": 2.157710075378418, |
|
"learning_rate": 8.751000000000001e-05, |
|
"loss": 0.3766, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.528795811518324, |
|
"grad_norm": 2.2072594165802, |
|
"learning_rate": 8.701000000000001e-05, |
|
"loss": 0.3767, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.612565445026178, |
|
"grad_norm": 1.258347749710083, |
|
"learning_rate": 8.651e-05, |
|
"loss": 0.3709, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 5.696335078534031, |
|
"grad_norm": 1.7026106119155884, |
|
"learning_rate": 8.601e-05, |
|
"loss": 0.3715, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.780104712041885, |
|
"grad_norm": 1.1708229780197144, |
|
"learning_rate": 8.551e-05, |
|
"loss": 0.3716, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 5.863874345549738, |
|
"grad_norm": 2.3675355911254883, |
|
"learning_rate": 8.501e-05, |
|
"loss": 0.3693, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.863874345549738, |
|
"eval_loss": 0.3417563736438751, |
|
"eval_runtime": 272.8827, |
|
"eval_samples_per_second": 31.109, |
|
"eval_steps_per_second": 3.892, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.947643979057592, |
|
"grad_norm": 1.6144191026687622, |
|
"learning_rate": 8.451e-05, |
|
"loss": 0.3666, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 6.031413612565445, |
|
"grad_norm": 1.4944205284118652, |
|
"learning_rate": 8.401e-05, |
|
"loss": 0.3657, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.115183246073299, |
|
"grad_norm": 1.0198278427124023, |
|
"learning_rate": 8.351e-05, |
|
"loss": 0.3702, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 6.198952879581152, |
|
"grad_norm": 2.195380926132202, |
|
"learning_rate": 8.300999999999999e-05, |
|
"loss": 0.3686, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 6.282722513089006, |
|
"grad_norm": 1.3650749921798706, |
|
"learning_rate": 8.251e-05, |
|
"loss": 0.3701, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 6.366492146596858, |
|
"grad_norm": 1.6887727975845337, |
|
"learning_rate": 8.201000000000001e-05, |
|
"loss": 0.3677, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 6.450261780104712, |
|
"grad_norm": 0.8709685206413269, |
|
"learning_rate": 8.151000000000001e-05, |
|
"loss": 0.3678, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 6.534031413612565, |
|
"grad_norm": 1.0899595022201538, |
|
"learning_rate": 8.101000000000001e-05, |
|
"loss": 0.3641, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 6.617801047120419, |
|
"grad_norm": 1.1222867965698242, |
|
"learning_rate": 8.051000000000001e-05, |
|
"loss": 0.3691, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 6.701570680628272, |
|
"grad_norm": 1.0771104097366333, |
|
"learning_rate": 8.001e-05, |
|
"loss": 0.3674, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.701570680628272, |
|
"eval_loss": 0.3313756585121155, |
|
"eval_runtime": 279.286, |
|
"eval_samples_per_second": 30.395, |
|
"eval_steps_per_second": 3.803, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.785340314136126, |
|
"grad_norm": 1.868295669555664, |
|
"learning_rate": 7.951e-05, |
|
"loss": 0.3617, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 6.869109947643979, |
|
"grad_norm": 1.0599360466003418, |
|
"learning_rate": 7.901e-05, |
|
"loss": 0.3637, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 6.952879581151833, |
|
"grad_norm": 1.4801158905029297, |
|
"learning_rate": 7.851e-05, |
|
"loss": 0.363, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 7.036649214659686, |
|
"grad_norm": 1.137289047241211, |
|
"learning_rate": 7.801000000000001e-05, |
|
"loss": 0.3622, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 7.12041884816754, |
|
"grad_norm": 1.2109190225601196, |
|
"learning_rate": 7.751000000000001e-05, |
|
"loss": 0.3668, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 7.204188481675392, |
|
"grad_norm": 1.1171132326126099, |
|
"learning_rate": 7.701000000000001e-05, |
|
"loss": 0.3594, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 7.287958115183246, |
|
"grad_norm": 1.2529895305633545, |
|
"learning_rate": 7.651e-05, |
|
"loss": 0.3635, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 7.371727748691099, |
|
"grad_norm": 1.352792739868164, |
|
"learning_rate": 7.601e-05, |
|
"loss": 0.3627, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 7.455497382198953, |
|
"grad_norm": 0.8809813261032104, |
|
"learning_rate": 7.552e-05, |
|
"loss": 0.3647, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 7.539267015706806, |
|
"grad_norm": 4.0386962890625, |
|
"learning_rate": 7.502e-05, |
|
"loss": 0.3582, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 7.539267015706806, |
|
"eval_loss": 0.32692766189575195, |
|
"eval_runtime": 272.0854, |
|
"eval_samples_per_second": 31.2, |
|
"eval_steps_per_second": 3.903, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 7.62303664921466, |
|
"grad_norm": 1.616075873374939, |
|
"learning_rate": 7.452e-05, |
|
"loss": 0.3603, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 7.706806282722513, |
|
"grad_norm": 2.2668583393096924, |
|
"learning_rate": 7.402e-05, |
|
"loss": 0.3622, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 7.790575916230367, |
|
"grad_norm": 1.0464789867401123, |
|
"learning_rate": 7.352e-05, |
|
"loss": 0.3667, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 7.87434554973822, |
|
"grad_norm": 1.2528297901153564, |
|
"learning_rate": 7.302e-05, |
|
"loss": 0.3631, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 7.958115183246074, |
|
"grad_norm": 1.72895085811615, |
|
"learning_rate": 7.252e-05, |
|
"loss": 0.3567, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 8.041884816753926, |
|
"grad_norm": 1.5020617246627808, |
|
"learning_rate": 7.202e-05, |
|
"loss": 0.3553, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 8.12565445026178, |
|
"grad_norm": 1.976888656616211, |
|
"learning_rate": 7.151999999999999e-05, |
|
"loss": 0.3569, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 8.209424083769633, |
|
"grad_norm": 1.156580924987793, |
|
"learning_rate": 7.102000000000001e-05, |
|
"loss": 0.3659, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 8.293193717277488, |
|
"grad_norm": 0.9017566442489624, |
|
"learning_rate": 7.052000000000001e-05, |
|
"loss": 0.3549, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 8.37696335078534, |
|
"grad_norm": 1.5168513059616089, |
|
"learning_rate": 7.002000000000001e-05, |
|
"loss": 0.362, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 8.37696335078534, |
|
"eval_loss": 0.34056970477104187, |
|
"eval_runtime": 276.7614, |
|
"eval_samples_per_second": 30.673, |
|
"eval_steps_per_second": 3.837, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 8.460732984293193, |
|
"grad_norm": 1.111985206604004, |
|
"learning_rate": 6.952000000000001e-05, |
|
"loss": 0.3553, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 8.544502617801047, |
|
"grad_norm": 1.3966108560562134, |
|
"learning_rate": 6.902000000000001e-05, |
|
"loss": 0.3545, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 8.6282722513089, |
|
"grad_norm": 1.3428140878677368, |
|
"learning_rate": 6.852e-05, |
|
"loss": 0.3609, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 8.712041884816754, |
|
"grad_norm": 1.9436802864074707, |
|
"learning_rate": 6.802e-05, |
|
"loss": 0.3547, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 8.795811518324607, |
|
"grad_norm": 1.1481266021728516, |
|
"learning_rate": 6.752e-05, |
|
"loss": 0.3569, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 8.879581151832461, |
|
"grad_norm": 1.410223364830017, |
|
"learning_rate": 6.702e-05, |
|
"loss": 0.3558, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 8.963350785340314, |
|
"grad_norm": 1.7548959255218506, |
|
"learning_rate": 6.652000000000001e-05, |
|
"loss": 0.3561, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 9.047120418848168, |
|
"grad_norm": 1.343935489654541, |
|
"learning_rate": 6.602000000000001e-05, |
|
"loss": 0.3609, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 9.13089005235602, |
|
"grad_norm": 1.5190401077270508, |
|
"learning_rate": 6.552000000000001e-05, |
|
"loss": 0.3504, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 9.214659685863875, |
|
"grad_norm": 0.8521016240119934, |
|
"learning_rate": 6.502e-05, |
|
"loss": 0.3521, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 9.214659685863875, |
|
"eval_loss": 0.3218235671520233, |
|
"eval_runtime": 279.5684, |
|
"eval_samples_per_second": 30.365, |
|
"eval_steps_per_second": 3.799, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 9.298429319371728, |
|
"grad_norm": 1.0284796953201294, |
|
"learning_rate": 6.452e-05, |
|
"loss": 0.356, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 9.38219895287958, |
|
"grad_norm": 1.8278234004974365, |
|
"learning_rate": 6.402e-05, |
|
"loss": 0.356, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 9.465968586387435, |
|
"grad_norm": 0.9208963513374329, |
|
"learning_rate": 6.352e-05, |
|
"loss": 0.3504, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 9.549738219895287, |
|
"grad_norm": 1.295639991760254, |
|
"learning_rate": 6.302e-05, |
|
"loss": 0.3551, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 9.633507853403142, |
|
"grad_norm": 0.9757601022720337, |
|
"learning_rate": 6.252e-05, |
|
"loss": 0.3529, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 9.717277486910994, |
|
"grad_norm": 1.451418399810791, |
|
"learning_rate": 6.202e-05, |
|
"loss": 0.3537, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 9.801047120418849, |
|
"grad_norm": 2.2001028060913086, |
|
"learning_rate": 6.152e-05, |
|
"loss": 0.3522, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 9.884816753926701, |
|
"grad_norm": 1.1149827241897583, |
|
"learning_rate": 6.102e-05, |
|
"loss": 0.3472, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 9.968586387434556, |
|
"grad_norm": 1.4035720825195312, |
|
"learning_rate": 6.0519999999999997e-05, |
|
"loss": 0.3525, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 10.052356020942408, |
|
"grad_norm": 1.0732487440109253, |
|
"learning_rate": 6.002e-05, |
|
"loss": 0.3485, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 10.052356020942408, |
|
"eval_loss": 0.31853485107421875, |
|
"eval_runtime": 271.779, |
|
"eval_samples_per_second": 31.235, |
|
"eval_steps_per_second": 3.908, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 10.136125654450261, |
|
"grad_norm": 1.2576690912246704, |
|
"learning_rate": 5.952e-05, |
|
"loss": 0.3488, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 10.219895287958115, |
|
"grad_norm": 1.2645186185836792, |
|
"learning_rate": 5.902e-05, |
|
"loss": 0.3537, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 10.303664921465968, |
|
"grad_norm": 1.743445634841919, |
|
"learning_rate": 5.852000000000001e-05, |
|
"loss": 0.3501, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 10.387434554973822, |
|
"grad_norm": 1.2827191352844238, |
|
"learning_rate": 5.802000000000001e-05, |
|
"loss": 0.349, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 10.471204188481675, |
|
"grad_norm": 1.0109118223190308, |
|
"learning_rate": 5.7520000000000005e-05, |
|
"loss": 0.3495, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 10.55497382198953, |
|
"grad_norm": 1.420745611190796, |
|
"learning_rate": 5.7020000000000006e-05, |
|
"loss": 0.3493, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 10.638743455497382, |
|
"grad_norm": 1.2105921506881714, |
|
"learning_rate": 5.652000000000001e-05, |
|
"loss": 0.3487, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 10.722513089005236, |
|
"grad_norm": 1.1536401510238647, |
|
"learning_rate": 5.602000000000001e-05, |
|
"loss": 0.35, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 10.806282722513089, |
|
"grad_norm": 1.0635104179382324, |
|
"learning_rate": 5.5520000000000004e-05, |
|
"loss": 0.3475, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 10.890052356020943, |
|
"grad_norm": 1.4069427251815796, |
|
"learning_rate": 5.5020000000000005e-05, |
|
"loss": 0.3472, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 10.890052356020943, |
|
"eval_loss": 0.3199196457862854, |
|
"eval_runtime": 276.9702, |
|
"eval_samples_per_second": 30.65, |
|
"eval_steps_per_second": 3.834, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 10.973821989528796, |
|
"grad_norm": 0.8649620413780212, |
|
"learning_rate": 5.4520000000000007e-05, |
|
"loss": 0.3496, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 11.057591623036648, |
|
"grad_norm": 2.6794686317443848, |
|
"learning_rate": 5.402e-05, |
|
"loss": 0.3482, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 11.141361256544503, |
|
"grad_norm": 1.6224123239517212, |
|
"learning_rate": 5.352e-05, |
|
"loss": 0.3498, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 11.225130890052355, |
|
"grad_norm": 1.2548692226409912, |
|
"learning_rate": 5.3020000000000004e-05, |
|
"loss": 0.346, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 11.30890052356021, |
|
"grad_norm": 1.390360713005066, |
|
"learning_rate": 5.2520000000000005e-05, |
|
"loss": 0.345, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 11.392670157068062, |
|
"grad_norm": 1.1040029525756836, |
|
"learning_rate": 5.202e-05, |
|
"loss": 0.3477, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 11.476439790575917, |
|
"grad_norm": 1.0738588571548462, |
|
"learning_rate": 5.152e-05, |
|
"loss": 0.3455, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 11.56020942408377, |
|
"grad_norm": 1.0175799131393433, |
|
"learning_rate": 5.102e-05, |
|
"loss": 0.3448, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 11.643979057591624, |
|
"grad_norm": 1.8546490669250488, |
|
"learning_rate": 5.052e-05, |
|
"loss": 0.346, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 11.727748691099476, |
|
"grad_norm": 1.7156524658203125, |
|
"learning_rate": 5.002e-05, |
|
"loss": 0.3469, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 11.727748691099476, |
|
"eval_loss": 0.31849026679992676, |
|
"eval_runtime": 283.0428, |
|
"eval_samples_per_second": 29.992, |
|
"eval_steps_per_second": 3.752, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 11.81151832460733, |
|
"grad_norm": 1.1094063520431519, |
|
"learning_rate": 4.952e-05, |
|
"loss": 0.346, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 11.895287958115183, |
|
"grad_norm": 1.8263230323791504, |
|
"learning_rate": 4.902e-05, |
|
"loss": 0.3496, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 11.979057591623036, |
|
"grad_norm": 1.4049593210220337, |
|
"learning_rate": 4.852e-05, |
|
"loss": 0.3433, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 12.06282722513089, |
|
"grad_norm": 1.3455963134765625, |
|
"learning_rate": 4.8030000000000006e-05, |
|
"loss": 0.3518, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 12.146596858638743, |
|
"grad_norm": 1.174660325050354, |
|
"learning_rate": 4.753e-05, |
|
"loss": 0.348, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 12.230366492146597, |
|
"grad_norm": 1.2765902280807495, |
|
"learning_rate": 4.703e-05, |
|
"loss": 0.345, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 12.31413612565445, |
|
"grad_norm": 1.419295072555542, |
|
"learning_rate": 4.6530000000000003e-05, |
|
"loss": 0.3436, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 12.397905759162304, |
|
"grad_norm": 1.3437247276306152, |
|
"learning_rate": 4.603e-05, |
|
"loss": 0.3469, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 12.481675392670157, |
|
"grad_norm": 1.6074751615524292, |
|
"learning_rate": 4.553e-05, |
|
"loss": 0.3461, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 12.565445026178011, |
|
"grad_norm": 1.432062029838562, |
|
"learning_rate": 4.503e-05, |
|
"loss": 0.3441, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 12.565445026178011, |
|
"eval_loss": 0.3222896158695221, |
|
"eval_runtime": 282.6486, |
|
"eval_samples_per_second": 30.034, |
|
"eval_steps_per_second": 3.757, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 12.649214659685864, |
|
"grad_norm": 1.4210392236709595, |
|
"learning_rate": 4.453e-05, |
|
"loss": 0.3436, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 12.732984293193716, |
|
"grad_norm": 1.275467038154602, |
|
"learning_rate": 4.4030000000000004e-05, |
|
"loss": 0.3453, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 12.81675392670157, |
|
"grad_norm": 1.1207870244979858, |
|
"learning_rate": 4.3530000000000005e-05, |
|
"loss": 0.3438, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 12.900523560209423, |
|
"grad_norm": 1.8535631895065308, |
|
"learning_rate": 4.3030000000000006e-05, |
|
"loss": 0.3442, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 12.984293193717278, |
|
"grad_norm": 1.0426372289657593, |
|
"learning_rate": 4.253e-05, |
|
"loss": 0.3494, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 13.06806282722513, |
|
"grad_norm": 1.3337020874023438, |
|
"learning_rate": 4.203e-05, |
|
"loss": 0.3413, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 13.151832460732985, |
|
"grad_norm": 1.017905592918396, |
|
"learning_rate": 4.1530000000000004e-05, |
|
"loss": 0.3417, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 13.235602094240837, |
|
"grad_norm": 1.166343331336975, |
|
"learning_rate": 4.103e-05, |
|
"loss": 0.3443, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 13.319371727748692, |
|
"grad_norm": 1.4170418977737427, |
|
"learning_rate": 4.053e-05, |
|
"loss": 0.3433, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 13.403141361256544, |
|
"grad_norm": 1.125741720199585, |
|
"learning_rate": 4.003e-05, |
|
"loss": 0.3422, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 13.403141361256544, |
|
"eval_loss": 0.31487980484962463, |
|
"eval_runtime": 278.3852, |
|
"eval_samples_per_second": 30.494, |
|
"eval_steps_per_second": 3.815, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 13.486910994764397, |
|
"grad_norm": 1.5452402830123901, |
|
"learning_rate": 3.953e-05, |
|
"loss": 0.3403, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 13.570680628272251, |
|
"grad_norm": 0.9096773862838745, |
|
"learning_rate": 3.903e-05, |
|
"loss": 0.3409, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 13.654450261780104, |
|
"grad_norm": 1.6249001026153564, |
|
"learning_rate": 3.853e-05, |
|
"loss": 0.3414, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 13.738219895287958, |
|
"grad_norm": 0.9276340007781982, |
|
"learning_rate": 3.803000000000001e-05, |
|
"loss": 0.3389, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 13.821989528795811, |
|
"grad_norm": 1.7416585683822632, |
|
"learning_rate": 3.753e-05, |
|
"loss": 0.343, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 13.905759162303665, |
|
"grad_norm": 2.2160768508911133, |
|
"learning_rate": 3.703e-05, |
|
"loss": 0.3402, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 13.989528795811518, |
|
"grad_norm": 1.0885984897613525, |
|
"learning_rate": 3.6530000000000004e-05, |
|
"loss": 0.3407, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 14.073298429319372, |
|
"grad_norm": 0.9969326853752136, |
|
"learning_rate": 3.6030000000000006e-05, |
|
"loss": 0.3447, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 14.157068062827225, |
|
"grad_norm": 1.2978531122207642, |
|
"learning_rate": 3.553e-05, |
|
"loss": 0.3377, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 14.24083769633508, |
|
"grad_norm": 1.0465147495269775, |
|
"learning_rate": 3.503e-05, |
|
"loss": 0.3396, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 14.24083769633508, |
|
"eval_loss": 0.310507208108902, |
|
"eval_runtime": 279.5625, |
|
"eval_samples_per_second": 30.365, |
|
"eval_steps_per_second": 3.799, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 14.324607329842932, |
|
"grad_norm": 2.537041425704956, |
|
"learning_rate": 3.453e-05, |
|
"loss": 0.3418, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 14.408376963350785, |
|
"grad_norm": 1.3357998132705688, |
|
"learning_rate": 3.403e-05, |
|
"loss": 0.3408, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 14.492146596858639, |
|
"grad_norm": 0.8550173044204712, |
|
"learning_rate": 3.353e-05, |
|
"loss": 0.3408, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 14.575916230366492, |
|
"grad_norm": 1.4455218315124512, |
|
"learning_rate": 3.303e-05, |
|
"loss": 0.3407, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 14.659685863874346, |
|
"grad_norm": 1.0547473430633545, |
|
"learning_rate": 3.253e-05, |
|
"loss": 0.3382, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 14.743455497382199, |
|
"grad_norm": 1.5398694276809692, |
|
"learning_rate": 3.2029999999999997e-05, |
|
"loss": 0.3402, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 14.827225130890053, |
|
"grad_norm": 1.008465051651001, |
|
"learning_rate": 3.1530000000000005e-05, |
|
"loss": 0.3433, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 14.910994764397905, |
|
"grad_norm": 1.8319462537765503, |
|
"learning_rate": 3.1030000000000006e-05, |
|
"loss": 0.341, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 14.99476439790576, |
|
"grad_norm": 1.1432167291641235, |
|
"learning_rate": 3.053e-05, |
|
"loss": 0.3369, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 15.078534031413612, |
|
"grad_norm": 1.098186731338501, |
|
"learning_rate": 3.0030000000000002e-05, |
|
"loss": 0.3396, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 15.078534031413612, |
|
"eval_loss": 0.31039854884147644, |
|
"eval_runtime": 280.3967, |
|
"eval_samples_per_second": 30.275, |
|
"eval_steps_per_second": 3.787, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 15.162303664921467, |
|
"grad_norm": 1.0989015102386475, |
|
"learning_rate": 2.9530000000000004e-05, |
|
"loss": 0.3381, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 15.24607329842932, |
|
"grad_norm": 1.1959214210510254, |
|
"learning_rate": 2.903e-05, |
|
"loss": 0.3381, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 15.329842931937172, |
|
"grad_norm": 0.9721996188163757, |
|
"learning_rate": 2.853e-05, |
|
"loss": 0.3384, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 15.413612565445026, |
|
"grad_norm": 1.2921016216278076, |
|
"learning_rate": 2.803e-05, |
|
"loss": 0.3375, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 15.497382198952879, |
|
"grad_norm": 1.1854231357574463, |
|
"learning_rate": 2.753e-05, |
|
"loss": 0.3389, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 15.581151832460733, |
|
"grad_norm": 1.571321725845337, |
|
"learning_rate": 2.703e-05, |
|
"loss": 0.3406, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 15.664921465968586, |
|
"grad_norm": 1.2595016956329346, |
|
"learning_rate": 2.6540000000000003e-05, |
|
"loss": 0.3392, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 15.74869109947644, |
|
"grad_norm": 1.2291969060897827, |
|
"learning_rate": 2.6040000000000005e-05, |
|
"loss": 0.3362, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 15.832460732984293, |
|
"grad_norm": 1.0605494976043701, |
|
"learning_rate": 2.5540000000000003e-05, |
|
"loss": 0.3388, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 15.916230366492147, |
|
"grad_norm": 0.9927255511283875, |
|
"learning_rate": 2.504e-05, |
|
"loss": 0.3391, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 15.916230366492147, |
|
"eval_loss": 0.3102165162563324, |
|
"eval_runtime": 279.552, |
|
"eval_samples_per_second": 30.366, |
|
"eval_steps_per_second": 3.799, |
|
"step": 9500 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 12000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 21, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7021322045447034e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|