|
{ |
|
"best_metric": 0.9593754410743713, |
|
"best_model_checkpoint": "/scratch/czm5kz/NEW_finetuned_llama27b32_1_0.0003_alternate_no_output/checkpoint-1400", |
|
"epoch": 0.9975062344139651, |
|
"eval_steps": 20, |
|
"global_step": 1400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9491676092147827, |
|
"learning_rate": 0.0002989308624376336, |
|
"loss": 3.393, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.7517892122268677, |
|
"learning_rate": 0.00029786172487526725, |
|
"loss": 2.5606, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6203854084014893, |
|
"learning_rate": 0.0002967925873129009, |
|
"loss": 1.987, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.8757699728012085, |
|
"learning_rate": 0.00029572344975053457, |
|
"loss": 1.7622, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.6077924966812134, |
|
"eval_runtime": 227.5054, |
|
"eval_samples_per_second": 49.357, |
|
"eval_steps_per_second": 6.171, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9948850870132446, |
|
"learning_rate": 0.00029465431218816815, |
|
"loss": 1.5741, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1261812448501587, |
|
"learning_rate": 0.00029358517462580184, |
|
"loss": 1.4732, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7787383794784546, |
|
"learning_rate": 0.0002925160370634355, |
|
"loss": 1.3471, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.935620129108429, |
|
"learning_rate": 0.0002914468995010691, |
|
"loss": 1.2991, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 1.243576169013977, |
|
"eval_runtime": 227.5867, |
|
"eval_samples_per_second": 49.339, |
|
"eval_steps_per_second": 6.169, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9946210980415344, |
|
"learning_rate": 0.00029037776193870275, |
|
"loss": 1.2733, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.828574538230896, |
|
"learning_rate": 0.0002893086243763364, |
|
"loss": 1.203, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7947638630867004, |
|
"learning_rate": 0.00028823948681397, |
|
"loss": 1.1877, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.826960027217865, |
|
"learning_rate": 0.0002871703492516037, |
|
"loss": 1.1397, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.1289246082305908, |
|
"eval_runtime": 227.0882, |
|
"eval_samples_per_second": 49.448, |
|
"eval_steps_per_second": 6.183, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8590031862258911, |
|
"learning_rate": 0.0002861012116892373, |
|
"loss": 1.1081, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6775720715522766, |
|
"learning_rate": 0.000285032074126871, |
|
"loss": 1.1283, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.059350609779358, |
|
"learning_rate": 0.0002839629365645046, |
|
"loss": 1.1091, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.754294216632843, |
|
"learning_rate": 0.00028289379900213826, |
|
"loss": 1.1062, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.0860652923583984, |
|
"eval_runtime": 227.2483, |
|
"eval_samples_per_second": 49.413, |
|
"eval_steps_per_second": 6.178, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7335007786750793, |
|
"learning_rate": 0.0002818246614397719, |
|
"loss": 1.0554, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6339726448059082, |
|
"learning_rate": 0.00028075552387740553, |
|
"loss": 1.0523, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1034719944000244, |
|
"learning_rate": 0.00027968638631503917, |
|
"loss": 1.0769, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6975623369216919, |
|
"learning_rate": 0.0002786172487526728, |
|
"loss": 1.0596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 1.0658228397369385, |
|
"eval_runtime": 227.3685, |
|
"eval_samples_per_second": 49.387, |
|
"eval_steps_per_second": 6.175, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6593163013458252, |
|
"learning_rate": 0.00027754811119030644, |
|
"loss": 1.079, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6725738048553467, |
|
"learning_rate": 0.0002764789736279401, |
|
"loss": 1.0656, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5702206492424011, |
|
"learning_rate": 0.00027540983606557377, |
|
"loss": 1.0733, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7566413283348083, |
|
"learning_rate": 0.0002743406985032074, |
|
"loss": 1.0589, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.0495774745941162, |
|
"eval_runtime": 227.0521, |
|
"eval_samples_per_second": 49.456, |
|
"eval_steps_per_second": 6.184, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6189225912094116, |
|
"learning_rate": 0.00027327156094084104, |
|
"loss": 1.0778, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6081388592720032, |
|
"learning_rate": 0.0002722024233784747, |
|
"loss": 1.0163, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.2115389108657837, |
|
"learning_rate": 0.0002711332858161083, |
|
"loss": 1.0035, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6294423341751099, |
|
"learning_rate": 0.00027006414825374195, |
|
"loss": 1.0234, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.0386356115341187, |
|
"eval_runtime": 227.6468, |
|
"eval_samples_per_second": 49.326, |
|
"eval_steps_per_second": 6.167, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6168947219848633, |
|
"learning_rate": 0.00026899501069137564, |
|
"loss": 0.9842, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5433680415153503, |
|
"learning_rate": 0.0002679258731290092, |
|
"loss": 1.0196, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5716556906700134, |
|
"learning_rate": 0.0002668567355666429, |
|
"loss": 1.0261, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7313571572303772, |
|
"learning_rate": 0.00026578759800427654, |
|
"loss": 1.0426, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.0302481651306152, |
|
"eval_runtime": 227.5439, |
|
"eval_samples_per_second": 49.349, |
|
"eval_steps_per_second": 6.17, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5894333124160767, |
|
"learning_rate": 0.0002647184604419102, |
|
"loss": 1.0848, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5585373640060425, |
|
"learning_rate": 0.0002636493228795438, |
|
"loss": 1.0557, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5632246136665344, |
|
"learning_rate": 0.00026258018531717745, |
|
"loss": 1.0309, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.49537792801856995, |
|
"learning_rate": 0.0002615110477548111, |
|
"loss": 1.0356, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 1.0230164527893066, |
|
"eval_runtime": 227.8202, |
|
"eval_samples_per_second": 49.289, |
|
"eval_steps_per_second": 6.163, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5201237201690674, |
|
"learning_rate": 0.0002604419101924447, |
|
"loss": 1.0166, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7654304504394531, |
|
"learning_rate": 0.00025937277263007836, |
|
"loss": 1.0645, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5226192474365234, |
|
"learning_rate": 0.000258303635067712, |
|
"loss": 1.0297, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5896185040473938, |
|
"learning_rate": 0.0002572344975053457, |
|
"loss": 1.0265, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.0194729566574097, |
|
"eval_runtime": 227.6113, |
|
"eval_samples_per_second": 49.334, |
|
"eval_steps_per_second": 6.168, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5336751937866211, |
|
"learning_rate": 0.0002561653599429793, |
|
"loss": 1.0388, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6317204833030701, |
|
"learning_rate": 0.00025509622238061296, |
|
"loss": 1.0365, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5186619758605957, |
|
"learning_rate": 0.0002540270848182466, |
|
"loss": 0.9862, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5502139925956726, |
|
"learning_rate": 0.00025295794725588023, |
|
"loss": 1.018, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.015502691268921, |
|
"eval_runtime": 227.1348, |
|
"eval_samples_per_second": 49.438, |
|
"eval_steps_per_second": 6.181, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4881739318370819, |
|
"learning_rate": 0.00025188880969351387, |
|
"loss": 1.0419, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5655686855316162, |
|
"learning_rate": 0.00025081967213114756, |
|
"loss": 0.9913, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5858956575393677, |
|
"learning_rate": 0.00024975053456878114, |
|
"loss": 1.0137, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.46040791273117065, |
|
"learning_rate": 0.00024868139700641483, |
|
"loss": 0.9942, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.0137138366699219, |
|
"eval_runtime": 227.2315, |
|
"eval_samples_per_second": 49.417, |
|
"eval_steps_per_second": 6.179, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6350478529930115, |
|
"learning_rate": 0.00024761225944404847, |
|
"loss": 1.0222, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5849551558494568, |
|
"learning_rate": 0.0002465431218816821, |
|
"loss": 1.0154, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.42272746562957764, |
|
"learning_rate": 0.00024547398431931574, |
|
"loss": 1.0189, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.4941234886646271, |
|
"learning_rate": 0.0002444048467569494, |
|
"loss": 1.0114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.0054244995117188, |
|
"eval_runtime": 227.5101, |
|
"eval_samples_per_second": 49.356, |
|
"eval_steps_per_second": 6.171, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.4647514522075653, |
|
"learning_rate": 0.000243335709194583, |
|
"loss": 1.0339, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.4477405548095703, |
|
"learning_rate": 0.00024226657163221665, |
|
"loss": 1.0285, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4374540448188782, |
|
"learning_rate": 0.0002411974340698503, |
|
"loss": 1.0008, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.39951157569885254, |
|
"learning_rate": 0.00024012829650748392, |
|
"loss": 0.9829, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.0040351152420044, |
|
"eval_runtime": 227.8754, |
|
"eval_samples_per_second": 49.277, |
|
"eval_steps_per_second": 6.161, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.557092547416687, |
|
"learning_rate": 0.00023905915894511758, |
|
"loss": 0.9946, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.48575475811958313, |
|
"learning_rate": 0.00023799002138275122, |
|
"loss": 1.0176, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.4411045014858246, |
|
"learning_rate": 0.00023692088382038488, |
|
"loss": 1.0372, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.46225762367248535, |
|
"learning_rate": 0.0002358517462580185, |
|
"loss": 1.0448, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.0029548406600952, |
|
"eval_runtime": 227.9278, |
|
"eval_samples_per_second": 49.266, |
|
"eval_steps_per_second": 6.16, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3242034912109375, |
|
"learning_rate": 0.00023478260869565215, |
|
"loss": 1.0389, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6357295513153076, |
|
"learning_rate": 0.0002337134711332858, |
|
"loss": 0.97, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6022586226463318, |
|
"learning_rate": 0.00023264433357091945, |
|
"loss": 0.9643, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.36969560384750366, |
|
"learning_rate": 0.0002315751960085531, |
|
"loss": 1.0139, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 0.9994259476661682, |
|
"eval_runtime": 227.6329, |
|
"eval_samples_per_second": 49.329, |
|
"eval_steps_per_second": 6.168, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.4300327003002167, |
|
"learning_rate": 0.00023050605844618672, |
|
"loss": 0.9854, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5460650324821472, |
|
"learning_rate": 0.00022943692088382036, |
|
"loss": 1.0384, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5851055383682251, |
|
"learning_rate": 0.00022836778332145402, |
|
"loss": 1.0378, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.48823001980781555, |
|
"learning_rate": 0.00022729864575908766, |
|
"loss": 0.995, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.995897650718689, |
|
"eval_runtime": 227.1636, |
|
"eval_samples_per_second": 49.431, |
|
"eval_steps_per_second": 6.181, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.49934253096580505, |
|
"learning_rate": 0.00022622950819672127, |
|
"loss": 1.0244, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4276964068412781, |
|
"learning_rate": 0.00022516037063435493, |
|
"loss": 0.9814, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4696836471557617, |
|
"learning_rate": 0.00022409123307198857, |
|
"loss": 1.011, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.8325011730194092, |
|
"learning_rate": 0.00022302209550962223, |
|
"loss": 1.0064, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 0.9960118532180786, |
|
"eval_runtime": 227.2098, |
|
"eval_samples_per_second": 49.421, |
|
"eval_steps_per_second": 6.179, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.45505791902542114, |
|
"learning_rate": 0.00022195295794725584, |
|
"loss": 0.9677, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5271835327148438, |
|
"learning_rate": 0.0002208838203848895, |
|
"loss": 1.0595, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4093138873577118, |
|
"learning_rate": 0.00021981468282252314, |
|
"loss": 0.9966, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.44212207198143005, |
|
"learning_rate": 0.0002187455452601568, |
|
"loss": 0.9898, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 0.9923149943351746, |
|
"eval_runtime": 227.4616, |
|
"eval_samples_per_second": 49.367, |
|
"eval_steps_per_second": 6.172, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4539279341697693, |
|
"learning_rate": 0.0002176764076977904, |
|
"loss": 0.9675, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.4204266369342804, |
|
"learning_rate": 0.00021660727013542407, |
|
"loss": 1.0235, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7492608428001404, |
|
"learning_rate": 0.0002155381325730577, |
|
"loss": 0.9584, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5321412086486816, |
|
"learning_rate": 0.00021446899501069137, |
|
"loss": 0.9828, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 0.9924358129501343, |
|
"eval_runtime": 227.9389, |
|
"eval_samples_per_second": 49.263, |
|
"eval_steps_per_second": 6.16, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5250059962272644, |
|
"learning_rate": 0.00021339985744832498, |
|
"loss": 1.0105, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.45952174067497253, |
|
"learning_rate": 0.00021233071988595865, |
|
"loss": 0.9956, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.45821675658226013, |
|
"learning_rate": 0.00021126158232359228, |
|
"loss": 1.0208, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.4415878355503082, |
|
"learning_rate": 0.00021019244476122595, |
|
"loss": 1.0286, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.9904425740242004, |
|
"eval_runtime": 227.2284, |
|
"eval_samples_per_second": 49.417, |
|
"eval_steps_per_second": 6.179, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.4214901626110077, |
|
"learning_rate": 0.00020912330719885958, |
|
"loss": 0.9945, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4757830500602722, |
|
"learning_rate": 0.0002080541696364932, |
|
"loss": 1.0125, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.48218998312950134, |
|
"learning_rate": 0.00020698503207412685, |
|
"loss": 0.987, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5465131998062134, |
|
"learning_rate": 0.0002059158945117605, |
|
"loss": 0.9945, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 0.9884626865386963, |
|
"eval_runtime": 227.9678, |
|
"eval_samples_per_second": 49.257, |
|
"eval_steps_per_second": 6.159, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6086324453353882, |
|
"learning_rate": 0.00020484675694939415, |
|
"loss": 0.9763, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4712287187576294, |
|
"learning_rate": 0.00020377761938702776, |
|
"loss": 1.0055, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.46066540479660034, |
|
"learning_rate": 0.00020270848182466143, |
|
"loss": 1.0283, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.412824809551239, |
|
"learning_rate": 0.00020163934426229506, |
|
"loss": 0.9841, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 0.9868392944335938, |
|
"eval_runtime": 227.472, |
|
"eval_samples_per_second": 49.364, |
|
"eval_steps_per_second": 6.172, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.4112266004085541, |
|
"learning_rate": 0.00020057020669992872, |
|
"loss": 0.9619, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.41810086369514465, |
|
"learning_rate": 0.00019950106913756233, |
|
"loss": 0.9915, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.4301850497722626, |
|
"learning_rate": 0.000198431931575196, |
|
"loss": 0.9889, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.452982634305954, |
|
"learning_rate": 0.00019736279401282963, |
|
"loss": 1.0032, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 0.9845861792564392, |
|
"eval_runtime": 227.9942, |
|
"eval_samples_per_second": 49.251, |
|
"eval_steps_per_second": 6.158, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5018645524978638, |
|
"learning_rate": 0.0001962936564504633, |
|
"loss": 0.9595, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.44759634137153625, |
|
"learning_rate": 0.0001952245188880969, |
|
"loss": 0.9759, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5419363379478455, |
|
"learning_rate": 0.00019415538132573057, |
|
"loss": 0.9756, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5175866484642029, |
|
"learning_rate": 0.0001930862437633642, |
|
"loss": 0.9977, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 0.9829397797584534, |
|
"eval_runtime": 227.4734, |
|
"eval_samples_per_second": 49.364, |
|
"eval_steps_per_second": 6.172, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7049946188926697, |
|
"learning_rate": 0.00019201710620099787, |
|
"loss": 0.9869, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4388104975223541, |
|
"learning_rate": 0.00019094796863863148, |
|
"loss": 1.0134, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.488154798746109, |
|
"learning_rate": 0.0001898788310762651, |
|
"loss": 1.0117, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.39673909544944763, |
|
"learning_rate": 0.00018880969351389878, |
|
"loss": 0.9987, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 0.9813041687011719, |
|
"eval_runtime": 227.269, |
|
"eval_samples_per_second": 49.408, |
|
"eval_steps_per_second": 6.178, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.4031273424625397, |
|
"learning_rate": 0.0001877405559515324, |
|
"loss": 0.9711, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5331501364707947, |
|
"learning_rate": 0.00018667141838916605, |
|
"loss": 1.0479, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.4277609586715698, |
|
"learning_rate": 0.00018560228082679968, |
|
"loss": 1.0113, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5301382541656494, |
|
"learning_rate": 0.00018453314326443335, |
|
"loss": 1.029, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 0.98179030418396, |
|
"eval_runtime": 227.6248, |
|
"eval_samples_per_second": 49.331, |
|
"eval_steps_per_second": 6.168, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.43908432126045227, |
|
"learning_rate": 0.00018346400570206698, |
|
"loss": 0.976, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.45188236236572266, |
|
"learning_rate": 0.00018239486813970065, |
|
"loss": 1.0183, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.3613215386867523, |
|
"learning_rate": 0.00018132573057733425, |
|
"loss": 0.9799, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5476358532905579, |
|
"learning_rate": 0.00018025659301496792, |
|
"loss": 0.999, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.981633186340332, |
|
"eval_runtime": 227.815, |
|
"eval_samples_per_second": 49.29, |
|
"eval_steps_per_second": 6.163, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5802431106567383, |
|
"learning_rate": 0.00017918745545260155, |
|
"loss": 1.0214, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.45726877450942993, |
|
"learning_rate": 0.00017811831789023522, |
|
"loss": 0.9812, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4250318706035614, |
|
"learning_rate": 0.00017704918032786883, |
|
"loss": 0.9646, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.462782621383667, |
|
"learning_rate": 0.0001759800427655025, |
|
"loss": 1.0018, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 0.9794904589653015, |
|
"eval_runtime": 227.5588, |
|
"eval_samples_per_second": 49.345, |
|
"eval_steps_per_second": 6.17, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4206041693687439, |
|
"learning_rate": 0.00017491090520313613, |
|
"loss": 1.0011, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.38388872146606445, |
|
"learning_rate": 0.0001738417676407698, |
|
"loss": 0.9974, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4569980204105377, |
|
"learning_rate": 0.0001727726300784034, |
|
"loss": 0.9873, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.44570407271385193, |
|
"learning_rate": 0.00017170349251603703, |
|
"loss": 0.9819, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 0.9782843589782715, |
|
"eval_runtime": 227.3655, |
|
"eval_samples_per_second": 49.387, |
|
"eval_steps_per_second": 6.175, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.4842546284198761, |
|
"learning_rate": 0.0001706343549536707, |
|
"loss": 1.022, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.3910045921802521, |
|
"learning_rate": 0.00016956521739130433, |
|
"loss": 0.9678, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4159318804740906, |
|
"learning_rate": 0.00016849607982893797, |
|
"loss": 0.9746, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4413444995880127, |
|
"learning_rate": 0.0001674269422665716, |
|
"loss": 0.9651, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.9774540066719055, |
|
"eval_runtime": 227.3235, |
|
"eval_samples_per_second": 49.397, |
|
"eval_steps_per_second": 6.176, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.593654453754425, |
|
"learning_rate": 0.00016635780470420527, |
|
"loss": 0.9888, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4957660734653473, |
|
"learning_rate": 0.0001652886671418389, |
|
"loss": 1.0392, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4909263849258423, |
|
"learning_rate": 0.00016421952957947254, |
|
"loss": 0.9973, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.40913325548171997, |
|
"learning_rate": 0.00016315039201710618, |
|
"loss": 0.9688, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 0.9764226675033569, |
|
"eval_runtime": 228.0151, |
|
"eval_samples_per_second": 49.247, |
|
"eval_steps_per_second": 6.157, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.3790026605129242, |
|
"learning_rate": 0.00016208125445473984, |
|
"loss": 0.9521, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.3385336399078369, |
|
"learning_rate": 0.00016101211689237348, |
|
"loss": 0.9764, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.43496620655059814, |
|
"learning_rate": 0.00015994297933000714, |
|
"loss": 0.9575, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.47056716680526733, |
|
"learning_rate": 0.00015887384176764075, |
|
"loss": 0.985, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 0.9764449000358582, |
|
"eval_runtime": 227.4751, |
|
"eval_samples_per_second": 49.364, |
|
"eval_steps_per_second": 6.172, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.4433695673942566, |
|
"learning_rate": 0.0001578047042052744, |
|
"loss": 0.9669, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.40916725993156433, |
|
"learning_rate": 0.00015673556664290805, |
|
"loss": 1.0049, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.4507070779800415, |
|
"learning_rate": 0.0001556664290805417, |
|
"loss": 1.01, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.46380680799484253, |
|
"learning_rate": 0.00015459729151817532, |
|
"loss": 1.0002, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.9757907390594482, |
|
"eval_runtime": 227.4869, |
|
"eval_samples_per_second": 49.361, |
|
"eval_steps_per_second": 6.172, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.42961764335632324, |
|
"learning_rate": 0.00015352815395580896, |
|
"loss": 0.965, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.3826574683189392, |
|
"learning_rate": 0.00015245901639344262, |
|
"loss": 0.9412, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.4790586829185486, |
|
"learning_rate": 0.00015138987883107623, |
|
"loss": 0.9941, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.39779818058013916, |
|
"learning_rate": 0.0001503207412687099, |
|
"loss": 0.9649, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.9745392799377441, |
|
"eval_runtime": 227.4768, |
|
"eval_samples_per_second": 49.363, |
|
"eval_steps_per_second": 6.172, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.49817249178886414, |
|
"learning_rate": 0.00014925160370634355, |
|
"loss": 0.9573, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.373766154050827, |
|
"learning_rate": 0.0001481824661439772, |
|
"loss": 0.9868, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.3962858021259308, |
|
"learning_rate": 0.00014711332858161083, |
|
"loss": 0.9495, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.4002228081226349, |
|
"learning_rate": 0.00014604419101924446, |
|
"loss": 0.9574, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 0.972939133644104, |
|
"eval_runtime": 227.5154, |
|
"eval_samples_per_second": 49.355, |
|
"eval_steps_per_second": 6.171, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7709822058677673, |
|
"learning_rate": 0.0001449750534568781, |
|
"loss": 1.007, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.42385196685791016, |
|
"learning_rate": 0.00014390591589451173, |
|
"loss": 1.0022, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.4056071937084198, |
|
"learning_rate": 0.0001428367783321454, |
|
"loss": 0.9409, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.4253057539463043, |
|
"learning_rate": 0.00014176764076977903, |
|
"loss": 0.9587, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 0.9732517004013062, |
|
"eval_runtime": 227.3891, |
|
"eval_samples_per_second": 49.382, |
|
"eval_steps_per_second": 6.174, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.4211929142475128, |
|
"learning_rate": 0.00014069850320741267, |
|
"loss": 1.0021, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.4413307011127472, |
|
"learning_rate": 0.0001396293656450463, |
|
"loss": 1.0075, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.42298370599746704, |
|
"learning_rate": 0.00013856022808267997, |
|
"loss": 0.9788, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.3264484703540802, |
|
"learning_rate": 0.0001374910905203136, |
|
"loss": 0.9904, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.9725102186203003, |
|
"eval_runtime": 227.1577, |
|
"eval_samples_per_second": 49.433, |
|
"eval_steps_per_second": 6.181, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.3268262445926666, |
|
"learning_rate": 0.00013642195295794724, |
|
"loss": 0.9549, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.3310069143772125, |
|
"learning_rate": 0.00013535281539558088, |
|
"loss": 0.9848, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.38187867403030396, |
|
"learning_rate": 0.00013428367783321454, |
|
"loss": 0.9802, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.37355056405067444, |
|
"learning_rate": 0.00013321454027084818, |
|
"loss": 0.9959, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.9710414409637451, |
|
"eval_runtime": 227.6662, |
|
"eval_samples_per_second": 49.322, |
|
"eval_steps_per_second": 6.167, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.48310089111328125, |
|
"learning_rate": 0.0001321454027084818, |
|
"loss": 1.0199, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.3701138198375702, |
|
"learning_rate": 0.00013107626514611545, |
|
"loss": 0.9983, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.359737366437912, |
|
"learning_rate": 0.0001300071275837491, |
|
"loss": 0.9747, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5300225615501404, |
|
"learning_rate": 0.00012893799002138275, |
|
"loss": 1.0007, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 0.9719007015228271, |
|
"eval_runtime": 227.4887, |
|
"eval_samples_per_second": 49.361, |
|
"eval_steps_per_second": 6.172, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.43651485443115234, |
|
"learning_rate": 0.00012786885245901638, |
|
"loss": 0.9885, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.4850105047225952, |
|
"learning_rate": 0.00012679971489665002, |
|
"loss": 1.0109, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.45454463362693787, |
|
"learning_rate": 0.00012573057733428366, |
|
"loss": 0.9987, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.40239349007606506, |
|
"learning_rate": 0.0001246614397719173, |
|
"loss": 0.9995, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 0.9698851108551025, |
|
"eval_runtime": 227.4941, |
|
"eval_samples_per_second": 49.36, |
|
"eval_steps_per_second": 6.172, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.4248133897781372, |
|
"learning_rate": 0.00012359230220955095, |
|
"loss": 1.0366, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.4477006494998932, |
|
"learning_rate": 0.0001225231646471846, |
|
"loss": 0.9866, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.45568859577178955, |
|
"learning_rate": 0.00012145402708481824, |
|
"loss": 0.9754, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4142056107521057, |
|
"learning_rate": 0.00012038488952245188, |
|
"loss": 0.9394, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.9695276021957397, |
|
"eval_runtime": 227.2433, |
|
"eval_samples_per_second": 49.414, |
|
"eval_steps_per_second": 6.178, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.44050607085227966, |
|
"learning_rate": 0.00011931575196008553, |
|
"loss": 0.9709, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.45126873254776, |
|
"learning_rate": 0.00011824661439771916, |
|
"loss": 1.0009, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.47706660628318787, |
|
"learning_rate": 0.00011717747683535281, |
|
"loss": 0.9789, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.43811845779418945, |
|
"learning_rate": 0.00011610833927298645, |
|
"loss": 0.9558, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 0.9690415263175964, |
|
"eval_runtime": 227.2613, |
|
"eval_samples_per_second": 49.41, |
|
"eval_steps_per_second": 6.178, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5149155855178833, |
|
"learning_rate": 0.0001150392017106201, |
|
"loss": 0.9848, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.482454389333725, |
|
"learning_rate": 0.00011397006414825373, |
|
"loss": 0.9798, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.3987056612968445, |
|
"learning_rate": 0.00011290092658588738, |
|
"loss": 0.9388, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.4491436183452606, |
|
"learning_rate": 0.00011183178902352102, |
|
"loss": 0.9512, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 0.9684551954269409, |
|
"eval_runtime": 227.6007, |
|
"eval_samples_per_second": 49.336, |
|
"eval_steps_per_second": 6.169, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.4160451889038086, |
|
"learning_rate": 0.00011076265146115467, |
|
"loss": 0.9869, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5239384770393372, |
|
"learning_rate": 0.00010969351389878829, |
|
"loss": 0.9676, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.42906704545021057, |
|
"learning_rate": 0.00010862437633642194, |
|
"loss": 0.9611, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3256273567676544, |
|
"learning_rate": 0.00010755523877405558, |
|
"loss": 0.9908, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.967640221118927, |
|
"eval_runtime": 227.6932, |
|
"eval_samples_per_second": 49.316, |
|
"eval_steps_per_second": 6.166, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.42826366424560547, |
|
"learning_rate": 0.00010648610121168923, |
|
"loss": 0.9655, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.46543028950691223, |
|
"learning_rate": 0.00010541696364932286, |
|
"loss": 0.9607, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5507635474205017, |
|
"learning_rate": 0.00010434782608695651, |
|
"loss": 0.9864, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.42310115694999695, |
|
"learning_rate": 0.00010327868852459015, |
|
"loss": 0.9611, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 0.967984676361084, |
|
"eval_runtime": 227.9969, |
|
"eval_samples_per_second": 49.251, |
|
"eval_steps_per_second": 6.158, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4114033281803131, |
|
"learning_rate": 0.0001022095509622238, |
|
"loss": 0.9503, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.33842840790748596, |
|
"learning_rate": 0.00010114041339985743, |
|
"loss": 0.939, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.35206928849220276, |
|
"learning_rate": 0.00010007127583749108, |
|
"loss": 1.0102, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.45287784934043884, |
|
"learning_rate": 9.900213827512472e-05, |
|
"loss": 0.9444, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 0.9666356444358826, |
|
"eval_runtime": 227.3836, |
|
"eval_samples_per_second": 49.384, |
|
"eval_steps_per_second": 6.175, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5042274594306946, |
|
"learning_rate": 9.793300071275837e-05, |
|
"loss": 0.9986, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.4130496084690094, |
|
"learning_rate": 9.686386315039202e-05, |
|
"loss": 0.9481, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.4442286491394043, |
|
"learning_rate": 9.579472558802566e-05, |
|
"loss": 0.9832, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5206668972969055, |
|
"learning_rate": 9.47255880256593e-05, |
|
"loss": 0.9668, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 0.9656959772109985, |
|
"eval_runtime": 228.0783, |
|
"eval_samples_per_second": 49.233, |
|
"eval_steps_per_second": 6.156, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4653392434120178, |
|
"learning_rate": 9.365645046329294e-05, |
|
"loss": 0.9966, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.44000840187072754, |
|
"learning_rate": 9.258731290092659e-05, |
|
"loss": 1.0002, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4088016450405121, |
|
"learning_rate": 9.151817533856021e-05, |
|
"loss": 0.9868, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.4259507656097412, |
|
"learning_rate": 9.044903777619385e-05, |
|
"loss": 0.9887, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.965515673160553, |
|
"eval_runtime": 228.1355, |
|
"eval_samples_per_second": 49.221, |
|
"eval_steps_per_second": 6.154, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.42707499861717224, |
|
"learning_rate": 8.93799002138275e-05, |
|
"loss": 1.0194, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.38201627135276794, |
|
"learning_rate": 8.831076265146115e-05, |
|
"loss": 0.9835, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4495198428630829, |
|
"learning_rate": 8.724162508909478e-05, |
|
"loss": 0.9911, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4431660771369934, |
|
"learning_rate": 8.617248752672843e-05, |
|
"loss": 0.9759, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 0.964968204498291, |
|
"eval_runtime": 227.5676, |
|
"eval_samples_per_second": 49.344, |
|
"eval_steps_per_second": 6.17, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.43940964341163635, |
|
"learning_rate": 8.510334996436207e-05, |
|
"loss": 0.958, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.4617341458797455, |
|
"learning_rate": 8.403421240199572e-05, |
|
"loss": 0.9383, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.33945685625076294, |
|
"learning_rate": 8.296507483962936e-05, |
|
"loss": 0.9541, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.45421719551086426, |
|
"learning_rate": 8.1895937277263e-05, |
|
"loss": 0.9567, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 0.9648857712745667, |
|
"eval_runtime": 227.4739, |
|
"eval_samples_per_second": 49.364, |
|
"eval_steps_per_second": 6.172, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.3823011815547943, |
|
"learning_rate": 8.082679971489664e-05, |
|
"loss": 0.9496, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.39452335238456726, |
|
"learning_rate": 7.975766215253029e-05, |
|
"loss": 0.9633, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3680623769760132, |
|
"learning_rate": 7.868852459016393e-05, |
|
"loss": 0.988, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5220345854759216, |
|
"learning_rate": 7.761938702779758e-05, |
|
"loss": 0.9858, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 0.9640631079673767, |
|
"eval_runtime": 227.6642, |
|
"eval_samples_per_second": 49.323, |
|
"eval_steps_per_second": 6.167, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.39960744976997375, |
|
"learning_rate": 7.655024946543121e-05, |
|
"loss": 0.9936, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.43278273940086365, |
|
"learning_rate": 7.548111190306486e-05, |
|
"loss": 0.9618, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3960849642753601, |
|
"learning_rate": 7.44119743406985e-05, |
|
"loss": 0.999, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.4232223629951477, |
|
"learning_rate": 7.334283677833213e-05, |
|
"loss": 0.9702, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.9640046954154968, |
|
"eval_runtime": 227.4939, |
|
"eval_samples_per_second": 49.36, |
|
"eval_steps_per_second": 6.172, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3899974822998047, |
|
"learning_rate": 7.227369921596578e-05, |
|
"loss": 0.9744, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.38672515749931335, |
|
"learning_rate": 7.120456165359942e-05, |
|
"loss": 0.9424, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.33859318494796753, |
|
"learning_rate": 7.013542409123307e-05, |
|
"loss": 0.9721, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4227427542209625, |
|
"learning_rate": 6.90662865288667e-05, |
|
"loss": 0.9731, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 0.9635033011436462, |
|
"eval_runtime": 227.4766, |
|
"eval_samples_per_second": 49.363, |
|
"eval_steps_per_second": 6.172, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6176052689552307, |
|
"learning_rate": 6.799714896650034e-05, |
|
"loss": 0.99, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.3367667496204376, |
|
"learning_rate": 6.692801140413399e-05, |
|
"loss": 0.965, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.41608142852783203, |
|
"learning_rate": 6.585887384176763e-05, |
|
"loss": 0.9678, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.3237856328487396, |
|
"learning_rate": 6.478973627940128e-05, |
|
"loss": 0.927, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 0.9633656144142151, |
|
"eval_runtime": 227.8317, |
|
"eval_samples_per_second": 49.286, |
|
"eval_steps_per_second": 6.162, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.3576589822769165, |
|
"learning_rate": 6.372059871703493e-05, |
|
"loss": 0.9729, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.47324052453041077, |
|
"learning_rate": 6.265146115466856e-05, |
|
"loss": 0.989, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.42953863739967346, |
|
"learning_rate": 6.158232359230221e-05, |
|
"loss": 0.9566, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4253891110420227, |
|
"learning_rate": 6.051318602993584e-05, |
|
"loss": 0.9878, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.9624494314193726, |
|
"eval_runtime": 227.67, |
|
"eval_samples_per_second": 49.321, |
|
"eval_steps_per_second": 6.167, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4608158767223358, |
|
"learning_rate": 5.9444048467569485e-05, |
|
"loss": 0.9787, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4022761285305023, |
|
"learning_rate": 5.837491090520313e-05, |
|
"loss": 0.9738, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3572002053260803, |
|
"learning_rate": 5.730577334283677e-05, |
|
"loss": 0.9919, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4154810905456543, |
|
"learning_rate": 5.6236635780470413e-05, |
|
"loss": 0.9861, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 0.962546169757843, |
|
"eval_runtime": 227.8477, |
|
"eval_samples_per_second": 49.283, |
|
"eval_steps_per_second": 6.162, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.36054152250289917, |
|
"learning_rate": 5.5167498218104056e-05, |
|
"loss": 0.9834, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.3918026089668274, |
|
"learning_rate": 5.40983606557377e-05, |
|
"loss": 0.9978, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.4808182120323181, |
|
"learning_rate": 5.302922309337134e-05, |
|
"loss": 0.9481, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.37957093119621277, |
|
"learning_rate": 5.196008553100499e-05, |
|
"loss": 0.9703, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 0.9620444178581238, |
|
"eval_runtime": 228.027, |
|
"eval_samples_per_second": 49.244, |
|
"eval_steps_per_second": 6.157, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.3866080939769745, |
|
"learning_rate": 5.089094796863862e-05, |
|
"loss": 1.0014, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.4046897888183594, |
|
"learning_rate": 4.9821810406272264e-05, |
|
"loss": 0.9911, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.4034290909767151, |
|
"learning_rate": 4.875267284390591e-05, |
|
"loss": 0.908, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.39161401987075806, |
|
"learning_rate": 4.7683535281539556e-05, |
|
"loss": 0.9699, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 0.9614489674568176, |
|
"eval_runtime": 227.849, |
|
"eval_samples_per_second": 49.283, |
|
"eval_steps_per_second": 6.162, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.43111854791641235, |
|
"learning_rate": 4.66143977191732e-05, |
|
"loss": 1.0008, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3769904673099518, |
|
"learning_rate": 4.554526015680684e-05, |
|
"loss": 1.0144, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.466468870639801, |
|
"learning_rate": 4.4476122594440485e-05, |
|
"loss": 0.9589, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.41613641381263733, |
|
"learning_rate": 4.340698503207413e-05, |
|
"loss": 1.0043, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 0.9611164927482605, |
|
"eval_runtime": 227.5578, |
|
"eval_samples_per_second": 49.346, |
|
"eval_steps_per_second": 6.17, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5018350481987, |
|
"learning_rate": 4.2337847469707764e-05, |
|
"loss": 0.9734, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5452864766120911, |
|
"learning_rate": 4.1268709907341407e-05, |
|
"loss": 0.953, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4102606177330017, |
|
"learning_rate": 4.019957234497505e-05, |
|
"loss": 0.9841, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4345894753932953, |
|
"learning_rate": 3.913043478260869e-05, |
|
"loss": 1.0105, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.9609247446060181, |
|
"eval_runtime": 227.5088, |
|
"eval_samples_per_second": 49.356, |
|
"eval_steps_per_second": 6.171, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.34100887179374695, |
|
"learning_rate": 3.8061297220242335e-05, |
|
"loss": 0.9571, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3881888687610626, |
|
"learning_rate": 3.699215965787598e-05, |
|
"loss": 0.9768, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.36524873971939087, |
|
"learning_rate": 3.592302209550962e-05, |
|
"loss": 0.9543, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.39331433176994324, |
|
"learning_rate": 3.485388453314326e-05, |
|
"loss": 0.9705, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.9608638286590576, |
|
"eval_runtime": 227.7591, |
|
"eval_samples_per_second": 49.302, |
|
"eval_steps_per_second": 6.164, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.4295852780342102, |
|
"learning_rate": 3.3784746970776906e-05, |
|
"loss": 0.9623, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.4223569333553314, |
|
"learning_rate": 3.271560940841055e-05, |
|
"loss": 0.964, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.3776945173740387, |
|
"learning_rate": 3.164647184604419e-05, |
|
"loss": 0.9981, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.3786326050758362, |
|
"learning_rate": 3.057733428367783e-05, |
|
"loss": 0.9738, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.9605663418769836, |
|
"eval_runtime": 227.6759, |
|
"eval_samples_per_second": 49.32, |
|
"eval_steps_per_second": 6.167, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5197612643241882, |
|
"learning_rate": 2.950819672131147e-05, |
|
"loss": 0.9834, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.42392510175704956, |
|
"learning_rate": 2.8439059158945114e-05, |
|
"loss": 1.0068, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.3506150543689728, |
|
"learning_rate": 2.736992159657876e-05, |
|
"loss": 0.9518, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.3928501605987549, |
|
"learning_rate": 2.63007840342124e-05, |
|
"loss": 1.0032, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 0.9605409502983093, |
|
"eval_runtime": 227.9108, |
|
"eval_samples_per_second": 49.269, |
|
"eval_steps_per_second": 6.16, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3545394241809845, |
|
"learning_rate": 2.5231646471846042e-05, |
|
"loss": 0.9809, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3934232294559479, |
|
"learning_rate": 2.4162508909479685e-05, |
|
"loss": 0.9812, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.4914129674434662, |
|
"learning_rate": 2.3093371347113328e-05, |
|
"loss": 1.041, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.45427605509757996, |
|
"learning_rate": 2.2024233784746968e-05, |
|
"loss": 0.9473, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 0.9605054259300232, |
|
"eval_runtime": 228.4903, |
|
"eval_samples_per_second": 49.144, |
|
"eval_steps_per_second": 6.145, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3592887818813324, |
|
"learning_rate": 2.095509622238061e-05, |
|
"loss": 0.9616, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.47144436836242676, |
|
"learning_rate": 1.9885958660014253e-05, |
|
"loss": 0.9862, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.45374444127082825, |
|
"learning_rate": 1.8816821097647896e-05, |
|
"loss": 1.006, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.49605900049209595, |
|
"learning_rate": 1.774768353528154e-05, |
|
"loss": 0.9037, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.9600586295127869, |
|
"eval_runtime": 227.3914, |
|
"eval_samples_per_second": 49.382, |
|
"eval_steps_per_second": 6.174, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.388639360666275, |
|
"learning_rate": 1.6678545972915182e-05, |
|
"loss": 0.9601, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.47878703474998474, |
|
"learning_rate": 1.560940841054882e-05, |
|
"loss": 1.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.42967456579208374, |
|
"learning_rate": 1.4540270848182466e-05, |
|
"loss": 0.996, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.3042117953300476, |
|
"learning_rate": 1.3471133285816107e-05, |
|
"loss": 0.9258, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 0.9597083926200867, |
|
"eval_runtime": 227.7172, |
|
"eval_samples_per_second": 49.311, |
|
"eval_steps_per_second": 6.166, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.44863736629486084, |
|
"learning_rate": 1.240199572344975e-05, |
|
"loss": 0.9422, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.47229892015457153, |
|
"learning_rate": 1.1332858161083391e-05, |
|
"loss": 0.9497, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4254206120967865, |
|
"learning_rate": 1.0263720598717034e-05, |
|
"loss": 0.993, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3855699896812439, |
|
"learning_rate": 9.194583036350677e-06, |
|
"loss": 0.9965, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 0.9595157504081726, |
|
"eval_runtime": 227.4432, |
|
"eval_samples_per_second": 49.371, |
|
"eval_steps_per_second": 6.173, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3840392231941223, |
|
"learning_rate": 8.12544547398432e-06, |
|
"loss": 0.9326, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.36666616797447205, |
|
"learning_rate": 7.0563079116179615e-06, |
|
"loss": 0.9729, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.379976749420166, |
|
"learning_rate": 5.9871703492516035e-06, |
|
"loss": 0.9771, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3656114935874939, |
|
"learning_rate": 4.9180327868852455e-06, |
|
"loss": 0.9165, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 0.959417998790741, |
|
"eval_runtime": 227.5694, |
|
"eval_samples_per_second": 49.343, |
|
"eval_steps_per_second": 6.17, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.4715186059474945, |
|
"learning_rate": 3.848895224518888e-06, |
|
"loss": 0.9531, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.4237213730812073, |
|
"learning_rate": 2.7797576621525303e-06, |
|
"loss": 0.9883, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.43288546800613403, |
|
"learning_rate": 1.7106200997861725e-06, |
|
"loss": 0.9835, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3952523171901703, |
|
"learning_rate": 6.414825374198146e-07, |
|
"loss": 1.022, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9593754410743713, |
|
"eval_runtime": 227.652, |
|
"eval_samples_per_second": 49.325, |
|
"eval_steps_per_second": 6.167, |
|
"step": 1400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1403, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 2.634861827260416e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|