|
{ |
|
"best_metric": 0.23529192805290222, |
|
"best_model_checkpoint": "./convnext-base-15ep/checkpoint-9891", |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 16485, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 23.461017608642578, |
|
"learning_rate": 2.999727623294975e-05, |
|
"loss": 2.6353, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 16.223487854003906, |
|
"learning_rate": 2.9989105920986585e-05, |
|
"loss": 1.496, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 21.04163932800293, |
|
"learning_rate": 2.9975492031314045e-05, |
|
"loss": 1.1313, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 34.1530647277832, |
|
"learning_rate": 2.995643950807401e-05, |
|
"loss": 0.8547, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 19.99446678161621, |
|
"learning_rate": 2.9931955270551154e-05, |
|
"loss": 0.8017, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 19.548385620117188, |
|
"learning_rate": 2.990204821066006e-05, |
|
"loss": 0.691, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 27.656845092773438, |
|
"learning_rate": 2.9866729189715972e-05, |
|
"loss": 0.7105, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 14.135828971862793, |
|
"learning_rate": 2.982601103449029e-05, |
|
"loss": 0.6394, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 11.512890815734863, |
|
"learning_rate": 2.977990853255228e-05, |
|
"loss": 0.6012, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 12.752175331115723, |
|
"learning_rate": 2.972843842689871e-05, |
|
"loss": 0.6099, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8934393638170974, |
|
"eval_loss": 0.3667598366737366, |
|
"eval_runtime": 105.5376, |
|
"eval_samples_per_second": 23.83, |
|
"eval_steps_per_second": 1.497, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 32.5348014831543, |
|
"learning_rate": 2.9671619409873295e-05, |
|
"loss": 0.639, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 16.545459747314453, |
|
"learning_rate": 2.9609472116378222e-05, |
|
"loss": 0.5358, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 25.811128616333008, |
|
"learning_rate": 2.954201911638019e-05, |
|
"loss": 0.518, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 18.66850471496582, |
|
"learning_rate": 2.9469284906713715e-05, |
|
"loss": 0.4844, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 19.064212799072266, |
|
"learning_rate": 2.9391295902184625e-05, |
|
"loss": 0.5122, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 20.12409210205078, |
|
"learning_rate": 2.930808042597703e-05, |
|
"loss": 0.474, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 13.67546272277832, |
|
"learning_rate": 2.9219668699367213e-05, |
|
"loss": 0.5351, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 21.155534744262695, |
|
"learning_rate": 2.9126092830748217e-05, |
|
"loss": 0.4939, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 20.379898071289062, |
|
"learning_rate": 2.9027386803969068e-05, |
|
"loss": 0.4351, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 18.86287498474121, |
|
"learning_rate": 2.8923586465992876e-05, |
|
"loss": 0.4853, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 16.228618621826172, |
|
"learning_rate": 2.8814729513878365e-05, |
|
"loss": 0.5086, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.927634194831014, |
|
"eval_loss": 0.27727919816970825, |
|
"eval_runtime": 105.4061, |
|
"eval_samples_per_second": 23.86, |
|
"eval_steps_per_second": 1.499, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 28.413759231567383, |
|
"learning_rate": 2.8700855481089444e-05, |
|
"loss": 0.4694, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 16.885929107666016, |
|
"learning_rate": 2.8582005723137908e-05, |
|
"loss": 0.3474, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 21.655845642089844, |
|
"learning_rate": 2.8458223402564366e-05, |
|
"loss": 0.4257, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 20.78180503845215, |
|
"learning_rate": 2.8329553473262978e-05, |
|
"loss": 0.3752, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 13.279301643371582, |
|
"learning_rate": 2.8196042664155587e-05, |
|
"loss": 0.4485, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 35.08995056152344, |
|
"learning_rate": 2.8057739462221215e-05, |
|
"loss": 0.3814, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 6.2748332023620605, |
|
"learning_rate": 2.791469409488711e-05, |
|
"loss": 0.4063, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 34.69600296020508, |
|
"learning_rate": 2.7766958511787707e-05, |
|
"loss": 0.392, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 19.070589065551758, |
|
"learning_rate": 2.761458636589813e-05, |
|
"loss": 0.4298, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 18.813634872436523, |
|
"learning_rate": 2.7457632994049085e-05, |
|
"loss": 0.4147, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 21.774486541748047, |
|
"learning_rate": 2.7296155396830262e-05, |
|
"loss": 0.386, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9324055666003976, |
|
"eval_loss": 0.2586577832698822, |
|
"eval_runtime": 105.5285, |
|
"eval_samples_per_second": 23.832, |
|
"eval_steps_per_second": 1.497, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 10.948277473449707, |
|
"learning_rate": 2.7130212217889484e-05, |
|
"loss": 0.3943, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 9.30019760131836, |
|
"learning_rate": 2.6959863722635152e-05, |
|
"loss": 0.3148, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 20.882099151611328, |
|
"learning_rate": 2.6785171776349725e-05, |
|
"loss": 0.375, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 25.736265182495117, |
|
"learning_rate": 2.6606199821722166e-05, |
|
"loss": 0.3827, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 22.133760452270508, |
|
"learning_rate": 2.6423012855807538e-05, |
|
"loss": 0.3239, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 10.715078353881836, |
|
"learning_rate": 2.6235677406422072e-05, |
|
"loss": 0.3671, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 7.383324146270752, |
|
"learning_rate": 2.6044261507982356e-05, |
|
"loss": 0.3623, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 13.652631759643555, |
|
"learning_rate": 2.5848834676797335e-05, |
|
"loss": 0.382, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 14.792743682861328, |
|
"learning_rate": 2.5649467885822135e-05, |
|
"loss": 0.3719, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 2.3243019580841064, |
|
"learning_rate": 2.5446233538882924e-05, |
|
"loss": 0.3439, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 39.48076248168945, |
|
"learning_rate": 2.5239205444382054e-05, |
|
"loss": 0.335, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9347912524850894, |
|
"eval_loss": 0.2400086522102356, |
|
"eval_runtime": 105.8322, |
|
"eval_samples_per_second": 23.764, |
|
"eval_steps_per_second": 1.493, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 13.311861038208008, |
|
"learning_rate": 2.5028458788493163e-05, |
|
"loss": 0.2904, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 16.158748626708984, |
|
"learning_rate": 2.4814070107855878e-05, |
|
"loss": 0.3153, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 22.555923461914062, |
|
"learning_rate": 2.4596117261780113e-05, |
|
"loss": 0.3329, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 27.752666473388672, |
|
"learning_rate": 2.4374679403969946e-05, |
|
"loss": 0.3047, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 20.464553833007812, |
|
"learning_rate": 2.4149836953777488e-05, |
|
"loss": 0.2973, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 12.999048233032227, |
|
"learning_rate": 2.3921671566997044e-05, |
|
"loss": 0.2756, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 24.010915756225586, |
|
"learning_rate": 2.3690266106210284e-05, |
|
"loss": 0.3108, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 19.249290466308594, |
|
"learning_rate": 2.345570461069312e-05, |
|
"loss": 0.2996, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 33.51617431640625, |
|
"learning_rate": 2.3218072265895257e-05, |
|
"loss": 0.3192, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 26.002756118774414, |
|
"learning_rate": 2.297745537250347e-05, |
|
"loss": 0.3396, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 16.87458038330078, |
|
"learning_rate": 2.2733941315099883e-05, |
|
"loss": 0.3167, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9339960238568589, |
|
"eval_loss": 0.25986233353614807, |
|
"eval_runtime": 105.6337, |
|
"eval_samples_per_second": 23.809, |
|
"eval_steps_per_second": 1.496, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 16.456005096435547, |
|
"learning_rate": 2.2487618530426604e-05, |
|
"loss": 0.3225, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 14.056130409240723, |
|
"learning_rate": 2.2238576475268268e-05, |
|
"loss": 0.256, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 17.51475715637207, |
|
"learning_rate": 2.1986905593964048e-05, |
|
"loss": 0.2478, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 23.06650161743164, |
|
"learning_rate": 2.173269728556115e-05, |
|
"loss": 0.2827, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 24.29911231994629, |
|
"learning_rate": 2.147604387062149e-05, |
|
"loss": 0.2836, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 16.115610122680664, |
|
"learning_rate": 2.121703855769373e-05, |
|
"loss": 0.271, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 16.511001586914062, |
|
"learning_rate": 2.0955775409462816e-05, |
|
"loss": 0.2915, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 34.321468353271484, |
|
"learning_rate": 2.0692349308589375e-05, |
|
"loss": 0.2576, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 22.157716751098633, |
|
"learning_rate": 2.042685592325123e-05, |
|
"loss": 0.2954, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 19.21451759338379, |
|
"learning_rate": 2.0159391672399725e-05, |
|
"loss": 0.2857, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 28.460418701171875, |
|
"learning_rate": 1.9890053690743337e-05, |
|
"loss": 0.2703, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.941948310139165, |
|
"eval_loss": 0.24395686388015747, |
|
"eval_runtime": 104.8687, |
|
"eval_samples_per_second": 23.982, |
|
"eval_steps_per_second": 1.507, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 16.289432525634766, |
|
"learning_rate": 1.961893979347137e-05, |
|
"loss": 0.306, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 27.070125579833984, |
|
"learning_rate": 1.934614844073054e-05, |
|
"loss": 0.2425, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 14.10284423828125, |
|
"learning_rate": 1.9071778701867247e-05, |
|
"loss": 0.2729, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 8.3997220993042, |
|
"learning_rate": 1.879593021944875e-05, |
|
"loss": 0.2816, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 8.656745910644531, |
|
"learning_rate": 1.851870317307602e-05, |
|
"loss": 0.2411, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 1.0716943740844727, |
|
"learning_rate": 1.8240198243001623e-05, |
|
"loss": 0.2689, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 3.0549349784851074, |
|
"learning_rate": 1.796051657356582e-05, |
|
"loss": 0.2435, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 23.518062591552734, |
|
"learning_rate": 1.7679759736464045e-05, |
|
"loss": 0.2346, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 31.476299285888672, |
|
"learning_rate": 1.739802969385923e-05, |
|
"loss": 0.2478, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 14.344544410705566, |
|
"learning_rate": 1.711542876135233e-05, |
|
"loss": 0.2377, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 0.6573715209960938, |
|
"learning_rate": 1.6832059570824453e-05, |
|
"loss": 0.2638, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9407554671968191, |
|
"eval_loss": 0.2496492713689804, |
|
"eval_runtime": 104.4088, |
|
"eval_samples_per_second": 24.088, |
|
"eval_steps_per_second": 1.513, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 33.705074310302734, |
|
"learning_rate": 1.654802503316416e-05, |
|
"loss": 0.2077, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 33.48544692993164, |
|
"learning_rate": 1.6263428300893422e-05, |
|
"loss": 0.2398, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 18.51409149169922, |
|
"learning_rate": 1.597837273070585e-05, |
|
"loss": 0.217, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 8.902990341186523, |
|
"learning_rate": 1.5692961845930704e-05, |
|
"loss": 0.2538, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 19.85183334350586, |
|
"learning_rate": 1.540729929893649e-05, |
|
"loss": 0.2362, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 8.387295722961426, |
|
"learning_rate": 1.51214888334876e-05, |
|
"loss": 0.2415, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 20.70574378967285, |
|
"learning_rate": 1.4835634247067834e-05, |
|
"loss": 0.2109, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 18.571918487548828, |
|
"learning_rate": 1.454983935318433e-05, |
|
"loss": 0.242, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 15.444602012634277, |
|
"learning_rate": 1.426420794366578e-05, |
|
"loss": 0.1971, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 10.603649139404297, |
|
"learning_rate": 1.3978843750968413e-05, |
|
"loss": 0.2519, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 1.8435286283493042, |
|
"learning_rate": 1.3693850410503614e-05, |
|
"loss": 0.1938, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9431411530815109, |
|
"eval_loss": 0.23655731976032257, |
|
"eval_runtime": 104.2878, |
|
"eval_samples_per_second": 24.116, |
|
"eval_steps_per_second": 1.515, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 16.798791885375977, |
|
"learning_rate": 1.3409331423000765e-05, |
|
"loss": 0.2356, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 18.694393157958984, |
|
"learning_rate": 1.3125390116918962e-05, |
|
"loss": 0.2057, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 2.4685895442962646, |
|
"learning_rate": 1.2842129610921378e-05, |
|
"loss": 0.1852, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 14.801736831665039, |
|
"learning_rate": 1.255965277642572e-05, |
|
"loss": 0.1627, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.14111284911632538, |
|
"learning_rate": 1.2278062200244565e-05, |
|
"loss": 0.2274, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 2.3665616512298584, |
|
"learning_rate": 1.1997460147328984e-05, |
|
"loss": 0.19, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 10.016236305236816, |
|
"learning_rate": 1.1717948523629107e-05, |
|
"loss": 0.1982, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 9.423545837402344, |
|
"learning_rate": 1.1439628839085037e-05, |
|
"loss": 0.2109, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 7.491578578948975, |
|
"learning_rate": 1.1162602170761611e-05, |
|
"loss": 0.2033, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 1.5837846994400024, |
|
"learning_rate": 1.0886969126140309e-05, |
|
"loss": 0.2583, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 30.90834617614746, |
|
"learning_rate": 1.0612829806581792e-05, |
|
"loss": 0.1789, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9487077534791253, |
|
"eval_loss": 0.23529192805290222, |
|
"eval_runtime": 104.3958, |
|
"eval_samples_per_second": 24.091, |
|
"eval_steps_per_second": 1.513, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 9.547430038452148, |
|
"learning_rate": 1.0340283770972167e-05, |
|
"loss": 0.1811, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 3.7667858600616455, |
|
"learning_rate": 1.0069429999566298e-05, |
|
"loss": 0.1631, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 35.338844299316406, |
|
"learning_rate": 9.800366858041242e-06, |
|
"loss": 0.1908, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 19.66462516784668, |
|
"learning_rate": 9.533192061772919e-06, |
|
"loss": 0.1571, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 9.39445686340332, |
|
"learning_rate": 9.268002640348889e-06, |
|
"loss": 0.1904, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 4.144321441650391, |
|
"learning_rate": 9.004894902330242e-06, |
|
"loss": 0.1559, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 10.969382286071777, |
|
"learning_rate": 8.743964400275304e-06, |
|
"loss": 0.2004, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 24.618282318115234, |
|
"learning_rate": 8.485305896037929e-06, |
|
"loss": 0.2064, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 25.1768856048584, |
|
"learning_rate": 8.229013326352934e-06, |
|
"loss": 0.1981, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 10.52382755279541, |
|
"learning_rate": 7.975179768721187e-06, |
|
"loss": 0.2, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 15.742509841918945, |
|
"learning_rate": 7.723897407606758e-06, |
|
"loss": 0.1738, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9499005964214712, |
|
"eval_loss": 0.237999826669693, |
|
"eval_runtime": 107.0756, |
|
"eval_samples_per_second": 23.488, |
|
"eval_steps_per_second": 1.476, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.01, |
|
"grad_norm": 0.739620566368103, |
|
"learning_rate": 7.475257500958387e-06, |
|
"loss": 0.1941, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 10.1, |
|
"grad_norm": 15.124515533447266, |
|
"learning_rate": 7.229350347067426e-06, |
|
"loss": 0.1358, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 10.19, |
|
"grad_norm": 16.16267204284668, |
|
"learning_rate": 6.986265251774287e-06, |
|
"loss": 0.1634, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"grad_norm": 9.51611614227295, |
|
"learning_rate": 6.746090496035372e-06, |
|
"loss": 0.1561, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"grad_norm": 17.76508140563965, |
|
"learning_rate": 6.508913303862144e-06, |
|
"loss": 0.1767, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 10.46, |
|
"grad_norm": 9.271286964416504, |
|
"learning_rate": 6.274819810644087e-06, |
|
"loss": 0.1907, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.56, |
|
"grad_norm": 47.73799514770508, |
|
"learning_rate": 6.043895031866995e-06, |
|
"loss": 0.1628, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 31.59560775756836, |
|
"learning_rate": 5.8162228322380155e-06, |
|
"loss": 0.1618, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"grad_norm": 8.150809288024902, |
|
"learning_rate": 5.591885895228557e-06, |
|
"loss": 0.1795, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 10.83, |
|
"grad_norm": 9.975177764892578, |
|
"learning_rate": 5.370965693046249e-06, |
|
"loss": 0.1705, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 39.83053207397461, |
|
"learning_rate": 5.153542457046737e-06, |
|
"loss": 0.1924, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.9463220675944334, |
|
"eval_loss": 0.24584931135177612, |
|
"eval_runtime": 105.078, |
|
"eval_samples_per_second": 23.935, |
|
"eval_steps_per_second": 1.504, |
|
"step": 12089 |
|
}, |
|
{ |
|
"epoch": 11.01, |
|
"grad_norm": 0.06365140527486801, |
|
"learning_rate": 4.9396951485961885e-06, |
|
"loss": 0.1239, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 11.1, |
|
"grad_norm": 7.116461277008057, |
|
"learning_rate": 4.729501430394933e-06, |
|
"loss": 0.1739, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"grad_norm": 0.722091019153595, |
|
"learning_rate": 4.523037638272822e-06, |
|
"loss": 0.1611, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 11.28, |
|
"grad_norm": 15.62628173828125, |
|
"learning_rate": 4.320378753466392e-06, |
|
"loss": 0.1386, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 11.37, |
|
"grad_norm": 29.914321899414062, |
|
"learning_rate": 4.121598375388027e-06, |
|
"loss": 0.129, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.46, |
|
"grad_norm": 16.476482391357422, |
|
"learning_rate": 3.926768694896931e-06, |
|
"loss": 0.1358, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 13.462120056152344, |
|
"learning_rate": 3.7359604680816612e-06, |
|
"loss": 0.151, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 11.65, |
|
"grad_norm": 19.780019760131836, |
|
"learning_rate": 3.5492429905636857e-06, |
|
"loss": 0.1715, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 11.74, |
|
"grad_norm": 2.3141283988952637, |
|
"learning_rate": 3.3666840723314145e-06, |
|
"loss": 0.1304, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 14.787359237670898, |
|
"learning_rate": 3.188350013113671e-06, |
|
"loss": 0.1528, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"grad_norm": 20.494741439819336, |
|
"learning_rate": 3.014305578301712e-06, |
|
"loss": 0.1628, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9491053677932405, |
|
"eval_loss": 0.2434205859899521, |
|
"eval_runtime": 105.1912, |
|
"eval_samples_per_second": 23.909, |
|
"eval_steps_per_second": 1.502, |
|
"step": 13188 |
|
}, |
|
{ |
|
"epoch": 12.01, |
|
"grad_norm": 11.916013717651367, |
|
"learning_rate": 2.8446139754284486e-06, |
|
"loss": 0.1375, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.6161535978317261, |
|
"learning_rate": 2.6793368312134275e-06, |
|
"loss": 0.1324, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 12.19, |
|
"grad_norm": 21.261964797973633, |
|
"learning_rate": 2.5185341691819315e-06, |
|
"loss": 0.1538, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 12.28, |
|
"grad_norm": 1.320718765258789, |
|
"learning_rate": 2.36226438786627e-06, |
|
"loss": 0.1647, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 3.367286205291748, |
|
"learning_rate": 2.210584239597296e-06, |
|
"loss": 0.1459, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 12.47, |
|
"grad_norm": 25.336097717285156, |
|
"learning_rate": 2.063548809893678e-06, |
|
"loss": 0.1299, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"grad_norm": 29.0968074798584, |
|
"learning_rate": 1.9212114974565664e-06, |
|
"loss": 0.15, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 12.65, |
|
"grad_norm": 10.785637855529785, |
|
"learning_rate": 1.783623994776848e-06, |
|
"loss": 0.122, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"grad_norm": 6.101187229156494, |
|
"learning_rate": 1.6508362693620305e-06, |
|
"loss": 0.1714, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.83, |
|
"grad_norm": 0.2466667890548706, |
|
"learning_rate": 1.5228965455896054e-06, |
|
"loss": 0.1639, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 12.92, |
|
"grad_norm": 17.598079681396484, |
|
"learning_rate": 1.3998512871934415e-06, |
|
"loss": 0.1431, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.9499005964214712, |
|
"eval_loss": 0.23904123902320862, |
|
"eval_runtime": 105.0334, |
|
"eval_samples_per_second": 23.945, |
|
"eval_steps_per_second": 1.504, |
|
"step": 14287 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 4.807325839996338, |
|
"learning_rate": 1.2817451803896063e-06, |
|
"loss": 0.1444, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 13.1, |
|
"grad_norm": 0.20366211235523224, |
|
"learning_rate": 1.1686211176477208e-06, |
|
"loss": 0.1555, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 13.19, |
|
"grad_norm": 16.46355628967285, |
|
"learning_rate": 1.0605201821137417e-06, |
|
"loss": 0.1512, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 11.976471900939941, |
|
"learning_rate": 9.57481632689845e-07, |
|
"loss": 0.1741, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 13.38, |
|
"grad_norm": 27.499574661254883, |
|
"learning_rate": 8.595428897768071e-07, |
|
"loss": 0.1509, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 13.47, |
|
"grad_norm": 2.7120893001556396, |
|
"learning_rate": 7.66739521684079e-07, |
|
"loss": 0.1601, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 13.56, |
|
"grad_norm": 37.196876525878906, |
|
"learning_rate": 6.791052317124824e-07, |
|
"loss": 0.131, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 13.65, |
|
"grad_norm": 9.931727409362793, |
|
"learning_rate": 5.966718459142196e-07, |
|
"loss": 0.1411, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.74, |
|
"grad_norm": 10.333179473876953, |
|
"learning_rate": 5.194693015346313e-07, |
|
"loss": 0.1452, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 13.83, |
|
"grad_norm": 4.551497936248779, |
|
"learning_rate": 4.4752563613992993e-07, |
|
"loss": 0.142, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 19.9360408782959, |
|
"learning_rate": 3.808669774348167e-07, |
|
"loss": 0.1432, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.9502982107355865, |
|
"eval_loss": 0.23912423849105835, |
|
"eval_runtime": 104.6114, |
|
"eval_samples_per_second": 24.041, |
|
"eval_steps_per_second": 1.51, |
|
"step": 15386 |
|
}, |
|
{ |
|
"epoch": 14.01, |
|
"grad_norm": 10.347683906555176, |
|
"learning_rate": 3.195175337737194e-07, |
|
"loss": 0.1464, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 14.1, |
|
"grad_norm": 14.291678428649902, |
|
"learning_rate": 2.6349958536906303e-07, |
|
"loss": 0.1387, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"grad_norm": 5.145290851593018, |
|
"learning_rate": 2.1283347619979243e-07, |
|
"loss": 0.1493, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 14.29, |
|
"grad_norm": 9.14526653289795, |
|
"learning_rate": 1.6753760662307217e-07, |
|
"loss": 0.1488, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 14.38, |
|
"grad_norm": 0.8332547545433044, |
|
"learning_rate": 1.2762842669184205e-07, |
|
"loss": 0.1489, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 5.39556884765625, |
|
"learning_rate": 9.312043018067762e-08, |
|
"loss": 0.1606, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 14.56, |
|
"grad_norm": 26.057680130004883, |
|
"learning_rate": 6.402614932209228e-08, |
|
"loss": 0.1411, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 14.65, |
|
"grad_norm": 33.924278259277344, |
|
"learning_rate": 4.035615025522632e-08, |
|
"loss": 0.1347, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"grad_norm": 1.6821552515029907, |
|
"learning_rate": 2.211902918855313e-08, |
|
"loss": 0.1498, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 14.83, |
|
"grad_norm": 50.973289489746094, |
|
"learning_rate": 9.321409277999738e-09, |
|
"loss": 0.1481, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 14.92, |
|
"grad_norm": 7.030074119567871, |
|
"learning_rate": 1.9679382216242213e-09, |
|
"loss": 0.1297, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.9499005964214712, |
|
"eval_loss": 0.23844709992408752, |
|
"eval_runtime": 105.2757, |
|
"eval_samples_per_second": 23.89, |
|
"eval_steps_per_second": 1.501, |
|
"step": 16485 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"step": 16485, |
|
"total_flos": 6.140249030814106e+19, |
|
"train_loss": 0.2960004877096818, |
|
"train_runtime": 28237.408, |
|
"train_samples_per_second": 9.339, |
|
"train_steps_per_second": 0.584 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 16485, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"total_flos": 6.140249030814106e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|