{ "best_metric": 0.9518218623481781, "best_model_checkpoint": "swin-tiny-patch4-window7-224-hotel_images_classifier_v2/checkpoint-3470", "epoch": 4.9946023749550195, "eval_steps": 500, "global_step": 3470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 4.747580528259277, "learning_rate": 7.204610951008646e-07, "loss": 1.9842, "step": 5 }, { "epoch": 0.01, "grad_norm": 5.086691379547119, "learning_rate": 1.4409221902017292e-06, "loss": 1.9811, "step": 10 }, { "epoch": 0.02, "grad_norm": 4.6829681396484375, "learning_rate": 2.161383285302594e-06, "loss": 1.9832, "step": 15 }, { "epoch": 0.03, "grad_norm": 4.578265190124512, "learning_rate": 2.8818443804034585e-06, "loss": 1.9501, "step": 20 }, { "epoch": 0.04, "grad_norm": 4.904880523681641, "learning_rate": 3.602305475504323e-06, "loss": 1.917, "step": 25 }, { "epoch": 0.04, "grad_norm": 5.561208248138428, "learning_rate": 4.322766570605188e-06, "loss": 1.868, "step": 30 }, { "epoch": 0.05, "grad_norm": 4.837254047393799, "learning_rate": 5.043227665706052e-06, "loss": 1.8244, "step": 35 }, { "epoch": 0.06, "grad_norm": 4.515142440795898, "learning_rate": 5.763688760806917e-06, "loss": 1.7756, "step": 40 }, { "epoch": 0.06, "grad_norm": 6.021132946014404, "learning_rate": 6.484149855907781e-06, "loss": 1.7231, "step": 45 }, { "epoch": 0.07, "grad_norm": 6.325433254241943, "learning_rate": 7.204610951008646e-06, "loss": 1.6389, "step": 50 }, { "epoch": 0.08, "grad_norm": 6.28499698638916, "learning_rate": 7.92507204610951e-06, "loss": 1.5651, "step": 55 }, { "epoch": 0.09, "grad_norm": 4.940927982330322, "learning_rate": 8.645533141210376e-06, "loss": 1.5001, "step": 60 }, { "epoch": 0.09, "grad_norm": 4.624394416809082, "learning_rate": 9.36599423631124e-06, "loss": 1.3998, "step": 65 }, { "epoch": 0.1, "grad_norm": 4.9989118576049805, "learning_rate": 1.0086455331412104e-05, "loss": 1.2913, "step": 70 }, { "epoch": 0.11, "grad_norm": 6.203399658203125, "learning_rate": 1.0806916426512968e-05, "loss": 1.1994, "step": 75 }, { "epoch": 0.12, "grad_norm": 4.825283050537109, "learning_rate": 1.1527377521613834e-05, "loss": 1.0557, "step": 80 }, { "epoch": 0.12, "grad_norm": 6.639811992645264, "learning_rate": 1.2247838616714698e-05, "loss": 1.0077, "step": 85 }, { "epoch": 0.13, "grad_norm": 5.167383670806885, "learning_rate": 1.2968299711815562e-05, "loss": 0.9303, "step": 90 }, { "epoch": 0.14, "grad_norm": 6.177196979522705, "learning_rate": 1.3688760806916426e-05, "loss": 0.7967, "step": 95 }, { "epoch": 0.14, "grad_norm": 5.489429950714111, "learning_rate": 1.4409221902017291e-05, "loss": 0.7269, "step": 100 }, { "epoch": 0.15, "grad_norm": 5.555374622344971, "learning_rate": 1.5129682997118155e-05, "loss": 0.7176, "step": 105 }, { "epoch": 0.16, "grad_norm": 11.141295433044434, "learning_rate": 1.585014409221902e-05, "loss": 0.6796, "step": 110 }, { "epoch": 0.17, "grad_norm": 7.412641525268555, "learning_rate": 1.6570605187319883e-05, "loss": 0.6028, "step": 115 }, { "epoch": 0.17, "grad_norm": 6.904923439025879, "learning_rate": 1.7291066282420752e-05, "loss": 0.6348, "step": 120 }, { "epoch": 0.18, "grad_norm": 11.165042877197266, "learning_rate": 1.8011527377521615e-05, "loss": 0.5814, "step": 125 }, { "epoch": 0.19, "grad_norm": 7.367648124694824, "learning_rate": 1.873198847262248e-05, "loss": 0.5858, "step": 130 }, { "epoch": 0.19, "grad_norm": 7.115988254547119, "learning_rate": 1.9452449567723343e-05, "loss": 0.5316, "step": 135 }, { "epoch": 0.2, "grad_norm": 6.44365119934082, "learning_rate": 2.017291066282421e-05, "loss": 0.5049, "step": 140 }, { "epoch": 0.21, "grad_norm": 7.195384502410889, "learning_rate": 2.0893371757925074e-05, "loss": 0.5511, "step": 145 }, { "epoch": 0.22, "grad_norm": 17.6825008392334, "learning_rate": 2.1613832853025936e-05, "loss": 0.5124, "step": 150 }, { "epoch": 0.22, "grad_norm": 7.656848907470703, "learning_rate": 2.2334293948126802e-05, "loss": 0.4794, "step": 155 }, { "epoch": 0.23, "grad_norm": 7.221956729888916, "learning_rate": 2.3054755043227668e-05, "loss": 0.4773, "step": 160 }, { "epoch": 0.24, "grad_norm": 16.787612915039062, "learning_rate": 2.3775216138328533e-05, "loss": 0.4746, "step": 165 }, { "epoch": 0.24, "grad_norm": 7.123960494995117, "learning_rate": 2.4495677233429396e-05, "loss": 0.4734, "step": 170 }, { "epoch": 0.25, "grad_norm": 7.737701416015625, "learning_rate": 2.5216138328530258e-05, "loss": 0.4613, "step": 175 }, { "epoch": 0.26, "grad_norm": 7.011651515960693, "learning_rate": 2.5936599423631124e-05, "loss": 0.4886, "step": 180 }, { "epoch": 0.27, "grad_norm": 8.571374893188477, "learning_rate": 2.6657060518731993e-05, "loss": 0.4702, "step": 185 }, { "epoch": 0.27, "grad_norm": 7.675159454345703, "learning_rate": 2.737752161383285e-05, "loss": 0.5365, "step": 190 }, { "epoch": 0.28, "grad_norm": 5.9088239669799805, "learning_rate": 2.8097982708933717e-05, "loss": 0.3823, "step": 195 }, { "epoch": 0.29, "grad_norm": 5.840087413787842, "learning_rate": 2.8818443804034583e-05, "loss": 0.408, "step": 200 }, { "epoch": 0.3, "grad_norm": 6.880429267883301, "learning_rate": 2.953890489913545e-05, "loss": 0.4787, "step": 205 }, { "epoch": 0.3, "grad_norm": 5.355893611907959, "learning_rate": 3.025936599423631e-05, "loss": 0.377, "step": 210 }, { "epoch": 0.31, "grad_norm": 7.921416759490967, "learning_rate": 3.097982708933718e-05, "loss": 0.4504, "step": 215 }, { "epoch": 0.32, "grad_norm": 5.329736232757568, "learning_rate": 3.170028818443804e-05, "loss": 0.4056, "step": 220 }, { "epoch": 0.32, "grad_norm": 5.699007034301758, "learning_rate": 3.242074927953891e-05, "loss": 0.395, "step": 225 }, { "epoch": 0.33, "grad_norm": 10.29712963104248, "learning_rate": 3.314121037463977e-05, "loss": 0.433, "step": 230 }, { "epoch": 0.34, "grad_norm": 8.653733253479004, "learning_rate": 3.3861671469740636e-05, "loss": 0.3733, "step": 235 }, { "epoch": 0.35, "grad_norm": 4.476428508758545, "learning_rate": 3.4582132564841505e-05, "loss": 0.3758, "step": 240 }, { "epoch": 0.35, "grad_norm": 7.4768571853637695, "learning_rate": 3.530259365994236e-05, "loss": 0.4628, "step": 245 }, { "epoch": 0.36, "grad_norm": 7.058348655700684, "learning_rate": 3.602305475504323e-05, "loss": 0.3855, "step": 250 }, { "epoch": 0.37, "grad_norm": 7.238952159881592, "learning_rate": 3.674351585014409e-05, "loss": 0.389, "step": 255 }, { "epoch": 0.37, "grad_norm": 7.494441032409668, "learning_rate": 3.746397694524496e-05, "loss": 0.4285, "step": 260 }, { "epoch": 0.38, "grad_norm": 6.927433490753174, "learning_rate": 3.818443804034582e-05, "loss": 0.357, "step": 265 }, { "epoch": 0.39, "grad_norm": 8.478387832641602, "learning_rate": 3.8904899135446685e-05, "loss": 0.3612, "step": 270 }, { "epoch": 0.4, "grad_norm": 9.04246997833252, "learning_rate": 3.9625360230547554e-05, "loss": 0.3068, "step": 275 }, { "epoch": 0.4, "grad_norm": 7.052452087402344, "learning_rate": 4.034582132564842e-05, "loss": 0.3388, "step": 280 }, { "epoch": 0.41, "grad_norm": 6.3666510581970215, "learning_rate": 4.106628242074928e-05, "loss": 0.398, "step": 285 }, { "epoch": 0.42, "grad_norm": 7.982662200927734, "learning_rate": 4.178674351585015e-05, "loss": 0.3796, "step": 290 }, { "epoch": 0.42, "grad_norm": 6.020977973937988, "learning_rate": 4.250720461095101e-05, "loss": 0.4266, "step": 295 }, { "epoch": 0.43, "grad_norm": 7.010791778564453, "learning_rate": 4.322766570605187e-05, "loss": 0.4219, "step": 300 }, { "epoch": 0.44, "grad_norm": 5.0191216468811035, "learning_rate": 4.394812680115274e-05, "loss": 0.3489, "step": 305 }, { "epoch": 0.45, "grad_norm": 5.907705307006836, "learning_rate": 4.4668587896253604e-05, "loss": 0.3821, "step": 310 }, { "epoch": 0.45, "grad_norm": 6.560094356536865, "learning_rate": 4.538904899135447e-05, "loss": 0.3874, "step": 315 }, { "epoch": 0.46, "grad_norm": 6.429476737976074, "learning_rate": 4.6109510086455335e-05, "loss": 0.4096, "step": 320 }, { "epoch": 0.47, "grad_norm": 7.065363883972168, "learning_rate": 4.68299711815562e-05, "loss": 0.3778, "step": 325 }, { "epoch": 0.47, "grad_norm": 7.916449069976807, "learning_rate": 4.7550432276657067e-05, "loss": 0.3491, "step": 330 }, { "epoch": 0.48, "grad_norm": 5.4434709548950195, "learning_rate": 4.827089337175792e-05, "loss": 0.3645, "step": 335 }, { "epoch": 0.49, "grad_norm": 6.34391975402832, "learning_rate": 4.899135446685879e-05, "loss": 0.3778, "step": 340 }, { "epoch": 0.5, "grad_norm": 6.070534706115723, "learning_rate": 4.971181556195966e-05, "loss": 0.4307, "step": 345 }, { "epoch": 0.5, "grad_norm": 8.251782417297363, "learning_rate": 4.995196926032661e-05, "loss": 0.3964, "step": 350 }, { "epoch": 0.51, "grad_norm": 5.293612957000732, "learning_rate": 4.9871918027537626e-05, "loss": 0.3379, "step": 355 }, { "epoch": 0.52, "grad_norm": 7.164644241333008, "learning_rate": 4.979186679474864e-05, "loss": 0.3969, "step": 360 }, { "epoch": 0.53, "grad_norm": 4.961303234100342, "learning_rate": 4.971181556195966e-05, "loss": 0.3697, "step": 365 }, { "epoch": 0.53, "grad_norm": 6.196359157562256, "learning_rate": 4.9631764329170674e-05, "loss": 0.3448, "step": 370 }, { "epoch": 0.54, "grad_norm": 5.836663722991943, "learning_rate": 4.955171309638169e-05, "loss": 0.3939, "step": 375 }, { "epoch": 0.55, "grad_norm": 5.845285892486572, "learning_rate": 4.94716618635927e-05, "loss": 0.356, "step": 380 }, { "epoch": 0.55, "grad_norm": 3.937917947769165, "learning_rate": 4.9391610630803715e-05, "loss": 0.3033, "step": 385 }, { "epoch": 0.56, "grad_norm": 6.883370399475098, "learning_rate": 4.9311559398014736e-05, "loss": 0.415, "step": 390 }, { "epoch": 0.57, "grad_norm": 6.164604663848877, "learning_rate": 4.923150816522575e-05, "loss": 0.3794, "step": 395 }, { "epoch": 0.58, "grad_norm": 10.906937599182129, "learning_rate": 4.9151456932436764e-05, "loss": 0.3967, "step": 400 }, { "epoch": 0.58, "grad_norm": 3.428271532058716, "learning_rate": 4.907140569964778e-05, "loss": 0.3312, "step": 405 }, { "epoch": 0.59, "grad_norm": 7.288811206817627, "learning_rate": 4.899135446685879e-05, "loss": 0.3681, "step": 410 }, { "epoch": 0.6, "grad_norm": 8.319820404052734, "learning_rate": 4.8911303234069805e-05, "loss": 0.3265, "step": 415 }, { "epoch": 0.6, "grad_norm": 6.9813232421875, "learning_rate": 4.883125200128082e-05, "loss": 0.3233, "step": 420 }, { "epoch": 0.61, "grad_norm": 5.874197959899902, "learning_rate": 4.875120076849184e-05, "loss": 0.3689, "step": 425 }, { "epoch": 0.62, "grad_norm": 5.609955787658691, "learning_rate": 4.867114953570285e-05, "loss": 0.4213, "step": 430 }, { "epoch": 0.63, "grad_norm": 5.877446174621582, "learning_rate": 4.859109830291387e-05, "loss": 0.419, "step": 435 }, { "epoch": 0.63, "grad_norm": 6.771636962890625, "learning_rate": 4.851104707012488e-05, "loss": 0.3529, "step": 440 }, { "epoch": 0.64, "grad_norm": 9.461392402648926, "learning_rate": 4.8430995837335894e-05, "loss": 0.3725, "step": 445 }, { "epoch": 0.65, "grad_norm": 5.563230991363525, "learning_rate": 4.835094460454691e-05, "loss": 0.3675, "step": 450 }, { "epoch": 0.65, "grad_norm": 6.4672465324401855, "learning_rate": 4.827089337175792e-05, "loss": 0.3156, "step": 455 }, { "epoch": 0.66, "grad_norm": 3.8499579429626465, "learning_rate": 4.819084213896894e-05, "loss": 0.322, "step": 460 }, { "epoch": 0.67, "grad_norm": 5.031641960144043, "learning_rate": 4.8110790906179956e-05, "loss": 0.3936, "step": 465 }, { "epoch": 0.68, "grad_norm": 5.684152603149414, "learning_rate": 4.803073967339097e-05, "loss": 0.3898, "step": 470 }, { "epoch": 0.68, "grad_norm": 5.913132190704346, "learning_rate": 4.7950688440601984e-05, "loss": 0.3331, "step": 475 }, { "epoch": 0.69, "grad_norm": 5.199942588806152, "learning_rate": 4.7870637207813005e-05, "loss": 0.3303, "step": 480 }, { "epoch": 0.7, "grad_norm": 3.992769956588745, "learning_rate": 4.779058597502402e-05, "loss": 0.3422, "step": 485 }, { "epoch": 0.71, "grad_norm": 6.158402919769287, "learning_rate": 4.771053474223503e-05, "loss": 0.3152, "step": 490 }, { "epoch": 0.71, "grad_norm": 4.361845016479492, "learning_rate": 4.763048350944605e-05, "loss": 0.3057, "step": 495 }, { "epoch": 0.72, "grad_norm": 4.663881301879883, "learning_rate": 4.7550432276657067e-05, "loss": 0.3461, "step": 500 }, { "epoch": 0.73, "grad_norm": 7.09819221496582, "learning_rate": 4.747038104386808e-05, "loss": 0.3675, "step": 505 }, { "epoch": 0.73, "grad_norm": 5.0237956047058105, "learning_rate": 4.7390329811079094e-05, "loss": 0.3274, "step": 510 }, { "epoch": 0.74, "grad_norm": 5.483020782470703, "learning_rate": 4.731027857829011e-05, "loss": 0.3055, "step": 515 }, { "epoch": 0.75, "grad_norm": 4.972677707672119, "learning_rate": 4.723022734550112e-05, "loss": 0.3236, "step": 520 }, { "epoch": 0.76, "grad_norm": 7.017973899841309, "learning_rate": 4.7150176112712136e-05, "loss": 0.3543, "step": 525 }, { "epoch": 0.76, "grad_norm": 8.219503402709961, "learning_rate": 4.7070124879923156e-05, "loss": 0.3791, "step": 530 }, { "epoch": 0.77, "grad_norm": 5.836394309997559, "learning_rate": 4.699007364713417e-05, "loss": 0.2882, "step": 535 }, { "epoch": 0.78, "grad_norm": 6.394532680511475, "learning_rate": 4.6910022414345184e-05, "loss": 0.3741, "step": 540 }, { "epoch": 0.78, "grad_norm": 5.4533843994140625, "learning_rate": 4.68299711815562e-05, "loss": 0.3852, "step": 545 }, { "epoch": 0.79, "grad_norm": 6.065195083618164, "learning_rate": 4.674991994876721e-05, "loss": 0.3589, "step": 550 }, { "epoch": 0.8, "grad_norm": 4.000141620635986, "learning_rate": 4.6669868715978225e-05, "loss": 0.2865, "step": 555 }, { "epoch": 0.81, "grad_norm": 6.05587100982666, "learning_rate": 4.658981748318924e-05, "loss": 0.316, "step": 560 }, { "epoch": 0.81, "grad_norm": 5.1732892990112305, "learning_rate": 4.650976625040026e-05, "loss": 0.2768, "step": 565 }, { "epoch": 0.82, "grad_norm": 4.745729446411133, "learning_rate": 4.642971501761127e-05, "loss": 0.2796, "step": 570 }, { "epoch": 0.83, "grad_norm": 4.964130878448486, "learning_rate": 4.634966378482229e-05, "loss": 0.3268, "step": 575 }, { "epoch": 0.83, "grad_norm": 5.333953857421875, "learning_rate": 4.62696125520333e-05, "loss": 0.321, "step": 580 }, { "epoch": 0.84, "grad_norm": 4.004300117492676, "learning_rate": 4.6189561319244315e-05, "loss": 0.3371, "step": 585 }, { "epoch": 0.85, "grad_norm": 6.5950751304626465, "learning_rate": 4.6109510086455335e-05, "loss": 0.3028, "step": 590 }, { "epoch": 0.86, "grad_norm": 4.516002655029297, "learning_rate": 4.602945885366635e-05, "loss": 0.3539, "step": 595 }, { "epoch": 0.86, "grad_norm": 5.180628776550293, "learning_rate": 4.594940762087736e-05, "loss": 0.35, "step": 600 }, { "epoch": 0.87, "grad_norm": 3.2567028999328613, "learning_rate": 4.586935638808838e-05, "loss": 0.323, "step": 605 }, { "epoch": 0.88, "grad_norm": 3.9456095695495605, "learning_rate": 4.57893051552994e-05, "loss": 0.3378, "step": 610 }, { "epoch": 0.89, "grad_norm": 3.6121273040771484, "learning_rate": 4.570925392251041e-05, "loss": 0.2565, "step": 615 }, { "epoch": 0.89, "grad_norm": 4.358009338378906, "learning_rate": 4.5629202689721425e-05, "loss": 0.3147, "step": 620 }, { "epoch": 0.9, "grad_norm": 7.531122207641602, "learning_rate": 4.554915145693244e-05, "loss": 0.3346, "step": 625 }, { "epoch": 0.91, "grad_norm": 5.810347557067871, "learning_rate": 4.546910022414345e-05, "loss": 0.3196, "step": 630 }, { "epoch": 0.91, "grad_norm": 6.805031776428223, "learning_rate": 4.538904899135447e-05, "loss": 0.2952, "step": 635 }, { "epoch": 0.92, "grad_norm": 4.857294082641602, "learning_rate": 4.530899775856549e-05, "loss": 0.315, "step": 640 }, { "epoch": 0.93, "grad_norm": 4.595619201660156, "learning_rate": 4.52289465257765e-05, "loss": 0.3231, "step": 645 }, { "epoch": 0.94, "grad_norm": 5.075206279754639, "learning_rate": 4.5148895292987514e-05, "loss": 0.3019, "step": 650 }, { "epoch": 0.94, "grad_norm": 4.71131706237793, "learning_rate": 4.506884406019853e-05, "loss": 0.3249, "step": 655 }, { "epoch": 0.95, "grad_norm": 5.032394886016846, "learning_rate": 4.498879282740954e-05, "loss": 0.2653, "step": 660 }, { "epoch": 0.96, "grad_norm": 6.4502997398376465, "learning_rate": 4.4908741594620556e-05, "loss": 0.309, "step": 665 }, { "epoch": 0.96, "grad_norm": 5.608312129974365, "learning_rate": 4.4828690361831576e-05, "loss": 0.2943, "step": 670 }, { "epoch": 0.97, "grad_norm": 5.454727649688721, "learning_rate": 4.474863912904259e-05, "loss": 0.3037, "step": 675 }, { "epoch": 0.98, "grad_norm": 4.60232400894165, "learning_rate": 4.4668587896253604e-05, "loss": 0.2739, "step": 680 }, { "epoch": 0.99, "grad_norm": 5.319153308868408, "learning_rate": 4.458853666346462e-05, "loss": 0.2811, "step": 685 }, { "epoch": 0.99, "grad_norm": 4.6785054206848145, "learning_rate": 4.450848543067563e-05, "loss": 0.2929, "step": 690 }, { "epoch": 1.0, "eval_accuracy": 0.9385627530364372, "eval_loss": 0.16880780458450317, "eval_runtime": 32.041, "eval_samples_per_second": 308.355, "eval_steps_per_second": 9.644, "step": 694 }, { "epoch": 1.0, "grad_norm": 6.902053356170654, "learning_rate": 4.4428434197886645e-05, "loss": 0.3839, "step": 695 }, { "epoch": 1.01, "grad_norm": 4.171269416809082, "learning_rate": 4.434838296509766e-05, "loss": 0.3153, "step": 700 }, { "epoch": 1.01, "grad_norm": 3.1970090866088867, "learning_rate": 4.426833173230868e-05, "loss": 0.2399, "step": 705 }, { "epoch": 1.02, "grad_norm": 5.519264221191406, "learning_rate": 4.4188280499519693e-05, "loss": 0.2775, "step": 710 }, { "epoch": 1.03, "grad_norm": 4.797208786010742, "learning_rate": 4.4108229266730714e-05, "loss": 0.2805, "step": 715 }, { "epoch": 1.04, "grad_norm": 9.114941596984863, "learning_rate": 4.402817803394173e-05, "loss": 0.2846, "step": 720 }, { "epoch": 1.04, "grad_norm": 4.987404823303223, "learning_rate": 4.394812680115274e-05, "loss": 0.2849, "step": 725 }, { "epoch": 1.05, "grad_norm": 6.2959136962890625, "learning_rate": 4.3868075568363755e-05, "loss": 0.3129, "step": 730 }, { "epoch": 1.06, "grad_norm": 4.492276668548584, "learning_rate": 4.378802433557477e-05, "loss": 0.2384, "step": 735 }, { "epoch": 1.07, "grad_norm": 3.5424952507019043, "learning_rate": 4.370797310278579e-05, "loss": 0.2183, "step": 740 }, { "epoch": 1.07, "grad_norm": 7.594015598297119, "learning_rate": 4.3627921869996804e-05, "loss": 0.2657, "step": 745 }, { "epoch": 1.08, "grad_norm": 6.9036431312561035, "learning_rate": 4.354787063720782e-05, "loss": 0.2678, "step": 750 }, { "epoch": 1.09, "grad_norm": 7.780063629150391, "learning_rate": 4.346781940441883e-05, "loss": 0.3054, "step": 755 }, { "epoch": 1.09, "grad_norm": 5.562774181365967, "learning_rate": 4.3387768171629845e-05, "loss": 0.272, "step": 760 }, { "epoch": 1.1, "grad_norm": 7.2162861824035645, "learning_rate": 4.330771693884086e-05, "loss": 0.2678, "step": 765 }, { "epoch": 1.11, "grad_norm": 5.875248432159424, "learning_rate": 4.322766570605187e-05, "loss": 0.2691, "step": 770 }, { "epoch": 1.12, "grad_norm": 4.324618339538574, "learning_rate": 4.314761447326289e-05, "loss": 0.3025, "step": 775 }, { "epoch": 1.12, "grad_norm": 4.129276275634766, "learning_rate": 4.306756324047391e-05, "loss": 0.2596, "step": 780 }, { "epoch": 1.13, "grad_norm": 3.086761713027954, "learning_rate": 4.298751200768492e-05, "loss": 0.2528, "step": 785 }, { "epoch": 1.14, "grad_norm": 4.340246200561523, "learning_rate": 4.2907460774895934e-05, "loss": 0.223, "step": 790 }, { "epoch": 1.14, "grad_norm": 3.6360461711883545, "learning_rate": 4.282740954210695e-05, "loss": 0.2547, "step": 795 }, { "epoch": 1.15, "grad_norm": 4.182173252105713, "learning_rate": 4.274735830931796e-05, "loss": 0.287, "step": 800 }, { "epoch": 1.16, "grad_norm": 4.418725490570068, "learning_rate": 4.2667307076528976e-05, "loss": 0.2888, "step": 805 }, { "epoch": 1.17, "grad_norm": 4.325172424316406, "learning_rate": 4.2587255843739996e-05, "loss": 0.2634, "step": 810 }, { "epoch": 1.17, "grad_norm": 5.551906585693359, "learning_rate": 4.250720461095101e-05, "loss": 0.2651, "step": 815 }, { "epoch": 1.18, "grad_norm": 3.631472110748291, "learning_rate": 4.2427153378162024e-05, "loss": 0.2745, "step": 820 }, { "epoch": 1.19, "grad_norm": 3.5533196926116943, "learning_rate": 4.234710214537304e-05, "loss": 0.2936, "step": 825 }, { "epoch": 1.19, "grad_norm": 4.504055023193359, "learning_rate": 4.226705091258406e-05, "loss": 0.3025, "step": 830 }, { "epoch": 1.2, "grad_norm": 4.739752292633057, "learning_rate": 4.218699967979507e-05, "loss": 0.303, "step": 835 }, { "epoch": 1.21, "grad_norm": 5.039779186248779, "learning_rate": 4.2106948447006086e-05, "loss": 0.2663, "step": 840 }, { "epoch": 1.22, "grad_norm": 3.7070090770721436, "learning_rate": 4.2026897214217107e-05, "loss": 0.2663, "step": 845 }, { "epoch": 1.22, "grad_norm": 4.351013660430908, "learning_rate": 4.194684598142812e-05, "loss": 0.2454, "step": 850 }, { "epoch": 1.23, "grad_norm": 5.0032830238342285, "learning_rate": 4.1866794748639134e-05, "loss": 0.245, "step": 855 }, { "epoch": 1.24, "grad_norm": 3.203274965286255, "learning_rate": 4.178674351585015e-05, "loss": 0.3036, "step": 860 }, { "epoch": 1.25, "grad_norm": 4.47341775894165, "learning_rate": 4.170669228306116e-05, "loss": 0.3336, "step": 865 }, { "epoch": 1.25, "grad_norm": 4.188334941864014, "learning_rate": 4.1626641050272176e-05, "loss": 0.2529, "step": 870 }, { "epoch": 1.26, "grad_norm": 3.3264882564544678, "learning_rate": 4.154658981748319e-05, "loss": 0.2258, "step": 875 }, { "epoch": 1.27, "grad_norm": 4.058962821960449, "learning_rate": 4.146653858469421e-05, "loss": 0.3129, "step": 880 }, { "epoch": 1.27, "grad_norm": 4.271402359008789, "learning_rate": 4.1386487351905224e-05, "loss": 0.2605, "step": 885 }, { "epoch": 1.28, "grad_norm": 8.134669303894043, "learning_rate": 4.130643611911624e-05, "loss": 0.3072, "step": 890 }, { "epoch": 1.29, "grad_norm": 5.065728664398193, "learning_rate": 4.122638488632725e-05, "loss": 0.2557, "step": 895 }, { "epoch": 1.3, "grad_norm": 4.518153190612793, "learning_rate": 4.1146333653538265e-05, "loss": 0.2591, "step": 900 }, { "epoch": 1.3, "grad_norm": 6.0956926345825195, "learning_rate": 4.106628242074928e-05, "loss": 0.3001, "step": 905 }, { "epoch": 1.31, "grad_norm": 4.715207099914551, "learning_rate": 4.098623118796029e-05, "loss": 0.2882, "step": 910 }, { "epoch": 1.32, "grad_norm": 6.3927435874938965, "learning_rate": 4.090617995517131e-05, "loss": 0.2733, "step": 915 }, { "epoch": 1.32, "grad_norm": 3.886277198791504, "learning_rate": 4.082612872238233e-05, "loss": 0.2558, "step": 920 }, { "epoch": 1.33, "grad_norm": 6.690213203430176, "learning_rate": 4.074607748959334e-05, "loss": 0.2411, "step": 925 }, { "epoch": 1.34, "grad_norm": 5.04226016998291, "learning_rate": 4.0666026256804355e-05, "loss": 0.2814, "step": 930 }, { "epoch": 1.35, "grad_norm": 6.361902236938477, "learning_rate": 4.058597502401537e-05, "loss": 0.1918, "step": 935 }, { "epoch": 1.35, "grad_norm": 6.6365227699279785, "learning_rate": 4.050592379122638e-05, "loss": 0.2714, "step": 940 }, { "epoch": 1.36, "grad_norm": 4.794340133666992, "learning_rate": 4.04258725584374e-05, "loss": 0.269, "step": 945 }, { "epoch": 1.37, "grad_norm": 5.207016468048096, "learning_rate": 4.034582132564842e-05, "loss": 0.2955, "step": 950 }, { "epoch": 1.37, "grad_norm": 5.347695350646973, "learning_rate": 4.026577009285944e-05, "loss": 0.2341, "step": 955 }, { "epoch": 1.38, "grad_norm": 7.788352966308594, "learning_rate": 4.018571886007045e-05, "loss": 0.2228, "step": 960 }, { "epoch": 1.39, "grad_norm": 4.078495025634766, "learning_rate": 4.0105667627281465e-05, "loss": 0.2408, "step": 965 }, { "epoch": 1.4, "grad_norm": 5.237365245819092, "learning_rate": 4.002561639449248e-05, "loss": 0.2891, "step": 970 }, { "epoch": 1.4, "grad_norm": 5.711833953857422, "learning_rate": 3.994556516170349e-05, "loss": 0.323, "step": 975 }, { "epoch": 1.41, "grad_norm": 3.250711679458618, "learning_rate": 3.9865513928914506e-05, "loss": 0.2945, "step": 980 }, { "epoch": 1.42, "grad_norm": 6.933974266052246, "learning_rate": 3.978546269612553e-05, "loss": 0.2507, "step": 985 }, { "epoch": 1.42, "grad_norm": 4.515052795410156, "learning_rate": 3.970541146333654e-05, "loss": 0.265, "step": 990 }, { "epoch": 1.43, "grad_norm": 4.89296293258667, "learning_rate": 3.9625360230547554e-05, "loss": 0.2868, "step": 995 }, { "epoch": 1.44, "grad_norm": 4.629034996032715, "learning_rate": 3.954530899775857e-05, "loss": 0.2773, "step": 1000 }, { "epoch": 1.45, "grad_norm": 3.881559371948242, "learning_rate": 3.946525776496958e-05, "loss": 0.3336, "step": 1005 }, { "epoch": 1.45, "grad_norm": 3.4768316745758057, "learning_rate": 3.9385206532180596e-05, "loss": 0.2212, "step": 1010 }, { "epoch": 1.46, "grad_norm": 5.582344055175781, "learning_rate": 3.930515529939161e-05, "loss": 0.3031, "step": 1015 }, { "epoch": 1.47, "grad_norm": 3.73008131980896, "learning_rate": 3.922510406660262e-05, "loss": 0.2557, "step": 1020 }, { "epoch": 1.48, "grad_norm": 5.319180011749268, "learning_rate": 3.9145052833813644e-05, "loss": 0.2679, "step": 1025 }, { "epoch": 1.48, "grad_norm": 6.709672451019287, "learning_rate": 3.906500160102466e-05, "loss": 0.2471, "step": 1030 }, { "epoch": 1.49, "grad_norm": 5.294819355010986, "learning_rate": 3.898495036823567e-05, "loss": 0.2661, "step": 1035 }, { "epoch": 1.5, "grad_norm": 3.2995288372039795, "learning_rate": 3.8904899135446685e-05, "loss": 0.2789, "step": 1040 }, { "epoch": 1.5, "grad_norm": 4.34086799621582, "learning_rate": 3.88248479026577e-05, "loss": 0.2789, "step": 1045 }, { "epoch": 1.51, "grad_norm": 5.209534168243408, "learning_rate": 3.874479666986871e-05, "loss": 0.3002, "step": 1050 }, { "epoch": 1.52, "grad_norm": 5.175271034240723, "learning_rate": 3.8664745437079733e-05, "loss": 0.2631, "step": 1055 }, { "epoch": 1.53, "grad_norm": 4.909916400909424, "learning_rate": 3.858469420429075e-05, "loss": 0.25, "step": 1060 }, { "epoch": 1.53, "grad_norm": 3.8786613941192627, "learning_rate": 3.850464297150176e-05, "loss": 0.226, "step": 1065 }, { "epoch": 1.54, "grad_norm": 4.349425315856934, "learning_rate": 3.842459173871278e-05, "loss": 0.2635, "step": 1070 }, { "epoch": 1.55, "grad_norm": 5.107605934143066, "learning_rate": 3.8344540505923795e-05, "loss": 0.2536, "step": 1075 }, { "epoch": 1.55, "grad_norm": 5.436495780944824, "learning_rate": 3.826448927313481e-05, "loss": 0.2911, "step": 1080 }, { "epoch": 1.56, "grad_norm": 5.1116156578063965, "learning_rate": 3.818443804034582e-05, "loss": 0.3064, "step": 1085 }, { "epoch": 1.57, "grad_norm": 4.1365742683410645, "learning_rate": 3.810438680755684e-05, "loss": 0.2003, "step": 1090 }, { "epoch": 1.58, "grad_norm": 5.43222188949585, "learning_rate": 3.802433557476786e-05, "loss": 0.291, "step": 1095 }, { "epoch": 1.58, "grad_norm": 6.062341690063477, "learning_rate": 3.794428434197887e-05, "loss": 0.2325, "step": 1100 }, { "epoch": 1.59, "grad_norm": 4.5507097244262695, "learning_rate": 3.7864233109189885e-05, "loss": 0.2493, "step": 1105 }, { "epoch": 1.6, "grad_norm": 3.3975865840911865, "learning_rate": 3.77841818764009e-05, "loss": 0.2349, "step": 1110 }, { "epoch": 1.6, "grad_norm": 3.967979907989502, "learning_rate": 3.770413064361191e-05, "loss": 0.2364, "step": 1115 }, { "epoch": 1.61, "grad_norm": 4.541342735290527, "learning_rate": 3.7624079410822926e-05, "loss": 0.2285, "step": 1120 }, { "epoch": 1.62, "grad_norm": 4.848491668701172, "learning_rate": 3.754402817803394e-05, "loss": 0.235, "step": 1125 }, { "epoch": 1.63, "grad_norm": 5.879725933074951, "learning_rate": 3.746397694524496e-05, "loss": 0.2759, "step": 1130 }, { "epoch": 1.63, "grad_norm": 6.01210880279541, "learning_rate": 3.7383925712455975e-05, "loss": 0.3345, "step": 1135 }, { "epoch": 1.64, "grad_norm": 4.760444641113281, "learning_rate": 3.730387447966699e-05, "loss": 0.2708, "step": 1140 }, { "epoch": 1.65, "grad_norm": 4.630128860473633, "learning_rate": 3.7223823246878e-05, "loss": 0.3049, "step": 1145 }, { "epoch": 1.66, "grad_norm": 4.3284101486206055, "learning_rate": 3.7143772014089016e-05, "loss": 0.2822, "step": 1150 }, { "epoch": 1.66, "grad_norm": 6.679904937744141, "learning_rate": 3.706372078130003e-05, "loss": 0.2764, "step": 1155 }, { "epoch": 1.67, "grad_norm": 5.192065238952637, "learning_rate": 3.6983669548511043e-05, "loss": 0.2479, "step": 1160 }, { "epoch": 1.68, "grad_norm": 4.901111125946045, "learning_rate": 3.6903618315722064e-05, "loss": 0.2849, "step": 1165 }, { "epoch": 1.68, "grad_norm": 6.2184977531433105, "learning_rate": 3.682356708293308e-05, "loss": 0.2667, "step": 1170 }, { "epoch": 1.69, "grad_norm": 5.900247573852539, "learning_rate": 3.674351585014409e-05, "loss": 0.2992, "step": 1175 }, { "epoch": 1.7, "grad_norm": 3.7004477977752686, "learning_rate": 3.666346461735511e-05, "loss": 0.2791, "step": 1180 }, { "epoch": 1.71, "grad_norm": 4.646676063537598, "learning_rate": 3.6583413384566126e-05, "loss": 0.2525, "step": 1185 }, { "epoch": 1.71, "grad_norm": 4.426496982574463, "learning_rate": 3.650336215177714e-05, "loss": 0.2624, "step": 1190 }, { "epoch": 1.72, "grad_norm": 4.333110809326172, "learning_rate": 3.6423310918988154e-05, "loss": 0.2777, "step": 1195 }, { "epoch": 1.73, "grad_norm": 3.7483744621276855, "learning_rate": 3.6343259686199174e-05, "loss": 0.2897, "step": 1200 }, { "epoch": 1.73, "grad_norm": 5.556215286254883, "learning_rate": 3.626320845341019e-05, "loss": 0.3432, "step": 1205 }, { "epoch": 1.74, "grad_norm": 4.707242965698242, "learning_rate": 3.61831572206212e-05, "loss": 0.2439, "step": 1210 }, { "epoch": 1.75, "grad_norm": 4.767390251159668, "learning_rate": 3.6103105987832216e-05, "loss": 0.2744, "step": 1215 }, { "epoch": 1.76, "grad_norm": 4.1662492752075195, "learning_rate": 3.602305475504323e-05, "loss": 0.267, "step": 1220 }, { "epoch": 1.76, "grad_norm": 4.437891006469727, "learning_rate": 3.594300352225424e-05, "loss": 0.2354, "step": 1225 }, { "epoch": 1.77, "grad_norm": 5.63749361038208, "learning_rate": 3.586295228946526e-05, "loss": 0.2557, "step": 1230 }, { "epoch": 1.78, "grad_norm": 6.398256778717041, "learning_rate": 3.578290105667628e-05, "loss": 0.2697, "step": 1235 }, { "epoch": 1.78, "grad_norm": 4.15376091003418, "learning_rate": 3.570284982388729e-05, "loss": 0.2672, "step": 1240 }, { "epoch": 1.79, "grad_norm": 8.952369689941406, "learning_rate": 3.5622798591098305e-05, "loss": 0.2992, "step": 1245 }, { "epoch": 1.8, "grad_norm": 7.161625385284424, "learning_rate": 3.554274735830932e-05, "loss": 0.3067, "step": 1250 }, { "epoch": 1.81, "grad_norm": 3.848027467727661, "learning_rate": 3.546269612552033e-05, "loss": 0.3165, "step": 1255 }, { "epoch": 1.81, "grad_norm": 5.625514507293701, "learning_rate": 3.5382644892731347e-05, "loss": 0.2792, "step": 1260 }, { "epoch": 1.82, "grad_norm": 3.829505681991577, "learning_rate": 3.530259365994236e-05, "loss": 0.258, "step": 1265 }, { "epoch": 1.83, "grad_norm": 4.038649559020996, "learning_rate": 3.522254242715338e-05, "loss": 0.2668, "step": 1270 }, { "epoch": 1.84, "grad_norm": 3.746533155441284, "learning_rate": 3.5142491194364395e-05, "loss": 0.2571, "step": 1275 }, { "epoch": 1.84, "grad_norm": 3.9205687046051025, "learning_rate": 3.506243996157541e-05, "loss": 0.2148, "step": 1280 }, { "epoch": 1.85, "grad_norm": 5.464355945587158, "learning_rate": 3.498238872878642e-05, "loss": 0.2707, "step": 1285 }, { "epoch": 1.86, "grad_norm": 4.321130752563477, "learning_rate": 3.4902337495997436e-05, "loss": 0.2434, "step": 1290 }, { "epoch": 1.86, "grad_norm": 6.3836588859558105, "learning_rate": 3.482228626320846e-05, "loss": 0.2601, "step": 1295 }, { "epoch": 1.87, "grad_norm": 2.9065053462982178, "learning_rate": 3.474223503041947e-05, "loss": 0.2033, "step": 1300 }, { "epoch": 1.88, "grad_norm": 4.280132293701172, "learning_rate": 3.4662183797630484e-05, "loss": 0.2708, "step": 1305 }, { "epoch": 1.89, "grad_norm": 5.5674262046813965, "learning_rate": 3.4582132564841505e-05, "loss": 0.2899, "step": 1310 }, { "epoch": 1.89, "grad_norm": 4.071995735168457, "learning_rate": 3.450208133205252e-05, "loss": 0.2714, "step": 1315 }, { "epoch": 1.9, "grad_norm": 6.83046817779541, "learning_rate": 3.442203009926353e-05, "loss": 0.2563, "step": 1320 }, { "epoch": 1.91, "grad_norm": 4.866962432861328, "learning_rate": 3.4341978866474546e-05, "loss": 0.2898, "step": 1325 }, { "epoch": 1.91, "grad_norm": 6.10991096496582, "learning_rate": 3.426192763368556e-05, "loss": 0.2927, "step": 1330 }, { "epoch": 1.92, "grad_norm": 8.084212303161621, "learning_rate": 3.4181876400896574e-05, "loss": 0.2668, "step": 1335 }, { "epoch": 1.93, "grad_norm": 2.702385902404785, "learning_rate": 3.4101825168107594e-05, "loss": 0.2617, "step": 1340 }, { "epoch": 1.94, "grad_norm": 5.180947303771973, "learning_rate": 3.402177393531861e-05, "loss": 0.2411, "step": 1345 }, { "epoch": 1.94, "grad_norm": 3.0766685009002686, "learning_rate": 3.394172270252962e-05, "loss": 0.2723, "step": 1350 }, { "epoch": 1.95, "grad_norm": 3.833108901977539, "learning_rate": 3.3861671469740636e-05, "loss": 0.2237, "step": 1355 }, { "epoch": 1.96, "grad_norm": 4.505425930023193, "learning_rate": 3.378162023695165e-05, "loss": 0.2685, "step": 1360 }, { "epoch": 1.96, "grad_norm": 3.9498701095581055, "learning_rate": 3.370156900416266e-05, "loss": 0.2637, "step": 1365 }, { "epoch": 1.97, "grad_norm": 6.345920562744141, "learning_rate": 3.362151777137368e-05, "loss": 0.2745, "step": 1370 }, { "epoch": 1.98, "grad_norm": 4.702010154724121, "learning_rate": 3.35414665385847e-05, "loss": 0.2837, "step": 1375 }, { "epoch": 1.99, "grad_norm": 4.943043231964111, "learning_rate": 3.346141530579571e-05, "loss": 0.2525, "step": 1380 }, { "epoch": 1.99, "grad_norm": 3.8749611377716064, "learning_rate": 3.3381364073006725e-05, "loss": 0.2499, "step": 1385 }, { "epoch": 2.0, "eval_accuracy": 0.9425101214574899, "eval_loss": 0.1516382098197937, "eval_runtime": 31.6984, "eval_samples_per_second": 311.687, "eval_steps_per_second": 9.748, "step": 1389 }, { "epoch": 2.0, "grad_norm": 6.219438076019287, "learning_rate": 3.330131284021774e-05, "loss": 0.3071, "step": 1390 }, { "epoch": 2.01, "grad_norm": 4.6552629470825195, "learning_rate": 3.322126160742875e-05, "loss": 0.2364, "step": 1395 }, { "epoch": 2.02, "grad_norm": 3.997241497039795, "learning_rate": 3.314121037463977e-05, "loss": 0.249, "step": 1400 }, { "epoch": 2.02, "grad_norm": 3.6796419620513916, "learning_rate": 3.306115914185078e-05, "loss": 0.2261, "step": 1405 }, { "epoch": 2.03, "grad_norm": 3.0016541481018066, "learning_rate": 3.29811079090618e-05, "loss": 0.2127, "step": 1410 }, { "epoch": 2.04, "grad_norm": 4.60055685043335, "learning_rate": 3.2901056676272815e-05, "loss": 0.2207, "step": 1415 }, { "epoch": 2.04, "grad_norm": 6.432025909423828, "learning_rate": 3.2821005443483835e-05, "loss": 0.2088, "step": 1420 }, { "epoch": 2.05, "grad_norm": 4.049763202667236, "learning_rate": 3.274095421069485e-05, "loss": 0.2193, "step": 1425 }, { "epoch": 2.06, "grad_norm": 4.77670955657959, "learning_rate": 3.266090297790586e-05, "loss": 0.2666, "step": 1430 }, { "epoch": 2.07, "grad_norm": 3.876225709915161, "learning_rate": 3.258085174511688e-05, "loss": 0.222, "step": 1435 }, { "epoch": 2.07, "grad_norm": 2.917393207550049, "learning_rate": 3.250080051232789e-05, "loss": 0.2481, "step": 1440 }, { "epoch": 2.08, "grad_norm": 3.858349084854126, "learning_rate": 3.242074927953891e-05, "loss": 0.2929, "step": 1445 }, { "epoch": 2.09, "grad_norm": 4.08052921295166, "learning_rate": 3.2340698046749925e-05, "loss": 0.2081, "step": 1450 }, { "epoch": 2.09, "grad_norm": 3.8843398094177246, "learning_rate": 3.226064681396094e-05, "loss": 0.1917, "step": 1455 }, { "epoch": 2.1, "grad_norm": 4.356058597564697, "learning_rate": 3.218059558117195e-05, "loss": 0.2211, "step": 1460 }, { "epoch": 2.11, "grad_norm": 5.629312038421631, "learning_rate": 3.2100544348382966e-05, "loss": 0.2704, "step": 1465 }, { "epoch": 2.12, "grad_norm": 3.8312325477600098, "learning_rate": 3.202049311559398e-05, "loss": 0.2418, "step": 1470 }, { "epoch": 2.12, "grad_norm": 3.1079790592193604, "learning_rate": 3.1940441882804994e-05, "loss": 0.1948, "step": 1475 }, { "epoch": 2.13, "grad_norm": 4.682496547698975, "learning_rate": 3.1860390650016015e-05, "loss": 0.2023, "step": 1480 }, { "epoch": 2.14, "grad_norm": 4.4082489013671875, "learning_rate": 3.178033941722703e-05, "loss": 0.2346, "step": 1485 }, { "epoch": 2.14, "grad_norm": 5.721102714538574, "learning_rate": 3.170028818443804e-05, "loss": 0.2294, "step": 1490 }, { "epoch": 2.15, "grad_norm": 3.2310311794281006, "learning_rate": 3.1620236951649056e-05, "loss": 0.2074, "step": 1495 }, { "epoch": 2.16, "grad_norm": 5.734870433807373, "learning_rate": 3.154018571886007e-05, "loss": 0.2244, "step": 1500 }, { "epoch": 2.17, "grad_norm": 4.256961822509766, "learning_rate": 3.1460134486071084e-05, "loss": 0.2208, "step": 1505 }, { "epoch": 2.17, "grad_norm": 6.9470696449279785, "learning_rate": 3.13800832532821e-05, "loss": 0.2736, "step": 1510 }, { "epoch": 2.18, "grad_norm": 2.8514010906219482, "learning_rate": 3.130003202049312e-05, "loss": 0.1989, "step": 1515 }, { "epoch": 2.19, "grad_norm": 4.2279744148254395, "learning_rate": 3.121998078770413e-05, "loss": 0.2753, "step": 1520 }, { "epoch": 2.2, "grad_norm": 3.349268674850464, "learning_rate": 3.1139929554915145e-05, "loss": 0.181, "step": 1525 }, { "epoch": 2.2, "grad_norm": 4.550454616546631, "learning_rate": 3.105987832212616e-05, "loss": 0.2536, "step": 1530 }, { "epoch": 2.21, "grad_norm": 3.7860782146453857, "learning_rate": 3.097982708933718e-05, "loss": 0.2331, "step": 1535 }, { "epoch": 2.22, "grad_norm": 4.5719170570373535, "learning_rate": 3.0899775856548194e-05, "loss": 0.2408, "step": 1540 }, { "epoch": 2.22, "grad_norm": 4.448012828826904, "learning_rate": 3.081972462375921e-05, "loss": 0.2219, "step": 1545 }, { "epoch": 2.23, "grad_norm": 3.7972702980041504, "learning_rate": 3.073967339097023e-05, "loss": 0.2691, "step": 1550 }, { "epoch": 2.24, "grad_norm": 4.268452167510986, "learning_rate": 3.065962215818124e-05, "loss": 0.2215, "step": 1555 }, { "epoch": 2.25, "grad_norm": 4.145329475402832, "learning_rate": 3.0579570925392256e-05, "loss": 0.2488, "step": 1560 }, { "epoch": 2.25, "grad_norm": 5.501221656799316, "learning_rate": 3.049951969260327e-05, "loss": 0.2441, "step": 1565 }, { "epoch": 2.26, "grad_norm": 4.3408203125, "learning_rate": 3.0419468459814283e-05, "loss": 0.2308, "step": 1570 }, { "epoch": 2.27, "grad_norm": 4.104162216186523, "learning_rate": 3.0339417227025297e-05, "loss": 0.2538, "step": 1575 }, { "epoch": 2.27, "grad_norm": 5.441348075866699, "learning_rate": 3.025936599423631e-05, "loss": 0.2742, "step": 1580 }, { "epoch": 2.28, "grad_norm": 3.3526971340179443, "learning_rate": 3.017931476144733e-05, "loss": 0.1934, "step": 1585 }, { "epoch": 2.29, "grad_norm": 3.5918030738830566, "learning_rate": 3.0099263528658345e-05, "loss": 0.256, "step": 1590 }, { "epoch": 2.3, "grad_norm": 3.9758517742156982, "learning_rate": 3.001921229586936e-05, "loss": 0.2096, "step": 1595 }, { "epoch": 2.3, "grad_norm": 2.7759931087493896, "learning_rate": 2.9939161063080373e-05, "loss": 0.2545, "step": 1600 }, { "epoch": 2.31, "grad_norm": 6.958917140960693, "learning_rate": 2.9859109830291387e-05, "loss": 0.2293, "step": 1605 }, { "epoch": 2.32, "grad_norm": 4.162193775177002, "learning_rate": 2.97790585975024e-05, "loss": 0.2095, "step": 1610 }, { "epoch": 2.32, "grad_norm": 3.701801061630249, "learning_rate": 2.9699007364713418e-05, "loss": 0.2339, "step": 1615 }, { "epoch": 2.33, "grad_norm": 3.290947437286377, "learning_rate": 2.9618956131924435e-05, "loss": 0.209, "step": 1620 }, { "epoch": 2.34, "grad_norm": 4.3231024742126465, "learning_rate": 2.953890489913545e-05, "loss": 0.2791, "step": 1625 }, { "epoch": 2.35, "grad_norm": 3.6642446517944336, "learning_rate": 2.9458853666346466e-05, "loss": 0.2382, "step": 1630 }, { "epoch": 2.35, "grad_norm": 6.942342281341553, "learning_rate": 2.937880243355748e-05, "loss": 0.2406, "step": 1635 }, { "epoch": 2.36, "grad_norm": 3.886199712753296, "learning_rate": 2.9298751200768493e-05, "loss": 0.218, "step": 1640 }, { "epoch": 2.37, "grad_norm": 3.8468515872955322, "learning_rate": 2.9218699967979507e-05, "loss": 0.2449, "step": 1645 }, { "epoch": 2.37, "grad_norm": 3.2598648071289062, "learning_rate": 2.913864873519052e-05, "loss": 0.2276, "step": 1650 }, { "epoch": 2.38, "grad_norm": 3.9356770515441895, "learning_rate": 2.905859750240154e-05, "loss": 0.2481, "step": 1655 }, { "epoch": 2.39, "grad_norm": 5.803495407104492, "learning_rate": 2.8978546269612555e-05, "loss": 0.2699, "step": 1660 }, { "epoch": 2.4, "grad_norm": 3.3325111865997314, "learning_rate": 2.889849503682357e-05, "loss": 0.2206, "step": 1665 }, { "epoch": 2.4, "grad_norm": 5.40475606918335, "learning_rate": 2.8818443804034583e-05, "loss": 0.2295, "step": 1670 }, { "epoch": 2.41, "grad_norm": 4.207846164703369, "learning_rate": 2.8738392571245597e-05, "loss": 0.2268, "step": 1675 }, { "epoch": 2.42, "grad_norm": 3.405880928039551, "learning_rate": 2.8658341338456614e-05, "loss": 0.2773, "step": 1680 }, { "epoch": 2.43, "grad_norm": 4.502201557159424, "learning_rate": 2.8578290105667628e-05, "loss": 0.2459, "step": 1685 }, { "epoch": 2.43, "grad_norm": 2.8585033416748047, "learning_rate": 2.8498238872878645e-05, "loss": 0.2626, "step": 1690 }, { "epoch": 2.44, "grad_norm": 4.774590015411377, "learning_rate": 2.8418187640089662e-05, "loss": 0.2242, "step": 1695 }, { "epoch": 2.45, "grad_norm": 6.423954010009766, "learning_rate": 2.8338136407300676e-05, "loss": 0.2711, "step": 1700 }, { "epoch": 2.45, "grad_norm": 5.023673057556152, "learning_rate": 2.825808517451169e-05, "loss": 0.2191, "step": 1705 }, { "epoch": 2.46, "grad_norm": 3.246953010559082, "learning_rate": 2.8178033941722703e-05, "loss": 0.2032, "step": 1710 }, { "epoch": 2.47, "grad_norm": 4.740121364593506, "learning_rate": 2.8097982708933717e-05, "loss": 0.2257, "step": 1715 }, { "epoch": 2.48, "grad_norm": 4.652435302734375, "learning_rate": 2.801793147614473e-05, "loss": 0.2441, "step": 1720 }, { "epoch": 2.48, "grad_norm": 3.7246835231781006, "learning_rate": 2.7937880243355745e-05, "loss": 0.2064, "step": 1725 }, { "epoch": 2.49, "grad_norm": 2.8556969165802, "learning_rate": 2.7857829010566765e-05, "loss": 0.2002, "step": 1730 }, { "epoch": 2.5, "grad_norm": 3.9338796138763428, "learning_rate": 2.777777777777778e-05, "loss": 0.2608, "step": 1735 }, { "epoch": 2.5, "grad_norm": 3.847045660018921, "learning_rate": 2.7697726544988796e-05, "loss": 0.2167, "step": 1740 }, { "epoch": 2.51, "grad_norm": 3.5335538387298584, "learning_rate": 2.761767531219981e-05, "loss": 0.1966, "step": 1745 }, { "epoch": 2.52, "grad_norm": 3.702679395675659, "learning_rate": 2.7537624079410824e-05, "loss": 0.1865, "step": 1750 }, { "epoch": 2.53, "grad_norm": 3.013113498687744, "learning_rate": 2.7457572846621838e-05, "loss": 0.199, "step": 1755 }, { "epoch": 2.53, "grad_norm": 3.300877809524536, "learning_rate": 2.737752161383285e-05, "loss": 0.2504, "step": 1760 }, { "epoch": 2.54, "grad_norm": 5.806422233581543, "learning_rate": 2.7297470381043872e-05, "loss": 0.2362, "step": 1765 }, { "epoch": 2.55, "grad_norm": 6.372203826904297, "learning_rate": 2.7217419148254886e-05, "loss": 0.2298, "step": 1770 }, { "epoch": 2.55, "grad_norm": 6.462773323059082, "learning_rate": 2.71373679154659e-05, "loss": 0.2367, "step": 1775 }, { "epoch": 2.56, "grad_norm": 5.330246448516846, "learning_rate": 2.7057316682676913e-05, "loss": 0.2543, "step": 1780 }, { "epoch": 2.57, "grad_norm": 4.1171956062316895, "learning_rate": 2.6977265449887927e-05, "loss": 0.2057, "step": 1785 }, { "epoch": 2.58, "grad_norm": 3.247389316558838, "learning_rate": 2.6897214217098944e-05, "loss": 0.1965, "step": 1790 }, { "epoch": 2.58, "grad_norm": 2.7912063598632812, "learning_rate": 2.6817162984309958e-05, "loss": 0.2103, "step": 1795 }, { "epoch": 2.59, "grad_norm": 2.85927152633667, "learning_rate": 2.6737111751520975e-05, "loss": 0.2226, "step": 1800 }, { "epoch": 2.6, "grad_norm": 3.5677337646484375, "learning_rate": 2.6657060518731993e-05, "loss": 0.2193, "step": 1805 }, { "epoch": 2.61, "grad_norm": 5.31620979309082, "learning_rate": 2.6577009285943006e-05, "loss": 0.2569, "step": 1810 }, { "epoch": 2.61, "grad_norm": 5.1970038414001465, "learning_rate": 2.649695805315402e-05, "loss": 0.2235, "step": 1815 }, { "epoch": 2.62, "grad_norm": 3.6116130352020264, "learning_rate": 2.6416906820365034e-05, "loss": 0.2353, "step": 1820 }, { "epoch": 2.63, "grad_norm": 4.2939043045043945, "learning_rate": 2.6336855587576048e-05, "loss": 0.2448, "step": 1825 }, { "epoch": 2.63, "grad_norm": 3.7755072116851807, "learning_rate": 2.625680435478706e-05, "loss": 0.2131, "step": 1830 }, { "epoch": 2.64, "grad_norm": 4.578812122344971, "learning_rate": 2.6176753121998082e-05, "loss": 0.2167, "step": 1835 }, { "epoch": 2.65, "grad_norm": 4.904923439025879, "learning_rate": 2.6096701889209096e-05, "loss": 0.2228, "step": 1840 }, { "epoch": 2.66, "grad_norm": 5.128912448883057, "learning_rate": 2.601665065642011e-05, "loss": 0.2888, "step": 1845 }, { "epoch": 2.66, "grad_norm": 5.788363933563232, "learning_rate": 2.5936599423631124e-05, "loss": 0.2421, "step": 1850 }, { "epoch": 2.67, "grad_norm": 4.001156806945801, "learning_rate": 2.585654819084214e-05, "loss": 0.1997, "step": 1855 }, { "epoch": 2.68, "grad_norm": 4.3057475090026855, "learning_rate": 2.5776496958053155e-05, "loss": 0.2434, "step": 1860 }, { "epoch": 2.68, "grad_norm": 3.524348258972168, "learning_rate": 2.5696445725264168e-05, "loss": 0.2188, "step": 1865 }, { "epoch": 2.69, "grad_norm": 6.004559516906738, "learning_rate": 2.561639449247519e-05, "loss": 0.2426, "step": 1870 }, { "epoch": 2.7, "grad_norm": 4.429930686950684, "learning_rate": 2.5536343259686203e-05, "loss": 0.2306, "step": 1875 }, { "epoch": 2.71, "grad_norm": 5.706151008605957, "learning_rate": 2.5456292026897216e-05, "loss": 0.2194, "step": 1880 }, { "epoch": 2.71, "grad_norm": 4.148650169372559, "learning_rate": 2.537624079410823e-05, "loss": 0.2683, "step": 1885 }, { "epoch": 2.72, "grad_norm": 3.2449026107788086, "learning_rate": 2.5296189561319244e-05, "loss": 0.2539, "step": 1890 }, { "epoch": 2.73, "grad_norm": 3.6404850482940674, "learning_rate": 2.5216138328530258e-05, "loss": 0.2221, "step": 1895 }, { "epoch": 2.73, "grad_norm": 3.1382288932800293, "learning_rate": 2.513608709574127e-05, "loss": 0.2266, "step": 1900 }, { "epoch": 2.74, "grad_norm": 8.027711868286133, "learning_rate": 2.5056035862952292e-05, "loss": 0.2944, "step": 1905 }, { "epoch": 2.75, "grad_norm": 7.140124797821045, "learning_rate": 2.4975984630163306e-05, "loss": 0.2036, "step": 1910 }, { "epoch": 2.76, "grad_norm": 3.4655325412750244, "learning_rate": 2.489593339737432e-05, "loss": 0.1955, "step": 1915 }, { "epoch": 2.76, "grad_norm": 3.295433759689331, "learning_rate": 2.4815882164585337e-05, "loss": 0.2114, "step": 1920 }, { "epoch": 2.77, "grad_norm": 3.806304931640625, "learning_rate": 2.473583093179635e-05, "loss": 0.206, "step": 1925 }, { "epoch": 2.78, "grad_norm": 4.674000263214111, "learning_rate": 2.4655779699007368e-05, "loss": 0.2215, "step": 1930 }, { "epoch": 2.79, "grad_norm": 3.5063233375549316, "learning_rate": 2.4575728466218382e-05, "loss": 0.2583, "step": 1935 }, { "epoch": 2.79, "grad_norm": 3.4132816791534424, "learning_rate": 2.4495677233429396e-05, "loss": 0.2388, "step": 1940 }, { "epoch": 2.8, "grad_norm": 3.2140300273895264, "learning_rate": 2.441562600064041e-05, "loss": 0.2395, "step": 1945 }, { "epoch": 2.81, "grad_norm": 4.795976638793945, "learning_rate": 2.4335574767851427e-05, "loss": 0.2206, "step": 1950 }, { "epoch": 2.81, "grad_norm": 3.491682767868042, "learning_rate": 2.425552353506244e-05, "loss": 0.2553, "step": 1955 }, { "epoch": 2.82, "grad_norm": 4.174879550933838, "learning_rate": 2.4175472302273454e-05, "loss": 0.1969, "step": 1960 }, { "epoch": 2.83, "grad_norm": 3.776137590408325, "learning_rate": 2.409542106948447e-05, "loss": 0.2276, "step": 1965 }, { "epoch": 2.84, "grad_norm": 3.7050764560699463, "learning_rate": 2.4015369836695485e-05, "loss": 0.2001, "step": 1970 }, { "epoch": 2.84, "grad_norm": 3.4648373126983643, "learning_rate": 2.3935318603906502e-05, "loss": 0.2538, "step": 1975 }, { "epoch": 2.85, "grad_norm": 4.3064727783203125, "learning_rate": 2.3855267371117516e-05, "loss": 0.2579, "step": 1980 }, { "epoch": 2.86, "grad_norm": 2.671032428741455, "learning_rate": 2.3775216138328533e-05, "loss": 0.2443, "step": 1985 }, { "epoch": 2.86, "grad_norm": 4.2159013748168945, "learning_rate": 2.3695164905539547e-05, "loss": 0.2373, "step": 1990 }, { "epoch": 2.87, "grad_norm": 3.787076711654663, "learning_rate": 2.361511367275056e-05, "loss": 0.2179, "step": 1995 }, { "epoch": 2.88, "grad_norm": 3.971762180328369, "learning_rate": 2.3535062439961578e-05, "loss": 0.2356, "step": 2000 }, { "epoch": 2.89, "grad_norm": 5.022749900817871, "learning_rate": 2.3455011207172592e-05, "loss": 0.2167, "step": 2005 }, { "epoch": 2.89, "grad_norm": 4.616547107696533, "learning_rate": 2.3374959974383606e-05, "loss": 0.2266, "step": 2010 }, { "epoch": 2.9, "grad_norm": 4.522019386291504, "learning_rate": 2.329490874159462e-05, "loss": 0.247, "step": 2015 }, { "epoch": 2.91, "grad_norm": 5.141051292419434, "learning_rate": 2.3214857508805637e-05, "loss": 0.2028, "step": 2020 }, { "epoch": 2.91, "grad_norm": 3.577793836593628, "learning_rate": 2.313480627601665e-05, "loss": 0.1924, "step": 2025 }, { "epoch": 2.92, "grad_norm": 5.1364665031433105, "learning_rate": 2.3054755043227668e-05, "loss": 0.226, "step": 2030 }, { "epoch": 2.93, "grad_norm": 3.8625662326812744, "learning_rate": 2.297470381043868e-05, "loss": 0.2329, "step": 2035 }, { "epoch": 2.94, "grad_norm": 4.119937419891357, "learning_rate": 2.28946525776497e-05, "loss": 0.2037, "step": 2040 }, { "epoch": 2.94, "grad_norm": 3.1188371181488037, "learning_rate": 2.2814601344860712e-05, "loss": 0.231, "step": 2045 }, { "epoch": 2.95, "grad_norm": 4.263334274291992, "learning_rate": 2.2734550112071726e-05, "loss": 0.219, "step": 2050 }, { "epoch": 2.96, "grad_norm": 4.002464771270752, "learning_rate": 2.2654498879282743e-05, "loss": 0.1927, "step": 2055 }, { "epoch": 2.97, "grad_norm": 3.5694775581359863, "learning_rate": 2.2574447646493757e-05, "loss": 0.1803, "step": 2060 }, { "epoch": 2.97, "grad_norm": 4.048843860626221, "learning_rate": 2.249439641370477e-05, "loss": 0.1837, "step": 2065 }, { "epoch": 2.98, "grad_norm": 4.335817337036133, "learning_rate": 2.2414345180915788e-05, "loss": 0.227, "step": 2070 }, { "epoch": 2.99, "grad_norm": 4.292420864105225, "learning_rate": 2.2334293948126802e-05, "loss": 0.2535, "step": 2075 }, { "epoch": 2.99, "grad_norm": 3.625598430633545, "learning_rate": 2.2254242715337816e-05, "loss": 0.1633, "step": 2080 }, { "epoch": 3.0, "eval_accuracy": 0.9487854251012146, "eval_loss": 0.1372506320476532, "eval_runtime": 31.8832, "eval_samples_per_second": 309.881, "eval_steps_per_second": 9.692, "step": 2084 }, { "epoch": 3.0, "grad_norm": 4.9075140953063965, "learning_rate": 2.217419148254883e-05, "loss": 0.19, "step": 2085 }, { "epoch": 3.01, "grad_norm": 4.76453971862793, "learning_rate": 2.2094140249759847e-05, "loss": 0.214, "step": 2090 }, { "epoch": 3.02, "grad_norm": 3.710191011428833, "learning_rate": 2.2014089016970864e-05, "loss": 0.2197, "step": 2095 }, { "epoch": 3.02, "grad_norm": 3.287574529647827, "learning_rate": 2.1934037784181878e-05, "loss": 0.1939, "step": 2100 }, { "epoch": 3.03, "grad_norm": 3.7616758346557617, "learning_rate": 2.1853986551392895e-05, "loss": 0.209, "step": 2105 }, { "epoch": 3.04, "grad_norm": 3.6096699237823486, "learning_rate": 2.177393531860391e-05, "loss": 0.2195, "step": 2110 }, { "epoch": 3.04, "grad_norm": 4.259820461273193, "learning_rate": 2.1693884085814922e-05, "loss": 0.1813, "step": 2115 }, { "epoch": 3.05, "grad_norm": 4.710832118988037, "learning_rate": 2.1613832853025936e-05, "loss": 0.2054, "step": 2120 }, { "epoch": 3.06, "grad_norm": 2.757356882095337, "learning_rate": 2.1533781620236953e-05, "loss": 0.2276, "step": 2125 }, { "epoch": 3.07, "grad_norm": 4.743321418762207, "learning_rate": 2.1453730387447967e-05, "loss": 0.1603, "step": 2130 }, { "epoch": 3.07, "grad_norm": 3.536240339279175, "learning_rate": 2.137367915465898e-05, "loss": 0.1888, "step": 2135 }, { "epoch": 3.08, "grad_norm": 3.635094404220581, "learning_rate": 2.1293627921869998e-05, "loss": 0.1841, "step": 2140 }, { "epoch": 3.09, "grad_norm": 4.491457939147949, "learning_rate": 2.1213576689081012e-05, "loss": 0.2013, "step": 2145 }, { "epoch": 3.09, "grad_norm": 5.20548152923584, "learning_rate": 2.113352545629203e-05, "loss": 0.1618, "step": 2150 }, { "epoch": 3.1, "grad_norm": 3.6702117919921875, "learning_rate": 2.1053474223503043e-05, "loss": 0.2106, "step": 2155 }, { "epoch": 3.11, "grad_norm": 3.9622325897216797, "learning_rate": 2.097342299071406e-05, "loss": 0.2488, "step": 2160 }, { "epoch": 3.12, "grad_norm": 7.823854923248291, "learning_rate": 2.0893371757925074e-05, "loss": 0.2107, "step": 2165 }, { "epoch": 3.12, "grad_norm": 5.4744791984558105, "learning_rate": 2.0813320525136088e-05, "loss": 0.1888, "step": 2170 }, { "epoch": 3.13, "grad_norm": 3.024887800216675, "learning_rate": 2.0733269292347105e-05, "loss": 0.2051, "step": 2175 }, { "epoch": 3.14, "grad_norm": 3.444693088531494, "learning_rate": 2.065321805955812e-05, "loss": 0.2404, "step": 2180 }, { "epoch": 3.15, "grad_norm": 4.3029656410217285, "learning_rate": 2.0573166826769133e-05, "loss": 0.215, "step": 2185 }, { "epoch": 3.15, "grad_norm": 4.038111209869385, "learning_rate": 2.0493115593980146e-05, "loss": 0.2003, "step": 2190 }, { "epoch": 3.16, "grad_norm": 4.064023494720459, "learning_rate": 2.0413064361191164e-05, "loss": 0.1961, "step": 2195 }, { "epoch": 3.17, "grad_norm": 5.2245707511901855, "learning_rate": 2.0333013128402177e-05, "loss": 0.2172, "step": 2200 }, { "epoch": 3.17, "grad_norm": 4.670438289642334, "learning_rate": 2.025296189561319e-05, "loss": 0.1992, "step": 2205 }, { "epoch": 3.18, "grad_norm": 4.39680290222168, "learning_rate": 2.017291066282421e-05, "loss": 0.2174, "step": 2210 }, { "epoch": 3.19, "grad_norm": 6.914219379425049, "learning_rate": 2.0092859430035225e-05, "loss": 0.1968, "step": 2215 }, { "epoch": 3.2, "grad_norm": 3.2190115451812744, "learning_rate": 2.001280819724624e-05, "loss": 0.1939, "step": 2220 }, { "epoch": 3.2, "grad_norm": 3.638925075531006, "learning_rate": 1.9932756964457253e-05, "loss": 0.2431, "step": 2225 }, { "epoch": 3.21, "grad_norm": 5.030416965484619, "learning_rate": 1.985270573166827e-05, "loss": 0.2094, "step": 2230 }, { "epoch": 3.22, "grad_norm": 5.105839729309082, "learning_rate": 1.9772654498879284e-05, "loss": 0.2165, "step": 2235 }, { "epoch": 3.22, "grad_norm": 4.913294315338135, "learning_rate": 1.9692603266090298e-05, "loss": 0.2171, "step": 2240 }, { "epoch": 3.23, "grad_norm": 4.230659008026123, "learning_rate": 1.961255203330131e-05, "loss": 0.2088, "step": 2245 }, { "epoch": 3.24, "grad_norm": 4.271526336669922, "learning_rate": 1.953250080051233e-05, "loss": 0.215, "step": 2250 }, { "epoch": 3.25, "grad_norm": 6.460733413696289, "learning_rate": 1.9452449567723343e-05, "loss": 0.2241, "step": 2255 }, { "epoch": 3.25, "grad_norm": 2.8896567821502686, "learning_rate": 1.9372398334934356e-05, "loss": 0.1587, "step": 2260 }, { "epoch": 3.26, "grad_norm": 3.2169876098632812, "learning_rate": 1.9292347102145374e-05, "loss": 0.1587, "step": 2265 }, { "epoch": 3.27, "grad_norm": 4.299535274505615, "learning_rate": 1.921229586935639e-05, "loss": 0.1819, "step": 2270 }, { "epoch": 3.27, "grad_norm": 3.9862189292907715, "learning_rate": 1.9132244636567405e-05, "loss": 0.2099, "step": 2275 }, { "epoch": 3.28, "grad_norm": 5.323502540588379, "learning_rate": 1.905219340377842e-05, "loss": 0.222, "step": 2280 }, { "epoch": 3.29, "grad_norm": 3.4311234951019287, "learning_rate": 1.8972142170989436e-05, "loss": 0.1956, "step": 2285 }, { "epoch": 3.3, "grad_norm": 4.878343105316162, "learning_rate": 1.889209093820045e-05, "loss": 0.1814, "step": 2290 }, { "epoch": 3.3, "grad_norm": 2.903064489364624, "learning_rate": 1.8812039705411463e-05, "loss": 0.2397, "step": 2295 }, { "epoch": 3.31, "grad_norm": 5.286783695220947, "learning_rate": 1.873198847262248e-05, "loss": 0.2362, "step": 2300 }, { "epoch": 3.32, "grad_norm": 4.201813220977783, "learning_rate": 1.8651937239833494e-05, "loss": 0.2235, "step": 2305 }, { "epoch": 3.32, "grad_norm": 3.4148082733154297, "learning_rate": 1.8571886007044508e-05, "loss": 0.1922, "step": 2310 }, { "epoch": 3.33, "grad_norm": 4.562300682067871, "learning_rate": 1.8491834774255522e-05, "loss": 0.2013, "step": 2315 }, { "epoch": 3.34, "grad_norm": 6.004905700683594, "learning_rate": 1.841178354146654e-05, "loss": 0.2215, "step": 2320 }, { "epoch": 3.35, "grad_norm": 4.642991065979004, "learning_rate": 1.8331732308677556e-05, "loss": 0.2085, "step": 2325 }, { "epoch": 3.35, "grad_norm": 2.796497344970703, "learning_rate": 1.825168107588857e-05, "loss": 0.2126, "step": 2330 }, { "epoch": 3.36, "grad_norm": 6.009349346160889, "learning_rate": 1.8171629843099587e-05, "loss": 0.1906, "step": 2335 }, { "epoch": 3.37, "grad_norm": 4.415472507476807, "learning_rate": 1.80915786103106e-05, "loss": 0.2013, "step": 2340 }, { "epoch": 3.38, "grad_norm": 2.890207529067993, "learning_rate": 1.8011527377521615e-05, "loss": 0.2017, "step": 2345 }, { "epoch": 3.38, "grad_norm": 3.2712149620056152, "learning_rate": 1.793147614473263e-05, "loss": 0.1997, "step": 2350 }, { "epoch": 3.39, "grad_norm": 4.87721061706543, "learning_rate": 1.7851424911943646e-05, "loss": 0.1944, "step": 2355 }, { "epoch": 3.4, "grad_norm": 5.590481281280518, "learning_rate": 1.777137367915466e-05, "loss": 0.1749, "step": 2360 }, { "epoch": 3.4, "grad_norm": 3.1477975845336914, "learning_rate": 1.7691322446365673e-05, "loss": 0.1734, "step": 2365 }, { "epoch": 3.41, "grad_norm": 4.50333309173584, "learning_rate": 1.761127121357669e-05, "loss": 0.244, "step": 2370 }, { "epoch": 3.42, "grad_norm": 4.189910411834717, "learning_rate": 1.7531219980787704e-05, "loss": 0.2015, "step": 2375 }, { "epoch": 3.43, "grad_norm": 4.48671817779541, "learning_rate": 1.7451168747998718e-05, "loss": 0.1994, "step": 2380 }, { "epoch": 3.43, "grad_norm": 3.9251739978790283, "learning_rate": 1.7371117515209735e-05, "loss": 0.1798, "step": 2385 }, { "epoch": 3.44, "grad_norm": 2.792525291442871, "learning_rate": 1.7291066282420752e-05, "loss": 0.1628, "step": 2390 }, { "epoch": 3.45, "grad_norm": 3.325592041015625, "learning_rate": 1.7211015049631766e-05, "loss": 0.2069, "step": 2395 }, { "epoch": 3.45, "grad_norm": 3.9942626953125, "learning_rate": 1.713096381684278e-05, "loss": 0.1866, "step": 2400 }, { "epoch": 3.46, "grad_norm": 5.486047267913818, "learning_rate": 1.7050912584053797e-05, "loss": 0.2185, "step": 2405 }, { "epoch": 3.47, "grad_norm": 3.5321319103240967, "learning_rate": 1.697086135126481e-05, "loss": 0.2068, "step": 2410 }, { "epoch": 3.48, "grad_norm": 4.118142127990723, "learning_rate": 1.6890810118475825e-05, "loss": 0.2076, "step": 2415 }, { "epoch": 3.48, "grad_norm": 4.678371906280518, "learning_rate": 1.681075888568684e-05, "loss": 0.1948, "step": 2420 }, { "epoch": 3.49, "grad_norm": 5.298951148986816, "learning_rate": 1.6730707652897856e-05, "loss": 0.2142, "step": 2425 }, { "epoch": 3.5, "grad_norm": 4.5779900550842285, "learning_rate": 1.665065642010887e-05, "loss": 0.1994, "step": 2430 }, { "epoch": 3.5, "grad_norm": 4.762623310089111, "learning_rate": 1.6570605187319883e-05, "loss": 0.2213, "step": 2435 }, { "epoch": 3.51, "grad_norm": 4.956728458404541, "learning_rate": 1.64905539545309e-05, "loss": 0.1818, "step": 2440 }, { "epoch": 3.52, "grad_norm": 3.7195310592651367, "learning_rate": 1.6410502721741918e-05, "loss": 0.2171, "step": 2445 }, { "epoch": 3.53, "grad_norm": 3.115422010421753, "learning_rate": 1.633045148895293e-05, "loss": 0.1873, "step": 2450 }, { "epoch": 3.53, "grad_norm": 2.4611568450927734, "learning_rate": 1.6250400256163945e-05, "loss": 0.1999, "step": 2455 }, { "epoch": 3.54, "grad_norm": 7.129974842071533, "learning_rate": 1.6170349023374962e-05, "loss": 0.2039, "step": 2460 }, { "epoch": 3.55, "grad_norm": 3.4364309310913086, "learning_rate": 1.6090297790585976e-05, "loss": 0.2019, "step": 2465 }, { "epoch": 3.56, "grad_norm": 7.869508266448975, "learning_rate": 1.601024655779699e-05, "loss": 0.1678, "step": 2470 }, { "epoch": 3.56, "grad_norm": 4.7185378074646, "learning_rate": 1.5930195325008007e-05, "loss": 0.1934, "step": 2475 }, { "epoch": 3.57, "grad_norm": 7.357175350189209, "learning_rate": 1.585014409221902e-05, "loss": 0.1998, "step": 2480 }, { "epoch": 3.58, "grad_norm": 3.6080660820007324, "learning_rate": 1.5770092859430035e-05, "loss": 0.1949, "step": 2485 }, { "epoch": 3.58, "grad_norm": 2.9534220695495605, "learning_rate": 1.569004162664105e-05, "loss": 0.1772, "step": 2490 }, { "epoch": 3.59, "grad_norm": 4.7188401222229, "learning_rate": 1.5609990393852066e-05, "loss": 0.2164, "step": 2495 }, { "epoch": 3.6, "grad_norm": 5.8504180908203125, "learning_rate": 1.552993916106308e-05, "loss": 0.2283, "step": 2500 }, { "epoch": 3.61, "grad_norm": 4.23643684387207, "learning_rate": 1.5449887928274097e-05, "loss": 0.2003, "step": 2505 }, { "epoch": 3.61, "grad_norm": 2.19675350189209, "learning_rate": 1.5369836695485114e-05, "loss": 0.1997, "step": 2510 }, { "epoch": 3.62, "grad_norm": 5.1381330490112305, "learning_rate": 1.5289785462696128e-05, "loss": 0.195, "step": 2515 }, { "epoch": 3.63, "grad_norm": 3.739199161529541, "learning_rate": 1.5209734229907142e-05, "loss": 0.1596, "step": 2520 }, { "epoch": 3.63, "grad_norm": 4.581226348876953, "learning_rate": 1.5129682997118155e-05, "loss": 0.2086, "step": 2525 }, { "epoch": 3.64, "grad_norm": 5.416107177734375, "learning_rate": 1.5049631764329173e-05, "loss": 0.2517, "step": 2530 }, { "epoch": 3.65, "grad_norm": 6.070262908935547, "learning_rate": 1.4969580531540186e-05, "loss": 0.1801, "step": 2535 }, { "epoch": 3.66, "grad_norm": 4.063976764678955, "learning_rate": 1.48895292987512e-05, "loss": 0.2302, "step": 2540 }, { "epoch": 3.66, "grad_norm": 3.717087745666504, "learning_rate": 1.4809478065962217e-05, "loss": 0.2185, "step": 2545 }, { "epoch": 3.67, "grad_norm": 3.2319772243499756, "learning_rate": 1.4729426833173233e-05, "loss": 0.2609, "step": 2550 }, { "epoch": 3.68, "grad_norm": 3.7224340438842773, "learning_rate": 1.4649375600384247e-05, "loss": 0.1906, "step": 2555 }, { "epoch": 3.68, "grad_norm": 6.972284317016602, "learning_rate": 1.456932436759526e-05, "loss": 0.2232, "step": 2560 }, { "epoch": 3.69, "grad_norm": 3.514923095703125, "learning_rate": 1.4489273134806278e-05, "loss": 0.2081, "step": 2565 }, { "epoch": 3.7, "grad_norm": 5.140145301818848, "learning_rate": 1.4409221902017291e-05, "loss": 0.2099, "step": 2570 }, { "epoch": 3.71, "grad_norm": 2.977041482925415, "learning_rate": 1.4329170669228307e-05, "loss": 0.1689, "step": 2575 }, { "epoch": 3.71, "grad_norm": 2.9438095092773438, "learning_rate": 1.4249119436439322e-05, "loss": 0.1788, "step": 2580 }, { "epoch": 3.72, "grad_norm": 3.311598777770996, "learning_rate": 1.4169068203650338e-05, "loss": 0.1787, "step": 2585 }, { "epoch": 3.73, "grad_norm": 4.066298961639404, "learning_rate": 1.4089016970861352e-05, "loss": 0.2049, "step": 2590 }, { "epoch": 3.74, "grad_norm": 3.8641276359558105, "learning_rate": 1.4008965738072365e-05, "loss": 0.2064, "step": 2595 }, { "epoch": 3.74, "grad_norm": 4.785098075866699, "learning_rate": 1.3928914505283383e-05, "loss": 0.213, "step": 2600 }, { "epoch": 3.75, "grad_norm": 3.3832712173461914, "learning_rate": 1.3848863272494398e-05, "loss": 0.203, "step": 2605 }, { "epoch": 3.76, "grad_norm": 3.8471434116363525, "learning_rate": 1.3768812039705412e-05, "loss": 0.2192, "step": 2610 }, { "epoch": 3.76, "grad_norm": 4.769313335418701, "learning_rate": 1.3688760806916426e-05, "loss": 0.2191, "step": 2615 }, { "epoch": 3.77, "grad_norm": 3.5882818698883057, "learning_rate": 1.3608709574127443e-05, "loss": 0.1952, "step": 2620 }, { "epoch": 3.78, "grad_norm": 4.177798271179199, "learning_rate": 1.3528658341338457e-05, "loss": 0.2209, "step": 2625 }, { "epoch": 3.79, "grad_norm": 5.218222618103027, "learning_rate": 1.3448607108549472e-05, "loss": 0.1953, "step": 2630 }, { "epoch": 3.79, "grad_norm": 4.669002056121826, "learning_rate": 1.3368555875760488e-05, "loss": 0.2017, "step": 2635 }, { "epoch": 3.8, "grad_norm": 4.992402076721191, "learning_rate": 1.3288504642971503e-05, "loss": 0.2702, "step": 2640 }, { "epoch": 3.81, "grad_norm": 3.818152666091919, "learning_rate": 1.3208453410182517e-05, "loss": 0.2195, "step": 2645 }, { "epoch": 3.81, "grad_norm": 3.825201988220215, "learning_rate": 1.312840217739353e-05, "loss": 0.2086, "step": 2650 }, { "epoch": 3.82, "grad_norm": 3.2888553142547607, "learning_rate": 1.3048350944604548e-05, "loss": 0.1899, "step": 2655 }, { "epoch": 3.83, "grad_norm": 4.896663665771484, "learning_rate": 1.2968299711815562e-05, "loss": 0.2154, "step": 2660 }, { "epoch": 3.84, "grad_norm": 3.9895691871643066, "learning_rate": 1.2888248479026577e-05, "loss": 0.2251, "step": 2665 }, { "epoch": 3.84, "grad_norm": 3.9652981758117676, "learning_rate": 1.2808197246237594e-05, "loss": 0.2116, "step": 2670 }, { "epoch": 3.85, "grad_norm": 4.93154764175415, "learning_rate": 1.2728146013448608e-05, "loss": 0.2597, "step": 2675 }, { "epoch": 3.86, "grad_norm": 4.236401081085205, "learning_rate": 1.2648094780659622e-05, "loss": 0.2312, "step": 2680 }, { "epoch": 3.86, "grad_norm": 3.95443058013916, "learning_rate": 1.2568043547870636e-05, "loss": 0.1696, "step": 2685 }, { "epoch": 3.87, "grad_norm": 2.7311601638793945, "learning_rate": 1.2487992315081653e-05, "loss": 0.1625, "step": 2690 }, { "epoch": 3.88, "grad_norm": 3.6803927421569824, "learning_rate": 1.2407941082292668e-05, "loss": 0.2069, "step": 2695 }, { "epoch": 3.89, "grad_norm": 3.391956329345703, "learning_rate": 1.2327889849503684e-05, "loss": 0.1779, "step": 2700 }, { "epoch": 3.89, "grad_norm": 3.478215456008911, "learning_rate": 1.2247838616714698e-05, "loss": 0.1874, "step": 2705 }, { "epoch": 3.9, "grad_norm": 2.4775846004486084, "learning_rate": 1.2167787383925713e-05, "loss": 0.1953, "step": 2710 }, { "epoch": 3.91, "grad_norm": 4.715533256530762, "learning_rate": 1.2087736151136727e-05, "loss": 0.1863, "step": 2715 }, { "epoch": 3.92, "grad_norm": 4.083915710449219, "learning_rate": 1.2007684918347743e-05, "loss": 0.1871, "step": 2720 }, { "epoch": 3.92, "grad_norm": 2.535428285598755, "learning_rate": 1.1927633685558758e-05, "loss": 0.2084, "step": 2725 }, { "epoch": 3.93, "grad_norm": 5.987590789794922, "learning_rate": 1.1847582452769774e-05, "loss": 0.172, "step": 2730 }, { "epoch": 3.94, "grad_norm": 4.185674667358398, "learning_rate": 1.1767531219980789e-05, "loss": 0.2106, "step": 2735 }, { "epoch": 3.94, "grad_norm": 3.0659992694854736, "learning_rate": 1.1687479987191803e-05, "loss": 0.1839, "step": 2740 }, { "epoch": 3.95, "grad_norm": 8.405370712280273, "learning_rate": 1.1607428754402818e-05, "loss": 0.2449, "step": 2745 }, { "epoch": 3.96, "grad_norm": 5.262624740600586, "learning_rate": 1.1527377521613834e-05, "loss": 0.1982, "step": 2750 }, { "epoch": 3.97, "grad_norm": 3.3970797061920166, "learning_rate": 1.144732628882485e-05, "loss": 0.2383, "step": 2755 }, { "epoch": 3.97, "grad_norm": 4.604133129119873, "learning_rate": 1.1367275056035863e-05, "loss": 0.211, "step": 2760 }, { "epoch": 3.98, "grad_norm": 4.767920970916748, "learning_rate": 1.1287223823246879e-05, "loss": 0.2111, "step": 2765 }, { "epoch": 3.99, "grad_norm": 4.075857162475586, "learning_rate": 1.1207172590457894e-05, "loss": 0.2011, "step": 2770 }, { "epoch": 3.99, "grad_norm": 3.293419599533081, "learning_rate": 1.1127121357668908e-05, "loss": 0.1943, "step": 2775 }, { "epoch": 4.0, "eval_accuracy": 0.9510121457489878, "eval_loss": 0.131936714053154, "eval_runtime": 31.7023, "eval_samples_per_second": 311.649, "eval_steps_per_second": 9.747, "step": 2779 }, { "epoch": 4.0, "grad_norm": 2.756840229034424, "learning_rate": 1.1047070124879923e-05, "loss": 0.2012, "step": 2780 }, { "epoch": 4.01, "grad_norm": 4.239038467407227, "learning_rate": 1.0967018892090939e-05, "loss": 0.1637, "step": 2785 }, { "epoch": 4.02, "grad_norm": 3.6597139835357666, "learning_rate": 1.0886967659301954e-05, "loss": 0.1848, "step": 2790 }, { "epoch": 4.02, "grad_norm": 3.050875425338745, "learning_rate": 1.0806916426512968e-05, "loss": 0.1565, "step": 2795 }, { "epoch": 4.03, "grad_norm": 4.3006463050842285, "learning_rate": 1.0726865193723984e-05, "loss": 0.2116, "step": 2800 }, { "epoch": 4.04, "grad_norm": 4.682863712310791, "learning_rate": 1.0646813960934999e-05, "loss": 0.1974, "step": 2805 }, { "epoch": 4.04, "grad_norm": 3.8604190349578857, "learning_rate": 1.0566762728146015e-05, "loss": 0.1972, "step": 2810 }, { "epoch": 4.05, "grad_norm": 4.325167655944824, "learning_rate": 1.048671149535703e-05, "loss": 0.1732, "step": 2815 }, { "epoch": 4.06, "grad_norm": 6.881094932556152, "learning_rate": 1.0406660262568044e-05, "loss": 0.2527, "step": 2820 }, { "epoch": 4.07, "grad_norm": 6.374682426452637, "learning_rate": 1.032660902977906e-05, "loss": 0.224, "step": 2825 }, { "epoch": 4.07, "grad_norm": 3.154886245727539, "learning_rate": 1.0246557796990073e-05, "loss": 0.1613, "step": 2830 }, { "epoch": 4.08, "grad_norm": 5.165164470672607, "learning_rate": 1.0166506564201089e-05, "loss": 0.225, "step": 2835 }, { "epoch": 4.09, "grad_norm": 3.388165235519409, "learning_rate": 1.0086455331412104e-05, "loss": 0.2189, "step": 2840 }, { "epoch": 4.09, "grad_norm": 4.795779705047607, "learning_rate": 1.000640409862312e-05, "loss": 0.2027, "step": 2845 }, { "epoch": 4.1, "grad_norm": 3.341182231903076, "learning_rate": 9.926352865834135e-06, "loss": 0.1931, "step": 2850 }, { "epoch": 4.11, "grad_norm": 1.956528902053833, "learning_rate": 9.846301633045149e-06, "loss": 0.2208, "step": 2855 }, { "epoch": 4.12, "grad_norm": 6.8234076499938965, "learning_rate": 9.766250400256164e-06, "loss": 0.2388, "step": 2860 }, { "epoch": 4.12, "grad_norm": 2.924370527267456, "learning_rate": 9.686199167467178e-06, "loss": 0.2046, "step": 2865 }, { "epoch": 4.13, "grad_norm": 5.049492359161377, "learning_rate": 9.606147934678195e-06, "loss": 0.1876, "step": 2870 }, { "epoch": 4.14, "grad_norm": 4.749929428100586, "learning_rate": 9.52609670188921e-06, "loss": 0.1649, "step": 2875 }, { "epoch": 4.15, "grad_norm": 3.702878475189209, "learning_rate": 9.446045469100225e-06, "loss": 0.2309, "step": 2880 }, { "epoch": 4.15, "grad_norm": 6.8818745613098145, "learning_rate": 9.36599423631124e-06, "loss": 0.2012, "step": 2885 }, { "epoch": 4.16, "grad_norm": 3.418677568435669, "learning_rate": 9.285943003522254e-06, "loss": 0.2209, "step": 2890 }, { "epoch": 4.17, "grad_norm": 3.8437540531158447, "learning_rate": 9.20589177073327e-06, "loss": 0.1668, "step": 2895 }, { "epoch": 4.17, "grad_norm": 3.2534446716308594, "learning_rate": 9.125840537944285e-06, "loss": 0.2346, "step": 2900 }, { "epoch": 4.18, "grad_norm": 4.049452781677246, "learning_rate": 9.0457893051553e-06, "loss": 0.1752, "step": 2905 }, { "epoch": 4.19, "grad_norm": 4.121111869812012, "learning_rate": 8.965738072366314e-06, "loss": 0.2057, "step": 2910 }, { "epoch": 4.2, "grad_norm": 5.423705577850342, "learning_rate": 8.88568683957733e-06, "loss": 0.1958, "step": 2915 }, { "epoch": 4.2, "grad_norm": 3.153987407684326, "learning_rate": 8.805635606788345e-06, "loss": 0.1547, "step": 2920 }, { "epoch": 4.21, "grad_norm": 3.7586491107940674, "learning_rate": 8.725584373999359e-06, "loss": 0.224, "step": 2925 }, { "epoch": 4.22, "grad_norm": 4.077225208282471, "learning_rate": 8.645533141210376e-06, "loss": 0.2113, "step": 2930 }, { "epoch": 4.22, "grad_norm": 6.970191478729248, "learning_rate": 8.56548190842139e-06, "loss": 0.2032, "step": 2935 }, { "epoch": 4.23, "grad_norm": 4.3456926345825195, "learning_rate": 8.485430675632405e-06, "loss": 0.2111, "step": 2940 }, { "epoch": 4.24, "grad_norm": 3.5162301063537598, "learning_rate": 8.40537944284342e-06, "loss": 0.1873, "step": 2945 }, { "epoch": 4.25, "grad_norm": 5.653372764587402, "learning_rate": 8.325328210054435e-06, "loss": 0.178, "step": 2950 }, { "epoch": 4.25, "grad_norm": 2.084319829940796, "learning_rate": 8.24527697726545e-06, "loss": 0.185, "step": 2955 }, { "epoch": 4.26, "grad_norm": 3.9863054752349854, "learning_rate": 8.165225744476466e-06, "loss": 0.1945, "step": 2960 }, { "epoch": 4.27, "grad_norm": 6.000556468963623, "learning_rate": 8.085174511687481e-06, "loss": 0.1823, "step": 2965 }, { "epoch": 4.27, "grad_norm": 3.515742778778076, "learning_rate": 8.005123278898495e-06, "loss": 0.1957, "step": 2970 }, { "epoch": 4.28, "grad_norm": 2.8108863830566406, "learning_rate": 7.92507204610951e-06, "loss": 0.1838, "step": 2975 }, { "epoch": 4.29, "grad_norm": 5.262875556945801, "learning_rate": 7.845020813320524e-06, "loss": 0.2389, "step": 2980 }, { "epoch": 4.3, "grad_norm": 5.4690752029418945, "learning_rate": 7.76496958053154e-06, "loss": 0.1823, "step": 2985 }, { "epoch": 4.3, "grad_norm": 2.1274213790893555, "learning_rate": 7.684918347742557e-06, "loss": 0.1233, "step": 2990 }, { "epoch": 4.31, "grad_norm": 6.855415344238281, "learning_rate": 7.604867114953571e-06, "loss": 0.2284, "step": 2995 }, { "epoch": 4.32, "grad_norm": 5.152151584625244, "learning_rate": 7.524815882164586e-06, "loss": 0.1856, "step": 3000 }, { "epoch": 4.33, "grad_norm": 4.211722373962402, "learning_rate": 7.4447646493756e-06, "loss": 0.2111, "step": 3005 }, { "epoch": 4.33, "grad_norm": 4.821152210235596, "learning_rate": 7.364713416586616e-06, "loss": 0.1541, "step": 3010 }, { "epoch": 4.34, "grad_norm": 3.2400951385498047, "learning_rate": 7.28466218379763e-06, "loss": 0.21, "step": 3015 }, { "epoch": 4.35, "grad_norm": 3.82334566116333, "learning_rate": 7.204610951008646e-06, "loss": 0.1835, "step": 3020 }, { "epoch": 4.35, "grad_norm": 4.301241397857666, "learning_rate": 7.124559718219661e-06, "loss": 0.2246, "step": 3025 }, { "epoch": 4.36, "grad_norm": 3.4558205604553223, "learning_rate": 7.044508485430676e-06, "loss": 0.1766, "step": 3030 }, { "epoch": 4.37, "grad_norm": 3.872791290283203, "learning_rate": 6.964457252641691e-06, "loss": 0.2126, "step": 3035 }, { "epoch": 4.38, "grad_norm": 2.319420099258423, "learning_rate": 6.884406019852706e-06, "loss": 0.179, "step": 3040 }, { "epoch": 4.38, "grad_norm": 6.737104892730713, "learning_rate": 6.8043547870637215e-06, "loss": 0.1882, "step": 3045 }, { "epoch": 4.39, "grad_norm": 4.559133052825928, "learning_rate": 6.724303554274736e-06, "loss": 0.1808, "step": 3050 }, { "epoch": 4.4, "grad_norm": 3.060370922088623, "learning_rate": 6.644252321485752e-06, "loss": 0.1923, "step": 3055 }, { "epoch": 4.4, "grad_norm": 5.091296672821045, "learning_rate": 6.564201088696765e-06, "loss": 0.2012, "step": 3060 }, { "epoch": 4.41, "grad_norm": 2.942782163619995, "learning_rate": 6.484149855907781e-06, "loss": 0.1731, "step": 3065 }, { "epoch": 4.42, "grad_norm": 4.692785263061523, "learning_rate": 6.404098623118797e-06, "loss": 0.1765, "step": 3070 }, { "epoch": 4.43, "grad_norm": 4.15416145324707, "learning_rate": 6.324047390329811e-06, "loss": 0.168, "step": 3075 }, { "epoch": 4.43, "grad_norm": 4.836540699005127, "learning_rate": 6.2439961575408265e-06, "loss": 0.1884, "step": 3080 }, { "epoch": 4.44, "grad_norm": 5.723465442657471, "learning_rate": 6.163944924751842e-06, "loss": 0.2006, "step": 3085 }, { "epoch": 4.45, "grad_norm": 3.738910675048828, "learning_rate": 6.083893691962857e-06, "loss": 0.152, "step": 3090 }, { "epoch": 4.45, "grad_norm": 4.6227641105651855, "learning_rate": 6.003842459173871e-06, "loss": 0.1885, "step": 3095 }, { "epoch": 4.46, "grad_norm": 4.877871036529541, "learning_rate": 5.923791226384887e-06, "loss": 0.1635, "step": 3100 }, { "epoch": 4.47, "grad_norm": 3.391716480255127, "learning_rate": 5.843739993595901e-06, "loss": 0.1917, "step": 3105 }, { "epoch": 4.48, "grad_norm": 3.0858306884765625, "learning_rate": 5.763688760806917e-06, "loss": 0.1981, "step": 3110 }, { "epoch": 4.48, "grad_norm": 3.075488805770874, "learning_rate": 5.6836375280179315e-06, "loss": 0.175, "step": 3115 }, { "epoch": 4.49, "grad_norm": 4.415194988250732, "learning_rate": 5.603586295228947e-06, "loss": 0.2039, "step": 3120 }, { "epoch": 4.5, "grad_norm": 4.507144451141357, "learning_rate": 5.523535062439962e-06, "loss": 0.1816, "step": 3125 }, { "epoch": 4.51, "grad_norm": 4.327670097351074, "learning_rate": 5.443483829650977e-06, "loss": 0.2072, "step": 3130 }, { "epoch": 4.51, "grad_norm": 3.314438819885254, "learning_rate": 5.363432596861992e-06, "loss": 0.1997, "step": 3135 }, { "epoch": 4.52, "grad_norm": 3.981945753097534, "learning_rate": 5.283381364073007e-06, "loss": 0.1643, "step": 3140 }, { "epoch": 4.53, "grad_norm": 3.4533607959747314, "learning_rate": 5.203330131284022e-06, "loss": 0.1503, "step": 3145 }, { "epoch": 4.53, "grad_norm": 3.6115882396698, "learning_rate": 5.123278898495037e-06, "loss": 0.1712, "step": 3150 }, { "epoch": 4.54, "grad_norm": 2.636838912963867, "learning_rate": 5.043227665706052e-06, "loss": 0.1828, "step": 3155 }, { "epoch": 4.55, "grad_norm": 3.045761823654175, "learning_rate": 4.9631764329170676e-06, "loss": 0.167, "step": 3160 }, { "epoch": 4.56, "grad_norm": 5.738334655761719, "learning_rate": 4.883125200128082e-06, "loss": 0.2237, "step": 3165 }, { "epoch": 4.56, "grad_norm": 2.163240909576416, "learning_rate": 4.803073967339098e-06, "loss": 0.1411, "step": 3170 }, { "epoch": 4.57, "grad_norm": 5.213181495666504, "learning_rate": 4.723022734550112e-06, "loss": 0.1874, "step": 3175 }, { "epoch": 4.58, "grad_norm": 3.869131565093994, "learning_rate": 4.642971501761127e-06, "loss": 0.1756, "step": 3180 }, { "epoch": 4.58, "grad_norm": 3.244732618331909, "learning_rate": 4.5629202689721425e-06, "loss": 0.1829, "step": 3185 }, { "epoch": 4.59, "grad_norm": 3.5364272594451904, "learning_rate": 4.482869036183157e-06, "loss": 0.1861, "step": 3190 }, { "epoch": 4.6, "grad_norm": 2.5283873081207275, "learning_rate": 4.402817803394173e-06, "loss": 0.1931, "step": 3195 }, { "epoch": 4.61, "grad_norm": 3.36181902885437, "learning_rate": 4.322766570605188e-06, "loss": 0.2183, "step": 3200 }, { "epoch": 4.61, "grad_norm": 5.513607025146484, "learning_rate": 4.242715337816203e-06, "loss": 0.1717, "step": 3205 }, { "epoch": 4.62, "grad_norm": 5.976490497589111, "learning_rate": 4.162664105027217e-06, "loss": 0.202, "step": 3210 }, { "epoch": 4.63, "grad_norm": 3.3449387550354004, "learning_rate": 4.082612872238233e-06, "loss": 0.2165, "step": 3215 }, { "epoch": 4.63, "grad_norm": 3.3972129821777344, "learning_rate": 4.0025616394492475e-06, "loss": 0.1994, "step": 3220 }, { "epoch": 4.64, "grad_norm": 4.022273540496826, "learning_rate": 3.922510406660262e-06, "loss": 0.168, "step": 3225 }, { "epoch": 4.65, "grad_norm": 3.2063329219818115, "learning_rate": 3.8424591738712785e-06, "loss": 0.1862, "step": 3230 }, { "epoch": 4.66, "grad_norm": 3.1869962215423584, "learning_rate": 3.762407941082293e-06, "loss": 0.1583, "step": 3235 }, { "epoch": 4.66, "grad_norm": 3.648125171661377, "learning_rate": 3.682356708293308e-06, "loss": 0.2026, "step": 3240 }, { "epoch": 4.67, "grad_norm": 4.182619571685791, "learning_rate": 3.602305475504323e-06, "loss": 0.1711, "step": 3245 }, { "epoch": 4.68, "grad_norm": 3.2886900901794434, "learning_rate": 3.522254242715338e-06, "loss": 0.1778, "step": 3250 }, { "epoch": 4.69, "grad_norm": 3.8204097747802734, "learning_rate": 3.442203009926353e-06, "loss": 0.1906, "step": 3255 }, { "epoch": 4.69, "grad_norm": 4.073367595672607, "learning_rate": 3.362151777137368e-06, "loss": 0.1693, "step": 3260 }, { "epoch": 4.7, "grad_norm": 4.779504299163818, "learning_rate": 3.2821005443483827e-06, "loss": 0.2031, "step": 3265 }, { "epoch": 4.71, "grad_norm": 4.730034828186035, "learning_rate": 3.2020493115593986e-06, "loss": 0.1731, "step": 3270 }, { "epoch": 4.71, "grad_norm": 4.198641300201416, "learning_rate": 3.1219980787704133e-06, "loss": 0.1982, "step": 3275 }, { "epoch": 4.72, "grad_norm": 3.796201229095459, "learning_rate": 3.0419468459814283e-06, "loss": 0.2502, "step": 3280 }, { "epoch": 4.73, "grad_norm": 3.4022860527038574, "learning_rate": 2.9618956131924434e-06, "loss": 0.1746, "step": 3285 }, { "epoch": 4.74, "grad_norm": 3.493821859359741, "learning_rate": 2.8818443804034585e-06, "loss": 0.1862, "step": 3290 }, { "epoch": 4.74, "grad_norm": 4.883081436157227, "learning_rate": 2.8017931476144735e-06, "loss": 0.1832, "step": 3295 }, { "epoch": 4.75, "grad_norm": 4.014003753662109, "learning_rate": 2.7217419148254886e-06, "loss": 0.2232, "step": 3300 }, { "epoch": 4.76, "grad_norm": 3.3797993659973145, "learning_rate": 2.6416906820365037e-06, "loss": 0.1791, "step": 3305 }, { "epoch": 4.76, "grad_norm": 2.9076929092407227, "learning_rate": 2.5616394492475183e-06, "loss": 0.1557, "step": 3310 }, { "epoch": 4.77, "grad_norm": 5.119110584259033, "learning_rate": 2.4815882164585338e-06, "loss": 0.1989, "step": 3315 }, { "epoch": 4.78, "grad_norm": 3.889577627182007, "learning_rate": 2.401536983669549e-06, "loss": 0.1771, "step": 3320 }, { "epoch": 4.79, "grad_norm": 2.979879379272461, "learning_rate": 2.3214857508805635e-06, "loss": 0.2187, "step": 3325 }, { "epoch": 4.79, "grad_norm": 4.31455135345459, "learning_rate": 2.2414345180915786e-06, "loss": 0.1818, "step": 3330 }, { "epoch": 4.8, "grad_norm": 5.267322540283203, "learning_rate": 2.161383285302594e-06, "loss": 0.1564, "step": 3335 }, { "epoch": 4.81, "grad_norm": 4.620851516723633, "learning_rate": 2.0813320525136087e-06, "loss": 0.2058, "step": 3340 }, { "epoch": 4.81, "grad_norm": 3.6133904457092285, "learning_rate": 2.0012808197246238e-06, "loss": 0.1678, "step": 3345 }, { "epoch": 4.82, "grad_norm": 2.955531358718872, "learning_rate": 1.9212295869356392e-06, "loss": 0.1771, "step": 3350 }, { "epoch": 4.83, "grad_norm": 5.3159403800964355, "learning_rate": 1.841178354146654e-06, "loss": 0.2387, "step": 3355 }, { "epoch": 4.84, "grad_norm": 3.5263235569000244, "learning_rate": 1.761127121357669e-06, "loss": 0.2061, "step": 3360 }, { "epoch": 4.84, "grad_norm": 3.794788122177124, "learning_rate": 1.681075888568684e-06, "loss": 0.1975, "step": 3365 }, { "epoch": 4.85, "grad_norm": 3.7242631912231445, "learning_rate": 1.6010246557796993e-06, "loss": 0.202, "step": 3370 }, { "epoch": 4.86, "grad_norm": 3.291221857070923, "learning_rate": 1.5209734229907142e-06, "loss": 0.1749, "step": 3375 }, { "epoch": 4.87, "grad_norm": 7.191506385803223, "learning_rate": 1.4409221902017292e-06, "loss": 0.1787, "step": 3380 }, { "epoch": 4.87, "grad_norm": 3.5962772369384766, "learning_rate": 1.3608709574127443e-06, "loss": 0.1894, "step": 3385 }, { "epoch": 4.88, "grad_norm": 3.013857126235962, "learning_rate": 1.2808197246237591e-06, "loss": 0.1439, "step": 3390 }, { "epoch": 4.89, "grad_norm": 3.8775179386138916, "learning_rate": 1.2007684918347744e-06, "loss": 0.1709, "step": 3395 }, { "epoch": 4.89, "grad_norm": 5.876482963562012, "learning_rate": 1.1207172590457893e-06, "loss": 0.1823, "step": 3400 }, { "epoch": 4.9, "grad_norm": 3.76519513130188, "learning_rate": 1.0406660262568043e-06, "loss": 0.1932, "step": 3405 }, { "epoch": 4.91, "grad_norm": 3.4437146186828613, "learning_rate": 9.606147934678196e-07, "loss": 0.2059, "step": 3410 }, { "epoch": 4.92, "grad_norm": 4.459022045135498, "learning_rate": 8.805635606788345e-07, "loss": 0.2016, "step": 3415 }, { "epoch": 4.92, "grad_norm": 3.656373977661133, "learning_rate": 8.005123278898497e-07, "loss": 0.1869, "step": 3420 }, { "epoch": 4.93, "grad_norm": 2.2337965965270996, "learning_rate": 7.204610951008646e-07, "loss": 0.1501, "step": 3425 }, { "epoch": 4.94, "grad_norm": 5.598134994506836, "learning_rate": 6.404098623118796e-07, "loss": 0.1659, "step": 3430 }, { "epoch": 4.94, "grad_norm": 4.543219089508057, "learning_rate": 5.603586295228946e-07, "loss": 0.2164, "step": 3435 }, { "epoch": 4.95, "grad_norm": 4.817913055419922, "learning_rate": 4.803073967339098e-07, "loss": 0.1332, "step": 3440 }, { "epoch": 4.96, "grad_norm": 6.280834674835205, "learning_rate": 4.002561639449248e-07, "loss": 0.2054, "step": 3445 }, { "epoch": 4.97, "grad_norm": 3.0518364906311035, "learning_rate": 3.202049311559398e-07, "loss": 0.1904, "step": 3450 }, { "epoch": 4.97, "grad_norm": 3.695298910140991, "learning_rate": 2.401536983669549e-07, "loss": 0.1784, "step": 3455 }, { "epoch": 4.98, "grad_norm": 6.226070880889893, "learning_rate": 1.601024655779699e-07, "loss": 0.1871, "step": 3460 }, { "epoch": 4.99, "grad_norm": 4.446568489074707, "learning_rate": 8.005123278898495e-08, "loss": 0.2494, "step": 3465 }, { "epoch": 4.99, "grad_norm": 5.050913333892822, "learning_rate": 0.0, "loss": 0.2138, "step": 3470 }, { "epoch": 4.99, "eval_accuracy": 0.9518218623481781, "eval_loss": 0.1259032040834427, "eval_runtime": 31.3409, "eval_samples_per_second": 315.243, "eval_steps_per_second": 9.859, "step": 3470 }, { "epoch": 4.99, "step": 3470, "total_flos": 1.1039888050539651e+19, "train_loss": 0.287632371283402, "train_runtime": 2790.3597, "train_samples_per_second": 159.318, "train_steps_per_second": 1.244 } ], "logging_steps": 5, "max_steps": 3470, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.1039888050539651e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }