{
  "best_metric": 0.9518218623481781,
  "best_model_checkpoint": "swin-tiny-patch4-window7-224-hotel_images_classifier_v2/checkpoint-3470",
  "epoch": 4.9946023749550195,
  "eval_steps": 500,
  "global_step": 3470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 4.747580528259277,
      "learning_rate": 7.204610951008646e-07,
      "loss": 1.9842,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 5.086691379547119,
      "learning_rate": 1.4409221902017292e-06,
      "loss": 1.9811,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 4.6829681396484375,
      "learning_rate": 2.161383285302594e-06,
      "loss": 1.9832,
      "step": 15
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.578265190124512,
      "learning_rate": 2.8818443804034585e-06,
      "loss": 1.9501,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.904880523681641,
      "learning_rate": 3.602305475504323e-06,
      "loss": 1.917,
      "step": 25
    },
    {
      "epoch": 0.04,
      "grad_norm": 5.561208248138428,
      "learning_rate": 4.322766570605188e-06,
      "loss": 1.868,
      "step": 30
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.837254047393799,
      "learning_rate": 5.043227665706052e-06,
      "loss": 1.8244,
      "step": 35
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.515142440795898,
      "learning_rate": 5.763688760806917e-06,
      "loss": 1.7756,
      "step": 40
    },
    {
      "epoch": 0.06,
      "grad_norm": 6.021132946014404,
      "learning_rate": 6.484149855907781e-06,
      "loss": 1.7231,
      "step": 45
    },
    {
      "epoch": 0.07,
      "grad_norm": 6.325433254241943,
      "learning_rate": 7.204610951008646e-06,
      "loss": 1.6389,
      "step": 50
    },
    {
      "epoch": 0.08,
      "grad_norm": 6.28499698638916,
      "learning_rate": 7.92507204610951e-06,
      "loss": 1.5651,
      "step": 55
    },
    {
      "epoch": 0.09,
      "grad_norm": 4.940927982330322,
      "learning_rate": 8.645533141210376e-06,
      "loss": 1.5001,
      "step": 60
    },
    {
      "epoch": 0.09,
      "grad_norm": 4.624394416809082,
      "learning_rate": 9.36599423631124e-06,
      "loss": 1.3998,
      "step": 65
    },
    {
      "epoch": 0.1,
      "grad_norm": 4.9989118576049805,
      "learning_rate": 1.0086455331412104e-05,
      "loss": 1.2913,
      "step": 70
    },
    {
      "epoch": 0.11,
      "grad_norm": 6.203399658203125,
      "learning_rate": 1.0806916426512968e-05,
      "loss": 1.1994,
      "step": 75
    },
    {
      "epoch": 0.12,
      "grad_norm": 4.825283050537109,
      "learning_rate": 1.1527377521613834e-05,
      "loss": 1.0557,
      "step": 80
    },
    {
      "epoch": 0.12,
      "grad_norm": 6.639811992645264,
      "learning_rate": 1.2247838616714698e-05,
      "loss": 1.0077,
      "step": 85
    },
    {
      "epoch": 0.13,
      "grad_norm": 5.167383670806885,
      "learning_rate": 1.2968299711815562e-05,
      "loss": 0.9303,
      "step": 90
    },
    {
      "epoch": 0.14,
      "grad_norm": 6.177196979522705,
      "learning_rate": 1.3688760806916426e-05,
      "loss": 0.7967,
      "step": 95
    },
    {
      "epoch": 0.14,
      "grad_norm": 5.489429950714111,
      "learning_rate": 1.4409221902017291e-05,
      "loss": 0.7269,
      "step": 100
    },
    {
      "epoch": 0.15,
      "grad_norm": 5.555374622344971,
      "learning_rate": 1.5129682997118155e-05,
      "loss": 0.7176,
      "step": 105
    },
    {
      "epoch": 0.16,
      "grad_norm": 11.141295433044434,
      "learning_rate": 1.585014409221902e-05,
      "loss": 0.6796,
      "step": 110
    },
    {
      "epoch": 0.17,
      "grad_norm": 7.412641525268555,
      "learning_rate": 1.6570605187319883e-05,
      "loss": 0.6028,
      "step": 115
    },
    {
      "epoch": 0.17,
      "grad_norm": 6.904923439025879,
      "learning_rate": 1.7291066282420752e-05,
      "loss": 0.6348,
      "step": 120
    },
    {
      "epoch": 0.18,
      "grad_norm": 11.165042877197266,
      "learning_rate": 1.8011527377521615e-05,
      "loss": 0.5814,
      "step": 125
    },
    {
      "epoch": 0.19,
      "grad_norm": 7.367648124694824,
      "learning_rate": 1.873198847262248e-05,
      "loss": 0.5858,
      "step": 130
    },
    {
      "epoch": 0.19,
      "grad_norm": 7.115988254547119,
      "learning_rate": 1.9452449567723343e-05,
      "loss": 0.5316,
      "step": 135
    },
    {
      "epoch": 0.2,
      "grad_norm": 6.44365119934082,
      "learning_rate": 2.017291066282421e-05,
      "loss": 0.5049,
      "step": 140
    },
    {
      "epoch": 0.21,
      "grad_norm": 7.195384502410889,
      "learning_rate": 2.0893371757925074e-05,
      "loss": 0.5511,
      "step": 145
    },
    {
      "epoch": 0.22,
      "grad_norm": 17.6825008392334,
      "learning_rate": 2.1613832853025936e-05,
      "loss": 0.5124,
      "step": 150
    },
    {
      "epoch": 0.22,
      "grad_norm": 7.656848907470703,
      "learning_rate": 2.2334293948126802e-05,
      "loss": 0.4794,
      "step": 155
    },
    {
      "epoch": 0.23,
      "grad_norm": 7.221956729888916,
      "learning_rate": 2.3054755043227668e-05,
      "loss": 0.4773,
      "step": 160
    },
    {
      "epoch": 0.24,
      "grad_norm": 16.787612915039062,
      "learning_rate": 2.3775216138328533e-05,
      "loss": 0.4746,
      "step": 165
    },
    {
      "epoch": 0.24,
      "grad_norm": 7.123960494995117,
      "learning_rate": 2.4495677233429396e-05,
      "loss": 0.4734,
      "step": 170
    },
    {
      "epoch": 0.25,
      "grad_norm": 7.737701416015625,
      "learning_rate": 2.5216138328530258e-05,
      "loss": 0.4613,
      "step": 175
    },
    {
      "epoch": 0.26,
      "grad_norm": 7.011651515960693,
      "learning_rate": 2.5936599423631124e-05,
      "loss": 0.4886,
      "step": 180
    },
    {
      "epoch": 0.27,
      "grad_norm": 8.571374893188477,
      "learning_rate": 2.6657060518731993e-05,
      "loss": 0.4702,
      "step": 185
    },
    {
      "epoch": 0.27,
      "grad_norm": 7.675159454345703,
      "learning_rate": 2.737752161383285e-05,
      "loss": 0.5365,
      "step": 190
    },
    {
      "epoch": 0.28,
      "grad_norm": 5.9088239669799805,
      "learning_rate": 2.8097982708933717e-05,
      "loss": 0.3823,
      "step": 195
    },
    {
      "epoch": 0.29,
      "grad_norm": 5.840087413787842,
      "learning_rate": 2.8818443804034583e-05,
      "loss": 0.408,
      "step": 200
    },
    {
      "epoch": 0.3,
      "grad_norm": 6.880429267883301,
      "learning_rate": 2.953890489913545e-05,
      "loss": 0.4787,
      "step": 205
    },
    {
      "epoch": 0.3,
      "grad_norm": 5.355893611907959,
      "learning_rate": 3.025936599423631e-05,
      "loss": 0.377,
      "step": 210
    },
    {
      "epoch": 0.31,
      "grad_norm": 7.921416759490967,
      "learning_rate": 3.097982708933718e-05,
      "loss": 0.4504,
      "step": 215
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.329736232757568,
      "learning_rate": 3.170028818443804e-05,
      "loss": 0.4056,
      "step": 220
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.699007034301758,
      "learning_rate": 3.242074927953891e-05,
      "loss": 0.395,
      "step": 225
    },
    {
      "epoch": 0.33,
      "grad_norm": 10.29712963104248,
      "learning_rate": 3.314121037463977e-05,
      "loss": 0.433,
      "step": 230
    },
    {
      "epoch": 0.34,
      "grad_norm": 8.653733253479004,
      "learning_rate": 3.3861671469740636e-05,
      "loss": 0.3733,
      "step": 235
    },
    {
      "epoch": 0.35,
      "grad_norm": 4.476428508758545,
      "learning_rate": 3.4582132564841505e-05,
      "loss": 0.3758,
      "step": 240
    },
    {
      "epoch": 0.35,
      "grad_norm": 7.4768571853637695,
      "learning_rate": 3.530259365994236e-05,
      "loss": 0.4628,
      "step": 245
    },
    {
      "epoch": 0.36,
      "grad_norm": 7.058348655700684,
      "learning_rate": 3.602305475504323e-05,
      "loss": 0.3855,
      "step": 250
    },
    {
      "epoch": 0.37,
      "grad_norm": 7.238952159881592,
      "learning_rate": 3.674351585014409e-05,
      "loss": 0.389,
      "step": 255
    },
    {
      "epoch": 0.37,
      "grad_norm": 7.494441032409668,
      "learning_rate": 3.746397694524496e-05,
      "loss": 0.4285,
      "step": 260
    },
    {
      "epoch": 0.38,
      "grad_norm": 6.927433490753174,
      "learning_rate": 3.818443804034582e-05,
      "loss": 0.357,
      "step": 265
    },
    {
      "epoch": 0.39,
      "grad_norm": 8.478387832641602,
      "learning_rate": 3.8904899135446685e-05,
      "loss": 0.3612,
      "step": 270
    },
    {
      "epoch": 0.4,
      "grad_norm": 9.04246997833252,
      "learning_rate": 3.9625360230547554e-05,
      "loss": 0.3068,
      "step": 275
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.052452087402344,
      "learning_rate": 4.034582132564842e-05,
      "loss": 0.3388,
      "step": 280
    },
    {
      "epoch": 0.41,
      "grad_norm": 6.3666510581970215,
      "learning_rate": 4.106628242074928e-05,
      "loss": 0.398,
      "step": 285
    },
    {
      "epoch": 0.42,
      "grad_norm": 7.982662200927734,
      "learning_rate": 4.178674351585015e-05,
      "loss": 0.3796,
      "step": 290
    },
    {
      "epoch": 0.42,
      "grad_norm": 6.020977973937988,
      "learning_rate": 4.250720461095101e-05,
      "loss": 0.4266,
      "step": 295
    },
    {
      "epoch": 0.43,
      "grad_norm": 7.010791778564453,
      "learning_rate": 4.322766570605187e-05,
      "loss": 0.4219,
      "step": 300
    },
    {
      "epoch": 0.44,
      "grad_norm": 5.0191216468811035,
      "learning_rate": 4.394812680115274e-05,
      "loss": 0.3489,
      "step": 305
    },
    {
      "epoch": 0.45,
      "grad_norm": 5.907705307006836,
      "learning_rate": 4.4668587896253604e-05,
      "loss": 0.3821,
      "step": 310
    },
    {
      "epoch": 0.45,
      "grad_norm": 6.560094356536865,
      "learning_rate": 4.538904899135447e-05,
      "loss": 0.3874,
      "step": 315
    },
    {
      "epoch": 0.46,
      "grad_norm": 6.429476737976074,
      "learning_rate": 4.6109510086455335e-05,
      "loss": 0.4096,
      "step": 320
    },
    {
      "epoch": 0.47,
      "grad_norm": 7.065363883972168,
      "learning_rate": 4.68299711815562e-05,
      "loss": 0.3778,
      "step": 325
    },
    {
      "epoch": 0.47,
      "grad_norm": 7.916449069976807,
      "learning_rate": 4.7550432276657067e-05,
      "loss": 0.3491,
      "step": 330
    },
    {
      "epoch": 0.48,
      "grad_norm": 5.4434709548950195,
      "learning_rate": 4.827089337175792e-05,
      "loss": 0.3645,
      "step": 335
    },
    {
      "epoch": 0.49,
      "grad_norm": 6.34391975402832,
      "learning_rate": 4.899135446685879e-05,
      "loss": 0.3778,
      "step": 340
    },
    {
      "epoch": 0.5,
      "grad_norm": 6.070534706115723,
      "learning_rate": 4.971181556195966e-05,
      "loss": 0.4307,
      "step": 345
    },
    {
      "epoch": 0.5,
      "grad_norm": 8.251782417297363,
      "learning_rate": 4.995196926032661e-05,
      "loss": 0.3964,
      "step": 350
    },
    {
      "epoch": 0.51,
      "grad_norm": 5.293612957000732,
      "learning_rate": 4.9871918027537626e-05,
      "loss": 0.3379,
      "step": 355
    },
    {
      "epoch": 0.52,
      "grad_norm": 7.164644241333008,
      "learning_rate": 4.979186679474864e-05,
      "loss": 0.3969,
      "step": 360
    },
    {
      "epoch": 0.53,
      "grad_norm": 4.961303234100342,
      "learning_rate": 4.971181556195966e-05,
      "loss": 0.3697,
      "step": 365
    },
    {
      "epoch": 0.53,
      "grad_norm": 6.196359157562256,
      "learning_rate": 4.9631764329170674e-05,
      "loss": 0.3448,
      "step": 370
    },
    {
      "epoch": 0.54,
      "grad_norm": 5.836663722991943,
      "learning_rate": 4.955171309638169e-05,
      "loss": 0.3939,
      "step": 375
    },
    {
      "epoch": 0.55,
      "grad_norm": 5.845285892486572,
      "learning_rate": 4.94716618635927e-05,
      "loss": 0.356,
      "step": 380
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.937917947769165,
      "learning_rate": 4.9391610630803715e-05,
      "loss": 0.3033,
      "step": 385
    },
    {
      "epoch": 0.56,
      "grad_norm": 6.883370399475098,
      "learning_rate": 4.9311559398014736e-05,
      "loss": 0.415,
      "step": 390
    },
    {
      "epoch": 0.57,
      "grad_norm": 6.164604663848877,
      "learning_rate": 4.923150816522575e-05,
      "loss": 0.3794,
      "step": 395
    },
    {
      "epoch": 0.58,
      "grad_norm": 10.906937599182129,
      "learning_rate": 4.9151456932436764e-05,
      "loss": 0.3967,
      "step": 400
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.428271532058716,
      "learning_rate": 4.907140569964778e-05,
      "loss": 0.3312,
      "step": 405
    },
    {
      "epoch": 0.59,
      "grad_norm": 7.288811206817627,
      "learning_rate": 4.899135446685879e-05,
      "loss": 0.3681,
      "step": 410
    },
    {
      "epoch": 0.6,
      "grad_norm": 8.319820404052734,
      "learning_rate": 4.8911303234069805e-05,
      "loss": 0.3265,
      "step": 415
    },
    {
      "epoch": 0.6,
      "grad_norm": 6.9813232421875,
      "learning_rate": 4.883125200128082e-05,
      "loss": 0.3233,
      "step": 420
    },
    {
      "epoch": 0.61,
      "grad_norm": 5.874197959899902,
      "learning_rate": 4.875120076849184e-05,
      "loss": 0.3689,
      "step": 425
    },
    {
      "epoch": 0.62,
      "grad_norm": 5.609955787658691,
      "learning_rate": 4.867114953570285e-05,
      "loss": 0.4213,
      "step": 430
    },
    {
      "epoch": 0.63,
      "grad_norm": 5.877446174621582,
      "learning_rate": 4.859109830291387e-05,
      "loss": 0.419,
      "step": 435
    },
    {
      "epoch": 0.63,
      "grad_norm": 6.771636962890625,
      "learning_rate": 4.851104707012488e-05,
      "loss": 0.3529,
      "step": 440
    },
    {
      "epoch": 0.64,
      "grad_norm": 9.461392402648926,
      "learning_rate": 4.8430995837335894e-05,
      "loss": 0.3725,
      "step": 445
    },
    {
      "epoch": 0.65,
      "grad_norm": 5.563230991363525,
      "learning_rate": 4.835094460454691e-05,
      "loss": 0.3675,
      "step": 450
    },
    {
      "epoch": 0.65,
      "grad_norm": 6.4672465324401855,
      "learning_rate": 4.827089337175792e-05,
      "loss": 0.3156,
      "step": 455
    },
    {
      "epoch": 0.66,
      "grad_norm": 3.8499579429626465,
      "learning_rate": 4.819084213896894e-05,
      "loss": 0.322,
      "step": 460
    },
    {
      "epoch": 0.67,
      "grad_norm": 5.031641960144043,
      "learning_rate": 4.8110790906179956e-05,
      "loss": 0.3936,
      "step": 465
    },
    {
      "epoch": 0.68,
      "grad_norm": 5.684152603149414,
      "learning_rate": 4.803073967339097e-05,
      "loss": 0.3898,
      "step": 470
    },
    {
      "epoch": 0.68,
      "grad_norm": 5.913132190704346,
      "learning_rate": 4.7950688440601984e-05,
      "loss": 0.3331,
      "step": 475
    },
    {
      "epoch": 0.69,
      "grad_norm": 5.199942588806152,
      "learning_rate": 4.7870637207813005e-05,
      "loss": 0.3303,
      "step": 480
    },
    {
      "epoch": 0.7,
      "grad_norm": 3.992769956588745,
      "learning_rate": 4.779058597502402e-05,
      "loss": 0.3422,
      "step": 485
    },
    {
      "epoch": 0.71,
      "grad_norm": 6.158402919769287,
      "learning_rate": 4.771053474223503e-05,
      "loss": 0.3152,
      "step": 490
    },
    {
      "epoch": 0.71,
      "grad_norm": 4.361845016479492,
      "learning_rate": 4.763048350944605e-05,
      "loss": 0.3057,
      "step": 495
    },
    {
      "epoch": 0.72,
      "grad_norm": 4.663881301879883,
      "learning_rate": 4.7550432276657067e-05,
      "loss": 0.3461,
      "step": 500
    },
    {
      "epoch": 0.73,
      "grad_norm": 7.09819221496582,
      "learning_rate": 4.747038104386808e-05,
      "loss": 0.3675,
      "step": 505
    },
    {
      "epoch": 0.73,
      "grad_norm": 5.0237956047058105,
      "learning_rate": 4.7390329811079094e-05,
      "loss": 0.3274,
      "step": 510
    },
    {
      "epoch": 0.74,
      "grad_norm": 5.483020782470703,
      "learning_rate": 4.731027857829011e-05,
      "loss": 0.3055,
      "step": 515
    },
    {
      "epoch": 0.75,
      "grad_norm": 4.972677707672119,
      "learning_rate": 4.723022734550112e-05,
      "loss": 0.3236,
      "step": 520
    },
    {
      "epoch": 0.76,
      "grad_norm": 7.017973899841309,
      "learning_rate": 4.7150176112712136e-05,
      "loss": 0.3543,
      "step": 525
    },
    {
      "epoch": 0.76,
      "grad_norm": 8.219503402709961,
      "learning_rate": 4.7070124879923156e-05,
      "loss": 0.3791,
      "step": 530
    },
    {
      "epoch": 0.77,
      "grad_norm": 5.836394309997559,
      "learning_rate": 4.699007364713417e-05,
      "loss": 0.2882,
      "step": 535
    },
    {
      "epoch": 0.78,
      "grad_norm": 6.394532680511475,
      "learning_rate": 4.6910022414345184e-05,
      "loss": 0.3741,
      "step": 540
    },
    {
      "epoch": 0.78,
      "grad_norm": 5.4533843994140625,
      "learning_rate": 4.68299711815562e-05,
      "loss": 0.3852,
      "step": 545
    },
    {
      "epoch": 0.79,
      "grad_norm": 6.065195083618164,
      "learning_rate": 4.674991994876721e-05,
      "loss": 0.3589,
      "step": 550
    },
    {
      "epoch": 0.8,
      "grad_norm": 4.000141620635986,
      "learning_rate": 4.6669868715978225e-05,
      "loss": 0.2865,
      "step": 555
    },
    {
      "epoch": 0.81,
      "grad_norm": 6.05587100982666,
      "learning_rate": 4.658981748318924e-05,
      "loss": 0.316,
      "step": 560
    },
    {
      "epoch": 0.81,
      "grad_norm": 5.1732892990112305,
      "learning_rate": 4.650976625040026e-05,
      "loss": 0.2768,
      "step": 565
    },
    {
      "epoch": 0.82,
      "grad_norm": 4.745729446411133,
      "learning_rate": 4.642971501761127e-05,
      "loss": 0.2796,
      "step": 570
    },
    {
      "epoch": 0.83,
      "grad_norm": 4.964130878448486,
      "learning_rate": 4.634966378482229e-05,
      "loss": 0.3268,
      "step": 575
    },
    {
      "epoch": 0.83,
      "grad_norm": 5.333953857421875,
      "learning_rate": 4.62696125520333e-05,
      "loss": 0.321,
      "step": 580
    },
    {
      "epoch": 0.84,
      "grad_norm": 4.004300117492676,
      "learning_rate": 4.6189561319244315e-05,
      "loss": 0.3371,
      "step": 585
    },
    {
      "epoch": 0.85,
      "grad_norm": 6.5950751304626465,
      "learning_rate": 4.6109510086455335e-05,
      "loss": 0.3028,
      "step": 590
    },
    {
      "epoch": 0.86,
      "grad_norm": 4.516002655029297,
      "learning_rate": 4.602945885366635e-05,
      "loss": 0.3539,
      "step": 595
    },
    {
      "epoch": 0.86,
      "grad_norm": 5.180628776550293,
      "learning_rate": 4.594940762087736e-05,
      "loss": 0.35,
      "step": 600
    },
    {
      "epoch": 0.87,
      "grad_norm": 3.2567028999328613,
      "learning_rate": 4.586935638808838e-05,
      "loss": 0.323,
      "step": 605
    },
    {
      "epoch": 0.88,
      "grad_norm": 3.9456095695495605,
      "learning_rate": 4.57893051552994e-05,
      "loss": 0.3378,
      "step": 610
    },
    {
      "epoch": 0.89,
      "grad_norm": 3.6121273040771484,
      "learning_rate": 4.570925392251041e-05,
      "loss": 0.2565,
      "step": 615
    },
    {
      "epoch": 0.89,
      "grad_norm": 4.358009338378906,
      "learning_rate": 4.5629202689721425e-05,
      "loss": 0.3147,
      "step": 620
    },
    {
      "epoch": 0.9,
      "grad_norm": 7.531122207641602,
      "learning_rate": 4.554915145693244e-05,
      "loss": 0.3346,
      "step": 625
    },
    {
      "epoch": 0.91,
      "grad_norm": 5.810347557067871,
      "learning_rate": 4.546910022414345e-05,
      "loss": 0.3196,
      "step": 630
    },
    {
      "epoch": 0.91,
      "grad_norm": 6.805031776428223,
      "learning_rate": 4.538904899135447e-05,
      "loss": 0.2952,
      "step": 635
    },
    {
      "epoch": 0.92,
      "grad_norm": 4.857294082641602,
      "learning_rate": 4.530899775856549e-05,
      "loss": 0.315,
      "step": 640
    },
    {
      "epoch": 0.93,
      "grad_norm": 4.595619201660156,
      "learning_rate": 4.52289465257765e-05,
      "loss": 0.3231,
      "step": 645
    },
    {
      "epoch": 0.94,
      "grad_norm": 5.075206279754639,
      "learning_rate": 4.5148895292987514e-05,
      "loss": 0.3019,
      "step": 650
    },
    {
      "epoch": 0.94,
      "grad_norm": 4.71131706237793,
      "learning_rate": 4.506884406019853e-05,
      "loss": 0.3249,
      "step": 655
    },
    {
      "epoch": 0.95,
      "grad_norm": 5.032394886016846,
      "learning_rate": 4.498879282740954e-05,
      "loss": 0.2653,
      "step": 660
    },
    {
      "epoch": 0.96,
      "grad_norm": 6.4502997398376465,
      "learning_rate": 4.4908741594620556e-05,
      "loss": 0.309,
      "step": 665
    },
    {
      "epoch": 0.96,
      "grad_norm": 5.608312129974365,
      "learning_rate": 4.4828690361831576e-05,
      "loss": 0.2943,
      "step": 670
    },
    {
      "epoch": 0.97,
      "grad_norm": 5.454727649688721,
      "learning_rate": 4.474863912904259e-05,
      "loss": 0.3037,
      "step": 675
    },
    {
      "epoch": 0.98,
      "grad_norm": 4.60232400894165,
      "learning_rate": 4.4668587896253604e-05,
      "loss": 0.2739,
      "step": 680
    },
    {
      "epoch": 0.99,
      "grad_norm": 5.319153308868408,
      "learning_rate": 4.458853666346462e-05,
      "loss": 0.2811,
      "step": 685
    },
    {
      "epoch": 0.99,
      "grad_norm": 4.6785054206848145,
      "learning_rate": 4.450848543067563e-05,
      "loss": 0.2929,
      "step": 690
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9385627530364372,
      "eval_loss": 0.16880780458450317,
      "eval_runtime": 32.041,
      "eval_samples_per_second": 308.355,
      "eval_steps_per_second": 9.644,
      "step": 694
    },
    {
      "epoch": 1.0,
      "grad_norm": 6.902053356170654,
      "learning_rate": 4.4428434197886645e-05,
      "loss": 0.3839,
      "step": 695
    },
    {
      "epoch": 1.01,
      "grad_norm": 4.171269416809082,
      "learning_rate": 4.434838296509766e-05,
      "loss": 0.3153,
      "step": 700
    },
    {
      "epoch": 1.01,
      "grad_norm": 3.1970090866088867,
      "learning_rate": 4.426833173230868e-05,
      "loss": 0.2399,
      "step": 705
    },
    {
      "epoch": 1.02,
      "grad_norm": 5.519264221191406,
      "learning_rate": 4.4188280499519693e-05,
      "loss": 0.2775,
      "step": 710
    },
    {
      "epoch": 1.03,
      "grad_norm": 4.797208786010742,
      "learning_rate": 4.4108229266730714e-05,
      "loss": 0.2805,
      "step": 715
    },
    {
      "epoch": 1.04,
      "grad_norm": 9.114941596984863,
      "learning_rate": 4.402817803394173e-05,
      "loss": 0.2846,
      "step": 720
    },
    {
      "epoch": 1.04,
      "grad_norm": 4.987404823303223,
      "learning_rate": 4.394812680115274e-05,
      "loss": 0.2849,
      "step": 725
    },
    {
      "epoch": 1.05,
      "grad_norm": 6.2959136962890625,
      "learning_rate": 4.3868075568363755e-05,
      "loss": 0.3129,
      "step": 730
    },
    {
      "epoch": 1.06,
      "grad_norm": 4.492276668548584,
      "learning_rate": 4.378802433557477e-05,
      "loss": 0.2384,
      "step": 735
    },
    {
      "epoch": 1.07,
      "grad_norm": 3.5424952507019043,
      "learning_rate": 4.370797310278579e-05,
      "loss": 0.2183,
      "step": 740
    },
    {
      "epoch": 1.07,
      "grad_norm": 7.594015598297119,
      "learning_rate": 4.3627921869996804e-05,
      "loss": 0.2657,
      "step": 745
    },
    {
      "epoch": 1.08,
      "grad_norm": 6.9036431312561035,
      "learning_rate": 4.354787063720782e-05,
      "loss": 0.2678,
      "step": 750
    },
    {
      "epoch": 1.09,
      "grad_norm": 7.780063629150391,
      "learning_rate": 4.346781940441883e-05,
      "loss": 0.3054,
      "step": 755
    },
    {
      "epoch": 1.09,
      "grad_norm": 5.562774181365967,
      "learning_rate": 4.3387768171629845e-05,
      "loss": 0.272,
      "step": 760
    },
    {
      "epoch": 1.1,
      "grad_norm": 7.2162861824035645,
      "learning_rate": 4.330771693884086e-05,
      "loss": 0.2678,
      "step": 765
    },
    {
      "epoch": 1.11,
      "grad_norm": 5.875248432159424,
      "learning_rate": 4.322766570605187e-05,
      "loss": 0.2691,
      "step": 770
    },
    {
      "epoch": 1.12,
      "grad_norm": 4.324618339538574,
      "learning_rate": 4.314761447326289e-05,
      "loss": 0.3025,
      "step": 775
    },
    {
      "epoch": 1.12,
      "grad_norm": 4.129276275634766,
      "learning_rate": 4.306756324047391e-05,
      "loss": 0.2596,
      "step": 780
    },
    {
      "epoch": 1.13,
      "grad_norm": 3.086761713027954,
      "learning_rate": 4.298751200768492e-05,
      "loss": 0.2528,
      "step": 785
    },
    {
      "epoch": 1.14,
      "grad_norm": 4.340246200561523,
      "learning_rate": 4.2907460774895934e-05,
      "loss": 0.223,
      "step": 790
    },
    {
      "epoch": 1.14,
      "grad_norm": 3.6360461711883545,
      "learning_rate": 4.282740954210695e-05,
      "loss": 0.2547,
      "step": 795
    },
    {
      "epoch": 1.15,
      "grad_norm": 4.182173252105713,
      "learning_rate": 4.274735830931796e-05,
      "loss": 0.287,
      "step": 800
    },
    {
      "epoch": 1.16,
      "grad_norm": 4.418725490570068,
      "learning_rate": 4.2667307076528976e-05,
      "loss": 0.2888,
      "step": 805
    },
    {
      "epoch": 1.17,
      "grad_norm": 4.325172424316406,
      "learning_rate": 4.2587255843739996e-05,
      "loss": 0.2634,
      "step": 810
    },
    {
      "epoch": 1.17,
      "grad_norm": 5.551906585693359,
      "learning_rate": 4.250720461095101e-05,
      "loss": 0.2651,
      "step": 815
    },
    {
      "epoch": 1.18,
      "grad_norm": 3.631472110748291,
      "learning_rate": 4.2427153378162024e-05,
      "loss": 0.2745,
      "step": 820
    },
    {
      "epoch": 1.19,
      "grad_norm": 3.5533196926116943,
      "learning_rate": 4.234710214537304e-05,
      "loss": 0.2936,
      "step": 825
    },
    {
      "epoch": 1.19,
      "grad_norm": 4.504055023193359,
      "learning_rate": 4.226705091258406e-05,
      "loss": 0.3025,
      "step": 830
    },
    {
      "epoch": 1.2,
      "grad_norm": 4.739752292633057,
      "learning_rate": 4.218699967979507e-05,
      "loss": 0.303,
      "step": 835
    },
    {
      "epoch": 1.21,
      "grad_norm": 5.039779186248779,
      "learning_rate": 4.2106948447006086e-05,
      "loss": 0.2663,
      "step": 840
    },
    {
      "epoch": 1.22,
      "grad_norm": 3.7070090770721436,
      "learning_rate": 4.2026897214217107e-05,
      "loss": 0.2663,
      "step": 845
    },
    {
      "epoch": 1.22,
      "grad_norm": 4.351013660430908,
      "learning_rate": 4.194684598142812e-05,
      "loss": 0.2454,
      "step": 850
    },
    {
      "epoch": 1.23,
      "grad_norm": 5.0032830238342285,
      "learning_rate": 4.1866794748639134e-05,
      "loss": 0.245,
      "step": 855
    },
    {
      "epoch": 1.24,
      "grad_norm": 3.203274965286255,
      "learning_rate": 4.178674351585015e-05,
      "loss": 0.3036,
      "step": 860
    },
    {
      "epoch": 1.25,
      "grad_norm": 4.47341775894165,
      "learning_rate": 4.170669228306116e-05,
      "loss": 0.3336,
      "step": 865
    },
    {
      "epoch": 1.25,
      "grad_norm": 4.188334941864014,
      "learning_rate": 4.1626641050272176e-05,
      "loss": 0.2529,
      "step": 870
    },
    {
      "epoch": 1.26,
      "grad_norm": 3.3264882564544678,
      "learning_rate": 4.154658981748319e-05,
      "loss": 0.2258,
      "step": 875
    },
    {
      "epoch": 1.27,
      "grad_norm": 4.058962821960449,
      "learning_rate": 4.146653858469421e-05,
      "loss": 0.3129,
      "step": 880
    },
    {
      "epoch": 1.27,
      "grad_norm": 4.271402359008789,
      "learning_rate": 4.1386487351905224e-05,
      "loss": 0.2605,
      "step": 885
    },
    {
      "epoch": 1.28,
      "grad_norm": 8.134669303894043,
      "learning_rate": 4.130643611911624e-05,
      "loss": 0.3072,
      "step": 890
    },
    {
      "epoch": 1.29,
      "grad_norm": 5.065728664398193,
      "learning_rate": 4.122638488632725e-05,
      "loss": 0.2557,
      "step": 895
    },
    {
      "epoch": 1.3,
      "grad_norm": 4.518153190612793,
      "learning_rate": 4.1146333653538265e-05,
      "loss": 0.2591,
      "step": 900
    },
    {
      "epoch": 1.3,
      "grad_norm": 6.0956926345825195,
      "learning_rate": 4.106628242074928e-05,
      "loss": 0.3001,
      "step": 905
    },
    {
      "epoch": 1.31,
      "grad_norm": 4.715207099914551,
      "learning_rate": 4.098623118796029e-05,
      "loss": 0.2882,
      "step": 910
    },
    {
      "epoch": 1.32,
      "grad_norm": 6.3927435874938965,
      "learning_rate": 4.090617995517131e-05,
      "loss": 0.2733,
      "step": 915
    },
    {
      "epoch": 1.32,
      "grad_norm": 3.886277198791504,
      "learning_rate": 4.082612872238233e-05,
      "loss": 0.2558,
      "step": 920
    },
    {
      "epoch": 1.33,
      "grad_norm": 6.690213203430176,
      "learning_rate": 4.074607748959334e-05,
      "loss": 0.2411,
      "step": 925
    },
    {
      "epoch": 1.34,
      "grad_norm": 5.04226016998291,
      "learning_rate": 4.0666026256804355e-05,
      "loss": 0.2814,
      "step": 930
    },
    {
      "epoch": 1.35,
      "grad_norm": 6.361902236938477,
      "learning_rate": 4.058597502401537e-05,
      "loss": 0.1918,
      "step": 935
    },
    {
      "epoch": 1.35,
      "grad_norm": 6.6365227699279785,
      "learning_rate": 4.050592379122638e-05,
      "loss": 0.2714,
      "step": 940
    },
    {
      "epoch": 1.36,
      "grad_norm": 4.794340133666992,
      "learning_rate": 4.04258725584374e-05,
      "loss": 0.269,
      "step": 945
    },
    {
      "epoch": 1.37,
      "grad_norm": 5.207016468048096,
      "learning_rate": 4.034582132564842e-05,
      "loss": 0.2955,
      "step": 950
    },
    {
      "epoch": 1.37,
      "grad_norm": 5.347695350646973,
      "learning_rate": 4.026577009285944e-05,
      "loss": 0.2341,
      "step": 955
    },
    {
      "epoch": 1.38,
      "grad_norm": 7.788352966308594,
      "learning_rate": 4.018571886007045e-05,
      "loss": 0.2228,
      "step": 960
    },
    {
      "epoch": 1.39,
      "grad_norm": 4.078495025634766,
      "learning_rate": 4.0105667627281465e-05,
      "loss": 0.2408,
      "step": 965
    },
    {
      "epoch": 1.4,
      "grad_norm": 5.237365245819092,
      "learning_rate": 4.002561639449248e-05,
      "loss": 0.2891,
      "step": 970
    },
    {
      "epoch": 1.4,
      "grad_norm": 5.711833953857422,
      "learning_rate": 3.994556516170349e-05,
      "loss": 0.323,
      "step": 975
    },
    {
      "epoch": 1.41,
      "grad_norm": 3.250711679458618,
      "learning_rate": 3.9865513928914506e-05,
      "loss": 0.2945,
      "step": 980
    },
    {
      "epoch": 1.42,
      "grad_norm": 6.933974266052246,
      "learning_rate": 3.978546269612553e-05,
      "loss": 0.2507,
      "step": 985
    },
    {
      "epoch": 1.42,
      "grad_norm": 4.515052795410156,
      "learning_rate": 3.970541146333654e-05,
      "loss": 0.265,
      "step": 990
    },
    {
      "epoch": 1.43,
      "grad_norm": 4.89296293258667,
      "learning_rate": 3.9625360230547554e-05,
      "loss": 0.2868,
      "step": 995
    },
    {
      "epoch": 1.44,
      "grad_norm": 4.629034996032715,
      "learning_rate": 3.954530899775857e-05,
      "loss": 0.2773,
      "step": 1000
    },
    {
      "epoch": 1.45,
      "grad_norm": 3.881559371948242,
      "learning_rate": 3.946525776496958e-05,
      "loss": 0.3336,
      "step": 1005
    },
    {
      "epoch": 1.45,
      "grad_norm": 3.4768316745758057,
      "learning_rate": 3.9385206532180596e-05,
      "loss": 0.2212,
      "step": 1010
    },
    {
      "epoch": 1.46,
      "grad_norm": 5.582344055175781,
      "learning_rate": 3.930515529939161e-05,
      "loss": 0.3031,
      "step": 1015
    },
    {
      "epoch": 1.47,
      "grad_norm": 3.73008131980896,
      "learning_rate": 3.922510406660262e-05,
      "loss": 0.2557,
      "step": 1020
    },
    {
      "epoch": 1.48,
      "grad_norm": 5.319180011749268,
      "learning_rate": 3.9145052833813644e-05,
      "loss": 0.2679,
      "step": 1025
    },
    {
      "epoch": 1.48,
      "grad_norm": 6.709672451019287,
      "learning_rate": 3.906500160102466e-05,
      "loss": 0.2471,
      "step": 1030
    },
    {
      "epoch": 1.49,
      "grad_norm": 5.294819355010986,
      "learning_rate": 3.898495036823567e-05,
      "loss": 0.2661,
      "step": 1035
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.2995288372039795,
      "learning_rate": 3.8904899135446685e-05,
      "loss": 0.2789,
      "step": 1040
    },
    {
      "epoch": 1.5,
      "grad_norm": 4.34086799621582,
      "learning_rate": 3.88248479026577e-05,
      "loss": 0.2789,
      "step": 1045
    },
    {
      "epoch": 1.51,
      "grad_norm": 5.209534168243408,
      "learning_rate": 3.874479666986871e-05,
      "loss": 0.3002,
      "step": 1050
    },
    {
      "epoch": 1.52,
      "grad_norm": 5.175271034240723,
      "learning_rate": 3.8664745437079733e-05,
      "loss": 0.2631,
      "step": 1055
    },
    {
      "epoch": 1.53,
      "grad_norm": 4.909916400909424,
      "learning_rate": 3.858469420429075e-05,
      "loss": 0.25,
      "step": 1060
    },
    {
      "epoch": 1.53,
      "grad_norm": 3.8786613941192627,
      "learning_rate": 3.850464297150176e-05,
      "loss": 0.226,
      "step": 1065
    },
    {
      "epoch": 1.54,
      "grad_norm": 4.349425315856934,
      "learning_rate": 3.842459173871278e-05,
      "loss": 0.2635,
      "step": 1070
    },
    {
      "epoch": 1.55,
      "grad_norm": 5.107605934143066,
      "learning_rate": 3.8344540505923795e-05,
      "loss": 0.2536,
      "step": 1075
    },
    {
      "epoch": 1.55,
      "grad_norm": 5.436495780944824,
      "learning_rate": 3.826448927313481e-05,
      "loss": 0.2911,
      "step": 1080
    },
    {
      "epoch": 1.56,
      "grad_norm": 5.1116156578063965,
      "learning_rate": 3.818443804034582e-05,
      "loss": 0.3064,
      "step": 1085
    },
    {
      "epoch": 1.57,
      "grad_norm": 4.1365742683410645,
      "learning_rate": 3.810438680755684e-05,
      "loss": 0.2003,
      "step": 1090
    },
    {
      "epoch": 1.58,
      "grad_norm": 5.43222188949585,
      "learning_rate": 3.802433557476786e-05,
      "loss": 0.291,
      "step": 1095
    },
    {
      "epoch": 1.58,
      "grad_norm": 6.062341690063477,
      "learning_rate": 3.794428434197887e-05,
      "loss": 0.2325,
      "step": 1100
    },
    {
      "epoch": 1.59,
      "grad_norm": 4.5507097244262695,
      "learning_rate": 3.7864233109189885e-05,
      "loss": 0.2493,
      "step": 1105
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.3975865840911865,
      "learning_rate": 3.77841818764009e-05,
      "loss": 0.2349,
      "step": 1110
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.967979907989502,
      "learning_rate": 3.770413064361191e-05,
      "loss": 0.2364,
      "step": 1115
    },
    {
      "epoch": 1.61,
      "grad_norm": 4.541342735290527,
      "learning_rate": 3.7624079410822926e-05,
      "loss": 0.2285,
      "step": 1120
    },
    {
      "epoch": 1.62,
      "grad_norm": 4.848491668701172,
      "learning_rate": 3.754402817803394e-05,
      "loss": 0.235,
      "step": 1125
    },
    {
      "epoch": 1.63,
      "grad_norm": 5.879725933074951,
      "learning_rate": 3.746397694524496e-05,
      "loss": 0.2759,
      "step": 1130
    },
    {
      "epoch": 1.63,
      "grad_norm": 6.01210880279541,
      "learning_rate": 3.7383925712455975e-05,
      "loss": 0.3345,
      "step": 1135
    },
    {
      "epoch": 1.64,
      "grad_norm": 4.760444641113281,
      "learning_rate": 3.730387447966699e-05,
      "loss": 0.2708,
      "step": 1140
    },
    {
      "epoch": 1.65,
      "grad_norm": 4.630128860473633,
      "learning_rate": 3.7223823246878e-05,
      "loss": 0.3049,
      "step": 1145
    },
    {
      "epoch": 1.66,
      "grad_norm": 4.3284101486206055,
      "learning_rate": 3.7143772014089016e-05,
      "loss": 0.2822,
      "step": 1150
    },
    {
      "epoch": 1.66,
      "grad_norm": 6.679904937744141,
      "learning_rate": 3.706372078130003e-05,
      "loss": 0.2764,
      "step": 1155
    },
    {
      "epoch": 1.67,
      "grad_norm": 5.192065238952637,
      "learning_rate": 3.6983669548511043e-05,
      "loss": 0.2479,
      "step": 1160
    },
    {
      "epoch": 1.68,
      "grad_norm": 4.901111125946045,
      "learning_rate": 3.6903618315722064e-05,
      "loss": 0.2849,
      "step": 1165
    },
    {
      "epoch": 1.68,
      "grad_norm": 6.2184977531433105,
      "learning_rate": 3.682356708293308e-05,
      "loss": 0.2667,
      "step": 1170
    },
    {
      "epoch": 1.69,
      "grad_norm": 5.900247573852539,
      "learning_rate": 3.674351585014409e-05,
      "loss": 0.2992,
      "step": 1175
    },
    {
      "epoch": 1.7,
      "grad_norm": 3.7004477977752686,
      "learning_rate": 3.666346461735511e-05,
      "loss": 0.2791,
      "step": 1180
    },
    {
      "epoch": 1.71,
      "grad_norm": 4.646676063537598,
      "learning_rate": 3.6583413384566126e-05,
      "loss": 0.2525,
      "step": 1185
    },
    {
      "epoch": 1.71,
      "grad_norm": 4.426496982574463,
      "learning_rate": 3.650336215177714e-05,
      "loss": 0.2624,
      "step": 1190
    },
    {
      "epoch": 1.72,
      "grad_norm": 4.333110809326172,
      "learning_rate": 3.6423310918988154e-05,
      "loss": 0.2777,
      "step": 1195
    },
    {
      "epoch": 1.73,
      "grad_norm": 3.7483744621276855,
      "learning_rate": 3.6343259686199174e-05,
      "loss": 0.2897,
      "step": 1200
    },
    {
      "epoch": 1.73,
      "grad_norm": 5.556215286254883,
      "learning_rate": 3.626320845341019e-05,
      "loss": 0.3432,
      "step": 1205
    },
    {
      "epoch": 1.74,
      "grad_norm": 4.707242965698242,
      "learning_rate": 3.61831572206212e-05,
      "loss": 0.2439,
      "step": 1210
    },
    {
      "epoch": 1.75,
      "grad_norm": 4.767390251159668,
      "learning_rate": 3.6103105987832216e-05,
      "loss": 0.2744,
      "step": 1215
    },
    {
      "epoch": 1.76,
      "grad_norm": 4.1662492752075195,
      "learning_rate": 3.602305475504323e-05,
      "loss": 0.267,
      "step": 1220
    },
    {
      "epoch": 1.76,
      "grad_norm": 4.437891006469727,
      "learning_rate": 3.594300352225424e-05,
      "loss": 0.2354,
      "step": 1225
    },
    {
      "epoch": 1.77,
      "grad_norm": 5.63749361038208,
      "learning_rate": 3.586295228946526e-05,
      "loss": 0.2557,
      "step": 1230
    },
    {
      "epoch": 1.78,
      "grad_norm": 6.398256778717041,
      "learning_rate": 3.578290105667628e-05,
      "loss": 0.2697,
      "step": 1235
    },
    {
      "epoch": 1.78,
      "grad_norm": 4.15376091003418,
      "learning_rate": 3.570284982388729e-05,
      "loss": 0.2672,
      "step": 1240
    },
    {
      "epoch": 1.79,
      "grad_norm": 8.952369689941406,
      "learning_rate": 3.5622798591098305e-05,
      "loss": 0.2992,
      "step": 1245
    },
    {
      "epoch": 1.8,
      "grad_norm": 7.161625385284424,
      "learning_rate": 3.554274735830932e-05,
      "loss": 0.3067,
      "step": 1250
    },
    {
      "epoch": 1.81,
      "grad_norm": 3.848027467727661,
      "learning_rate": 3.546269612552033e-05,
      "loss": 0.3165,
      "step": 1255
    },
    {
      "epoch": 1.81,
      "grad_norm": 5.625514507293701,
      "learning_rate": 3.5382644892731347e-05,
      "loss": 0.2792,
      "step": 1260
    },
    {
      "epoch": 1.82,
      "grad_norm": 3.829505681991577,
      "learning_rate": 3.530259365994236e-05,
      "loss": 0.258,
      "step": 1265
    },
    {
      "epoch": 1.83,
      "grad_norm": 4.038649559020996,
      "learning_rate": 3.522254242715338e-05,
      "loss": 0.2668,
      "step": 1270
    },
    {
      "epoch": 1.84,
      "grad_norm": 3.746533155441284,
      "learning_rate": 3.5142491194364395e-05,
      "loss": 0.2571,
      "step": 1275
    },
    {
      "epoch": 1.84,
      "grad_norm": 3.9205687046051025,
      "learning_rate": 3.506243996157541e-05,
      "loss": 0.2148,
      "step": 1280
    },
    {
      "epoch": 1.85,
      "grad_norm": 5.464355945587158,
      "learning_rate": 3.498238872878642e-05,
      "loss": 0.2707,
      "step": 1285
    },
    {
      "epoch": 1.86,
      "grad_norm": 4.321130752563477,
      "learning_rate": 3.4902337495997436e-05,
      "loss": 0.2434,
      "step": 1290
    },
    {
      "epoch": 1.86,
      "grad_norm": 6.3836588859558105,
      "learning_rate": 3.482228626320846e-05,
      "loss": 0.2601,
      "step": 1295
    },
    {
      "epoch": 1.87,
      "grad_norm": 2.9065053462982178,
      "learning_rate": 3.474223503041947e-05,
      "loss": 0.2033,
      "step": 1300
    },
    {
      "epoch": 1.88,
      "grad_norm": 4.280132293701172,
      "learning_rate": 3.4662183797630484e-05,
      "loss": 0.2708,
      "step": 1305
    },
    {
      "epoch": 1.89,
      "grad_norm": 5.5674262046813965,
      "learning_rate": 3.4582132564841505e-05,
      "loss": 0.2899,
      "step": 1310
    },
    {
      "epoch": 1.89,
      "grad_norm": 4.071995735168457,
      "learning_rate": 3.450208133205252e-05,
      "loss": 0.2714,
      "step": 1315
    },
    {
      "epoch": 1.9,
      "grad_norm": 6.83046817779541,
      "learning_rate": 3.442203009926353e-05,
      "loss": 0.2563,
      "step": 1320
    },
    {
      "epoch": 1.91,
      "grad_norm": 4.866962432861328,
      "learning_rate": 3.4341978866474546e-05,
      "loss": 0.2898,
      "step": 1325
    },
    {
      "epoch": 1.91,
      "grad_norm": 6.10991096496582,
      "learning_rate": 3.426192763368556e-05,
      "loss": 0.2927,
      "step": 1330
    },
    {
      "epoch": 1.92,
      "grad_norm": 8.084212303161621,
      "learning_rate": 3.4181876400896574e-05,
      "loss": 0.2668,
      "step": 1335
    },
    {
      "epoch": 1.93,
      "grad_norm": 2.702385902404785,
      "learning_rate": 3.4101825168107594e-05,
      "loss": 0.2617,
      "step": 1340
    },
    {
      "epoch": 1.94,
      "grad_norm": 5.180947303771973,
      "learning_rate": 3.402177393531861e-05,
      "loss": 0.2411,
      "step": 1345
    },
    {
      "epoch": 1.94,
      "grad_norm": 3.0766685009002686,
      "learning_rate": 3.394172270252962e-05,
      "loss": 0.2723,
      "step": 1350
    },
    {
      "epoch": 1.95,
      "grad_norm": 3.833108901977539,
      "learning_rate": 3.3861671469740636e-05,
      "loss": 0.2237,
      "step": 1355
    },
    {
      "epoch": 1.96,
      "grad_norm": 4.505425930023193,
      "learning_rate": 3.378162023695165e-05,
      "loss": 0.2685,
      "step": 1360
    },
    {
      "epoch": 1.96,
      "grad_norm": 3.9498701095581055,
      "learning_rate": 3.370156900416266e-05,
      "loss": 0.2637,
      "step": 1365
    },
    {
      "epoch": 1.97,
      "grad_norm": 6.345920562744141,
      "learning_rate": 3.362151777137368e-05,
      "loss": 0.2745,
      "step": 1370
    },
    {
      "epoch": 1.98,
      "grad_norm": 4.702010154724121,
      "learning_rate": 3.35414665385847e-05,
      "loss": 0.2837,
      "step": 1375
    },
    {
      "epoch": 1.99,
      "grad_norm": 4.943043231964111,
      "learning_rate": 3.346141530579571e-05,
      "loss": 0.2525,
      "step": 1380
    },
    {
      "epoch": 1.99,
      "grad_norm": 3.8749611377716064,
      "learning_rate": 3.3381364073006725e-05,
      "loss": 0.2499,
      "step": 1385
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9425101214574899,
      "eval_loss": 0.1516382098197937,
      "eval_runtime": 31.6984,
      "eval_samples_per_second": 311.687,
      "eval_steps_per_second": 9.748,
      "step": 1389
    },
    {
      "epoch": 2.0,
      "grad_norm": 6.219438076019287,
      "learning_rate": 3.330131284021774e-05,
      "loss": 0.3071,
      "step": 1390
    },
    {
      "epoch": 2.01,
      "grad_norm": 4.6552629470825195,
      "learning_rate": 3.322126160742875e-05,
      "loss": 0.2364,
      "step": 1395
    },
    {
      "epoch": 2.02,
      "grad_norm": 3.997241497039795,
      "learning_rate": 3.314121037463977e-05,
      "loss": 0.249,
      "step": 1400
    },
    {
      "epoch": 2.02,
      "grad_norm": 3.6796419620513916,
      "learning_rate": 3.306115914185078e-05,
      "loss": 0.2261,
      "step": 1405
    },
    {
      "epoch": 2.03,
      "grad_norm": 3.0016541481018066,
      "learning_rate": 3.29811079090618e-05,
      "loss": 0.2127,
      "step": 1410
    },
    {
      "epoch": 2.04,
      "grad_norm": 4.60055685043335,
      "learning_rate": 3.2901056676272815e-05,
      "loss": 0.2207,
      "step": 1415
    },
    {
      "epoch": 2.04,
      "grad_norm": 6.432025909423828,
      "learning_rate": 3.2821005443483835e-05,
      "loss": 0.2088,
      "step": 1420
    },
    {
      "epoch": 2.05,
      "grad_norm": 4.049763202667236,
      "learning_rate": 3.274095421069485e-05,
      "loss": 0.2193,
      "step": 1425
    },
    {
      "epoch": 2.06,
      "grad_norm": 4.77670955657959,
      "learning_rate": 3.266090297790586e-05,
      "loss": 0.2666,
      "step": 1430
    },
    {
      "epoch": 2.07,
      "grad_norm": 3.876225709915161,
      "learning_rate": 3.258085174511688e-05,
      "loss": 0.222,
      "step": 1435
    },
    {
      "epoch": 2.07,
      "grad_norm": 2.917393207550049,
      "learning_rate": 3.250080051232789e-05,
      "loss": 0.2481,
      "step": 1440
    },
    {
      "epoch": 2.08,
      "grad_norm": 3.858349084854126,
      "learning_rate": 3.242074927953891e-05,
      "loss": 0.2929,
      "step": 1445
    },
    {
      "epoch": 2.09,
      "grad_norm": 4.08052921295166,
      "learning_rate": 3.2340698046749925e-05,
      "loss": 0.2081,
      "step": 1450
    },
    {
      "epoch": 2.09,
      "grad_norm": 3.8843398094177246,
      "learning_rate": 3.226064681396094e-05,
      "loss": 0.1917,
      "step": 1455
    },
    {
      "epoch": 2.1,
      "grad_norm": 4.356058597564697,
      "learning_rate": 3.218059558117195e-05,
      "loss": 0.2211,
      "step": 1460
    },
    {
      "epoch": 2.11,
      "grad_norm": 5.629312038421631,
      "learning_rate": 3.2100544348382966e-05,
      "loss": 0.2704,
      "step": 1465
    },
    {
      "epoch": 2.12,
      "grad_norm": 3.8312325477600098,
      "learning_rate": 3.202049311559398e-05,
      "loss": 0.2418,
      "step": 1470
    },
    {
      "epoch": 2.12,
      "grad_norm": 3.1079790592193604,
      "learning_rate": 3.1940441882804994e-05,
      "loss": 0.1948,
      "step": 1475
    },
    {
      "epoch": 2.13,
      "grad_norm": 4.682496547698975,
      "learning_rate": 3.1860390650016015e-05,
      "loss": 0.2023,
      "step": 1480
    },
    {
      "epoch": 2.14,
      "grad_norm": 4.4082489013671875,
      "learning_rate": 3.178033941722703e-05,
      "loss": 0.2346,
      "step": 1485
    },
    {
      "epoch": 2.14,
      "grad_norm": 5.721102714538574,
      "learning_rate": 3.170028818443804e-05,
      "loss": 0.2294,
      "step": 1490
    },
    {
      "epoch": 2.15,
      "grad_norm": 3.2310311794281006,
      "learning_rate": 3.1620236951649056e-05,
      "loss": 0.2074,
      "step": 1495
    },
    {
      "epoch": 2.16,
      "grad_norm": 5.734870433807373,
      "learning_rate": 3.154018571886007e-05,
      "loss": 0.2244,
      "step": 1500
    },
    {
      "epoch": 2.17,
      "grad_norm": 4.256961822509766,
      "learning_rate": 3.1460134486071084e-05,
      "loss": 0.2208,
      "step": 1505
    },
    {
      "epoch": 2.17,
      "grad_norm": 6.9470696449279785,
      "learning_rate": 3.13800832532821e-05,
      "loss": 0.2736,
      "step": 1510
    },
    {
      "epoch": 2.18,
      "grad_norm": 2.8514010906219482,
      "learning_rate": 3.130003202049312e-05,
      "loss": 0.1989,
      "step": 1515
    },
    {
      "epoch": 2.19,
      "grad_norm": 4.2279744148254395,
      "learning_rate": 3.121998078770413e-05,
      "loss": 0.2753,
      "step": 1520
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.349268674850464,
      "learning_rate": 3.1139929554915145e-05,
      "loss": 0.181,
      "step": 1525
    },
    {
      "epoch": 2.2,
      "grad_norm": 4.550454616546631,
      "learning_rate": 3.105987832212616e-05,
      "loss": 0.2536,
      "step": 1530
    },
    {
      "epoch": 2.21,
      "grad_norm": 3.7860782146453857,
      "learning_rate": 3.097982708933718e-05,
      "loss": 0.2331,
      "step": 1535
    },
    {
      "epoch": 2.22,
      "grad_norm": 4.5719170570373535,
      "learning_rate": 3.0899775856548194e-05,
      "loss": 0.2408,
      "step": 1540
    },
    {
      "epoch": 2.22,
      "grad_norm": 4.448012828826904,
      "learning_rate": 3.081972462375921e-05,
      "loss": 0.2219,
      "step": 1545
    },
    {
      "epoch": 2.23,
      "grad_norm": 3.7972702980041504,
      "learning_rate": 3.073967339097023e-05,
      "loss": 0.2691,
      "step": 1550
    },
    {
      "epoch": 2.24,
      "grad_norm": 4.268452167510986,
      "learning_rate": 3.065962215818124e-05,
      "loss": 0.2215,
      "step": 1555
    },
    {
      "epoch": 2.25,
      "grad_norm": 4.145329475402832,
      "learning_rate": 3.0579570925392256e-05,
      "loss": 0.2488,
      "step": 1560
    },
    {
      "epoch": 2.25,
      "grad_norm": 5.501221656799316,
      "learning_rate": 3.049951969260327e-05,
      "loss": 0.2441,
      "step": 1565
    },
    {
      "epoch": 2.26,
      "grad_norm": 4.3408203125,
      "learning_rate": 3.0419468459814283e-05,
      "loss": 0.2308,
      "step": 1570
    },
    {
      "epoch": 2.27,
      "grad_norm": 4.104162216186523,
      "learning_rate": 3.0339417227025297e-05,
      "loss": 0.2538,
      "step": 1575
    },
    {
      "epoch": 2.27,
      "grad_norm": 5.441348075866699,
      "learning_rate": 3.025936599423631e-05,
      "loss": 0.2742,
      "step": 1580
    },
    {
      "epoch": 2.28,
      "grad_norm": 3.3526971340179443,
      "learning_rate": 3.017931476144733e-05,
      "loss": 0.1934,
      "step": 1585
    },
    {
      "epoch": 2.29,
      "grad_norm": 3.5918030738830566,
      "learning_rate": 3.0099263528658345e-05,
      "loss": 0.256,
      "step": 1590
    },
    {
      "epoch": 2.3,
      "grad_norm": 3.9758517742156982,
      "learning_rate": 3.001921229586936e-05,
      "loss": 0.2096,
      "step": 1595
    },
    {
      "epoch": 2.3,
      "grad_norm": 2.7759931087493896,
      "learning_rate": 2.9939161063080373e-05,
      "loss": 0.2545,
      "step": 1600
    },
    {
      "epoch": 2.31,
      "grad_norm": 6.958917140960693,
      "learning_rate": 2.9859109830291387e-05,
      "loss": 0.2293,
      "step": 1605
    },
    {
      "epoch": 2.32,
      "grad_norm": 4.162193775177002,
      "learning_rate": 2.97790585975024e-05,
      "loss": 0.2095,
      "step": 1610
    },
    {
      "epoch": 2.32,
      "grad_norm": 3.701801061630249,
      "learning_rate": 2.9699007364713418e-05,
      "loss": 0.2339,
      "step": 1615
    },
    {
      "epoch": 2.33,
      "grad_norm": 3.290947437286377,
      "learning_rate": 2.9618956131924435e-05,
      "loss": 0.209,
      "step": 1620
    },
    {
      "epoch": 2.34,
      "grad_norm": 4.3231024742126465,
      "learning_rate": 2.953890489913545e-05,
      "loss": 0.2791,
      "step": 1625
    },
    {
      "epoch": 2.35,
      "grad_norm": 3.6642446517944336,
      "learning_rate": 2.9458853666346466e-05,
      "loss": 0.2382,
      "step": 1630
    },
    {
      "epoch": 2.35,
      "grad_norm": 6.942342281341553,
      "learning_rate": 2.937880243355748e-05,
      "loss": 0.2406,
      "step": 1635
    },
    {
      "epoch": 2.36,
      "grad_norm": 3.886199712753296,
      "learning_rate": 2.9298751200768493e-05,
      "loss": 0.218,
      "step": 1640
    },
    {
      "epoch": 2.37,
      "grad_norm": 3.8468515872955322,
      "learning_rate": 2.9218699967979507e-05,
      "loss": 0.2449,
      "step": 1645
    },
    {
      "epoch": 2.37,
      "grad_norm": 3.2598648071289062,
      "learning_rate": 2.913864873519052e-05,
      "loss": 0.2276,
      "step": 1650
    },
    {
      "epoch": 2.38,
      "grad_norm": 3.9356770515441895,
      "learning_rate": 2.905859750240154e-05,
      "loss": 0.2481,
      "step": 1655
    },
    {
      "epoch": 2.39,
      "grad_norm": 5.803495407104492,
      "learning_rate": 2.8978546269612555e-05,
      "loss": 0.2699,
      "step": 1660
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.3325111865997314,
      "learning_rate": 2.889849503682357e-05,
      "loss": 0.2206,
      "step": 1665
    },
    {
      "epoch": 2.4,
      "grad_norm": 5.40475606918335,
      "learning_rate": 2.8818443804034583e-05,
      "loss": 0.2295,
      "step": 1670
    },
    {
      "epoch": 2.41,
      "grad_norm": 4.207846164703369,
      "learning_rate": 2.8738392571245597e-05,
      "loss": 0.2268,
      "step": 1675
    },
    {
      "epoch": 2.42,
      "grad_norm": 3.405880928039551,
      "learning_rate": 2.8658341338456614e-05,
      "loss": 0.2773,
      "step": 1680
    },
    {
      "epoch": 2.43,
      "grad_norm": 4.502201557159424,
      "learning_rate": 2.8578290105667628e-05,
      "loss": 0.2459,
      "step": 1685
    },
    {
      "epoch": 2.43,
      "grad_norm": 2.8585033416748047,
      "learning_rate": 2.8498238872878645e-05,
      "loss": 0.2626,
      "step": 1690
    },
    {
      "epoch": 2.44,
      "grad_norm": 4.774590015411377,
      "learning_rate": 2.8418187640089662e-05,
      "loss": 0.2242,
      "step": 1695
    },
    {
      "epoch": 2.45,
      "grad_norm": 6.423954010009766,
      "learning_rate": 2.8338136407300676e-05,
      "loss": 0.2711,
      "step": 1700
    },
    {
      "epoch": 2.45,
      "grad_norm": 5.023673057556152,
      "learning_rate": 2.825808517451169e-05,
      "loss": 0.2191,
      "step": 1705
    },
    {
      "epoch": 2.46,
      "grad_norm": 3.246953010559082,
      "learning_rate": 2.8178033941722703e-05,
      "loss": 0.2032,
      "step": 1710
    },
    {
      "epoch": 2.47,
      "grad_norm": 4.740121364593506,
      "learning_rate": 2.8097982708933717e-05,
      "loss": 0.2257,
      "step": 1715
    },
    {
      "epoch": 2.48,
      "grad_norm": 4.652435302734375,
      "learning_rate": 2.801793147614473e-05,
      "loss": 0.2441,
      "step": 1720
    },
    {
      "epoch": 2.48,
      "grad_norm": 3.7246835231781006,
      "learning_rate": 2.7937880243355745e-05,
      "loss": 0.2064,
      "step": 1725
    },
    {
      "epoch": 2.49,
      "grad_norm": 2.8556969165802,
      "learning_rate": 2.7857829010566765e-05,
      "loss": 0.2002,
      "step": 1730
    },
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.9338796138763428, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.2608, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.847045660018921, |
|
"learning_rate": 2.7697726544988796e-05, |
|
"loss": 0.2167, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 3.5335538387298584, |
|
"learning_rate": 2.761767531219981e-05, |
|
"loss": 0.1966, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 3.702679395675659, |
|
"learning_rate": 2.7537624079410824e-05, |
|
"loss": 0.1865, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.013113498687744, |
|
"learning_rate": 2.7457572846621838e-05, |
|
"loss": 0.199, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.300877809524536, |
|
"learning_rate": 2.737752161383285e-05, |
|
"loss": 0.2504, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 5.806422233581543, |
|
"learning_rate": 2.7297470381043872e-05, |
|
"loss": 0.2362, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 6.372203826904297, |
|
"learning_rate": 2.7217419148254886e-05, |
|
"loss": 0.2298, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 6.462773323059082, |
|
"learning_rate": 2.71373679154659e-05, |
|
"loss": 0.2367, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 5.330246448516846, |
|
"learning_rate": 2.7057316682676913e-05, |
|
"loss": 0.2543, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 4.1171956062316895, |
|
"learning_rate": 2.6977265449887927e-05, |
|
"loss": 0.2057, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.247389316558838, |
|
"learning_rate": 2.6897214217098944e-05, |
|
"loss": 0.1965, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.7912063598632812, |
|
"learning_rate": 2.6817162984309958e-05, |
|
"loss": 0.2103, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.85927152633667, |
|
"learning_rate": 2.6737111751520975e-05, |
|
"loss": 0.2226, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.5677337646484375, |
|
"learning_rate": 2.6657060518731993e-05, |
|
"loss": 0.2193, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 5.31620979309082, |
|
"learning_rate": 2.6577009285943006e-05, |
|
"loss": 0.2569, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 5.1970038414001465, |
|
"learning_rate": 2.649695805315402e-05, |
|
"loss": 0.2235, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 3.6116130352020264, |
|
"learning_rate": 2.6416906820365034e-05, |
|
"loss": 0.2353, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 4.2939043045043945, |
|
"learning_rate": 2.6336855587576048e-05, |
|
"loss": 0.2448, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.7755072116851807, |
|
"learning_rate": 2.625680435478706e-05, |
|
"loss": 0.2131, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 4.578812122344971, |
|
"learning_rate": 2.6176753121998082e-05, |
|
"loss": 0.2167, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 4.904923439025879, |
|
"learning_rate": 2.6096701889209096e-05, |
|
"loss": 0.2228, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 5.128912448883057, |
|
"learning_rate": 2.601665065642011e-05, |
|
"loss": 0.2888, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 5.788363933563232, |
|
"learning_rate": 2.5936599423631124e-05, |
|
"loss": 0.2421, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 4.001156806945801, |
|
"learning_rate": 2.585654819084214e-05, |
|
"loss": 0.1997, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.3057475090026855, |
|
"learning_rate": 2.5776496958053155e-05, |
|
"loss": 0.2434, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 3.524348258972168, |
|
"learning_rate": 2.5696445725264168e-05, |
|
"loss": 0.2188, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 6.004559516906738, |
|
"learning_rate": 2.561639449247519e-05, |
|
"loss": 0.2426, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 4.429930686950684, |
|
"learning_rate": 2.5536343259686203e-05, |
|
"loss": 0.2306, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 5.706151008605957, |
|
"learning_rate": 2.5456292026897216e-05, |
|
"loss": 0.2194, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 4.148650169372559, |
|
"learning_rate": 2.537624079410823e-05, |
|
"loss": 0.2683, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 3.2449026107788086, |
|
"learning_rate": 2.5296189561319244e-05, |
|
"loss": 0.2539, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 3.6404850482940674, |
|
"learning_rate": 2.5216138328530258e-05, |
|
"loss": 0.2221, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 3.1382288932800293, |
|
"learning_rate": 2.513608709574127e-05, |
|
"loss": 0.2266, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 8.027711868286133, |
|
"learning_rate": 2.5056035862952292e-05, |
|
"loss": 0.2944, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 7.140124797821045, |
|
"learning_rate": 2.4975984630163306e-05, |
|
"loss": 0.2036, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.4655325412750244, |
|
"learning_rate": 2.489593339737432e-05, |
|
"loss": 0.1955, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.295433759689331, |
|
"learning_rate": 2.4815882164585337e-05, |
|
"loss": 0.2114, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 3.806304931640625, |
|
"learning_rate": 2.473583093179635e-05, |
|
"loss": 0.206, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 4.674000263214111, |
|
"learning_rate": 2.4655779699007368e-05, |
|
"loss": 0.2215, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.5063233375549316, |
|
"learning_rate": 2.4575728466218382e-05, |
|
"loss": 0.2583, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.4132816791534424, |
|
"learning_rate": 2.4495677233429396e-05, |
|
"loss": 0.2388, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 3.2140300273895264, |
|
"learning_rate": 2.441562600064041e-05, |
|
"loss": 0.2395, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 4.795976638793945, |
|
"learning_rate": 2.4335574767851427e-05, |
|
"loss": 0.2206, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 3.491682767868042, |
|
"learning_rate": 2.425552353506244e-05, |
|
"loss": 0.2553, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 4.174879550933838, |
|
"learning_rate": 2.4175472302273454e-05, |
|
"loss": 0.1969, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 3.776137590408325, |
|
"learning_rate": 2.409542106948447e-05, |
|
"loss": 0.2276, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 3.7050764560699463, |
|
"learning_rate": 2.4015369836695485e-05, |
|
"loss": 0.2001, |
|
"step": 1970 |
|
}, |
|
    {
      "epoch": 2.84,
      "grad_norm": 3.4648373126983643,
      "learning_rate": 2.3935318603906502e-05,
      "loss": 0.2538,
      "step": 1975
    },
    {
      "epoch": 2.85,
      "grad_norm": 4.3064727783203125,
      "learning_rate": 2.3855267371117516e-05,
      "loss": 0.2579,
      "step": 1980
    },
    {
      "epoch": 2.86,
      "grad_norm": 2.671032428741455,
      "learning_rate": 2.3775216138328533e-05,
      "loss": 0.2443,
      "step": 1985
    },
    {
      "epoch": 2.86,
      "grad_norm": 4.2159013748168945,
      "learning_rate": 2.3695164905539547e-05,
      "loss": 0.2373,
      "step": 1990
    },
    {
      "epoch": 2.87,
      "grad_norm": 3.787076711654663,
      "learning_rate": 2.361511367275056e-05,
      "loss": 0.2179,
      "step": 1995
    },
    {
      "epoch": 2.88,
      "grad_norm": 3.971762180328369,
      "learning_rate": 2.3535062439961578e-05,
      "loss": 0.2356,
      "step": 2000
    },
    {
      "epoch": 2.89,
      "grad_norm": 5.022749900817871,
      "learning_rate": 2.3455011207172592e-05,
      "loss": 0.2167,
      "step": 2005
    },
    {
      "epoch": 2.89,
      "grad_norm": 4.616547107696533,
      "learning_rate": 2.3374959974383606e-05,
      "loss": 0.2266,
      "step": 2010
    },
    {
      "epoch": 2.9,
      "grad_norm": 4.522019386291504,
      "learning_rate": 2.329490874159462e-05,
      "loss": 0.247,
      "step": 2015
    },
    {
      "epoch": 2.91,
      "grad_norm": 5.141051292419434,
      "learning_rate": 2.3214857508805637e-05,
      "loss": 0.2028,
      "step": 2020
    },
    {
      "epoch": 2.91,
      "grad_norm": 3.577793836593628,
      "learning_rate": 2.313480627601665e-05,
      "loss": 0.1924,
      "step": 2025
    },
    {
      "epoch": 2.92,
      "grad_norm": 5.1364665031433105,
      "learning_rate": 2.3054755043227668e-05,
      "loss": 0.226,
      "step": 2030
    },
    {
      "epoch": 2.93,
      "grad_norm": 3.8625662326812744,
      "learning_rate": 2.297470381043868e-05,
      "loss": 0.2329,
      "step": 2035
    },
    {
      "epoch": 2.94,
      "grad_norm": 4.119937419891357,
      "learning_rate": 2.28946525776497e-05,
      "loss": 0.2037,
      "step": 2040
    },
    {
      "epoch": 2.94,
      "grad_norm": 3.1188371181488037,
      "learning_rate": 2.2814601344860712e-05,
      "loss": 0.231,
      "step": 2045
    },
    {
      "epoch": 2.95,
      "grad_norm": 4.263334274291992,
      "learning_rate": 2.2734550112071726e-05,
      "loss": 0.219,
      "step": 2050
    },
    {
      "epoch": 2.96,
      "grad_norm": 4.002464771270752,
      "learning_rate": 2.2654498879282743e-05,
      "loss": 0.1927,
      "step": 2055
    },
    {
      "epoch": 2.97,
      "grad_norm": 3.5694775581359863,
      "learning_rate": 2.2574447646493757e-05,
      "loss": 0.1803,
      "step": 2060
    },
    {
      "epoch": 2.97,
      "grad_norm": 4.048843860626221,
      "learning_rate": 2.249439641370477e-05,
      "loss": 0.1837,
      "step": 2065
    },
    {
      "epoch": 2.98,
      "grad_norm": 4.335817337036133,
      "learning_rate": 2.2414345180915788e-05,
      "loss": 0.227,
      "step": 2070
    },
    {
      "epoch": 2.99,
      "grad_norm": 4.292420864105225,
      "learning_rate": 2.2334293948126802e-05,
      "loss": 0.2535,
      "step": 2075
    },
    {
      "epoch": 2.99,
      "grad_norm": 3.625598430633545,
      "learning_rate": 2.2254242715337816e-05,
      "loss": 0.1633,
      "step": 2080
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9487854251012146,
      "eval_loss": 0.1372506320476532,
      "eval_runtime": 31.8832,
      "eval_samples_per_second": 309.881,
      "eval_steps_per_second": 9.692,
      "step": 2084
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.9075140953063965,
      "learning_rate": 2.217419148254883e-05,
      "loss": 0.19,
      "step": 2085
    },
    {
      "epoch": 3.01,
      "grad_norm": 4.76453971862793,
      "learning_rate": 2.2094140249759847e-05,
      "loss": 0.214,
      "step": 2090
    },
    {
      "epoch": 3.02,
      "grad_norm": 3.710191011428833,
      "learning_rate": 2.2014089016970864e-05,
      "loss": 0.2197,
      "step": 2095
    },
    {
      "epoch": 3.02,
      "grad_norm": 3.287574529647827,
      "learning_rate": 2.1934037784181878e-05,
      "loss": 0.1939,
      "step": 2100
    },
    {
      "epoch": 3.03,
      "grad_norm": 3.7616758346557617,
      "learning_rate": 2.1853986551392895e-05,
      "loss": 0.209,
      "step": 2105
    },
    {
      "epoch": 3.04,
      "grad_norm": 3.6096699237823486,
      "learning_rate": 2.177393531860391e-05,
      "loss": 0.2195,
      "step": 2110
    },
    {
      "epoch": 3.04,
      "grad_norm": 4.259820461273193,
      "learning_rate": 2.1693884085814922e-05,
      "loss": 0.1813,
      "step": 2115
    },
    {
      "epoch": 3.05,
      "grad_norm": 4.710832118988037,
      "learning_rate": 2.1613832853025936e-05,
      "loss": 0.2054,
      "step": 2120
    },
    {
      "epoch": 3.06,
      "grad_norm": 2.757356882095337,
      "learning_rate": 2.1533781620236953e-05,
      "loss": 0.2276,
      "step": 2125
    },
    {
      "epoch": 3.07,
      "grad_norm": 4.743321418762207,
      "learning_rate": 2.1453730387447967e-05,
      "loss": 0.1603,
      "step": 2130
    },
    {
      "epoch": 3.07,
      "grad_norm": 3.536240339279175,
      "learning_rate": 2.137367915465898e-05,
      "loss": 0.1888,
      "step": 2135
    },
    {
      "epoch": 3.08,
      "grad_norm": 3.635094404220581,
      "learning_rate": 2.1293627921869998e-05,
      "loss": 0.1841,
      "step": 2140
    },
    {
      "epoch": 3.09,
      "grad_norm": 4.491457939147949,
      "learning_rate": 2.1213576689081012e-05,
      "loss": 0.2013,
      "step": 2145
    },
    {
      "epoch": 3.09,
      "grad_norm": 5.20548152923584,
      "learning_rate": 2.113352545629203e-05,
      "loss": 0.1618,
      "step": 2150
    },
    {
      "epoch": 3.1,
      "grad_norm": 3.6702117919921875,
      "learning_rate": 2.1053474223503043e-05,
      "loss": 0.2106,
      "step": 2155
    },
    {
      "epoch": 3.11,
      "grad_norm": 3.9622325897216797,
      "learning_rate": 2.097342299071406e-05,
      "loss": 0.2488,
      "step": 2160
    },
    {
      "epoch": 3.12,
      "grad_norm": 7.823854923248291,
      "learning_rate": 2.0893371757925074e-05,
      "loss": 0.2107,
      "step": 2165
    },
    {
      "epoch": 3.12,
      "grad_norm": 5.4744791984558105,
      "learning_rate": 2.0813320525136088e-05,
      "loss": 0.1888,
      "step": 2170
    },
    {
      "epoch": 3.13,
      "grad_norm": 3.024887800216675,
      "learning_rate": 2.0733269292347105e-05,
      "loss": 0.2051,
      "step": 2175
    },
    {
      "epoch": 3.14,
      "grad_norm": 3.444693088531494,
      "learning_rate": 2.065321805955812e-05,
      "loss": 0.2404,
      "step": 2180
    },
    {
      "epoch": 3.15,
      "grad_norm": 4.3029656410217285,
      "learning_rate": 2.0573166826769133e-05,
      "loss": 0.215,
      "step": 2185
    },
    {
      "epoch": 3.15,
      "grad_norm": 4.038111209869385,
      "learning_rate": 2.0493115593980146e-05,
      "loss": 0.2003,
      "step": 2190
    },
    {
      "epoch": 3.16,
      "grad_norm": 4.064023494720459,
      "learning_rate": 2.0413064361191164e-05,
      "loss": 0.1961,
      "step": 2195
    },
    {
      "epoch": 3.17,
      "grad_norm": 5.2245707511901855,
      "learning_rate": 2.0333013128402177e-05,
      "loss": 0.2172,
      "step": 2200
    },
    {
      "epoch": 3.17,
      "grad_norm": 4.670438289642334,
      "learning_rate": 2.025296189561319e-05,
      "loss": 0.1992,
      "step": 2205
    },
    {
      "epoch": 3.18,
      "grad_norm": 4.39680290222168,
      "learning_rate": 2.017291066282421e-05,
      "loss": 0.2174,
      "step": 2210
    },
    {
      "epoch": 3.19,
      "grad_norm": 6.914219379425049,
      "learning_rate": 2.0092859430035225e-05,
      "loss": 0.1968,
      "step": 2215
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.2190115451812744,
      "learning_rate": 2.001280819724624e-05,
      "loss": 0.1939,
      "step": 2220
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.638925075531006,
      "learning_rate": 1.9932756964457253e-05,
      "loss": 0.2431,
      "step": 2225
    },
    {
      "epoch": 3.21,
      "grad_norm": 5.030416965484619,
      "learning_rate": 1.985270573166827e-05,
      "loss": 0.2094,
      "step": 2230
    },
    {
      "epoch": 3.22,
      "grad_norm": 5.105839729309082,
      "learning_rate": 1.9772654498879284e-05,
      "loss": 0.2165,
      "step": 2235
    },
    {
      "epoch": 3.22,
      "grad_norm": 4.913294315338135,
      "learning_rate": 1.9692603266090298e-05,
      "loss": 0.2171,
      "step": 2240
    },
    {
      "epoch": 3.23,
      "grad_norm": 4.230659008026123,
      "learning_rate": 1.961255203330131e-05,
      "loss": 0.2088,
      "step": 2245
    },
    {
      "epoch": 3.24,
      "grad_norm": 4.271526336669922,
      "learning_rate": 1.953250080051233e-05,
      "loss": 0.215,
      "step": 2250
    },
    {
      "epoch": 3.25,
      "grad_norm": 6.460733413696289,
      "learning_rate": 1.9452449567723343e-05,
      "loss": 0.2241,
      "step": 2255
    },
    {
      "epoch": 3.25,
      "grad_norm": 2.8896567821502686,
      "learning_rate": 1.9372398334934356e-05,
      "loss": 0.1587,
      "step": 2260
    },
    {
      "epoch": 3.26,
      "grad_norm": 3.2169876098632812,
      "learning_rate": 1.9292347102145374e-05,
      "loss": 0.1587,
      "step": 2265
    },
    {
      "epoch": 3.27,
      "grad_norm": 4.299535274505615,
      "learning_rate": 1.921229586935639e-05,
      "loss": 0.1819,
      "step": 2270
    },
    {
      "epoch": 3.27,
      "grad_norm": 3.9862189292907715,
      "learning_rate": 1.9132244636567405e-05,
      "loss": 0.2099,
      "step": 2275
    },
    {
      "epoch": 3.28,
      "grad_norm": 5.323502540588379,
      "learning_rate": 1.905219340377842e-05,
      "loss": 0.222,
      "step": 2280
    },
    {
      "epoch": 3.29,
      "grad_norm": 3.4311234951019287,
      "learning_rate": 1.8972142170989436e-05,
      "loss": 0.1956,
      "step": 2285
    },
    {
      "epoch": 3.3,
      "grad_norm": 4.878343105316162,
      "learning_rate": 1.889209093820045e-05,
      "loss": 0.1814,
      "step": 2290
    },
    {
      "epoch": 3.3,
      "grad_norm": 2.903064489364624,
      "learning_rate": 1.8812039705411463e-05,
      "loss": 0.2397,
      "step": 2295
    },
    {
      "epoch": 3.31,
      "grad_norm": 5.286783695220947,
      "learning_rate": 1.873198847262248e-05,
      "loss": 0.2362,
      "step": 2300
    },
    {
      "epoch": 3.32,
      "grad_norm": 4.201813220977783,
      "learning_rate": 1.8651937239833494e-05,
      "loss": 0.2235,
      "step": 2305
    },
    {
      "epoch": 3.32,
      "grad_norm": 3.4148082733154297,
      "learning_rate": 1.8571886007044508e-05,
      "loss": 0.1922,
      "step": 2310
    },
    {
      "epoch": 3.33,
      "grad_norm": 4.562300682067871,
      "learning_rate": 1.8491834774255522e-05,
      "loss": 0.2013,
      "step": 2315
    },
    {
      "epoch": 3.34,
      "grad_norm": 6.004905700683594,
      "learning_rate": 1.841178354146654e-05,
      "loss": 0.2215,
      "step": 2320
    },
    {
      "epoch": 3.35,
      "grad_norm": 4.642991065979004,
      "learning_rate": 1.8331732308677556e-05,
      "loss": 0.2085,
      "step": 2325
    },
    {
      "epoch": 3.35,
      "grad_norm": 2.796497344970703,
      "learning_rate": 1.825168107588857e-05,
      "loss": 0.2126,
      "step": 2330
    },
    {
      "epoch": 3.36,
      "grad_norm": 6.009349346160889,
      "learning_rate": 1.8171629843099587e-05,
      "loss": 0.1906,
      "step": 2335
    },
    {
      "epoch": 3.37,
      "grad_norm": 4.415472507476807,
      "learning_rate": 1.80915786103106e-05,
      "loss": 0.2013,
      "step": 2340
    },
    {
      "epoch": 3.38,
      "grad_norm": 2.890207529067993,
      "learning_rate": 1.8011527377521615e-05,
      "loss": 0.2017,
      "step": 2345
    },
    {
      "epoch": 3.38,
      "grad_norm": 3.2712149620056152,
      "learning_rate": 1.793147614473263e-05,
      "loss": 0.1997,
      "step": 2350
    },
    {
      "epoch": 3.39,
      "grad_norm": 4.87721061706543,
      "learning_rate": 1.7851424911943646e-05,
      "loss": 0.1944,
      "step": 2355
    },
    {
      "epoch": 3.4,
      "grad_norm": 5.590481281280518,
      "learning_rate": 1.777137367915466e-05,
      "loss": 0.1749,
      "step": 2360
    },
    {
      "epoch": 3.4,
      "grad_norm": 3.1477975845336914,
      "learning_rate": 1.7691322446365673e-05,
      "loss": 0.1734,
      "step": 2365
    },
    {
      "epoch": 3.41,
      "grad_norm": 4.50333309173584,
      "learning_rate": 1.761127121357669e-05,
      "loss": 0.244,
      "step": 2370
    },
    {
      "epoch": 3.42,
      "grad_norm": 4.189910411834717,
      "learning_rate": 1.7531219980787704e-05,
      "loss": 0.2015,
      "step": 2375
    },
    {
      "epoch": 3.43,
      "grad_norm": 4.48671817779541,
      "learning_rate": 1.7451168747998718e-05,
      "loss": 0.1994,
      "step": 2380
    },
    {
      "epoch": 3.43,
      "grad_norm": 3.9251739978790283,
      "learning_rate": 1.7371117515209735e-05,
      "loss": 0.1798,
      "step": 2385
    },
    {
      "epoch": 3.44,
      "grad_norm": 2.792525291442871,
      "learning_rate": 1.7291066282420752e-05,
      "loss": 0.1628,
      "step": 2390
    },
    {
      "epoch": 3.45,
      "grad_norm": 3.325592041015625,
      "learning_rate": 1.7211015049631766e-05,
      "loss": 0.2069,
      "step": 2395
    },
    {
      "epoch": 3.45,
      "grad_norm": 3.9942626953125,
      "learning_rate": 1.713096381684278e-05,
      "loss": 0.1866,
      "step": 2400
    },
    {
      "epoch": 3.46,
      "grad_norm": 5.486047267913818,
      "learning_rate": 1.7050912584053797e-05,
      "loss": 0.2185,
      "step": 2405
    },
    {
      "epoch": 3.47,
      "grad_norm": 3.5321319103240967,
      "learning_rate": 1.697086135126481e-05,
      "loss": 0.2068,
      "step": 2410
    },
    {
      "epoch": 3.48,
      "grad_norm": 4.118142127990723,
      "learning_rate": 1.6890810118475825e-05,
      "loss": 0.2076,
      "step": 2415
    },
    {
      "epoch": 3.48,
      "grad_norm": 4.678371906280518,
      "learning_rate": 1.681075888568684e-05,
      "loss": 0.1948,
      "step": 2420
    },
    {
      "epoch": 3.49,
      "grad_norm": 5.298951148986816,
      "learning_rate": 1.6730707652897856e-05,
      "loss": 0.2142,
      "step": 2425
    },
    {
      "epoch": 3.5,
      "grad_norm": 4.5779900550842285,
      "learning_rate": 1.665065642010887e-05,
      "loss": 0.1994,
      "step": 2430
    },
    {
      "epoch": 3.5,
      "grad_norm": 4.762623310089111,
      "learning_rate": 1.6570605187319883e-05,
      "loss": 0.2213,
      "step": 2435
    },
    {
      "epoch": 3.51,
      "grad_norm": 4.956728458404541,
      "learning_rate": 1.64905539545309e-05,
      "loss": 0.1818,
      "step": 2440
    },
    {
      "epoch": 3.52,
      "grad_norm": 3.7195310592651367,
      "learning_rate": 1.6410502721741918e-05,
      "loss": 0.2171,
      "step": 2445
    },
    {
      "epoch": 3.53,
      "grad_norm": 3.115422010421753,
      "learning_rate": 1.633045148895293e-05,
      "loss": 0.1873,
      "step": 2450
    },
    {
      "epoch": 3.53,
      "grad_norm": 2.4611568450927734,
      "learning_rate": 1.6250400256163945e-05,
      "loss": 0.1999,
      "step": 2455
    },
    {
      "epoch": 3.54,
      "grad_norm": 7.129974842071533,
      "learning_rate": 1.6170349023374962e-05,
      "loss": 0.2039,
      "step": 2460
    },
    {
      "epoch": 3.55,
      "grad_norm": 3.4364309310913086,
      "learning_rate": 1.6090297790585976e-05,
      "loss": 0.2019,
      "step": 2465
    },
    {
      "epoch": 3.56,
      "grad_norm": 7.869508266448975,
      "learning_rate": 1.601024655779699e-05,
      "loss": 0.1678,
      "step": 2470
    },
    {
      "epoch": 3.56,
      "grad_norm": 4.7185378074646,
      "learning_rate": 1.5930195325008007e-05,
      "loss": 0.1934,
      "step": 2475
    },
    {
      "epoch": 3.57,
      "grad_norm": 7.357175350189209,
      "learning_rate": 1.585014409221902e-05,
      "loss": 0.1998,
      "step": 2480
    },
    {
      "epoch": 3.58,
      "grad_norm": 3.6080660820007324,
      "learning_rate": 1.5770092859430035e-05,
      "loss": 0.1949,
      "step": 2485
    },
    {
      "epoch": 3.58,
      "grad_norm": 2.9534220695495605,
      "learning_rate": 1.569004162664105e-05,
      "loss": 0.1772,
      "step": 2490
    },
    {
      "epoch": 3.59,
      "grad_norm": 4.7188401222229,
      "learning_rate": 1.5609990393852066e-05,
      "loss": 0.2164,
      "step": 2495
    },
    {
      "epoch": 3.6,
      "grad_norm": 5.8504180908203125,
      "learning_rate": 1.552993916106308e-05,
      "loss": 0.2283,
      "step": 2500
    },
    {
      "epoch": 3.61,
      "grad_norm": 4.23643684387207,
      "learning_rate": 1.5449887928274097e-05,
      "loss": 0.2003,
      "step": 2505
    },
    {
      "epoch": 3.61,
      "grad_norm": 2.19675350189209,
      "learning_rate": 1.5369836695485114e-05,
      "loss": 0.1997,
      "step": 2510
    },
    {
      "epoch": 3.62,
      "grad_norm": 5.1381330490112305,
      "learning_rate": 1.5289785462696128e-05,
      "loss": 0.195,
      "step": 2515
    },
    {
      "epoch": 3.63,
      "grad_norm": 3.739199161529541,
      "learning_rate": 1.5209734229907142e-05,
      "loss": 0.1596,
      "step": 2520
    },
    {
      "epoch": 3.63,
      "grad_norm": 4.581226348876953,
      "learning_rate": 1.5129682997118155e-05,
      "loss": 0.2086,
      "step": 2525
    },
    {
      "epoch": 3.64,
      "grad_norm": 5.416107177734375,
      "learning_rate": 1.5049631764329173e-05,
      "loss": 0.2517,
      "step": 2530
    },
    {
      "epoch": 3.65,
      "grad_norm": 6.070262908935547,
      "learning_rate": 1.4969580531540186e-05,
      "loss": 0.1801,
      "step": 2535
    },
    {
      "epoch": 3.66,
      "grad_norm": 4.063976764678955,
      "learning_rate": 1.48895292987512e-05,
      "loss": 0.2302,
      "step": 2540
    },
    {
      "epoch": 3.66,
      "grad_norm": 3.717087745666504,
      "learning_rate": 1.4809478065962217e-05,
      "loss": 0.2185,
      "step": 2545
    },
    {
      "epoch": 3.67,
      "grad_norm": 3.2319772243499756,
      "learning_rate": 1.4729426833173233e-05,
      "loss": 0.2609,
      "step": 2550
    },
    {
      "epoch": 3.68,
      "grad_norm": 3.7224340438842773,
      "learning_rate": 1.4649375600384247e-05,
      "loss": 0.1906,
      "step": 2555
    },
    {
      "epoch": 3.68,
      "grad_norm": 6.972284317016602,
      "learning_rate": 1.456932436759526e-05,
      "loss": 0.2232,
      "step": 2560
    },
    {
      "epoch": 3.69,
      "grad_norm": 3.514923095703125,
      "learning_rate": 1.4489273134806278e-05,
      "loss": 0.2081,
      "step": 2565
    },
    {
      "epoch": 3.7,
      "grad_norm": 5.140145301818848,
      "learning_rate": 1.4409221902017291e-05,
      "loss": 0.2099,
      "step": 2570
    },
    {
      "epoch": 3.71,
      "grad_norm": 2.977041482925415,
      "learning_rate": 1.4329170669228307e-05,
      "loss": 0.1689,
      "step": 2575
    },
    {
      "epoch": 3.71,
      "grad_norm": 2.9438095092773438,
      "learning_rate": 1.4249119436439322e-05,
      "loss": 0.1788,
      "step": 2580
    },
    {
      "epoch": 3.72,
      "grad_norm": 3.311598777770996,
      "learning_rate": 1.4169068203650338e-05,
      "loss": 0.1787,
      "step": 2585
    },
    {
      "epoch": 3.73,
      "grad_norm": 4.066298961639404,
      "learning_rate": 1.4089016970861352e-05,
      "loss": 0.2049,
      "step": 2590
    },
    {
      "epoch": 3.74,
      "grad_norm": 3.8641276359558105,
      "learning_rate": 1.4008965738072365e-05,
      "loss": 0.2064,
      "step": 2595
    },
    {
      "epoch": 3.74,
      "grad_norm": 4.785098075866699,
      "learning_rate": 1.3928914505283383e-05,
      "loss": 0.213,
      "step": 2600
    },
    {
      "epoch": 3.75,
      "grad_norm": 3.3832712173461914,
      "learning_rate": 1.3848863272494398e-05,
      "loss": 0.203,
      "step": 2605
    },
    {
      "epoch": 3.76,
      "grad_norm": 3.8471434116363525,
      "learning_rate": 1.3768812039705412e-05,
      "loss": 0.2192,
      "step": 2610
    },
    {
      "epoch": 3.76,
      "grad_norm": 4.769313335418701,
      "learning_rate": 1.3688760806916426e-05,
      "loss": 0.2191,
      "step": 2615
    },
    {
      "epoch": 3.77,
      "grad_norm": 3.5882818698883057,
      "learning_rate": 1.3608709574127443e-05,
      "loss": 0.1952,
      "step": 2620
    },
    {
      "epoch": 3.78,
      "grad_norm": 4.177798271179199,
      "learning_rate": 1.3528658341338457e-05,
      "loss": 0.2209,
      "step": 2625
    },
    {
      "epoch": 3.79,
      "grad_norm": 5.218222618103027,
      "learning_rate": 1.3448607108549472e-05,
      "loss": 0.1953,
      "step": 2630
    },
    {
      "epoch": 3.79,
      "grad_norm": 4.669002056121826,
      "learning_rate": 1.3368555875760488e-05,
      "loss": 0.2017,
      "step": 2635
    },
    {
      "epoch": 3.8,
      "grad_norm": 4.992402076721191,
      "learning_rate": 1.3288504642971503e-05,
      "loss": 0.2702,
      "step": 2640
    },
    {
      "epoch": 3.81,
      "grad_norm": 3.818152666091919,
      "learning_rate": 1.3208453410182517e-05,
      "loss": 0.2195,
      "step": 2645
    },
    {
      "epoch": 3.81,
      "grad_norm": 3.825201988220215,
      "learning_rate": 1.312840217739353e-05,
      "loss": 0.2086,
      "step": 2650
    },
    {
      "epoch": 3.82,
      "grad_norm": 3.2888553142547607,
      "learning_rate": 1.3048350944604548e-05,
      "loss": 0.1899,
      "step": 2655
    },
    {
      "epoch": 3.83,
      "grad_norm": 4.896663665771484,
      "learning_rate": 1.2968299711815562e-05,
      "loss": 0.2154,
      "step": 2660
    },
    {
      "epoch": 3.84,
      "grad_norm": 3.9895691871643066,
      "learning_rate": 1.2888248479026577e-05,
      "loss": 0.2251,
      "step": 2665
    },
    {
      "epoch": 3.84,
      "grad_norm": 3.9652981758117676,
      "learning_rate": 1.2808197246237594e-05,
      "loss": 0.2116,
      "step": 2670
    },
    {
      "epoch": 3.85,
      "grad_norm": 4.93154764175415,
      "learning_rate": 1.2728146013448608e-05,
      "loss": 0.2597,
      "step": 2675
    },
    {
      "epoch": 3.86,
      "grad_norm": 4.236401081085205,
      "learning_rate": 1.2648094780659622e-05,
      "loss": 0.2312,
      "step": 2680
    },
    {
      "epoch": 3.86,
      "grad_norm": 3.95443058013916,
      "learning_rate": 1.2568043547870636e-05,
      "loss": 0.1696,
      "step": 2685
    },
    {
      "epoch": 3.87,
      "grad_norm": 2.7311601638793945,
      "learning_rate": 1.2487992315081653e-05,
      "loss": 0.1625,
      "step": 2690
    },
    {
      "epoch": 3.88,
      "grad_norm": 3.6803927421569824,
      "learning_rate": 1.2407941082292668e-05,
      "loss": 0.2069,
      "step": 2695
    },
    {
      "epoch": 3.89,
      "grad_norm": 3.391956329345703,
      "learning_rate": 1.2327889849503684e-05,
      "loss": 0.1779,
      "step": 2700
    },
    {
      "epoch": 3.89,
      "grad_norm": 3.478215456008911,
      "learning_rate": 1.2247838616714698e-05,
      "loss": 0.1874,
      "step": 2705
    },
    {
      "epoch": 3.9,
      "grad_norm": 2.4775846004486084,
      "learning_rate": 1.2167787383925713e-05,
      "loss": 0.1953,
      "step": 2710
    },
    {
      "epoch": 3.91,
      "grad_norm": 4.715533256530762,
      "learning_rate": 1.2087736151136727e-05,
      "loss": 0.1863,
      "step": 2715
    },
    {
      "epoch": 3.92,
      "grad_norm": 4.083915710449219,
      "learning_rate": 1.2007684918347743e-05,
      "loss": 0.1871,
      "step": 2720
    },
    {
      "epoch": 3.92,
      "grad_norm": 2.535428285598755,
      "learning_rate": 1.1927633685558758e-05,
      "loss": 0.2084,
      "step": 2725
    },
    {
      "epoch": 3.93,
      "grad_norm": 5.987590789794922,
      "learning_rate": 1.1847582452769774e-05,
      "loss": 0.172,
      "step": 2730
    },
    {
      "epoch": 3.94,
      "grad_norm": 4.185674667358398,
      "learning_rate": 1.1767531219980789e-05,
      "loss": 0.2106,
      "step": 2735
    },
    {
      "epoch": 3.94,
      "grad_norm": 3.0659992694854736,
      "learning_rate": 1.1687479987191803e-05,
      "loss": 0.1839,
      "step": 2740
    },
    {
      "epoch": 3.95,
      "grad_norm": 8.405370712280273,
      "learning_rate": 1.1607428754402818e-05,
      "loss": 0.2449,
      "step": 2745
    },
    {
      "epoch": 3.96,
      "grad_norm": 5.262624740600586,
      "learning_rate": 1.1527377521613834e-05,
      "loss": 0.1982,
      "step": 2750
    },
    {
      "epoch": 3.97,
      "grad_norm": 3.3970797061920166,
      "learning_rate": 1.144732628882485e-05,
      "loss": 0.2383,
      "step": 2755
    },
    {
      "epoch": 3.97,
      "grad_norm": 4.604133129119873,
      "learning_rate": 1.1367275056035863e-05,
      "loss": 0.211,
      "step": 2760
    },
    {
      "epoch": 3.98,
      "grad_norm": 4.767920970916748,
      "learning_rate": 1.1287223823246879e-05,
      "loss": 0.2111,
      "step": 2765
    },
    {
      "epoch": 3.99,
      "grad_norm": 4.075857162475586,
      "learning_rate": 1.1207172590457894e-05,
      "loss": 0.2011,
      "step": 2770
    },
    {
      "epoch": 3.99,
      "grad_norm": 3.293419599533081,
      "learning_rate": 1.1127121357668908e-05,
      "loss": 0.1943,
      "step": 2775
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9510121457489878,
      "eval_loss": 0.131936714053154,
      "eval_runtime": 31.7023,
      "eval_samples_per_second": 311.649,
      "eval_steps_per_second": 9.747,
      "step": 2779
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.756840229034424,
      "learning_rate": 1.1047070124879923e-05,
      "loss": 0.2012,
      "step": 2780
    },
    {
      "epoch": 4.01,
      "grad_norm": 4.239038467407227,
      "learning_rate": 1.0967018892090939e-05,
      "loss": 0.1637,
      "step": 2785
    },
    {
      "epoch": 4.02,
      "grad_norm": 3.6597139835357666,
      "learning_rate": 1.0886967659301954e-05,
      "loss": 0.1848,
      "step": 2790
    },
    {
      "epoch": 4.02,
      "grad_norm": 3.050875425338745,
      "learning_rate": 1.0806916426512968e-05,
      "loss": 0.1565,
      "step": 2795
    },
    {
      "epoch": 4.03,
      "grad_norm": 4.3006463050842285,
      "learning_rate": 1.0726865193723984e-05,
      "loss": 0.2116,
      "step": 2800
    },
    {
      "epoch": 4.04,
      "grad_norm": 4.682863712310791,
      "learning_rate": 1.0646813960934999e-05,
      "loss": 0.1974,
      "step": 2805
    },
    {
      "epoch": 4.04,
      "grad_norm": 3.8604190349578857,
      "learning_rate": 1.0566762728146015e-05,
      "loss": 0.1972,
      "step": 2810
    },
    {
      "epoch": 4.05,
      "grad_norm": 4.325167655944824,
      "learning_rate": 1.048671149535703e-05,
      "loss": 0.1732,
      "step": 2815
    },
    {
      "epoch": 4.06,
      "grad_norm": 6.881094932556152,
      "learning_rate": 1.0406660262568044e-05,
      "loss": 0.2527,
      "step": 2820
    },
    {
      "epoch": 4.07,
      "grad_norm": 6.374682426452637,
      "learning_rate": 1.032660902977906e-05,
      "loss": 0.224,
      "step": 2825
    },
    {
      "epoch": 4.07,
      "grad_norm": 3.154886245727539,
      "learning_rate": 1.0246557796990073e-05,
      "loss": 0.1613,
      "step": 2830
    },
    {
      "epoch": 4.08,
      "grad_norm": 5.165164470672607,
      "learning_rate": 1.0166506564201089e-05,
      "loss": 0.225,
      "step": 2835
    },
    {
      "epoch": 4.09,
      "grad_norm": 3.388165235519409,
      "learning_rate": 1.0086455331412104e-05,
      "loss": 0.2189,
      "step": 2840
    },
    {
      "epoch": 4.09,
      "grad_norm": 4.795779705047607,
      "learning_rate": 1.000640409862312e-05,
      "loss": 0.2027,
      "step": 2845
    },
    {
      "epoch": 4.1,
      "grad_norm": 3.341182231903076,
      "learning_rate": 9.926352865834135e-06,
      "loss": 0.1931,
      "step": 2850
    },
    {
      "epoch": 4.11,
      "grad_norm": 1.956528902053833,
      "learning_rate": 9.846301633045149e-06,
      "loss": 0.2208,
      "step": 2855
    },
    {
      "epoch": 4.12,
      "grad_norm": 6.8234076499938965,
      "learning_rate": 9.766250400256164e-06,
      "loss": 0.2388,
      "step": 2860
    },
    {
      "epoch": 4.12,
      "grad_norm": 2.924370527267456,
      "learning_rate": 9.686199167467178e-06,
      "loss": 0.2046,
      "step": 2865
    },
    {
      "epoch": 4.13,
      "grad_norm": 5.049492359161377,
      "learning_rate": 9.606147934678195e-06,
      "loss": 0.1876,
      "step": 2870
    },
    {
      "epoch": 4.14,
      "grad_norm": 4.749929428100586,
      "learning_rate": 9.52609670188921e-06,
      "loss": 0.1649,
      "step": 2875
    },
    {
      "epoch": 4.15,
      "grad_norm": 3.702878475189209,
      "learning_rate": 9.446045469100225e-06,
      "loss": 0.2309,
      "step": 2880
    },
    {
      "epoch": 4.15,
      "grad_norm": 6.8818745613098145,
      "learning_rate": 9.36599423631124e-06,
      "loss": 0.2012,
      "step": 2885
    },
    {
      "epoch": 4.16,
      "grad_norm": 3.418677568435669,
      "learning_rate": 9.285943003522254e-06,
      "loss": 0.2209,
      "step": 2890
    },
    {
      "epoch": 4.17,
      "grad_norm": 3.8437540531158447,
      "learning_rate": 9.20589177073327e-06,
      "loss": 0.1668,
      "step": 2895
    },
    {
      "epoch": 4.17,
      "grad_norm": 3.2534446716308594,
      "learning_rate": 9.125840537944285e-06,
      "loss": 0.2346,
      "step": 2900
    },
    {
      "epoch": 4.18,
      "grad_norm": 4.049452781677246,
      "learning_rate": 9.0457893051553e-06,
      "loss": 0.1752,
      "step": 2905
    },
    {
      "epoch": 4.19,
      "grad_norm": 4.121111869812012,
      "learning_rate": 8.965738072366314e-06,
      "loss": 0.2057,
      "step": 2910
    },
    {
      "epoch": 4.2,
      "grad_norm": 5.423705577850342,
      "learning_rate": 8.88568683957733e-06,
      "loss": 0.1958,
      "step": 2915
    },
    {
      "epoch": 4.2,
      "grad_norm": 3.153987407684326,
      "learning_rate": 8.805635606788345e-06,
      "loss": 0.1547,
      "step": 2920
    },
    {
      "epoch": 4.21,
      "grad_norm": 3.7586491107940674,
      "learning_rate": 8.725584373999359e-06,
      "loss": 0.224,
      "step": 2925
    },
    {
      "epoch": 4.22,
      "grad_norm": 4.077225208282471,
      "learning_rate": 8.645533141210376e-06,
      "loss": 0.2113,
      "step": 2930
    },
    {
      "epoch": 4.22,
      "grad_norm": 6.970191478729248,
      "learning_rate": 8.56548190842139e-06,
      "loss": 0.2032,
      "step": 2935
    },
    {
      "epoch": 4.23,
      "grad_norm": 4.3456926345825195,
      "learning_rate": 8.485430675632405e-06,
      "loss": 0.2111,
      "step": 2940
    },
    {
      "epoch": 4.24,
      "grad_norm": 3.5162301063537598,
      "learning_rate": 8.40537944284342e-06,
      "loss": 0.1873,
      "step": 2945
    },
    {
      "epoch": 4.25,
      "grad_norm": 5.653372764587402,
      "learning_rate": 8.325328210054435e-06,
      "loss": 0.178,
      "step": 2950
    },
    {
      "epoch": 4.25,
      "grad_norm": 2.084319829940796,
      "learning_rate": 8.24527697726545e-06,
      "loss": 0.185,
      "step": 2955
    },
    {
      "epoch": 4.26,
      "grad_norm": 3.9863054752349854,
      "learning_rate": 8.165225744476466e-06,
      "loss": 0.1945,
      "step": 2960
    },
    {
      "epoch": 4.27,
      "grad_norm": 6.000556468963623,
      "learning_rate": 8.085174511687481e-06,
      "loss": 0.1823,
      "step": 2965
    },
    {
      "epoch": 4.27,
      "grad_norm": 3.515742778778076,
      "learning_rate": 8.005123278898495e-06,
      "loss": 0.1957,
      "step": 2970
    },
    {
      "epoch": 4.28,
      "grad_norm": 2.8108863830566406,
      "learning_rate": 7.92507204610951e-06,
      "loss": 0.1838,
      "step": 2975
    },
    {
      "epoch": 4.29,
      "grad_norm": 5.262875556945801,
      "learning_rate": 7.845020813320524e-06,
      "loss": 0.2389,
      "step": 2980
    },
    {
      "epoch": 4.3,
      "grad_norm": 5.4690752029418945,
      "learning_rate": 7.76496958053154e-06,
      "loss": 0.1823,
      "step": 2985
    },
    {
      "epoch": 4.3,
      "grad_norm": 2.1274213790893555,
      "learning_rate": 7.684918347742557e-06,
      "loss": 0.1233,
      "step": 2990
    },
    {
      "epoch": 4.31,
      "grad_norm": 6.855415344238281,
      "learning_rate": 7.604867114953571e-06,
      "loss": 0.2284,
      "step": 2995
    },
    {
      "epoch": 4.32,
      "grad_norm": 5.152151584625244,
      "learning_rate": 7.524815882164586e-06,
      "loss": 0.1856,
      "step": 3000
    },
    {
      "epoch": 4.33,
      "grad_norm": 4.211722373962402,
      "learning_rate": 7.4447646493756e-06,
      "loss": 0.2111,
      "step": 3005
    },
    {
      "epoch": 4.33,
      "grad_norm": 4.821152210235596,
      "learning_rate": 7.364713416586616e-06,
      "loss": 0.1541,
      "step": 3010
    },
    {
      "epoch": 4.34,
      "grad_norm": 3.2400951385498047,
      "learning_rate": 7.28466218379763e-06,
      "loss": 0.21,
      "step": 3015
    },
    {
      "epoch": 4.35,
      "grad_norm": 3.82334566116333,
      "learning_rate": 7.204610951008646e-06,
      "loss": 0.1835,
      "step": 3020
    },
    {
      "epoch": 4.35,
      "grad_norm": 4.301241397857666,
      "learning_rate": 7.124559718219661e-06,
      "loss": 0.2246,
      "step": 3025
    },
    {
      "epoch": 4.36,
      "grad_norm": 3.4558205604553223,
      "learning_rate": 7.044508485430676e-06,
      "loss": 0.1766,
      "step": 3030
    },
    {
      "epoch": 4.37,
      "grad_norm": 3.872791290283203,
      "learning_rate": 6.964457252641691e-06,
      "loss": 0.2126,
      "step": 3035
    },
    {
      "epoch": 4.38,
      "grad_norm": 2.319420099258423,
      "learning_rate": 6.884406019852706e-06,
      "loss": 0.179,
      "step": 3040
    },
    {
      "epoch": 4.38,
      "grad_norm": 6.737104892730713,
      "learning_rate": 6.8043547870637215e-06,
      "loss": 0.1882,
      "step": 3045
    },
    {
      "epoch": 4.39,
      "grad_norm": 4.559133052825928,
      "learning_rate": 6.724303554274736e-06,
      "loss": 0.1808,
      "step": 3050
    },
    {
      "epoch": 4.4,
      "grad_norm": 3.060370922088623,
      "learning_rate": 6.644252321485752e-06,
      "loss": 0.1923,
      "step": 3055
    },
    {
      "epoch": 4.4,
      "grad_norm": 5.091296672821045,
      "learning_rate": 6.564201088696765e-06,
      "loss": 0.2012,
      "step": 3060
    },
    {
      "epoch": 4.41,
      "grad_norm": 2.942782163619995,
      "learning_rate": 6.484149855907781e-06,
      "loss": 0.1731,
      "step": 3065
    },
    {
      "epoch": 4.42,
      "grad_norm": 4.692785263061523,
      "learning_rate": 6.404098623118797e-06,
      "loss": 0.1765,
      "step": 3070
    },
    {
      "epoch": 4.43,
      "grad_norm": 4.15416145324707,
      "learning_rate": 6.324047390329811e-06,
      "loss": 0.168,
      "step": 3075
    },
    {
      "epoch": 4.43,
      "grad_norm": 4.836540699005127,
      "learning_rate": 6.2439961575408265e-06,
      "loss": 0.1884,
      "step": 3080
    },
    {
      "epoch": 4.44,
      "grad_norm": 5.723465442657471,
      "learning_rate": 6.163944924751842e-06,
      "loss": 0.2006,
      "step": 3085
    },
    {
      "epoch": 4.45,
      "grad_norm": 3.738910675048828,
      "learning_rate": 6.083893691962857e-06,
      "loss": 0.152,
      "step": 3090
    },
    {
      "epoch": 4.45,
      "grad_norm": 4.6227641105651855,
      "learning_rate": 6.003842459173871e-06,
      "loss": 0.1885,
      "step": 3095
    },
    {
      "epoch": 4.46,
      "grad_norm": 4.877871036529541,
      "learning_rate": 5.923791226384887e-06,
      "loss": 0.1635,
      "step": 3100
    },
    {
      "epoch": 4.47,
      "grad_norm": 3.391716480255127,
      "learning_rate": 5.843739993595901e-06,
      "loss": 0.1917,
      "step": 3105
    },
    {
      "epoch": 4.48,
      "grad_norm": 3.0858306884765625,
      "learning_rate": 5.763688760806917e-06,
      "loss": 0.1981,
      "step": 3110
    },
    {
      "epoch": 4.48,
      "grad_norm": 3.075488805770874,
      "learning_rate": 5.6836375280179315e-06,
      "loss": 0.175,
      "step": 3115
    },
    {
      "epoch": 4.49,
      "grad_norm": 4.415194988250732,
      "learning_rate": 5.603586295228947e-06,
      "loss": 0.2039,
      "step": 3120
    },
    {
      "epoch": 4.5,
      "grad_norm": 4.507144451141357,
      "learning_rate": 5.523535062439962e-06,
      "loss": 0.1816,
      "step": 3125
    },
    {
      "epoch": 4.51,
      "grad_norm": 4.327670097351074,
      "learning_rate": 5.443483829650977e-06,
      "loss": 0.2072,
      "step": 3130
    },
    {
      "epoch": 4.51,
      "grad_norm": 3.314438819885254,
      "learning_rate": 5.363432596861992e-06,
      "loss": 0.1997,
      "step": 3135
    },
    {
      "epoch": 4.52,
      "grad_norm": 3.981945753097534,
      "learning_rate": 5.283381364073007e-06,
      "loss": 0.1643,
      "step": 3140
    },
    {
      "epoch": 4.53,
      "grad_norm": 3.4533607959747314,
      "learning_rate": 5.203330131284022e-06,
      "loss": 0.1503,
      "step": 3145
    },
    {
      "epoch": 4.53,
      "grad_norm": 3.6115882396698,
      "learning_rate": 5.123278898495037e-06,
      "loss": 0.1712,
      "step": 3150
    },
    {
      "epoch": 4.54,
      "grad_norm": 2.636838912963867,
      "learning_rate": 5.043227665706052e-06,
      "loss": 0.1828,
      "step": 3155
    },
    {
      "epoch": 4.55,
      "grad_norm": 3.045761823654175,
      "learning_rate": 4.9631764329170676e-06,
      "loss": 0.167,
      "step": 3160
    },
    {
      "epoch": 4.56,
      "grad_norm": 5.738334655761719,
      "learning_rate": 4.883125200128082e-06,
      "loss": 0.2237,
      "step": 3165
    },
    {
      "epoch": 4.56,
      "grad_norm": 2.163240909576416,
      "learning_rate": 4.803073967339098e-06,
      "loss": 0.1411,
      "step": 3170
    },
    {
      "epoch": 4.57,
      "grad_norm": 5.213181495666504,
      "learning_rate": 4.723022734550112e-06,
      "loss": 0.1874,
      "step": 3175
    },
    {
      "epoch": 4.58,
      "grad_norm": 3.869131565093994,
      "learning_rate": 4.642971501761127e-06,
      "loss": 0.1756,
      "step": 3180
    },
    {
      "epoch": 4.58,
      "grad_norm": 3.244732618331909,
      "learning_rate": 4.5629202689721425e-06,
      "loss": 0.1829,
      "step": 3185
    },
    {
      "epoch": 4.59,
      "grad_norm": 3.5364272594451904,
      "learning_rate": 4.482869036183157e-06,
      "loss": 0.1861,
      "step": 3190
    },
    {
      "epoch": 4.6,
      "grad_norm": 2.5283873081207275,
      "learning_rate": 4.402817803394173e-06,
      "loss": 0.1931,
      "step": 3195
    },
    {
      "epoch": 4.61,
      "grad_norm": 3.36181902885437,
      "learning_rate": 4.322766570605188e-06,
      "loss": 0.2183,
      "step": 3200
    },
    {
      "epoch": 4.61,
      "grad_norm": 5.513607025146484,
      "learning_rate": 4.242715337816203e-06,
      "loss": 0.1717,
      "step": 3205
    },
    {
      "epoch": 4.62,
      "grad_norm": 5.976490497589111,
      "learning_rate": 4.162664105027217e-06,
      "loss": 0.202,
      "step": 3210
    },
    {
      "epoch": 4.63,
      "grad_norm": 3.3449387550354004,
      "learning_rate": 4.082612872238233e-06,
      "loss": 0.2165,
      "step": 3215
    },
    {
      "epoch": 4.63,
      "grad_norm": 3.3972129821777344,
      "learning_rate": 4.0025616394492475e-06,
      "loss": 0.1994,
      "step": 3220
    },
    {
      "epoch": 4.64,
      "grad_norm": 4.022273540496826,
      "learning_rate": 3.922510406660262e-06,
      "loss": 0.168,
      "step": 3225
    },
    {
      "epoch": 4.65,
      "grad_norm": 3.2063329219818115,
      "learning_rate": 3.8424591738712785e-06,
      "loss": 0.1862,
      "step": 3230
    },
    {
      "epoch": 4.66,
      "grad_norm": 3.1869962215423584,
      "learning_rate": 3.762407941082293e-06,
      "loss": 0.1583,
      "step": 3235
    },
    {
      "epoch": 4.66,
      "grad_norm": 3.648125171661377,
      "learning_rate": 3.682356708293308e-06,
      "loss": 0.2026,
      "step": 3240
    },
    {
      "epoch": 4.67,
      "grad_norm": 4.182619571685791,
      "learning_rate": 3.602305475504323e-06,
      "loss": 0.1711,
      "step": 3245
    },
    {
      "epoch": 4.68,
      "grad_norm": 3.2886900901794434,
      "learning_rate": 3.522254242715338e-06,
      "loss": 0.1778,
      "step": 3250
    },
    {
      "epoch": 4.69,
      "grad_norm": 3.8204097747802734,
      "learning_rate": 3.442203009926353e-06,
      "loss": 0.1906,
      "step": 3255
    },
    {
      "epoch": 4.69,
      "grad_norm": 4.073367595672607,
      "learning_rate": 3.362151777137368e-06,
      "loss": 0.1693,
      "step": 3260
    },
    {
      "epoch": 4.7,
      "grad_norm": 4.779504299163818,
      "learning_rate": 3.2821005443483827e-06,
      "loss": 0.2031,
      "step": 3265
    },
    {
      "epoch": 4.71,
      "grad_norm": 4.730034828186035,
      "learning_rate": 3.2020493115593986e-06,
      "loss": 0.1731,
      "step": 3270
    },
    {
      "epoch": 4.71,
      "grad_norm": 4.198641300201416,
      "learning_rate": 3.1219980787704133e-06,
      "loss": 0.1982,
      "step": 3275
    },
    {
      "epoch": 4.72,
      "grad_norm": 3.796201229095459,
      "learning_rate": 3.0419468459814283e-06,
      "loss": 0.2502,
      "step": 3280
    },
    {
      "epoch": 4.73,
      "grad_norm": 3.4022860527038574,
      "learning_rate": 2.9618956131924434e-06,
      "loss": 0.1746,
      "step": 3285
    },
    {
      "epoch": 4.74,
      "grad_norm": 3.493821859359741,
      "learning_rate": 2.8818443804034585e-06,
      "loss": 0.1862,
      "step": 3290
    },
    {
      "epoch": 4.74,
      "grad_norm": 4.883081436157227,
      "learning_rate": 2.8017931476144735e-06,
      "loss": 0.1832,
      "step": 3295
    },
    {
      "epoch": 4.75,
      "grad_norm": 4.014003753662109,
      "learning_rate": 2.7217419148254886e-06,
      "loss": 0.2232,
      "step": 3300
    },
    {
      "epoch": 4.76,
      "grad_norm": 3.3797993659973145,
      "learning_rate": 2.6416906820365037e-06,
      "loss": 0.1791,
      "step": 3305
    },
    {
      "epoch": 4.76,
      "grad_norm": 2.9076929092407227,
      "learning_rate": 2.5616394492475183e-06,
      "loss": 0.1557,
      "step": 3310
    },
    {
      "epoch": 4.77,
      "grad_norm": 5.119110584259033,
      "learning_rate": 2.4815882164585338e-06,
      "loss": 0.1989,
      "step": 3315
    },
    {
      "epoch": 4.78,
      "grad_norm": 3.889577627182007,
      "learning_rate": 2.401536983669549e-06,
      "loss": 0.1771,
      "step": 3320
    },
    {
      "epoch": 4.79,
      "grad_norm": 2.979879379272461,
      "learning_rate": 2.3214857508805635e-06,
      "loss": 0.2187,
      "step": 3325
    },
    {
      "epoch": 4.79,
      "grad_norm": 4.31455135345459,
      "learning_rate": 2.2414345180915786e-06,
      "loss": 0.1818,
      "step": 3330
    },
    {
      "epoch": 4.8,
      "grad_norm": 5.267322540283203,
      "learning_rate": 2.161383285302594e-06,
      "loss": 0.1564,
      "step": 3335
    },
    {
      "epoch": 4.81,
      "grad_norm": 4.620851516723633,
      "learning_rate": 2.0813320525136087e-06,
      "loss": 0.2058,
      "step": 3340
    },
    {
      "epoch": 4.81,
      "grad_norm": 3.6133904457092285,
      "learning_rate": 2.0012808197246238e-06,
      "loss": 0.1678,
      "step": 3345
    },
    {
      "epoch": 4.82,
      "grad_norm": 2.955531358718872,
      "learning_rate": 1.9212295869356392e-06,
      "loss": 0.1771,
      "step": 3350
    },
    {
      "epoch": 4.83,
      "grad_norm": 5.3159403800964355,
      "learning_rate": 1.841178354146654e-06,
      "loss": 0.2387,
      "step": 3355
    },
    {
      "epoch": 4.84,
      "grad_norm": 3.5263235569000244,
      "learning_rate": 1.761127121357669e-06,
      "loss": 0.2061,
      "step": 3360
    },
    {
      "epoch": 4.84,
      "grad_norm": 3.794788122177124,
      "learning_rate": 1.681075888568684e-06,
      "loss": 0.1975,
      "step": 3365
    },
    {
      "epoch": 4.85,
      "grad_norm": 3.7242631912231445,
      "learning_rate": 1.6010246557796993e-06,
      "loss": 0.202,
      "step": 3370
    },
    {
      "epoch": 4.86,
      "grad_norm": 3.291221857070923,
      "learning_rate": 1.5209734229907142e-06,
      "loss": 0.1749,
      "step": 3375
    },
    {
      "epoch": 4.87,
      "grad_norm": 7.191506385803223,
      "learning_rate": 1.4409221902017292e-06,
      "loss": 0.1787,
      "step": 3380
    },
    {
      "epoch": 4.87,
      "grad_norm": 3.5962772369384766,
      "learning_rate": 1.3608709574127443e-06,
      "loss": 0.1894,
      "step": 3385
    },
    {
      "epoch": 4.88,
      "grad_norm": 3.013857126235962,
      "learning_rate": 1.2808197246237591e-06,
      "loss": 0.1439,
      "step": 3390
    },
    {
      "epoch": 4.89,
      "grad_norm": 3.8775179386138916,
      "learning_rate": 1.2007684918347744e-06,
      "loss": 0.1709,
      "step": 3395
    },
    {
      "epoch": 4.89,
      "grad_norm": 5.876482963562012,
      "learning_rate": 1.1207172590457893e-06,
      "loss": 0.1823,
      "step": 3400
    },
    {
      "epoch": 4.9,
      "grad_norm": 3.76519513130188,
      "learning_rate": 1.0406660262568043e-06,
      "loss": 0.1932,
      "step": 3405
    },
    {
      "epoch": 4.91,
      "grad_norm": 3.4437146186828613,
      "learning_rate": 9.606147934678196e-07,
      "loss": 0.2059,
      "step": 3410
    },
    {
      "epoch": 4.92,
      "grad_norm": 4.459022045135498,
      "learning_rate": 8.805635606788345e-07,
      "loss": 0.2016,
      "step": 3415
    },
    {
      "epoch": 4.92,
      "grad_norm": 3.656373977661133,
      "learning_rate": 8.005123278898497e-07,
      "loss": 0.1869,
      "step": 3420
    },
    {
      "epoch": 4.93,
      "grad_norm": 2.2337965965270996,
      "learning_rate": 7.204610951008646e-07,
      "loss": 0.1501,
      "step": 3425
    },
    {
      "epoch": 4.94,
      "grad_norm": 5.598134994506836,
      "learning_rate": 6.404098623118796e-07,
      "loss": 0.1659,
      "step": 3430
    },
    {
      "epoch": 4.94,
      "grad_norm": 4.543219089508057,
      "learning_rate": 5.603586295228946e-07,
      "loss": 0.2164,
      "step": 3435
    },
    {
      "epoch": 4.95,
      "grad_norm": 4.817913055419922,
      "learning_rate": 4.803073967339098e-07,
      "loss": 0.1332,
      "step": 3440
    },
    {
      "epoch": 4.96,
      "grad_norm": 6.280834674835205,
      "learning_rate": 4.002561639449248e-07,
      "loss": 0.2054,
      "step": 3445
    },
    {
      "epoch": 4.97,
      "grad_norm": 3.0518364906311035,
      "learning_rate": 3.202049311559398e-07,
      "loss": 0.1904,
      "step": 3450
    },
    {
      "epoch": 4.97,
      "grad_norm": 3.695298910140991,
      "learning_rate": 2.401536983669549e-07,
      "loss": 0.1784,
      "step": 3455
    },
    {
      "epoch": 4.98,
      "grad_norm": 6.226070880889893,
      "learning_rate": 1.601024655779699e-07,
      "loss": 0.1871,
      "step": 3460
    },
    {
      "epoch": 4.99,
      "grad_norm": 4.446568489074707,
      "learning_rate": 8.005123278898495e-08,
      "loss": 0.2494,
      "step": 3465
    },
    {
      "epoch": 4.99,
      "grad_norm": 5.050913333892822,
      "learning_rate": 0.0,
      "loss": 0.2138,
      "step": 3470
    },
    {
      "epoch": 4.99,
      "eval_accuracy": 0.9518218623481781,
      "eval_loss": 0.1259032040834427,
      "eval_runtime": 31.3409,
      "eval_samples_per_second": 315.243,
      "eval_steps_per_second": 9.859,
      "step": 3470
    },
    {
      "epoch": 4.99,
      "step": 3470,
      "total_flos": 1.1039888050539651e+19,
      "train_loss": 0.287632371283402,
      "train_runtime": 2790.3597,
      "train_samples_per_second": 159.318,
      "train_steps_per_second": 1.244
    }
  ],
  "logging_steps": 5,
  "max_steps": 3470,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 1.1039888050539651e+19,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}