|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9143553794574825, |
|
"eval_steps": 500, |
|
"global_step": 9000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001015950421619425, |
|
"grad_norm": 17.625, |
|
"learning_rate": 5e-06, |
|
"loss": 3.4264, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00203190084323885, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1e-05, |
|
"loss": 3.432, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.003047851264858275, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.5e-05, |
|
"loss": 3.23, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0040638016864777, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 2e-05, |
|
"loss": 2.9762, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005079752108097125, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.6173, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00609570252971655, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 3e-05, |
|
"loss": 2.2004, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.007111652951335975, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.4176, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0081276033729554, |
|
"grad_norm": 4.375, |
|
"learning_rate": 4e-05, |
|
"loss": 1.0122, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.009143553794574825, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.9116, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01015950421619425, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6832, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011175454637813675, |
|
"grad_norm": 5.5, |
|
"learning_rate": 4.9999870035728426e-05, |
|
"loss": 0.7355, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0121914050594331, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 4.9999480144264944e-05, |
|
"loss": 0.6673, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.013207355481052525, |
|
"grad_norm": 4.5, |
|
"learning_rate": 4.9998830329663314e-05, |
|
"loss": 0.6792, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01422330590267195, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 4.9997920598679756e-05, |
|
"loss": 0.6207, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.015239256324291375, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.999675096077286e-05, |
|
"loss": 0.483, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0162552067459108, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.999532142810354e-05, |
|
"loss": 0.5319, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.017271157167530225, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 4.999363201553483e-05, |
|
"loss": 0.6052, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.01828710758914965, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.9991682740631794e-05, |
|
"loss": 0.4258, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.019303058010769075, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 4.998947362366133e-05, |
|
"loss": 0.4309, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0203190084323885, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.998700468759193e-05, |
|
"loss": 0.3957, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.021334958854007924, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.9984275958093475e-05, |
|
"loss": 0.4777, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.02235090927562735, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 4.998128746353695e-05, |
|
"loss": 0.3549, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.023366859697246774, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.997803923499417e-05, |
|
"loss": 0.4447, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0243828101188662, |
|
"grad_norm": 6.375, |
|
"learning_rate": 4.99745313062374e-05, |
|
"loss": 0.3808, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.025398760540485624, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 4.99707637137391e-05, |
|
"loss": 0.3827, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02641471096210505, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.996673649667145e-05, |
|
"loss": 0.3694, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.027430661383724474, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.9962449696906e-05, |
|
"loss": 0.3586, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0284466118053439, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.9957903359013214e-05, |
|
"loss": 0.3832, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.029462562226963324, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.995309753026201e-05, |
|
"loss": 0.328, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03047851264858275, |
|
"grad_norm": 4.5, |
|
"learning_rate": 4.994803226061927e-05, |
|
"loss": 0.3667, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03149446307020217, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 4.994270760274933e-05, |
|
"loss": 0.3811, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0325104134918216, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 4.99371236120134e-05, |
|
"loss": 0.3065, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03352636391344102, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.993128034646902e-05, |
|
"loss": 0.4177, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03454231433506045, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 4.992517786686947e-05, |
|
"loss": 0.33, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03555826475667987, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.9918816236663077e-05, |
|
"loss": 0.3287, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0365742151782993, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 4.991219552199262e-05, |
|
"loss": 0.2934, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03759016559991872, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 4.99053157916946e-05, |
|
"loss": 0.3176, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03860611602153815, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.989817711729856e-05, |
|
"loss": 0.3318, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03962206644315757, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.98907795730263e-05, |
|
"loss": 0.3234, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.040638016864777, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 4.988312323579114e-05, |
|
"loss": 0.267, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04165396728639642, |
|
"grad_norm": 3.75, |
|
"learning_rate": 4.98752081851971e-05, |
|
"loss": 0.3081, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.04266991770801585, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.986703450353809e-05, |
|
"loss": 0.2917, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04368586812963527, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.985860227579703e-05, |
|
"loss": 0.2805, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0447018185512547, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 4.984991158964499e-05, |
|
"loss": 0.3534, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04571776897287412, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.9840962535440265e-05, |
|
"loss": 0.335, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04673371939449355, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.983175520622744e-05, |
|
"loss": 0.2544, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04774966981611297, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.982228969773642e-05, |
|
"loss": 0.3449, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0487656202377324, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.9812566108381435e-05, |
|
"loss": 0.2964, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04978157065935182, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.9802584539260035e-05, |
|
"loss": 0.2799, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.05079752108097125, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 4.979234509415199e-05, |
|
"loss": 0.3231, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05181347150259067, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 4.978184787951828e-05, |
|
"loss": 0.2943, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0528294219242101, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.977109300449992e-05, |
|
"loss": 0.2705, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.05384537234582952, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 4.9760080580916876e-05, |
|
"loss": 0.2998, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05486132276744895, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 4.974881072326688e-05, |
|
"loss": 0.2595, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05587727318906837, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.9737283548724236e-05, |
|
"loss": 0.2803, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.0568932236106878, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.97254991771386e-05, |
|
"loss": 0.3511, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05790917403230722, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.971345773103377e-05, |
|
"loss": 0.312, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05892512445392665, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 4.9701159335606365e-05, |
|
"loss": 0.2482, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05994107487554607, |
|
"grad_norm": 5.5, |
|
"learning_rate": 4.968860411872454e-05, |
|
"loss": 0.2537, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.0609570252971655, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 4.967579221092666e-05, |
|
"loss": 0.3125, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06197297571878492, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 4.966272374541996e-05, |
|
"loss": 0.2354, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.06298892614040434, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 4.964939885807912e-05, |
|
"loss": 0.3213, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.06400487656202378, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.9635817687444876e-05, |
|
"loss": 0.3003, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0650208269836432, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 4.962198037472259e-05, |
|
"loss": 0.2996, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.06603677740526262, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 4.9607887063780776e-05, |
|
"loss": 0.2257, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.06705272782688204, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.9593537901149564e-05, |
|
"loss": 0.223, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06806867824850148, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 4.957893303601924e-05, |
|
"loss": 0.3407, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.0690846286701209, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.956407262023866e-05, |
|
"loss": 0.2589, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.07010057909174032, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.954895680831367e-05, |
|
"loss": 0.2949, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.07111652951335974, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.9533585757405506e-05, |
|
"loss": 0.2995, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07213247993497918, |
|
"grad_norm": 4.625, |
|
"learning_rate": 4.951795962732917e-05, |
|
"loss": 0.2894, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.0731484303565986, |
|
"grad_norm": 3.0, |
|
"learning_rate": 4.9502078580551755e-05, |
|
"loss": 0.3082, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.07416438077821802, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 4.9485942782190734e-05, |
|
"loss": 0.2308, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.07518033119983744, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 4.9469552400012306e-05, |
|
"loss": 0.2272, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.07619628162145688, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.94529076044296e-05, |
|
"loss": 0.2701, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0772122320430763, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 4.94360085685009e-05, |
|
"loss": 0.2686, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07822818246469572, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.9418855467927894e-05, |
|
"loss": 0.2051, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.07924413288631514, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.940144848105379e-05, |
|
"loss": 0.2267, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.08026008330793458, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.93837877888615e-05, |
|
"loss": 0.2597, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.081276033729554, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 4.9365873574971745e-05, |
|
"loss": 0.3701, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08229198415117342, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.9347706025641136e-05, |
|
"loss": 0.2559, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.08330793457279284, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 4.9329285329760275e-05, |
|
"loss": 0.2799, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.08432388499441228, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 4.9310611678851735e-05, |
|
"loss": 0.2866, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.0853398354160317, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.929168526706811e-05, |
|
"loss": 0.3105, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"grad_norm": 13.625, |
|
"learning_rate": 4.927250629119e-05, |
|
"loss": 0.2454, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.08737173625927054, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 4.9253074950623925e-05, |
|
"loss": 0.2424, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.08838768668088998, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 4.9233391447400286e-05, |
|
"loss": 0.2481, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.0894036371025094, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.921345598617125e-05, |
|
"loss": 0.2231, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.09041958752412882, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.9193268774208654e-05, |
|
"loss": 0.3447, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.09143553794574824, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.9172830021401785e-05, |
|
"loss": 0.229, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.09245148836736768, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 4.9152139940255245e-05, |
|
"loss": 0.2122, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.0934674387889871, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 4.913119874588677e-05, |
|
"loss": 0.2386, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.09448338921060652, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 4.911000665602489e-05, |
|
"loss": 0.1944, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.09549933963222594, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.9088563891006786e-05, |
|
"loss": 0.2038, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.09651529005384538, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 4.906687067377592e-05, |
|
"loss": 0.3122, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0975312404754648, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.904492722987976e-05, |
|
"loss": 0.3157, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.09854719089708422, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.902273378746738e-05, |
|
"loss": 0.3077, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.09956314131870364, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.9000290577287165e-05, |
|
"loss": 0.2756, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.10057909174032308, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.897759783268434e-05, |
|
"loss": 0.2915, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.1015950421619425, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 4.895465578959859e-05, |
|
"loss": 0.2052, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10261099258356192, |
|
"grad_norm": 4.0, |
|
"learning_rate": 4.893146468656159e-05, |
|
"loss": 0.2499, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.890802476469452e-05, |
|
"loss": 0.278, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.10464289342680078, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.888433626770558e-05, |
|
"loss": 0.2143, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.1056588438484202, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.886039944188741e-05, |
|
"loss": 0.2878, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.10667479427003962, |
|
"grad_norm": 4.5, |
|
"learning_rate": 4.883621453611461e-05, |
|
"loss": 0.2744, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.10769074469165904, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.881178180184106e-05, |
|
"loss": 0.2734, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.10870669511327848, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.878710149309735e-05, |
|
"loss": 0.3574, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.1097226455348979, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 4.876217386648816e-05, |
|
"loss": 0.2625, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.11073859595651732, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.873699918118955e-05, |
|
"loss": 0.2437, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.11175454637813674, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.87115776989463e-05, |
|
"loss": 0.2051, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.11277049679975618, |
|
"grad_norm": 4.375, |
|
"learning_rate": 4.8685909684069153e-05, |
|
"loss": 0.1727, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.1137864472213756, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.865999540343211e-05, |
|
"loss": 0.2256, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.11480239764299502, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.86338351264696e-05, |
|
"loss": 0.3529, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.11581834806461444, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.8607429125173754e-05, |
|
"loss": 0.2113, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.11683429848623388, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.858077767409149e-05, |
|
"loss": 0.2759, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.1178502489078533, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 4.855388105032174e-05, |
|
"loss": 0.2482, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.11886619932947272, |
|
"grad_norm": 3.5, |
|
"learning_rate": 4.852673953351249e-05, |
|
"loss": 0.1865, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.11988214975109214, |
|
"grad_norm": 3.75, |
|
"learning_rate": 4.849935340585796e-05, |
|
"loss": 0.2659, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.12089810017271158, |
|
"grad_norm": 3.375, |
|
"learning_rate": 4.8471722952095586e-05, |
|
"loss": 0.1506, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.121914050594331, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 4.844384845950312e-05, |
|
"loss": 0.307, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.12293000101595042, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.841573021789561e-05, |
|
"loss": 0.1952, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.12394595143756984, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.838736851962239e-05, |
|
"loss": 0.1779, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.12496190185918928, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.835876365956408e-05, |
|
"loss": 0.1235, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.12597785228080868, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.8329915935129436e-05, |
|
"loss": 0.1876, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.12699380270242813, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.830082564625235e-05, |
|
"loss": 0.2188, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.12800975312404755, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 4.8271493095388684e-05, |
|
"loss": 0.2622, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.12902570354566698, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 4.824191858751312e-05, |
|
"loss": 0.2724, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.1300416539672864, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 4.821210243011601e-05, |
|
"loss": 0.2413, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.13105760438890582, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 4.818204493320016e-05, |
|
"loss": 0.2618, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.13207355481052524, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.8151746409277634e-05, |
|
"loss": 0.2295, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.13308950523214466, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 4.8121207173366484e-05, |
|
"loss": 0.2733, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.13410545565376408, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.809042754298746e-05, |
|
"loss": 0.2311, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.13512140607538353, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.805940783816075e-05, |
|
"loss": 0.2059, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.13613735649700295, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.8028148381402625e-05, |
|
"loss": 0.2102, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.13715330691862238, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.7996649497722084e-05, |
|
"loss": 0.2708, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.1381692573402418, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.7964911514617485e-05, |
|
"loss": 0.2429, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.13918520776186122, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 4.793293476207312e-05, |
|
"loss": 0.2725, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.14020115818348064, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.790071957255585e-05, |
|
"loss": 0.2098, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.14121710860510006, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.786826628101154e-05, |
|
"loss": 0.2101, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.14223305902671948, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.783557522486167e-05, |
|
"loss": 0.2624, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.14324900944833893, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.780264674399978e-05, |
|
"loss": 0.2518, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.14426495986995835, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 4.7769481180787966e-05, |
|
"loss": 0.3112, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.14528091029157778, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 4.773607888005327e-05, |
|
"loss": 0.2747, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1462968607131972, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.770244018908416e-05, |
|
"loss": 0.1572, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.14731281113481662, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.766856545762687e-05, |
|
"loss": 0.2148, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.14832876155643604, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.763445503788178e-05, |
|
"loss": 0.2531, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.14934471197805546, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.760010928449976e-05, |
|
"loss": 0.199, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.15036066239967488, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.7565528554578485e-05, |
|
"loss": 0.2366, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.15137661282129433, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.75307132076587e-05, |
|
"loss": 0.1862, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.15239256324291375, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.749566360572049e-05, |
|
"loss": 0.2143, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.15340851366453317, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.746038011317955e-05, |
|
"loss": 0.1877, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.1544244640861526, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.742486309688333e-05, |
|
"loss": 0.2831, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.15544041450777202, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.738911292610732e-05, |
|
"loss": 0.1708, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.15645636492939144, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 4.735312997255107e-05, |
|
"loss": 0.192, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.15747231535101086, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.7316914610334475e-05, |
|
"loss": 0.2586, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.15848826577263028, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 4.728046721599378e-05, |
|
"loss": 0.2141, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.15950421619424973, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 4.724378816847771e-05, |
|
"loss": 0.193, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.16052016661586915, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.720687784914352e-05, |
|
"loss": 0.191, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.16153611703748857, |
|
"grad_norm": 3.75, |
|
"learning_rate": 4.716973664175304e-05, |
|
"loss": 0.2172, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.162552067459108, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.7132364932468645e-05, |
|
"loss": 0.2134, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.16356801788072742, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.709476310984932e-05, |
|
"loss": 0.2055, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.16458396830234684, |
|
"grad_norm": 3.875, |
|
"learning_rate": 4.705693156484652e-05, |
|
"loss": 0.2136, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.16559991872396626, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.7018870690800196e-05, |
|
"loss": 0.1471, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.16661586914558568, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.698058088343465e-05, |
|
"loss": 0.2308, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.16763181956720513, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.6942062540854425e-05, |
|
"loss": 0.2456, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.16864776998882455, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.69033160635402e-05, |
|
"loss": 0.2654, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.16966372041044397, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 4.6864341854344587e-05, |
|
"loss": 0.2226, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.1706796708320634, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.682514031848795e-05, |
|
"loss": 0.2438, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.17169562125368282, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 4.678571186355423e-05, |
|
"loss": 0.1889, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.6746056899486644e-05, |
|
"loss": 0.2117, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.17372752209692166, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.67061758385835e-05, |
|
"loss": 0.1953, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.17474347251854108, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 4.6666069095493816e-05, |
|
"loss": 0.1844, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.17575942294016053, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.662573708721309e-05, |
|
"loss": 0.2774, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.17677537336177995, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 4.658518023307894e-05, |
|
"loss": 0.2527, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.17779132378339937, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 4.654439895476671e-05, |
|
"loss": 0.2164, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.1788072742050188, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.6503393676285146e-05, |
|
"loss": 0.2424, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.17982322462663822, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.646216482397192e-05, |
|
"loss": 0.2428, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.18083917504825764, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.6420712826489275e-05, |
|
"loss": 0.2155, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.18185512546987706, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.6379038114819485e-05, |
|
"loss": 0.1544, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.18287107589149648, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 4.6337141122260444e-05, |
|
"loss": 0.2029, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.18388702631311593, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.629502228442112e-05, |
|
"loss": 0.1489, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.18490297673473535, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 4.6252682039217045e-05, |
|
"loss": 0.2101, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.18591892715635477, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.621012082686573e-05, |
|
"loss": 0.2076, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.1869348775779742, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 4.616733908988216e-05, |
|
"loss": 0.2719, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.18795082799959362, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.612433727307409e-05, |
|
"loss": 0.2105, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.18896677842121304, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 4.608111582353751e-05, |
|
"loss": 0.1877, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.18998272884283246, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.603767519065197e-05, |
|
"loss": 0.2238, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.19099867926445188, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.599401582607589e-05, |
|
"loss": 0.243, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.19201462968607133, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.595013818374185e-05, |
|
"loss": 0.1867, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.19303058010769075, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.5906042719851925e-05, |
|
"loss": 0.1994, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.19404653052931017, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 4.586172989287291e-05, |
|
"loss": 0.1899, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.1950624809509296, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.5817200163531534e-05, |
|
"loss": 0.2528, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.19607843137254902, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.577245399480972e-05, |
|
"loss": 0.2336, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.19709438179416844, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.5727491851939715e-05, |
|
"loss": 0.2204, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.19811033221578786, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.568231420239929e-05, |
|
"loss": 0.1656, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.19912628263740728, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.563692151590687e-05, |
|
"loss": 0.2105, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.20014223305902673, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.5591314264416666e-05, |
|
"loss": 0.1464, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.20115818348064615, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.554549292211371e-05, |
|
"loss": 0.2103, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.20217413390226557, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 4.549945796540901e-05, |
|
"loss": 0.144, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.203190084323885, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.545320987293453e-05, |
|
"loss": 0.1963, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20420603474550442, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.540674912553824e-05, |
|
"loss": 0.2115, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.20522198516712384, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.536007620627911e-05, |
|
"loss": 0.1682, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.20623793558874326, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.531319160042212e-05, |
|
"loss": 0.1992, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.20725388601036268, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.5266095795433126e-05, |
|
"loss": 0.1134, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.20826983643198213, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.5218789280973925e-05, |
|
"loss": 0.1474, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.20928578685360155, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.5171272548897024e-05, |
|
"loss": 0.1955, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.21030173727522097, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.512354609324063e-05, |
|
"loss": 0.2042, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.2113176876968404, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 4.507561041022347e-05, |
|
"loss": 0.2174, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.21233363811845982, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.502746599823963e-05, |
|
"loss": 0.2634, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.21334958854007924, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.497911335785339e-05, |
|
"loss": 0.1884, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.21436553896169866, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.4930552991794e-05, |
|
"loss": 0.1872, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.21538148938331808, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 4.4881785404950474e-05, |
|
"loss": 0.2233, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.21639743980493753, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.483281110436631e-05, |
|
"loss": 0.2374, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.21741339022655695, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.478363059923426e-05, |
|
"loss": 0.2545, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.21842934064817637, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.4734244400891014e-05, |
|
"loss": 0.2063, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.2194452910697958, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 4.4684653022811865e-05, |
|
"loss": 0.1219, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.22046124149141522, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 4.463485698060541e-05, |
|
"loss": 0.2805, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.22147719191303464, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.458485679200814e-05, |
|
"loss": 0.1998, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.22249314233465406, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 4.453465297687912e-05, |
|
"loss": 0.2489, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.22350909275627348, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.448424605719452e-05, |
|
"loss": 0.2731, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.22452504317789293, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 4.443363655704224e-05, |
|
"loss": 0.2425, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.22554099359951235, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.438282500261641e-05, |
|
"loss": 0.2938, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.22655694402113177, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.433181192221197e-05, |
|
"loss": 0.1728, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.2275728944427512, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4.4280597846219155e-05, |
|
"loss": 0.216, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.22858884486437062, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.422918330711796e-05, |
|
"loss": 0.1612, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.22960479528599004, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.417756883947263e-05, |
|
"loss": 0.107, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.23062074570760946, |
|
"grad_norm": 3.375, |
|
"learning_rate": 4.412575497992611e-05, |
|
"loss": 0.1756, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.23163669612922888, |
|
"grad_norm": 4.375, |
|
"learning_rate": 4.407374226719445e-05, |
|
"loss": 0.234, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.23265264655084833, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.402153124206119e-05, |
|
"loss": 0.2144, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.23366859697246775, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.396912244737173e-05, |
|
"loss": 0.1696, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.23468454739408717, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.391651642802778e-05, |
|
"loss": 0.2506, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.2357004978157066, |
|
"grad_norm": 4.5, |
|
"learning_rate": 4.386371373098155e-05, |
|
"loss": 0.1686, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.23671644823732602, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.381071490523018e-05, |
|
"loss": 0.2403, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.23773239865894544, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 4.3757520501809955e-05, |
|
"loss": 0.1611, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.23874834908056486, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.370413107379065e-05, |
|
"loss": 0.1698, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.23976429950218428, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 4.36505471762697e-05, |
|
"loss": 0.1928, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.24078024992380373, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.3596769366366474e-05, |
|
"loss": 0.2035, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.24179620034542315, |
|
"grad_norm": 5.75, |
|
"learning_rate": 4.354279820321649e-05, |
|
"loss": 0.16, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.24281215076704257, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.34886342479656e-05, |
|
"loss": 0.1851, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.243828101188662, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.34342780637641e-05, |
|
"loss": 0.1726, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.24484405161028142, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 4.337973021576095e-05, |
|
"loss": 0.2847, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.24586000203190084, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.3324991271097846e-05, |
|
"loss": 0.2528, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.24687595245352026, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.3270061798903374e-05, |
|
"loss": 0.1573, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.24789190287513968, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.321494237028701e-05, |
|
"loss": 0.1703, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.24890785329675913, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 4.31596335583333e-05, |
|
"loss": 0.2613, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.24992380371837855, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.310413593809579e-05, |
|
"loss": 0.22, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.250939754139998, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.304845008659108e-05, |
|
"loss": 0.1263, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.25195570456161737, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 4.2992576582792895e-05, |
|
"loss": 0.1639, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.2529716549832368, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.293651600762595e-05, |
|
"loss": 0.2681, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.25398760540485626, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 4.288026894395999e-05, |
|
"loss": 0.2292, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.25500355582647566, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 4.2823835976603723e-05, |
|
"loss": 0.2324, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.2560195062480951, |
|
"grad_norm": 5.625, |
|
"learning_rate": 4.276721769229869e-05, |
|
"loss": 0.1834, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.2570354566697145, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.271041467971323e-05, |
|
"loss": 0.1826, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.25805140709133395, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.265342752943632e-05, |
|
"loss": 0.2463, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 4.2596256833971425e-05, |
|
"loss": 0.2598, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.2600833079345728, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.2538903187730374e-05, |
|
"loss": 0.1148, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.26109925835619224, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.248136718702716e-05, |
|
"loss": 0.2123, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.26211520877781164, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.242364943007172e-05, |
|
"loss": 0.2369, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.2631311591994311, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.236575051696377e-05, |
|
"loss": 0.261, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.2641471096210505, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.2307671049686514e-05, |
|
"loss": 0.1564, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.26516306004266993, |
|
"grad_norm": 3.5, |
|
"learning_rate": 4.2249411632100396e-05, |
|
"loss": 0.1563, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.2661790104642893, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.219097286993684e-05, |
|
"loss": 0.1697, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.26719496088590877, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.2132355370791946e-05, |
|
"loss": 0.1844, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.26821091130752817, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 4.2073559744120156e-05, |
|
"loss": 0.2144, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2692268617291476, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.201458660122793e-05, |
|
"loss": 0.2013, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.27024281215076706, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.1955436555267393e-05, |
|
"loss": 0.2166, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.27125876257238646, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.189611022122997e-05, |
|
"loss": 0.1934, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.2722747129940059, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.1836608215939944e-05, |
|
"loss": 0.2157, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.2732906634156253, |
|
"grad_norm": 3.5, |
|
"learning_rate": 4.17769311580481e-05, |
|
"loss": 0.18, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.27430661383724475, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.171707966802528e-05, |
|
"loss": 0.2178, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.27532256425886414, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 4.16570543681559e-05, |
|
"loss": 0.1896, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.2763385146804836, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.159685588253151e-05, |
|
"loss": 0.1322, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.27735446510210304, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 4.153648483704429e-05, |
|
"loss": 0.184, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.27837041552372244, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 4.147594185938057e-05, |
|
"loss": 0.2451, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.2793863659453419, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.141522757901426e-05, |
|
"loss": 0.2367, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2804023163669613, |
|
"grad_norm": 3.375, |
|
"learning_rate": 4.1354342627200345e-05, |
|
"loss": 0.179, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.28141826678858073, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.1293287636968286e-05, |
|
"loss": 0.1396, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.2824342172102001, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.1232063243115485e-05, |
|
"loss": 0.1963, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.28345016763181957, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 4.117067008220063e-05, |
|
"loss": 0.2457, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.28446611805343897, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.110910879253712e-05, |
|
"loss": 0.2262, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2854820684750584, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.104738001418641e-05, |
|
"loss": 0.2499, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.28649801889667786, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.098548438895135e-05, |
|
"loss": 0.1667, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.28751396931829726, |
|
"grad_norm": 2.875, |
|
"learning_rate": 4.092342256036954e-05, |
|
"loss": 0.2288, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.2885299197399167, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.086119517370659e-05, |
|
"loss": 0.2038, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2895458701615361, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 4.0798802875949485e-05, |
|
"loss": 0.181, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.29056182058315555, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.073624631579975e-05, |
|
"loss": 0.1886, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.29157777100477494, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 4.067352614366685e-05, |
|
"loss": 0.2053, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.2925937214263944, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.061064301166128e-05, |
|
"loss": 0.1409, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.29360967184801384, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.054759757358787e-05, |
|
"loss": 0.184, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.29462562226963324, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.048439048493898e-05, |
|
"loss": 0.2306, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2956415726912527, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.0421022402887676e-05, |
|
"loss": 0.1914, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.2966575231128721, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.035749398628088e-05, |
|
"loss": 0.1653, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.29767347353449153, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.029380589563256e-05, |
|
"loss": 0.1941, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.2986894239561109, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.02299587931168e-05, |
|
"loss": 0.1117, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.29970537437773037, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.0165953342560974e-05, |
|
"loss": 0.1605, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.30072132479934977, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 4.010179020943884e-05, |
|
"loss": 0.1726, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.3017372752209692, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 4.003747006086357e-05, |
|
"loss": 0.2208, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.30275322564258866, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.9972993565580866e-05, |
|
"loss": 0.1325, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.30376917606420806, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 3.9908361393962e-05, |
|
"loss": 0.2014, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.3047851264858275, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.984357421799681e-05, |
|
"loss": 0.165, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3058010769074469, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.9778632711286756e-05, |
|
"loss": 0.212, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.30681702732906635, |
|
"grad_norm": 4.25, |
|
"learning_rate": 3.971353754903788e-05, |
|
"loss": 0.2388, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.30783297775068574, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.964828940805381e-05, |
|
"loss": 0.2175, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.3088489281723052, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.95828889667287e-05, |
|
"loss": 0.2088, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.30986487859392464, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.9517336905040244e-05, |
|
"loss": 0.1913, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.31088082901554404, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.9451633904542483e-05, |
|
"loss": 0.2185, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.3118967794371635, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.9385780648358846e-05, |
|
"loss": 0.2072, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.3129127298587829, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 3.9319777821174955e-05, |
|
"loss": 0.1902, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.31392868028040233, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.925362610923158e-05, |
|
"loss": 0.259, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.3149446307020217, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 3.918732620031742e-05, |
|
"loss": 0.2026, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.31596058112364117, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.912087878376205e-05, |
|
"loss": 0.1478, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.31697653154526056, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.905428455042865e-05, |
|
"loss": 0.167, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.31799248196688, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.898754419270693e-05, |
|
"loss": 0.1629, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.31900843238849946, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.892065840450583e-05, |
|
"loss": 0.1308, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.32002438281011886, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.885362788124637e-05, |
|
"loss": 0.2008, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.3210403332317383, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 3.8786453319854396e-05, |
|
"loss": 0.2225, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.3220562836533577, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 3.8719135418753366e-05, |
|
"loss": 0.2243, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.32307223407497715, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 3.865167487785702e-05, |
|
"loss": 0.1981, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.32408818449659654, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.8584072398562164e-05, |
|
"loss": 0.2031, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.325104134918216, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 3.851632868374136e-05, |
|
"loss": 0.1621, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.32612008533983544, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 3.844844443773562e-05, |
|
"loss": 0.1674, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.32713603576145484, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.8380420366347046e-05, |
|
"loss": 0.1502, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.3281519861830743, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 3.831225717683157e-05, |
|
"loss": 0.1868, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.3291679366046937, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 3.8243955577891534e-05, |
|
"loss": 0.1818, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.3301838870263131, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 3.8175516279668335e-05, |
|
"loss": 0.2215, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.3311998374479325, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 3.810693999373505e-05, |
|
"loss": 0.2544, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.33221578786955197, |
|
"grad_norm": 4.0, |
|
"learning_rate": 3.8038227433089056e-05, |
|
"loss": 0.1175, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.33323173829117136, |
|
"grad_norm": 3.625, |
|
"learning_rate": 3.796937931214458e-05, |
|
"loss": 0.2213, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.3342476887127908, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.7900396346725296e-05, |
|
"loss": 0.1711, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.33526363913441026, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 3.783127925405686e-05, |
|
"loss": 0.2628, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.33627958955602966, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.77620287527595e-05, |
|
"loss": 0.1671, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.3372955399776491, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 3.769264556284048e-05, |
|
"loss": 0.2109, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.3383114903992685, |
|
"grad_norm": 2.875, |
|
"learning_rate": 3.762313040568665e-05, |
|
"loss": 0.1978, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.33932744082088795, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.755348400405697e-05, |
|
"loss": 0.1275, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.34034339124250734, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 3.7483707082074945e-05, |
|
"loss": 0.1482, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.3413593416641268, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 3.741380036522111e-05, |
|
"loss": 0.1933, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.34237529208574624, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.734376458032551e-05, |
|
"loss": 0.1925, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.34339124250736563, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 3.727360045556014e-05, |
|
"loss": 0.2297, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.3444071929289851, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.7203308720431336e-05, |
|
"loss": 0.1704, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.7132890105772234e-05, |
|
"loss": 0.258, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3464390937722239, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 3.706234534373515e-05, |
|
"loss": 0.2376, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.3474550441938433, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.6991675167783985e-05, |
|
"loss": 0.2403, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.34847099461546277, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.6920880312686556e-05, |
|
"loss": 0.1642, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.34948694503708216, |
|
"grad_norm": 2.875, |
|
"learning_rate": 3.684996151450702e-05, |
|
"loss": 0.1455, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3505028954587016, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 3.6778919510598155e-05, |
|
"loss": 0.2175, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.35151884588032106, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.670775503959376e-05, |
|
"loss": 0.1858, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.35253479630194046, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 3.6636468841400917e-05, |
|
"loss": 0.1911, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.3535507467235599, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 3.656506165719233e-05, |
|
"loss": 0.2114, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.3545666971451793, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.649353422939863e-05, |
|
"loss": 0.1841, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.35558264756679875, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.6421887301700615e-05, |
|
"loss": 0.1505, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.35659859798841814, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.6350121619021524e-05, |
|
"loss": 0.2625, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.3576145484100376, |
|
"grad_norm": 5.25, |
|
"learning_rate": 3.627823792751936e-05, |
|
"loss": 0.1676, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.35863049883165704, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.620623697457905e-05, |
|
"loss": 0.1963, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.35964644925327643, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 3.613411950880468e-05, |
|
"loss": 0.2048, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.3606623996748959, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 3.606188628001178e-05, |
|
"loss": 0.226, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.3616783500965153, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.598953803921947e-05, |
|
"loss": 0.1884, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.3626943005181347, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 3.591707553864266e-05, |
|
"loss": 0.224, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.3637102509397541, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.584449953168423e-05, |
|
"loss": 0.1866, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.36472620136137357, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.577181077292722e-05, |
|
"loss": 0.1663, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.36574215178299296, |
|
"grad_norm": 5.0, |
|
"learning_rate": 3.569901001812696e-05, |
|
"loss": 0.2032, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3667581022046124, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.562609802420321e-05, |
|
"loss": 0.2395, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.36777405262623186, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 3.555307554923229e-05, |
|
"loss": 0.1799, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.36879000304785126, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 3.547994335243925e-05, |
|
"loss": 0.1771, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.3698059534694707, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.540670219418989e-05, |
|
"loss": 0.2123, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.3708219038910901, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 3.53333528359829e-05, |
|
"loss": 0.2159, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.37183785431270955, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 3.525989604044198e-05, |
|
"loss": 0.2749, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.37285380473432894, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 3.5186332571307826e-05, |
|
"loss": 0.1613, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.3738697551559484, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 3.511266319343025e-05, |
|
"loss": 0.1877, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.37488570557756784, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.503888867276022e-05, |
|
"loss": 0.2185, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.37590165599918723, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.4965009776341894e-05, |
|
"loss": 0.2195, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3769176064208067, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.489102727230461e-05, |
|
"loss": 0.2344, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.3779335568424261, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 3.481694192985496e-05, |
|
"loss": 0.1863, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.3789495072640455, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.474275451926875e-05, |
|
"loss": 0.1894, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.3799654576856649, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.4668465811883e-05, |
|
"loss": 0.2127, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.38098140810728437, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 3.4594076580087914e-05, |
|
"loss": 0.2125, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.38199735852890376, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.451958759731889e-05, |
|
"loss": 0.1801, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.3830133089505232, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 3.4444999638048456e-05, |
|
"loss": 0.1949, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.38402925937214266, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 3.437031347777817e-05, |
|
"loss": 0.2719, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.38504520979376206, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 3.4295529893030634e-05, |
|
"loss": 0.1697, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.3860611602153815, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.422064966134138e-05, |
|
"loss": 0.1557, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3870771106370009, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.4145673561250794e-05, |
|
"loss": 0.2129, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.38809306105862035, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.4070602372296e-05, |
|
"loss": 0.2068, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.38910901148023974, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.39954368750028e-05, |
|
"loss": 0.1634, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3901249619018592, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.392017785087752e-05, |
|
"loss": 0.2299, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.39114091232347864, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 3.38448260823989e-05, |
|
"loss": 0.1585, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.39215686274509803, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 3.376938235300996e-05, |
|
"loss": 0.2382, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.3931728131667175, |
|
"grad_norm": 5.375, |
|
"learning_rate": 3.369384744710984e-05, |
|
"loss": 0.1987, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.3941887635883369, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.361822215004566e-05, |
|
"loss": 0.2316, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3952047140099563, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.354250724810436e-05, |
|
"loss": 0.2019, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3962206644315757, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 3.34667035285045e-05, |
|
"loss": 0.187, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.39723661485319517, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 3.339081177938811e-05, |
|
"loss": 0.2353, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.39825256527481456, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.331483278981244e-05, |
|
"loss": 0.2078, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.399268515696434, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.323876734974183e-05, |
|
"loss": 0.1761, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.40028446611805346, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 3.316261625003943e-05, |
|
"loss": 0.2081, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.40130041653967286, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.308638028245902e-05, |
|
"loss": 0.2087, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.4023163669612923, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.301006023963676e-05, |
|
"loss": 0.1579, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.4033323173829117, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 3.293365691508295e-05, |
|
"loss": 0.1904, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.40434826780453115, |
|
"grad_norm": 3.0, |
|
"learning_rate": 3.285717110317379e-05, |
|
"loss": 0.1991, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.40536421822615054, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 3.27806035991431e-05, |
|
"loss": 0.1445, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.40638016864777, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.2703955199074075e-05, |
|
"loss": 0.2393, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.40739611906938944, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.262722669989098e-05, |
|
"loss": 0.1789, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.40841206949100883, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 3.255041889935092e-05, |
|
"loss": 0.1511, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.4094280199126283, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.247353259603547e-05, |
|
"loss": 0.2066, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.4104439703342477, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.239656858934242e-05, |
|
"loss": 0.1564, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.4114599207558671, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.231952767947746e-05, |
|
"loss": 0.1503, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.4124758711774865, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.2242410667445844e-05, |
|
"loss": 0.1633, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.41349182159910597, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 3.2165218355044076e-05, |
|
"loss": 0.1492, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.41450777202072536, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 3.2087951544851566e-05, |
|
"loss": 0.3051, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.4155237224423448, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 3.20106110402223e-05, |
|
"loss": 0.2229, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.41653967286396426, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.1933197645276455e-05, |
|
"loss": 0.2224, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.41755562328558365, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.185571216489209e-05, |
|
"loss": 0.1297, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.4185715737072031, |
|
"grad_norm": 3.625, |
|
"learning_rate": 3.177815540469669e-05, |
|
"loss": 0.2074, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.4195875241288225, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.1700528171058916e-05, |
|
"loss": 0.1949, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.42060347455044195, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 3.162283127108011e-05, |
|
"loss": 0.1661, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.42161942497206134, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.154506551258594e-05, |
|
"loss": 0.2275, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.4226353753936808, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 3.146723170411804e-05, |
|
"loss": 0.2242, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.42365132581530024, |
|
"grad_norm": 6.625, |
|
"learning_rate": 3.138933065492552e-05, |
|
"loss": 0.1897, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.42466727623691963, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.131136317495665e-05, |
|
"loss": 0.1629, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.4256832266585391, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 3.1233330074850364e-05, |
|
"loss": 0.1535, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.4266991770801585, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.115523216592786e-05, |
|
"loss": 0.2494, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.4277151275017779, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.107707026018417e-05, |
|
"loss": 0.1705, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.4287310779233973, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 3.09988451702797e-05, |
|
"loss": 0.1507, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.42974702834501677, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.0920557709531804e-05, |
|
"loss": 0.3071, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.43076297876663616, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 3.0842208691906306e-05, |
|
"loss": 0.199, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.4317789291882556, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.076379893200904e-05, |
|
"loss": 0.1987, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.43279487960987506, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 3.068532924507739e-05, |
|
"loss": 0.1945, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.43381083003149445, |
|
"grad_norm": 5.875, |
|
"learning_rate": 3.060680044697183e-05, |
|
"loss": 0.1937, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.4348267804531139, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 3.052821335416739e-05, |
|
"loss": 0.1643, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.4358427308747333, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 3.0449568783745203e-05, |
|
"loss": 0.1455, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.43685868129635275, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 3.0370867553384023e-05, |
|
"loss": 0.1891, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.43787463171797214, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.029211048135171e-05, |
|
"loss": 0.1377, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.4388905821395916, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.021329838649668e-05, |
|
"loss": 0.2194, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.43990653256121104, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.0134432088239462e-05, |
|
"loss": 0.1915, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.44092248298283043, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.0055512406564146e-05, |
|
"loss": 0.1794, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.4419384334044499, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.9976540162009836e-05, |
|
"loss": 0.2154, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.4429543838260693, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 2.9897516175662155e-05, |
|
"loss": 0.1861, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.4439703342476887, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 2.9818441269144693e-05, |
|
"loss": 0.1857, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.4449862846693081, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.9739316264610452e-05, |
|
"loss": 0.1493, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.44600223509092757, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.966014198473332e-05, |
|
"loss": 0.186, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.44701818551254696, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.9580919252699502e-05, |
|
"loss": 0.1963, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.4480341359341664, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 2.9501648892198984e-05, |
|
"loss": 0.2882, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.44905008635578586, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.942233172741693e-05, |
|
"loss": 0.2154, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.45006603677740525, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.934296858302515e-05, |
|
"loss": 0.2228, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.4510819871990247, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.9263560284173485e-05, |
|
"loss": 0.1637, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.4520979376206441, |
|
"grad_norm": 4.5, |
|
"learning_rate": 2.91841076564813e-05, |
|
"loss": 0.1396, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.45311388804226355, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 2.9104611526028808e-05, |
|
"loss": 0.186, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.45412983846388294, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.902507271934855e-05, |
|
"loss": 0.1706, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.4551457888855024, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.8945492063416768e-05, |
|
"loss": 0.2191, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.45616173930712184, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 2.8865870385644823e-05, |
|
"loss": 0.1651, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.45717768972874123, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.8786208513870583e-05, |
|
"loss": 0.1907, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4581936401503607, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 2.8706507276349815e-05, |
|
"loss": 0.2256, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.4592095905719801, |
|
"grad_norm": 3.375, |
|
"learning_rate": 2.8626767501747588e-05, |
|
"loss": 0.215, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.4602255409935995, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.854699001912964e-05, |
|
"loss": 0.2241, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.4612414914152189, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.846717565795376e-05, |
|
"loss": 0.1541, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.46225744183683837, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.8387325248061164e-05, |
|
"loss": 0.1718, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.46327339225845776, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 2.8307439619667897e-05, |
|
"loss": 0.259, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4642893426800772, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.8227519603356157e-05, |
|
"loss": 0.2205, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.46530529310169666, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 2.8147566030065677e-05, |
|
"loss": 0.2256, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.46632124352331605, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 2.8067579731085085e-05, |
|
"loss": 0.1671, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.4673371939449355, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.7987561538043273e-05, |
|
"loss": 0.2471, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4683531443665549, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 2.7907512282900727e-05, |
|
"loss": 0.1749, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.46936909478817435, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 2.782743279794091e-05, |
|
"loss": 0.2276, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.47038504520979374, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 2.7747323915761574e-05, |
|
"loss": 0.1971, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.4714009956314132, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 2.7667186469266122e-05, |
|
"loss": 0.1951, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.47241694605303264, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.7587021291654924e-05, |
|
"loss": 0.2045, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.47343289647465203, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.750682921641672e-05, |
|
"loss": 0.155, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.4744488468962715, |
|
"grad_norm": 4.375, |
|
"learning_rate": 2.7426611077319864e-05, |
|
"loss": 0.2038, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.4754647973178909, |
|
"grad_norm": 5.5, |
|
"learning_rate": 2.734636770840372e-05, |
|
"loss": 0.159, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.4764807477395103, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.7266099943969976e-05, |
|
"loss": 0.1566, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.4774966981611297, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.7185808618573943e-05, |
|
"loss": 0.1927, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.47851264858274917, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.710549456701592e-05, |
|
"loss": 0.1873, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.47952859900436856, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 2.702515862433247e-05, |
|
"loss": 0.2474, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.480544549425988, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.6944801625787795e-05, |
|
"loss": 0.204, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.48156049984760746, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.6864424406864984e-05, |
|
"loss": 0.1758, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.48257645026922685, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.6784027803257377e-05, |
|
"loss": 0.161, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.4835924006908463, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.6703612650859848e-05, |
|
"loss": 0.1469, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.4846083511124657, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 2.6623179785760148e-05, |
|
"loss": 0.1858, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.48562430153408515, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.6542730044230175e-05, |
|
"loss": 0.176, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.48664025195570454, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.6462264262717278e-05, |
|
"loss": 0.1657, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.487656202377324, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 2.6381783277835605e-05, |
|
"loss": 0.2705, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.48867215279894344, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 2.6301287926357355e-05, |
|
"loss": 0.2252, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.48968810322056283, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.622077904520411e-05, |
|
"loss": 0.2141, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.4907040536421823, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 2.6140257471438108e-05, |
|
"loss": 0.1935, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.4917200040638017, |
|
"grad_norm": 3.625, |
|
"learning_rate": 2.6059724042253574e-05, |
|
"loss": 0.2121, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.4927359544854211, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.5979179594967983e-05, |
|
"loss": 0.1221, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.4937519049070405, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 2.5898624967013367e-05, |
|
"loss": 0.2208, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.49476785532865997, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.5818060995927607e-05, |
|
"loss": 0.1904, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.49578380575027936, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 2.573748851934574e-05, |
|
"loss": 0.1658, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4967997561718988, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.5656908374991213e-05, |
|
"loss": 0.1626, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.49781570659351826, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.557632140066721e-05, |
|
"loss": 0.1905, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.49883165701513765, |
|
"grad_norm": 4.875, |
|
"learning_rate": 2.5495728434247917e-05, |
|
"loss": 0.2591, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.4998476074367571, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.5415130313669845e-05, |
|
"loss": 0.1359, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.5008635578583766, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.5334527876923063e-05, |
|
"loss": 0.2353, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.501879508279996, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.5253921962042525e-05, |
|
"loss": 0.2173, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.5028954587016153, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 2.5173313407099373e-05, |
|
"loss": 0.1631, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.5039114091232347, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.5092703050192163e-05, |
|
"loss": 0.1884, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.5049273595448542, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.501209172943819e-05, |
|
"loss": 0.217, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.5059433099664736, |
|
"grad_norm": 4.375, |
|
"learning_rate": 2.49314802829648e-05, |
|
"loss": 0.1854, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.506959260388093, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.4850869548900628e-05, |
|
"loss": 0.2049, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.5079752108097125, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 2.477026036536688e-05, |
|
"loss": 0.2093, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5089911612313319, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.4689653570468677e-05, |
|
"loss": 0.164, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.5100071116529513, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 2.460905000228628e-05, |
|
"loss": 0.1649, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.5110230620745707, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.4528450498866428e-05, |
|
"loss": 0.1777, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.5120390124961902, |
|
"grad_norm": 3.0, |
|
"learning_rate": 2.444785589821356e-05, |
|
"loss": 0.1505, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.5130549629178096, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.436726703828118e-05, |
|
"loss": 0.2672, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.514070913339429, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 2.428668475696308e-05, |
|
"loss": 0.1756, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.5150868637610485, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.420610989208465e-05, |
|
"loss": 0.1655, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.5161028141826679, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.412554328139419e-05, |
|
"loss": 0.1579, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.5171187646042873, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.404498576255416e-05, |
|
"loss": 0.1599, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 2.3964438173132522e-05, |
|
"loss": 0.1508, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5191506654475262, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 2.388390135059395e-05, |
|
"loss": 0.1578, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.5201666158691456, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.3803376132291226e-05, |
|
"loss": 0.1374, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.521182566290765, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 2.3722863355456436e-05, |
|
"loss": 0.1854, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.5221985167123845, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 2.364236385719236e-05, |
|
"loss": 0.1391, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.5232144671340039, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 2.356187847446366e-05, |
|
"loss": 0.2106, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.5242304175556233, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 2.348140804408829e-05, |
|
"loss": 0.2383, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.5252463679772427, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 2.3400953402728713e-05, |
|
"loss": 0.1537, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.5262623183988622, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.332051538688322e-05, |
|
"loss": 0.1841, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.5272782688204816, |
|
"grad_norm": 3.25, |
|
"learning_rate": 2.3240094832877287e-05, |
|
"loss": 0.1855, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.528294219242101, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 2.3159692576854793e-05, |
|
"loss": 0.2625, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5293101696637205, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 2.3079309454769413e-05, |
|
"loss": 0.1292, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.5303261200853399, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.2998946302375827e-05, |
|
"loss": 0.1263, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.5313420705069593, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 2.2918603955221148e-05, |
|
"loss": 0.2296, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.5323580209285786, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.283828324863613e-05, |
|
"loss": 0.1231, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.5333739713501982, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 2.2757985017726557e-05, |
|
"loss": 0.1939, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.5343899217718175, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 2.2677710097364495e-05, |
|
"loss": 0.168, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.5354058721934369, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.259745932217969e-05, |
|
"loss": 0.1883, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.5364218226150563, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 2.2517233526550817e-05, |
|
"loss": 0.1898, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.5374377730366758, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.2437033544596837e-05, |
|
"loss": 0.1838, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.5384537234582952, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 2.2356860210168336e-05, |
|
"loss": 0.1553, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.5394696738799146, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 2.2276714356838824e-05, |
|
"loss": 0.2248, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.5404856243015341, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 2.2196596817896118e-05, |
|
"loss": 0.1421, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.5415015747231535, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 2.2116508426333596e-05, |
|
"loss": 0.1947, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.5425175251447729, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 2.2036450014841652e-05, |
|
"loss": 0.2207, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.5435334755663923, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 2.19564224157989e-05, |
|
"loss": 0.2208, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.5445494259880118, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 2.1876426461263654e-05, |
|
"loss": 0.1739, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.5455653764096312, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.179646298296519e-05, |
|
"loss": 0.1938, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.5465813268312506, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 2.171653281229511e-05, |
|
"loss": 0.1736, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.5475972772528701, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.1636636780298732e-05, |
|
"loss": 0.2167, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.5486132276744895, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 2.1556775717666427e-05, |
|
"loss": 0.1711, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.5496291780961089, |
|
"grad_norm": 5.125, |
|
"learning_rate": 2.147695045472499e-05, |
|
"loss": 0.1789, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.5506451285177283, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 2.1397161821428973e-05, |
|
"loss": 0.2187, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.5516610789393478, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.131741064735212e-05, |
|
"loss": 0.1367, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.5526770293609672, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.1237697761678684e-05, |
|
"loss": 0.1574, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.5536929797825866, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.1158023993194848e-05, |
|
"loss": 0.1301, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.5547089302042061, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 2.107839017028005e-05, |
|
"loss": 0.2782, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.5557248806258255, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 2.0998797120898457e-05, |
|
"loss": 0.2024, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.5567408310474449, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.0919245672590277e-05, |
|
"loss": 0.1755, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.5577567814690643, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.083973665246318e-05, |
|
"loss": 0.2058, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.5587727318906838, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.076027088718373e-05, |
|
"loss": 0.2159, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5597886823123032, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 2.0680849202968743e-05, |
|
"loss": 0.2139, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.5608046327339226, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.060147242557674e-05, |
|
"loss": 0.183, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.5618205831555421, |
|
"grad_norm": 5.5, |
|
"learning_rate": 2.0522141380299308e-05, |
|
"loss": 0.1673, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.5628365335771615, |
|
"grad_norm": 4.25, |
|
"learning_rate": 2.044285689195258e-05, |
|
"loss": 0.1674, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.5638524839987809, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.0363619784868604e-05, |
|
"loss": 0.1531, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.5648684344204002, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.0284430882886836e-05, |
|
"loss": 0.1665, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.5658843848420197, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 2.020529100934549e-05, |
|
"loss": 0.1717, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.5669003352636391, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.012620098707306e-05, |
|
"loss": 0.1167, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.5679162856852585, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 2.004716163837972e-05, |
|
"loss": 0.2084, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.5689322361068779, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.996817378504876e-05, |
|
"loss": 0.1939, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.5699481865284974, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.9889238248328108e-05, |
|
"loss": 0.1241, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.5709641369501168, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.981035584892171e-05, |
|
"loss": 0.1865, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.5719800873717362, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.9731527406981072e-05, |
|
"loss": 0.1639, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.5729960377933557, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9652753742096655e-05, |
|
"loss": 0.2019, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.5740119882149751, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.9574035673289432e-05, |
|
"loss": 0.1829, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.5750279386365945, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.9495374019002312e-05, |
|
"loss": 0.2267, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.5760438890582139, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.9416769597091673e-05, |
|
"loss": 0.1411, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.5770598394798334, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.9338223224818818e-05, |
|
"loss": 0.1476, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.5780757899014528, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.9259735718841524e-05, |
|
"loss": 0.1417, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.5790917403230722, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.918130789520551e-05, |
|
"loss": 0.1592, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5801076907446917, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.9102940569335963e-05, |
|
"loss": 0.161, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.5811236411663111, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 1.9024634556029093e-05, |
|
"loss": 0.1614, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.5821395915879305, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.89463906694436e-05, |
|
"loss": 0.1505, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.5831555420095499, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.8868209723092286e-05, |
|
"loss": 0.1674, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.5841714924311694, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.8790092529833508e-05, |
|
"loss": 0.1468, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5851874428527888, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.871203990186281e-05, |
|
"loss": 0.1903, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.5862033932744082, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 1.8634052650704415e-05, |
|
"loss": 0.2644, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.5872193436960277, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.8556131587202848e-05, |
|
"loss": 0.1968, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.8478277521514424e-05, |
|
"loss": 0.2249, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.5892512445392665, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.8400491263098906e-05, |
|
"loss": 0.1881, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.5902671949608859, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.832277362071106e-05, |
|
"loss": 0.1352, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.5912831453825054, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.824512540239221e-05, |
|
"loss": 0.2737, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.5922990958041248, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.81675474154619e-05, |
|
"loss": 0.1566, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.5933150462257442, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.8090040466509444e-05, |
|
"loss": 0.1999, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.5943309966473637, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.8012605361385592e-05, |
|
"loss": 0.2372, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.5953469470689831, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.7935242905194087e-05, |
|
"loss": 0.2411, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.5963628974906025, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.785795390228336e-05, |
|
"loss": 0.138, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.5973788479122218, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.7780739156238125e-05, |
|
"loss": 0.1867, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.5983947983338413, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.770359946987105e-05, |
|
"loss": 0.2091, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.5994107487554607, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.7626535645214378e-05, |
|
"loss": 0.2091, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.6004266991770801, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.7549548483511614e-05, |
|
"loss": 0.1927, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.6014426495986995, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.7472638785209198e-05, |
|
"loss": 0.1893, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.602458600020319, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.7395807349948145e-05, |
|
"loss": 0.1557, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.6034745504419384, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.73190549765558e-05, |
|
"loss": 0.1717, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.6044905008635578, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.724238246303745e-05, |
|
"loss": 0.1879, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.6055064512851773, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.71657906065681e-05, |
|
"loss": 0.1908, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.6065224017067967, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.7089280203484115e-05, |
|
"loss": 0.1712, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.6075383521284161, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.701285204927502e-05, |
|
"loss": 0.1454, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.6085543025500355, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.693650693857515e-05, |
|
"loss": 0.2283, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.609570252971655, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.6860245665155466e-05, |
|
"loss": 0.2188, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6105862033932744, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.678406902191521e-05, |
|
"loss": 0.1605, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.6116021538148938, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.670797780087374e-05, |
|
"loss": 0.1472, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.6126181042365133, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6631972793162288e-05, |
|
"loss": 0.1676, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.6136340546581327, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.6556054789015662e-05, |
|
"loss": 0.1508, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.6146500050797521, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.6480224577764132e-05, |
|
"loss": 0.1981, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.6156659555013715, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.6404482947825137e-05, |
|
"loss": 0.2514, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.616681905922991, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.6328830686695154e-05, |
|
"loss": 0.2397, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.6176978563446104, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.625326858094144e-05, |
|
"loss": 0.1523, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.6187138067662298, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.6177797416193953e-05, |
|
"loss": 0.218, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.6197297571878493, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.6102417977137052e-05, |
|
"loss": 0.1476, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.6207457076094687, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.602713104750147e-05, |
|
"loss": 0.1818, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.6217616580310881, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.5951937410056087e-05, |
|
"loss": 0.2061, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.6227776084527075, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 1.587683784659979e-05, |
|
"loss": 0.1566, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.623793558874327, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.58018331379534e-05, |
|
"loss": 0.1376, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.6248095092959464, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.572692406395149e-05, |
|
"loss": 0.1655, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.6258254597175658, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.5652111403434338e-05, |
|
"loss": 0.2363, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.6268414101391853, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.5577395934239757e-05, |
|
"loss": 0.2464, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.6278573605608047, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.5502778433195085e-05, |
|
"loss": 0.1898, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.628873310982424, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.5428259676109048e-05, |
|
"loss": 0.1804, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.6298892614040434, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.5353840437763732e-05, |
|
"loss": 0.1409, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.630905211825663, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.5279521491906496e-05, |
|
"loss": 0.2449, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.6319211622472823, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.520530361124195e-05, |
|
"loss": 0.2103, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.6329371126689017, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.5131187567423937e-05, |
|
"loss": 0.2156, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.6339530630905211, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.5057174131047446e-05, |
|
"loss": 0.161, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.6349690135121406, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.4983264071640679e-05, |
|
"loss": 0.1757, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.63598496393376, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.490945815765699e-05, |
|
"loss": 0.2011, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.6370009143553794, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.4835757156466945e-05, |
|
"loss": 0.1658, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.6380168647769989, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.4762161834350271e-05, |
|
"loss": 0.1754, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.6390328151986183, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.4688672956487987e-05, |
|
"loss": 0.1427, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.6400487656202377, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.4615291286954352e-05, |
|
"loss": 0.1517, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.6410647160418571, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.4542017588709005e-05, |
|
"loss": 0.2348, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.6420806664634766, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.4468852623588961e-05, |
|
"loss": 0.2089, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.643096616885096, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.4395797152300719e-05, |
|
"loss": 0.1702, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.6441125673067154, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.4322851934412382e-05, |
|
"loss": 0.1017, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.6451285177283349, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.4250017728345716e-05, |
|
"loss": 0.1813, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.6461444681499543, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.4177295291368292e-05, |
|
"loss": 0.1095, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.6471604185715737, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.410468537958558e-05, |
|
"loss": 0.2259, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.6481763689931931, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.4032188747933136e-05, |
|
"loss": 0.1595, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.6491923194148126, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.39598061501687e-05, |
|
"loss": 0.2226, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.650208269836432, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.388753833886442e-05, |
|
"loss": 0.2132, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.6512242202580514, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.3815386065398945e-05, |
|
"loss": 0.1227, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.6522401706796709, |
|
"grad_norm": 1.0, |
|
"learning_rate": 1.3743350079949705e-05, |
|
"loss": 0.1755, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.6532561211012903, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.3671431131485057e-05, |
|
"loss": 0.1552, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.6542720715229097, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.3599629967756483e-05, |
|
"loss": 0.1917, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.6552880219445291, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.3527947335290877e-05, |
|
"loss": 0.1812, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.6563039723661486, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.3456383979382708e-05, |
|
"loss": 0.1896, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.657319922787768, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.3384940644086352e-05, |
|
"loss": 0.1484, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.6583358732093874, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.3313618072208268e-05, |
|
"loss": 0.1334, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.6593518236310069, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.3242417005299357e-05, |
|
"loss": 0.1351, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.6603677740526263, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.31713381836472e-05, |
|
"loss": 0.1717, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.6613837244742456, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.3100382346268392e-05, |
|
"loss": 0.1867, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.662399674895865, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.3029550230900812e-05, |
|
"loss": 0.1997, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.6634156253174845, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.2958842573996016e-05, |
|
"loss": 0.1969, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.6644315757391039, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 1.2888260110711525e-05, |
|
"loss": 0.1469, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.6654475261607233, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.2817803574903212e-05, |
|
"loss": 0.1524, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.6664634765823427, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.2747473699117668e-05, |
|
"loss": 0.159, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.6674794270039622, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.267727121458458e-05, |
|
"loss": 0.1999, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.6684953774255816, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.2607196851209137e-05, |
|
"loss": 0.2216, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.669511327847201, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.2537251337564412e-05, |
|
"loss": 0.1607, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.6705272782688205, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2467435400883839e-05, |
|
"loss": 0.2187, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.6715432286904399, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.239774976705359e-05, |
|
"loss": 0.1753, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.6725591791120593, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.2328195160605092e-05, |
|
"loss": 0.194, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.6735751295336787, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.225877230470743e-05, |
|
"loss": 0.1485, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.6745910799552982, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.218948192115988e-05, |
|
"loss": 0.1847, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.6756070303769176, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.21203247303844e-05, |
|
"loss": 0.1874, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.676622980798537, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.2051301451418073e-05, |
|
"loss": 0.2377, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.6776389312201565, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.198241280190574e-05, |
|
"loss": 0.1508, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.6786548816417759, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.1913659498092431e-05, |
|
"loss": 0.1537, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.6796708320633953, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.184504225481601e-05, |
|
"loss": 0.2339, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.6806867824850147, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.177656178549966e-05, |
|
"loss": 0.2102, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.6817027329066342, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.1708218802144536e-05, |
|
"loss": 0.1435, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.6827186833282536, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.1640014015322323e-05, |
|
"loss": 0.1823, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.683734633749873, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1571948134167862e-05, |
|
"loss": 0.1154, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.6847505841714925, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.1504021866371761e-05, |
|
"loss": 0.2105, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.6857665345931119, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.143623591817304e-05, |
|
"loss": 0.1317, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.6867824850147313, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.1368590994351835e-05, |
|
"loss": 0.1406, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.6877984354363507, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.130108779822198e-05, |
|
"loss": 0.1425, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.6888143858579702, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.1233727031623783e-05, |
|
"loss": 0.1623, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.6898303362795896, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.1166509394916682e-05, |
|
"loss": 0.1591, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.690846286701209, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.1099435586971982e-05, |
|
"loss": 0.1758, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6918622371228285, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1032506305165555e-05, |
|
"loss": 0.1018, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.6928781875444479, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.0965722245370641e-05, |
|
"loss": 0.1485, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.6938941379660672, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.0899084101950561e-05, |
|
"loss": 0.1762, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.6949100883876866, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.0832592567751555e-05, |
|
"loss": 0.1402, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.6959260388093061, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.0766248334095505e-05, |
|
"loss": 0.2278, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.6969419892309255, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 1.0700052090772828e-05, |
|
"loss": 0.1969, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.6979579396525449, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.0634004526035249e-05, |
|
"loss": 0.2073, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.6989738900741643, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.0568106326588645e-05, |
|
"loss": 0.1902, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.6999898404957838, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.0502358177585953e-05, |
|
"loss": 0.2165, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.7010057909174032, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.0436760762619977e-05, |
|
"loss": 0.1952, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.7020217413390226, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.0371314763716347e-05, |
|
"loss": 0.1422, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.7030376917606421, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0306020861326388e-05, |
|
"loss": 0.0961, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.7040536421822615, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 1.0240879734320068e-05, |
|
"loss": 0.1542, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.7050695926038809, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.0175892059978901e-05, |
|
"loss": 0.1748, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.7060855430255003, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.0111058513988958e-05, |
|
"loss": 0.0819, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.7071014934471198, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.0046379770433803e-05, |
|
"loss": 0.1933, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.7081174438687392, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.98185650178749e-06, |
|
"loss": 0.1891, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.7091333942903586, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.917489378907591e-06, |
|
"loss": 0.2102, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.7101493447119781, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 9.853279071028212e-06, |
|
"loss": 0.1714, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.7111652951335975, |
|
"grad_norm": 2.375, |
|
"learning_rate": 9.78922624575303e-06, |
|
"loss": 0.1299, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7121812455552169, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.72533156904833e-06, |
|
"loss": 0.1914, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.7131971959768363, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 9.661595705236137e-06, |
|
"loss": 0.2377, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.7142131463984558, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.598019316987244e-06, |
|
"loss": 0.1851, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.7152290968200752, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.53460306531439e-06, |
|
"loss": 0.2661, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.7162450472416946, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.471347609565311e-06, |
|
"loss": 0.1669, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.7172609976633141, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 9.408253607415957e-06, |
|
"loss": 0.2487, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.7182769480849335, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 9.345321714863614e-06, |
|
"loss": 0.186, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.7192928985065529, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 9.282552586220075e-06, |
|
"loss": 0.2249, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.7203088489281723, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 9.219946874104885e-06, |
|
"loss": 0.1255, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.7213247993497918, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 9.157505229438481e-06, |
|
"loss": 0.1999, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.7223407497714112, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.095228301435518e-06, |
|
"loss": 0.199, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.7233567001930306, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.03311673759802e-06, |
|
"loss": 0.2182, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.7243726506146501, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 8.971171183708733e-06, |
|
"loss": 0.1573, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.7253886010362695, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 8.909392283824353e-06, |
|
"loss": 0.2044, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.7264045514578888, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 8.847780680268872e-06, |
|
"loss": 0.11, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.7274205018795082, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 8.786337013626853e-06, |
|
"loss": 0.1897, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.7284364523011277, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 8.725061922736799e-06, |
|
"loss": 0.153, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.7294524027227471, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 8.663956044684532e-06, |
|
"loss": 0.1746, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.7304683531443665, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 8.603020014796507e-06, |
|
"loss": 0.2284, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.7314843035659859, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 8.542254466633273e-06, |
|
"loss": 0.1186, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.7325002539876054, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.481660031982844e-06, |
|
"loss": 0.1971, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.7335162044092248, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 8.421237340854157e-06, |
|
"loss": 0.196, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.7345321548308442, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 8.360987021470479e-06, |
|
"loss": 0.1724, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.7355481052524637, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 8.300909700262929e-06, |
|
"loss": 0.175, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.7365640556740831, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 8.241006001863924e-06, |
|
"loss": 0.2276, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.7375800060957025, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 8.181276549100714e-06, |
|
"loss": 0.2029, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.7385959565173219, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 8.12172196298887e-06, |
|
"loss": 0.175, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.7396119069389414, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 8.062342862725878e-06, |
|
"loss": 0.1662, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.7406278573605608, |
|
"grad_norm": 3.375, |
|
"learning_rate": 8.003139865684662e-06, |
|
"loss": 0.1616, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.7416438077821802, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.944113587407157e-06, |
|
"loss": 0.2448, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.7426597582037997, |
|
"grad_norm": 4.125, |
|
"learning_rate": 7.885264641597961e-06, |
|
"loss": 0.1618, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.7436757086254191, |
|
"grad_norm": 3.5, |
|
"learning_rate": 7.826593640117889e-06, |
|
"loss": 0.1134, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.7446916590470385, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 7.76810119297767e-06, |
|
"loss": 0.1795, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.7457076094686579, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 7.709787908331556e-06, |
|
"loss": 0.2736, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.7467235598902774, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 7.651654392471038e-06, |
|
"loss": 0.139, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.7477395103118968, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 7.593701249818521e-06, |
|
"loss": 0.2023, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.7487554607335162, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.535929082921048e-06, |
|
"loss": 0.1702, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.7497714111551357, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 7.47833849244402e-06, |
|
"loss": 0.1835, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.7507873615767551, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 7.420930077164959e-06, |
|
"loss": 0.1713, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.7518033119983745, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 7.363704433967311e-06, |
|
"loss": 0.1906, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.7528192624199939, |
|
"grad_norm": 1.75, |
|
"learning_rate": 7.306662157834185e-06, |
|
"loss": 0.1421, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.7538352128416134, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.2498038418422145e-06, |
|
"loss": 0.1793, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.7548511632632328, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.193130077155374e-06, |
|
"loss": 0.1603, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.7558671136848522, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 7.13664145301883e-06, |
|
"loss": 0.2169, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.7568830641064717, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 7.0803385567528025e-06, |
|
"loss": 0.1685, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.757899014528091, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.024221973746495e-06, |
|
"loss": 0.2282, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.7589149649497104, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.968292287451961e-06, |
|
"loss": 0.1786, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.7599309153713298, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 6.912550079378091e-06, |
|
"loss": 0.1811, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.7609468657929493, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.856995929084506e-06, |
|
"loss": 0.1747, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.7619628162145687, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 6.801630414175589e-06, |
|
"loss": 0.2028, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7629787666361881, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 6.746454110294451e-06, |
|
"loss": 0.2255, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.7639947170578075, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.691467591116931e-06, |
|
"loss": 0.1604, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.765010667479427, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.6366714283456755e-06, |
|
"loss": 0.2559, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.7660266179010464, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 6.582066191704142e-06, |
|
"loss": 0.2034, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.7670425683226658, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 6.527652448930724e-06, |
|
"loss": 0.148, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.7680585187442853, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 6.4734307657728e-06, |
|
"loss": 0.1811, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.7690744691659047, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 6.419401705980924e-06, |
|
"loss": 0.1407, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.7700904195875241, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.365565831302869e-06, |
|
"loss": 0.1893, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.7711063700091435, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.311923701477854e-06, |
|
"loss": 0.1835, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.772122320430763, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.258475874230713e-06, |
|
"loss": 0.1579, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.7731382708523824, |
|
"grad_norm": 4.5, |
|
"learning_rate": 6.205222905266067e-06, |
|
"loss": 0.1794, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.7741542212740018, |
|
"grad_norm": 4.25, |
|
"learning_rate": 6.152165348262598e-06, |
|
"loss": 0.1477, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.7751701716956213, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 6.0993037548672246e-06, |
|
"loss": 0.2396, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.7761861221172407, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 6.046638674689454e-06, |
|
"loss": 0.1717, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.7772020725388601, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 5.994170655295567e-06, |
|
"loss": 0.2646, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.7782180229604795, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 5.9419002422030106e-06, |
|
"loss": 0.1553, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.779233973382099, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 5.889827978874665e-06, |
|
"loss": 0.1854, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.7802499238037184, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.837954406713245e-06, |
|
"loss": 0.1857, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.7812658742253378, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 5.786280065055619e-06, |
|
"loss": 0.1797, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.7822818246469573, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 5.734805491167244e-06, |
|
"loss": 0.1488, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.7832977750685767, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.683531220236576e-06, |
|
"loss": 0.1688, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 5.632457785369455e-06, |
|
"loss": 0.1503, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.7853296759118155, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.581585717583637e-06, |
|
"loss": 0.1658, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.786345626333435, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 5.530915545803209e-06, |
|
"loss": 0.2112, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.7873615767550544, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 5.480447796853141e-06, |
|
"loss": 0.165, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.7883775271766738, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 5.430182995453756e-06, |
|
"loss": 0.1499, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.7893934775982933, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.380121664215329e-06, |
|
"loss": 0.1559, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.7904094280199127, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 5.330264323632611e-06, |
|
"loss": 0.2098, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.791425378441532, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 5.280611492079449e-06, |
|
"loss": 0.1776, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.7924413288631514, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.231163685803361e-06, |
|
"loss": 0.1497, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.7934572792847709, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.181921418920191e-06, |
|
"loss": 0.12, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.7944732297063903, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.13288520340878e-06, |
|
"loss": 0.1981, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.7954891801280097, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 5.084055549105596e-06, |
|
"loss": 0.1389, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.7965051305496291, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 5.035432963699479e-06, |
|
"loss": 0.2293, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.7975210809712486, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.98701795272635e-06, |
|
"loss": 0.1618, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.798537031392868, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 4.938811019563938e-06, |
|
"loss": 0.1755, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.7995529818144874, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.8908126654265475e-06, |
|
"loss": 0.1565, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.8005689322361069, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.843023389359885e-06, |
|
"loss": 0.2176, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.8015848826577263, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.79544368823581e-06, |
|
"loss": 0.2013, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.8026008330793457, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.748074056747234e-06, |
|
"loss": 0.1246, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.8036167835009651, |
|
"grad_norm": 3.5, |
|
"learning_rate": 4.700914987402919e-06, |
|
"loss": 0.1638, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.8046327339225846, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 4.6539669705223916e-06, |
|
"loss": 0.2213, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.805648684344204, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.607230494230849e-06, |
|
"loss": 0.1822, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.8066646347658234, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.560706044454047e-06, |
|
"loss": 0.1763, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.8076805851874429, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 4.514394104913291e-06, |
|
"loss": 0.234, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.8086965356090623, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.468295157120372e-06, |
|
"loss": 0.1939, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.8097124860306817, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.422409680372594e-06, |
|
"loss": 0.174, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.8107284364523011, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 4.3767381517477505e-06, |
|
"loss": 0.2375, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.8117443868739206, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.331281046099203e-06, |
|
"loss": 0.2076, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.81276033729554, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 4.286038836050929e-06, |
|
"loss": 0.2504, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8137762877171594, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 4.241011991992586e-06, |
|
"loss": 0.2102, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.8147922381387789, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.1962009820746635e-06, |
|
"loss": 0.1846, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.8158081885603983, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.15160627220357e-06, |
|
"loss": 0.1741, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.8168241389820177, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 4.107228326036838e-06, |
|
"loss": 0.2078, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.8178400894036371, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.063067604978252e-06, |
|
"loss": 0.212, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.8188560398252566, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.019124568173094e-06, |
|
"loss": 0.1831, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.819871990246876, |
|
"grad_norm": 6.625, |
|
"learning_rate": 3.975399672503341e-06, |
|
"loss": 0.2196, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.8208879406684954, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.931893372582943e-06, |
|
"loss": 0.2002, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.8219038910901149, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 3.888606120753047e-06, |
|
"loss": 0.2138, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.8229198415117343, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.845538367077362e-06, |
|
"loss": 0.2593, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.8239357919333536, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.8026905593374213e-06, |
|
"loss": 0.2062, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.824951742354973, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.760063143027945e-06, |
|
"loss": 0.1343, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.8259676927765925, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.7176565613522313e-06, |
|
"loss": 0.2494, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.8269836431982119, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 3.675471255217516e-06, |
|
"loss": 0.1502, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.8279995936198313, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.6335076632304175e-06, |
|
"loss": 0.1256, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.8290155440414507, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.5917662216923332e-06, |
|
"loss": 0.1709, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.8300314944630702, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.550247364594958e-06, |
|
"loss": 0.1881, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.8310474448846896, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.508951523615725e-06, |
|
"loss": 0.1998, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.832063395306309, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.467879128113352e-06, |
|
"loss": 0.2429, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.8330793457279285, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.427030605123352e-06, |
|
"loss": 0.1942, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.8340952961495479, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.3864063793536043e-06, |
|
"loss": 0.1898, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.8351112465711673, |
|
"grad_norm": 5.375, |
|
"learning_rate": 3.3460068731799577e-06, |
|
"loss": 0.1919, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.8361271969927867, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 3.3058325066417818e-06, |
|
"loss": 0.1516, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.8371431474144062, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.26588369743768e-06, |
|
"loss": 0.1068, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.8381590978360256, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.2261608609210653e-06, |
|
"loss": 0.1203, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.839175048257645, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.186664410095913e-06, |
|
"loss": 0.2172, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.8401909986792645, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 3.1473947556124093e-06, |
|
"loss": 0.1249, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.8412069491008839, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.1083523057627213e-06, |
|
"loss": 0.1744, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.8422228995225033, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.0695374664767353e-06, |
|
"loss": 0.1772, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.8432388499441227, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 3.0309506413178397e-06, |
|
"loss": 0.2302, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.8442548003657422, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.9925922314787136e-06, |
|
"loss": 0.1635, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.8452707507873616, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.954462635777194e-06, |
|
"loss": 0.1573, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.846286701208981, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.916562250652083e-06, |
|
"loss": 0.1608, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.8473026516306005, |
|
"grad_norm": 4.125, |
|
"learning_rate": 2.878891470159048e-06, |
|
"loss": 0.184, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.8483186020522199, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.8414506859665514e-06, |
|
"loss": 0.2141, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.8493345524738393, |
|
"grad_norm": 3.375, |
|
"learning_rate": 2.8042402873517197e-06, |
|
"loss": 0.1729, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.8503505028954587, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 2.76726066119635e-06, |
|
"loss": 0.2252, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.8513664533170782, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.730512191982845e-06, |
|
"loss": 0.1644, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.8523824037386976, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 2.693995261790261e-06, |
|
"loss": 0.1822, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.853398354160317, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 2.657710250290285e-06, |
|
"loss": 0.2068, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.8544143045819365, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 2.621657534743327e-06, |
|
"loss": 0.1224, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.8554302550035559, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 2.5858374899945804e-06, |
|
"loss": 0.179, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.8564462054251752, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 2.550250488470135e-06, |
|
"loss": 0.1873, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.8574621558467946, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 2.5148969001730806e-06, |
|
"loss": 0.1799, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.8584781062684141, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.4797770926796858e-06, |
|
"loss": 0.176, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.8594940566900335, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 2.444891431135571e-06, |
|
"loss": 0.1664, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.8605100071116529, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 2.4102402782518936e-06, |
|
"loss": 0.1512, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.8615259575332723, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 2.3758239943016096e-06, |
|
"loss": 0.1629, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.8625419079548918, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 2.3416429371157013e-06, |
|
"loss": 0.2099, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.8635578583765112, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 2.307697462079464e-06, |
|
"loss": 0.2221, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.8645738087981306, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 2.273987922128809e-06, |
|
"loss": 0.2191, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.8655897592197501, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.240514667746607e-06, |
|
"loss": 0.1843, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.8666057096413695, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.2072780469590245e-06, |
|
"loss": 0.2494, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.8676216600629889, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.1742784053319116e-06, |
|
"loss": 0.1712, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.8686376104846083, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.141516085967224e-06, |
|
"loss": 0.1169, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.8696535609062278, |
|
"grad_norm": 4.25, |
|
"learning_rate": 2.1089914294994434e-06, |
|
"loss": 0.1374, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.8706695113278472, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.0767047740920336e-06, |
|
"loss": 0.2162, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.8716854617494666, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 2.0446564554339187e-06, |
|
"loss": 0.1593, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.8727014121710861, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.0128468067360185e-06, |
|
"loss": 0.1857, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.8737173625927055, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.981276158727749e-06, |
|
"loss": 0.1989, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.8747333130143249, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.949944839653625e-06, |
|
"loss": 0.2077, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.8757492634359443, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.918853175269797e-06, |
|
"loss": 0.2003, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.8767652138575638, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.8880014888407127e-06, |
|
"loss": 0.2486, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.8777811642791832, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.8573901011357336e-06, |
|
"loss": 0.1896, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.8787971147008026, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.8270193304257887e-06, |
|
"loss": 0.1727, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.8798130651224221, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.7968894924800916e-06, |
|
"loss": 0.1687, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.8808290155440415, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.7670009005628291e-06, |
|
"loss": 0.166, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.8818449659656609, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.737353865429936e-06, |
|
"loss": 0.1471, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.8828609163872803, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.7079486953258283e-06, |
|
"loss": 0.1075, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.8838768668088998, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.6787856959802367e-06, |
|
"loss": 0.2113, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.8848928172305192, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.6498651706049945e-06, |
|
"loss": 0.1412, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.8859087676521386, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.6211874198909072e-06, |
|
"loss": 0.1701, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.8869247180737581, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.592752742004605e-06, |
|
"loss": 0.1348, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.8879406684953774, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.5645614325854735e-06, |
|
"loss": 0.1931, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.8889566189169968, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.5366137847425466e-06, |
|
"loss": 0.1705, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.8899725693386162, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.5089100890514769e-06, |
|
"loss": 0.1889, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.8909885197602357, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.4814506335515176e-06, |
|
"loss": 0.1837, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.8920044701818551, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.4542357037425207e-06, |
|
"loss": 0.1728, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.8930204206034745, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.4272655825819713e-06, |
|
"loss": 0.1562, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.8940363710250939, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.4005405504820351e-06, |
|
"loss": 0.1681, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.8950523214467134, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3740608853066634e-06, |
|
"loss": 0.1449, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.8960682718683328, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.347826862368684e-06, |
|
"loss": 0.2418, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.8970842222899522, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 1.3218387544269545e-06, |
|
"loss": 0.2473, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.8981001727115717, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.2960968316835132e-06, |
|
"loss": 0.194, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.8991161231331911, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.2706013617807822e-06, |
|
"loss": 0.2109, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.9001320735548105, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.2453526097987778e-06, |
|
"loss": 0.151, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.9011480239764299, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.2203508382523431e-06, |
|
"loss": 0.1811, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.9021639743980494, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.1955963070884534e-06, |
|
"loss": 0.2004, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.9031799248196688, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.171089273683465e-06, |
|
"loss": 0.1395, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.9041958752412882, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.1468299928404868e-06, |
|
"loss": 0.1915, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.9052118256629077, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.1228187167866943e-06, |
|
"loss": 0.1281, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.9062277760845271, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.099055695170728e-06, |
|
"loss": 0.1627, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.9072437265061465, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.0755411750600962e-06, |
|
"loss": 0.1768, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.9082596769277659, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.052275400938596e-06, |
|
"loss": 0.1544, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.9092756273493854, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.0292586147037764e-06, |
|
"loss": 0.2498, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.9102915777710048, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.0064910556644214e-06, |
|
"loss": 0.1918, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.9113075281926242, |
|
"grad_norm": 4.0, |
|
"learning_rate": 9.839729605380766e-07, |
|
"loss": 0.2388, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.9123234786142437, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 9.61704563448565e-07, |
|
"loss": 0.1944, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.9133394290358631, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.396860959235671e-07, |
|
"loss": 0.1667, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.9143553794574825, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.179177868922085e-07, |
|
"loss": 0.2143, |
|
"step": 9000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 9843, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 0.0, |
|
"train_batch_size": 7, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|