|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5949, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0016809547823163557, |
|
"grad_norm": 79552.359375, |
|
"learning_rate": 4.9915952260884186e-05, |
|
"loss": 1.0656, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0033619095646327114, |
|
"grad_norm": 79056.6640625, |
|
"learning_rate": 4.983190452176837e-05, |
|
"loss": 1.0215, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.005042864346949067, |
|
"grad_norm": 85323.828125, |
|
"learning_rate": 4.9747856782652546e-05, |
|
"loss": 0.9578, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006723819129265423, |
|
"grad_norm": 71872.125, |
|
"learning_rate": 4.966380904353673e-05, |
|
"loss": 0.9151, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.008404773911581778, |
|
"grad_norm": 71638.46875, |
|
"learning_rate": 4.957976130442091e-05, |
|
"loss": 0.8803, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.010085728693898134, |
|
"grad_norm": 74304.5234375, |
|
"learning_rate": 4.9495713565305096e-05, |
|
"loss": 0.9141, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01176668347621449, |
|
"grad_norm": 73676.1953125, |
|
"learning_rate": 4.941166582618928e-05, |
|
"loss": 0.9949, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.013447638258530846, |
|
"grad_norm": 76955.7890625, |
|
"learning_rate": 4.9327618087073463e-05, |
|
"loss": 0.8852, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.015128593040847202, |
|
"grad_norm": 69415.2578125, |
|
"learning_rate": 4.924357034795764e-05, |
|
"loss": 0.8891, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.016809547823163556, |
|
"grad_norm": 84991.7734375, |
|
"learning_rate": 4.9159522608841824e-05, |
|
"loss": 0.939, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01849050260547991, |
|
"grad_norm": 72803.203125, |
|
"learning_rate": 4.907547486972601e-05, |
|
"loss": 0.8879, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.020171457387796268, |
|
"grad_norm": 73581.6640625, |
|
"learning_rate": 4.899142713061019e-05, |
|
"loss": 0.923, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.021852412170112624, |
|
"grad_norm": 80610.7421875, |
|
"learning_rate": 4.8907379391494374e-05, |
|
"loss": 0.8933, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02353336695242898, |
|
"grad_norm": 80112.125, |
|
"learning_rate": 4.882333165237855e-05, |
|
"loss": 0.8761, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.025214321734745335, |
|
"grad_norm": 73625.0546875, |
|
"learning_rate": 4.8739283913262734e-05, |
|
"loss": 0.8442, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02689527651706169, |
|
"grad_norm": 79199.546875, |
|
"learning_rate": 4.865523617414692e-05, |
|
"loss": 0.9312, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.028576231299378047, |
|
"grad_norm": 69096.6484375, |
|
"learning_rate": 4.85711884350311e-05, |
|
"loss": 0.9435, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.030257186081694403, |
|
"grad_norm": 66993.53125, |
|
"learning_rate": 4.8487140695915285e-05, |
|
"loss": 0.8933, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.031938140864010756, |
|
"grad_norm": 72366.515625, |
|
"learning_rate": 4.840309295679946e-05, |
|
"loss": 0.8513, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03361909564632711, |
|
"grad_norm": 70981.46875, |
|
"learning_rate": 4.8319045217683645e-05, |
|
"loss": 0.9126, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03530005042864347, |
|
"grad_norm": 73961.296875, |
|
"learning_rate": 4.823499747856783e-05, |
|
"loss": 0.908, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03698100521095982, |
|
"grad_norm": 71402.046875, |
|
"learning_rate": 4.815094973945201e-05, |
|
"loss": 0.915, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03866195999327618, |
|
"grad_norm": 69597.8984375, |
|
"learning_rate": 4.8066902000336195e-05, |
|
"loss": 0.892, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.040342914775592535, |
|
"grad_norm": 77749.3359375, |
|
"learning_rate": 4.798285426122038e-05, |
|
"loss": 0.9004, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04202386955790889, |
|
"grad_norm": 69176.328125, |
|
"learning_rate": 4.7898806522104555e-05, |
|
"loss": 0.9878, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04370482434022525, |
|
"grad_norm": 76940.0625, |
|
"learning_rate": 4.781475878298874e-05, |
|
"loss": 0.8873, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0453857791225416, |
|
"grad_norm": 81460.0546875, |
|
"learning_rate": 4.773071104387292e-05, |
|
"loss": 0.9022, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.04706673390485796, |
|
"grad_norm": 69520.1953125, |
|
"learning_rate": 4.7646663304757106e-05, |
|
"loss": 0.8729, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.048747688687174315, |
|
"grad_norm": 76301.21875, |
|
"learning_rate": 4.756261556564129e-05, |
|
"loss": 0.9464, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.05042864346949067, |
|
"grad_norm": 72831.2265625, |
|
"learning_rate": 4.7478567826525466e-05, |
|
"loss": 0.8346, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05210959825180703, |
|
"grad_norm": 73389.6328125, |
|
"learning_rate": 4.739452008740965e-05, |
|
"loss": 0.8794, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.05379055303412338, |
|
"grad_norm": 78976.9453125, |
|
"learning_rate": 4.731047234829383e-05, |
|
"loss": 0.9318, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.05547150781643974, |
|
"grad_norm": 72745.15625, |
|
"learning_rate": 4.7226424609178016e-05, |
|
"loss": 0.923, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.057152462598756094, |
|
"grad_norm": 65032.38671875, |
|
"learning_rate": 4.71423768700622e-05, |
|
"loss": 0.9431, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.05883341738107245, |
|
"grad_norm": 67867.875, |
|
"learning_rate": 4.705832913094638e-05, |
|
"loss": 0.853, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.060514372163388806, |
|
"grad_norm": 132520.328125, |
|
"learning_rate": 4.697428139183056e-05, |
|
"loss": 0.8772, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06219532694570516, |
|
"grad_norm": 71777.140625, |
|
"learning_rate": 4.6890233652714743e-05, |
|
"loss": 0.841, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.06387628172802151, |
|
"grad_norm": 71626.65625, |
|
"learning_rate": 4.680618591359893e-05, |
|
"loss": 0.8281, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.06555723651033787, |
|
"grad_norm": 66127.0703125, |
|
"learning_rate": 4.672213817448311e-05, |
|
"loss": 0.9221, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.06723819129265422, |
|
"grad_norm": 66148.8203125, |
|
"learning_rate": 4.6638090435367294e-05, |
|
"loss": 0.8607, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06891914607497059, |
|
"grad_norm": 75503.5078125, |
|
"learning_rate": 4.655404269625147e-05, |
|
"loss": 0.9062, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.07060010085728693, |
|
"grad_norm": 61276.5625, |
|
"learning_rate": 4.6469994957135654e-05, |
|
"loss": 0.8669, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0722810556396033, |
|
"grad_norm": 71196.9453125, |
|
"learning_rate": 4.638594721801984e-05, |
|
"loss": 0.8686, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.07396201042191965, |
|
"grad_norm": 66918.8671875, |
|
"learning_rate": 4.630189947890402e-05, |
|
"loss": 0.9121, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.07564296520423601, |
|
"grad_norm": 64304.73828125, |
|
"learning_rate": 4.6217851739788204e-05, |
|
"loss": 0.8663, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.07732391998655236, |
|
"grad_norm": 62031.97265625, |
|
"learning_rate": 4.613380400067238e-05, |
|
"loss": 0.8258, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.07900487476886872, |
|
"grad_norm": 63083.9140625, |
|
"learning_rate": 4.6049756261556565e-05, |
|
"loss": 0.8657, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.08068582955118507, |
|
"grad_norm": 67486.75, |
|
"learning_rate": 4.596570852244075e-05, |
|
"loss": 0.9225, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.08236678433350143, |
|
"grad_norm": 68711.7890625, |
|
"learning_rate": 4.588166078332493e-05, |
|
"loss": 0.8172, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.08404773911581778, |
|
"grad_norm": 69319.46875, |
|
"learning_rate": 4.5797613044209115e-05, |
|
"loss": 0.8034, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08572869389813415, |
|
"grad_norm": 67749.515625, |
|
"learning_rate": 4.57135653050933e-05, |
|
"loss": 0.9285, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0874096486804505, |
|
"grad_norm": 78357.3203125, |
|
"learning_rate": 4.5629517565977475e-05, |
|
"loss": 0.8566, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.08909060346276686, |
|
"grad_norm": 59291.59375, |
|
"learning_rate": 4.554546982686166e-05, |
|
"loss": 0.8088, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.0907715582450832, |
|
"grad_norm": 72898.515625, |
|
"learning_rate": 4.546142208774584e-05, |
|
"loss": 0.8411, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.09245251302739957, |
|
"grad_norm": 65205.4296875, |
|
"learning_rate": 4.5377374348630026e-05, |
|
"loss": 0.9546, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.09413346780971592, |
|
"grad_norm": 104009.6953125, |
|
"learning_rate": 4.529332660951421e-05, |
|
"loss": 0.8406, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.09581442259203228, |
|
"grad_norm": 68204.453125, |
|
"learning_rate": 4.5209278870398386e-05, |
|
"loss": 0.897, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.09749537737434863, |
|
"grad_norm": 70454.1484375, |
|
"learning_rate": 4.512523113128257e-05, |
|
"loss": 0.839, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.09917633215666499, |
|
"grad_norm": 67070.96875, |
|
"learning_rate": 4.504118339216675e-05, |
|
"loss": 0.9008, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.10085728693898134, |
|
"grad_norm": 70355.5234375, |
|
"learning_rate": 4.4957135653050936e-05, |
|
"loss": 0.9318, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.10253824172129769, |
|
"grad_norm": 68497.7109375, |
|
"learning_rate": 4.487308791393512e-05, |
|
"loss": 0.9127, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.10421919650361405, |
|
"grad_norm": 81705.2578125, |
|
"learning_rate": 4.47890401748193e-05, |
|
"loss": 0.8575, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1059001512859304, |
|
"grad_norm": 72572.75, |
|
"learning_rate": 4.470499243570348e-05, |
|
"loss": 0.8387, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.10758110606824677, |
|
"grad_norm": 66429.328125, |
|
"learning_rate": 4.462094469658766e-05, |
|
"loss": 0.8322, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.10926206085056311, |
|
"grad_norm": 73876.7265625, |
|
"learning_rate": 4.453689695747185e-05, |
|
"loss": 0.8788, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.11094301563287948, |
|
"grad_norm": 73238.40625, |
|
"learning_rate": 4.445284921835603e-05, |
|
"loss": 0.8983, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.11262397041519583, |
|
"grad_norm": 65609.2578125, |
|
"learning_rate": 4.4368801479240214e-05, |
|
"loss": 0.8406, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.11430492519751219, |
|
"grad_norm": 79022.8515625, |
|
"learning_rate": 4.428475374012439e-05, |
|
"loss": 0.8204, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.11598587997982854, |
|
"grad_norm": 63224.625, |
|
"learning_rate": 4.4200706001008574e-05, |
|
"loss": 0.8252, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.1176668347621449, |
|
"grad_norm": 61681.51171875, |
|
"learning_rate": 4.411665826189276e-05, |
|
"loss": 0.9041, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.11934778954446125, |
|
"grad_norm": 65631.578125, |
|
"learning_rate": 4.403261052277694e-05, |
|
"loss": 0.8541, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.12102874432677761, |
|
"grad_norm": 70977.875, |
|
"learning_rate": 4.3948562783661124e-05, |
|
"loss": 0.9705, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.12270969910909396, |
|
"grad_norm": 67375.078125, |
|
"learning_rate": 4.38645150445453e-05, |
|
"loss": 0.8542, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.12439065389141032, |
|
"grad_norm": 60750.97265625, |
|
"learning_rate": 4.3780467305429484e-05, |
|
"loss": 0.7832, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1260716086737267, |
|
"grad_norm": 62710.984375, |
|
"learning_rate": 4.369641956631367e-05, |
|
"loss": 0.855, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.12775256345604302, |
|
"grad_norm": 56731.50390625, |
|
"learning_rate": 4.361237182719785e-05, |
|
"loss": 0.8591, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.12943351823835939, |
|
"grad_norm": 67084.78125, |
|
"learning_rate": 4.3528324088082035e-05, |
|
"loss": 0.8406, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.13111447302067575, |
|
"grad_norm": 66152.15625, |
|
"learning_rate": 4.344427634896622e-05, |
|
"loss": 0.9062, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1327954278029921, |
|
"grad_norm": 71438.4765625, |
|
"learning_rate": 4.3360228609850395e-05, |
|
"loss": 0.8206, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.13447638258530845, |
|
"grad_norm": 60924.875, |
|
"learning_rate": 4.327618087073458e-05, |
|
"loss": 0.8164, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1361573373676248, |
|
"grad_norm": 70178.5390625, |
|
"learning_rate": 4.319213313161876e-05, |
|
"loss": 0.8303, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.13783829214994117, |
|
"grad_norm": 64838.84765625, |
|
"learning_rate": 4.3108085392502945e-05, |
|
"loss": 0.8503, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.13951924693225753, |
|
"grad_norm": 72861.828125, |
|
"learning_rate": 4.302403765338713e-05, |
|
"loss": 0.9165, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.14120020171457387, |
|
"grad_norm": 74359.4140625, |
|
"learning_rate": 4.2939989914271306e-05, |
|
"loss": 0.9102, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.14288115649689023, |
|
"grad_norm": 70118.2109375, |
|
"learning_rate": 4.285594217515549e-05, |
|
"loss": 0.8703, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.1445621112792066, |
|
"grad_norm": 63720.28125, |
|
"learning_rate": 4.277189443603967e-05, |
|
"loss": 0.893, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.14624306606152296, |
|
"grad_norm": 77761.90625, |
|
"learning_rate": 4.2687846696923856e-05, |
|
"loss": 0.9613, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1479240208438393, |
|
"grad_norm": 64969.37890625, |
|
"learning_rate": 4.260379895780804e-05, |
|
"loss": 0.8691, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.14960497562615566, |
|
"grad_norm": 70299.8203125, |
|
"learning_rate": 4.2519751218692216e-05, |
|
"loss": 0.8719, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.15128593040847202, |
|
"grad_norm": 65577.1171875, |
|
"learning_rate": 4.24357034795764e-05, |
|
"loss": 0.8554, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.15296688519078835, |
|
"grad_norm": 71740.8671875, |
|
"learning_rate": 4.235165574046058e-05, |
|
"loss": 0.9948, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.15464783997310472, |
|
"grad_norm": 59819.0234375, |
|
"learning_rate": 4.226760800134477e-05, |
|
"loss": 0.8605, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.15632879475542108, |
|
"grad_norm": 57238.7109375, |
|
"learning_rate": 4.218356026222895e-05, |
|
"loss": 0.8281, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.15800974953773744, |
|
"grad_norm": 70315.765625, |
|
"learning_rate": 4.2099512523113134e-05, |
|
"loss": 0.9126, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.15969070432005378, |
|
"grad_norm": 63374.6796875, |
|
"learning_rate": 4.201546478399731e-05, |
|
"loss": 0.8744, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.16137165910237014, |
|
"grad_norm": 75860.09375, |
|
"learning_rate": 4.1931417044881494e-05, |
|
"loss": 0.8078, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.1630526138846865, |
|
"grad_norm": 58454.1484375, |
|
"learning_rate": 4.184736930576568e-05, |
|
"loss": 0.8084, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.16473356866700287, |
|
"grad_norm": 68843.625, |
|
"learning_rate": 4.176332156664986e-05, |
|
"loss": 0.8458, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.1664145234493192, |
|
"grad_norm": 58082.43359375, |
|
"learning_rate": 4.1679273827534044e-05, |
|
"loss": 0.8148, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.16809547823163556, |
|
"grad_norm": 58417.78125, |
|
"learning_rate": 4.159522608841822e-05, |
|
"loss": 0.8373, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16977643301395193, |
|
"grad_norm": 61807.9140625, |
|
"learning_rate": 4.1511178349302404e-05, |
|
"loss": 0.8633, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.1714573877962683, |
|
"grad_norm": 61544.55078125, |
|
"learning_rate": 4.142713061018659e-05, |
|
"loss": 0.7921, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.17313834257858463, |
|
"grad_norm": 78909.9921875, |
|
"learning_rate": 4.134308287107077e-05, |
|
"loss": 0.8422, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.174819297360901, |
|
"grad_norm": 75892.40625, |
|
"learning_rate": 4.1259035131954955e-05, |
|
"loss": 0.8307, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.17650025214321735, |
|
"grad_norm": 62162.703125, |
|
"learning_rate": 4.117498739283914e-05, |
|
"loss": 0.8243, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.1781812069255337, |
|
"grad_norm": 66981.4609375, |
|
"learning_rate": 4.1090939653723315e-05, |
|
"loss": 0.819, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.17986216170785005, |
|
"grad_norm": 69593.6796875, |
|
"learning_rate": 4.10068919146075e-05, |
|
"loss": 0.8817, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.1815431164901664, |
|
"grad_norm": 67435.6640625, |
|
"learning_rate": 4.092284417549168e-05, |
|
"loss": 0.8077, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.18322407127248277, |
|
"grad_norm": 66041.1640625, |
|
"learning_rate": 4.0838796436375865e-05, |
|
"loss": 0.841, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.18490502605479914, |
|
"grad_norm": 65792.2890625, |
|
"learning_rate": 4.075474869726005e-05, |
|
"loss": 0.8742, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.18658598083711547, |
|
"grad_norm": 61461.2265625, |
|
"learning_rate": 4.0670700958144226e-05, |
|
"loss": 0.8381, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.18826693561943184, |
|
"grad_norm": 62316.87109375, |
|
"learning_rate": 4.058665321902841e-05, |
|
"loss": 0.8629, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.1899478904017482, |
|
"grad_norm": 60252.7421875, |
|
"learning_rate": 4.050260547991259e-05, |
|
"loss": 0.8003, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.19162884518406456, |
|
"grad_norm": 133503.328125, |
|
"learning_rate": 4.0418557740796776e-05, |
|
"loss": 0.8469, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.1933097999663809, |
|
"grad_norm": 64984.43359375, |
|
"learning_rate": 4.033451000168096e-05, |
|
"loss": 0.8024, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.19499075474869726, |
|
"grad_norm": 64012.15234375, |
|
"learning_rate": 4.0250462262565136e-05, |
|
"loss": 0.8655, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.19667170953101362, |
|
"grad_norm": 80887.5859375, |
|
"learning_rate": 4.016641452344932e-05, |
|
"loss": 0.8273, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.19835266431332998, |
|
"grad_norm": 67626.34375, |
|
"learning_rate": 4.00823667843335e-05, |
|
"loss": 0.7986, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.20003361909564632, |
|
"grad_norm": 71404.109375, |
|
"learning_rate": 3.9998319045217687e-05, |
|
"loss": 0.796, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.20171457387796268, |
|
"grad_norm": 69063.609375, |
|
"learning_rate": 3.991427130610187e-05, |
|
"loss": 0.8819, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.20339552866027905, |
|
"grad_norm": 64813.16796875, |
|
"learning_rate": 3.9830223566986053e-05, |
|
"loss": 0.8451, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.20507648344259538, |
|
"grad_norm": 76470.9765625, |
|
"learning_rate": 3.974617582787023e-05, |
|
"loss": 0.8079, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.20675743822491174, |
|
"grad_norm": 57304.13671875, |
|
"learning_rate": 3.9662128088754414e-05, |
|
"loss": 0.7686, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.2084383930072281, |
|
"grad_norm": 58822.703125, |
|
"learning_rate": 3.95780803496386e-05, |
|
"loss": 0.8457, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.21011934778954447, |
|
"grad_norm": 61174.65234375, |
|
"learning_rate": 3.949403261052278e-05, |
|
"loss": 0.752, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2118003025718608, |
|
"grad_norm": 72221.9296875, |
|
"learning_rate": 3.9409984871406964e-05, |
|
"loss": 0.8431, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.21348125735417717, |
|
"grad_norm": 64120.45703125, |
|
"learning_rate": 3.932593713229114e-05, |
|
"loss": 0.8505, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.21516221213649353, |
|
"grad_norm": 67320.4140625, |
|
"learning_rate": 3.9241889393175324e-05, |
|
"loss": 0.9042, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2168431669188099, |
|
"grad_norm": 64738.58203125, |
|
"learning_rate": 3.915784165405951e-05, |
|
"loss": 0.8681, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.21852412170112623, |
|
"grad_norm": 87432.8515625, |
|
"learning_rate": 3.907379391494369e-05, |
|
"loss": 0.8661, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2202050764834426, |
|
"grad_norm": 56789.4453125, |
|
"learning_rate": 3.8989746175827875e-05, |
|
"loss": 0.8058, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.22188603126575895, |
|
"grad_norm": 55766.0234375, |
|
"learning_rate": 3.890569843671206e-05, |
|
"loss": 0.8627, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.22356698604807532, |
|
"grad_norm": 68595.6640625, |
|
"learning_rate": 3.8821650697596235e-05, |
|
"loss": 0.8299, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.22524794083039165, |
|
"grad_norm": 67234.25, |
|
"learning_rate": 3.873760295848042e-05, |
|
"loss": 0.8106, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.22692889561270801, |
|
"grad_norm": 67707.8828125, |
|
"learning_rate": 3.86535552193646e-05, |
|
"loss": 0.8399, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.22860985039502438, |
|
"grad_norm": 60259.1015625, |
|
"learning_rate": 3.8569507480248785e-05, |
|
"loss": 0.8484, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.23029080517734074, |
|
"grad_norm": 68358.1484375, |
|
"learning_rate": 3.848545974113297e-05, |
|
"loss": 0.8264, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.23197175995965708, |
|
"grad_norm": 62014.7109375, |
|
"learning_rate": 3.8401412002017145e-05, |
|
"loss": 0.8507, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.23365271474197344, |
|
"grad_norm": 61060.28125, |
|
"learning_rate": 3.831736426290133e-05, |
|
"loss": 0.8586, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.2353336695242898, |
|
"grad_norm": 57990.08984375, |
|
"learning_rate": 3.823331652378551e-05, |
|
"loss": 0.8011, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.23701462430660616, |
|
"grad_norm": 69242.015625, |
|
"learning_rate": 3.8149268784669696e-05, |
|
"loss": 0.7943, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.2386955790889225, |
|
"grad_norm": 85857.8046875, |
|
"learning_rate": 3.806522104555388e-05, |
|
"loss": 0.864, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.24037653387123886, |
|
"grad_norm": 69873.5234375, |
|
"learning_rate": 3.7981173306438056e-05, |
|
"loss": 0.8578, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.24205748865355523, |
|
"grad_norm": 59830.73046875, |
|
"learning_rate": 3.789712556732224e-05, |
|
"loss": 0.8517, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.2437384434358716, |
|
"grad_norm": 59296.34765625, |
|
"learning_rate": 3.781307782820642e-05, |
|
"loss": 0.864, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.24541939821818792, |
|
"grad_norm": 63888.60546875, |
|
"learning_rate": 3.7729030089090606e-05, |
|
"loss": 0.8161, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.24710035300050429, |
|
"grad_norm": 63231.421875, |
|
"learning_rate": 3.764498234997479e-05, |
|
"loss": 0.8901, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.24878130778282065, |
|
"grad_norm": 62661.12890625, |
|
"learning_rate": 3.756093461085897e-05, |
|
"loss": 0.8638, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.250462262565137, |
|
"grad_norm": 69614.8984375, |
|
"learning_rate": 3.747688687174315e-05, |
|
"loss": 0.8644, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.2521432173474534, |
|
"grad_norm": 67442.9296875, |
|
"learning_rate": 3.7392839132627334e-05, |
|
"loss": 0.7659, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2538241721297697, |
|
"grad_norm": 67846.0390625, |
|
"learning_rate": 3.730879139351152e-05, |
|
"loss": 0.8243, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.25550512691208604, |
|
"grad_norm": 65967.703125, |
|
"learning_rate": 3.72247436543957e-05, |
|
"loss": 0.8573, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.25718608169440244, |
|
"grad_norm": 66188.1484375, |
|
"learning_rate": 3.7140695915279884e-05, |
|
"loss": 0.8491, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.25886703647671877, |
|
"grad_norm": 55857.7890625, |
|
"learning_rate": 3.705664817616406e-05, |
|
"loss": 0.8314, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.2605479912590351, |
|
"grad_norm": 78828.6328125, |
|
"learning_rate": 3.6972600437048244e-05, |
|
"loss": 0.7589, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.2622289460413515, |
|
"grad_norm": 64696.65234375, |
|
"learning_rate": 3.688855269793243e-05, |
|
"loss": 0.774, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.26390990082366783, |
|
"grad_norm": 71397.7265625, |
|
"learning_rate": 3.680450495881661e-05, |
|
"loss": 0.8351, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.2655908556059842, |
|
"grad_norm": 66192.203125, |
|
"learning_rate": 3.6720457219700795e-05, |
|
"loss": 0.8099, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.26727181038830056, |
|
"grad_norm": 70641.421875, |
|
"learning_rate": 3.663640948058498e-05, |
|
"loss": 0.8571, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.2689527651706169, |
|
"grad_norm": 75064.75, |
|
"learning_rate": 3.6552361741469155e-05, |
|
"loss": 0.8681, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2706337199529333, |
|
"grad_norm": 75823.125, |
|
"learning_rate": 3.646831400235334e-05, |
|
"loss": 0.8258, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.2723146747352496, |
|
"grad_norm": 72296.765625, |
|
"learning_rate": 3.638426626323752e-05, |
|
"loss": 0.8561, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.27399562951756595, |
|
"grad_norm": 64309.3125, |
|
"learning_rate": 3.6300218524121705e-05, |
|
"loss": 0.856, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.27567658429988234, |
|
"grad_norm": 73902.8359375, |
|
"learning_rate": 3.621617078500589e-05, |
|
"loss": 0.8424, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.2773575390821987, |
|
"grad_norm": 63552.06640625, |
|
"learning_rate": 3.6132123045890065e-05, |
|
"loss": 0.861, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.27903849386451507, |
|
"grad_norm": 74067.7265625, |
|
"learning_rate": 3.604807530677425e-05, |
|
"loss": 0.7832, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.2807194486468314, |
|
"grad_norm": 68881.3046875, |
|
"learning_rate": 3.596402756765843e-05, |
|
"loss": 0.8443, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.28240040342914774, |
|
"grad_norm": 62996.19921875, |
|
"learning_rate": 3.5879979828542616e-05, |
|
"loss": 0.8075, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.28408135821146413, |
|
"grad_norm": 73364.8125, |
|
"learning_rate": 3.57959320894268e-05, |
|
"loss": 0.8425, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.28576231299378047, |
|
"grad_norm": 67509.296875, |
|
"learning_rate": 3.5711884350310976e-05, |
|
"loss": 0.755, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.2874432677760968, |
|
"grad_norm": 66616.984375, |
|
"learning_rate": 3.562783661119516e-05, |
|
"loss": 0.8679, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.2891242225584132, |
|
"grad_norm": 74004.359375, |
|
"learning_rate": 3.554378887207934e-05, |
|
"loss": 0.8793, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.2908051773407295, |
|
"grad_norm": 59084.9609375, |
|
"learning_rate": 3.5459741132963526e-05, |
|
"loss": 0.8322, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.2924861321230459, |
|
"grad_norm": 74027.28125, |
|
"learning_rate": 3.537569339384771e-05, |
|
"loss": 0.8661, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.29416708690536225, |
|
"grad_norm": 64524.97265625, |
|
"learning_rate": 3.529164565473189e-05, |
|
"loss": 0.7334, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.2958480416876786, |
|
"grad_norm": 74809.4921875, |
|
"learning_rate": 3.520759791561607e-05, |
|
"loss": 0.8091, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.297528996469995, |
|
"grad_norm": 66084.609375, |
|
"learning_rate": 3.5123550176500253e-05, |
|
"loss": 0.8715, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.2992099512523113, |
|
"grad_norm": 62919.05859375, |
|
"learning_rate": 3.503950243738444e-05, |
|
"loss": 0.8326, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.30089090603462765, |
|
"grad_norm": 68219.046875, |
|
"learning_rate": 3.495545469826862e-05, |
|
"loss": 0.8387, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.30257186081694404, |
|
"grad_norm": 61066.58203125, |
|
"learning_rate": 3.4871406959152804e-05, |
|
"loss": 0.8113, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3042528155992604, |
|
"grad_norm": 64199.75390625, |
|
"learning_rate": 3.478735922003698e-05, |
|
"loss": 0.8359, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.3059337703815767, |
|
"grad_norm": 64780.81640625, |
|
"learning_rate": 3.4703311480921164e-05, |
|
"loss": 0.8528, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.3076147251638931, |
|
"grad_norm": 66866.6875, |
|
"learning_rate": 3.461926374180535e-05, |
|
"loss": 0.7795, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.30929567994620943, |
|
"grad_norm": 74536.5078125, |
|
"learning_rate": 3.453521600268953e-05, |
|
"loss": 0.8432, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.3109766347285258, |
|
"grad_norm": 63969.90234375, |
|
"learning_rate": 3.4451168263573714e-05, |
|
"loss": 0.8459, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.31265758951084216, |
|
"grad_norm": 68175.203125, |
|
"learning_rate": 3.43671205244579e-05, |
|
"loss": 0.8437, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.3143385442931585, |
|
"grad_norm": 61941.46484375, |
|
"learning_rate": 3.4283072785342075e-05, |
|
"loss": 0.8182, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.3160194990754749, |
|
"grad_norm": 67861.203125, |
|
"learning_rate": 3.419902504622626e-05, |
|
"loss": 0.8742, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.3177004538577912, |
|
"grad_norm": 66184.25, |
|
"learning_rate": 3.411497730711044e-05, |
|
"loss": 0.8312, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.31938140864010756, |
|
"grad_norm": 63603.37109375, |
|
"learning_rate": 3.4030929567994625e-05, |
|
"loss": 0.8162, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.32106236342242395, |
|
"grad_norm": 76040.2421875, |
|
"learning_rate": 3.394688182887881e-05, |
|
"loss": 0.8555, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.3227433182047403, |
|
"grad_norm": 62280.6328125, |
|
"learning_rate": 3.3862834089762985e-05, |
|
"loss": 0.8086, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.32442427298705667, |
|
"grad_norm": 68005.015625, |
|
"learning_rate": 3.377878635064717e-05, |
|
"loss": 0.7433, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.326105227769373, |
|
"grad_norm": 58576.92578125, |
|
"learning_rate": 3.369473861153135e-05, |
|
"loss": 0.7728, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.32778618255168934, |
|
"grad_norm": 64847.859375, |
|
"learning_rate": 3.3610690872415536e-05, |
|
"loss": 0.7596, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.32946713733400573, |
|
"grad_norm": 63781.38671875, |
|
"learning_rate": 3.352664313329972e-05, |
|
"loss": 0.7308, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.33114809211632207, |
|
"grad_norm": 66446.5859375, |
|
"learning_rate": 3.3442595394183896e-05, |
|
"loss": 0.8058, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.3328290468986384, |
|
"grad_norm": 61722.046875, |
|
"learning_rate": 3.335854765506808e-05, |
|
"loss": 0.869, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.3345100016809548, |
|
"grad_norm": 66245.3359375, |
|
"learning_rate": 3.327449991595226e-05, |
|
"loss": 0.8153, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.33619095646327113, |
|
"grad_norm": 60609.90625, |
|
"learning_rate": 3.3190452176836446e-05, |
|
"loss": 0.8484, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3378719112455875, |
|
"grad_norm": 70234.8828125, |
|
"learning_rate": 3.310640443772063e-05, |
|
"loss": 0.8249, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.33955286602790385, |
|
"grad_norm": 52655.56640625, |
|
"learning_rate": 3.302235669860481e-05, |
|
"loss": 0.7768, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.3412338208102202, |
|
"grad_norm": 73065.375, |
|
"learning_rate": 3.293830895948899e-05, |
|
"loss": 0.8911, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.3429147755925366, |
|
"grad_norm": 59607.45703125, |
|
"learning_rate": 3.285426122037317e-05, |
|
"loss": 0.7773, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.3445957303748529, |
|
"grad_norm": 64399.9375, |
|
"learning_rate": 3.277021348125736e-05, |
|
"loss": 0.8161, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.34627668515716925, |
|
"grad_norm": 75411.3359375, |
|
"learning_rate": 3.268616574214154e-05, |
|
"loss": 0.7615, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.34795763993948564, |
|
"grad_norm": 69012.03125, |
|
"learning_rate": 3.2602118003025724e-05, |
|
"loss": 0.8245, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.349638594721802, |
|
"grad_norm": 69290.5625, |
|
"learning_rate": 3.25180702639099e-05, |
|
"loss": 0.786, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.35131954950411837, |
|
"grad_norm": 68525.0859375, |
|
"learning_rate": 3.2434022524794084e-05, |
|
"loss": 0.7572, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.3530005042864347, |
|
"grad_norm": 61886.20703125, |
|
"learning_rate": 3.234997478567827e-05, |
|
"loss": 0.8115, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.35468145906875104, |
|
"grad_norm": 73001.8828125, |
|
"learning_rate": 3.226592704656245e-05, |
|
"loss": 0.8336, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.3563624138510674, |
|
"grad_norm": 65161.6484375, |
|
"learning_rate": 3.2181879307446634e-05, |
|
"loss": 0.8124, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.35804336863338376, |
|
"grad_norm": 61001.9140625, |
|
"learning_rate": 3.209783156833082e-05, |
|
"loss": 0.8067, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.3597243234157001, |
|
"grad_norm": 56932.25, |
|
"learning_rate": 3.2013783829214994e-05, |
|
"loss": 0.77, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.3614052781980165, |
|
"grad_norm": 68792.859375, |
|
"learning_rate": 3.192973609009918e-05, |
|
"loss": 0.8217, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.3630862329803328, |
|
"grad_norm": 61329.98828125, |
|
"learning_rate": 3.184568835098336e-05, |
|
"loss": 0.7714, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.36476718776264916, |
|
"grad_norm": 66838.0078125, |
|
"learning_rate": 3.1761640611867545e-05, |
|
"loss": 0.8314, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.36644814254496555, |
|
"grad_norm": 73817.578125, |
|
"learning_rate": 3.167759287275173e-05, |
|
"loss": 0.8413, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.3681290973272819, |
|
"grad_norm": 67156.03125, |
|
"learning_rate": 3.1593545133635905e-05, |
|
"loss": 0.7996, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.3698100521095983, |
|
"grad_norm": 83176.359375, |
|
"learning_rate": 3.150949739452009e-05, |
|
"loss": 0.7875, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3714910068919146, |
|
"grad_norm": 68843.5859375, |
|
"learning_rate": 3.142544965540427e-05, |
|
"loss": 0.8156, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.37317196167423095, |
|
"grad_norm": 61444.25, |
|
"learning_rate": 3.1341401916288455e-05, |
|
"loss": 0.7747, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.37485291645654734, |
|
"grad_norm": 70228.59375, |
|
"learning_rate": 3.125735417717264e-05, |
|
"loss": 0.8088, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.37653387123886367, |
|
"grad_norm": 56036.578125, |
|
"learning_rate": 3.1173306438056816e-05, |
|
"loss": 0.7834, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.37821482602118, |
|
"grad_norm": 62951.3828125, |
|
"learning_rate": 3.1089258698941e-05, |
|
"loss": 0.7642, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.3798957808034964, |
|
"grad_norm": 66556.96875, |
|
"learning_rate": 3.100521095982518e-05, |
|
"loss": 0.8115, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.38157673558581273, |
|
"grad_norm": 83066.890625, |
|
"learning_rate": 3.0921163220709366e-05, |
|
"loss": 0.8311, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.3832576903681291, |
|
"grad_norm": 68849.5078125, |
|
"learning_rate": 3.083711548159355e-05, |
|
"loss": 0.7935, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.38493864515044546, |
|
"grad_norm": 68215.6953125, |
|
"learning_rate": 3.075306774247773e-05, |
|
"loss": 0.8071, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.3866195999327618, |
|
"grad_norm": 65878.6484375, |
|
"learning_rate": 3.066902000336191e-05, |
|
"loss": 0.9158, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.3883005547150782, |
|
"grad_norm": 59912.58203125, |
|
"learning_rate": 3.058497226424609e-05, |
|
"loss": 0.8217, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.3899815094973945, |
|
"grad_norm": 65707.3828125, |
|
"learning_rate": 3.0500924525130277e-05, |
|
"loss": 0.7693, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.39166246427971085, |
|
"grad_norm": 75819.90625, |
|
"learning_rate": 3.041687678601446e-05, |
|
"loss": 0.8007, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.39334341906202724, |
|
"grad_norm": 66755.734375, |
|
"learning_rate": 3.0332829046898644e-05, |
|
"loss": 0.8416, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.3950243738443436, |
|
"grad_norm": 68420.984375, |
|
"learning_rate": 3.024878130778282e-05, |
|
"loss": 0.8119, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.39670532862665997, |
|
"grad_norm": 70407.578125, |
|
"learning_rate": 3.0164733568667004e-05, |
|
"loss": 0.7907, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.3983862834089763, |
|
"grad_norm": 70415.9453125, |
|
"learning_rate": 3.0080685829551187e-05, |
|
"loss": 0.7508, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.40006723819129264, |
|
"grad_norm": 64331.296875, |
|
"learning_rate": 2.999663809043537e-05, |
|
"loss": 0.7812, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.40174819297360903, |
|
"grad_norm": 55358.15625, |
|
"learning_rate": 2.9912590351319554e-05, |
|
"loss": 0.8044, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.40342914775592537, |
|
"grad_norm": 74381.0078125, |
|
"learning_rate": 2.9828542612203734e-05, |
|
"loss": 0.7862, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4051101025382417, |
|
"grad_norm": 69503.421875, |
|
"learning_rate": 2.9744494873087914e-05, |
|
"loss": 0.8414, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.4067910573205581, |
|
"grad_norm": 63817.671875, |
|
"learning_rate": 2.9660447133972098e-05, |
|
"loss": 0.8239, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.4084720121028744, |
|
"grad_norm": 64829.07421875, |
|
"learning_rate": 2.957639939485628e-05, |
|
"loss": 0.8409, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.41015296688519076, |
|
"grad_norm": 90130.75, |
|
"learning_rate": 2.9492351655740465e-05, |
|
"loss": 0.7979, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.41183392166750715, |
|
"grad_norm": 65037.46484375, |
|
"learning_rate": 2.9408303916624648e-05, |
|
"loss": 0.7724, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.4135148764498235, |
|
"grad_norm": 74054.9765625, |
|
"learning_rate": 2.9324256177508825e-05, |
|
"loss": 0.9302, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.4151958312321399, |
|
"grad_norm": 65572.390625, |
|
"learning_rate": 2.924020843839301e-05, |
|
"loss": 0.8096, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.4168767860144562, |
|
"grad_norm": 70729.125, |
|
"learning_rate": 2.9156160699277192e-05, |
|
"loss": 0.7749, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.41855774079677255, |
|
"grad_norm": 60806.2265625, |
|
"learning_rate": 2.9072112960161375e-05, |
|
"loss": 0.7964, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.42023869557908894, |
|
"grad_norm": 66842.796875, |
|
"learning_rate": 2.898806522104556e-05, |
|
"loss": 0.7956, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4219196503614053, |
|
"grad_norm": 65062.66015625, |
|
"learning_rate": 2.8904017481929735e-05, |
|
"loss": 0.765, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.4236006051437216, |
|
"grad_norm": 70468.4921875, |
|
"learning_rate": 2.881996974281392e-05, |
|
"loss": 0.8029, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.425281559926038, |
|
"grad_norm": 63679.37109375, |
|
"learning_rate": 2.8735922003698102e-05, |
|
"loss": 0.8116, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.42696251470835433, |
|
"grad_norm": 79301.3984375, |
|
"learning_rate": 2.8651874264582286e-05, |
|
"loss": 0.8561, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.4286434694906707, |
|
"grad_norm": 66358.953125, |
|
"learning_rate": 2.856782652546647e-05, |
|
"loss": 0.8753, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.43032442427298706, |
|
"grad_norm": 64068.6875, |
|
"learning_rate": 2.848377878635065e-05, |
|
"loss": 0.7874, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.4320053790553034, |
|
"grad_norm": 70842.203125, |
|
"learning_rate": 2.839973104723483e-05, |
|
"loss": 0.8036, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.4336863338376198, |
|
"grad_norm": 69636.0390625, |
|
"learning_rate": 2.8315683308119013e-05, |
|
"loss": 0.8069, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.4353672886199361, |
|
"grad_norm": 89295.2421875, |
|
"learning_rate": 2.8231635569003196e-05, |
|
"loss": 0.8322, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.43704824340225246, |
|
"grad_norm": 70937.7265625, |
|
"learning_rate": 2.814758782988738e-05, |
|
"loss": 0.9161, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.43872919818456885, |
|
"grad_norm": 88488.953125, |
|
"learning_rate": 2.8063540090771563e-05, |
|
"loss": 0.7753, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.4404101529668852, |
|
"grad_norm": 68679.078125, |
|
"learning_rate": 2.797949235165574e-05, |
|
"loss": 0.8489, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.4420911077492016, |
|
"grad_norm": 62188.109375, |
|
"learning_rate": 2.7895444612539924e-05, |
|
"loss": 0.7826, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.4437720625315179, |
|
"grad_norm": 72500.828125, |
|
"learning_rate": 2.7811396873424107e-05, |
|
"loss": 0.8122, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.44545301731383424, |
|
"grad_norm": 61758.7421875, |
|
"learning_rate": 2.772734913430829e-05, |
|
"loss": 0.7645, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.44713397209615063, |
|
"grad_norm": 59873.4375, |
|
"learning_rate": 2.7643301395192474e-05, |
|
"loss": 0.7859, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.44881492687846697, |
|
"grad_norm": 78891.734375, |
|
"learning_rate": 2.755925365607665e-05, |
|
"loss": 0.7696, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.4504958816607833, |
|
"grad_norm": 69256.2578125, |
|
"learning_rate": 2.7475205916960834e-05, |
|
"loss": 0.8161, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.4521768364430997, |
|
"grad_norm": 68757.21875, |
|
"learning_rate": 2.7391158177845018e-05, |
|
"loss": 0.7662, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.45385779122541603, |
|
"grad_norm": 80412.984375, |
|
"learning_rate": 2.73071104387292e-05, |
|
"loss": 0.7875, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.45553874600773236, |
|
"grad_norm": 74670.53125, |
|
"learning_rate": 2.7223062699613385e-05, |
|
"loss": 0.7664, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.45721970079004876, |
|
"grad_norm": 69425.640625, |
|
"learning_rate": 2.7139014960497565e-05, |
|
"loss": 0.8521, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.4589006555723651, |
|
"grad_norm": 70010.1484375, |
|
"learning_rate": 2.7054967221381745e-05, |
|
"loss": 0.7819, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.4605816103546815, |
|
"grad_norm": 76622.5078125, |
|
"learning_rate": 2.6970919482265928e-05, |
|
"loss": 0.8632, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.4622625651369978, |
|
"grad_norm": 63262.62890625, |
|
"learning_rate": 2.6886871743150112e-05, |
|
"loss": 0.7303, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.46394351991931415, |
|
"grad_norm": 63295.68359375, |
|
"learning_rate": 2.6802824004034295e-05, |
|
"loss": 0.773, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.46562447470163054, |
|
"grad_norm": 61289.41015625, |
|
"learning_rate": 2.6718776264918475e-05, |
|
"loss": 0.7776, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.4673054294839469, |
|
"grad_norm": 75931.3671875, |
|
"learning_rate": 2.6634728525802655e-05, |
|
"loss": 0.826, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.4689863842662632, |
|
"grad_norm": 69288.890625, |
|
"learning_rate": 2.655068078668684e-05, |
|
"loss": 0.7978, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.4706673390485796, |
|
"grad_norm": 61528.40234375, |
|
"learning_rate": 2.6466633047571022e-05, |
|
"loss": 0.7891, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.47234829383089594, |
|
"grad_norm": 67145.6796875, |
|
"learning_rate": 2.6382585308455206e-05, |
|
"loss": 0.7483, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.47402924861321233, |
|
"grad_norm": 62958.0703125, |
|
"learning_rate": 2.629853756933939e-05, |
|
"loss": 0.8407, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.47571020339552866, |
|
"grad_norm": 71565.0078125, |
|
"learning_rate": 2.621448983022357e-05, |
|
"loss": 0.8093, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.477391158177845, |
|
"grad_norm": 70295.3125, |
|
"learning_rate": 2.613044209110775e-05, |
|
"loss": 0.8328, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.4790721129601614, |
|
"grad_norm": 66831.65625, |
|
"learning_rate": 2.6046394351991933e-05, |
|
"loss": 0.7844, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.4807530677424777, |
|
"grad_norm": 62661.7421875, |
|
"learning_rate": 2.5962346612876116e-05, |
|
"loss": 0.7925, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.48243402252479406, |
|
"grad_norm": 61856.66796875, |
|
"learning_rate": 2.58782988737603e-05, |
|
"loss": 0.7488, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.48411497730711045, |
|
"grad_norm": 73633.4765625, |
|
"learning_rate": 2.579425113464448e-05, |
|
"loss": 0.7881, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.4857959320894268, |
|
"grad_norm": 69115.28125, |
|
"learning_rate": 2.571020339552866e-05, |
|
"loss": 0.8175, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.4874768868717432, |
|
"grad_norm": 61891.3828125, |
|
"learning_rate": 2.5626155656412843e-05, |
|
"loss": 0.8784, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.4891578416540595, |
|
"grad_norm": 77769.9921875, |
|
"learning_rate": 2.5542107917297027e-05, |
|
"loss": 0.8684, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.49083879643637585, |
|
"grad_norm": 68048.734375, |
|
"learning_rate": 2.545806017818121e-05, |
|
"loss": 0.7853, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.49251975121869224, |
|
"grad_norm": 61355.94140625, |
|
"learning_rate": 2.537401243906539e-05, |
|
"loss": 0.7845, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.49420070600100857, |
|
"grad_norm": 69287.953125, |
|
"learning_rate": 2.528996469994957e-05, |
|
"loss": 0.8007, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.4958816607833249, |
|
"grad_norm": 68851.8359375, |
|
"learning_rate": 2.5205916960833754e-05, |
|
"loss": 0.8159, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.4975626155656413, |
|
"grad_norm": 69794.859375, |
|
"learning_rate": 2.5121869221717938e-05, |
|
"loss": 0.7958, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.49924357034795763, |
|
"grad_norm": 73403.1953125, |
|
"learning_rate": 2.503782148260212e-05, |
|
"loss": 0.7855, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.500924525130274, |
|
"grad_norm": 74224.1328125, |
|
"learning_rate": 2.49537737434863e-05, |
|
"loss": 0.7728, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.5026054799125903, |
|
"grad_norm": 63414.859375, |
|
"learning_rate": 2.4869726004370485e-05, |
|
"loss": 0.8105, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.5042864346949067, |
|
"grad_norm": 77378.109375, |
|
"learning_rate": 2.4785678265254668e-05, |
|
"loss": 0.8636, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5059673894772231, |
|
"grad_norm": 70468.2734375, |
|
"learning_rate": 2.4701630526138848e-05, |
|
"loss": 0.7242, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.5076483442595394, |
|
"grad_norm": 60809.28515625, |
|
"learning_rate": 2.461758278702303e-05, |
|
"loss": 0.8078, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.5093292990418558, |
|
"grad_norm": 51373.625, |
|
"learning_rate": 2.4533535047907215e-05, |
|
"loss": 0.8366, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.5110102538241721, |
|
"grad_norm": 59719.765625, |
|
"learning_rate": 2.4449487308791395e-05, |
|
"loss": 0.8665, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5126912086064885, |
|
"grad_norm": 64678.41015625, |
|
"learning_rate": 2.436543956967558e-05, |
|
"loss": 0.7776, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5143721633888049, |
|
"grad_norm": 74790.40625, |
|
"learning_rate": 2.4281391830559762e-05, |
|
"loss": 0.7729, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.5160531181711212, |
|
"grad_norm": 71427.921875, |
|
"learning_rate": 2.4197344091443942e-05, |
|
"loss": 0.7956, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.5177340729534375, |
|
"grad_norm": 66004.3671875, |
|
"learning_rate": 2.4113296352328126e-05, |
|
"loss": 0.7588, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.5194150277357539, |
|
"grad_norm": 64083.765625, |
|
"learning_rate": 2.4029248613212306e-05, |
|
"loss": 0.7931, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.5210959825180702, |
|
"grad_norm": 54726.55078125, |
|
"learning_rate": 2.394520087409649e-05, |
|
"loss": 0.7163, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5227769373003867, |
|
"grad_norm": 73129.3125, |
|
"learning_rate": 2.3861153134980673e-05, |
|
"loss": 0.8364, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.524457892082703, |
|
"grad_norm": 62834.08984375, |
|
"learning_rate": 2.3777105395864853e-05, |
|
"loss": 0.7431, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.5261388468650193, |
|
"grad_norm": 61612.23828125, |
|
"learning_rate": 2.3693057656749036e-05, |
|
"loss": 0.7975, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.5278198016473357, |
|
"grad_norm": 71137.6640625, |
|
"learning_rate": 2.3609009917633216e-05, |
|
"loss": 0.783, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.529500756429652, |
|
"grad_norm": 63406.10546875, |
|
"learning_rate": 2.35249621785174e-05, |
|
"loss": 0.824, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5311817112119684, |
|
"grad_norm": 64795.73828125, |
|
"learning_rate": 2.3440914439401583e-05, |
|
"loss": 0.8715, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.5328626659942848, |
|
"grad_norm": 59525.79296875, |
|
"learning_rate": 2.3356866700285763e-05, |
|
"loss": 0.7934, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.5345436207766011, |
|
"grad_norm": 72914.4609375, |
|
"learning_rate": 2.3272818961169947e-05, |
|
"loss": 0.7243, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.5362245755589174, |
|
"grad_norm": 62966.078125, |
|
"learning_rate": 2.318877122205413e-05, |
|
"loss": 0.712, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.5379055303412338, |
|
"grad_norm": 66647.0546875, |
|
"learning_rate": 2.310472348293831e-05, |
|
"loss": 0.7674, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5395864851235502, |
|
"grad_norm": 65908.0703125, |
|
"learning_rate": 2.3020675743822494e-05, |
|
"loss": 0.8484, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.5412674399058666, |
|
"grad_norm": 60565.56640625, |
|
"learning_rate": 2.2936628004706674e-05, |
|
"loss": 0.7844, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.5429483946881829, |
|
"grad_norm": 67560.3828125, |
|
"learning_rate": 2.2852580265590857e-05, |
|
"loss": 0.9071, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.5446293494704992, |
|
"grad_norm": 66084.265625, |
|
"learning_rate": 2.276853252647504e-05, |
|
"loss": 0.7323, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.5463103042528156, |
|
"grad_norm": 78780.078125, |
|
"learning_rate": 2.268448478735922e-05, |
|
"loss": 0.9259, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.5479912590351319, |
|
"grad_norm": 67810.703125, |
|
"learning_rate": 2.2600437048243404e-05, |
|
"loss": 0.7813, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.5496722138174484, |
|
"grad_norm": 62699.97265625, |
|
"learning_rate": 2.2516389309127588e-05, |
|
"loss": 0.7831, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.5513531685997647, |
|
"grad_norm": 66553.1640625, |
|
"learning_rate": 2.2432341570011768e-05, |
|
"loss": 0.8185, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.553034123382081, |
|
"grad_norm": 60711.96875, |
|
"learning_rate": 2.234829383089595e-05, |
|
"loss": 0.7551, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.5547150781643974, |
|
"grad_norm": 64588.28515625, |
|
"learning_rate": 2.226424609178013e-05, |
|
"loss": 0.6919, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5563960329467137, |
|
"grad_norm": 73545.9765625, |
|
"learning_rate": 2.2180198352664315e-05, |
|
"loss": 0.7834, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.5580769877290301, |
|
"grad_norm": 66515.1796875, |
|
"learning_rate": 2.20961506135485e-05, |
|
"loss": 0.8135, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.5597579425113465, |
|
"grad_norm": 66021.4140625, |
|
"learning_rate": 2.201210287443268e-05, |
|
"loss": 0.7767, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.5614388972936628, |
|
"grad_norm": 72027.734375, |
|
"learning_rate": 2.1928055135316862e-05, |
|
"loss": 0.7623, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.5631198520759791, |
|
"grad_norm": 69724.7109375, |
|
"learning_rate": 2.1844007396201046e-05, |
|
"loss": 0.8539, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.5648008068582955, |
|
"grad_norm": 70565.2890625, |
|
"learning_rate": 2.1759959657085226e-05, |
|
"loss": 0.8081, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.5664817616406118, |
|
"grad_norm": 69982.8515625, |
|
"learning_rate": 2.167591191796941e-05, |
|
"loss": 0.766, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.5681627164229283, |
|
"grad_norm": 73730.5859375, |
|
"learning_rate": 2.159186417885359e-05, |
|
"loss": 0.7858, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.5698436712052446, |
|
"grad_norm": 68504.3359375, |
|
"learning_rate": 2.1507816439737773e-05, |
|
"loss": 0.7774, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.5715246259875609, |
|
"grad_norm": 68371.71875, |
|
"learning_rate": 2.1423768700621956e-05, |
|
"loss": 0.7397, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5732055807698773, |
|
"grad_norm": 62352.87890625, |
|
"learning_rate": 2.1339720961506136e-05, |
|
"loss": 0.7727, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.5748865355521936, |
|
"grad_norm": 67821.3671875, |
|
"learning_rate": 2.125567322239032e-05, |
|
"loss": 0.7494, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.57656749033451, |
|
"grad_norm": 75150.0703125, |
|
"learning_rate": 2.1171625483274503e-05, |
|
"loss": 0.738, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.5782484451168264, |
|
"grad_norm": 71489.2109375, |
|
"learning_rate": 2.1087577744158683e-05, |
|
"loss": 0.7666, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.5799293998991427, |
|
"grad_norm": 77000.265625, |
|
"learning_rate": 2.1003530005042867e-05, |
|
"loss": 0.8051, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.581610354681459, |
|
"grad_norm": 63612.04296875, |
|
"learning_rate": 2.0919482265927047e-05, |
|
"loss": 0.7408, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.5832913094637754, |
|
"grad_norm": 65412.390625, |
|
"learning_rate": 2.083543452681123e-05, |
|
"loss": 0.7585, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.5849722642460918, |
|
"grad_norm": 63992.859375, |
|
"learning_rate": 2.0751386787695414e-05, |
|
"loss": 0.8036, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.5866532190284082, |
|
"grad_norm": 67541.6015625, |
|
"learning_rate": 2.0667339048579594e-05, |
|
"loss": 0.7835, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.5883341738107245, |
|
"grad_norm": 87275.59375, |
|
"learning_rate": 2.0583291309463777e-05, |
|
"loss": 0.843, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5900151285930408, |
|
"grad_norm": 62353.66015625, |
|
"learning_rate": 2.0499243570347957e-05, |
|
"loss": 0.8229, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.5916960833753572, |
|
"grad_norm": 76160.890625, |
|
"learning_rate": 2.041519583123214e-05, |
|
"loss": 0.839, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.5933770381576735, |
|
"grad_norm": 71393.7109375, |
|
"learning_rate": 2.0331148092116324e-05, |
|
"loss": 0.7136, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.59505799293999, |
|
"grad_norm": 70572.75, |
|
"learning_rate": 2.0247100353000504e-05, |
|
"loss": 0.7987, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.5967389477223063, |
|
"grad_norm": 66998.0703125, |
|
"learning_rate": 2.0163052613884688e-05, |
|
"loss": 0.7874, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.5984199025046226, |
|
"grad_norm": 53749.26953125, |
|
"learning_rate": 2.007900487476887e-05, |
|
"loss": 0.767, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.600100857286939, |
|
"grad_norm": 71350.6328125, |
|
"learning_rate": 1.999495713565305e-05, |
|
"loss": 0.8136, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.6017818120692553, |
|
"grad_norm": 64163.25390625, |
|
"learning_rate": 1.9910909396537235e-05, |
|
"loss": 0.7565, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.6034627668515717, |
|
"grad_norm": 61004.05078125, |
|
"learning_rate": 1.9826861657421415e-05, |
|
"loss": 0.7502, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.6051437216338881, |
|
"grad_norm": 66495.4375, |
|
"learning_rate": 1.97428139183056e-05, |
|
"loss": 0.7485, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6068246764162044, |
|
"grad_norm": 62442.1015625, |
|
"learning_rate": 1.9658766179189782e-05, |
|
"loss": 0.7484, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.6085056311985207, |
|
"grad_norm": 66803.453125, |
|
"learning_rate": 1.9574718440073962e-05, |
|
"loss": 0.8459, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.6101865859808371, |
|
"grad_norm": 71394.6875, |
|
"learning_rate": 1.9490670700958145e-05, |
|
"loss": 0.8108, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.6118675407631534, |
|
"grad_norm": 72386.0859375, |
|
"learning_rate": 1.940662296184233e-05, |
|
"loss": 0.8039, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.6135484955454699, |
|
"grad_norm": 69155.015625, |
|
"learning_rate": 1.932257522272651e-05, |
|
"loss": 0.823, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.6152294503277862, |
|
"grad_norm": 60343.54296875, |
|
"learning_rate": 1.9238527483610693e-05, |
|
"loss": 0.7284, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.6169104051101025, |
|
"grad_norm": 70258.7734375, |
|
"learning_rate": 1.9154479744494873e-05, |
|
"loss": 0.7925, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.6185913598924189, |
|
"grad_norm": 68638.21875, |
|
"learning_rate": 1.9070432005379056e-05, |
|
"loss": 0.7802, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.6202723146747352, |
|
"grad_norm": 67208.890625, |
|
"learning_rate": 1.898638426626324e-05, |
|
"loss": 0.7921, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.6219532694570516, |
|
"grad_norm": 56271.5703125, |
|
"learning_rate": 1.890233652714742e-05, |
|
"loss": 0.7752, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.623634224239368, |
|
"grad_norm": 84835.21875, |
|
"learning_rate": 1.8818288788031603e-05, |
|
"loss": 0.8012, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.6253151790216843, |
|
"grad_norm": 58683.70703125, |
|
"learning_rate": 1.8734241048915787e-05, |
|
"loss": 0.7547, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.6269961338040007, |
|
"grad_norm": 66791.8828125, |
|
"learning_rate": 1.8650193309799967e-05, |
|
"loss": 0.782, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.628677088586317, |
|
"grad_norm": 60903.69140625, |
|
"learning_rate": 1.856614557068415e-05, |
|
"loss": 0.7304, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.6303580433686334, |
|
"grad_norm": 63727.53515625, |
|
"learning_rate": 1.848209783156833e-05, |
|
"loss": 0.7591, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.6320389981509498, |
|
"grad_norm": 65615.4140625, |
|
"learning_rate": 1.8398050092452514e-05, |
|
"loss": 0.7483, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.6337199529332661, |
|
"grad_norm": 72326.921875, |
|
"learning_rate": 1.8314002353336697e-05, |
|
"loss": 0.7795, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.6354009077155824, |
|
"grad_norm": 65443.81640625, |
|
"learning_rate": 1.8229954614220877e-05, |
|
"loss": 0.8154, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.6370818624978988, |
|
"grad_norm": 66016.125, |
|
"learning_rate": 1.814590687510506e-05, |
|
"loss": 0.8109, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.6387628172802151, |
|
"grad_norm": 70201.703125, |
|
"learning_rate": 1.8061859135989244e-05, |
|
"loss": 0.8039, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6404437720625316, |
|
"grad_norm": 72543.0859375, |
|
"learning_rate": 1.7977811396873424e-05, |
|
"loss": 0.8201, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.6421247268448479, |
|
"grad_norm": 65741.5703125, |
|
"learning_rate": 1.7893763657757608e-05, |
|
"loss": 0.7919, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.6438056816271642, |
|
"grad_norm": 67635.5078125, |
|
"learning_rate": 1.7809715918641788e-05, |
|
"loss": 0.8174, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.6454866364094806, |
|
"grad_norm": 64830.48046875, |
|
"learning_rate": 1.772566817952597e-05, |
|
"loss": 0.7375, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.6471675911917969, |
|
"grad_norm": 66888.5234375, |
|
"learning_rate": 1.7641620440410155e-05, |
|
"loss": 0.7547, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.6488485459741133, |
|
"grad_norm": 67550.5234375, |
|
"learning_rate": 1.7557572701294335e-05, |
|
"loss": 0.7532, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.6505295007564297, |
|
"grad_norm": 66258.5859375, |
|
"learning_rate": 1.747352496217852e-05, |
|
"loss": 0.7623, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.652210455538746, |
|
"grad_norm": 70212.1875, |
|
"learning_rate": 1.7389477223062702e-05, |
|
"loss": 0.8022, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.6538914103210623, |
|
"grad_norm": 69388.6171875, |
|
"learning_rate": 1.7305429483946882e-05, |
|
"loss": 0.7672, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.6555723651033787, |
|
"grad_norm": 61498.984375, |
|
"learning_rate": 1.7221381744831065e-05, |
|
"loss": 0.7243, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.657253319885695, |
|
"grad_norm": 73348.640625, |
|
"learning_rate": 1.7137334005715245e-05, |
|
"loss": 0.7699, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.6589342746680115, |
|
"grad_norm": 76186.703125, |
|
"learning_rate": 1.705328626659943e-05, |
|
"loss": 0.8293, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.6606152294503278, |
|
"grad_norm": 72204.1171875, |
|
"learning_rate": 1.6969238527483612e-05, |
|
"loss": 0.8325, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.6622961842326441, |
|
"grad_norm": 68690.1328125, |
|
"learning_rate": 1.6885190788367792e-05, |
|
"loss": 0.8174, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.6639771390149605, |
|
"grad_norm": 69304.6015625, |
|
"learning_rate": 1.6801143049251976e-05, |
|
"loss": 0.7395, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.6656580937972768, |
|
"grad_norm": 67933.7890625, |
|
"learning_rate": 1.6717095310136156e-05, |
|
"loss": 0.7976, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.6673390485795933, |
|
"grad_norm": 74347.6953125, |
|
"learning_rate": 1.663304757102034e-05, |
|
"loss": 0.8003, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.6690200033619096, |
|
"grad_norm": 74672.625, |
|
"learning_rate": 1.6548999831904523e-05, |
|
"loss": 0.7809, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.6707009581442259, |
|
"grad_norm": 69611.8984375, |
|
"learning_rate": 1.6464952092788703e-05, |
|
"loss": 0.7884, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.6723819129265423, |
|
"grad_norm": 61687.546875, |
|
"learning_rate": 1.6380904353672887e-05, |
|
"loss": 0.7726, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6740628677088586, |
|
"grad_norm": 57029.6953125, |
|
"learning_rate": 1.629685661455707e-05, |
|
"loss": 0.7537, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.675743822491175, |
|
"grad_norm": 62528.2265625, |
|
"learning_rate": 1.621280887544125e-05, |
|
"loss": 0.7542, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.6774247772734914, |
|
"grad_norm": 69657.46875, |
|
"learning_rate": 1.6128761136325434e-05, |
|
"loss": 0.7846, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.6791057320558077, |
|
"grad_norm": 61733.50390625, |
|
"learning_rate": 1.6044713397209614e-05, |
|
"loss": 0.7342, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.680786686838124, |
|
"grad_norm": 63571.60546875, |
|
"learning_rate": 1.5960665658093797e-05, |
|
"loss": 0.7046, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.6824676416204404, |
|
"grad_norm": 80437.828125, |
|
"learning_rate": 1.587661791897798e-05, |
|
"loss": 0.8413, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.6841485964027567, |
|
"grad_norm": 107205.921875, |
|
"learning_rate": 1.579257017986216e-05, |
|
"loss": 0.8052, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.6858295511850732, |
|
"grad_norm": 71237.375, |
|
"learning_rate": 1.5708522440746344e-05, |
|
"loss": 0.7729, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.6875105059673895, |
|
"grad_norm": 61227.4296875, |
|
"learning_rate": 1.5624474701630528e-05, |
|
"loss": 0.7972, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.6891914607497058, |
|
"grad_norm": 81108.9296875, |
|
"learning_rate": 1.5540426962514708e-05, |
|
"loss": 0.7173, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.6908724155320222, |
|
"grad_norm": 58387.5703125, |
|
"learning_rate": 1.545637922339889e-05, |
|
"loss": 0.8341, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.6925533703143385, |
|
"grad_norm": 77154.796875, |
|
"learning_rate": 1.537233148428307e-05, |
|
"loss": 0.7459, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.694234325096655, |
|
"grad_norm": 66626.140625, |
|
"learning_rate": 1.5288283745167255e-05, |
|
"loss": 0.7965, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.6959152798789713, |
|
"grad_norm": 65993.8671875, |
|
"learning_rate": 1.5204236006051437e-05, |
|
"loss": 0.8077, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.6975962346612876, |
|
"grad_norm": 65909.390625, |
|
"learning_rate": 1.512018826693562e-05, |
|
"loss": 0.7933, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.699277189443604, |
|
"grad_norm": 73940.3671875, |
|
"learning_rate": 1.5036140527819803e-05, |
|
"loss": 0.7937, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.7009581442259203, |
|
"grad_norm": 80516.8203125, |
|
"learning_rate": 1.4952092788703984e-05, |
|
"loss": 0.7921, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.7026390990082367, |
|
"grad_norm": 62047.8046875, |
|
"learning_rate": 1.4868045049588167e-05, |
|
"loss": 0.7405, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.7043200537905531, |
|
"grad_norm": 68648.3828125, |
|
"learning_rate": 1.478399731047235e-05, |
|
"loss": 0.7956, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.7060010085728694, |
|
"grad_norm": 74798.203125, |
|
"learning_rate": 1.469994957135653e-05, |
|
"loss": 0.7026, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7076819633551857, |
|
"grad_norm": 65563.265625, |
|
"learning_rate": 1.4615901832240714e-05, |
|
"loss": 0.7846, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.7093629181375021, |
|
"grad_norm": 76866.96875, |
|
"learning_rate": 1.4531854093124898e-05, |
|
"loss": 0.829, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.7110438729198184, |
|
"grad_norm": 64904.796875, |
|
"learning_rate": 1.4447806354009078e-05, |
|
"loss": 0.8053, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.7127248277021349, |
|
"grad_norm": 72151.6484375, |
|
"learning_rate": 1.4363758614893261e-05, |
|
"loss": 0.7583, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.7144057824844512, |
|
"grad_norm": 57045.8203125, |
|
"learning_rate": 1.4279710875777441e-05, |
|
"loss": 0.7584, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.7160867372667675, |
|
"grad_norm": 74145.6953125, |
|
"learning_rate": 1.4195663136661625e-05, |
|
"loss": 0.7637, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.7177676920490839, |
|
"grad_norm": 63434.96875, |
|
"learning_rate": 1.4111615397545808e-05, |
|
"loss": 0.8278, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.7194486468314002, |
|
"grad_norm": 70511.5546875, |
|
"learning_rate": 1.4027567658429988e-05, |
|
"loss": 0.7512, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.7211296016137166, |
|
"grad_norm": 65691.6640625, |
|
"learning_rate": 1.3943519919314172e-05, |
|
"loss": 0.7509, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.722810556396033, |
|
"grad_norm": 72238.515625, |
|
"learning_rate": 1.3859472180198355e-05, |
|
"loss": 0.7977, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7244915111783493, |
|
"grad_norm": 79115.5625, |
|
"learning_rate": 1.3775424441082535e-05, |
|
"loss": 0.8284, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.7261724659606656, |
|
"grad_norm": 64790.1484375, |
|
"learning_rate": 1.3691376701966719e-05, |
|
"loss": 0.7124, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.727853420742982, |
|
"grad_norm": 75220.65625, |
|
"learning_rate": 1.3607328962850899e-05, |
|
"loss": 0.7687, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.7295343755252983, |
|
"grad_norm": 60745.61328125, |
|
"learning_rate": 1.3523281223735082e-05, |
|
"loss": 0.7283, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.7312153303076148, |
|
"grad_norm": 54197.08203125, |
|
"learning_rate": 1.3439233484619266e-05, |
|
"loss": 0.768, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.7328962850899311, |
|
"grad_norm": 61422.91015625, |
|
"learning_rate": 1.3355185745503446e-05, |
|
"loss": 0.7324, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.7345772398722474, |
|
"grad_norm": 68666.9765625, |
|
"learning_rate": 1.327113800638763e-05, |
|
"loss": 0.8131, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.7362581946545638, |
|
"grad_norm": 54904.734375, |
|
"learning_rate": 1.3187090267271813e-05, |
|
"loss": 0.751, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.7379391494368801, |
|
"grad_norm": 74403.234375, |
|
"learning_rate": 1.3103042528155993e-05, |
|
"loss": 0.7564, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.7396201042191966, |
|
"grad_norm": 83832.09375, |
|
"learning_rate": 1.3018994789040176e-05, |
|
"loss": 0.7588, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7413010590015129, |
|
"grad_norm": 65015.13671875, |
|
"learning_rate": 1.2934947049924356e-05, |
|
"loss": 0.7333, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.7429820137838292, |
|
"grad_norm": 70224.0234375, |
|
"learning_rate": 1.285089931080854e-05, |
|
"loss": 0.7437, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.7446629685661456, |
|
"grad_norm": 65735.6015625, |
|
"learning_rate": 1.2766851571692723e-05, |
|
"loss": 0.7311, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.7463439233484619, |
|
"grad_norm": 64739.11328125, |
|
"learning_rate": 1.2682803832576903e-05, |
|
"loss": 0.752, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.7480248781307783, |
|
"grad_norm": 84499.734375, |
|
"learning_rate": 1.2598756093461087e-05, |
|
"loss": 0.7085, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.7497058329130947, |
|
"grad_norm": 58072.46484375, |
|
"learning_rate": 1.251470835434527e-05, |
|
"loss": 0.7869, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.751386787695411, |
|
"grad_norm": 60343.64453125, |
|
"learning_rate": 1.243066061522945e-05, |
|
"loss": 0.748, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.7530677424777273, |
|
"grad_norm": 73976.90625, |
|
"learning_rate": 1.2346612876113634e-05, |
|
"loss": 0.8125, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.7547486972600437, |
|
"grad_norm": 83017.234375, |
|
"learning_rate": 1.2262565136997816e-05, |
|
"loss": 0.7647, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.75642965204236, |
|
"grad_norm": 63707.15625, |
|
"learning_rate": 1.2178517397881997e-05, |
|
"loss": 0.689, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7581106068246765, |
|
"grad_norm": 62471.03125, |
|
"learning_rate": 1.2094469658766181e-05, |
|
"loss": 0.7642, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.7597915616069928, |
|
"grad_norm": 71938.2421875, |
|
"learning_rate": 1.2010421919650363e-05, |
|
"loss": 0.7018, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.7614725163893091, |
|
"grad_norm": 62137.34375, |
|
"learning_rate": 1.1926374180534544e-05, |
|
"loss": 0.7742, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.7631534711716255, |
|
"grad_norm": 74779.3515625, |
|
"learning_rate": 1.1842326441418726e-05, |
|
"loss": 0.76, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.7648344259539418, |
|
"grad_norm": 65167.984375, |
|
"learning_rate": 1.175827870230291e-05, |
|
"loss": 0.7814, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.7665153807362582, |
|
"grad_norm": 64530.6015625, |
|
"learning_rate": 1.1674230963187092e-05, |
|
"loss": 0.7496, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.7681963355185746, |
|
"grad_norm": 69663.7109375, |
|
"learning_rate": 1.1590183224071273e-05, |
|
"loss": 0.794, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.7698772903008909, |
|
"grad_norm": 58575.296875, |
|
"learning_rate": 1.1506135484955455e-05, |
|
"loss": 0.7438, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.7715582450832073, |
|
"grad_norm": 60153.59375, |
|
"learning_rate": 1.1422087745839639e-05, |
|
"loss": 0.7246, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.7732391998655236, |
|
"grad_norm": 63029.4140625, |
|
"learning_rate": 1.133804000672382e-05, |
|
"loss": 0.7449, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.7749201546478399, |
|
"grad_norm": 70431.859375, |
|
"learning_rate": 1.1253992267608002e-05, |
|
"loss": 0.8162, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.7766011094301564, |
|
"grad_norm": 56848.18359375, |
|
"learning_rate": 1.1169944528492184e-05, |
|
"loss": 0.8272, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.7782820642124727, |
|
"grad_norm": 63638.44921875, |
|
"learning_rate": 1.1085896789376367e-05, |
|
"loss": 0.6735, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.779963018994789, |
|
"grad_norm": 81583.1328125, |
|
"learning_rate": 1.1001849050260549e-05, |
|
"loss": 0.7195, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.7816439737771054, |
|
"grad_norm": 65090.41796875, |
|
"learning_rate": 1.0917801311144731e-05, |
|
"loss": 0.7348, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.7833249285594217, |
|
"grad_norm": 81298.7265625, |
|
"learning_rate": 1.0833753572028913e-05, |
|
"loss": 0.7036, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.7850058833417382, |
|
"grad_norm": 72416.609375, |
|
"learning_rate": 1.0749705832913096e-05, |
|
"loss": 0.759, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.7866868381240545, |
|
"grad_norm": 62808.859375, |
|
"learning_rate": 1.0665658093797278e-05, |
|
"loss": 0.8043, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.7883677929063708, |
|
"grad_norm": 57125.68359375, |
|
"learning_rate": 1.058161035468146e-05, |
|
"loss": 0.695, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.7900487476886872, |
|
"grad_norm": 70024.90625, |
|
"learning_rate": 1.0497562615565642e-05, |
|
"loss": 0.8026, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.7917297024710035, |
|
"grad_norm": 67909.5, |
|
"learning_rate": 1.0413514876449825e-05, |
|
"loss": 0.7139, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.7934106572533199, |
|
"grad_norm": 63104.14453125, |
|
"learning_rate": 1.0329467137334007e-05, |
|
"loss": 0.7578, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.7950916120356363, |
|
"grad_norm": 59612.7578125, |
|
"learning_rate": 1.0245419398218189e-05, |
|
"loss": 0.8141, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.7967725668179526, |
|
"grad_norm": 61766.25, |
|
"learning_rate": 1.016137165910237e-05, |
|
"loss": 0.7066, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.798453521600269, |
|
"grad_norm": 61020.91015625, |
|
"learning_rate": 1.0077323919986554e-05, |
|
"loss": 0.7129, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.8001344763825853, |
|
"grad_norm": 73282.953125, |
|
"learning_rate": 9.993276180870736e-06, |
|
"loss": 0.7443, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.8018154311649016, |
|
"grad_norm": 61251.37890625, |
|
"learning_rate": 9.909228441754917e-06, |
|
"loss": 0.7202, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.8034963859472181, |
|
"grad_norm": 57658.30859375, |
|
"learning_rate": 9.825180702639099e-06, |
|
"loss": 0.7261, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.8051773407295344, |
|
"grad_norm": 58032.66796875, |
|
"learning_rate": 9.741132963523283e-06, |
|
"loss": 0.7772, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.8068582955118507, |
|
"grad_norm": 67425.0703125, |
|
"learning_rate": 9.657085224407464e-06, |
|
"loss": 0.75, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8085392502941671, |
|
"grad_norm": 61734.875, |
|
"learning_rate": 9.573037485291646e-06, |
|
"loss": 0.7046, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.8102202050764834, |
|
"grad_norm": 68259.4453125, |
|
"learning_rate": 9.48898974617583e-06, |
|
"loss": 0.7702, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.8119011598587998, |
|
"grad_norm": 62056.20703125, |
|
"learning_rate": 9.404942007060011e-06, |
|
"loss": 0.7043, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.8135821146411162, |
|
"grad_norm": 65011.296875, |
|
"learning_rate": 9.320894267944193e-06, |
|
"loss": 0.7163, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.8152630694234325, |
|
"grad_norm": 74655.8203125, |
|
"learning_rate": 9.236846528828375e-06, |
|
"loss": 0.8321, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.8169440242057489, |
|
"grad_norm": 75672.0703125, |
|
"learning_rate": 9.152798789712558e-06, |
|
"loss": 0.7922, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.8186249789880652, |
|
"grad_norm": 61180.29296875, |
|
"learning_rate": 9.06875105059674e-06, |
|
"loss": 0.7159, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.8203059337703815, |
|
"grad_norm": 87931.515625, |
|
"learning_rate": 8.984703311480922e-06, |
|
"loss": 0.6781, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.821986888552698, |
|
"grad_norm": 73175.0390625, |
|
"learning_rate": 8.900655572365104e-06, |
|
"loss": 0.7326, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.8236678433350143, |
|
"grad_norm": 63801.890625, |
|
"learning_rate": 8.816607833249287e-06, |
|
"loss": 0.7975, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8253487981173306, |
|
"grad_norm": 80039.4375, |
|
"learning_rate": 8.732560094133469e-06, |
|
"loss": 0.6984, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.827029752899647, |
|
"grad_norm": 71785.921875, |
|
"learning_rate": 8.64851235501765e-06, |
|
"loss": 0.7481, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.8287107076819633, |
|
"grad_norm": 72605.5234375, |
|
"learning_rate": 8.564464615901833e-06, |
|
"loss": 0.7317, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.8303916624642798, |
|
"grad_norm": 72289.1171875, |
|
"learning_rate": 8.480416876786016e-06, |
|
"loss": 0.752, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.8320726172465961, |
|
"grad_norm": 69316.015625, |
|
"learning_rate": 8.396369137670198e-06, |
|
"loss": 0.722, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.8337535720289124, |
|
"grad_norm": 68252.515625, |
|
"learning_rate": 8.31232139855438e-06, |
|
"loss": 0.7674, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.8354345268112288, |
|
"grad_norm": 66645.7421875, |
|
"learning_rate": 8.228273659438561e-06, |
|
"loss": 0.7386, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.8371154815935451, |
|
"grad_norm": 64145.96875, |
|
"learning_rate": 8.144225920322745e-06, |
|
"loss": 0.7213, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.8387964363758615, |
|
"grad_norm": 62728.8125, |
|
"learning_rate": 8.060178181206927e-06, |
|
"loss": 0.6535, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.8404773911581779, |
|
"grad_norm": 72365.625, |
|
"learning_rate": 7.976130442091108e-06, |
|
"loss": 0.6833, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8421583459404942, |
|
"grad_norm": 62805.015625, |
|
"learning_rate": 7.89208270297529e-06, |
|
"loss": 0.8574, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.8438393007228105, |
|
"grad_norm": 65521.5703125, |
|
"learning_rate": 7.808034963859474e-06, |
|
"loss": 0.7408, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.8455202555051269, |
|
"grad_norm": 67055.6171875, |
|
"learning_rate": 7.723987224743655e-06, |
|
"loss": 0.6515, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.8472012102874432, |
|
"grad_norm": 56498.43359375, |
|
"learning_rate": 7.639939485627837e-06, |
|
"loss": 0.8291, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.8488821650697597, |
|
"grad_norm": 68116.96875, |
|
"learning_rate": 7.555891746512018e-06, |
|
"loss": 0.7886, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.850563119852076, |
|
"grad_norm": 64178.546875, |
|
"learning_rate": 7.471844007396202e-06, |
|
"loss": 0.7746, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.8522440746343923, |
|
"grad_norm": 63573.09375, |
|
"learning_rate": 7.387796268280383e-06, |
|
"loss": 0.7769, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.8539250294167087, |
|
"grad_norm": 57395.2421875, |
|
"learning_rate": 7.303748529164565e-06, |
|
"loss": 0.7334, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.855605984199025, |
|
"grad_norm": 74868.375, |
|
"learning_rate": 7.219700790048749e-06, |
|
"loss": 0.7575, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.8572869389813415, |
|
"grad_norm": 69290.953125, |
|
"learning_rate": 7.1356530509329304e-06, |
|
"loss": 0.6967, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.8589678937636578, |
|
"grad_norm": 59573.5234375, |
|
"learning_rate": 7.051605311817112e-06, |
|
"loss": 0.8442, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.8606488485459741, |
|
"grad_norm": 61021.09375, |
|
"learning_rate": 6.967557572701294e-06, |
|
"loss": 0.7069, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.8623298033282905, |
|
"grad_norm": 71908.3359375, |
|
"learning_rate": 6.8835098335854775e-06, |
|
"loss": 0.6972, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.8640107581106068, |
|
"grad_norm": 71261.375, |
|
"learning_rate": 6.799462094469659e-06, |
|
"loss": 0.7308, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.8656917128929231, |
|
"grad_norm": 76563.328125, |
|
"learning_rate": 6.715414355353841e-06, |
|
"loss": 0.8391, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.8673726676752396, |
|
"grad_norm": 73876.8515625, |
|
"learning_rate": 6.631366616238023e-06, |
|
"loss": 0.6593, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.8690536224575559, |
|
"grad_norm": 58473.9140625, |
|
"learning_rate": 6.547318877122206e-06, |
|
"loss": 0.7766, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.8707345772398722, |
|
"grad_norm": 61153.5390625, |
|
"learning_rate": 6.463271138006388e-06, |
|
"loss": 0.7679, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.8724155320221886, |
|
"grad_norm": 58491.39453125, |
|
"learning_rate": 6.37922339889057e-06, |
|
"loss": 0.7115, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.8740964868045049, |
|
"grad_norm": 64786.53125, |
|
"learning_rate": 6.295175659774752e-06, |
|
"loss": 0.6976, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.8757774415868214, |
|
"grad_norm": 68189.5546875, |
|
"learning_rate": 6.211127920658934e-06, |
|
"loss": 0.8054, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.8774583963691377, |
|
"grad_norm": 66427.9921875, |
|
"learning_rate": 6.127080181543117e-06, |
|
"loss": 0.7512, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.879139351151454, |
|
"grad_norm": 57290.70703125, |
|
"learning_rate": 6.043032442427299e-06, |
|
"loss": 0.6927, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.8808203059337704, |
|
"grad_norm": 72431.390625, |
|
"learning_rate": 5.958984703311481e-06, |
|
"loss": 0.8137, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.8825012607160867, |
|
"grad_norm": 70580.7265625, |
|
"learning_rate": 5.874936964195663e-06, |
|
"loss": 0.7712, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.8841822154984031, |
|
"grad_norm": 73078.3046875, |
|
"learning_rate": 5.790889225079846e-06, |
|
"loss": 0.6761, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.8858631702807195, |
|
"grad_norm": 78010.0703125, |
|
"learning_rate": 5.7068414859640274e-06, |
|
"loss": 0.7715, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.8875441250630358, |
|
"grad_norm": 66805.8671875, |
|
"learning_rate": 5.62279374684821e-06, |
|
"loss": 0.7067, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.8892250798453522, |
|
"grad_norm": 68583.328125, |
|
"learning_rate": 5.538746007732392e-06, |
|
"loss": 0.7403, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.8909060346276685, |
|
"grad_norm": 80909.6796875, |
|
"learning_rate": 5.4546982686165745e-06, |
|
"loss": 0.7469, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.8925869894099848, |
|
"grad_norm": 62168.92578125, |
|
"learning_rate": 5.370650529500756e-06, |
|
"loss": 0.7284, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.8942679441923013, |
|
"grad_norm": 70689.0546875, |
|
"learning_rate": 5.286602790384939e-06, |
|
"loss": 0.7804, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.8959488989746176, |
|
"grad_norm": 70834.4609375, |
|
"learning_rate": 5.202555051269121e-06, |
|
"loss": 0.7504, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.8976298537569339, |
|
"grad_norm": 72244.8828125, |
|
"learning_rate": 5.118507312153303e-06, |
|
"loss": 0.6909, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.8993108085392503, |
|
"grad_norm": 68406.7734375, |
|
"learning_rate": 5.034459573037485e-06, |
|
"loss": 0.7554, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.9009917633215666, |
|
"grad_norm": 59602.8515625, |
|
"learning_rate": 4.950411833921668e-06, |
|
"loss": 0.7673, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.902672718103883, |
|
"grad_norm": 61461.9375, |
|
"learning_rate": 4.8663640948058495e-06, |
|
"loss": 0.7082, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.9043536728861994, |
|
"grad_norm": 64041.15625, |
|
"learning_rate": 4.782316355690032e-06, |
|
"loss": 0.682, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.9060346276685157, |
|
"grad_norm": 67531.3046875, |
|
"learning_rate": 4.698268616574215e-06, |
|
"loss": 0.8184, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.9077155824508321, |
|
"grad_norm": 64709.15625, |
|
"learning_rate": 4.6142208774583965e-06, |
|
"loss": 0.7433, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9093965372331484, |
|
"grad_norm": 73473.203125, |
|
"learning_rate": 4.530173138342579e-06, |
|
"loss": 0.7386, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.9110774920154647, |
|
"grad_norm": 66414.84375, |
|
"learning_rate": 4.446125399226761e-06, |
|
"loss": 0.7839, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.9127584467977812, |
|
"grad_norm": 73395.6015625, |
|
"learning_rate": 4.3620776601109435e-06, |
|
"loss": 0.7699, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.9144394015800975, |
|
"grad_norm": 60129.15625, |
|
"learning_rate": 4.278029920995125e-06, |
|
"loss": 0.7111, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.9161203563624138, |
|
"grad_norm": 81137.3203125, |
|
"learning_rate": 4.193982181879308e-06, |
|
"loss": 0.7419, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.9178013111447302, |
|
"grad_norm": 69321.53125, |
|
"learning_rate": 4.10993444276349e-06, |
|
"loss": 0.6546, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.9194822659270465, |
|
"grad_norm": 73445.609375, |
|
"learning_rate": 4.025886703647672e-06, |
|
"loss": 0.7571, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.921163220709363, |
|
"grad_norm": 55211.56640625, |
|
"learning_rate": 3.941838964531854e-06, |
|
"loss": 0.7361, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.9228441754916793, |
|
"grad_norm": 60575.71484375, |
|
"learning_rate": 3.857791225416037e-06, |
|
"loss": 0.7854, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.9245251302739956, |
|
"grad_norm": 70186.140625, |
|
"learning_rate": 3.7737434863002185e-06, |
|
"loss": 0.8255, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.926206085056312, |
|
"grad_norm": 73100.078125, |
|
"learning_rate": 3.689695747184401e-06, |
|
"loss": 0.7469, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.9278870398386283, |
|
"grad_norm": 75805.078125, |
|
"learning_rate": 3.605648008068583e-06, |
|
"loss": 0.7616, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.9295679946209447, |
|
"grad_norm": 77849.140625, |
|
"learning_rate": 3.5216002689527655e-06, |
|
"loss": 0.7562, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.9312489494032611, |
|
"grad_norm": 68207.265625, |
|
"learning_rate": 3.4375525298369473e-06, |
|
"loss": 0.6813, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.9329299041855774, |
|
"grad_norm": 71734.1640625, |
|
"learning_rate": 3.35350479072113e-06, |
|
"loss": 0.6668, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.9346108589678938, |
|
"grad_norm": 80375.4375, |
|
"learning_rate": 3.2694570516053117e-06, |
|
"loss": 0.7542, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.9362918137502101, |
|
"grad_norm": 76297.75, |
|
"learning_rate": 3.1854093124894943e-06, |
|
"loss": 0.7539, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.9379727685325264, |
|
"grad_norm": 59915.84765625, |
|
"learning_rate": 3.1013615733736765e-06, |
|
"loss": 0.6965, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.9396537233148429, |
|
"grad_norm": 66482.078125, |
|
"learning_rate": 3.0173138342578587e-06, |
|
"loss": 0.7416, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.9413346780971592, |
|
"grad_norm": 73498.7578125, |
|
"learning_rate": 2.933266095142041e-06, |
|
"loss": 0.7404, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9430156328794755, |
|
"grad_norm": 56677.17578125, |
|
"learning_rate": 2.849218356026223e-06, |
|
"loss": 0.7199, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.9446965876617919, |
|
"grad_norm": 63067.734375, |
|
"learning_rate": 2.7651706169104053e-06, |
|
"loss": 0.738, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.9463775424441082, |
|
"grad_norm": 65398.22265625, |
|
"learning_rate": 2.6811228777945875e-06, |
|
"loss": 0.7177, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.9480584972264247, |
|
"grad_norm": 56372.125, |
|
"learning_rate": 2.5970751386787698e-06, |
|
"loss": 0.7594, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.949739452008741, |
|
"grad_norm": 66133.2109375, |
|
"learning_rate": 2.513027399562952e-06, |
|
"loss": 0.755, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.9514204067910573, |
|
"grad_norm": 73855.5859375, |
|
"learning_rate": 2.428979660447134e-06, |
|
"loss": 0.7286, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.9531013615733737, |
|
"grad_norm": 80979.5703125, |
|
"learning_rate": 2.3449319213313164e-06, |
|
"loss": 0.7508, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.95478231635569, |
|
"grad_norm": 59096.21484375, |
|
"learning_rate": 2.2608841822154986e-06, |
|
"loss": 0.7273, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.9564632711380063, |
|
"grad_norm": 73199.34375, |
|
"learning_rate": 2.1768364430996808e-06, |
|
"loss": 0.8463, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.9581442259203228, |
|
"grad_norm": 70557.265625, |
|
"learning_rate": 2.092788703983863e-06, |
|
"loss": 0.7518, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.9598251807026391, |
|
"grad_norm": 64123.15234375, |
|
"learning_rate": 2.008740964868045e-06, |
|
"loss": 0.6743, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.9615061354849554, |
|
"grad_norm": 70495.03125, |
|
"learning_rate": 1.9246932257522274e-06, |
|
"loss": 0.7179, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.9631870902672718, |
|
"grad_norm": 62604.77734375, |
|
"learning_rate": 1.8406454866364096e-06, |
|
"loss": 0.7073, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.9648680450495881, |
|
"grad_norm": 69528.75, |
|
"learning_rate": 1.7565977475205918e-06, |
|
"loss": 0.7738, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.9665489998319046, |
|
"grad_norm": 64379.51953125, |
|
"learning_rate": 1.672550008404774e-06, |
|
"loss": 0.774, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.9682299546142209, |
|
"grad_norm": 70070.359375, |
|
"learning_rate": 1.5885022692889562e-06, |
|
"loss": 0.7902, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.9699109093965372, |
|
"grad_norm": 75525.7265625, |
|
"learning_rate": 1.5044545301731386e-06, |
|
"loss": 0.754, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.9715918641788536, |
|
"grad_norm": 72437.4375, |
|
"learning_rate": 1.4204067910573208e-06, |
|
"loss": 0.7122, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.9732728189611699, |
|
"grad_norm": 55177.8125, |
|
"learning_rate": 1.3363590519415028e-06, |
|
"loss": 0.7083, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.9749537737434864, |
|
"grad_norm": 74118.140625, |
|
"learning_rate": 1.252311312825685e-06, |
|
"loss": 0.7124, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.9766347285258027, |
|
"grad_norm": 75796.015625, |
|
"learning_rate": 1.1682635737098672e-06, |
|
"loss": 0.7563, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.978315683308119, |
|
"grad_norm": 66329.46875, |
|
"learning_rate": 1.0842158345940494e-06, |
|
"loss": 0.7379, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.9799966380904354, |
|
"grad_norm": 71971.1796875, |
|
"learning_rate": 1.0001680954782316e-06, |
|
"loss": 0.7237, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.9816775928727517, |
|
"grad_norm": 70406.546875, |
|
"learning_rate": 9.161203563624139e-07, |
|
"loss": 0.7518, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.983358547655068, |
|
"grad_norm": 65358.0, |
|
"learning_rate": 8.320726172465961e-07, |
|
"loss": 0.7263, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.9850395024373845, |
|
"grad_norm": 56430.7265625, |
|
"learning_rate": 7.480248781307783e-07, |
|
"loss": 0.7174, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.9867204572197008, |
|
"grad_norm": 69264.21875, |
|
"learning_rate": 6.639771390149606e-07, |
|
"loss": 0.7782, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.9884014120020171, |
|
"grad_norm": 60301.59765625, |
|
"learning_rate": 5.799293998991428e-07, |
|
"loss": 0.7487, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.9900823667843335, |
|
"grad_norm": 66026.0078125, |
|
"learning_rate": 4.95881660783325e-07, |
|
"loss": 0.7565, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.9917633215666498, |
|
"grad_norm": 63975.43359375, |
|
"learning_rate": 4.1183392166750716e-07, |
|
"loss": 0.7349, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.9934442763489663, |
|
"grad_norm": 72585.8046875, |
|
"learning_rate": 3.2778618255168936e-07, |
|
"loss": 0.7754, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.9951252311312826, |
|
"grad_norm": 67646.4296875, |
|
"learning_rate": 2.4373844343587156e-07, |
|
"loss": 0.7622, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.9968061859135989, |
|
"grad_norm": 74414.9375, |
|
"learning_rate": 1.596907043200538e-07, |
|
"loss": 0.7312, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.9984871406959153, |
|
"grad_norm": 69075.203125, |
|
"learning_rate": 7.5642965204236e-08, |
|
"loss": 0.7293, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5949, |
|
"total_flos": 9.074524143432499e+17, |
|
"train_loss": 0.8014113598860578, |
|
"train_runtime": 82603.7739, |
|
"train_samples_per_second": 0.864, |
|
"train_steps_per_second": 0.072 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5949, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5949, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.074524143432499e+17, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|