diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26515 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1923828125, + "eval_steps": 24576, + "global_step": 18912, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.0172526041666666e-05, + "eval_loss": 4.542403221130371, + "eval_runtime": 144.6811, + "eval_samples_per_second": 13.872, + "eval_steps_per_second": 13.872, + "step": 1 + }, + { + "epoch": 5.0862630208333336e-05, + "grad_norm": 45.55976867675781, + "learning_rate": 2.5000000000000004e-07, + "loss": 4.478, + "step": 5 + }, + { + "epoch": 0.00010172526041666667, + "grad_norm": 44.849979400634766, + "learning_rate": 5.000000000000001e-07, + "loss": 4.381, + "step": 10 + }, + { + "epoch": 0.000152587890625, + "grad_norm": 47.78006362915039, + "learning_rate": 7.5e-07, + "loss": 4.2726, + "step": 15 + }, + { + "epoch": 0.00020345052083333334, + "grad_norm": 49.011878967285156, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.4009, + "step": 20 + }, + { + "epoch": 0.0002543131510416667, + "grad_norm": 40.69770050048828, + "learning_rate": 1.25e-06, + "loss": 4.4735, + "step": 25 + }, + { + "epoch": 0.00030517578125, + "grad_norm": 34.35673522949219, + "learning_rate": 1.5e-06, + "loss": 4.2665, + "step": 30 + }, + { + "epoch": 0.0003560384114583333, + "grad_norm": 52.81883239746094, + "learning_rate": 1.75e-06, + "loss": 4.423, + "step": 35 + }, + { + "epoch": 0.0004069010416666667, + "grad_norm": 32.41872024536133, + "learning_rate": 2.0000000000000003e-06, + "loss": 4.6442, + "step": 40 + }, + { + "epoch": 0.000457763671875, + "grad_norm": 37.73821258544922, + "learning_rate": 2.25e-06, + "loss": 4.2681, + "step": 45 + }, + { + "epoch": 0.0005086263020833334, + "grad_norm": 37.740386962890625, + "learning_rate": 2.5e-06, + "loss": 4.6894, + "step": 50 + }, + { + "epoch": 0.0005594889322916666, + "grad_norm": 50.862735748291016, + "learning_rate": 2.7500000000000004e-06, + "loss": 4.3243, + "step": 55 + }, + { + "epoch": 0.0006103515625, + "grad_norm": 45.02497863769531, + "learning_rate": 3e-06, + "loss": 4.3276, + "step": 60 + }, + { + "epoch": 0.0006612141927083334, + "grad_norm": 28.076095581054688, + "learning_rate": 3.2500000000000002e-06, + "loss": 3.8881, + "step": 65 + }, + { + "epoch": 0.0007120768229166666, + "grad_norm": 27.940998077392578, + "learning_rate": 3.5e-06, + "loss": 4.2496, + "step": 70 + }, + { + "epoch": 0.000762939453125, + "grad_norm": 26.482099533081055, + "learning_rate": 3.7500000000000005e-06, + "loss": 4.349, + "step": 75 + }, + { + "epoch": 0.0008138020833333334, + "grad_norm": 32.36627960205078, + "learning_rate": 4.000000000000001e-06, + "loss": 4.423, + "step": 80 + }, + { + "epoch": 0.0008646647135416666, + "grad_norm": 48.71563720703125, + "learning_rate": 4.25e-06, + "loss": 4.65, + "step": 85 + }, + { + "epoch": 0.00091552734375, + "grad_norm": 25.62880516052246, + "learning_rate": 4.5e-06, + "loss": 4.2165, + "step": 90 + }, + { + "epoch": 0.0009663899739583334, + "grad_norm": 33.9738655090332, + "learning_rate": 4.75e-06, + "loss": 4.1882, + "step": 95 + }, + { + "epoch": 0.0010172526041666667, + "grad_norm": 24.544464111328125, + "learning_rate": 5e-06, + "loss": 3.7919, + "step": 100 + }, + { + "epoch": 0.001068115234375, + "grad_norm": 38.953922271728516, + "learning_rate": 4.999999968019047e-06, + "loss": 4.5549, + "step": 105 + }, + { + "epoch": 0.0011189778645833333, + "grad_norm": 30.3071346282959, + "learning_rate": 4.999999872076186e-06, + "loss": 4.3079, + "step": 110 + }, + { + "epoch": 0.0011698404947916667, + "grad_norm": 26.42899513244629, + "learning_rate": 4.999999712171422e-06, + "loss": 4.3139, + "step": 115 + }, + { + "epoch": 0.001220703125, + "grad_norm": 18.36640739440918, + "learning_rate": 4.999999488304758e-06, + "loss": 4.1012, + "step": 120 + }, + { + "epoch": 0.0012715657552083333, + "grad_norm": 22.997697830200195, + "learning_rate": 4.999999200476199e-06, + "loss": 4.0616, + "step": 125 + }, + { + "epoch": 0.0013224283854166667, + "grad_norm": 18.358749389648438, + "learning_rate": 4.999998848685752e-06, + "loss": 4.0783, + "step": 130 + }, + { + "epoch": 0.001373291015625, + "grad_norm": 16.22219467163086, + "learning_rate": 4.999998432933428e-06, + "loss": 4.14, + "step": 135 + }, + { + "epoch": 0.0014241536458333333, + "grad_norm": 31.525222778320312, + "learning_rate": 4.999997953219238e-06, + "loss": 4.268, + "step": 140 + }, + { + "epoch": 0.0014750162760416667, + "grad_norm": 16.66249656677246, + "learning_rate": 4.999997409543191e-06, + "loss": 3.7311, + "step": 145 + }, + { + "epoch": 0.00152587890625, + "grad_norm": 19.815828323364258, + "learning_rate": 4.999996801905304e-06, + "loss": 4.0011, + "step": 150 + }, + { + "epoch": 0.0015767415364583333, + "grad_norm": 27.2629337310791, + "learning_rate": 4.9999961303055906e-06, + "loss": 4.4585, + "step": 155 + }, + { + "epoch": 0.0016276041666666667, + "grad_norm": 20.656837463378906, + "learning_rate": 4.99999539474407e-06, + "loss": 3.6024, + "step": 160 + }, + { + "epoch": 0.001678466796875, + "grad_norm": 32.12629699707031, + "learning_rate": 4.999994595220758e-06, + "loss": 3.6387, + "step": 165 + }, + { + "epoch": 0.0017293294270833333, + "grad_norm": 17.384239196777344, + "learning_rate": 4.9999937317356776e-06, + "loss": 3.7885, + "step": 170 + }, + { + "epoch": 0.0017801920572916667, + "grad_norm": 16.839290618896484, + "learning_rate": 4.99999280428885e-06, + "loss": 3.9351, + "step": 175 + }, + { + "epoch": 0.0018310546875, + "grad_norm": 17.16065216064453, + "learning_rate": 4.9999918128803e-06, + "loss": 3.829, + "step": 180 + }, + { + "epoch": 0.0018819173177083333, + "grad_norm": 19.670543670654297, + "learning_rate": 4.999990757510052e-06, + "loss": 3.9205, + "step": 185 + }, + { + "epoch": 0.0019327799479166667, + "grad_norm": 19.776790618896484, + "learning_rate": 4.999989638178131e-06, + "loss": 4.0733, + "step": 190 + }, + { + "epoch": 0.001983642578125, + "grad_norm": 17.121217727661133, + "learning_rate": 4.99998845488457e-06, + "loss": 3.9968, + "step": 195 + }, + { + "epoch": 0.0020345052083333335, + "grad_norm": 18.365943908691406, + "learning_rate": 4.999987207629396e-06, + "loss": 3.6687, + "step": 200 + }, + { + "epoch": 0.0020853678385416665, + "grad_norm": 17.672025680541992, + "learning_rate": 4.9999858964126415e-06, + "loss": 3.9062, + "step": 205 + }, + { + "epoch": 0.00213623046875, + "grad_norm": 14.973541259765625, + "learning_rate": 4.9999845212343415e-06, + "loss": 3.4274, + "step": 210 + }, + { + "epoch": 0.0021870930989583335, + "grad_norm": 20.310712814331055, + "learning_rate": 4.999983082094529e-06, + "loss": 3.8899, + "step": 215 + }, + { + "epoch": 0.0022379557291666665, + "grad_norm": 30.07281494140625, + "learning_rate": 4.999981578993242e-06, + "loss": 4.083, + "step": 220 + }, + { + "epoch": 0.002288818359375, + "grad_norm": 14.967458724975586, + "learning_rate": 4.999980011930519e-06, + "loss": 3.8021, + "step": 225 + }, + { + "epoch": 0.0023396809895833335, + "grad_norm": 18.596702575683594, + "learning_rate": 4.999978380906401e-06, + "loss": 4.2005, + "step": 230 + }, + { + "epoch": 0.0023905436197916665, + "grad_norm": 14.92667293548584, + "learning_rate": 4.999976685920927e-06, + "loss": 3.7878, + "step": 235 + }, + { + "epoch": 0.00244140625, + "grad_norm": 26.372528076171875, + "learning_rate": 4.999974926974142e-06, + "loss": 3.906, + "step": 240 + }, + { + "epoch": 0.0024922688802083335, + "grad_norm": 11.505077362060547, + "learning_rate": 4.9999731040660925e-06, + "loss": 3.7275, + "step": 245 + }, + { + "epoch": 0.0025431315104166665, + "grad_norm": 19.219104766845703, + "learning_rate": 4.999971217196824e-06, + "loss": 3.6951, + "step": 250 + }, + { + "epoch": 0.002593994140625, + "grad_norm": 20.686763763427734, + "learning_rate": 4.999969266366383e-06, + "loss": 4.2166, + "step": 255 + }, + { + "epoch": 0.0026448567708333335, + "grad_norm": 14.91057300567627, + "learning_rate": 4.999967251574821e-06, + "loss": 4.1096, + "step": 260 + }, + { + "epoch": 0.0026957194010416665, + "grad_norm": 14.4781494140625, + "learning_rate": 4.99996517282219e-06, + "loss": 3.9429, + "step": 265 + }, + { + "epoch": 0.00274658203125, + "grad_norm": 16.21483039855957, + "learning_rate": 4.9999630301085425e-06, + "loss": 4.5956, + "step": 270 + }, + { + "epoch": 0.0027974446614583335, + "grad_norm": 20.831764221191406, + "learning_rate": 4.9999608234339336e-06, + "loss": 4.0729, + "step": 275 + }, + { + "epoch": 0.0028483072916666665, + "grad_norm": 16.851608276367188, + "learning_rate": 4.999958552798419e-06, + "loss": 3.9821, + "step": 280 + }, + { + "epoch": 0.002899169921875, + "grad_norm": 16.131776809692383, + "learning_rate": 4.999956218202058e-06, + "loss": 3.912, + "step": 285 + }, + { + "epoch": 0.0029500325520833335, + "grad_norm": 22.348773956298828, + "learning_rate": 4.9999538196449096e-06, + "loss": 3.7261, + "step": 290 + }, + { + "epoch": 0.0030008951822916665, + "grad_norm": 27.426599502563477, + "learning_rate": 4.9999513571270355e-06, + "loss": 3.9633, + "step": 295 + }, + { + "epoch": 0.0030517578125, + "grad_norm": 19.74297523498535, + "learning_rate": 4.999948830648497e-06, + "loss": 3.6241, + "step": 300 + }, + { + "epoch": 0.0031026204427083335, + "grad_norm": 26.0491943359375, + "learning_rate": 4.999946240209362e-06, + "loss": 3.8093, + "step": 305 + }, + { + "epoch": 0.0031534830729166665, + "grad_norm": 17.24481964111328, + "learning_rate": 4.999943585809694e-06, + "loss": 3.9762, + "step": 310 + }, + { + "epoch": 0.003204345703125, + "grad_norm": 16.89327621459961, + "learning_rate": 4.999940867449562e-06, + "loss": 3.5947, + "step": 315 + }, + { + "epoch": 0.0032552083333333335, + "grad_norm": 19.533809661865234, + "learning_rate": 4.999938085129036e-06, + "loss": 3.5673, + "step": 320 + }, + { + "epoch": 0.0033060709635416665, + "grad_norm": 16.76618766784668, + "learning_rate": 4.999935238848187e-06, + "loss": 3.8095, + "step": 325 + }, + { + "epoch": 0.00335693359375, + "grad_norm": 15.267281532287598, + "learning_rate": 4.999932328607087e-06, + "loss": 3.8809, + "step": 330 + }, + { + "epoch": 0.0034077962239583335, + "grad_norm": 14.550617218017578, + "learning_rate": 4.999929354405811e-06, + "loss": 3.9913, + "step": 335 + }, + { + "epoch": 0.0034586588541666665, + "grad_norm": 22.565776824951172, + "learning_rate": 4.999926316244434e-06, + "loss": 4.0828, + "step": 340 + }, + { + "epoch": 0.003509521484375, + "grad_norm": 14.583137512207031, + "learning_rate": 4.999923214123036e-06, + "loss": 4.0128, + "step": 345 + }, + { + "epoch": 0.0035603841145833335, + "grad_norm": 14.065851211547852, + "learning_rate": 4.999920048041694e-06, + "loss": 3.5976, + "step": 350 + }, + { + "epoch": 0.0036112467447916665, + "grad_norm": 22.18216323852539, + "learning_rate": 4.999916818000491e-06, + "loss": 3.8117, + "step": 355 + }, + { + "epoch": 0.003662109375, + "grad_norm": 21.645153045654297, + "learning_rate": 4.9999135239995076e-06, + "loss": 3.5906, + "step": 360 + }, + { + "epoch": 0.0037129720052083335, + "grad_norm": 13.195281028747559, + "learning_rate": 4.9999101660388305e-06, + "loss": 3.7585, + "step": 365 + }, + { + "epoch": 0.0037638346354166665, + "grad_norm": 10.816890716552734, + "learning_rate": 4.999906744118545e-06, + "loss": 3.7736, + "step": 370 + }, + { + "epoch": 0.003814697265625, + "grad_norm": 14.775542259216309, + "learning_rate": 4.999903258238736e-06, + "loss": 3.6809, + "step": 375 + }, + { + "epoch": 0.0038655598958333335, + "grad_norm": 22.4163761138916, + "learning_rate": 4.999899708399496e-06, + "loss": 3.6198, + "step": 380 + }, + { + "epoch": 0.003916422526041667, + "grad_norm": 20.934917449951172, + "learning_rate": 4.999896094600914e-06, + "loss": 3.7003, + "step": 385 + }, + { + "epoch": 0.00396728515625, + "grad_norm": 13.397461891174316, + "learning_rate": 4.999892416843083e-06, + "loss": 3.9415, + "step": 390 + }, + { + "epoch": 0.004018147786458333, + "grad_norm": 18.704856872558594, + "learning_rate": 4.999888675126097e-06, + "loss": 3.7429, + "step": 395 + }, + { + "epoch": 0.004069010416666667, + "grad_norm": 14.650379180908203, + "learning_rate": 4.9998848694500526e-06, + "loss": 3.8455, + "step": 400 + }, + { + "epoch": 0.004119873046875, + "grad_norm": 22.322834014892578, + "learning_rate": 4.999880999815045e-06, + "loss": 3.9125, + "step": 405 + }, + { + "epoch": 0.004170735677083333, + "grad_norm": 16.105815887451172, + "learning_rate": 4.999877066221175e-06, + "loss": 3.5135, + "step": 410 + }, + { + "epoch": 0.004221598307291667, + "grad_norm": 10.558919906616211, + "learning_rate": 4.999873068668544e-06, + "loss": 3.6677, + "step": 415 + }, + { + "epoch": 0.0042724609375, + "grad_norm": 10.016486167907715, + "learning_rate": 4.999869007157252e-06, + "loss": 3.5761, + "step": 420 + }, + { + "epoch": 0.004323323567708333, + "grad_norm": 16.01995849609375, + "learning_rate": 4.999864881687404e-06, + "loss": 3.9739, + "step": 425 + }, + { + "epoch": 0.004374186197916667, + "grad_norm": 21.719324111938477, + "learning_rate": 4.999860692259105e-06, + "loss": 3.9211, + "step": 430 + }, + { + "epoch": 0.004425048828125, + "grad_norm": 42.85234069824219, + "learning_rate": 4.999856438872463e-06, + "loss": 3.9438, + "step": 435 + }, + { + "epoch": 0.004475911458333333, + "grad_norm": 13.891044616699219, + "learning_rate": 4.999852121527588e-06, + "loss": 3.6916, + "step": 440 + }, + { + "epoch": 0.004526774088541667, + "grad_norm": 18.872058868408203, + "learning_rate": 4.999847740224587e-06, + "loss": 4.0558, + "step": 445 + }, + { + "epoch": 0.00457763671875, + "grad_norm": 16.054645538330078, + "learning_rate": 4.999843294963576e-06, + "loss": 3.8301, + "step": 450 + }, + { + "epoch": 0.004628499348958333, + "grad_norm": 14.235097885131836, + "learning_rate": 4.999838785744665e-06, + "loss": 3.745, + "step": 455 + }, + { + "epoch": 0.004679361979166667, + "grad_norm": 21.37851905822754, + "learning_rate": 4.999834212567972e-06, + "loss": 3.8782, + "step": 460 + }, + { + "epoch": 0.004730224609375, + "grad_norm": 17.109268188476562, + "learning_rate": 4.999829575433613e-06, + "loss": 3.4868, + "step": 465 + }, + { + "epoch": 0.004781087239583333, + "grad_norm": 16.911109924316406, + "learning_rate": 4.999824874341708e-06, + "loss": 3.4223, + "step": 470 + }, + { + "epoch": 0.004831949869791667, + "grad_norm": 9.432239532470703, + "learning_rate": 4.9998201092923746e-06, + "loss": 3.3816, + "step": 475 + }, + { + "epoch": 0.0048828125, + "grad_norm": 18.146076202392578, + "learning_rate": 4.999815280285737e-06, + "loss": 4.0282, + "step": 480 + }, + { + "epoch": 0.004933675130208333, + "grad_norm": 26.35177993774414, + "learning_rate": 4.999810387321917e-06, + "loss": 3.2918, + "step": 485 + }, + { + "epoch": 0.004984537760416667, + "grad_norm": 17.22603988647461, + "learning_rate": 4.9998054304010425e-06, + "loss": 3.7617, + "step": 490 + }, + { + "epoch": 0.005035400390625, + "grad_norm": 15.333989143371582, + "learning_rate": 4.999800409523237e-06, + "loss": 4.2317, + "step": 495 + }, + { + "epoch": 0.005086263020833333, + "grad_norm": 19.130863189697266, + "learning_rate": 4.999795324688631e-06, + "loss": 3.8223, + "step": 500 + }, + { + "epoch": 0.005137125651041667, + "grad_norm": 12.130922317504883, + "learning_rate": 4.999790175897355e-06, + "loss": 3.7448, + "step": 505 + }, + { + "epoch": 0.00518798828125, + "grad_norm": 18.195701599121094, + "learning_rate": 4.999784963149539e-06, + "loss": 3.8377, + "step": 510 + }, + { + "epoch": 0.005238850911458333, + "grad_norm": 13.157418251037598, + "learning_rate": 4.999779686445318e-06, + "loss": 4.2839, + "step": 515 + }, + { + "epoch": 0.005289713541666667, + "grad_norm": 15.427362442016602, + "learning_rate": 4.999774345784825e-06, + "loss": 3.6147, + "step": 520 + }, + { + "epoch": 0.005340576171875, + "grad_norm": 18.931623458862305, + "learning_rate": 4.9997689411681986e-06, + "loss": 4.246, + "step": 525 + }, + { + "epoch": 0.005391438802083333, + "grad_norm": 16.495134353637695, + "learning_rate": 4.9997634725955756e-06, + "loss": 3.811, + "step": 530 + }, + { + "epoch": 0.005442301432291667, + "grad_norm": 17.250377655029297, + "learning_rate": 4.999757940067098e-06, + "loss": 3.9962, + "step": 535 + }, + { + "epoch": 0.0054931640625, + "grad_norm": 19.960880279541016, + "learning_rate": 4.999752343582905e-06, + "loss": 3.8214, + "step": 540 + }, + { + "epoch": 0.005544026692708333, + "grad_norm": 21.063762664794922, + "learning_rate": 4.999746683143141e-06, + "loss": 3.5984, + "step": 545 + }, + { + "epoch": 0.005594889322916667, + "grad_norm": 15.372790336608887, + "learning_rate": 4.999740958747951e-06, + "loss": 3.8445, + "step": 550 + }, + { + "epoch": 0.005645751953125, + "grad_norm": 12.668439865112305, + "learning_rate": 4.9997351703974804e-06, + "loss": 4.1112, + "step": 555 + }, + { + "epoch": 0.005696614583333333, + "grad_norm": 14.557056427001953, + "learning_rate": 4.999729318091878e-06, + "loss": 3.8517, + "step": 560 + }, + { + "epoch": 0.005747477213541667, + "grad_norm": 18.399843215942383, + "learning_rate": 4.9997234018312945e-06, + "loss": 3.5826, + "step": 565 + }, + { + "epoch": 0.00579833984375, + "grad_norm": 16.69244956970215, + "learning_rate": 4.9997174216158795e-06, + "loss": 3.6708, + "step": 570 + }, + { + "epoch": 0.005849202473958333, + "grad_norm": 20.53217887878418, + "learning_rate": 4.9997113774457865e-06, + "loss": 4.0217, + "step": 575 + }, + { + "epoch": 0.005900065104166667, + "grad_norm": 15.122044563293457, + "learning_rate": 4.999705269321171e-06, + "loss": 3.8238, + "step": 580 + }, + { + "epoch": 0.005950927734375, + "grad_norm": 9.872886657714844, + "learning_rate": 4.999699097242189e-06, + "loss": 3.4889, + "step": 585 + }, + { + "epoch": 0.006001790364583333, + "grad_norm": 14.158080101013184, + "learning_rate": 4.999692861208997e-06, + "loss": 3.5411, + "step": 590 + }, + { + "epoch": 0.006052652994791667, + "grad_norm": 15.669339179992676, + "learning_rate": 4.999686561221756e-06, + "loss": 3.936, + "step": 595 + }, + { + "epoch": 0.006103515625, + "grad_norm": 11.445816040039062, + "learning_rate": 4.999680197280628e-06, + "loss": 3.8597, + "step": 600 + }, + { + "epoch": 0.006154378255208333, + "grad_norm": 16.03506088256836, + "learning_rate": 4.999673769385773e-06, + "loss": 3.4291, + "step": 605 + }, + { + "epoch": 0.006205240885416667, + "grad_norm": 13.63383960723877, + "learning_rate": 4.999667277537358e-06, + "loss": 3.8566, + "step": 610 + }, + { + "epoch": 0.006256103515625, + "grad_norm": 11.830592155456543, + "learning_rate": 4.999660721735547e-06, + "loss": 3.8303, + "step": 615 + }, + { + "epoch": 0.006306966145833333, + "grad_norm": 19.99570655822754, + "learning_rate": 4.999654101980511e-06, + "loss": 3.8236, + "step": 620 + }, + { + "epoch": 0.006357828776041667, + "grad_norm": 12.528511047363281, + "learning_rate": 4.999647418272415e-06, + "loss": 4.0486, + "step": 625 + }, + { + "epoch": 0.00640869140625, + "grad_norm": 15.760631561279297, + "learning_rate": 4.999640670611434e-06, + "loss": 3.4468, + "step": 630 + }, + { + "epoch": 0.006459554036458333, + "grad_norm": 19.84269905090332, + "learning_rate": 4.999633858997738e-06, + "loss": 3.8804, + "step": 635 + }, + { + "epoch": 0.006510416666666667, + "grad_norm": 11.239544868469238, + "learning_rate": 4.999626983431503e-06, + "loss": 3.5703, + "step": 640 + }, + { + "epoch": 0.006561279296875, + "grad_norm": 13.65317440032959, + "learning_rate": 4.999620043912904e-06, + "loss": 3.7155, + "step": 645 + }, + { + "epoch": 0.006612141927083333, + "grad_norm": 12.85306167602539, + "learning_rate": 4.999613040442118e-06, + "loss": 3.9372, + "step": 650 + }, + { + "epoch": 0.006663004557291667, + "grad_norm": 16.895217895507812, + "learning_rate": 4.999605973019325e-06, + "loss": 4.0348, + "step": 655 + }, + { + "epoch": 0.0067138671875, + "grad_norm": 15.865885734558105, + "learning_rate": 4.999598841644706e-06, + "loss": 3.6799, + "step": 660 + }, + { + "epoch": 0.006764729817708333, + "grad_norm": 17.68973731994629, + "learning_rate": 4.999591646318443e-06, + "loss": 3.446, + "step": 665 + }, + { + "epoch": 0.006815592447916667, + "grad_norm": 17.351646423339844, + "learning_rate": 4.99958438704072e-06, + "loss": 3.9662, + "step": 670 + }, + { + "epoch": 0.006866455078125, + "grad_norm": 12.52562427520752, + "learning_rate": 4.999577063811723e-06, + "loss": 3.564, + "step": 675 + }, + { + "epoch": 0.006917317708333333, + "grad_norm": 15.416642189025879, + "learning_rate": 4.999569676631639e-06, + "loss": 3.9313, + "step": 680 + }, + { + "epoch": 0.006968180338541667, + "grad_norm": 11.908843994140625, + "learning_rate": 4.999562225500658e-06, + "loss": 3.7715, + "step": 685 + }, + { + "epoch": 0.00701904296875, + "grad_norm": 15.781961441040039, + "learning_rate": 4.999554710418969e-06, + "loss": 3.467, + "step": 690 + }, + { + "epoch": 0.007069905598958333, + "grad_norm": 17.706050872802734, + "learning_rate": 4.999547131386766e-06, + "loss": 3.442, + "step": 695 + }, + { + "epoch": 0.007120768229166667, + "grad_norm": 16.651485443115234, + "learning_rate": 4.999539488404242e-06, + "loss": 3.7181, + "step": 700 + }, + { + "epoch": 0.007171630859375, + "grad_norm": 10.602474212646484, + "learning_rate": 4.9995317814715925e-06, + "loss": 3.6258, + "step": 705 + }, + { + "epoch": 0.007222493489583333, + "grad_norm": 20.507596969604492, + "learning_rate": 4.999524010589015e-06, + "loss": 3.9877, + "step": 710 + }, + { + "epoch": 0.007273356119791667, + "grad_norm": 17.32587432861328, + "learning_rate": 4.999516175756708e-06, + "loss": 3.7221, + "step": 715 + }, + { + "epoch": 0.00732421875, + "grad_norm": 11.81552791595459, + "learning_rate": 4.9995082769748715e-06, + "loss": 3.4813, + "step": 720 + }, + { + "epoch": 0.007375081380208333, + "grad_norm": 13.437356948852539, + "learning_rate": 4.9995003142437086e-06, + "loss": 3.6507, + "step": 725 + }, + { + "epoch": 0.007425944010416667, + "grad_norm": 25.208234786987305, + "learning_rate": 4.999492287563422e-06, + "loss": 3.9219, + "step": 730 + }, + { + "epoch": 0.007476806640625, + "grad_norm": 13.51179313659668, + "learning_rate": 4.999484196934219e-06, + "loss": 3.5389, + "step": 735 + }, + { + "epoch": 0.007527669270833333, + "grad_norm": 13.544529914855957, + "learning_rate": 4.999476042356305e-06, + "loss": 3.597, + "step": 740 + }, + { + "epoch": 0.007578531901041667, + "grad_norm": 10.314176559448242, + "learning_rate": 4.999467823829888e-06, + "loss": 3.8758, + "step": 745 + }, + { + "epoch": 0.00762939453125, + "grad_norm": 21.49091148376465, + "learning_rate": 4.99945954135518e-06, + "loss": 3.3777, + "step": 750 + }, + { + "epoch": 0.007680257161458333, + "grad_norm": 12.480860710144043, + "learning_rate": 4.999451194932392e-06, + "loss": 3.511, + "step": 755 + }, + { + "epoch": 0.007731119791666667, + "grad_norm": 15.50587272644043, + "learning_rate": 4.999442784561737e-06, + "loss": 3.6119, + "step": 760 + }, + { + "epoch": 0.007781982421875, + "grad_norm": 17.906938552856445, + "learning_rate": 4.9994343102434314e-06, + "loss": 3.8537, + "step": 765 + }, + { + "epoch": 0.007832845052083334, + "grad_norm": 18.00349998474121, + "learning_rate": 4.9994257719776915e-06, + "loss": 3.5682, + "step": 770 + }, + { + "epoch": 0.007883707682291666, + "grad_norm": 14.669235229492188, + "learning_rate": 4.999417169764735e-06, + "loss": 3.4905, + "step": 775 + }, + { + "epoch": 0.0079345703125, + "grad_norm": 15.826396942138672, + "learning_rate": 4.999408503604783e-06, + "loss": 3.9937, + "step": 780 + }, + { + "epoch": 0.007985432942708334, + "grad_norm": 17.248136520385742, + "learning_rate": 4.999399773498057e-06, + "loss": 4.14, + "step": 785 + }, + { + "epoch": 0.008036295572916666, + "grad_norm": 16.377283096313477, + "learning_rate": 4.99939097944478e-06, + "loss": 4.2206, + "step": 790 + }, + { + "epoch": 0.008087158203125, + "grad_norm": 11.323334693908691, + "learning_rate": 4.9993821214451774e-06, + "loss": 4.1021, + "step": 795 + }, + { + "epoch": 0.008138020833333334, + "grad_norm": 14.058307647705078, + "learning_rate": 4.999373199499476e-06, + "loss": 3.9362, + "step": 800 + }, + { + "epoch": 0.008188883463541666, + "grad_norm": 17.746889114379883, + "learning_rate": 4.9993642136079025e-06, + "loss": 4.0898, + "step": 805 + }, + { + "epoch": 0.00823974609375, + "grad_norm": 17.276458740234375, + "learning_rate": 4.999355163770688e-06, + "loss": 3.9971, + "step": 810 + }, + { + "epoch": 0.008290608723958334, + "grad_norm": 20.424314498901367, + "learning_rate": 4.999346049988065e-06, + "loss": 3.9069, + "step": 815 + }, + { + "epoch": 0.008341471354166666, + "grad_norm": 21.62326431274414, + "learning_rate": 4.999336872260266e-06, + "loss": 3.812, + "step": 820 + }, + { + "epoch": 0.008392333984375, + "grad_norm": 12.865884780883789, + "learning_rate": 4.999327630587525e-06, + "loss": 3.7855, + "step": 825 + }, + { + "epoch": 0.008443196614583334, + "grad_norm": 14.172892570495605, + "learning_rate": 4.999318324970079e-06, + "loss": 3.8214, + "step": 830 + }, + { + "epoch": 0.008494059244791666, + "grad_norm": 19.895843505859375, + "learning_rate": 4.999308955408166e-06, + "loss": 3.5814, + "step": 835 + }, + { + "epoch": 0.008544921875, + "grad_norm": 14.874984741210938, + "learning_rate": 4.999299521902026e-06, + "loss": 3.64, + "step": 840 + }, + { + "epoch": 0.008595784505208334, + "grad_norm": 15.312943458557129, + "learning_rate": 4.9992900244519e-06, + "loss": 3.7124, + "step": 845 + }, + { + "epoch": 0.008646647135416666, + "grad_norm": 17.663049697875977, + "learning_rate": 4.999280463058031e-06, + "loss": 3.3959, + "step": 850 + }, + { + "epoch": 0.008697509765625, + "grad_norm": 19.949892044067383, + "learning_rate": 4.999270837720663e-06, + "loss": 3.4157, + "step": 855 + }, + { + "epoch": 0.008748372395833334, + "grad_norm": 14.926891326904297, + "learning_rate": 4.9992611484400444e-06, + "loss": 3.4777, + "step": 860 + }, + { + "epoch": 0.008799235026041666, + "grad_norm": 13.239298820495605, + "learning_rate": 4.999251395216421e-06, + "loss": 3.6932, + "step": 865 + }, + { + "epoch": 0.00885009765625, + "grad_norm": 16.126522064208984, + "learning_rate": 4.999241578050044e-06, + "loss": 3.7468, + "step": 870 + }, + { + "epoch": 0.008900960286458334, + "grad_norm": 26.55894660949707, + "learning_rate": 4.999231696941162e-06, + "loss": 3.2945, + "step": 875 + }, + { + "epoch": 0.008951822916666666, + "grad_norm": 16.377870559692383, + "learning_rate": 4.99922175189003e-06, + "loss": 3.8092, + "step": 880 + }, + { + "epoch": 0.009002685546875, + "grad_norm": 25.377391815185547, + "learning_rate": 4.999211742896902e-06, + "loss": 4.0082, + "step": 885 + }, + { + "epoch": 0.009053548177083334, + "grad_norm": 16.60061264038086, + "learning_rate": 4.999201669962034e-06, + "loss": 3.717, + "step": 890 + }, + { + "epoch": 0.009104410807291666, + "grad_norm": 14.67759895324707, + "learning_rate": 4.999191533085684e-06, + "loss": 3.5591, + "step": 895 + }, + { + "epoch": 0.0091552734375, + "grad_norm": 10.630406379699707, + "learning_rate": 4.9991813322681105e-06, + "loss": 3.6146, + "step": 900 + }, + { + "epoch": 0.009206136067708334, + "grad_norm": 13.745230674743652, + "learning_rate": 4.999171067509575e-06, + "loss": 3.5511, + "step": 905 + }, + { + "epoch": 0.009256998697916666, + "grad_norm": 15.126039505004883, + "learning_rate": 4.99916073881034e-06, + "loss": 3.6083, + "step": 910 + }, + { + "epoch": 0.009307861328125, + "grad_norm": 18.30549430847168, + "learning_rate": 4.99915034617067e-06, + "loss": 3.4733, + "step": 915 + }, + { + "epoch": 0.009358723958333334, + "grad_norm": 12.323296546936035, + "learning_rate": 4.999139889590832e-06, + "loss": 3.6615, + "step": 920 + }, + { + "epoch": 0.009409586588541666, + "grad_norm": 19.943538665771484, + "learning_rate": 4.999129369071091e-06, + "loss": 4.1944, + "step": 925 + }, + { + "epoch": 0.00946044921875, + "grad_norm": 10.738496780395508, + "learning_rate": 4.9991187846117175e-06, + "loss": 3.4904, + "step": 930 + }, + { + "epoch": 0.009511311848958334, + "grad_norm": 11.223962783813477, + "learning_rate": 4.999108136212982e-06, + "loss": 3.6925, + "step": 935 + }, + { + "epoch": 0.009562174479166666, + "grad_norm": 13.026283264160156, + "learning_rate": 4.999097423875158e-06, + "loss": 4.1644, + "step": 940 + }, + { + "epoch": 0.009613037109375, + "grad_norm": 16.166419982910156, + "learning_rate": 4.999086647598518e-06, + "loss": 3.5475, + "step": 945 + }, + { + "epoch": 0.009663899739583334, + "grad_norm": 18.75146484375, + "learning_rate": 4.999075807383339e-06, + "loss": 3.4752, + "step": 950 + }, + { + "epoch": 0.009714762369791666, + "grad_norm": 14.348006248474121, + "learning_rate": 4.999064903229897e-06, + "loss": 3.5193, + "step": 955 + }, + { + "epoch": 0.009765625, + "grad_norm": 13.05683708190918, + "learning_rate": 4.9990539351384725e-06, + "loss": 3.5619, + "step": 960 + }, + { + "epoch": 0.009816487630208334, + "grad_norm": 14.792659759521484, + "learning_rate": 4.999042903109345e-06, + "loss": 3.2695, + "step": 965 + }, + { + "epoch": 0.009867350260416666, + "grad_norm": 11.70416259765625, + "learning_rate": 4.999031807142798e-06, + "loss": 3.7578, + "step": 970 + }, + { + "epoch": 0.009918212890625, + "grad_norm": 10.735535621643066, + "learning_rate": 4.999020647239114e-06, + "loss": 3.8695, + "step": 975 + }, + { + "epoch": 0.009969075520833334, + "grad_norm": 17.912128448486328, + "learning_rate": 4.999009423398579e-06, + "loss": 3.5609, + "step": 980 + }, + { + "epoch": 0.010019938151041666, + "grad_norm": 14.438199043273926, + "learning_rate": 4.99899813562148e-06, + "loss": 3.7282, + "step": 985 + }, + { + "epoch": 0.01007080078125, + "grad_norm": 15.654008865356445, + "learning_rate": 4.9989867839081065e-06, + "loss": 3.8585, + "step": 990 + }, + { + "epoch": 0.010121663411458334, + "grad_norm": 15.130616188049316, + "learning_rate": 4.998975368258749e-06, + "loss": 3.4857, + "step": 995 + }, + { + "epoch": 0.010172526041666666, + "grad_norm": 15.098103523254395, + "learning_rate": 4.998963888673698e-06, + "loss": 3.7006, + "step": 1000 + }, + { + "epoch": 0.010223388671875, + "grad_norm": 14.421039581298828, + "learning_rate": 4.998952345153249e-06, + "loss": 3.7491, + "step": 1005 + }, + { + "epoch": 0.010274251302083334, + "grad_norm": 20.087514877319336, + "learning_rate": 4.998940737697695e-06, + "loss": 3.7776, + "step": 1010 + }, + { + "epoch": 0.010325113932291666, + "grad_norm": 18.88400650024414, + "learning_rate": 4.998929066307336e-06, + "loss": 3.7481, + "step": 1015 + }, + { + "epoch": 0.0103759765625, + "grad_norm": 21.07455062866211, + "learning_rate": 4.998917330982469e-06, + "loss": 3.5294, + "step": 1020 + }, + { + "epoch": 0.010426839192708334, + "grad_norm": 11.424040794372559, + "learning_rate": 4.998905531723394e-06, + "loss": 3.9097, + "step": 1025 + }, + { + "epoch": 0.010477701822916666, + "grad_norm": 10.922247886657715, + "learning_rate": 4.998893668530414e-06, + "loss": 3.5514, + "step": 1030 + }, + { + "epoch": 0.010528564453125, + "grad_norm": 17.195194244384766, + "learning_rate": 4.99888174140383e-06, + "loss": 3.616, + "step": 1035 + }, + { + "epoch": 0.010579427083333334, + "grad_norm": 10.144538879394531, + "learning_rate": 4.998869750343951e-06, + "loss": 3.7874, + "step": 1040 + }, + { + "epoch": 0.010630289713541666, + "grad_norm": 11.742879867553711, + "learning_rate": 4.998857695351081e-06, + "loss": 3.9271, + "step": 1045 + }, + { + "epoch": 0.01068115234375, + "grad_norm": 13.740591049194336, + "learning_rate": 4.998845576425529e-06, + "loss": 3.7697, + "step": 1050 + }, + { + "epoch": 0.010732014973958334, + "grad_norm": 15.526152610778809, + "learning_rate": 4.998833393567605e-06, + "loss": 3.3944, + "step": 1055 + }, + { + "epoch": 0.010782877604166666, + "grad_norm": 14.3406982421875, + "learning_rate": 4.998821146777622e-06, + "loss": 3.8095, + "step": 1060 + }, + { + "epoch": 0.010833740234375, + "grad_norm": 12.40285873413086, + "learning_rate": 4.99880883605589e-06, + "loss": 3.6193, + "step": 1065 + }, + { + "epoch": 0.010884602864583334, + "grad_norm": 20.011245727539062, + "learning_rate": 4.998796461402729e-06, + "loss": 3.8485, + "step": 1070 + }, + { + "epoch": 0.010935465494791666, + "grad_norm": 15.710371017456055, + "learning_rate": 4.998784022818452e-06, + "loss": 3.8256, + "step": 1075 + }, + { + "epoch": 0.010986328125, + "grad_norm": 18.561986923217773, + "learning_rate": 4.998771520303376e-06, + "loss": 3.9108, + "step": 1080 + }, + { + "epoch": 0.011037190755208334, + "grad_norm": 18.281129837036133, + "learning_rate": 4.998758953857825e-06, + "loss": 3.4311, + "step": 1085 + }, + { + "epoch": 0.011088053385416666, + "grad_norm": 14.150856018066406, + "learning_rate": 4.998746323482117e-06, + "loss": 3.5551, + "step": 1090 + }, + { + "epoch": 0.011138916015625, + "grad_norm": 13.725726127624512, + "learning_rate": 4.9987336291765784e-06, + "loss": 3.6229, + "step": 1095 + }, + { + "epoch": 0.011189778645833334, + "grad_norm": 11.532977104187012, + "learning_rate": 4.998720870941531e-06, + "loss": 3.5328, + "step": 1100 + }, + { + "epoch": 0.011240641276041666, + "grad_norm": 15.219062805175781, + "learning_rate": 4.998708048777303e-06, + "loss": 3.7603, + "step": 1105 + }, + { + "epoch": 0.01129150390625, + "grad_norm": 10.167997360229492, + "learning_rate": 4.9986951626842215e-06, + "loss": 4.157, + "step": 1110 + }, + { + "epoch": 0.011342366536458334, + "grad_norm": 25.62779998779297, + "learning_rate": 4.9986822126626165e-06, + "loss": 3.539, + "step": 1115 + }, + { + "epoch": 0.011393229166666666, + "grad_norm": 13.080843925476074, + "learning_rate": 4.998669198712819e-06, + "loss": 3.9684, + "step": 1120 + }, + { + "epoch": 0.011444091796875, + "grad_norm": 13.87924861907959, + "learning_rate": 4.998656120835163e-06, + "loss": 3.6138, + "step": 1125 + }, + { + "epoch": 0.011494954427083334, + "grad_norm": 16.160778045654297, + "learning_rate": 4.998642979029982e-06, + "loss": 3.7762, + "step": 1130 + }, + { + "epoch": 0.011545817057291666, + "grad_norm": 12.456393241882324, + "learning_rate": 4.998629773297613e-06, + "loss": 3.5095, + "step": 1135 + }, + { + "epoch": 0.0115966796875, + "grad_norm": 18.339937210083008, + "learning_rate": 4.998616503638393e-06, + "loss": 3.6277, + "step": 1140 + }, + { + "epoch": 0.011647542317708334, + "grad_norm": 13.26982307434082, + "learning_rate": 4.998603170052662e-06, + "loss": 3.9732, + "step": 1145 + }, + { + "epoch": 0.011698404947916666, + "grad_norm": 12.922799110412598, + "learning_rate": 4.9985897725407616e-06, + "loss": 3.4633, + "step": 1150 + }, + { + "epoch": 0.011749267578125, + "grad_norm": 19.501089096069336, + "learning_rate": 4.998576311103033e-06, + "loss": 3.7619, + "step": 1155 + }, + { + "epoch": 0.011800130208333334, + "grad_norm": 12.718639373779297, + "learning_rate": 4.998562785739823e-06, + "loss": 3.7302, + "step": 1160 + }, + { + "epoch": 0.011850992838541666, + "grad_norm": 10.442389488220215, + "learning_rate": 4.998549196451475e-06, + "loss": 3.5264, + "step": 1165 + }, + { + "epoch": 0.01190185546875, + "grad_norm": 15.280603408813477, + "learning_rate": 4.99853554323834e-06, + "loss": 3.7362, + "step": 1170 + }, + { + "epoch": 0.011952718098958334, + "grad_norm": 14.212106704711914, + "learning_rate": 4.998521826100764e-06, + "loss": 3.7874, + "step": 1175 + }, + { + "epoch": 0.012003580729166666, + "grad_norm": 13.269865989685059, + "learning_rate": 4.998508045039099e-06, + "loss": 3.6369, + "step": 1180 + }, + { + "epoch": 0.012054443359375, + "grad_norm": 12.6091890335083, + "learning_rate": 4.998494200053698e-06, + "loss": 3.6542, + "step": 1185 + }, + { + "epoch": 0.012105305989583334, + "grad_norm": 14.155792236328125, + "learning_rate": 4.998480291144916e-06, + "loss": 4.0782, + "step": 1190 + }, + { + "epoch": 0.012156168619791666, + "grad_norm": 19.589990615844727, + "learning_rate": 4.998466318313108e-06, + "loss": 3.7343, + "step": 1195 + }, + { + "epoch": 0.01220703125, + "grad_norm": 13.791729927062988, + "learning_rate": 4.99845228155863e-06, + "loss": 3.6456, + "step": 1200 + }, + { + "epoch": 0.012257893880208334, + "grad_norm": 13.561478614807129, + "learning_rate": 4.998438180881844e-06, + "loss": 4.0166, + "step": 1205 + }, + { + "epoch": 0.012308756510416666, + "grad_norm": 18.43284034729004, + "learning_rate": 4.998424016283109e-06, + "loss": 3.6783, + "step": 1210 + }, + { + "epoch": 0.012359619140625, + "grad_norm": 19.545591354370117, + "learning_rate": 4.9984097877627865e-06, + "loss": 3.7069, + "step": 1215 + }, + { + "epoch": 0.012410481770833334, + "grad_norm": 16.589338302612305, + "learning_rate": 4.998395495321243e-06, + "loss": 3.5455, + "step": 1220 + }, + { + "epoch": 0.012461344401041666, + "grad_norm": 13.449020385742188, + "learning_rate": 4.998381138958843e-06, + "loss": 3.6977, + "step": 1225 + }, + { + "epoch": 0.01251220703125, + "grad_norm": 17.311525344848633, + "learning_rate": 4.9983667186759535e-06, + "loss": 3.6899, + "step": 1230 + }, + { + "epoch": 0.012563069661458334, + "grad_norm": 15.647948265075684, + "learning_rate": 4.998352234472944e-06, + "loss": 3.1426, + "step": 1235 + }, + { + "epoch": 0.012613932291666666, + "grad_norm": 22.255765914916992, + "learning_rate": 4.998337686350184e-06, + "loss": 3.66, + "step": 1240 + }, + { + "epoch": 0.012664794921875, + "grad_norm": 15.966592788696289, + "learning_rate": 4.998323074308047e-06, + "loss": 3.6801, + "step": 1245 + }, + { + "epoch": 0.012715657552083334, + "grad_norm": 10.439255714416504, + "learning_rate": 4.998308398346906e-06, + "loss": 3.6352, + "step": 1250 + }, + { + "epoch": 0.012766520182291666, + "grad_norm": 14.009291648864746, + "learning_rate": 4.998293658467137e-06, + "loss": 3.5148, + "step": 1255 + }, + { + "epoch": 0.0128173828125, + "grad_norm": 9.830448150634766, + "learning_rate": 4.998278854669117e-06, + "loss": 3.405, + "step": 1260 + }, + { + "epoch": 0.012868245442708334, + "grad_norm": 18.137758255004883, + "learning_rate": 4.998263986953224e-06, + "loss": 3.2475, + "step": 1265 + }, + { + "epoch": 0.012919108072916666, + "grad_norm": 13.296918869018555, + "learning_rate": 4.99824905531984e-06, + "loss": 3.5983, + "step": 1270 + }, + { + "epoch": 0.012969970703125, + "grad_norm": 17.331226348876953, + "learning_rate": 4.9982340597693455e-06, + "loss": 3.4618, + "step": 1275 + }, + { + "epoch": 0.013020833333333334, + "grad_norm": 16.383100509643555, + "learning_rate": 4.998219000302125e-06, + "loss": 3.5292, + "step": 1280 + }, + { + "epoch": 0.013071695963541666, + "grad_norm": 10.87190055847168, + "learning_rate": 4.998203876918564e-06, + "loss": 3.6293, + "step": 1285 + }, + { + "epoch": 0.01312255859375, + "grad_norm": 12.597033500671387, + "learning_rate": 4.998188689619048e-06, + "loss": 3.7316, + "step": 1290 + }, + { + "epoch": 0.013173421223958334, + "grad_norm": 12.681578636169434, + "learning_rate": 4.998173438403966e-06, + "loss": 3.478, + "step": 1295 + }, + { + "epoch": 0.013224283854166666, + "grad_norm": 16.530866622924805, + "learning_rate": 4.99815812327371e-06, + "loss": 3.7343, + "step": 1300 + }, + { + "epoch": 0.013275146484375, + "grad_norm": 17.380083084106445, + "learning_rate": 4.99814274422867e-06, + "loss": 3.5332, + "step": 1305 + }, + { + "epoch": 0.013326009114583334, + "grad_norm": 15.711562156677246, + "learning_rate": 4.998127301269241e-06, + "loss": 3.4281, + "step": 1310 + }, + { + "epoch": 0.013376871744791666, + "grad_norm": 13.467732429504395, + "learning_rate": 4.998111794395816e-06, + "loss": 3.588, + "step": 1315 + }, + { + "epoch": 0.013427734375, + "grad_norm": 16.989633560180664, + "learning_rate": 4.998096223608792e-06, + "loss": 3.304, + "step": 1320 + }, + { + "epoch": 0.013478597005208334, + "grad_norm": 12.060418128967285, + "learning_rate": 4.998080588908571e-06, + "loss": 3.121, + "step": 1325 + }, + { + "epoch": 0.013529459635416666, + "grad_norm": 15.241015434265137, + "learning_rate": 4.9980648902955475e-06, + "loss": 3.6774, + "step": 1330 + }, + { + "epoch": 0.013580322265625, + "grad_norm": 15.760343551635742, + "learning_rate": 4.998049127770127e-06, + "loss": 3.2477, + "step": 1335 + }, + { + "epoch": 0.013631184895833334, + "grad_norm": 15.907750129699707, + "learning_rate": 4.998033301332712e-06, + "loss": 3.3698, + "step": 1340 + }, + { + "epoch": 0.013682047526041666, + "grad_norm": 17.1384334564209, + "learning_rate": 4.9980174109837065e-06, + "loss": 3.7272, + "step": 1345 + }, + { + "epoch": 0.01373291015625, + "grad_norm": 16.641490936279297, + "learning_rate": 4.998001456723518e-06, + "loss": 3.6683, + "step": 1350 + }, + { + "epoch": 0.013783772786458334, + "grad_norm": 8.19157600402832, + "learning_rate": 4.997985438552554e-06, + "loss": 3.6485, + "step": 1355 + }, + { + "epoch": 0.013834635416666666, + "grad_norm": 13.124740600585938, + "learning_rate": 4.997969356471225e-06, + "loss": 3.6489, + "step": 1360 + }, + { + "epoch": 0.013885498046875, + "grad_norm": 11.489811897277832, + "learning_rate": 4.997953210479941e-06, + "loss": 3.9778, + "step": 1365 + }, + { + "epoch": 0.013936360677083334, + "grad_norm": 18.886642456054688, + "learning_rate": 4.997937000579118e-06, + "loss": 3.7343, + "step": 1370 + }, + { + "epoch": 0.013987223307291666, + "grad_norm": 9.009153366088867, + "learning_rate": 4.997920726769168e-06, + "loss": 3.5347, + "step": 1375 + }, + { + "epoch": 0.0140380859375, + "grad_norm": 14.774568557739258, + "learning_rate": 4.997904389050508e-06, + "loss": 3.7011, + "step": 1380 + }, + { + "epoch": 0.014088948567708334, + "grad_norm": 26.565807342529297, + "learning_rate": 4.997887987423556e-06, + "loss": 4.026, + "step": 1385 + }, + { + "epoch": 0.014139811197916666, + "grad_norm": 13.156881332397461, + "learning_rate": 4.997871521888733e-06, + "loss": 3.645, + "step": 1390 + }, + { + "epoch": 0.014190673828125, + "grad_norm": 13.961007118225098, + "learning_rate": 4.9978549924464595e-06, + "loss": 3.638, + "step": 1395 + }, + { + "epoch": 0.014241536458333334, + "grad_norm": 12.008218765258789, + "learning_rate": 4.997838399097157e-06, + "loss": 3.7365, + "step": 1400 + }, + { + "epoch": 0.014292399088541666, + "grad_norm": 16.033838272094727, + "learning_rate": 4.997821741841251e-06, + "loss": 3.8982, + "step": 1405 + }, + { + "epoch": 0.01434326171875, + "grad_norm": 10.620261192321777, + "learning_rate": 4.997805020679169e-06, + "loss": 3.5443, + "step": 1410 + }, + { + "epoch": 0.014394124348958334, + "grad_norm": 13.818375587463379, + "learning_rate": 4.997788235611336e-06, + "loss": 3.8064, + "step": 1415 + }, + { + "epoch": 0.014444986979166666, + "grad_norm": 14.587963104248047, + "learning_rate": 4.997771386638184e-06, + "loss": 3.599, + "step": 1420 + }, + { + "epoch": 0.014495849609375, + "grad_norm": 11.063785552978516, + "learning_rate": 4.997754473760143e-06, + "loss": 3.6159, + "step": 1425 + }, + { + "epoch": 0.014546712239583334, + "grad_norm": 11.75071907043457, + "learning_rate": 4.997737496977645e-06, + "loss": 4.4565, + "step": 1430 + }, + { + "epoch": 0.014597574869791666, + "grad_norm": 14.461017608642578, + "learning_rate": 4.997720456291126e-06, + "loss": 3.7098, + "step": 1435 + }, + { + "epoch": 0.0146484375, + "grad_norm": 11.124221801757812, + "learning_rate": 4.997703351701021e-06, + "loss": 3.5637, + "step": 1440 + }, + { + "epoch": 0.014699300130208334, + "grad_norm": 13.678889274597168, + "learning_rate": 4.997686183207767e-06, + "loss": 3.5194, + "step": 1445 + }, + { + "epoch": 0.014750162760416666, + "grad_norm": 11.239283561706543, + "learning_rate": 4.9976689508118055e-06, + "loss": 3.6568, + "step": 1450 + }, + { + "epoch": 0.014801025390625, + "grad_norm": 12.112907409667969, + "learning_rate": 4.997651654513575e-06, + "loss": 3.6608, + "step": 1455 + }, + { + "epoch": 0.014851888020833334, + "grad_norm": 13.270151138305664, + "learning_rate": 4.997634294313519e-06, + "loss": 3.5079, + "step": 1460 + }, + { + "epoch": 0.014902750651041666, + "grad_norm": 13.076834678649902, + "learning_rate": 4.997616870212082e-06, + "loss": 3.3748, + "step": 1465 + }, + { + "epoch": 0.01495361328125, + "grad_norm": 10.513318061828613, + "learning_rate": 4.997599382209709e-06, + "loss": 3.6342, + "step": 1470 + }, + { + "epoch": 0.015004475911458334, + "grad_norm": 18.05800437927246, + "learning_rate": 4.997581830306848e-06, + "loss": 3.6844, + "step": 1475 + }, + { + "epoch": 0.015055338541666666, + "grad_norm": 10.047406196594238, + "learning_rate": 4.997564214503947e-06, + "loss": 4.0638, + "step": 1480 + }, + { + "epoch": 0.015106201171875, + "grad_norm": 15.92277717590332, + "learning_rate": 4.997546534801459e-06, + "loss": 3.7102, + "step": 1485 + }, + { + "epoch": 0.015157063802083334, + "grad_norm": 15.660829544067383, + "learning_rate": 4.997528791199834e-06, + "loss": 4.0566, + "step": 1490 + }, + { + "epoch": 0.015207926432291666, + "grad_norm": 21.788442611694336, + "learning_rate": 4.997510983699527e-06, + "loss": 4.1864, + "step": 1495 + }, + { + "epoch": 0.0152587890625, + "grad_norm": 10.556007385253906, + "learning_rate": 4.997493112300994e-06, + "loss": 3.621, + "step": 1500 + }, + { + "epoch": 0.015309651692708334, + "grad_norm": 13.028473854064941, + "learning_rate": 4.99747517700469e-06, + "loss": 4.406, + "step": 1505 + }, + { + "epoch": 0.015360514322916666, + "grad_norm": 14.941851615905762, + "learning_rate": 4.997457177811077e-06, + "loss": 3.1199, + "step": 1510 + }, + { + "epoch": 0.015411376953125, + "grad_norm": 18.102357864379883, + "learning_rate": 4.997439114720614e-06, + "loss": 3.7058, + "step": 1515 + }, + { + "epoch": 0.015462239583333334, + "grad_norm": 22.9876651763916, + "learning_rate": 4.997420987733763e-06, + "loss": 3.5268, + "step": 1520 + }, + { + "epoch": 0.015513102213541666, + "grad_norm": 11.794015884399414, + "learning_rate": 4.997402796850989e-06, + "loss": 3.6392, + "step": 1525 + }, + { + "epoch": 0.01556396484375, + "grad_norm": 11.62551212310791, + "learning_rate": 4.997384542072755e-06, + "loss": 3.4675, + "step": 1530 + }, + { + "epoch": 0.015614827473958334, + "grad_norm": 14.436004638671875, + "learning_rate": 4.99736622339953e-06, + "loss": 3.657, + "step": 1535 + }, + { + "epoch": 0.015665690104166668, + "grad_norm": 13.236321449279785, + "learning_rate": 4.997347840831782e-06, + "loss": 3.2599, + "step": 1540 + }, + { + "epoch": 0.015716552734375, + "grad_norm": 11.505661010742188, + "learning_rate": 4.997329394369981e-06, + "loss": 3.6238, + "step": 1545 + }, + { + "epoch": 0.015767415364583332, + "grad_norm": 14.669344902038574, + "learning_rate": 4.997310884014599e-06, + "loss": 3.4919, + "step": 1550 + }, + { + "epoch": 0.015818277994791668, + "grad_norm": 9.981671333312988, + "learning_rate": 4.997292309766111e-06, + "loss": 3.4323, + "step": 1555 + }, + { + "epoch": 0.015869140625, + "grad_norm": 13.35000228881836, + "learning_rate": 4.997273671624991e-06, + "loss": 3.5778, + "step": 1560 + }, + { + "epoch": 0.015920003255208332, + "grad_norm": 13.411084175109863, + "learning_rate": 4.997254969591716e-06, + "loss": 3.8477, + "step": 1565 + }, + { + "epoch": 0.015970865885416668, + "grad_norm": 13.634716033935547, + "learning_rate": 4.997236203666764e-06, + "loss": 3.4397, + "step": 1570 + }, + { + "epoch": 0.016021728515625, + "grad_norm": 9.219624519348145, + "learning_rate": 4.997217373850617e-06, + "loss": 3.5114, + "step": 1575 + }, + { + "epoch": 0.016072591145833332, + "grad_norm": 12.118866920471191, + "learning_rate": 4.997198480143755e-06, + "loss": 3.7325, + "step": 1580 + }, + { + "epoch": 0.016123453776041668, + "grad_norm": 12.5546875, + "learning_rate": 4.99717952254666e-06, + "loss": 3.6924, + "step": 1585 + }, + { + "epoch": 0.01617431640625, + "grad_norm": 15.333346366882324, + "learning_rate": 4.99716050105982e-06, + "loss": 3.4589, + "step": 1590 + }, + { + "epoch": 0.016225179036458332, + "grad_norm": 11.861115455627441, + "learning_rate": 4.997141415683721e-06, + "loss": 3.7009, + "step": 1595 + }, + { + "epoch": 0.016276041666666668, + "grad_norm": 15.62850570678711, + "learning_rate": 4.99712226641885e-06, + "loss": 4.2749, + "step": 1600 + }, + { + "epoch": 0.016326904296875, + "grad_norm": 16.968338012695312, + "learning_rate": 4.997103053265698e-06, + "loss": 3.458, + "step": 1605 + }, + { + "epoch": 0.016377766927083332, + "grad_norm": 15.927022933959961, + "learning_rate": 4.997083776224757e-06, + "loss": 3.5739, + "step": 1610 + }, + { + "epoch": 0.016428629557291668, + "grad_norm": 12.973907470703125, + "learning_rate": 4.997064435296518e-06, + "loss": 3.5018, + "step": 1615 + }, + { + "epoch": 0.0164794921875, + "grad_norm": 16.26294708251953, + "learning_rate": 4.997045030481478e-06, + "loss": 3.8331, + "step": 1620 + }, + { + "epoch": 0.016530354817708332, + "grad_norm": 14.90145492553711, + "learning_rate": 4.997025561780133e-06, + "loss": 3.7663, + "step": 1625 + }, + { + "epoch": 0.016581217447916668, + "grad_norm": 10.96834945678711, + "learning_rate": 4.9970060291929816e-06, + "loss": 3.2283, + "step": 1630 + }, + { + "epoch": 0.016632080078125, + "grad_norm": 14.805373191833496, + "learning_rate": 4.996986432720521e-06, + "loss": 3.5404, + "step": 1635 + }, + { + "epoch": 0.016682942708333332, + "grad_norm": 14.054951667785645, + "learning_rate": 4.996966772363255e-06, + "loss": 3.7508, + "step": 1640 + }, + { + "epoch": 0.016733805338541668, + "grad_norm": 18.448801040649414, + "learning_rate": 4.996947048121686e-06, + "loss": 3.1579, + "step": 1645 + }, + { + "epoch": 0.01678466796875, + "grad_norm": 11.387920379638672, + "learning_rate": 4.996927259996319e-06, + "loss": 3.1547, + "step": 1650 + }, + { + "epoch": 0.016835530598958332, + "grad_norm": 17.80491828918457, + "learning_rate": 4.99690740798766e-06, + "loss": 3.5202, + "step": 1655 + }, + { + "epoch": 0.016886393229166668, + "grad_norm": 15.007222175598145, + "learning_rate": 4.9968874920962165e-06, + "loss": 3.6235, + "step": 1660 + }, + { + "epoch": 0.016937255859375, + "grad_norm": 10.149649620056152, + "learning_rate": 4.996867512322499e-06, + "loss": 3.6799, + "step": 1665 + }, + { + "epoch": 0.016988118489583332, + "grad_norm": 17.945354461669922, + "learning_rate": 4.996847468667016e-06, + "loss": 3.9798, + "step": 1670 + }, + { + "epoch": 0.017038981119791668, + "grad_norm": 15.979263305664062, + "learning_rate": 4.9968273611302845e-06, + "loss": 3.3488, + "step": 1675 + }, + { + "epoch": 0.01708984375, + "grad_norm": 14.131532669067383, + "learning_rate": 4.996807189712815e-06, + "loss": 3.3508, + "step": 1680 + }, + { + "epoch": 0.017140706380208332, + "grad_norm": 19.903356552124023, + "learning_rate": 4.996786954415127e-06, + "loss": 3.5976, + "step": 1685 + }, + { + "epoch": 0.017191569010416668, + "grad_norm": 13.6408052444458, + "learning_rate": 4.996766655237736e-06, + "loss": 3.4822, + "step": 1690 + }, + { + "epoch": 0.017242431640625, + "grad_norm": 9.797231674194336, + "learning_rate": 4.9967462921811614e-06, + "loss": 3.9847, + "step": 1695 + }, + { + "epoch": 0.017293294270833332, + "grad_norm": 13.015440940856934, + "learning_rate": 4.996725865245926e-06, + "loss": 3.5052, + "step": 1700 + }, + { + "epoch": 0.017344156901041668, + "grad_norm": 17.70994758605957, + "learning_rate": 4.99670537443255e-06, + "loss": 3.9466, + "step": 1705 + }, + { + "epoch": 0.01739501953125, + "grad_norm": 9.877345085144043, + "learning_rate": 4.996684819741559e-06, + "loss": 3.6346, + "step": 1710 + }, + { + "epoch": 0.017445882161458332, + "grad_norm": 12.302515983581543, + "learning_rate": 4.996664201173478e-06, + "loss": 3.3808, + "step": 1715 + }, + { + "epoch": 0.017496744791666668, + "grad_norm": 12.325230598449707, + "learning_rate": 4.9966435187288365e-06, + "loss": 3.8128, + "step": 1720 + }, + { + "epoch": 0.017547607421875, + "grad_norm": 14.4461669921875, + "learning_rate": 4.996622772408162e-06, + "loss": 4.3367, + "step": 1725 + }, + { + "epoch": 0.017598470052083332, + "grad_norm": 15.207497596740723, + "learning_rate": 4.996601962211985e-06, + "loss": 3.5895, + "step": 1730 + }, + { + "epoch": 0.017649332682291668, + "grad_norm": 10.924785614013672, + "learning_rate": 4.9965810881408384e-06, + "loss": 3.3813, + "step": 1735 + }, + { + "epoch": 0.0177001953125, + "grad_norm": 19.209896087646484, + "learning_rate": 4.996560150195257e-06, + "loss": 3.4489, + "step": 1740 + }, + { + "epoch": 0.017751057942708332, + "grad_norm": 14.56915283203125, + "learning_rate": 4.9965391483757765e-06, + "loss": 3.4132, + "step": 1745 + }, + { + "epoch": 0.017801920572916668, + "grad_norm": 9.958040237426758, + "learning_rate": 4.996518082682933e-06, + "loss": 3.5184, + "step": 1750 + }, + { + "epoch": 0.017852783203125, + "grad_norm": 14.741584777832031, + "learning_rate": 4.9964969531172656e-06, + "loss": 3.7564, + "step": 1755 + }, + { + "epoch": 0.017903645833333332, + "grad_norm": 12.061718940734863, + "learning_rate": 4.996475759679316e-06, + "loss": 3.5494, + "step": 1760 + }, + { + "epoch": 0.017954508463541668, + "grad_norm": 19.455101013183594, + "learning_rate": 4.9964545023696255e-06, + "loss": 3.5547, + "step": 1765 + }, + { + "epoch": 0.01800537109375, + "grad_norm": 15.922080993652344, + "learning_rate": 4.996433181188739e-06, + "loss": 3.6292, + "step": 1770 + }, + { + "epoch": 0.018056233723958332, + "grad_norm": 10.75054931640625, + "learning_rate": 4.996411796137201e-06, + "loss": 3.3277, + "step": 1775 + }, + { + "epoch": 0.018107096354166668, + "grad_norm": 14.0524263381958, + "learning_rate": 4.996390347215558e-06, + "loss": 3.671, + "step": 1780 + }, + { + "epoch": 0.018157958984375, + "grad_norm": 17.705394744873047, + "learning_rate": 4.9963688344243605e-06, + "loss": 3.4854, + "step": 1785 + }, + { + "epoch": 0.018208821614583332, + "grad_norm": 14.042203903198242, + "learning_rate": 4.996347257764158e-06, + "loss": 3.3586, + "step": 1790 + }, + { + "epoch": 0.018259684244791668, + "grad_norm": 15.437379837036133, + "learning_rate": 4.996325617235502e-06, + "loss": 3.1949, + "step": 1795 + }, + { + "epoch": 0.018310546875, + "grad_norm": 14.990214347839355, + "learning_rate": 4.996303912838948e-06, + "loss": 3.2657, + "step": 1800 + }, + { + "epoch": 0.018361409505208332, + "grad_norm": 20.377416610717773, + "learning_rate": 4.9962821445750485e-06, + "loss": 3.5048, + "step": 1805 + }, + { + "epoch": 0.018412272135416668, + "grad_norm": 11.091985702514648, + "learning_rate": 4.996260312444363e-06, + "loss": 3.5013, + "step": 1810 + }, + { + "epoch": 0.018463134765625, + "grad_norm": 17.564790725708008, + "learning_rate": 4.9962384164474495e-06, + "loss": 3.9346, + "step": 1815 + }, + { + "epoch": 0.018513997395833332, + "grad_norm": 11.695920944213867, + "learning_rate": 4.996216456584867e-06, + "loss": 3.7216, + "step": 1820 + }, + { + "epoch": 0.018564860026041668, + "grad_norm": 12.64533519744873, + "learning_rate": 4.9961944328571785e-06, + "loss": 3.4994, + "step": 1825 + }, + { + "epoch": 0.01861572265625, + "grad_norm": 11.813600540161133, + "learning_rate": 4.9961723452649465e-06, + "loss": 3.3742, + "step": 1830 + }, + { + "epoch": 0.018666585286458332, + "grad_norm": 13.247116088867188, + "learning_rate": 4.9961501938087375e-06, + "loss": 3.7691, + "step": 1835 + }, + { + "epoch": 0.018717447916666668, + "grad_norm": 10.45460319519043, + "learning_rate": 4.996127978489117e-06, + "loss": 3.6143, + "step": 1840 + }, + { + "epoch": 0.018768310546875, + "grad_norm": 19.198963165283203, + "learning_rate": 4.996105699306654e-06, + "loss": 3.4559, + "step": 1845 + }, + { + "epoch": 0.018819173177083332, + "grad_norm": 15.408687591552734, + "learning_rate": 4.996083356261918e-06, + "loss": 3.6425, + "step": 1850 + }, + { + "epoch": 0.018870035807291668, + "grad_norm": 16.426103591918945, + "learning_rate": 4.996060949355481e-06, + "loss": 3.6617, + "step": 1855 + }, + { + "epoch": 0.0189208984375, + "grad_norm": 9.62375259399414, + "learning_rate": 4.996038478587916e-06, + "loss": 4.2817, + "step": 1860 + }, + { + "epoch": 0.018971761067708332, + "grad_norm": 13.927205085754395, + "learning_rate": 4.9960159439598e-06, + "loss": 3.1512, + "step": 1865 + }, + { + "epoch": 0.019022623697916668, + "grad_norm": 16.292879104614258, + "learning_rate": 4.995993345471706e-06, + "loss": 3.7999, + "step": 1870 + }, + { + "epoch": 0.019073486328125, + "grad_norm": 16.1326904296875, + "learning_rate": 4.995970683124214e-06, + "loss": 3.0987, + "step": 1875 + }, + { + "epoch": 0.019124348958333332, + "grad_norm": 17.09796142578125, + "learning_rate": 4.995947956917904e-06, + "loss": 3.6616, + "step": 1880 + }, + { + "epoch": 0.019175211588541668, + "grad_norm": 8.162797927856445, + "learning_rate": 4.995925166853357e-06, + "loss": 3.5208, + "step": 1885 + }, + { + "epoch": 0.01922607421875, + "grad_norm": 35.92325210571289, + "learning_rate": 4.995902312931156e-06, + "loss": 3.819, + "step": 1890 + }, + { + "epoch": 0.019276936848958332, + "grad_norm": 15.114096641540527, + "learning_rate": 4.995879395151886e-06, + "loss": 3.5895, + "step": 1895 + }, + { + "epoch": 0.019327799479166668, + "grad_norm": 18.872722625732422, + "learning_rate": 4.995856413516134e-06, + "loss": 3.731, + "step": 1900 + }, + { + "epoch": 0.019378662109375, + "grad_norm": 15.703900337219238, + "learning_rate": 4.9958333680244865e-06, + "loss": 3.8427, + "step": 1905 + }, + { + "epoch": 0.019429524739583332, + "grad_norm": 14.129450798034668, + "learning_rate": 4.9958102586775334e-06, + "loss": 3.6527, + "step": 1910 + }, + { + "epoch": 0.019480387369791668, + "grad_norm": 11.964359283447266, + "learning_rate": 4.9957870854758675e-06, + "loss": 3.3432, + "step": 1915 + }, + { + "epoch": 0.01953125, + "grad_norm": 19.071924209594727, + "learning_rate": 4.99576384842008e-06, + "loss": 3.7882, + "step": 1920 + }, + { + "epoch": 0.019582112630208332, + "grad_norm": 14.704536437988281, + "learning_rate": 4.995740547510766e-06, + "loss": 3.2511, + "step": 1925 + }, + { + "epoch": 0.019632975260416668, + "grad_norm": 20.858272552490234, + "learning_rate": 4.9957171827485215e-06, + "loss": 4.266, + "step": 1930 + }, + { + "epoch": 0.019683837890625, + "grad_norm": 12.940155982971191, + "learning_rate": 4.995693754133944e-06, + "loss": 3.7349, + "step": 1935 + }, + { + "epoch": 0.019734700520833332, + "grad_norm": 17.16631507873535, + "learning_rate": 4.995670261667635e-06, + "loss": 3.6617, + "step": 1940 + }, + { + "epoch": 0.019785563151041668, + "grad_norm": 12.754393577575684, + "learning_rate": 4.995646705350193e-06, + "loss": 3.6483, + "step": 1945 + }, + { + "epoch": 0.01983642578125, + "grad_norm": 12.014126777648926, + "learning_rate": 4.995623085182221e-06, + "loss": 3.4977, + "step": 1950 + }, + { + "epoch": 0.019887288411458332, + "grad_norm": 16.564720153808594, + "learning_rate": 4.995599401164325e-06, + "loss": 3.7256, + "step": 1955 + }, + { + "epoch": 0.019938151041666668, + "grad_norm": 11.473775863647461, + "learning_rate": 4.995575653297109e-06, + "loss": 3.416, + "step": 1960 + }, + { + "epoch": 0.019989013671875, + "grad_norm": 14.353362083435059, + "learning_rate": 4.995551841581181e-06, + "loss": 3.4641, + "step": 1965 + }, + { + "epoch": 0.020039876302083332, + "grad_norm": 16.31317901611328, + "learning_rate": 4.9955279660171514e-06, + "loss": 3.7767, + "step": 1970 + }, + { + "epoch": 0.020090738932291668, + "grad_norm": 8.309894561767578, + "learning_rate": 4.99550402660563e-06, + "loss": 3.6317, + "step": 1975 + }, + { + "epoch": 0.0201416015625, + "grad_norm": 18.78270149230957, + "learning_rate": 4.99548002334723e-06, + "loss": 3.2542, + "step": 1980 + }, + { + "epoch": 0.020192464192708332, + "grad_norm": 10.488119125366211, + "learning_rate": 4.9954559562425654e-06, + "loss": 3.3925, + "step": 1985 + }, + { + "epoch": 0.020243326822916668, + "grad_norm": 19.56174087524414, + "learning_rate": 4.99543182529225e-06, + "loss": 3.4966, + "step": 1990 + }, + { + "epoch": 0.020294189453125, + "grad_norm": 11.951705932617188, + "learning_rate": 4.995407630496905e-06, + "loss": 3.6632, + "step": 1995 + }, + { + "epoch": 0.020345052083333332, + "grad_norm": 15.856471061706543, + "learning_rate": 4.995383371857145e-06, + "loss": 3.3239, + "step": 2000 + }, + { + "epoch": 0.020395914713541668, + "grad_norm": 12.084221839904785, + "learning_rate": 4.9953590493735945e-06, + "loss": 3.5615, + "step": 2005 + }, + { + "epoch": 0.02044677734375, + "grad_norm": 27.98715591430664, + "learning_rate": 4.995334663046874e-06, + "loss": 3.6218, + "step": 2010 + }, + { + "epoch": 0.020497639973958332, + "grad_norm": 9.576325416564941, + "learning_rate": 4.995310212877608e-06, + "loss": 3.4569, + "step": 2015 + }, + { + "epoch": 0.020548502604166668, + "grad_norm": 13.834068298339844, + "learning_rate": 4.9952856988664205e-06, + "loss": 3.2584, + "step": 2020 + }, + { + "epoch": 0.020599365234375, + "grad_norm": 17.03768539428711, + "learning_rate": 4.99526112101394e-06, + "loss": 3.7883, + "step": 2025 + }, + { + "epoch": 0.020650227864583332, + "grad_norm": 11.945119857788086, + "learning_rate": 4.995236479320796e-06, + "loss": 3.3385, + "step": 2030 + }, + { + "epoch": 0.020701090494791668, + "grad_norm": 13.012072563171387, + "learning_rate": 4.995211773787617e-06, + "loss": 3.5075, + "step": 2035 + }, + { + "epoch": 0.020751953125, + "grad_norm": 11.229742050170898, + "learning_rate": 4.995187004415038e-06, + "loss": 3.1753, + "step": 2040 + }, + { + "epoch": 0.020802815755208332, + "grad_norm": 15.254374504089355, + "learning_rate": 4.995162171203689e-06, + "loss": 3.4893, + "step": 2045 + }, + { + "epoch": 0.020853678385416668, + "grad_norm": 12.569649696350098, + "learning_rate": 4.9951372741542084e-06, + "loss": 3.7053, + "step": 2050 + }, + { + "epoch": 0.020904541015625, + "grad_norm": 13.927783966064453, + "learning_rate": 4.995112313267231e-06, + "loss": 3.43, + "step": 2055 + }, + { + "epoch": 0.020955403645833332, + "grad_norm": 10.135937690734863, + "learning_rate": 4.995087288543397e-06, + "loss": 3.3201, + "step": 2060 + }, + { + "epoch": 0.021006266276041668, + "grad_norm": 17.4373722076416, + "learning_rate": 4.995062199983346e-06, + "loss": 3.9556, + "step": 2065 + }, + { + "epoch": 0.02105712890625, + "grad_norm": 8.249671936035156, + "learning_rate": 4.9950370475877204e-06, + "loss": 3.4842, + "step": 2070 + }, + { + "epoch": 0.021107991536458332, + "grad_norm": 7.946985244750977, + "learning_rate": 4.995011831357164e-06, + "loss": 3.7847, + "step": 2075 + }, + { + "epoch": 0.021158854166666668, + "grad_norm": 14.219249725341797, + "learning_rate": 4.99498655129232e-06, + "loss": 3.6362, + "step": 2080 + }, + { + "epoch": 0.021209716796875, + "grad_norm": 16.812952041625977, + "learning_rate": 4.994961207393837e-06, + "loss": 3.7761, + "step": 2085 + }, + { + "epoch": 0.021260579427083332, + "grad_norm": 12.385375022888184, + "learning_rate": 4.994935799662363e-06, + "loss": 3.5996, + "step": 2090 + }, + { + "epoch": 0.021311442057291668, + "grad_norm": 8.916481971740723, + "learning_rate": 4.994910328098548e-06, + "loss": 3.6023, + "step": 2095 + }, + { + "epoch": 0.0213623046875, + "grad_norm": 14.721908569335938, + "learning_rate": 4.994884792703043e-06, + "loss": 3.3011, + "step": 2100 + }, + { + "epoch": 0.021413167317708332, + "grad_norm": 15.370878219604492, + "learning_rate": 4.9948591934765025e-06, + "loss": 3.5518, + "step": 2105 + }, + { + "epoch": 0.021464029947916668, + "grad_norm": 14.788433074951172, + "learning_rate": 4.994833530419581e-06, + "loss": 3.5008, + "step": 2110 + }, + { + "epoch": 0.021514892578125, + "grad_norm": 12.971417427062988, + "learning_rate": 4.994807803532934e-06, + "loss": 3.5467, + "step": 2115 + }, + { + "epoch": 0.021565755208333332, + "grad_norm": 14.517382621765137, + "learning_rate": 4.994782012817221e-06, + "loss": 3.6258, + "step": 2120 + }, + { + "epoch": 0.021616617838541668, + "grad_norm": 10.307328224182129, + "learning_rate": 4.994756158273102e-06, + "loss": 4.0953, + "step": 2125 + }, + { + "epoch": 0.02166748046875, + "grad_norm": 15.38490104675293, + "learning_rate": 4.994730239901238e-06, + "loss": 4.2029, + "step": 2130 + }, + { + "epoch": 0.021718343098958332, + "grad_norm": 12.546719551086426, + "learning_rate": 4.994704257702292e-06, + "loss": 3.4173, + "step": 2135 + }, + { + "epoch": 0.021769205729166668, + "grad_norm": 11.977869033813477, + "learning_rate": 4.994678211676929e-06, + "loss": 3.4095, + "step": 2140 + }, + { + "epoch": 0.021820068359375, + "grad_norm": 10.538824081420898, + "learning_rate": 4.994652101825815e-06, + "loss": 3.607, + "step": 2145 + }, + { + "epoch": 0.021870930989583332, + "grad_norm": 18.2629337310791, + "learning_rate": 4.994625928149619e-06, + "loss": 3.9144, + "step": 2150 + }, + { + "epoch": 0.021921793619791668, + "grad_norm": 12.87021255493164, + "learning_rate": 4.994599690649009e-06, + "loss": 3.9643, + "step": 2155 + }, + { + "epoch": 0.02197265625, + "grad_norm": 16.46050453186035, + "learning_rate": 4.994573389324657e-06, + "loss": 3.7189, + "step": 2160 + }, + { + "epoch": 0.022023518880208332, + "grad_norm": 14.574606895446777, + "learning_rate": 4.994547024177236e-06, + "loss": 3.906, + "step": 2165 + }, + { + "epoch": 0.022074381510416668, + "grad_norm": 10.178772926330566, + "learning_rate": 4.994520595207422e-06, + "loss": 3.7685, + "step": 2170 + }, + { + "epoch": 0.022125244140625, + "grad_norm": 16.5206298828125, + "learning_rate": 4.994494102415889e-06, + "loss": 3.6191, + "step": 2175 + }, + { + "epoch": 0.022176106770833332, + "grad_norm": 13.858648300170898, + "learning_rate": 4.9944675458033156e-06, + "loss": 3.5425, + "step": 2180 + }, + { + "epoch": 0.022226969401041668, + "grad_norm": 15.840536117553711, + "learning_rate": 4.994440925370382e-06, + "loss": 3.5171, + "step": 2185 + }, + { + "epoch": 0.02227783203125, + "grad_norm": 13.754470825195312, + "learning_rate": 4.9944142411177675e-06, + "loss": 3.3044, + "step": 2190 + }, + { + "epoch": 0.022328694661458332, + "grad_norm": 13.027044296264648, + "learning_rate": 4.994387493046157e-06, + "loss": 3.3715, + "step": 2195 + }, + { + "epoch": 0.022379557291666668, + "grad_norm": 10.115904808044434, + "learning_rate": 4.994360681156233e-06, + "loss": 3.8424, + "step": 2200 + }, + { + "epoch": 0.022430419921875, + "grad_norm": 16.806591033935547, + "learning_rate": 4.994333805448682e-06, + "loss": 4.0637, + "step": 2205 + }, + { + "epoch": 0.022481282552083332, + "grad_norm": 9.8330078125, + "learning_rate": 4.994306865924192e-06, + "loss": 3.383, + "step": 2210 + }, + { + "epoch": 0.022532145182291668, + "grad_norm": 13.583646774291992, + "learning_rate": 4.994279862583453e-06, + "loss": 3.3756, + "step": 2215 + }, + { + "epoch": 0.0225830078125, + "grad_norm": 12.496264457702637, + "learning_rate": 4.994252795427153e-06, + "loss": 3.5715, + "step": 2220 + }, + { + "epoch": 0.022633870442708332, + "grad_norm": 9.020492553710938, + "learning_rate": 4.994225664455989e-06, + "loss": 3.3013, + "step": 2225 + }, + { + "epoch": 0.022684733072916668, + "grad_norm": 14.377449989318848, + "learning_rate": 4.99419846967065e-06, + "loss": 3.3859, + "step": 2230 + }, + { + "epoch": 0.022735595703125, + "grad_norm": 13.713812828063965, + "learning_rate": 4.994171211071836e-06, + "loss": 3.3353, + "step": 2235 + }, + { + "epoch": 0.022786458333333332, + "grad_norm": 18.0495548248291, + "learning_rate": 4.994143888660242e-06, + "loss": 3.5454, + "step": 2240 + }, + { + "epoch": 0.022837320963541668, + "grad_norm": 13.879693031311035, + "learning_rate": 4.994116502436568e-06, + "loss": 3.9231, + "step": 2245 + }, + { + "epoch": 0.02288818359375, + "grad_norm": 14.874258041381836, + "learning_rate": 4.994089052401515e-06, + "loss": 3.114, + "step": 2250 + }, + { + "epoch": 0.022939046223958332, + "grad_norm": 17.684362411499023, + "learning_rate": 4.994061538555784e-06, + "loss": 3.5418, + "step": 2255 + }, + { + "epoch": 0.022989908854166668, + "grad_norm": 20.040653228759766, + "learning_rate": 4.9940339609000796e-06, + "loss": 4.0738, + "step": 2260 + }, + { + "epoch": 0.023040771484375, + "grad_norm": 8.416106224060059, + "learning_rate": 4.994006319435108e-06, + "loss": 3.7515, + "step": 2265 + }, + { + "epoch": 0.023091634114583332, + "grad_norm": 14.56326675415039, + "learning_rate": 4.9939786141615754e-06, + "loss": 3.4834, + "step": 2270 + }, + { + "epoch": 0.023142496744791668, + "grad_norm": 9.106837272644043, + "learning_rate": 4.993950845080191e-06, + "loss": 3.363, + "step": 2275 + }, + { + "epoch": 0.023193359375, + "grad_norm": 16.284809112548828, + "learning_rate": 4.993923012191666e-06, + "loss": 3.4758, + "step": 2280 + }, + { + "epoch": 0.023244222005208332, + "grad_norm": 12.630521774291992, + "learning_rate": 4.993895115496712e-06, + "loss": 3.5318, + "step": 2285 + }, + { + "epoch": 0.023295084635416668, + "grad_norm": 11.593666076660156, + "learning_rate": 4.993867154996042e-06, + "loss": 3.7887, + "step": 2290 + }, + { + "epoch": 0.023345947265625, + "grad_norm": 14.084927558898926, + "learning_rate": 4.993839130690372e-06, + "loss": 3.5525, + "step": 2295 + }, + { + "epoch": 0.023396809895833332, + "grad_norm": 15.547626495361328, + "learning_rate": 4.993811042580419e-06, + "loss": 3.3359, + "step": 2300 + }, + { + "epoch": 0.023447672526041668, + "grad_norm": 17.31133460998535, + "learning_rate": 4.993782890666902e-06, + "loss": 3.6608, + "step": 2305 + }, + { + "epoch": 0.02349853515625, + "grad_norm": 10.99673080444336, + "learning_rate": 4.99375467495054e-06, + "loss": 3.4517, + "step": 2310 + }, + { + "epoch": 0.023549397786458332, + "grad_norm": 10.245965957641602, + "learning_rate": 4.993726395432056e-06, + "loss": 3.5961, + "step": 2315 + }, + { + "epoch": 0.023600260416666668, + "grad_norm": 11.301896095275879, + "learning_rate": 4.993698052112174e-06, + "loss": 3.2348, + "step": 2320 + }, + { + "epoch": 0.023651123046875, + "grad_norm": 16.186355590820312, + "learning_rate": 4.993669644991617e-06, + "loss": 3.6148, + "step": 2325 + }, + { + "epoch": 0.023701985677083332, + "grad_norm": 15.257999420166016, + "learning_rate": 4.993641174071115e-06, + "loss": 3.7732, + "step": 2330 + }, + { + "epoch": 0.023752848307291668, + "grad_norm": 17.36763572692871, + "learning_rate": 4.993612639351393e-06, + "loss": 3.5761, + "step": 2335 + }, + { + "epoch": 0.0238037109375, + "grad_norm": 12.962672233581543, + "learning_rate": 4.993584040833183e-06, + "loss": 3.6535, + "step": 2340 + }, + { + "epoch": 0.023854573567708332, + "grad_norm": 13.28327751159668, + "learning_rate": 4.993555378517217e-06, + "loss": 3.5254, + "step": 2345 + }, + { + "epoch": 0.023905436197916668, + "grad_norm": 14.348062515258789, + "learning_rate": 4.993526652404227e-06, + "loss": 3.6133, + "step": 2350 + }, + { + "epoch": 0.023956298828125, + "grad_norm": 15.535712242126465, + "learning_rate": 4.993497862494949e-06, + "loss": 3.5402, + "step": 2355 + }, + { + "epoch": 0.024007161458333332, + "grad_norm": 10.898641586303711, + "learning_rate": 4.993469008790119e-06, + "loss": 3.5233, + "step": 2360 + }, + { + "epoch": 0.024058024088541668, + "grad_norm": 9.476696968078613, + "learning_rate": 4.993440091290476e-06, + "loss": 4.3757, + "step": 2365 + }, + { + "epoch": 0.02410888671875, + "grad_norm": 16.282838821411133, + "learning_rate": 4.993411109996759e-06, + "loss": 3.8592, + "step": 2370 + }, + { + "epoch": 0.024159749348958332, + "grad_norm": 15.315332412719727, + "learning_rate": 4.99338206490971e-06, + "loss": 3.626, + "step": 2375 + }, + { + "epoch": 0.024210611979166668, + "grad_norm": 11.935681343078613, + "learning_rate": 4.993352956030071e-06, + "loss": 3.5642, + "step": 2380 + }, + { + "epoch": 0.024261474609375, + "grad_norm": 14.131113052368164, + "learning_rate": 4.993323783358588e-06, + "loss": 3.6816, + "step": 2385 + }, + { + "epoch": 0.024312337239583332, + "grad_norm": 17.588123321533203, + "learning_rate": 4.993294546896007e-06, + "loss": 3.574, + "step": 2390 + }, + { + "epoch": 0.024363199869791668, + "grad_norm": 11.560687065124512, + "learning_rate": 4.993265246643076e-06, + "loss": 3.6406, + "step": 2395 + }, + { + "epoch": 0.0244140625, + "grad_norm": 9.761478424072266, + "learning_rate": 4.993235882600545e-06, + "loss": 3.5061, + "step": 2400 + }, + { + "epoch": 0.024464925130208332, + "grad_norm": 13.649985313415527, + "learning_rate": 4.993206454769165e-06, + "loss": 3.4024, + "step": 2405 + }, + { + "epoch": 0.024515787760416668, + "grad_norm": 8.24244499206543, + "learning_rate": 4.993176963149689e-06, + "loss": 3.6048, + "step": 2410 + }, + { + "epoch": 0.024566650390625, + "grad_norm": 16.1616268157959, + "learning_rate": 4.99314740774287e-06, + "loss": 3.5386, + "step": 2415 + }, + { + "epoch": 0.024617513020833332, + "grad_norm": 9.706306457519531, + "learning_rate": 4.993117788549466e-06, + "loss": 3.5126, + "step": 2420 + }, + { + "epoch": 0.024668375651041668, + "grad_norm": 12.5220365524292, + "learning_rate": 4.993088105570235e-06, + "loss": 4.1297, + "step": 2425 + }, + { + "epoch": 0.02471923828125, + "grad_norm": 12.254471778869629, + "learning_rate": 4.993058358805935e-06, + "loss": 3.4553, + "step": 2430 + }, + { + "epoch": 0.024770100911458332, + "grad_norm": 14.104937553405762, + "learning_rate": 4.993028548257328e-06, + "loss": 4.2029, + "step": 2435 + }, + { + "epoch": 0.024820963541666668, + "grad_norm": 14.170137405395508, + "learning_rate": 4.992998673925177e-06, + "loss": 3.45, + "step": 2440 + }, + { + "epoch": 0.024871826171875, + "grad_norm": 16.678157806396484, + "learning_rate": 4.9929687358102455e-06, + "loss": 3.464, + "step": 2445 + }, + { + "epoch": 0.024922688802083332, + "grad_norm": 11.808859825134277, + "learning_rate": 4.9929387339133e-06, + "loss": 4.1151, + "step": 2450 + }, + { + "epoch": 0.024973551432291668, + "grad_norm": 13.544400215148926, + "learning_rate": 4.992908668235107e-06, + "loss": 3.4802, + "step": 2455 + }, + { + "epoch": 0.0250244140625, + "grad_norm": 10.196426391601562, + "learning_rate": 4.992878538776438e-06, + "loss": 3.6122, + "step": 2460 + }, + { + "epoch": 0.025075276692708332, + "grad_norm": 24.040620803833008, + "learning_rate": 4.992848345538062e-06, + "loss": 3.7206, + "step": 2465 + }, + { + "epoch": 0.025126139322916668, + "grad_norm": 13.72429370880127, + "learning_rate": 4.992818088520751e-06, + "loss": 3.7046, + "step": 2470 + }, + { + "epoch": 0.025177001953125, + "grad_norm": 17.09827995300293, + "learning_rate": 4.992787767725281e-06, + "loss": 3.6007, + "step": 2475 + }, + { + "epoch": 0.025227864583333332, + "grad_norm": 17.559282302856445, + "learning_rate": 4.992757383152427e-06, + "loss": 3.5262, + "step": 2480 + }, + { + "epoch": 0.025278727213541668, + "grad_norm": 12.092161178588867, + "learning_rate": 4.992726934802965e-06, + "loss": 3.4942, + "step": 2485 + }, + { + "epoch": 0.02532958984375, + "grad_norm": 12.694467544555664, + "learning_rate": 4.992696422677677e-06, + "loss": 3.6092, + "step": 2490 + }, + { + "epoch": 0.025380452473958332, + "grad_norm": 19.91484832763672, + "learning_rate": 4.99266584677734e-06, + "loss": 3.6935, + "step": 2495 + }, + { + "epoch": 0.025431315104166668, + "grad_norm": 12.521865844726562, + "learning_rate": 4.99263520710274e-06, + "loss": 3.509, + "step": 2500 + }, + { + "epoch": 0.025482177734375, + "grad_norm": 14.746912002563477, + "learning_rate": 4.9926045036546576e-06, + "loss": 3.4715, + "step": 2505 + }, + { + "epoch": 0.025533040364583332, + "grad_norm": 13.788453102111816, + "learning_rate": 4.99257373643388e-06, + "loss": 3.7318, + "step": 2510 + }, + { + "epoch": 0.025583902994791668, + "grad_norm": 15.496885299682617, + "learning_rate": 4.992542905441194e-06, + "loss": 3.6001, + "step": 2515 + }, + { + "epoch": 0.025634765625, + "grad_norm": 11.678740501403809, + "learning_rate": 4.992512010677389e-06, + "loss": 3.3945, + "step": 2520 + }, + { + "epoch": 0.025685628255208332, + "grad_norm": 11.555937767028809, + "learning_rate": 4.992481052143256e-06, + "loss": 3.5856, + "step": 2525 + }, + { + "epoch": 0.025736490885416668, + "grad_norm": 11.76779556274414, + "learning_rate": 4.992450029839584e-06, + "loss": 3.6207, + "step": 2530 + }, + { + "epoch": 0.025787353515625, + "grad_norm": 11.902135848999023, + "learning_rate": 4.99241894376717e-06, + "loss": 3.6504, + "step": 2535 + }, + { + "epoch": 0.025838216145833332, + "grad_norm": 13.56098461151123, + "learning_rate": 4.992387793926808e-06, + "loss": 3.561, + "step": 2540 + }, + { + "epoch": 0.025889078776041668, + "grad_norm": 9.4614839553833, + "learning_rate": 4.9923565803192945e-06, + "loss": 3.7419, + "step": 2545 + }, + { + "epoch": 0.02593994140625, + "grad_norm": 15.035961151123047, + "learning_rate": 4.9923253029454295e-06, + "loss": 3.4019, + "step": 2550 + }, + { + "epoch": 0.025990804036458332, + "grad_norm": 14.106694221496582, + "learning_rate": 4.992293961806012e-06, + "loss": 3.6503, + "step": 2555 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 13.903535842895508, + "learning_rate": 4.992262556901844e-06, + "loss": 3.5632, + "step": 2560 + }, + { + "epoch": 0.026092529296875, + "grad_norm": 21.27461814880371, + "learning_rate": 4.99223108823373e-06, + "loss": 3.3556, + "step": 2565 + }, + { + "epoch": 0.026143391927083332, + "grad_norm": 19.755544662475586, + "learning_rate": 4.992199555802473e-06, + "loss": 3.9516, + "step": 2570 + }, + { + "epoch": 0.026194254557291668, + "grad_norm": 10.459436416625977, + "learning_rate": 4.992167959608882e-06, + "loss": 4.184, + "step": 2575 + }, + { + "epoch": 0.0262451171875, + "grad_norm": 15.970342636108398, + "learning_rate": 4.992136299653763e-06, + "loss": 3.4005, + "step": 2580 + }, + { + "epoch": 0.026295979817708332, + "grad_norm": 12.037581443786621, + "learning_rate": 4.992104575937929e-06, + "loss": 4.3835, + "step": 2585 + }, + { + "epoch": 0.026346842447916668, + "grad_norm": 9.693264961242676, + "learning_rate": 4.99207278846219e-06, + "loss": 3.7904, + "step": 2590 + }, + { + "epoch": 0.026397705078125, + "grad_norm": 18.028060913085938, + "learning_rate": 4.99204093722736e-06, + "loss": 3.4888, + "step": 2595 + }, + { + "epoch": 0.026448567708333332, + "grad_norm": 14.927176475524902, + "learning_rate": 4.992009022234252e-06, + "loss": 3.7141, + "step": 2600 + }, + { + "epoch": 0.026499430338541668, + "grad_norm": 10.663310050964355, + "learning_rate": 4.991977043483684e-06, + "loss": 3.3668, + "step": 2605 + }, + { + "epoch": 0.02655029296875, + "grad_norm": 21.105947494506836, + "learning_rate": 4.991945000976475e-06, + "loss": 3.7438, + "step": 2610 + }, + { + "epoch": 0.026601155598958332, + "grad_norm": 11.864397048950195, + "learning_rate": 4.991912894713443e-06, + "loss": 3.4406, + "step": 2615 + }, + { + "epoch": 0.026652018229166668, + "grad_norm": 16.927186965942383, + "learning_rate": 4.99188072469541e-06, + "loss": 3.8746, + "step": 2620 + }, + { + "epoch": 0.026702880859375, + "grad_norm": 10.140044212341309, + "learning_rate": 4.9918484909232e-06, + "loss": 3.7149, + "step": 2625 + }, + { + "epoch": 0.026753743489583332, + "grad_norm": 10.26709270477295, + "learning_rate": 4.991816193397637e-06, + "loss": 3.4153, + "step": 2630 + }, + { + "epoch": 0.026804606119791668, + "grad_norm": 18.33658790588379, + "learning_rate": 4.991783832119547e-06, + "loss": 3.6604, + "step": 2635 + }, + { + "epoch": 0.02685546875, + "grad_norm": 15.353517532348633, + "learning_rate": 4.991751407089759e-06, + "loss": 3.4237, + "step": 2640 + }, + { + "epoch": 0.026906331380208332, + "grad_norm": 13.645818710327148, + "learning_rate": 4.991718918309101e-06, + "loss": 3.3133, + "step": 2645 + }, + { + "epoch": 0.026957194010416668, + "grad_norm": 8.63716983795166, + "learning_rate": 4.991686365778405e-06, + "loss": 3.3966, + "step": 2650 + }, + { + "epoch": 0.027008056640625, + "grad_norm": 12.624420166015625, + "learning_rate": 4.991653749498504e-06, + "loss": 3.3418, + "step": 2655 + }, + { + "epoch": 0.027058919270833332, + "grad_norm": 16.91732406616211, + "learning_rate": 4.991621069470233e-06, + "loss": 4.1244, + "step": 2660 + }, + { + "epoch": 0.027109781901041668, + "grad_norm": 15.740944862365723, + "learning_rate": 4.991588325694426e-06, + "loss": 3.4994, + "step": 2665 + }, + { + "epoch": 0.02716064453125, + "grad_norm": 12.525944709777832, + "learning_rate": 4.9915555181719235e-06, + "loss": 3.7722, + "step": 2670 + }, + { + "epoch": 0.027211507161458332, + "grad_norm": 13.604511260986328, + "learning_rate": 4.991522646903564e-06, + "loss": 3.4386, + "step": 2675 + }, + { + "epoch": 0.027262369791666668, + "grad_norm": 10.147235870361328, + "learning_rate": 4.991489711890188e-06, + "loss": 3.5642, + "step": 2680 + }, + { + "epoch": 0.027313232421875, + "grad_norm": 14.985891342163086, + "learning_rate": 4.991456713132637e-06, + "loss": 3.5542, + "step": 2685 + }, + { + "epoch": 0.027364095052083332, + "grad_norm": 9.699203491210938, + "learning_rate": 4.991423650631758e-06, + "loss": 3.4597, + "step": 2690 + }, + { + "epoch": 0.027414957682291668, + "grad_norm": 19.256378173828125, + "learning_rate": 4.991390524388394e-06, + "loss": 3.9099, + "step": 2695 + }, + { + "epoch": 0.0274658203125, + "grad_norm": 13.979656219482422, + "learning_rate": 4.991357334403396e-06, + "loss": 3.4309, + "step": 2700 + }, + { + "epoch": 0.027516682942708332, + "grad_norm": 15.490063667297363, + "learning_rate": 4.9913240806776095e-06, + "loss": 3.6231, + "step": 2705 + }, + { + "epoch": 0.027567545572916668, + "grad_norm": 11.414307594299316, + "learning_rate": 4.991290763211887e-06, + "loss": 3.5332, + "step": 2710 + }, + { + "epoch": 0.027618408203125, + "grad_norm": 14.588093757629395, + "learning_rate": 4.991257382007081e-06, + "loss": 3.8702, + "step": 2715 + }, + { + "epoch": 0.027669270833333332, + "grad_norm": 11.244452476501465, + "learning_rate": 4.9912239370640455e-06, + "loss": 4.1972, + "step": 2720 + }, + { + "epoch": 0.027720133463541668, + "grad_norm": 18.345767974853516, + "learning_rate": 4.991190428383637e-06, + "loss": 3.4974, + "step": 2725 + }, + { + "epoch": 0.02777099609375, + "grad_norm": 16.70025634765625, + "learning_rate": 4.99115685596671e-06, + "loss": 3.4827, + "step": 2730 + }, + { + "epoch": 0.027821858723958332, + "grad_norm": 9.72400951385498, + "learning_rate": 4.9911232198141266e-06, + "loss": 3.7941, + "step": 2735 + }, + { + "epoch": 0.027872721354166668, + "grad_norm": 18.471965789794922, + "learning_rate": 4.991089519926746e-06, + "loss": 3.3165, + "step": 2740 + }, + { + "epoch": 0.027923583984375, + "grad_norm": 8.832108497619629, + "learning_rate": 4.9910557563054295e-06, + "loss": 4.046, + "step": 2745 + }, + { + "epoch": 0.027974446614583332, + "grad_norm": 13.375359535217285, + "learning_rate": 4.991021928951043e-06, + "loss": 3.7558, + "step": 2750 + }, + { + "epoch": 0.028025309244791668, + "grad_norm": 14.425735473632812, + "learning_rate": 4.99098803786445e-06, + "loss": 3.7207, + "step": 2755 + }, + { + "epoch": 0.028076171875, + "grad_norm": 11.840872764587402, + "learning_rate": 4.99095408304652e-06, + "loss": 3.8429, + "step": 2760 + }, + { + "epoch": 0.028127034505208332, + "grad_norm": 11.43602466583252, + "learning_rate": 4.990920064498119e-06, + "loss": 3.3477, + "step": 2765 + }, + { + "epoch": 0.028177897135416668, + "grad_norm": 21.40333366394043, + "learning_rate": 4.9908859822201186e-06, + "loss": 3.3127, + "step": 2770 + }, + { + "epoch": 0.028228759765625, + "grad_norm": 9.050609588623047, + "learning_rate": 4.990851836213391e-06, + "loss": 3.5749, + "step": 2775 + }, + { + "epoch": 0.028279622395833332, + "grad_norm": 18.72174644470215, + "learning_rate": 4.990817626478809e-06, + "loss": 3.4886, + "step": 2780 + }, + { + "epoch": 0.028330485026041668, + "grad_norm": 17.724315643310547, + "learning_rate": 4.990783353017249e-06, + "loss": 3.6107, + "step": 2785 + }, + { + "epoch": 0.02838134765625, + "grad_norm": 17.255958557128906, + "learning_rate": 4.990749015829587e-06, + "loss": 3.4667, + "step": 2790 + }, + { + "epoch": 0.028432210286458332, + "grad_norm": 15.389385223388672, + "learning_rate": 4.9907146149167025e-06, + "loss": 3.6194, + "step": 2795 + }, + { + "epoch": 0.028483072916666668, + "grad_norm": 16.21523666381836, + "learning_rate": 4.990680150279474e-06, + "loss": 3.7462, + "step": 2800 + }, + { + "epoch": 0.028533935546875, + "grad_norm": 10.564112663269043, + "learning_rate": 4.990645621918785e-06, + "loss": 3.5578, + "step": 2805 + }, + { + "epoch": 0.028584798177083332, + "grad_norm": 11.502965927124023, + "learning_rate": 4.990611029835518e-06, + "loss": 3.3145, + "step": 2810 + }, + { + "epoch": 0.028635660807291668, + "grad_norm": 11.729395866394043, + "learning_rate": 4.990576374030558e-06, + "loss": 3.5022, + "step": 2815 + }, + { + "epoch": 0.0286865234375, + "grad_norm": 17.083139419555664, + "learning_rate": 4.9905416545047914e-06, + "loss": 3.1965, + "step": 2820 + }, + { + "epoch": 0.028737386067708332, + "grad_norm": 12.707961082458496, + "learning_rate": 4.990506871259107e-06, + "loss": 3.3708, + "step": 2825 + }, + { + "epoch": 0.028788248697916668, + "grad_norm": 12.2274808883667, + "learning_rate": 4.990472024294395e-06, + "loss": 3.6543, + "step": 2830 + }, + { + "epoch": 0.028839111328125, + "grad_norm": 10.415914535522461, + "learning_rate": 4.990437113611546e-06, + "loss": 3.3149, + "step": 2835 + }, + { + "epoch": 0.028889973958333332, + "grad_norm": 14.803680419921875, + "learning_rate": 4.990402139211454e-06, + "loss": 3.6954, + "step": 2840 + }, + { + "epoch": 0.028940836588541668, + "grad_norm": 9.584573745727539, + "learning_rate": 4.990367101095014e-06, + "loss": 3.2306, + "step": 2845 + }, + { + "epoch": 0.02899169921875, + "grad_norm": 9.19983196258545, + "learning_rate": 4.9903319992631215e-06, + "loss": 3.5329, + "step": 2850 + }, + { + "epoch": 0.029042561848958332, + "grad_norm": 13.915033340454102, + "learning_rate": 4.990296833716676e-06, + "loss": 3.5633, + "step": 2855 + }, + { + "epoch": 0.029093424479166668, + "grad_norm": 8.67843246459961, + "learning_rate": 4.990261604456575e-06, + "loss": 3.5621, + "step": 2860 + }, + { + "epoch": 0.029144287109375, + "grad_norm": 14.383540153503418, + "learning_rate": 4.990226311483721e-06, + "loss": 3.6706, + "step": 2865 + }, + { + "epoch": 0.029195149739583332, + "grad_norm": 15.184710502624512, + "learning_rate": 4.990190954799018e-06, + "loss": 3.5497, + "step": 2870 + }, + { + "epoch": 0.029246012369791668, + "grad_norm": 17.983572006225586, + "learning_rate": 4.990155534403369e-06, + "loss": 3.2129, + "step": 2875 + }, + { + "epoch": 0.029296875, + "grad_norm": 19.197294235229492, + "learning_rate": 4.9901200502976825e-06, + "loss": 3.7564, + "step": 2880 + }, + { + "epoch": 0.029347737630208332, + "grad_norm": 14.85810661315918, + "learning_rate": 4.990084502482863e-06, + "loss": 3.4496, + "step": 2885 + }, + { + "epoch": 0.029398600260416668, + "grad_norm": 12.596855163574219, + "learning_rate": 4.990048890959822e-06, + "loss": 3.7283, + "step": 2890 + }, + { + "epoch": 0.029449462890625, + "grad_norm": 17.732402801513672, + "learning_rate": 4.99001321572947e-06, + "loss": 3.8592, + "step": 2895 + }, + { + "epoch": 0.029500325520833332, + "grad_norm": 12.990941047668457, + "learning_rate": 4.989977476792721e-06, + "loss": 3.5972, + "step": 2900 + }, + { + "epoch": 0.029551188151041668, + "grad_norm": 15.059063911437988, + "learning_rate": 4.989941674150488e-06, + "loss": 3.6941, + "step": 2905 + }, + { + "epoch": 0.02960205078125, + "grad_norm": 13.130831718444824, + "learning_rate": 4.989905807803688e-06, + "loss": 3.5952, + "step": 2910 + }, + { + "epoch": 0.029652913411458332, + "grad_norm": 15.897665977478027, + "learning_rate": 4.989869877753237e-06, + "loss": 3.5691, + "step": 2915 + }, + { + "epoch": 0.029703776041666668, + "grad_norm": 11.824736595153809, + "learning_rate": 4.989833884000056e-06, + "loss": 3.7098, + "step": 2920 + }, + { + "epoch": 0.029754638671875, + "grad_norm": 14.641777038574219, + "learning_rate": 4.989797826545065e-06, + "loss": 3.6694, + "step": 2925 + }, + { + "epoch": 0.029805501302083332, + "grad_norm": 12.433199882507324, + "learning_rate": 4.989761705389187e-06, + "loss": 3.5255, + "step": 2930 + }, + { + "epoch": 0.029856363932291668, + "grad_norm": 12.532072067260742, + "learning_rate": 4.989725520533346e-06, + "loss": 3.5487, + "step": 2935 + }, + { + "epoch": 0.0299072265625, + "grad_norm": 16.237571716308594, + "learning_rate": 4.9896892719784675e-06, + "loss": 3.562, + "step": 2940 + }, + { + "epoch": 0.029958089192708332, + "grad_norm": 15.376089096069336, + "learning_rate": 4.989652959725479e-06, + "loss": 3.4396, + "step": 2945 + }, + { + "epoch": 0.030008951822916668, + "grad_norm": 13.294553756713867, + "learning_rate": 4.98961658377531e-06, + "loss": 3.8229, + "step": 2950 + }, + { + "epoch": 0.030059814453125, + "grad_norm": 15.016079902648926, + "learning_rate": 4.98958014412889e-06, + "loss": 3.4993, + "step": 2955 + }, + { + "epoch": 0.030110677083333332, + "grad_norm": 16.527847290039062, + "learning_rate": 4.989543640787153e-06, + "loss": 3.6308, + "step": 2960 + }, + { + "epoch": 0.030161539713541668, + "grad_norm": 15.960955619812012, + "learning_rate": 4.989507073751032e-06, + "loss": 3.5736, + "step": 2965 + }, + { + "epoch": 0.03021240234375, + "grad_norm": 19.20722198486328, + "learning_rate": 4.989470443021462e-06, + "loss": 3.3009, + "step": 2970 + }, + { + "epoch": 0.030263264973958332, + "grad_norm": 11.201433181762695, + "learning_rate": 4.989433748599381e-06, + "loss": 3.6235, + "step": 2975 + }, + { + "epoch": 0.030314127604166668, + "grad_norm": 13.62901782989502, + "learning_rate": 4.989396990485727e-06, + "loss": 4.1232, + "step": 2980 + }, + { + "epoch": 0.030364990234375, + "grad_norm": 15.175191879272461, + "learning_rate": 4.989360168681442e-06, + "loss": 3.6938, + "step": 2985 + }, + { + "epoch": 0.030415852864583332, + "grad_norm": 12.873221397399902, + "learning_rate": 4.9893232831874676e-06, + "loss": 3.7898, + "step": 2990 + }, + { + "epoch": 0.030466715494791668, + "grad_norm": 10.560770034790039, + "learning_rate": 4.989286334004746e-06, + "loss": 3.1075, + "step": 2995 + }, + { + "epoch": 0.030517578125, + "grad_norm": 16.364398956298828, + "learning_rate": 4.9892493211342235e-06, + "loss": 3.8729, + "step": 3000 + }, + { + "epoch": 0.030568440755208332, + "grad_norm": 14.276408195495605, + "learning_rate": 4.989212244576848e-06, + "loss": 3.4278, + "step": 3005 + }, + { + "epoch": 0.030619303385416668, + "grad_norm": 17.391857147216797, + "learning_rate": 4.989175104333567e-06, + "loss": 3.5136, + "step": 3010 + }, + { + "epoch": 0.030670166015625, + "grad_norm": 16.542964935302734, + "learning_rate": 4.98913790040533e-06, + "loss": 3.7648, + "step": 3015 + }, + { + "epoch": 0.030721028645833332, + "grad_norm": 12.984308242797852, + "learning_rate": 4.9891006327930905e-06, + "loss": 3.2495, + "step": 3020 + }, + { + "epoch": 0.030771891276041668, + "grad_norm": 16.2092227935791, + "learning_rate": 4.989063301497801e-06, + "loss": 3.2186, + "step": 3025 + }, + { + "epoch": 0.03082275390625, + "grad_norm": 9.466002464294434, + "learning_rate": 4.989025906520417e-06, + "loss": 3.5156, + "step": 3030 + }, + { + "epoch": 0.030873616536458332, + "grad_norm": 11.886150360107422, + "learning_rate": 4.988988447861895e-06, + "loss": 3.5858, + "step": 3035 + }, + { + "epoch": 0.030924479166666668, + "grad_norm": 10.326323509216309, + "learning_rate": 4.988950925523194e-06, + "loss": 3.3724, + "step": 3040 + }, + { + "epoch": 0.030975341796875, + "grad_norm": 15.874045372009277, + "learning_rate": 4.988913339505274e-06, + "loss": 3.5452, + "step": 3045 + }, + { + "epoch": 0.031026204427083332, + "grad_norm": 11.754904747009277, + "learning_rate": 4.988875689809095e-06, + "loss": 3.4911, + "step": 3050 + }, + { + "epoch": 0.031077067057291668, + "grad_norm": 17.041898727416992, + "learning_rate": 4.988837976435622e-06, + "loss": 3.6784, + "step": 3055 + }, + { + "epoch": 0.0311279296875, + "grad_norm": 11.09570026397705, + "learning_rate": 4.988800199385819e-06, + "loss": 3.366, + "step": 3060 + }, + { + "epoch": 0.031178792317708332, + "grad_norm": 14.231475830078125, + "learning_rate": 4.988762358660654e-06, + "loss": 3.3146, + "step": 3065 + }, + { + "epoch": 0.031229654947916668, + "grad_norm": 11.312032699584961, + "learning_rate": 4.988724454261092e-06, + "loss": 3.6764, + "step": 3070 + }, + { + "epoch": 0.031280517578125, + "grad_norm": 15.22202205657959, + "learning_rate": 4.988686486188105e-06, + "loss": 3.4084, + "step": 3075 + }, + { + "epoch": 0.031331380208333336, + "grad_norm": 14.88944149017334, + "learning_rate": 4.988648454442666e-06, + "loss": 3.1168, + "step": 3080 + }, + { + "epoch": 0.031382242838541664, + "grad_norm": 10.730203628540039, + "learning_rate": 4.988610359025745e-06, + "loss": 3.3345, + "step": 3085 + }, + { + "epoch": 0.03143310546875, + "grad_norm": 13.363783836364746, + "learning_rate": 4.988572199938317e-06, + "loss": 3.0878, + "step": 3090 + }, + { + "epoch": 0.031483968098958336, + "grad_norm": 21.714414596557617, + "learning_rate": 4.9885339771813604e-06, + "loss": 3.6447, + "step": 3095 + }, + { + "epoch": 0.031534830729166664, + "grad_norm": 18.616336822509766, + "learning_rate": 4.9884956907558515e-06, + "loss": 3.5968, + "step": 3100 + }, + { + "epoch": 0.031585693359375, + "grad_norm": 12.650989532470703, + "learning_rate": 4.98845734066277e-06, + "loss": 3.4597, + "step": 3105 + }, + { + "epoch": 0.031636555989583336, + "grad_norm": 14.19465446472168, + "learning_rate": 4.988418926903098e-06, + "loss": 3.399, + "step": 3110 + }, + { + "epoch": 0.031687418619791664, + "grad_norm": 9.14580249786377, + "learning_rate": 4.9883804494778165e-06, + "loss": 3.6816, + "step": 3115 + }, + { + "epoch": 0.03173828125, + "grad_norm": 10.480582237243652, + "learning_rate": 4.988341908387912e-06, + "loss": 3.128, + "step": 3120 + }, + { + "epoch": 0.031789143880208336, + "grad_norm": 10.952704429626465, + "learning_rate": 4.988303303634368e-06, + "loss": 3.4517, + "step": 3125 + }, + { + "epoch": 0.031840006510416664, + "grad_norm": 12.522828102111816, + "learning_rate": 4.988264635218175e-06, + "loss": 3.9186, + "step": 3130 + }, + { + "epoch": 0.031890869140625, + "grad_norm": 12.782024383544922, + "learning_rate": 4.988225903140321e-06, + "loss": 3.725, + "step": 3135 + }, + { + "epoch": 0.031941731770833336, + "grad_norm": 11.72850227355957, + "learning_rate": 4.988187107401797e-06, + "loss": 3.7264, + "step": 3140 + }, + { + "epoch": 0.031992594401041664, + "grad_norm": 12.133509635925293, + "learning_rate": 4.988148248003595e-06, + "loss": 3.4472, + "step": 3145 + }, + { + "epoch": 0.03204345703125, + "grad_norm": 12.93433666229248, + "learning_rate": 4.98810932494671e-06, + "loss": 3.9443, + "step": 3150 + }, + { + "epoch": 0.032094319661458336, + "grad_norm": 16.135568618774414, + "learning_rate": 4.988070338232138e-06, + "loss": 3.4966, + "step": 3155 + }, + { + "epoch": 0.032145182291666664, + "grad_norm": 18.191791534423828, + "learning_rate": 4.988031287860877e-06, + "loss": 3.5644, + "step": 3160 + }, + { + "epoch": 0.032196044921875, + "grad_norm": 11.924603462219238, + "learning_rate": 4.987992173833924e-06, + "loss": 3.4058, + "step": 3165 + }, + { + "epoch": 0.032246907552083336, + "grad_norm": 12.82744026184082, + "learning_rate": 4.987952996152281e-06, + "loss": 3.492, + "step": 3170 + }, + { + "epoch": 0.032297770182291664, + "grad_norm": 8.64533519744873, + "learning_rate": 4.987913754816951e-06, + "loss": 3.9099, + "step": 3175 + }, + { + "epoch": 0.0323486328125, + "grad_norm": 16.365373611450195, + "learning_rate": 4.987874449828937e-06, + "loss": 3.1102, + "step": 3180 + }, + { + "epoch": 0.032399495442708336, + "grad_norm": 15.942273139953613, + "learning_rate": 4.987835081189245e-06, + "loss": 3.7335, + "step": 3185 + }, + { + "epoch": 0.032450358072916664, + "grad_norm": 17.56052589416504, + "learning_rate": 4.987795648898882e-06, + "loss": 3.8561, + "step": 3190 + }, + { + "epoch": 0.032501220703125, + "grad_norm": 9.97986125946045, + "learning_rate": 4.987756152958857e-06, + "loss": 3.3722, + "step": 3195 + }, + { + "epoch": 0.032552083333333336, + "grad_norm": 13.236599922180176, + "learning_rate": 4.98771659337018e-06, + "loss": 3.5341, + "step": 3200 + }, + { + "epoch": 0.032602945963541664, + "grad_norm": 15.567898750305176, + "learning_rate": 4.987676970133864e-06, + "loss": 3.6612, + "step": 3205 + }, + { + "epoch": 0.03265380859375, + "grad_norm": 14.741514205932617, + "learning_rate": 4.987637283250923e-06, + "loss": 3.7577, + "step": 3210 + }, + { + "epoch": 0.032704671223958336, + "grad_norm": 12.727431297302246, + "learning_rate": 4.98759753272237e-06, + "loss": 4.3001, + "step": 3215 + }, + { + "epoch": 0.032755533854166664, + "grad_norm": 11.071520805358887, + "learning_rate": 4.987557718549225e-06, + "loss": 3.61, + "step": 3220 + }, + { + "epoch": 0.032806396484375, + "grad_norm": 10.508591651916504, + "learning_rate": 4.987517840732505e-06, + "loss": 3.3211, + "step": 3225 + }, + { + "epoch": 0.032857259114583336, + "grad_norm": 12.959603309631348, + "learning_rate": 4.987477899273232e-06, + "loss": 3.5145, + "step": 3230 + }, + { + "epoch": 0.032908121744791664, + "grad_norm": 14.553711891174316, + "learning_rate": 4.987437894172426e-06, + "loss": 3.9318, + "step": 3235 + }, + { + "epoch": 0.032958984375, + "grad_norm": 14.899251937866211, + "learning_rate": 4.987397825431109e-06, + "loss": 3.5583, + "step": 3240 + }, + { + "epoch": 0.033009847005208336, + "grad_norm": 11.648773193359375, + "learning_rate": 4.98735769305031e-06, + "loss": 3.2714, + "step": 3245 + }, + { + "epoch": 0.033060709635416664, + "grad_norm": 13.525915145874023, + "learning_rate": 4.987317497031055e-06, + "loss": 3.3922, + "step": 3250 + }, + { + "epoch": 0.033111572265625, + "grad_norm": 16.7314453125, + "learning_rate": 4.987277237374369e-06, + "loss": 3.6725, + "step": 3255 + }, + { + "epoch": 0.033162434895833336, + "grad_norm": 16.31707191467285, + "learning_rate": 4.987236914081286e-06, + "loss": 3.3669, + "step": 3260 + }, + { + "epoch": 0.033213297526041664, + "grad_norm": 11.438323974609375, + "learning_rate": 4.987196527152835e-06, + "loss": 3.4222, + "step": 3265 + }, + { + "epoch": 0.03326416015625, + "grad_norm": 14.705175399780273, + "learning_rate": 4.987156076590051e-06, + "loss": 3.6076, + "step": 3270 + }, + { + "epoch": 0.033315022786458336, + "grad_norm": 14.206781387329102, + "learning_rate": 4.987115562393969e-06, + "loss": 3.4704, + "step": 3275 + }, + { + "epoch": 0.033365885416666664, + "grad_norm": 10.712113380432129, + "learning_rate": 4.987074984565624e-06, + "loss": 3.575, + "step": 3280 + }, + { + "epoch": 0.033416748046875, + "grad_norm": 12.62575626373291, + "learning_rate": 4.987034343106055e-06, + "loss": 3.2375, + "step": 3285 + }, + { + "epoch": 0.033467610677083336, + "grad_norm": 13.155769348144531, + "learning_rate": 4.986993638016302e-06, + "loss": 3.0587, + "step": 3290 + }, + { + "epoch": 0.033518473307291664, + "grad_norm": 15.919364929199219, + "learning_rate": 4.986952869297407e-06, + "loss": 3.569, + "step": 3295 + }, + { + "epoch": 0.0335693359375, + "grad_norm": 16.994239807128906, + "learning_rate": 4.986912036950411e-06, + "loss": 3.7481, + "step": 3300 + }, + { + "epoch": 0.033620198567708336, + "grad_norm": 13.510692596435547, + "learning_rate": 4.986871140976361e-06, + "loss": 3.5751, + "step": 3305 + }, + { + "epoch": 0.033671061197916664, + "grad_norm": 14.510315895080566, + "learning_rate": 4.986830181376302e-06, + "loss": 3.2262, + "step": 3310 + }, + { + "epoch": 0.033721923828125, + "grad_norm": 13.225347518920898, + "learning_rate": 4.986789158151282e-06, + "loss": 3.3544, + "step": 3315 + }, + { + "epoch": 0.033772786458333336, + "grad_norm": 15.174870491027832, + "learning_rate": 4.9867480713023506e-06, + "loss": 3.39, + "step": 3320 + }, + { + "epoch": 0.033823649088541664, + "grad_norm": 13.420331954956055, + "learning_rate": 4.98670692083056e-06, + "loss": 3.4115, + "step": 3325 + }, + { + "epoch": 0.03387451171875, + "grad_norm": 14.904714584350586, + "learning_rate": 4.986665706736962e-06, + "loss": 3.5199, + "step": 3330 + }, + { + "epoch": 0.033925374348958336, + "grad_norm": 9.973340034484863, + "learning_rate": 4.986624429022611e-06, + "loss": 3.5716, + "step": 3335 + }, + { + "epoch": 0.033976236979166664, + "grad_norm": 12.208763122558594, + "learning_rate": 4.986583087688563e-06, + "loss": 3.6914, + "step": 3340 + }, + { + "epoch": 0.034027099609375, + "grad_norm": 13.434621810913086, + "learning_rate": 4.986541682735877e-06, + "loss": 3.2363, + "step": 3345 + }, + { + "epoch": 0.034077962239583336, + "grad_norm": 17.55096435546875, + "learning_rate": 4.986500214165611e-06, + "loss": 3.4762, + "step": 3350 + }, + { + "epoch": 0.034128824869791664, + "grad_norm": 11.070765495300293, + "learning_rate": 4.986458681978826e-06, + "loss": 3.6086, + "step": 3355 + }, + { + "epoch": 0.0341796875, + "grad_norm": 12.782002449035645, + "learning_rate": 4.986417086176586e-06, + "loss": 3.3807, + "step": 3360 + }, + { + "epoch": 0.034230550130208336, + "grad_norm": 11.93525218963623, + "learning_rate": 4.9863754267599535e-06, + "loss": 3.6002, + "step": 3365 + }, + { + "epoch": 0.034281412760416664, + "grad_norm": 12.5736083984375, + "learning_rate": 4.986333703729995e-06, + "loss": 3.6075, + "step": 3370 + }, + { + "epoch": 0.034332275390625, + "grad_norm": 14.359274864196777, + "learning_rate": 4.986291917087778e-06, + "loss": 3.461, + "step": 3375 + }, + { + "epoch": 0.034383138020833336, + "grad_norm": 12.868420600891113, + "learning_rate": 4.9862500668343714e-06, + "loss": 3.9434, + "step": 3380 + }, + { + "epoch": 0.034434000651041664, + "grad_norm": 18.13227653503418, + "learning_rate": 4.986208152970847e-06, + "loss": 3.2659, + "step": 3385 + }, + { + "epoch": 0.03448486328125, + "grad_norm": 8.368807792663574, + "learning_rate": 4.986166175498276e-06, + "loss": 3.6618, + "step": 3390 + }, + { + "epoch": 0.034535725911458336, + "grad_norm": 11.16234302520752, + "learning_rate": 4.986124134417732e-06, + "loss": 3.3186, + "step": 3395 + }, + { + "epoch": 0.034586588541666664, + "grad_norm": 14.434364318847656, + "learning_rate": 4.986082029730292e-06, + "loss": 3.8224, + "step": 3400 + }, + { + "epoch": 0.034637451171875, + "grad_norm": 11.538227081298828, + "learning_rate": 4.9860398614370324e-06, + "loss": 3.3252, + "step": 3405 + }, + { + "epoch": 0.034688313802083336, + "grad_norm": 15.10848617553711, + "learning_rate": 4.985997629539032e-06, + "loss": 3.3582, + "step": 3410 + }, + { + "epoch": 0.034739176432291664, + "grad_norm": 12.740103721618652, + "learning_rate": 4.985955334037372e-06, + "loss": 3.672, + "step": 3415 + }, + { + "epoch": 0.0347900390625, + "grad_norm": 15.706042289733887, + "learning_rate": 4.985912974933134e-06, + "loss": 3.379, + "step": 3420 + }, + { + "epoch": 0.034840901692708336, + "grad_norm": 21.957704544067383, + "learning_rate": 4.985870552227401e-06, + "loss": 3.8408, + "step": 3425 + }, + { + "epoch": 0.034891764322916664, + "grad_norm": 8.741256713867188, + "learning_rate": 4.9858280659212595e-06, + "loss": 3.4575, + "step": 3430 + }, + { + "epoch": 0.034942626953125, + "grad_norm": 16.37959098815918, + "learning_rate": 4.9857855160157965e-06, + "loss": 3.7038, + "step": 3435 + }, + { + "epoch": 0.034993489583333336, + "grad_norm": 15.630584716796875, + "learning_rate": 4.9857429025120996e-06, + "loss": 3.3838, + "step": 3440 + }, + { + "epoch": 0.035044352213541664, + "grad_norm": 13.591779708862305, + "learning_rate": 4.98570022541126e-06, + "loss": 3.4661, + "step": 3445 + }, + { + "epoch": 0.03509521484375, + "grad_norm": 14.534747123718262, + "learning_rate": 4.985657484714369e-06, + "loss": 3.487, + "step": 3450 + }, + { + "epoch": 0.035146077473958336, + "grad_norm": 10.500582695007324, + "learning_rate": 4.985614680422521e-06, + "loss": 3.6731, + "step": 3455 + }, + { + "epoch": 0.035196940104166664, + "grad_norm": 10.115405082702637, + "learning_rate": 4.9855718125368105e-06, + "loss": 3.4601, + "step": 3460 + }, + { + "epoch": 0.035247802734375, + "grad_norm": 17.56122398376465, + "learning_rate": 4.985528881058334e-06, + "loss": 3.6548, + "step": 3465 + }, + { + "epoch": 0.035298665364583336, + "grad_norm": 9.477646827697754, + "learning_rate": 4.9854858859881905e-06, + "loss": 3.3522, + "step": 3470 + }, + { + "epoch": 0.035349527994791664, + "grad_norm": 12.787845611572266, + "learning_rate": 4.985442827327479e-06, + "loss": 3.3005, + "step": 3475 + }, + { + "epoch": 0.035400390625, + "grad_norm": 9.624972343444824, + "learning_rate": 4.985399705077303e-06, + "loss": 3.6494, + "step": 3480 + }, + { + "epoch": 0.035451253255208336, + "grad_norm": 16.32085609436035, + "learning_rate": 4.985356519238764e-06, + "loss": 3.6078, + "step": 3485 + }, + { + "epoch": 0.035502115885416664, + "grad_norm": 14.953262329101562, + "learning_rate": 4.985313269812968e-06, + "loss": 3.256, + "step": 3490 + }, + { + "epoch": 0.035552978515625, + "grad_norm": 19.811403274536133, + "learning_rate": 4.985269956801021e-06, + "loss": 3.6117, + "step": 3495 + }, + { + "epoch": 0.035603841145833336, + "grad_norm": 15.88729190826416, + "learning_rate": 4.985226580204031e-06, + "loss": 3.5138, + "step": 3500 + }, + { + "epoch": 0.035654703776041664, + "grad_norm": 16.489349365234375, + "learning_rate": 4.9851831400231075e-06, + "loss": 3.5093, + "step": 3505 + }, + { + "epoch": 0.03570556640625, + "grad_norm": 11.760025024414062, + "learning_rate": 4.985139636259363e-06, + "loss": 4.247, + "step": 3510 + }, + { + "epoch": 0.035756429036458336, + "grad_norm": 12.459917068481445, + "learning_rate": 4.98509606891391e-06, + "loss": 3.4997, + "step": 3515 + }, + { + "epoch": 0.035807291666666664, + "grad_norm": 10.306233406066895, + "learning_rate": 4.985052437987863e-06, + "loss": 3.4573, + "step": 3520 + }, + { + "epoch": 0.035858154296875, + "grad_norm": 16.409082412719727, + "learning_rate": 4.9850087434823384e-06, + "loss": 3.4351, + "step": 3525 + }, + { + "epoch": 0.035909016927083336, + "grad_norm": 9.501917839050293, + "learning_rate": 4.984964985398454e-06, + "loss": 3.4707, + "step": 3530 + }, + { + "epoch": 0.035959879557291664, + "grad_norm": 13.083230972290039, + "learning_rate": 4.98492116373733e-06, + "loss": 3.3439, + "step": 3535 + }, + { + "epoch": 0.0360107421875, + "grad_norm": 11.03459358215332, + "learning_rate": 4.984877278500087e-06, + "loss": 3.3296, + "step": 3540 + }, + { + "epoch": 0.036061604817708336, + "grad_norm": 15.221756935119629, + "learning_rate": 4.984833329687847e-06, + "loss": 3.5049, + "step": 3545 + }, + { + "epoch": 0.036112467447916664, + "grad_norm": 12.892142295837402, + "learning_rate": 4.9847893173017345e-06, + "loss": 3.6549, + "step": 3550 + }, + { + "epoch": 0.036163330078125, + "grad_norm": 13.091279029846191, + "learning_rate": 4.984745241342877e-06, + "loss": 3.6493, + "step": 3555 + }, + { + "epoch": 0.036214192708333336, + "grad_norm": 14.84507942199707, + "learning_rate": 4.984701101812402e-06, + "loss": 3.5608, + "step": 3560 + }, + { + "epoch": 0.036265055338541664, + "grad_norm": 12.539996147155762, + "learning_rate": 4.984656898711438e-06, + "loss": 3.5464, + "step": 3565 + }, + { + "epoch": 0.03631591796875, + "grad_norm": 12.54378604888916, + "learning_rate": 4.984612632041117e-06, + "loss": 3.7036, + "step": 3570 + }, + { + "epoch": 0.036366780598958336, + "grad_norm": 15.650618553161621, + "learning_rate": 4.98456830180257e-06, + "loss": 3.7464, + "step": 3575 + }, + { + "epoch": 0.036417643229166664, + "grad_norm": 12.167283058166504, + "learning_rate": 4.984523907996932e-06, + "loss": 3.6591, + "step": 3580 + }, + { + "epoch": 0.036468505859375, + "grad_norm": 13.527252197265625, + "learning_rate": 4.984479450625338e-06, + "loss": 3.5705, + "step": 3585 + }, + { + "epoch": 0.036519368489583336, + "grad_norm": 16.410930633544922, + "learning_rate": 4.9844349296889275e-06, + "loss": 3.7471, + "step": 3590 + }, + { + "epoch": 0.036570231119791664, + "grad_norm": 10.030250549316406, + "learning_rate": 4.984390345188838e-06, + "loss": 3.841, + "step": 3595 + }, + { + "epoch": 0.03662109375, + "grad_norm": 11.255728721618652, + "learning_rate": 4.9843456971262095e-06, + "loss": 3.5435, + "step": 3600 + }, + { + "epoch": 0.036671956380208336, + "grad_norm": 9.756819725036621, + "learning_rate": 4.984300985502185e-06, + "loss": 3.3497, + "step": 3605 + }, + { + "epoch": 0.036722819010416664, + "grad_norm": 12.754555702209473, + "learning_rate": 4.984256210317909e-06, + "loss": 3.723, + "step": 3610 + }, + { + "epoch": 0.036773681640625, + "grad_norm": 15.393765449523926, + "learning_rate": 4.984211371574527e-06, + "loss": 3.99, + "step": 3615 + }, + { + "epoch": 0.036824544270833336, + "grad_norm": 16.75415802001953, + "learning_rate": 4.984166469273186e-06, + "loss": 3.1881, + "step": 3620 + }, + { + "epoch": 0.036875406901041664, + "grad_norm": 15.176877975463867, + "learning_rate": 4.984121503415034e-06, + "loss": 3.6833, + "step": 3625 + }, + { + "epoch": 0.03692626953125, + "grad_norm": 95.34676361083984, + "learning_rate": 4.9840764740012225e-06, + "loss": 3.5113, + "step": 3630 + }, + { + "epoch": 0.036977132161458336, + "grad_norm": 12.578669548034668, + "learning_rate": 4.984031381032903e-06, + "loss": 4.0164, + "step": 3635 + }, + { + "epoch": 0.037027994791666664, + "grad_norm": 12.883451461791992, + "learning_rate": 4.98398622451123e-06, + "loss": 3.2015, + "step": 3640 + }, + { + "epoch": 0.037078857421875, + "grad_norm": 14.403608322143555, + "learning_rate": 4.983941004437358e-06, + "loss": 3.2212, + "step": 3645 + }, + { + "epoch": 0.037129720052083336, + "grad_norm": 14.960567474365234, + "learning_rate": 4.983895720812444e-06, + "loss": 3.2845, + "step": 3650 + }, + { + "epoch": 0.037180582682291664, + "grad_norm": 9.842466354370117, + "learning_rate": 4.9838503736376465e-06, + "loss": 3.4744, + "step": 3655 + }, + { + "epoch": 0.0372314453125, + "grad_norm": 9.656144142150879, + "learning_rate": 4.983804962914126e-06, + "loss": 3.6732, + "step": 3660 + }, + { + "epoch": 0.037282307942708336, + "grad_norm": 15.675665855407715, + "learning_rate": 4.983759488643045e-06, + "loss": 3.4493, + "step": 3665 + }, + { + "epoch": 0.037333170572916664, + "grad_norm": 11.249836921691895, + "learning_rate": 4.983713950825565e-06, + "loss": 3.6472, + "step": 3670 + }, + { + "epoch": 0.037384033203125, + "grad_norm": 16.177907943725586, + "learning_rate": 4.983668349462853e-06, + "loss": 3.3561, + "step": 3675 + }, + { + "epoch": 0.037434895833333336, + "grad_norm": 14.318443298339844, + "learning_rate": 4.983622684556075e-06, + "loss": 3.4628, + "step": 3680 + }, + { + "epoch": 0.037485758463541664, + "grad_norm": 14.132331848144531, + "learning_rate": 4.9835769561064e-06, + "loss": 3.4866, + "step": 3685 + }, + { + "epoch": 0.03753662109375, + "grad_norm": 12.774168014526367, + "learning_rate": 4.9835311641149955e-06, + "loss": 3.2058, + "step": 3690 + }, + { + "epoch": 0.037587483723958336, + "grad_norm": 10.149826049804688, + "learning_rate": 4.983485308583036e-06, + "loss": 3.2685, + "step": 3695 + }, + { + "epoch": 0.037638346354166664, + "grad_norm": 12.236494064331055, + "learning_rate": 4.983439389511693e-06, + "loss": 3.3043, + "step": 3700 + }, + { + "epoch": 0.037689208984375, + "grad_norm": 13.76052188873291, + "learning_rate": 4.983393406902142e-06, + "loss": 3.6717, + "step": 3705 + }, + { + "epoch": 0.037740071614583336, + "grad_norm": 16.32668113708496, + "learning_rate": 4.983347360755559e-06, + "loss": 3.6747, + "step": 3710 + }, + { + "epoch": 0.037790934244791664, + "grad_norm": 14.244294166564941, + "learning_rate": 4.983301251073124e-06, + "loss": 3.8085, + "step": 3715 + }, + { + "epoch": 0.037841796875, + "grad_norm": 15.147490501403809, + "learning_rate": 4.983255077856014e-06, + "loss": 3.2321, + "step": 3720 + }, + { + "epoch": 0.037892659505208336, + "grad_norm": 12.199444770812988, + "learning_rate": 4.983208841105411e-06, + "loss": 3.4871, + "step": 3725 + }, + { + "epoch": 0.037943522135416664, + "grad_norm": 18.730581283569336, + "learning_rate": 4.983162540822498e-06, + "loss": 4.198, + "step": 3730 + }, + { + "epoch": 0.037994384765625, + "grad_norm": 14.874238967895508, + "learning_rate": 4.983116177008461e-06, + "loss": 3.5318, + "step": 3735 + }, + { + "epoch": 0.038045247395833336, + "grad_norm": 10.15190601348877, + "learning_rate": 4.9830697496644855e-06, + "loss": 3.3821, + "step": 3740 + }, + { + "epoch": 0.038096110026041664, + "grad_norm": 17.800188064575195, + "learning_rate": 4.983023258791758e-06, + "loss": 3.5971, + "step": 3745 + }, + { + "epoch": 0.03814697265625, + "grad_norm": 45.00802993774414, + "learning_rate": 4.98297670439147e-06, + "loss": 3.0957, + "step": 3750 + }, + { + "epoch": 0.038197835286458336, + "grad_norm": 11.17233657836914, + "learning_rate": 4.9829300864648104e-06, + "loss": 3.6894, + "step": 3755 + }, + { + "epoch": 0.038248697916666664, + "grad_norm": 16.200355529785156, + "learning_rate": 4.982883405012974e-06, + "loss": 3.2511, + "step": 3760 + }, + { + "epoch": 0.038299560546875, + "grad_norm": 11.851147651672363, + "learning_rate": 4.982836660037154e-06, + "loss": 3.31, + "step": 3765 + }, + { + "epoch": 0.038350423177083336, + "grad_norm": 8.351678848266602, + "learning_rate": 4.982789851538545e-06, + "loss": 3.1318, + "step": 3770 + }, + { + "epoch": 0.038401285807291664, + "grad_norm": 12.307997703552246, + "learning_rate": 4.982742979518348e-06, + "loss": 3.3283, + "step": 3775 + }, + { + "epoch": 0.0384521484375, + "grad_norm": 11.954642295837402, + "learning_rate": 4.98269604397776e-06, + "loss": 3.7551, + "step": 3780 + }, + { + "epoch": 0.038503011067708336, + "grad_norm": 11.246746063232422, + "learning_rate": 4.982649044917982e-06, + "loss": 3.5243, + "step": 3785 + }, + { + "epoch": 0.038553873697916664, + "grad_norm": 19.319927215576172, + "learning_rate": 4.982601982340216e-06, + "loss": 3.546, + "step": 3790 + }, + { + "epoch": 0.038604736328125, + "grad_norm": 15.717957496643066, + "learning_rate": 4.982554856245668e-06, + "loss": 3.9242, + "step": 3795 + }, + { + "epoch": 0.038655598958333336, + "grad_norm": 10.940032005310059, + "learning_rate": 4.982507666635541e-06, + "loss": 3.5011, + "step": 3800 + }, + { + "epoch": 0.038706461588541664, + "grad_norm": 17.505874633789062, + "learning_rate": 4.982460413511045e-06, + "loss": 3.4926, + "step": 3805 + }, + { + "epoch": 0.03875732421875, + "grad_norm": 13.361002922058105, + "learning_rate": 4.9824130968733875e-06, + "loss": 3.431, + "step": 3810 + }, + { + "epoch": 0.038808186848958336, + "grad_norm": 11.022466659545898, + "learning_rate": 4.982365716723779e-06, + "loss": 3.4269, + "step": 3815 + }, + { + "epoch": 0.038859049479166664, + "grad_norm": 13.40270709991455, + "learning_rate": 4.982318273063432e-06, + "loss": 3.4141, + "step": 3820 + }, + { + "epoch": 0.038909912109375, + "grad_norm": 12.13808536529541, + "learning_rate": 4.98227076589356e-06, + "loss": 3.4167, + "step": 3825 + }, + { + "epoch": 0.038960774739583336, + "grad_norm": 13.911450386047363, + "learning_rate": 4.98222319521538e-06, + "loss": 3.5935, + "step": 3830 + }, + { + "epoch": 0.039011637369791664, + "grad_norm": 9.00546646118164, + "learning_rate": 4.982175561030107e-06, + "loss": 3.4011, + "step": 3835 + }, + { + "epoch": 0.0390625, + "grad_norm": 15.89461612701416, + "learning_rate": 4.982127863338961e-06, + "loss": 3.3942, + "step": 3840 + }, + { + "epoch": 0.039113362630208336, + "grad_norm": 15.606123924255371, + "learning_rate": 4.982080102143161e-06, + "loss": 3.4825, + "step": 3845 + }, + { + "epoch": 0.039164225260416664, + "grad_norm": 16.339702606201172, + "learning_rate": 4.982032277443931e-06, + "loss": 3.5663, + "step": 3850 + }, + { + "epoch": 0.039215087890625, + "grad_norm": 9.590949058532715, + "learning_rate": 4.981984389242493e-06, + "loss": 3.5181, + "step": 3855 + }, + { + "epoch": 0.039265950520833336, + "grad_norm": 8.50133228302002, + "learning_rate": 4.981936437540073e-06, + "loss": 3.5247, + "step": 3860 + }, + { + "epoch": 0.039316813151041664, + "grad_norm": 15.226415634155273, + "learning_rate": 4.981888422337897e-06, + "loss": 3.5207, + "step": 3865 + }, + { + "epoch": 0.03936767578125, + "grad_norm": 15.351889610290527, + "learning_rate": 4.981840343637194e-06, + "loss": 3.6821, + "step": 3870 + }, + { + "epoch": 0.039418538411458336, + "grad_norm": 11.84135627746582, + "learning_rate": 4.981792201439195e-06, + "loss": 3.4474, + "step": 3875 + }, + { + "epoch": 0.039469401041666664, + "grad_norm": 19.106542587280273, + "learning_rate": 4.9817439957451295e-06, + "loss": 3.4201, + "step": 3880 + }, + { + "epoch": 0.039520263671875, + "grad_norm": 14.762589454650879, + "learning_rate": 4.981695726556233e-06, + "loss": 3.6159, + "step": 3885 + }, + { + "epoch": 0.039571126302083336, + "grad_norm": 10.755833625793457, + "learning_rate": 4.98164739387374e-06, + "loss": 3.6952, + "step": 3890 + }, + { + "epoch": 0.039621988932291664, + "grad_norm": 7.459182262420654, + "learning_rate": 4.9815989976988856e-06, + "loss": 3.3008, + "step": 3895 + }, + { + "epoch": 0.0396728515625, + "grad_norm": 16.29890251159668, + "learning_rate": 4.98155053803291e-06, + "loss": 3.7614, + "step": 3900 + }, + { + "epoch": 0.039723714192708336, + "grad_norm": 15.238051414489746, + "learning_rate": 4.981502014877051e-06, + "loss": 3.5197, + "step": 3905 + }, + { + "epoch": 0.039774576822916664, + "grad_norm": 16.749492645263672, + "learning_rate": 4.981453428232551e-06, + "loss": 4.1579, + "step": 3910 + }, + { + "epoch": 0.039825439453125, + "grad_norm": 14.327499389648438, + "learning_rate": 4.981404778100654e-06, + "loss": 3.1206, + "step": 3915 + }, + { + "epoch": 0.039876302083333336, + "grad_norm": 18.08525848388672, + "learning_rate": 4.981356064482604e-06, + "loss": 3.5667, + "step": 3920 + }, + { + "epoch": 0.039927164713541664, + "grad_norm": 9.953446388244629, + "learning_rate": 4.981307287379647e-06, + "loss": 3.1772, + "step": 3925 + }, + { + "epoch": 0.03997802734375, + "grad_norm": 12.199675559997559, + "learning_rate": 4.9812584467930315e-06, + "loss": 3.3682, + "step": 3930 + }, + { + "epoch": 0.040028889973958336, + "grad_norm": 14.886872291564941, + "learning_rate": 4.981209542724006e-06, + "loss": 3.3351, + "step": 3935 + }, + { + "epoch": 0.040079752604166664, + "grad_norm": 15.33847427368164, + "learning_rate": 4.981160575173823e-06, + "loss": 3.2065, + "step": 3940 + }, + { + "epoch": 0.040130615234375, + "grad_norm": 10.76516056060791, + "learning_rate": 4.981111544143735e-06, + "loss": 3.6167, + "step": 3945 + }, + { + "epoch": 0.040181477864583336, + "grad_norm": 10.835091590881348, + "learning_rate": 4.981062449634996e-06, + "loss": 3.3896, + "step": 3950 + }, + { + "epoch": 0.040232340494791664, + "grad_norm": 8.555797576904297, + "learning_rate": 4.981013291648861e-06, + "loss": 3.3618, + "step": 3955 + }, + { + "epoch": 0.040283203125, + "grad_norm": 20.06720733642578, + "learning_rate": 4.980964070186591e-06, + "loss": 3.1369, + "step": 3960 + }, + { + "epoch": 0.040334065755208336, + "grad_norm": 10.057026863098145, + "learning_rate": 4.9809147852494425e-06, + "loss": 3.4979, + "step": 3965 + }, + { + "epoch": 0.040384928385416664, + "grad_norm": 9.190557479858398, + "learning_rate": 4.980865436838677e-06, + "loss": 3.5271, + "step": 3970 + }, + { + "epoch": 0.040435791015625, + "grad_norm": 13.96591854095459, + "learning_rate": 4.9808160249555585e-06, + "loss": 3.8185, + "step": 3975 + }, + { + "epoch": 0.040486653645833336, + "grad_norm": 13.018309593200684, + "learning_rate": 4.980766549601349e-06, + "loss": 3.3418, + "step": 3980 + }, + { + "epoch": 0.040537516276041664, + "grad_norm": 12.482734680175781, + "learning_rate": 4.9807170107773155e-06, + "loss": 3.7106, + "step": 3985 + }, + { + "epoch": 0.04058837890625, + "grad_norm": 19.169965744018555, + "learning_rate": 4.980667408484725e-06, + "loss": 3.7609, + "step": 3990 + }, + { + "epoch": 0.040639241536458336, + "grad_norm": 16.576343536376953, + "learning_rate": 4.980617742724847e-06, + "loss": 3.564, + "step": 3995 + }, + { + "epoch": 0.040690104166666664, + "grad_norm": 13.268610954284668, + "learning_rate": 4.980568013498952e-06, + "loss": 3.5274, + "step": 4000 + }, + { + "epoch": 0.040740966796875, + "grad_norm": 11.195540428161621, + "learning_rate": 4.980518220808312e-06, + "loss": 3.6116, + "step": 4005 + }, + { + "epoch": 0.040791829427083336, + "grad_norm": 10.44593620300293, + "learning_rate": 4.980468364654202e-06, + "loss": 3.4021, + "step": 4010 + }, + { + "epoch": 0.040842692057291664, + "grad_norm": 49.621131896972656, + "learning_rate": 4.980418445037897e-06, + "loss": 3.6859, + "step": 4015 + }, + { + "epoch": 0.0408935546875, + "grad_norm": 14.462821006774902, + "learning_rate": 4.980368461960673e-06, + "loss": 3.4476, + "step": 4020 + }, + { + "epoch": 0.040944417317708336, + "grad_norm": 10.35971736907959, + "learning_rate": 4.98031841542381e-06, + "loss": 3.6455, + "step": 4025 + }, + { + "epoch": 0.040995279947916664, + "grad_norm": 16.127132415771484, + "learning_rate": 4.980268305428589e-06, + "loss": 3.4884, + "step": 4030 + }, + { + "epoch": 0.041046142578125, + "grad_norm": 11.943824768066406, + "learning_rate": 4.980218131976291e-06, + "loss": 3.7037, + "step": 4035 + }, + { + "epoch": 0.041097005208333336, + "grad_norm": 13.393162727355957, + "learning_rate": 4.9801678950682e-06, + "loss": 3.5249, + "step": 4040 + }, + { + "epoch": 0.041147867838541664, + "grad_norm": 13.681143760681152, + "learning_rate": 4.9801175947056005e-06, + "loss": 3.4255, + "step": 4045 + }, + { + "epoch": 0.04119873046875, + "grad_norm": 14.354728698730469, + "learning_rate": 4.980067230889781e-06, + "loss": 3.3633, + "step": 4050 + }, + { + "epoch": 0.041249593098958336, + "grad_norm": 15.5615873336792, + "learning_rate": 4.9800168036220295e-06, + "loss": 3.5337, + "step": 4055 + }, + { + "epoch": 0.041300455729166664, + "grad_norm": 8.801639556884766, + "learning_rate": 4.9799663129036354e-06, + "loss": 3.5424, + "step": 4060 + }, + { + "epoch": 0.041351318359375, + "grad_norm": 11.887249946594238, + "learning_rate": 4.9799157587358905e-06, + "loss": 3.595, + "step": 4065 + }, + { + "epoch": 0.041402180989583336, + "grad_norm": 10.93061637878418, + "learning_rate": 4.979865141120089e-06, + "loss": 3.2786, + "step": 4070 + }, + { + "epoch": 0.041453043619791664, + "grad_norm": 13.5011625289917, + "learning_rate": 4.979814460057527e-06, + "loss": 3.3899, + "step": 4075 + }, + { + "epoch": 0.04150390625, + "grad_norm": 13.714580535888672, + "learning_rate": 4.979763715549498e-06, + "loss": 3.3754, + "step": 4080 + }, + { + "epoch": 0.041554768880208336, + "grad_norm": 8.235960960388184, + "learning_rate": 4.9797129075973025e-06, + "loss": 3.4213, + "step": 4085 + }, + { + "epoch": 0.041605631510416664, + "grad_norm": 9.007630348205566, + "learning_rate": 4.979662036202241e-06, + "loss": 3.5357, + "step": 4090 + }, + { + "epoch": 0.041656494140625, + "grad_norm": 11.23054313659668, + "learning_rate": 4.979611101365613e-06, + "loss": 3.7044, + "step": 4095 + }, + { + "epoch": 0.041707356770833336, + "grad_norm": 12.494560241699219, + "learning_rate": 4.979560103088723e-06, + "loss": 3.5578, + "step": 4100 + }, + { + "epoch": 0.041758219401041664, + "grad_norm": 15.440023422241211, + "learning_rate": 4.979509041372876e-06, + "loss": 3.3444, + "step": 4105 + }, + { + "epoch": 0.04180908203125, + "grad_norm": 14.82787799835205, + "learning_rate": 4.979457916219378e-06, + "loss": 3.7682, + "step": 4110 + }, + { + "epoch": 0.041859944661458336, + "grad_norm": 13.421795845031738, + "learning_rate": 4.979406727629536e-06, + "loss": 3.673, + "step": 4115 + }, + { + "epoch": 0.041910807291666664, + "grad_norm": 19.039793014526367, + "learning_rate": 4.979355475604661e-06, + "loss": 3.5433, + "step": 4120 + }, + { + "epoch": 0.041961669921875, + "grad_norm": 16.655475616455078, + "learning_rate": 4.979304160146064e-06, + "loss": 3.2191, + "step": 4125 + }, + { + "epoch": 0.042012532552083336, + "grad_norm": 13.381031036376953, + "learning_rate": 4.979252781255057e-06, + "loss": 3.1412, + "step": 4130 + }, + { + "epoch": 0.042063395182291664, + "grad_norm": 11.062989234924316, + "learning_rate": 4.979201338932956e-06, + "loss": 3.8812, + "step": 4135 + }, + { + "epoch": 0.0421142578125, + "grad_norm": 11.04766845703125, + "learning_rate": 4.979149833181076e-06, + "loss": 3.1701, + "step": 4140 + }, + { + "epoch": 0.042165120442708336, + "grad_norm": 12.870123863220215, + "learning_rate": 4.979098264000736e-06, + "loss": 3.4735, + "step": 4145 + }, + { + "epoch": 0.042215983072916664, + "grad_norm": 11.865944862365723, + "learning_rate": 4.979046631393253e-06, + "loss": 3.666, + "step": 4150 + }, + { + "epoch": 0.042266845703125, + "grad_norm": 10.82459831237793, + "learning_rate": 4.97899493535995e-06, + "loss": 3.6551, + "step": 4155 + }, + { + "epoch": 0.042317708333333336, + "grad_norm": 12.600262641906738, + "learning_rate": 4.97894317590215e-06, + "loss": 3.8571, + "step": 4160 + }, + { + "epoch": 0.042368570963541664, + "grad_norm": 9.6876220703125, + "learning_rate": 4.978891353021176e-06, + "loss": 3.5783, + "step": 4165 + }, + { + "epoch": 0.04241943359375, + "grad_norm": 13.637721061706543, + "learning_rate": 4.978839466718354e-06, + "loss": 3.7054, + "step": 4170 + }, + { + "epoch": 0.042470296223958336, + "grad_norm": 10.315239906311035, + "learning_rate": 4.978787516995012e-06, + "loss": 3.442, + "step": 4175 + }, + { + "epoch": 0.042521158854166664, + "grad_norm": 12.303563117980957, + "learning_rate": 4.9787355038524785e-06, + "loss": 3.425, + "step": 4180 + }, + { + "epoch": 0.042572021484375, + "grad_norm": 8.549363136291504, + "learning_rate": 4.978683427292086e-06, + "loss": 3.3095, + "step": 4185 + }, + { + "epoch": 0.042622884114583336, + "grad_norm": 14.406485557556152, + "learning_rate": 4.978631287315165e-06, + "loss": 3.4881, + "step": 4190 + }, + { + "epoch": 0.042673746744791664, + "grad_norm": 13.812122344970703, + "learning_rate": 4.978579083923049e-06, + "loss": 3.5369, + "step": 4195 + }, + { + "epoch": 0.042724609375, + "grad_norm": 14.43535041809082, + "learning_rate": 4.978526817117075e-06, + "loss": 3.5955, + "step": 4200 + }, + { + "epoch": 0.042775472005208336, + "grad_norm": 14.955559730529785, + "learning_rate": 4.97847448689858e-06, + "loss": 3.4783, + "step": 4205 + }, + { + "epoch": 0.042826334635416664, + "grad_norm": 15.662449836730957, + "learning_rate": 4.978422093268903e-06, + "loss": 3.0456, + "step": 4210 + }, + { + "epoch": 0.042877197265625, + "grad_norm": 21.45525360107422, + "learning_rate": 4.978369636229383e-06, + "loss": 3.3702, + "step": 4215 + }, + { + "epoch": 0.042928059895833336, + "grad_norm": 11.133001327514648, + "learning_rate": 4.978317115781365e-06, + "loss": 3.7719, + "step": 4220 + }, + { + "epoch": 0.042978922526041664, + "grad_norm": 14.71216869354248, + "learning_rate": 4.97826453192619e-06, + "loss": 3.2268, + "step": 4225 + }, + { + "epoch": 0.04302978515625, + "grad_norm": 8.537089347839355, + "learning_rate": 4.978211884665205e-06, + "loss": 3.3107, + "step": 4230 + }, + { + "epoch": 0.043080647786458336, + "grad_norm": 20.554950714111328, + "learning_rate": 4.978159173999756e-06, + "loss": 3.9013, + "step": 4235 + }, + { + "epoch": 0.043131510416666664, + "grad_norm": 16.465560913085938, + "learning_rate": 4.9781063999311914e-06, + "loss": 3.5264, + "step": 4240 + }, + { + "epoch": 0.043182373046875, + "grad_norm": 12.345917701721191, + "learning_rate": 4.978053562460863e-06, + "loss": 3.6222, + "step": 4245 + }, + { + "epoch": 0.043233235677083336, + "grad_norm": 14.286460876464844, + "learning_rate": 4.978000661590121e-06, + "loss": 3.55, + "step": 4250 + }, + { + "epoch": 0.043284098307291664, + "grad_norm": 16.970375061035156, + "learning_rate": 4.97794769732032e-06, + "loss": 3.7756, + "step": 4255 + }, + { + "epoch": 0.0433349609375, + "grad_norm": 12.45829963684082, + "learning_rate": 4.977894669652814e-06, + "loss": 3.5234, + "step": 4260 + }, + { + "epoch": 0.043385823567708336, + "grad_norm": 7.817399501800537, + "learning_rate": 4.97784157858896e-06, + "loss": 3.658, + "step": 4265 + }, + { + "epoch": 0.043436686197916664, + "grad_norm": 16.19847869873047, + "learning_rate": 4.9777884241301165e-06, + "loss": 3.5021, + "step": 4270 + }, + { + "epoch": 0.043487548828125, + "grad_norm": 8.416207313537598, + "learning_rate": 4.977735206277644e-06, + "loss": 3.3046, + "step": 4275 + }, + { + "epoch": 0.043538411458333336, + "grad_norm": 13.35494613647461, + "learning_rate": 4.977681925032902e-06, + "loss": 3.3918, + "step": 4280 + }, + { + "epoch": 0.043589274088541664, + "grad_norm": 10.832646369934082, + "learning_rate": 4.977628580397257e-06, + "loss": 3.4338, + "step": 4285 + }, + { + "epoch": 0.04364013671875, + "grad_norm": 9.665356636047363, + "learning_rate": 4.977575172372072e-06, + "loss": 3.5508, + "step": 4290 + }, + { + "epoch": 0.043690999348958336, + "grad_norm": 11.935736656188965, + "learning_rate": 4.977521700958712e-06, + "loss": 3.463, + "step": 4295 + }, + { + "epoch": 0.043741861979166664, + "grad_norm": 16.61982536315918, + "learning_rate": 4.977468166158548e-06, + "loss": 3.7634, + "step": 4300 + }, + { + "epoch": 0.043792724609375, + "grad_norm": 14.604238510131836, + "learning_rate": 4.977414567972948e-06, + "loss": 3.3686, + "step": 4305 + }, + { + "epoch": 0.043843587239583336, + "grad_norm": 13.716455459594727, + "learning_rate": 4.977360906403283e-06, + "loss": 3.374, + "step": 4310 + }, + { + "epoch": 0.043894449869791664, + "grad_norm": 11.446566581726074, + "learning_rate": 4.977307181450926e-06, + "loss": 3.3746, + "step": 4315 + }, + { + "epoch": 0.0439453125, + "grad_norm": 9.788915634155273, + "learning_rate": 4.977253393117253e-06, + "loss": 3.5568, + "step": 4320 + }, + { + "epoch": 0.043996175130208336, + "grad_norm": 203.79078674316406, + "learning_rate": 4.977199541403638e-06, + "loss": 3.7115, + "step": 4325 + }, + { + "epoch": 0.044047037760416664, + "grad_norm": 14.938610076904297, + "learning_rate": 4.97714562631146e-06, + "loss": 3.6643, + "step": 4330 + }, + { + "epoch": 0.044097900390625, + "grad_norm": 15.485106468200684, + "learning_rate": 4.977091647842099e-06, + "loss": 3.5269, + "step": 4335 + }, + { + "epoch": 0.044148763020833336, + "grad_norm": 12.620315551757812, + "learning_rate": 4.977037605996936e-06, + "loss": 4.1156, + "step": 4340 + }, + { + "epoch": 0.044199625651041664, + "grad_norm": 16.161775588989258, + "learning_rate": 4.976983500777352e-06, + "loss": 3.845, + "step": 4345 + }, + { + "epoch": 0.04425048828125, + "grad_norm": 14.427170753479004, + "learning_rate": 4.976929332184732e-06, + "loss": 3.7344, + "step": 4350 + }, + { + "epoch": 0.044301350911458336, + "grad_norm": 15.274232864379883, + "learning_rate": 4.976875100220462e-06, + "loss": 3.3062, + "step": 4355 + }, + { + "epoch": 0.044352213541666664, + "grad_norm": 8.53735637664795, + "learning_rate": 4.97682080488593e-06, + "loss": 3.272, + "step": 4360 + }, + { + "epoch": 0.044403076171875, + "grad_norm": 20.58081817626953, + "learning_rate": 4.9767664461825246e-06, + "loss": 3.3835, + "step": 4365 + }, + { + "epoch": 0.044453938802083336, + "grad_norm": 8.867008209228516, + "learning_rate": 4.976712024111637e-06, + "loss": 3.36, + "step": 4370 + }, + { + "epoch": 0.044504801432291664, + "grad_norm": 13.777779579162598, + "learning_rate": 4.976657538674659e-06, + "loss": 3.6504, + "step": 4375 + }, + { + "epoch": 0.0445556640625, + "grad_norm": 13.533966064453125, + "learning_rate": 4.9766029898729865e-06, + "loss": 3.6923, + "step": 4380 + }, + { + "epoch": 0.044606526692708336, + "grad_norm": 16.9124698638916, + "learning_rate": 4.976548377708011e-06, + "loss": 3.7102, + "step": 4385 + }, + { + "epoch": 0.044657389322916664, + "grad_norm": 14.597479820251465, + "learning_rate": 4.9764937021811345e-06, + "loss": 3.6415, + "step": 4390 + }, + { + "epoch": 0.044708251953125, + "grad_norm": 12.073945999145508, + "learning_rate": 4.976438963293753e-06, + "loss": 3.5683, + "step": 4395 + }, + { + "epoch": 0.044759114583333336, + "grad_norm": 10.953951835632324, + "learning_rate": 4.976384161047266e-06, + "loss": 3.4114, + "step": 4400 + }, + { + "epoch": 0.044809977213541664, + "grad_norm": 11.895225524902344, + "learning_rate": 4.976329295443079e-06, + "loss": 3.5981, + "step": 4405 + }, + { + "epoch": 0.04486083984375, + "grad_norm": 13.530261039733887, + "learning_rate": 4.976274366482593e-06, + "loss": 3.416, + "step": 4410 + }, + { + "epoch": 0.044911702473958336, + "grad_norm": 12.748217582702637, + "learning_rate": 4.9762193741672145e-06, + "loss": 3.6393, + "step": 4415 + }, + { + "epoch": 0.044962565104166664, + "grad_norm": 34.55634307861328, + "learning_rate": 4.976164318498351e-06, + "loss": 3.6435, + "step": 4420 + }, + { + "epoch": 0.045013427734375, + "grad_norm": 9.771318435668945, + "learning_rate": 4.9761091994774095e-06, + "loss": 3.8975, + "step": 4425 + }, + { + "epoch": 0.045064290364583336, + "grad_norm": 10.606549263000488, + "learning_rate": 4.976054017105801e-06, + "loss": 3.8694, + "step": 4430 + }, + { + "epoch": 0.045115152994791664, + "grad_norm": 18.156574249267578, + "learning_rate": 4.975998771384938e-06, + "loss": 3.8872, + "step": 4435 + }, + { + "epoch": 0.045166015625, + "grad_norm": 12.91622257232666, + "learning_rate": 4.9759434623162325e-06, + "loss": 3.5453, + "step": 4440 + }, + { + "epoch": 0.045216878255208336, + "grad_norm": 12.400276184082031, + "learning_rate": 4.975888089901101e-06, + "loss": 3.7277, + "step": 4445 + }, + { + "epoch": 0.045267740885416664, + "grad_norm": 11.56432056427002, + "learning_rate": 4.97583265414096e-06, + "loss": 3.62, + "step": 4450 + }, + { + "epoch": 0.045318603515625, + "grad_norm": 12.74065113067627, + "learning_rate": 4.975777155037226e-06, + "loss": 3.2927, + "step": 4455 + }, + { + "epoch": 0.045369466145833336, + "grad_norm": 16.921592712402344, + "learning_rate": 4.975721592591321e-06, + "loss": 3.4098, + "step": 4460 + }, + { + "epoch": 0.045420328776041664, + "grad_norm": 10.564414024353027, + "learning_rate": 4.975665966804666e-06, + "loss": 3.5746, + "step": 4465 + }, + { + "epoch": 0.04547119140625, + "grad_norm": 11.815017700195312, + "learning_rate": 4.9756102776786845e-06, + "loss": 3.0929, + "step": 4470 + }, + { + "epoch": 0.045522054036458336, + "grad_norm": 11.405243873596191, + "learning_rate": 4.9755545252147995e-06, + "loss": 3.8489, + "step": 4475 + }, + { + "epoch": 0.045572916666666664, + "grad_norm": 14.713571548461914, + "learning_rate": 4.97549870941444e-06, + "loss": 3.7178, + "step": 4480 + }, + { + "epoch": 0.045623779296875, + "grad_norm": 11.61874771118164, + "learning_rate": 4.9754428302790325e-06, + "loss": 3.3223, + "step": 4485 + }, + { + "epoch": 0.045674641927083336, + "grad_norm": 8.922009468078613, + "learning_rate": 4.975386887810007e-06, + "loss": 3.7321, + "step": 4490 + }, + { + "epoch": 0.045725504557291664, + "grad_norm": 8.890037536621094, + "learning_rate": 4.975330882008794e-06, + "loss": 3.4178, + "step": 4495 + }, + { + "epoch": 0.0457763671875, + "grad_norm": 11.523167610168457, + "learning_rate": 4.9752748128768275e-06, + "loss": 3.8556, + "step": 4500 + }, + { + "epoch": 0.045827229817708336, + "grad_norm": 15.356710433959961, + "learning_rate": 4.975218680415541e-06, + "loss": 3.4063, + "step": 4505 + }, + { + "epoch": 0.045878092447916664, + "grad_norm": 12.003191947937012, + "learning_rate": 4.9751624846263725e-06, + "loss": 3.727, + "step": 4510 + }, + { + "epoch": 0.045928955078125, + "grad_norm": 11.115232467651367, + "learning_rate": 4.9751062255107575e-06, + "loss": 3.3798, + "step": 4515 + }, + { + "epoch": 0.045979817708333336, + "grad_norm": 12.786600112915039, + "learning_rate": 4.975049903070137e-06, + "loss": 3.8232, + "step": 4520 + }, + { + "epoch": 0.046030680338541664, + "grad_norm": 9.778971672058105, + "learning_rate": 4.974993517305952e-06, + "loss": 3.5046, + "step": 4525 + }, + { + "epoch": 0.04608154296875, + "grad_norm": 11.696513175964355, + "learning_rate": 4.974937068219643e-06, + "loss": 3.5063, + "step": 4530 + }, + { + "epoch": 0.046132405598958336, + "grad_norm": 10.245550155639648, + "learning_rate": 4.974880555812656e-06, + "loss": 3.2886, + "step": 4535 + }, + { + "epoch": 0.046183268229166664, + "grad_norm": 15.772186279296875, + "learning_rate": 4.9748239800864375e-06, + "loss": 3.482, + "step": 4540 + }, + { + "epoch": 0.046234130859375, + "grad_norm": 11.759252548217773, + "learning_rate": 4.974767341042433e-06, + "loss": 3.663, + "step": 4545 + }, + { + "epoch": 0.046284993489583336, + "grad_norm": 9.798601150512695, + "learning_rate": 4.9747106386820934e-06, + "loss": 3.3534, + "step": 4550 + }, + { + "epoch": 0.046335856119791664, + "grad_norm": 13.721288681030273, + "learning_rate": 4.9746538730068684e-06, + "loss": 3.4893, + "step": 4555 + }, + { + "epoch": 0.04638671875, + "grad_norm": 11.46217155456543, + "learning_rate": 4.974597044018211e-06, + "loss": 3.3706, + "step": 4560 + }, + { + "epoch": 0.046437581380208336, + "grad_norm": 8.599750518798828, + "learning_rate": 4.974540151717574e-06, + "loss": 4.0034, + "step": 4565 + }, + { + "epoch": 0.046488444010416664, + "grad_norm": 15.226737022399902, + "learning_rate": 4.974483196106415e-06, + "loss": 3.591, + "step": 4570 + }, + { + "epoch": 0.046539306640625, + "grad_norm": 8.270947456359863, + "learning_rate": 4.9744261771861894e-06, + "loss": 3.3649, + "step": 4575 + }, + { + "epoch": 0.046590169270833336, + "grad_norm": 13.826691627502441, + "learning_rate": 4.974369094958356e-06, + "loss": 3.7075, + "step": 4580 + }, + { + "epoch": 0.046641031901041664, + "grad_norm": 15.241434097290039, + "learning_rate": 4.974311949424376e-06, + "loss": 3.9796, + "step": 4585 + }, + { + "epoch": 0.04669189453125, + "grad_norm": 9.045734405517578, + "learning_rate": 4.974254740585712e-06, + "loss": 3.3453, + "step": 4590 + }, + { + "epoch": 0.046742757161458336, + "grad_norm": 12.359314918518066, + "learning_rate": 4.974197468443826e-06, + "loss": 3.5554, + "step": 4595 + }, + { + "epoch": 0.046793619791666664, + "grad_norm": 16.416982650756836, + "learning_rate": 4.974140133000184e-06, + "loss": 3.4799, + "step": 4600 + }, + { + "epoch": 0.046844482421875, + "grad_norm": 12.722060203552246, + "learning_rate": 4.974082734256254e-06, + "loss": 3.8829, + "step": 4605 + }, + { + "epoch": 0.046895345052083336, + "grad_norm": 14.915850639343262, + "learning_rate": 4.9740252722135035e-06, + "loss": 3.0652, + "step": 4610 + }, + { + "epoch": 0.046946207682291664, + "grad_norm": 15.305362701416016, + "learning_rate": 4.973967746873403e-06, + "loss": 3.2942, + "step": 4615 + }, + { + "epoch": 0.0469970703125, + "grad_norm": 18.63602638244629, + "learning_rate": 4.973910158237423e-06, + "loss": 3.1882, + "step": 4620 + }, + { + "epoch": 0.047047932942708336, + "grad_norm": 15.647112846374512, + "learning_rate": 4.973852506307039e-06, + "loss": 3.2307, + "step": 4625 + }, + { + "epoch": 0.047098795572916664, + "grad_norm": 13.300541877746582, + "learning_rate": 4.973794791083725e-06, + "loss": 3.6601, + "step": 4630 + }, + { + "epoch": 0.047149658203125, + "grad_norm": 12.027398109436035, + "learning_rate": 4.9737370125689575e-06, + "loss": 3.5073, + "step": 4635 + }, + { + "epoch": 0.047200520833333336, + "grad_norm": 9.269347190856934, + "learning_rate": 4.973679170764214e-06, + "loss": 4.1268, + "step": 4640 + }, + { + "epoch": 0.047251383463541664, + "grad_norm": 9.16490650177002, + "learning_rate": 4.973621265670976e-06, + "loss": 3.2929, + "step": 4645 + }, + { + "epoch": 0.04730224609375, + "grad_norm": 11.5504789352417, + "learning_rate": 4.973563297290724e-06, + "loss": 3.369, + "step": 4650 + }, + { + "epoch": 0.047353108723958336, + "grad_norm": 15.80301284790039, + "learning_rate": 4.973505265624942e-06, + "loss": 3.4352, + "step": 4655 + }, + { + "epoch": 0.047403971354166664, + "grad_norm": 9.65170669555664, + "learning_rate": 4.9734471706751135e-06, + "loss": 3.8803, + "step": 4660 + }, + { + "epoch": 0.047454833984375, + "grad_norm": 11.459211349487305, + "learning_rate": 4.9733890124427255e-06, + "loss": 3.5879, + "step": 4665 + }, + { + "epoch": 0.047505696614583336, + "grad_norm": 16.0178279876709, + "learning_rate": 4.973330790929266e-06, + "loss": 3.6492, + "step": 4670 + }, + { + "epoch": 0.047556559244791664, + "grad_norm": 16.536197662353516, + "learning_rate": 4.973272506136224e-06, + "loss": 3.2745, + "step": 4675 + }, + { + "epoch": 0.047607421875, + "grad_norm": 10.868372917175293, + "learning_rate": 4.973214158065092e-06, + "loss": 3.4472, + "step": 4680 + }, + { + "epoch": 0.047658284505208336, + "grad_norm": 14.260408401489258, + "learning_rate": 4.973155746717361e-06, + "loss": 3.4165, + "step": 4685 + }, + { + "epoch": 0.047709147135416664, + "grad_norm": 10.171429634094238, + "learning_rate": 4.973097272094527e-06, + "loss": 3.209, + "step": 4690 + }, + { + "epoch": 0.047760009765625, + "grad_norm": 13.096430778503418, + "learning_rate": 4.973038734198086e-06, + "loss": 3.4693, + "step": 4695 + }, + { + "epoch": 0.047810872395833336, + "grad_norm": 15.2460355758667, + "learning_rate": 4.972980133029534e-06, + "loss": 3.4677, + "step": 4700 + }, + { + "epoch": 0.047861735026041664, + "grad_norm": 13.545559883117676, + "learning_rate": 4.9729214685903725e-06, + "loss": 3.4694, + "step": 4705 + }, + { + "epoch": 0.04791259765625, + "grad_norm": 17.726600646972656, + "learning_rate": 4.9728627408821e-06, + "loss": 3.5932, + "step": 4710 + }, + { + "epoch": 0.047963460286458336, + "grad_norm": 13.018586158752441, + "learning_rate": 4.972803949906222e-06, + "loss": 3.4267, + "step": 4715 + }, + { + "epoch": 0.048014322916666664, + "grad_norm": 15.820887565612793, + "learning_rate": 4.9727450956642395e-06, + "loss": 3.3878, + "step": 4720 + }, + { + "epoch": 0.048065185546875, + "grad_norm": 10.09266471862793, + "learning_rate": 4.972686178157661e-06, + "loss": 3.7088, + "step": 4725 + }, + { + "epoch": 0.048116048177083336, + "grad_norm": 14.535094261169434, + "learning_rate": 4.972627197387993e-06, + "loss": 3.492, + "step": 4730 + }, + { + "epoch": 0.048166910807291664, + "grad_norm": 16.890783309936523, + "learning_rate": 4.972568153356744e-06, + "loss": 3.3656, + "step": 4735 + }, + { + "epoch": 0.0482177734375, + "grad_norm": 8.608535766601562, + "learning_rate": 4.972509046065423e-06, + "loss": 3.4594, + "step": 4740 + }, + { + "epoch": 0.048268636067708336, + "grad_norm": 12.446272850036621, + "learning_rate": 4.9724498755155455e-06, + "loss": 3.3436, + "step": 4745 + }, + { + "epoch": 0.048319498697916664, + "grad_norm": 11.247904777526855, + "learning_rate": 4.972390641708625e-06, + "loss": 3.7488, + "step": 4750 + }, + { + "epoch": 0.048370361328125, + "grad_norm": 10.85303783416748, + "learning_rate": 4.972331344646175e-06, + "loss": 3.4678, + "step": 4755 + }, + { + "epoch": 0.048421223958333336, + "grad_norm": 12.21154499053955, + "learning_rate": 4.972271984329713e-06, + "loss": 3.4773, + "step": 4760 + }, + { + "epoch": 0.048472086588541664, + "grad_norm": 12.006597518920898, + "learning_rate": 4.9722125607607595e-06, + "loss": 3.5087, + "step": 4765 + }, + { + "epoch": 0.04852294921875, + "grad_norm": 9.981345176696777, + "learning_rate": 4.972153073940833e-06, + "loss": 3.5338, + "step": 4770 + }, + { + "epoch": 0.048573811848958336, + "grad_norm": 13.28870964050293, + "learning_rate": 4.972093523871456e-06, + "loss": 3.2124, + "step": 4775 + }, + { + "epoch": 0.048624674479166664, + "grad_norm": 11.399847984313965, + "learning_rate": 4.972033910554151e-06, + "loss": 3.251, + "step": 4780 + }, + { + "epoch": 0.048675537109375, + "grad_norm": 11.87983226776123, + "learning_rate": 4.971974233990447e-06, + "loss": 3.2554, + "step": 4785 + }, + { + "epoch": 0.048726399739583336, + "grad_norm": 10.418863296508789, + "learning_rate": 4.971914494181866e-06, + "loss": 3.8199, + "step": 4790 + }, + { + "epoch": 0.048777262369791664, + "grad_norm": 13.546000480651855, + "learning_rate": 4.971854691129939e-06, + "loss": 3.6464, + "step": 4795 + }, + { + "epoch": 0.048828125, + "grad_norm": 16.547273635864258, + "learning_rate": 4.9717948248361954e-06, + "loss": 3.3086, + "step": 4800 + }, + { + "epoch": 0.048878987630208336, + "grad_norm": 10.11819839477539, + "learning_rate": 4.971734895302168e-06, + "loss": 3.5591, + "step": 4805 + }, + { + "epoch": 0.048929850260416664, + "grad_norm": 11.45274829864502, + "learning_rate": 4.971674902529389e-06, + "loss": 3.666, + "step": 4810 + }, + { + "epoch": 0.048980712890625, + "grad_norm": 12.099120140075684, + "learning_rate": 4.971614846519393e-06, + "loss": 3.6187, + "step": 4815 + }, + { + "epoch": 0.049031575520833336, + "grad_norm": 9.340841293334961, + "learning_rate": 4.971554727273718e-06, + "loss": 3.4518, + "step": 4820 + }, + { + "epoch": 0.049082438151041664, + "grad_norm": 14.398073196411133, + "learning_rate": 4.9714945447939e-06, + "loss": 3.514, + "step": 4825 + }, + { + "epoch": 0.04913330078125, + "grad_norm": 10.84057331085205, + "learning_rate": 4.97143429908148e-06, + "loss": 4.0011, + "step": 4830 + }, + { + "epoch": 0.049184163411458336, + "grad_norm": 14.066889762878418, + "learning_rate": 4.971373990137999e-06, + "loss": 3.4912, + "step": 4835 + }, + { + "epoch": 0.049235026041666664, + "grad_norm": 13.424138069152832, + "learning_rate": 4.971313617965001e-06, + "loss": 3.4337, + "step": 4840 + }, + { + "epoch": 0.049285888671875, + "grad_norm": 15.70202922821045, + "learning_rate": 4.971253182564029e-06, + "loss": 3.5523, + "step": 4845 + }, + { + "epoch": 0.049336751302083336, + "grad_norm": 15.583520889282227, + "learning_rate": 4.971192683936631e-06, + "loss": 3.548, + "step": 4850 + }, + { + "epoch": 0.049387613932291664, + "grad_norm": 16.691743850708008, + "learning_rate": 4.9711321220843535e-06, + "loss": 3.3919, + "step": 4855 + }, + { + "epoch": 0.0494384765625, + "grad_norm": 15.311267852783203, + "learning_rate": 4.971071497008746e-06, + "loss": 3.275, + "step": 4860 + }, + { + "epoch": 0.049489339192708336, + "grad_norm": 11.667826652526855, + "learning_rate": 4.971010808711361e-06, + "loss": 3.3042, + "step": 4865 + }, + { + "epoch": 0.049540201822916664, + "grad_norm": 12.931231498718262, + "learning_rate": 4.9709500571937485e-06, + "loss": 3.6733, + "step": 4870 + }, + { + "epoch": 0.049591064453125, + "grad_norm": 14.819002151489258, + "learning_rate": 4.970889242457466e-06, + "loss": 3.3424, + "step": 4875 + }, + { + "epoch": 0.049641927083333336, + "grad_norm": 11.588568687438965, + "learning_rate": 4.9708283645040675e-06, + "loss": 3.2318, + "step": 4880 + }, + { + "epoch": 0.049692789713541664, + "grad_norm": 14.38076400756836, + "learning_rate": 4.970767423335111e-06, + "loss": 3.5042, + "step": 4885 + }, + { + "epoch": 0.04974365234375, + "grad_norm": 11.927538871765137, + "learning_rate": 4.970706418952155e-06, + "loss": 3.4414, + "step": 4890 + }, + { + "epoch": 0.049794514973958336, + "grad_norm": 17.747844696044922, + "learning_rate": 4.970645351356761e-06, + "loss": 3.9686, + "step": 4895 + }, + { + "epoch": 0.049845377604166664, + "grad_norm": 13.796122550964355, + "learning_rate": 4.970584220550492e-06, + "loss": 4.094, + "step": 4900 + }, + { + "epoch": 0.049896240234375, + "grad_norm": 13.095043182373047, + "learning_rate": 4.97052302653491e-06, + "loss": 3.7906, + "step": 4905 + }, + { + "epoch": 0.049947102864583336, + "grad_norm": 12.416662216186523, + "learning_rate": 4.970461769311583e-06, + "loss": 3.4955, + "step": 4910 + }, + { + "epoch": 0.049997965494791664, + "grad_norm": 12.235672950744629, + "learning_rate": 4.970400448882078e-06, + "loss": 3.0274, + "step": 4915 + }, + { + "epoch": 0.050048828125, + "grad_norm": 12.012406349182129, + "learning_rate": 4.9703390652479615e-06, + "loss": 3.3271, + "step": 4920 + }, + { + "epoch": 0.050099690755208336, + "grad_norm": 11.558436393737793, + "learning_rate": 4.970277618410806e-06, + "loss": 3.4439, + "step": 4925 + }, + { + "epoch": 0.050150553385416664, + "grad_norm": 10.99045467376709, + "learning_rate": 4.970216108372184e-06, + "loss": 3.4762, + "step": 4930 + }, + { + "epoch": 0.050201416015625, + "grad_norm": 14.257964134216309, + "learning_rate": 4.970154535133667e-06, + "loss": 4.1728, + "step": 4935 + }, + { + "epoch": 0.050252278645833336, + "grad_norm": 14.24341106414795, + "learning_rate": 4.970092898696832e-06, + "loss": 3.3222, + "step": 4940 + }, + { + "epoch": 0.050303141276041664, + "grad_norm": 17.90021324157715, + "learning_rate": 4.9700311990632565e-06, + "loss": 3.2803, + "step": 4945 + }, + { + "epoch": 0.05035400390625, + "grad_norm": 13.78532600402832, + "learning_rate": 4.9699694362345175e-06, + "loss": 3.6847, + "step": 4950 + }, + { + "epoch": 0.050404866536458336, + "grad_norm": 9.49715518951416, + "learning_rate": 4.969907610212197e-06, + "loss": 3.4094, + "step": 4955 + }, + { + "epoch": 0.050455729166666664, + "grad_norm": 16.413652420043945, + "learning_rate": 4.969845720997874e-06, + "loss": 3.4125, + "step": 4960 + }, + { + "epoch": 0.050506591796875, + "grad_norm": 11.681251525878906, + "learning_rate": 4.969783768593135e-06, + "loss": 3.7634, + "step": 4965 + }, + { + "epoch": 0.050557454427083336, + "grad_norm": 12.140727043151855, + "learning_rate": 4.969721752999563e-06, + "loss": 3.3175, + "step": 4970 + }, + { + "epoch": 0.050608317057291664, + "grad_norm": 17.70606231689453, + "learning_rate": 4.9696596742187455e-06, + "loss": 3.7512, + "step": 4975 + }, + { + "epoch": 0.0506591796875, + "grad_norm": 14.161558151245117, + "learning_rate": 4.969597532252271e-06, + "loss": 3.2145, + "step": 4980 + }, + { + "epoch": 0.050710042317708336, + "grad_norm": 9.752479553222656, + "learning_rate": 4.969535327101729e-06, + "loss": 3.3454, + "step": 4985 + }, + { + "epoch": 0.050760904947916664, + "grad_norm": 10.96493911743164, + "learning_rate": 4.96947305876871e-06, + "loss": 3.493, + "step": 4990 + }, + { + "epoch": 0.050811767578125, + "grad_norm": 10.895770072937012, + "learning_rate": 4.969410727254809e-06, + "loss": 3.1966, + "step": 4995 + }, + { + "epoch": 0.050862630208333336, + "grad_norm": 14.102956771850586, + "learning_rate": 4.96934833256162e-06, + "loss": 3.4336, + "step": 5000 + }, + { + "epoch": 0.050913492838541664, + "grad_norm": 10.122415542602539, + "learning_rate": 4.9692858746907395e-06, + "loss": 3.275, + "step": 5005 + }, + { + "epoch": 0.05096435546875, + "grad_norm": 16.442476272583008, + "learning_rate": 4.969223353643764e-06, + "loss": 3.7156, + "step": 5010 + }, + { + "epoch": 0.051015218098958336, + "grad_norm": 12.107405662536621, + "learning_rate": 4.969160769422294e-06, + "loss": 3.645, + "step": 5015 + }, + { + "epoch": 0.051066080729166664, + "grad_norm": 18.26190948486328, + "learning_rate": 4.969098122027932e-06, + "loss": 3.5743, + "step": 5020 + }, + { + "epoch": 0.051116943359375, + "grad_norm": 11.404193878173828, + "learning_rate": 4.96903541146228e-06, + "loss": 3.2735, + "step": 5025 + }, + { + "epoch": 0.051167805989583336, + "grad_norm": 14.428089141845703, + "learning_rate": 4.968972637726942e-06, + "loss": 3.7864, + "step": 5030 + }, + { + "epoch": 0.051218668619791664, + "grad_norm": 10.478737831115723, + "learning_rate": 4.9689098008235235e-06, + "loss": 3.8017, + "step": 5035 + }, + { + "epoch": 0.05126953125, + "grad_norm": 15.701272010803223, + "learning_rate": 4.968846900753634e-06, + "loss": 3.4754, + "step": 5040 + }, + { + "epoch": 0.051320393880208336, + "grad_norm": 11.944446563720703, + "learning_rate": 4.968783937518882e-06, + "loss": 3.5693, + "step": 5045 + }, + { + "epoch": 0.051371256510416664, + "grad_norm": 9.105219841003418, + "learning_rate": 4.968720911120876e-06, + "loss": 3.3752, + "step": 5050 + }, + { + "epoch": 0.051422119140625, + "grad_norm": 17.28725814819336, + "learning_rate": 4.968657821561233e-06, + "loss": 3.542, + "step": 5055 + }, + { + "epoch": 0.051472981770833336, + "grad_norm": 9.695812225341797, + "learning_rate": 4.9685946688415635e-06, + "loss": 3.3903, + "step": 5060 + }, + { + "epoch": 0.051523844401041664, + "grad_norm": 11.877903938293457, + "learning_rate": 4.968531452963485e-06, + "loss": 3.8497, + "step": 5065 + }, + { + "epoch": 0.05157470703125, + "grad_norm": 14.348855018615723, + "learning_rate": 4.968468173928614e-06, + "loss": 3.2782, + "step": 5070 + }, + { + "epoch": 0.051625569661458336, + "grad_norm": 14.756049156188965, + "learning_rate": 4.96840483173857e-06, + "loss": 4.0242, + "step": 5075 + }, + { + "epoch": 0.051676432291666664, + "grad_norm": 14.059425354003906, + "learning_rate": 4.968341426394974e-06, + "loss": 3.2894, + "step": 5080 + }, + { + "epoch": 0.051727294921875, + "grad_norm": 11.158661842346191, + "learning_rate": 4.968277957899446e-06, + "loss": 3.6611, + "step": 5085 + }, + { + "epoch": 0.051778157552083336, + "grad_norm": 18.206151962280273, + "learning_rate": 4.968214426253613e-06, + "loss": 3.4665, + "step": 5090 + }, + { + "epoch": 0.051829020182291664, + "grad_norm": 8.762948989868164, + "learning_rate": 4.968150831459099e-06, + "loss": 3.4422, + "step": 5095 + }, + { + "epoch": 0.0518798828125, + "grad_norm": 14.994571685791016, + "learning_rate": 4.968087173517531e-06, + "loss": 3.602, + "step": 5100 + }, + { + "epoch": 0.051930745442708336, + "grad_norm": 12.887874603271484, + "learning_rate": 4.968023452430537e-06, + "loss": 3.12, + "step": 5105 + }, + { + "epoch": 0.051981608072916664, + "grad_norm": 15.499711990356445, + "learning_rate": 4.967959668199748e-06, + "loss": 3.7486, + "step": 5110 + }, + { + "epoch": 0.052032470703125, + "grad_norm": 12.390300750732422, + "learning_rate": 4.967895820826796e-06, + "loss": 3.4817, + "step": 5115 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 13.03585433959961, + "learning_rate": 4.967831910313314e-06, + "loss": 3.1236, + "step": 5120 + }, + { + "epoch": 0.052134195963541664, + "grad_norm": 11.844696998596191, + "learning_rate": 4.967767936660939e-06, + "loss": 3.4756, + "step": 5125 + }, + { + "epoch": 0.05218505859375, + "grad_norm": 12.35226821899414, + "learning_rate": 4.967703899871304e-06, + "loss": 3.481, + "step": 5130 + }, + { + "epoch": 0.052235921223958336, + "grad_norm": 10.799506187438965, + "learning_rate": 4.967639799946052e-06, + "loss": 3.7089, + "step": 5135 + }, + { + "epoch": 0.052286783854166664, + "grad_norm": 11.297710418701172, + "learning_rate": 4.967575636886819e-06, + "loss": 3.5326, + "step": 5140 + }, + { + "epoch": 0.052337646484375, + "grad_norm": 9.424179077148438, + "learning_rate": 4.967511410695249e-06, + "loss": 3.7654, + "step": 5145 + }, + { + "epoch": 0.052388509114583336, + "grad_norm": 11.021271705627441, + "learning_rate": 4.9674471213729836e-06, + "loss": 3.278, + "step": 5150 + }, + { + "epoch": 0.052439371744791664, + "grad_norm": 12.35921573638916, + "learning_rate": 4.9673827689216695e-06, + "loss": 3.4304, + "step": 5155 + }, + { + "epoch": 0.052490234375, + "grad_norm": 12.476812362670898, + "learning_rate": 4.967318353342952e-06, + "loss": 3.3936, + "step": 5160 + }, + { + "epoch": 0.052541097005208336, + "grad_norm": 10.683613777160645, + "learning_rate": 4.967253874638478e-06, + "loss": 3.1559, + "step": 5165 + }, + { + "epoch": 0.052591959635416664, + "grad_norm": 13.43266773223877, + "learning_rate": 4.967189332809899e-06, + "loss": 3.8937, + "step": 5170 + }, + { + "epoch": 0.052642822265625, + "grad_norm": 17.731548309326172, + "learning_rate": 4.967124727858867e-06, + "loss": 3.4165, + "step": 5175 + }, + { + "epoch": 0.052693684895833336, + "grad_norm": 15.984275817871094, + "learning_rate": 4.967060059787032e-06, + "loss": 3.6395, + "step": 5180 + }, + { + "epoch": 0.052744547526041664, + "grad_norm": 18.11372184753418, + "learning_rate": 4.96699532859605e-06, + "loss": 3.495, + "step": 5185 + }, + { + "epoch": 0.05279541015625, + "grad_norm": 13.443792343139648, + "learning_rate": 4.9669305342875785e-06, + "loss": 4.1017, + "step": 5190 + }, + { + "epoch": 0.052846272786458336, + "grad_norm": 16.402788162231445, + "learning_rate": 4.9668656768632725e-06, + "loss": 3.4839, + "step": 5195 + }, + { + "epoch": 0.052897135416666664, + "grad_norm": 14.646184921264648, + "learning_rate": 4.966800756324794e-06, + "loss": 3.7131, + "step": 5200 + }, + { + "epoch": 0.052947998046875, + "grad_norm": 16.304649353027344, + "learning_rate": 4.966735772673803e-06, + "loss": 3.4049, + "step": 5205 + }, + { + "epoch": 0.052998860677083336, + "grad_norm": 17.611019134521484, + "learning_rate": 4.966670725911962e-06, + "loss": 3.2992, + "step": 5210 + }, + { + "epoch": 0.053049723307291664, + "grad_norm": 9.94887638092041, + "learning_rate": 4.966605616040935e-06, + "loss": 3.498, + "step": 5215 + }, + { + "epoch": 0.0531005859375, + "grad_norm": 14.684574127197266, + "learning_rate": 4.9665404430623874e-06, + "loss": 3.6487, + "step": 5220 + }, + { + "epoch": 0.053151448567708336, + "grad_norm": 14.74726676940918, + "learning_rate": 4.9664752069779875e-06, + "loss": 3.3533, + "step": 5225 + }, + { + "epoch": 0.053202311197916664, + "grad_norm": 16.72749137878418, + "learning_rate": 4.966409907789403e-06, + "loss": 3.203, + "step": 5230 + }, + { + "epoch": 0.053253173828125, + "grad_norm": 12.44540786743164, + "learning_rate": 4.966344545498307e-06, + "loss": 3.4401, + "step": 5235 + }, + { + "epoch": 0.053304036458333336, + "grad_norm": 10.099539756774902, + "learning_rate": 4.96627912010637e-06, + "loss": 3.5915, + "step": 5240 + }, + { + "epoch": 0.053354899088541664, + "grad_norm": 11.097341537475586, + "learning_rate": 4.966213631615266e-06, + "loss": 3.9558, + "step": 5245 + }, + { + "epoch": 0.05340576171875, + "grad_norm": 13.878585815429688, + "learning_rate": 4.966148080026671e-06, + "loss": 3.4617, + "step": 5250 + }, + { + "epoch": 0.053456624348958336, + "grad_norm": 10.348923683166504, + "learning_rate": 4.966082465342263e-06, + "loss": 3.6141, + "step": 5255 + }, + { + "epoch": 0.053507486979166664, + "grad_norm": 9.475302696228027, + "learning_rate": 4.966016787563719e-06, + "loss": 3.2901, + "step": 5260 + }, + { + "epoch": 0.053558349609375, + "grad_norm": 16.38394546508789, + "learning_rate": 4.965951046692719e-06, + "loss": 3.1904, + "step": 5265 + }, + { + "epoch": 0.053609212239583336, + "grad_norm": 14.318309783935547, + "learning_rate": 4.965885242730947e-06, + "loss": 3.3258, + "step": 5270 + }, + { + "epoch": 0.053660074869791664, + "grad_norm": 8.593515396118164, + "learning_rate": 4.965819375680085e-06, + "loss": 3.341, + "step": 5275 + }, + { + "epoch": 0.0537109375, + "grad_norm": 14.667823791503906, + "learning_rate": 4.9657534455418186e-06, + "loss": 3.5297, + "step": 5280 + }, + { + "epoch": 0.053761800130208336, + "grad_norm": 15.408576965332031, + "learning_rate": 4.965687452317836e-06, + "loss": 3.3244, + "step": 5285 + }, + { + "epoch": 0.053812662760416664, + "grad_norm": 15.13183307647705, + "learning_rate": 4.9656213960098235e-06, + "loss": 3.3597, + "step": 5290 + }, + { + "epoch": 0.053863525390625, + "grad_norm": 10.050660133361816, + "learning_rate": 4.965555276619471e-06, + "loss": 3.2173, + "step": 5295 + }, + { + "epoch": 0.053914388020833336, + "grad_norm": 11.996289253234863, + "learning_rate": 4.965489094148473e-06, + "loss": 3.258, + "step": 5300 + }, + { + "epoch": 0.053965250651041664, + "grad_norm": 13.994873046875, + "learning_rate": 4.965422848598519e-06, + "loss": 3.9475, + "step": 5305 + }, + { + "epoch": 0.05401611328125, + "grad_norm": 13.173226356506348, + "learning_rate": 4.965356539971307e-06, + "loss": 3.5109, + "step": 5310 + }, + { + "epoch": 0.054066975911458336, + "grad_norm": 11.68813419342041, + "learning_rate": 4.965290168268532e-06, + "loss": 3.7385, + "step": 5315 + }, + { + "epoch": 0.054117838541666664, + "grad_norm": 13.431297302246094, + "learning_rate": 4.965223733491893e-06, + "loss": 3.4336, + "step": 5320 + }, + { + "epoch": 0.054168701171875, + "grad_norm": 14.174638748168945, + "learning_rate": 4.965157235643088e-06, + "loss": 3.2346, + "step": 5325 + }, + { + "epoch": 0.054219563802083336, + "grad_norm": 14.978144645690918, + "learning_rate": 4.96509067472382e-06, + "loss": 3.6034, + "step": 5330 + }, + { + "epoch": 0.054270426432291664, + "grad_norm": 11.926872253417969, + "learning_rate": 4.965024050735792e-06, + "loss": 3.4996, + "step": 5335 + }, + { + "epoch": 0.0543212890625, + "grad_norm": 9.099247932434082, + "learning_rate": 4.9649573636807065e-06, + "loss": 3.1218, + "step": 5340 + }, + { + "epoch": 0.054372151692708336, + "grad_norm": 17.655933380126953, + "learning_rate": 4.964890613560272e-06, + "loss": 3.6185, + "step": 5345 + }, + { + "epoch": 0.054423014322916664, + "grad_norm": 10.588809967041016, + "learning_rate": 4.964823800376195e-06, + "loss": 3.5396, + "step": 5350 + }, + { + "epoch": 0.054473876953125, + "grad_norm": 12.350024223327637, + "learning_rate": 4.964756924130186e-06, + "loss": 3.7472, + "step": 5355 + }, + { + "epoch": 0.054524739583333336, + "grad_norm": 13.325206756591797, + "learning_rate": 4.964689984823955e-06, + "loss": 3.1619, + "step": 5360 + }, + { + "epoch": 0.054575602213541664, + "grad_norm": 12.419147491455078, + "learning_rate": 4.964622982459214e-06, + "loss": 3.3821, + "step": 5365 + }, + { + "epoch": 0.05462646484375, + "grad_norm": 11.1918306350708, + "learning_rate": 4.964555917037679e-06, + "loss": 3.2978, + "step": 5370 + }, + { + "epoch": 0.054677327473958336, + "grad_norm": 8.211668014526367, + "learning_rate": 4.964488788561066e-06, + "loss": 3.1058, + "step": 5375 + }, + { + "epoch": 0.054728190104166664, + "grad_norm": 14.706374168395996, + "learning_rate": 4.96442159703109e-06, + "loss": 3.5759, + "step": 5380 + }, + { + "epoch": 0.054779052734375, + "grad_norm": 12.363343238830566, + "learning_rate": 4.964354342449472e-06, + "loss": 3.3521, + "step": 5385 + }, + { + "epoch": 0.054829915364583336, + "grad_norm": 11.750041007995605, + "learning_rate": 4.964287024817933e-06, + "loss": 3.7942, + "step": 5390 + }, + { + "epoch": 0.054880777994791664, + "grad_norm": 15.14163589477539, + "learning_rate": 4.964219644138194e-06, + "loss": 3.7889, + "step": 5395 + }, + { + "epoch": 0.054931640625, + "grad_norm": 14.809743881225586, + "learning_rate": 4.964152200411979e-06, + "loss": 3.1966, + "step": 5400 + }, + { + "epoch": 0.054982503255208336, + "grad_norm": 10.114827156066895, + "learning_rate": 4.964084693641014e-06, + "loss": 3.5248, + "step": 5405 + }, + { + "epoch": 0.055033365885416664, + "grad_norm": 12.720572471618652, + "learning_rate": 4.964017123827027e-06, + "loss": 3.1197, + "step": 5410 + }, + { + "epoch": 0.055084228515625, + "grad_norm": 8.508932113647461, + "learning_rate": 4.963949490971746e-06, + "loss": 3.8478, + "step": 5415 + }, + { + "epoch": 0.055135091145833336, + "grad_norm": 7.770327091217041, + "learning_rate": 4.963881795076901e-06, + "loss": 3.0364, + "step": 5420 + }, + { + "epoch": 0.055185953776041664, + "grad_norm": 13.418745994567871, + "learning_rate": 4.963814036144223e-06, + "loss": 3.1178, + "step": 5425 + }, + { + "epoch": 0.05523681640625, + "grad_norm": 17.769771575927734, + "learning_rate": 4.963746214175448e-06, + "loss": 3.5998, + "step": 5430 + }, + { + "epoch": 0.055287679036458336, + "grad_norm": 12.966880798339844, + "learning_rate": 4.96367832917231e-06, + "loss": 3.2851, + "step": 5435 + }, + { + "epoch": 0.055338541666666664, + "grad_norm": 13.409180641174316, + "learning_rate": 4.9636103811365464e-06, + "loss": 3.7049, + "step": 5440 + }, + { + "epoch": 0.055389404296875, + "grad_norm": 13.363730430603027, + "learning_rate": 4.963542370069895e-06, + "loss": 3.6024, + "step": 5445 + }, + { + "epoch": 0.055440266927083336, + "grad_norm": 13.596270561218262, + "learning_rate": 4.963474295974095e-06, + "loss": 3.5165, + "step": 5450 + }, + { + "epoch": 0.055491129557291664, + "grad_norm": 10.22996711730957, + "learning_rate": 4.96340615885089e-06, + "loss": 3.6675, + "step": 5455 + }, + { + "epoch": 0.0555419921875, + "grad_norm": 14.241738319396973, + "learning_rate": 4.963337958702022e-06, + "loss": 3.3507, + "step": 5460 + }, + { + "epoch": 0.055592854817708336, + "grad_norm": 9.234009742736816, + "learning_rate": 4.963269695529236e-06, + "loss": 3.4202, + "step": 5465 + }, + { + "epoch": 0.055643717447916664, + "grad_norm": 13.222739219665527, + "learning_rate": 4.963201369334279e-06, + "loss": 3.7927, + "step": 5470 + }, + { + "epoch": 0.055694580078125, + "grad_norm": 11.704004287719727, + "learning_rate": 4.963132980118899e-06, + "loss": 3.3365, + "step": 5475 + }, + { + "epoch": 0.055745442708333336, + "grad_norm": 11.018651008605957, + "learning_rate": 4.963064527884845e-06, + "loss": 3.6051, + "step": 5480 + }, + { + "epoch": 0.055796305338541664, + "grad_norm": 14.942146301269531, + "learning_rate": 4.96299601263387e-06, + "loss": 3.2443, + "step": 5485 + }, + { + "epoch": 0.05584716796875, + "grad_norm": 9.587372779846191, + "learning_rate": 4.962927434367724e-06, + "loss": 3.637, + "step": 5490 + }, + { + "epoch": 0.055898030598958336, + "grad_norm": 9.663505554199219, + "learning_rate": 4.9628587930881646e-06, + "loss": 3.5114, + "step": 5495 + }, + { + "epoch": 0.055948893229166664, + "grad_norm": 15.723515510559082, + "learning_rate": 4.962790088796946e-06, + "loss": 3.5283, + "step": 5500 + }, + { + "epoch": 0.055999755859375, + "grad_norm": 11.564032554626465, + "learning_rate": 4.962721321495827e-06, + "loss": 3.523, + "step": 5505 + }, + { + "epoch": 0.056050618489583336, + "grad_norm": 15.196375846862793, + "learning_rate": 4.962652491186567e-06, + "loss": 3.4245, + "step": 5510 + }, + { + "epoch": 0.056101481119791664, + "grad_norm": 11.132039070129395, + "learning_rate": 4.962583597870927e-06, + "loss": 3.6306, + "step": 5515 + }, + { + "epoch": 0.05615234375, + "grad_norm": 13.807767868041992, + "learning_rate": 4.962514641550668e-06, + "loss": 3.284, + "step": 5520 + }, + { + "epoch": 0.056203206380208336, + "grad_norm": 8.87157154083252, + "learning_rate": 4.962445622227558e-06, + "loss": 3.5285, + "step": 5525 + }, + { + "epoch": 0.056254069010416664, + "grad_norm": 16.512109756469727, + "learning_rate": 4.962376539903359e-06, + "loss": 3.5056, + "step": 5530 + }, + { + "epoch": 0.056304931640625, + "grad_norm": 10.745555877685547, + "learning_rate": 4.962307394579839e-06, + "loss": 3.2763, + "step": 5535 + }, + { + "epoch": 0.056355794270833336, + "grad_norm": 10.458619117736816, + "learning_rate": 4.9622381862587685e-06, + "loss": 3.3848, + "step": 5540 + }, + { + "epoch": 0.056406656901041664, + "grad_norm": 10.014710426330566, + "learning_rate": 4.962168914941919e-06, + "loss": 3.4247, + "step": 5545 + }, + { + "epoch": 0.05645751953125, + "grad_norm": 15.231337547302246, + "learning_rate": 4.96209958063106e-06, + "loss": 3.368, + "step": 5550 + }, + { + "epoch": 0.056508382161458336, + "grad_norm": 14.728731155395508, + "learning_rate": 4.962030183327967e-06, + "loss": 3.4259, + "step": 5555 + }, + { + "epoch": 0.056559244791666664, + "grad_norm": 14.27651596069336, + "learning_rate": 4.961960723034415e-06, + "loss": 3.3908, + "step": 5560 + }, + { + "epoch": 0.056610107421875, + "grad_norm": 15.396354675292969, + "learning_rate": 4.961891199752182e-06, + "loss": 3.6155, + "step": 5565 + }, + { + "epoch": 0.056660970052083336, + "grad_norm": 13.059240341186523, + "learning_rate": 4.961821613483047e-06, + "loss": 3.737, + "step": 5570 + }, + { + "epoch": 0.056711832682291664, + "grad_norm": 21.01386833190918, + "learning_rate": 4.961751964228788e-06, + "loss": 3.4121, + "step": 5575 + }, + { + "epoch": 0.0567626953125, + "grad_norm": 12.324559211730957, + "learning_rate": 4.961682251991189e-06, + "loss": 3.1513, + "step": 5580 + }, + { + "epoch": 0.056813557942708336, + "grad_norm": 11.327527046203613, + "learning_rate": 4.961612476772033e-06, + "loss": 3.4368, + "step": 5585 + }, + { + "epoch": 0.056864420572916664, + "grad_norm": 9.986822128295898, + "learning_rate": 4.961542638573106e-06, + "loss": 3.5226, + "step": 5590 + }, + { + "epoch": 0.056915283203125, + "grad_norm": 16.936189651489258, + "learning_rate": 4.961472737396193e-06, + "loss": 3.698, + "step": 5595 + }, + { + "epoch": 0.056966145833333336, + "grad_norm": 15.606807708740234, + "learning_rate": 4.9614027732430835e-06, + "loss": 3.4487, + "step": 5600 + }, + { + "epoch": 0.057017008463541664, + "grad_norm": 11.489161491394043, + "learning_rate": 4.961332746115568e-06, + "loss": 3.5364, + "step": 5605 + }, + { + "epoch": 0.05706787109375, + "grad_norm": 13.7288818359375, + "learning_rate": 4.9612626560154375e-06, + "loss": 3.6168, + "step": 5610 + }, + { + "epoch": 0.057118733723958336, + "grad_norm": 8.421143531799316, + "learning_rate": 4.961192502944485e-06, + "loss": 3.4713, + "step": 5615 + }, + { + "epoch": 0.057169596354166664, + "grad_norm": 17.100997924804688, + "learning_rate": 4.961122286904506e-06, + "loss": 3.392, + "step": 5620 + }, + { + "epoch": 0.057220458984375, + "grad_norm": 17.024621963500977, + "learning_rate": 4.961052007897297e-06, + "loss": 3.6999, + "step": 5625 + }, + { + "epoch": 0.057271321614583336, + "grad_norm": 8.612135887145996, + "learning_rate": 4.960981665924655e-06, + "loss": 3.2406, + "step": 5630 + }, + { + "epoch": 0.057322184244791664, + "grad_norm": 11.135616302490234, + "learning_rate": 4.9609112609883816e-06, + "loss": 3.9177, + "step": 5635 + }, + { + "epoch": 0.057373046875, + "grad_norm": 13.029520988464355, + "learning_rate": 4.960840793090276e-06, + "loss": 3.4563, + "step": 5640 + }, + { + "epoch": 0.057423909505208336, + "grad_norm": 12.987259864807129, + "learning_rate": 4.960770262232141e-06, + "loss": 3.4741, + "step": 5645 + }, + { + "epoch": 0.057474772135416664, + "grad_norm": 8.016860008239746, + "learning_rate": 4.960699668415784e-06, + "loss": 3.423, + "step": 5650 + }, + { + "epoch": 0.057525634765625, + "grad_norm": 11.892767906188965, + "learning_rate": 4.960629011643008e-06, + "loss": 3.0988, + "step": 5655 + }, + { + "epoch": 0.057576497395833336, + "grad_norm": 7.135873794555664, + "learning_rate": 4.960558291915622e-06, + "loss": 3.2616, + "step": 5660 + }, + { + "epoch": 0.057627360026041664, + "grad_norm": 10.335238456726074, + "learning_rate": 4.960487509235435e-06, + "loss": 3.5319, + "step": 5665 + }, + { + "epoch": 0.05767822265625, + "grad_norm": 10.540165901184082, + "learning_rate": 4.96041666360426e-06, + "loss": 3.3135, + "step": 5670 + }, + { + "epoch": 0.057729085286458336, + "grad_norm": 17.09685516357422, + "learning_rate": 4.9603457550239065e-06, + "loss": 3.5976, + "step": 5675 + }, + { + "epoch": 0.057779947916666664, + "grad_norm": 13.739051818847656, + "learning_rate": 4.96027478349619e-06, + "loss": 3.5974, + "step": 5680 + }, + { + "epoch": 0.057830810546875, + "grad_norm": 13.99421501159668, + "learning_rate": 4.960203749022927e-06, + "loss": 3.2972, + "step": 5685 + }, + { + "epoch": 0.057881673177083336, + "grad_norm": 12.529129028320312, + "learning_rate": 4.960132651605934e-06, + "loss": 3.4988, + "step": 5690 + }, + { + "epoch": 0.057932535807291664, + "grad_norm": 13.570207595825195, + "learning_rate": 4.96006149124703e-06, + "loss": 3.2342, + "step": 5695 + }, + { + "epoch": 0.0579833984375, + "grad_norm": 8.2381591796875, + "learning_rate": 4.959990267948035e-06, + "loss": 3.6295, + "step": 5700 + }, + { + "epoch": 0.058034261067708336, + "grad_norm": 10.554726600646973, + "learning_rate": 4.959918981710773e-06, + "loss": 3.6647, + "step": 5705 + }, + { + "epoch": 0.058085123697916664, + "grad_norm": 10.161996841430664, + "learning_rate": 4.959847632537067e-06, + "loss": 3.3671, + "step": 5710 + }, + { + "epoch": 0.058135986328125, + "grad_norm": 10.479255676269531, + "learning_rate": 4.959776220428743e-06, + "loss": 3.6848, + "step": 5715 + }, + { + "epoch": 0.058186848958333336, + "grad_norm": 13.689743995666504, + "learning_rate": 4.959704745387626e-06, + "loss": 3.6923, + "step": 5720 + }, + { + "epoch": 0.058237711588541664, + "grad_norm": 15.075189590454102, + "learning_rate": 4.9596332074155465e-06, + "loss": 3.1929, + "step": 5725 + }, + { + "epoch": 0.05828857421875, + "grad_norm": 10.657966613769531, + "learning_rate": 4.959561606514335e-06, + "loss": 3.3856, + "step": 5730 + }, + { + "epoch": 0.058339436848958336, + "grad_norm": 17.425458908081055, + "learning_rate": 4.959489942685822e-06, + "loss": 3.5663, + "step": 5735 + }, + { + "epoch": 0.058390299479166664, + "grad_norm": 10.390463829040527, + "learning_rate": 4.959418215931843e-06, + "loss": 4.1035, + "step": 5740 + }, + { + "epoch": 0.058441162109375, + "grad_norm": 9.901952743530273, + "learning_rate": 4.959346426254231e-06, + "loss": 3.8645, + "step": 5745 + }, + { + "epoch": 0.058492024739583336, + "grad_norm": 12.191425323486328, + "learning_rate": 4.9592745736548235e-06, + "loss": 3.5104, + "step": 5750 + }, + { + "epoch": 0.058542887369791664, + "grad_norm": 8.743103981018066, + "learning_rate": 4.959202658135459e-06, + "loss": 3.4598, + "step": 5755 + }, + { + "epoch": 0.05859375, + "grad_norm": 12.222476959228516, + "learning_rate": 4.959130679697978e-06, + "loss": 3.4976, + "step": 5760 + }, + { + "epoch": 0.058644612630208336, + "grad_norm": 12.773666381835938, + "learning_rate": 4.95905863834422e-06, + "loss": 3.3259, + "step": 5765 + }, + { + "epoch": 0.058695475260416664, + "grad_norm": 12.655311584472656, + "learning_rate": 4.958986534076031e-06, + "loss": 3.6635, + "step": 5770 + }, + { + "epoch": 0.058746337890625, + "grad_norm": 10.425118446350098, + "learning_rate": 4.9589143668952536e-06, + "loss": 3.3051, + "step": 5775 + }, + { + "epoch": 0.058797200520833336, + "grad_norm": 12.868205070495605, + "learning_rate": 4.958842136803735e-06, + "loss": 3.6249, + "step": 5780 + }, + { + "epoch": 0.058848063151041664, + "grad_norm": 15.800631523132324, + "learning_rate": 4.958769843803324e-06, + "loss": 3.5991, + "step": 5785 + }, + { + "epoch": 0.05889892578125, + "grad_norm": 10.733988761901855, + "learning_rate": 4.958697487895869e-06, + "loss": 3.6806, + "step": 5790 + }, + { + "epoch": 0.058949788411458336, + "grad_norm": 14.783018112182617, + "learning_rate": 4.9586250690832214e-06, + "loss": 3.6191, + "step": 5795 + }, + { + "epoch": 0.059000651041666664, + "grad_norm": 9.55103874206543, + "learning_rate": 4.958552587367233e-06, + "loss": 3.2202, + "step": 5800 + }, + { + "epoch": 0.059051513671875, + "grad_norm": 12.465963363647461, + "learning_rate": 4.958480042749762e-06, + "loss": 3.0866, + "step": 5805 + }, + { + "epoch": 0.059102376302083336, + "grad_norm": 10.959587097167969, + "learning_rate": 4.958407435232659e-06, + "loss": 3.6214, + "step": 5810 + }, + { + "epoch": 0.059153238932291664, + "grad_norm": 20.63323974609375, + "learning_rate": 4.958334764817786e-06, + "loss": 4.0361, + "step": 5815 + }, + { + "epoch": 0.0592041015625, + "grad_norm": 7.549029350280762, + "learning_rate": 4.9582620315070005e-06, + "loss": 3.3141, + "step": 5820 + }, + { + "epoch": 0.059254964192708336, + "grad_norm": 12.9930419921875, + "learning_rate": 4.958189235302164e-06, + "loss": 3.397, + "step": 5825 + }, + { + "epoch": 0.059305826822916664, + "grad_norm": 15.772438049316406, + "learning_rate": 4.958116376205138e-06, + "loss": 3.375, + "step": 5830 + }, + { + "epoch": 0.059356689453125, + "grad_norm": 14.877005577087402, + "learning_rate": 4.9580434542177875e-06, + "loss": 3.2812, + "step": 5835 + }, + { + "epoch": 0.059407552083333336, + "grad_norm": 14.17616081237793, + "learning_rate": 4.957970469341977e-06, + "loss": 3.4907, + "step": 5840 + }, + { + "epoch": 0.059458414713541664, + "grad_norm": 16.345909118652344, + "learning_rate": 4.957897421579576e-06, + "loss": 3.3758, + "step": 5845 + }, + { + "epoch": 0.05950927734375, + "grad_norm": 15.003165245056152, + "learning_rate": 4.957824310932451e-06, + "loss": 3.0737, + "step": 5850 + }, + { + "epoch": 0.059560139973958336, + "grad_norm": 14.897906303405762, + "learning_rate": 4.957751137402475e-06, + "loss": 3.2275, + "step": 5855 + }, + { + "epoch": 0.059611002604166664, + "grad_norm": 13.421442031860352, + "learning_rate": 4.957677900991516e-06, + "loss": 3.2209, + "step": 5860 + }, + { + "epoch": 0.059661865234375, + "grad_norm": 9.91036319732666, + "learning_rate": 4.957604601701453e-06, + "loss": 3.9609, + "step": 5865 + }, + { + "epoch": 0.059712727864583336, + "grad_norm": 18.146198272705078, + "learning_rate": 4.957531239534158e-06, + "loss": 3.4241, + "step": 5870 + }, + { + "epoch": 0.059763590494791664, + "grad_norm": 13.376401901245117, + "learning_rate": 4.957457814491509e-06, + "loss": 3.4611, + "step": 5875 + }, + { + "epoch": 0.059814453125, + "grad_norm": 9.991517066955566, + "learning_rate": 4.957384326575383e-06, + "loss": 3.1355, + "step": 5880 + }, + { + "epoch": 0.059865315755208336, + "grad_norm": 8.1071138381958, + "learning_rate": 4.9573107757876625e-06, + "loss": 3.4358, + "step": 5885 + }, + { + "epoch": 0.059916178385416664, + "grad_norm": 16.746749877929688, + "learning_rate": 4.9572371621302284e-06, + "loss": 3.6471, + "step": 5890 + }, + { + "epoch": 0.059967041015625, + "grad_norm": 16.02521514892578, + "learning_rate": 4.957163485604963e-06, + "loss": 3.5407, + "step": 5895 + }, + { + "epoch": 0.060017903645833336, + "grad_norm": 8.261488914489746, + "learning_rate": 4.957089746213753e-06, + "loss": 3.2416, + "step": 5900 + }, + { + "epoch": 0.060068766276041664, + "grad_norm": 12.552051544189453, + "learning_rate": 4.957015943958484e-06, + "loss": 3.3061, + "step": 5905 + }, + { + "epoch": 0.06011962890625, + "grad_norm": 15.044920921325684, + "learning_rate": 4.956942078841045e-06, + "loss": 3.8459, + "step": 5910 + }, + { + "epoch": 0.060170491536458336, + "grad_norm": 13.593306541442871, + "learning_rate": 4.9568681508633246e-06, + "loss": 3.2017, + "step": 5915 + }, + { + "epoch": 0.060221354166666664, + "grad_norm": 12.529535293579102, + "learning_rate": 4.956794160027215e-06, + "loss": 3.7657, + "step": 5920 + }, + { + "epoch": 0.060272216796875, + "grad_norm": 13.670187950134277, + "learning_rate": 4.9567201063346096e-06, + "loss": 3.5618, + "step": 5925 + }, + { + "epoch": 0.060323079427083336, + "grad_norm": 14.781356811523438, + "learning_rate": 4.956645989787402e-06, + "loss": 3.3705, + "step": 5930 + }, + { + "epoch": 0.060373942057291664, + "grad_norm": 12.808194160461426, + "learning_rate": 4.95657181038749e-06, + "loss": 3.5778, + "step": 5935 + }, + { + "epoch": 0.0604248046875, + "grad_norm": 9.7361421585083, + "learning_rate": 4.956497568136769e-06, + "loss": 3.9065, + "step": 5940 + }, + { + "epoch": 0.060475667317708336, + "grad_norm": 8.544392585754395, + "learning_rate": 4.9564232630371414e-06, + "loss": 3.0755, + "step": 5945 + }, + { + "epoch": 0.060526529947916664, + "grad_norm": 16.483707427978516, + "learning_rate": 4.956348895090506e-06, + "loss": 3.2558, + "step": 5950 + }, + { + "epoch": 0.060577392578125, + "grad_norm": 17.052963256835938, + "learning_rate": 4.956274464298766e-06, + "loss": 3.6055, + "step": 5955 + }, + { + "epoch": 0.060628255208333336, + "grad_norm": 13.169681549072266, + "learning_rate": 4.956199970663827e-06, + "loss": 3.6049, + "step": 5960 + }, + { + "epoch": 0.060679117838541664, + "grad_norm": 11.642385482788086, + "learning_rate": 4.956125414187594e-06, + "loss": 3.3229, + "step": 5965 + }, + { + "epoch": 0.06072998046875, + "grad_norm": 9.508110046386719, + "learning_rate": 4.956050794871974e-06, + "loss": 3.2438, + "step": 5970 + }, + { + "epoch": 0.060780843098958336, + "grad_norm": 9.767243385314941, + "learning_rate": 4.955976112718876e-06, + "loss": 3.623, + "step": 5975 + }, + { + "epoch": 0.060831705729166664, + "grad_norm": 13.47176456451416, + "learning_rate": 4.955901367730212e-06, + "loss": 3.5425, + "step": 5980 + }, + { + "epoch": 0.060882568359375, + "grad_norm": 13.689672470092773, + "learning_rate": 4.9558265599078935e-06, + "loss": 3.4059, + "step": 5985 + }, + { + "epoch": 0.060933430989583336, + "grad_norm": 14.70511531829834, + "learning_rate": 4.955751689253834e-06, + "loss": 3.2993, + "step": 5990 + }, + { + "epoch": 0.060984293619791664, + "grad_norm": 10.678279876708984, + "learning_rate": 4.955676755769951e-06, + "loss": 3.8429, + "step": 5995 + }, + { + "epoch": 0.06103515625, + "grad_norm": 9.907837867736816, + "learning_rate": 4.955601759458158e-06, + "loss": 3.6905, + "step": 6000 + }, + { + "epoch": 0.061086018880208336, + "grad_norm": 14.26388931274414, + "learning_rate": 4.955526700320378e-06, + "loss": 3.8103, + "step": 6005 + }, + { + "epoch": 0.061136881510416664, + "grad_norm": 13.094128608703613, + "learning_rate": 4.955451578358529e-06, + "loss": 3.5114, + "step": 6010 + }, + { + "epoch": 0.061187744140625, + "grad_norm": 9.898237228393555, + "learning_rate": 4.955376393574533e-06, + "loss": 3.0525, + "step": 6015 + }, + { + "epoch": 0.061238606770833336, + "grad_norm": 20.232635498046875, + "learning_rate": 4.955301145970314e-06, + "loss": 3.3842, + "step": 6020 + }, + { + "epoch": 0.061289469401041664, + "grad_norm": 10.714902877807617, + "learning_rate": 4.955225835547798e-06, + "loss": 3.2338, + "step": 6025 + }, + { + "epoch": 0.06134033203125, + "grad_norm": 16.868967056274414, + "learning_rate": 4.95515046230891e-06, + "loss": 3.3336, + "step": 6030 + }, + { + "epoch": 0.061391194661458336, + "grad_norm": 10.545510292053223, + "learning_rate": 4.9550750262555795e-06, + "loss": 3.4626, + "step": 6035 + }, + { + "epoch": 0.061442057291666664, + "grad_norm": 16.030223846435547, + "learning_rate": 4.9549995273897365e-06, + "loss": 3.746, + "step": 6040 + }, + { + "epoch": 0.061492919921875, + "grad_norm": 16.585622787475586, + "learning_rate": 4.954923965713312e-06, + "loss": 3.5239, + "step": 6045 + }, + { + "epoch": 0.061543782552083336, + "grad_norm": 10.373279571533203, + "learning_rate": 4.95484834122824e-06, + "loss": 3.4698, + "step": 6050 + }, + { + "epoch": 0.061594645182291664, + "grad_norm": 11.033610343933105, + "learning_rate": 4.954772653936455e-06, + "loss": 3.521, + "step": 6055 + }, + { + "epoch": 0.0616455078125, + "grad_norm": 12.441916465759277, + "learning_rate": 4.954696903839894e-06, + "loss": 3.5568, + "step": 6060 + }, + { + "epoch": 0.061696370442708336, + "grad_norm": 12.675573348999023, + "learning_rate": 4.954621090940495e-06, + "loss": 3.199, + "step": 6065 + }, + { + "epoch": 0.061747233072916664, + "grad_norm": 11.934941291809082, + "learning_rate": 4.9545452152401965e-06, + "loss": 3.5181, + "step": 6070 + }, + { + "epoch": 0.061798095703125, + "grad_norm": 12.990299224853516, + "learning_rate": 4.95446927674094e-06, + "loss": 3.1343, + "step": 6075 + }, + { + "epoch": 0.061848958333333336, + "grad_norm": 15.312989234924316, + "learning_rate": 4.954393275444669e-06, + "loss": 2.912, + "step": 6080 + }, + { + "epoch": 0.061899820963541664, + "grad_norm": 16.04402732849121, + "learning_rate": 4.954317211353328e-06, + "loss": 3.5654, + "step": 6085 + }, + { + "epoch": 0.06195068359375, + "grad_norm": 16.57796287536621, + "learning_rate": 4.954241084468863e-06, + "loss": 3.4681, + "step": 6090 + }, + { + "epoch": 0.062001546223958336, + "grad_norm": 12.278860092163086, + "learning_rate": 4.954164894793222e-06, + "loss": 3.4626, + "step": 6095 + }, + { + "epoch": 0.062052408854166664, + "grad_norm": 85.06364440917969, + "learning_rate": 4.954088642328353e-06, + "loss": 3.5676, + "step": 6100 + }, + { + "epoch": 0.062103271484375, + "grad_norm": 9.45626163482666, + "learning_rate": 4.954012327076207e-06, + "loss": 3.7756, + "step": 6105 + }, + { + "epoch": 0.062154134114583336, + "grad_norm": 13.716742515563965, + "learning_rate": 4.95393594903874e-06, + "loss": 3.5243, + "step": 6110 + }, + { + "epoch": 0.062204996744791664, + "grad_norm": 19.727203369140625, + "learning_rate": 4.953859508217901e-06, + "loss": 3.479, + "step": 6115 + }, + { + "epoch": 0.062255859375, + "grad_norm": 14.221769332885742, + "learning_rate": 4.953783004615649e-06, + "loss": 3.3731, + "step": 6120 + }, + { + "epoch": 0.062306722005208336, + "grad_norm": 8.173396110534668, + "learning_rate": 4.953706438233941e-06, + "loss": 3.2895, + "step": 6125 + }, + { + "epoch": 0.062357584635416664, + "grad_norm": 11.394742965698242, + "learning_rate": 4.953629809074734e-06, + "loss": 3.3431, + "step": 6130 + }, + { + "epoch": 0.062408447265625, + "grad_norm": 7.780871391296387, + "learning_rate": 4.953553117139991e-06, + "loss": 3.6523, + "step": 6135 + }, + { + "epoch": 0.062459309895833336, + "grad_norm": 12.438297271728516, + "learning_rate": 4.953476362431672e-06, + "loss": 3.4039, + "step": 6140 + }, + { + "epoch": 0.06251017252604167, + "grad_norm": 12.829780578613281, + "learning_rate": 4.953399544951742e-06, + "loss": 3.5707, + "step": 6145 + }, + { + "epoch": 0.06256103515625, + "grad_norm": 11.045709609985352, + "learning_rate": 4.953322664702167e-06, + "loss": 3.1758, + "step": 6150 + }, + { + "epoch": 0.06261189778645833, + "grad_norm": 13.500697135925293, + "learning_rate": 4.953245721684913e-06, + "loss": 3.5634, + "step": 6155 + }, + { + "epoch": 0.06266276041666667, + "grad_norm": 13.87531852722168, + "learning_rate": 4.953168715901949e-06, + "loss": 3.0295, + "step": 6160 + }, + { + "epoch": 0.062713623046875, + "grad_norm": 10.557450294494629, + "learning_rate": 4.953091647355244e-06, + "loss": 3.6556, + "step": 6165 + }, + { + "epoch": 0.06276448567708333, + "grad_norm": 14.243014335632324, + "learning_rate": 4.953014516046771e-06, + "loss": 3.493, + "step": 6170 + }, + { + "epoch": 0.06281534830729167, + "grad_norm": 14.641172409057617, + "learning_rate": 4.952937321978502e-06, + "loss": 3.5743, + "step": 6175 + }, + { + "epoch": 0.0628662109375, + "grad_norm": 14.950688362121582, + "learning_rate": 4.952860065152415e-06, + "loss": 3.3458, + "step": 6180 + }, + { + "epoch": 0.06291707356770833, + "grad_norm": 10.996521949768066, + "learning_rate": 4.952782745570483e-06, + "loss": 3.3334, + "step": 6185 + }, + { + "epoch": 0.06296793619791667, + "grad_norm": 10.796847343444824, + "learning_rate": 4.952705363234687e-06, + "loss": 3.6746, + "step": 6190 + }, + { + "epoch": 0.063018798828125, + "grad_norm": 16.92947006225586, + "learning_rate": 4.952627918147005e-06, + "loss": 3.3581, + "step": 6195 + }, + { + "epoch": 0.06306966145833333, + "grad_norm": 9.009227752685547, + "learning_rate": 4.952550410309419e-06, + "loss": 3.3688, + "step": 6200 + }, + { + "epoch": 0.06312052408854167, + "grad_norm": 9.859138488769531, + "learning_rate": 4.952472839723912e-06, + "loss": 3.2627, + "step": 6205 + }, + { + "epoch": 0.06317138671875, + "grad_norm": 16.701183319091797, + "learning_rate": 4.952395206392469e-06, + "loss": 3.6775, + "step": 6210 + }, + { + "epoch": 0.06322224934895833, + "grad_norm": 17.42451286315918, + "learning_rate": 4.952317510317076e-06, + "loss": 3.5911, + "step": 6215 + }, + { + "epoch": 0.06327311197916667, + "grad_norm": 12.738759994506836, + "learning_rate": 4.952239751499721e-06, + "loss": 3.2518, + "step": 6220 + }, + { + "epoch": 0.063323974609375, + "grad_norm": 12.116572380065918, + "learning_rate": 4.952161929942393e-06, + "loss": 3.6424, + "step": 6225 + }, + { + "epoch": 0.06337483723958333, + "grad_norm": 15.029314041137695, + "learning_rate": 4.952084045647083e-06, + "loss": 3.4677, + "step": 6230 + }, + { + "epoch": 0.06342569986979167, + "grad_norm": 16.844175338745117, + "learning_rate": 4.952006098615784e-06, + "loss": 3.0719, + "step": 6235 + }, + { + "epoch": 0.0634765625, + "grad_norm": 16.886533737182617, + "learning_rate": 4.95192808885049e-06, + "loss": 3.4529, + "step": 6240 + }, + { + "epoch": 0.06352742513020833, + "grad_norm": 15.081376075744629, + "learning_rate": 4.9518500163531966e-06, + "loss": 3.1723, + "step": 6245 + }, + { + "epoch": 0.06357828776041667, + "grad_norm": 9.575623512268066, + "learning_rate": 4.951771881125903e-06, + "loss": 3.5933, + "step": 6250 + }, + { + "epoch": 0.063629150390625, + "grad_norm": 9.829601287841797, + "learning_rate": 4.951693683170606e-06, + "loss": 3.7078, + "step": 6255 + }, + { + "epoch": 0.06368001302083333, + "grad_norm": 10.027034759521484, + "learning_rate": 4.951615422489308e-06, + "loss": 3.3214, + "step": 6260 + }, + { + "epoch": 0.06373087565104167, + "grad_norm": 14.713983535766602, + "learning_rate": 4.9515370990840095e-06, + "loss": 3.9297, + "step": 6265 + }, + { + "epoch": 0.06378173828125, + "grad_norm": 10.196232795715332, + "learning_rate": 4.951458712956716e-06, + "loss": 3.3136, + "step": 6270 + }, + { + "epoch": 0.06383260091145833, + "grad_norm": 8.880327224731445, + "learning_rate": 4.9513802641094325e-06, + "loss": 3.4526, + "step": 6275 + }, + { + "epoch": 0.06388346354166667, + "grad_norm": 10.190267562866211, + "learning_rate": 4.951301752544165e-06, + "loss": 3.8351, + "step": 6280 + }, + { + "epoch": 0.063934326171875, + "grad_norm": 13.929616928100586, + "learning_rate": 4.951223178262924e-06, + "loss": 3.3708, + "step": 6285 + }, + { + "epoch": 0.06398518880208333, + "grad_norm": 12.750432014465332, + "learning_rate": 4.951144541267719e-06, + "loss": 3.617, + "step": 6290 + }, + { + "epoch": 0.06403605143229167, + "grad_norm": 17.89909553527832, + "learning_rate": 4.951065841560561e-06, + "loss": 3.6072, + "step": 6295 + }, + { + "epoch": 0.0640869140625, + "grad_norm": 14.314258575439453, + "learning_rate": 4.950987079143465e-06, + "loss": 3.4483, + "step": 6300 + }, + { + "epoch": 0.06413777669270833, + "grad_norm": 12.541434288024902, + "learning_rate": 4.950908254018446e-06, + "loss": 3.9403, + "step": 6305 + }, + { + "epoch": 0.06418863932291667, + "grad_norm": 10.588662147521973, + "learning_rate": 4.9508293661875205e-06, + "loss": 3.3919, + "step": 6310 + }, + { + "epoch": 0.064239501953125, + "grad_norm": 14.57321834564209, + "learning_rate": 4.950750415652706e-06, + "loss": 3.6334, + "step": 6315 + }, + { + "epoch": 0.06429036458333333, + "grad_norm": 12.940896034240723, + "learning_rate": 4.950671402416023e-06, + "loss": 3.5378, + "step": 6320 + }, + { + "epoch": 0.06434122721354167, + "grad_norm": 14.000555992126465, + "learning_rate": 4.9505923264794935e-06, + "loss": 3.7651, + "step": 6325 + }, + { + "epoch": 0.06439208984375, + "grad_norm": 12.165482521057129, + "learning_rate": 4.95051318784514e-06, + "loss": 3.3013, + "step": 6330 + }, + { + "epoch": 0.06444295247395833, + "grad_norm": 10.64943790435791, + "learning_rate": 4.950433986514986e-06, + "loss": 3.6277, + "step": 6335 + }, + { + "epoch": 0.06449381510416667, + "grad_norm": 9.537559509277344, + "learning_rate": 4.9503547224910605e-06, + "loss": 3.3196, + "step": 6340 + }, + { + "epoch": 0.064544677734375, + "grad_norm": 12.809176445007324, + "learning_rate": 4.9502753957753905e-06, + "loss": 3.5917, + "step": 6345 + }, + { + "epoch": 0.06459554036458333, + "grad_norm": 9.395776748657227, + "learning_rate": 4.950196006370005e-06, + "loss": 3.4648, + "step": 6350 + }, + { + "epoch": 0.06464640299479167, + "grad_norm": 14.865406036376953, + "learning_rate": 4.950116554276936e-06, + "loss": 3.5405, + "step": 6355 + }, + { + "epoch": 0.064697265625, + "grad_norm": 13.822293281555176, + "learning_rate": 4.950037039498215e-06, + "loss": 3.2149, + "step": 6360 + }, + { + "epoch": 0.06474812825520833, + "grad_norm": 10.368663787841797, + "learning_rate": 4.949957462035877e-06, + "loss": 3.2916, + "step": 6365 + }, + { + "epoch": 0.06479899088541667, + "grad_norm": 15.785028457641602, + "learning_rate": 4.949877821891958e-06, + "loss": 3.2872, + "step": 6370 + }, + { + "epoch": 0.064849853515625, + "grad_norm": 9.993640899658203, + "learning_rate": 4.949798119068495e-06, + "loss": 3.3167, + "step": 6375 + }, + { + "epoch": 0.06490071614583333, + "grad_norm": 14.402359962463379, + "learning_rate": 4.949718353567529e-06, + "loss": 3.7203, + "step": 6380 + }, + { + "epoch": 0.06495157877604167, + "grad_norm": 13.888138771057129, + "learning_rate": 4.9496385253910996e-06, + "loss": 3.4504, + "step": 6385 + }, + { + "epoch": 0.06500244140625, + "grad_norm": 19.04587745666504, + "learning_rate": 4.949558634541249e-06, + "loss": 3.4553, + "step": 6390 + }, + { + "epoch": 0.06505330403645833, + "grad_norm": 17.151155471801758, + "learning_rate": 4.94947868102002e-06, + "loss": 3.7058, + "step": 6395 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 13.129286766052246, + "learning_rate": 4.949398664829461e-06, + "loss": 3.5657, + "step": 6400 + }, + { + "epoch": 0.065155029296875, + "grad_norm": 12.586851119995117, + "learning_rate": 4.949318585971617e-06, + "loss": 3.2961, + "step": 6405 + }, + { + "epoch": 0.06520589192708333, + "grad_norm": 15.264113426208496, + "learning_rate": 4.949238444448539e-06, + "loss": 3.7745, + "step": 6410 + }, + { + "epoch": 0.06525675455729167, + "grad_norm": 12.554951667785645, + "learning_rate": 4.949158240262274e-06, + "loss": 3.455, + "step": 6415 + }, + { + "epoch": 0.0653076171875, + "grad_norm": 11.962057113647461, + "learning_rate": 4.949077973414877e-06, + "loss": 3.2963, + "step": 6420 + }, + { + "epoch": 0.06535847981770833, + "grad_norm": 12.596719741821289, + "learning_rate": 4.9489976439084e-06, + "loss": 3.7278, + "step": 6425 + }, + { + "epoch": 0.06540934244791667, + "grad_norm": 10.47852897644043, + "learning_rate": 4.948917251744899e-06, + "loss": 3.6283, + "step": 6430 + }, + { + "epoch": 0.065460205078125, + "grad_norm": 9.309669494628906, + "learning_rate": 4.9488367969264304e-06, + "loss": 3.3136, + "step": 6435 + }, + { + "epoch": 0.06551106770833333, + "grad_norm": 13.344404220581055, + "learning_rate": 4.9487562794550535e-06, + "loss": 3.3655, + "step": 6440 + }, + { + "epoch": 0.06556193033854167, + "grad_norm": 13.794371604919434, + "learning_rate": 4.948675699332827e-06, + "loss": 3.2034, + "step": 6445 + }, + { + "epoch": 0.06561279296875, + "grad_norm": 14.009343147277832, + "learning_rate": 4.9485950565618134e-06, + "loss": 3.4136, + "step": 6450 + }, + { + "epoch": 0.06566365559895833, + "grad_norm": 10.496328353881836, + "learning_rate": 4.9485143511440754e-06, + "loss": 3.5469, + "step": 6455 + }, + { + "epoch": 0.06571451822916667, + "grad_norm": 18.860055923461914, + "learning_rate": 4.948433583081678e-06, + "loss": 4.3312, + "step": 6460 + }, + { + "epoch": 0.065765380859375, + "grad_norm": 12.226606369018555, + "learning_rate": 4.948352752376689e-06, + "loss": 3.187, + "step": 6465 + }, + { + "epoch": 0.06581624348958333, + "grad_norm": 8.17588996887207, + "learning_rate": 4.948271859031173e-06, + "loss": 3.4183, + "step": 6470 + }, + { + "epoch": 0.06586710611979167, + "grad_norm": 18.25055694580078, + "learning_rate": 4.948190903047203e-06, + "loss": 3.9294, + "step": 6475 + }, + { + "epoch": 0.06591796875, + "grad_norm": 15.113214492797852, + "learning_rate": 4.948109884426849e-06, + "loss": 3.2971, + "step": 6480 + }, + { + "epoch": 0.06596883138020833, + "grad_norm": 10.636944770812988, + "learning_rate": 4.9480288031721835e-06, + "loss": 3.1685, + "step": 6485 + }, + { + "epoch": 0.06601969401041667, + "grad_norm": 12.23661994934082, + "learning_rate": 4.947947659285281e-06, + "loss": 3.2994, + "step": 6490 + }, + { + "epoch": 0.066070556640625, + "grad_norm": 14.394771575927734, + "learning_rate": 4.9478664527682194e-06, + "loss": 3.5549, + "step": 6495 + }, + { + "epoch": 0.06612141927083333, + "grad_norm": 12.141425132751465, + "learning_rate": 4.947785183623074e-06, + "loss": 3.5396, + "step": 6500 + }, + { + "epoch": 0.06617228190104167, + "grad_norm": 11.212850570678711, + "learning_rate": 4.9477038518519235e-06, + "loss": 3.3503, + "step": 6505 + }, + { + "epoch": 0.06622314453125, + "grad_norm": 13.835770606994629, + "learning_rate": 4.947622457456852e-06, + "loss": 3.1723, + "step": 6510 + }, + { + "epoch": 0.06627400716145833, + "grad_norm": 13.431824684143066, + "learning_rate": 4.94754100043994e-06, + "loss": 3.339, + "step": 6515 + }, + { + "epoch": 0.06632486979166667, + "grad_norm": 12.01131820678711, + "learning_rate": 4.94745948080327e-06, + "loss": 3.2717, + "step": 6520 + }, + { + "epoch": 0.066375732421875, + "grad_norm": 14.991222381591797, + "learning_rate": 4.947377898548931e-06, + "loss": 3.52, + "step": 6525 + }, + { + "epoch": 0.06642659505208333, + "grad_norm": 14.108927726745605, + "learning_rate": 4.947296253679008e-06, + "loss": 3.1859, + "step": 6530 + }, + { + "epoch": 0.06647745768229167, + "grad_norm": 9.38899040222168, + "learning_rate": 4.94721454619559e-06, + "loss": 3.6626, + "step": 6535 + }, + { + "epoch": 0.0665283203125, + "grad_norm": 12.356621742248535, + "learning_rate": 4.947132776100768e-06, + "loss": 3.8692, + "step": 6540 + }, + { + "epoch": 0.06657918294270833, + "grad_norm": 16.474075317382812, + "learning_rate": 4.947050943396634e-06, + "loss": 3.0192, + "step": 6545 + }, + { + "epoch": 0.06663004557291667, + "grad_norm": 12.742081642150879, + "learning_rate": 4.9469690480852824e-06, + "loss": 3.3703, + "step": 6550 + }, + { + "epoch": 0.066680908203125, + "grad_norm": 13.639100074768066, + "learning_rate": 4.946887090168807e-06, + "loss": 3.8247, + "step": 6555 + }, + { + "epoch": 0.06673177083333333, + "grad_norm": 9.607970237731934, + "learning_rate": 4.946805069649305e-06, + "loss": 3.4566, + "step": 6560 + }, + { + "epoch": 0.06678263346354167, + "grad_norm": 13.177118301391602, + "learning_rate": 4.946722986528876e-06, + "loss": 3.5717, + "step": 6565 + }, + { + "epoch": 0.06683349609375, + "grad_norm": 12.43066120147705, + "learning_rate": 4.946640840809619e-06, + "loss": 3.5391, + "step": 6570 + }, + { + "epoch": 0.06688435872395833, + "grad_norm": 13.892637252807617, + "learning_rate": 4.946558632493636e-06, + "loss": 3.802, + "step": 6575 + }, + { + "epoch": 0.06693522135416667, + "grad_norm": 13.632702827453613, + "learning_rate": 4.94647636158303e-06, + "loss": 3.3285, + "step": 6580 + }, + { + "epoch": 0.066986083984375, + "grad_norm": 12.024981498718262, + "learning_rate": 4.946394028079907e-06, + "loss": 3.2208, + "step": 6585 + }, + { + "epoch": 0.06703694661458333, + "grad_norm": 15.032366752624512, + "learning_rate": 4.946311631986372e-06, + "loss": 3.374, + "step": 6590 + }, + { + "epoch": 0.06708780924479167, + "grad_norm": 10.841226577758789, + "learning_rate": 4.946229173304535e-06, + "loss": 3.2525, + "step": 6595 + }, + { + "epoch": 0.067138671875, + "grad_norm": 10.338508605957031, + "learning_rate": 4.946146652036502e-06, + "loss": 3.5958, + "step": 6600 + }, + { + "epoch": 0.06718953450520833, + "grad_norm": 10.029006004333496, + "learning_rate": 4.9460640681843885e-06, + "loss": 3.6985, + "step": 6605 + }, + { + "epoch": 0.06724039713541667, + "grad_norm": 9.308262825012207, + "learning_rate": 4.945981421750305e-06, + "loss": 3.3633, + "step": 6610 + }, + { + "epoch": 0.067291259765625, + "grad_norm": 8.373737335205078, + "learning_rate": 4.945898712736366e-06, + "loss": 3.5412, + "step": 6615 + }, + { + "epoch": 0.06734212239583333, + "grad_norm": 11.022754669189453, + "learning_rate": 4.94581594114469e-06, + "loss": 4.3451, + "step": 6620 + }, + { + "epoch": 0.06739298502604167, + "grad_norm": 7.635653972625732, + "learning_rate": 4.945733106977391e-06, + "loss": 3.1132, + "step": 6625 + }, + { + "epoch": 0.06744384765625, + "grad_norm": 11.958892822265625, + "learning_rate": 4.945650210236591e-06, + "loss": 3.736, + "step": 6630 + }, + { + "epoch": 0.06749471028645833, + "grad_norm": 17.302410125732422, + "learning_rate": 4.94556725092441e-06, + "loss": 3.2335, + "step": 6635 + }, + { + "epoch": 0.06754557291666667, + "grad_norm": 10.765392303466797, + "learning_rate": 4.945484229042971e-06, + "loss": 3.0814, + "step": 6640 + }, + { + "epoch": 0.067596435546875, + "grad_norm": 10.385746002197266, + "learning_rate": 4.945401144594397e-06, + "loss": 3.4643, + "step": 6645 + }, + { + "epoch": 0.06764729817708333, + "grad_norm": 14.88101577758789, + "learning_rate": 4.945317997580814e-06, + "loss": 3.7192, + "step": 6650 + }, + { + "epoch": 0.06769816080729167, + "grad_norm": 10.534723281860352, + "learning_rate": 4.9452347880043505e-06, + "loss": 3.4338, + "step": 6655 + }, + { + "epoch": 0.0677490234375, + "grad_norm": 10.487231254577637, + "learning_rate": 4.945151515867134e-06, + "loss": 3.6227, + "step": 6660 + }, + { + "epoch": 0.06779988606770833, + "grad_norm": 10.452747344970703, + "learning_rate": 4.9450681811712954e-06, + "loss": 3.2158, + "step": 6665 + }, + { + "epoch": 0.06785074869791667, + "grad_norm": 12.921323776245117, + "learning_rate": 4.944984783918968e-06, + "loss": 3.5547, + "step": 6670 + }, + { + "epoch": 0.067901611328125, + "grad_norm": 12.540404319763184, + "learning_rate": 4.944901324112283e-06, + "loss": 3.5022, + "step": 6675 + }, + { + "epoch": 0.06795247395833333, + "grad_norm": 11.059189796447754, + "learning_rate": 4.9448178017533775e-06, + "loss": 3.3226, + "step": 6680 + }, + { + "epoch": 0.06800333658854167, + "grad_norm": 19.822628021240234, + "learning_rate": 4.944734216844388e-06, + "loss": 3.7347, + "step": 6685 + }, + { + "epoch": 0.06805419921875, + "grad_norm": 12.81627082824707, + "learning_rate": 4.944650569387453e-06, + "loss": 3.9766, + "step": 6690 + }, + { + "epoch": 0.06810506184895833, + "grad_norm": 14.65715217590332, + "learning_rate": 4.944566859384714e-06, + "loss": 3.2335, + "step": 6695 + }, + { + "epoch": 0.06815592447916667, + "grad_norm": 9.100480079650879, + "learning_rate": 4.94448308683831e-06, + "loss": 3.3709, + "step": 6700 + }, + { + "epoch": 0.068206787109375, + "grad_norm": 12.44413948059082, + "learning_rate": 4.944399251750386e-06, + "loss": 3.705, + "step": 6705 + }, + { + "epoch": 0.06825764973958333, + "grad_norm": 612.3552856445312, + "learning_rate": 4.944315354123086e-06, + "loss": 3.851, + "step": 6710 + }, + { + "epoch": 0.06830851236979167, + "grad_norm": 11.150618553161621, + "learning_rate": 4.944231393958558e-06, + "loss": 3.3883, + "step": 6715 + }, + { + "epoch": 0.068359375, + "grad_norm": 15.825736999511719, + "learning_rate": 4.944147371258948e-06, + "loss": 3.1653, + "step": 6720 + }, + { + "epoch": 0.06841023763020833, + "grad_norm": 13.351125717163086, + "learning_rate": 4.944063286026408e-06, + "loss": 3.6262, + "step": 6725 + }, + { + "epoch": 0.06846110026041667, + "grad_norm": 10.806169509887695, + "learning_rate": 4.9439791382630875e-06, + "loss": 3.4346, + "step": 6730 + }, + { + "epoch": 0.068511962890625, + "grad_norm": 14.80085277557373, + "learning_rate": 4.94389492797114e-06, + "loss": 4.8293, + "step": 6735 + }, + { + "epoch": 0.06856282552083333, + "grad_norm": 15.626360893249512, + "learning_rate": 4.94381065515272e-06, + "loss": 3.5732, + "step": 6740 + }, + { + "epoch": 0.06861368815104167, + "grad_norm": 12.220220565795898, + "learning_rate": 4.943726319809984e-06, + "loss": 3.1158, + "step": 6745 + }, + { + "epoch": 0.06866455078125, + "grad_norm": 11.84122371673584, + "learning_rate": 4.943641921945089e-06, + "loss": 3.2894, + "step": 6750 + }, + { + "epoch": 0.06871541341145833, + "grad_norm": 11.213668823242188, + "learning_rate": 4.943557461560195e-06, + "loss": 3.5569, + "step": 6755 + }, + { + "epoch": 0.06876627604166667, + "grad_norm": 10.223577499389648, + "learning_rate": 4.943472938657462e-06, + "loss": 3.4687, + "step": 6760 + }, + { + "epoch": 0.068817138671875, + "grad_norm": 11.476598739624023, + "learning_rate": 4.943388353239053e-06, + "loss": 3.3717, + "step": 6765 + }, + { + "epoch": 0.06886800130208333, + "grad_norm": 12.814501762390137, + "learning_rate": 4.943303705307133e-06, + "loss": 3.4168, + "step": 6770 + }, + { + "epoch": 0.06891886393229167, + "grad_norm": 15.519283294677734, + "learning_rate": 4.943218994863866e-06, + "loss": 3.4865, + "step": 6775 + }, + { + "epoch": 0.0689697265625, + "grad_norm": 10.789101600646973, + "learning_rate": 4.943134221911421e-06, + "loss": 3.3488, + "step": 6780 + }, + { + "epoch": 0.06902058919270833, + "grad_norm": 16.44157600402832, + "learning_rate": 4.943049386451964e-06, + "loss": 3.7548, + "step": 6785 + }, + { + "epoch": 0.06907145182291667, + "grad_norm": 12.07598876953125, + "learning_rate": 4.942964488487669e-06, + "loss": 3.6223, + "step": 6790 + }, + { + "epoch": 0.069122314453125, + "grad_norm": 13.612014770507812, + "learning_rate": 4.942879528020707e-06, + "loss": 3.9421, + "step": 6795 + }, + { + "epoch": 0.06917317708333333, + "grad_norm": 10.37803840637207, + "learning_rate": 4.9427945050532515e-06, + "loss": 3.6429, + "step": 6800 + }, + { + "epoch": 0.06922403971354167, + "grad_norm": 10.748969078063965, + "learning_rate": 4.942709419587476e-06, + "loss": 3.4308, + "step": 6805 + }, + { + "epoch": 0.06927490234375, + "grad_norm": 8.533020973205566, + "learning_rate": 4.9426242716255605e-06, + "loss": 3.2213, + "step": 6810 + }, + { + "epoch": 0.06932576497395833, + "grad_norm": 14.239413261413574, + "learning_rate": 4.942539061169681e-06, + "loss": 3.5663, + "step": 6815 + }, + { + "epoch": 0.06937662760416667, + "grad_norm": 12.664621353149414, + "learning_rate": 4.942453788222019e-06, + "loss": 3.5002, + "step": 6820 + }, + { + "epoch": 0.069427490234375, + "grad_norm": 12.779163360595703, + "learning_rate": 4.942368452784756e-06, + "loss": 3.2068, + "step": 6825 + }, + { + "epoch": 0.06947835286458333, + "grad_norm": 16.653684616088867, + "learning_rate": 4.9422830548600745e-06, + "loss": 3.3543, + "step": 6830 + }, + { + "epoch": 0.06952921549479167, + "grad_norm": 16.349224090576172, + "learning_rate": 4.94219759445016e-06, + "loss": 3.5099, + "step": 6835 + }, + { + "epoch": 0.069580078125, + "grad_norm": 8.445566177368164, + "learning_rate": 4.942112071557199e-06, + "loss": 3.793, + "step": 6840 + }, + { + "epoch": 0.06963094075520833, + "grad_norm": 14.703461647033691, + "learning_rate": 4.94202648618338e-06, + "loss": 3.5832, + "step": 6845 + }, + { + "epoch": 0.06968180338541667, + "grad_norm": 14.559622764587402, + "learning_rate": 4.941940838330891e-06, + "loss": 3.6992, + "step": 6850 + }, + { + "epoch": 0.069732666015625, + "grad_norm": 11.245277404785156, + "learning_rate": 4.941855128001925e-06, + "loss": 2.9931, + "step": 6855 + }, + { + "epoch": 0.06978352864583333, + "grad_norm": 17.25091552734375, + "learning_rate": 4.941769355198675e-06, + "loss": 3.243, + "step": 6860 + }, + { + "epoch": 0.06983439127604167, + "grad_norm": 11.740355491638184, + "learning_rate": 4.941683519923335e-06, + "loss": 3.3695, + "step": 6865 + }, + { + "epoch": 0.06988525390625, + "grad_norm": 12.884564399719238, + "learning_rate": 4.9415976221781e-06, + "loss": 3.3783, + "step": 6870 + }, + { + "epoch": 0.06993611653645833, + "grad_norm": 14.360733032226562, + "learning_rate": 4.9415116619651685e-06, + "loss": 3.332, + "step": 6875 + }, + { + "epoch": 0.06998697916666667, + "grad_norm": 15.234169960021973, + "learning_rate": 4.94142563928674e-06, + "loss": 3.0307, + "step": 6880 + }, + { + "epoch": 0.070037841796875, + "grad_norm": 12.016648292541504, + "learning_rate": 4.941339554145015e-06, + "loss": 3.607, + "step": 6885 + }, + { + "epoch": 0.07008870442708333, + "grad_norm": 10.132488250732422, + "learning_rate": 4.941253406542197e-06, + "loss": 3.7322, + "step": 6890 + }, + { + "epoch": 0.07013956705729167, + "grad_norm": 11.677488327026367, + "learning_rate": 4.941167196480489e-06, + "loss": 3.7035, + "step": 6895 + }, + { + "epoch": 0.0701904296875, + "grad_norm": 8.434168815612793, + "learning_rate": 4.941080923962096e-06, + "loss": 3.3046, + "step": 6900 + }, + { + "epoch": 0.07024129231770833, + "grad_norm": 18.53049659729004, + "learning_rate": 4.940994588989227e-06, + "loss": 3.9959, + "step": 6905 + }, + { + "epoch": 0.07029215494791667, + "grad_norm": 14.731730461120605, + "learning_rate": 4.94090819156409e-06, + "loss": 3.5238, + "step": 6910 + }, + { + "epoch": 0.070343017578125, + "grad_norm": 9.122929573059082, + "learning_rate": 4.940821731688895e-06, + "loss": 3.525, + "step": 6915 + }, + { + "epoch": 0.07039388020833333, + "grad_norm": 11.580704689025879, + "learning_rate": 4.940735209365855e-06, + "loss": 3.9973, + "step": 6920 + }, + { + "epoch": 0.07044474283854167, + "grad_norm": 13.921919822692871, + "learning_rate": 4.940648624597183e-06, + "loss": 3.3295, + "step": 6925 + }, + { + "epoch": 0.07049560546875, + "grad_norm": 11.508909225463867, + "learning_rate": 4.9405619773850944e-06, + "loss": 3.3846, + "step": 6930 + }, + { + "epoch": 0.07054646809895833, + "grad_norm": 15.52819538116455, + "learning_rate": 4.940475267731806e-06, + "loss": 3.7001, + "step": 6935 + }, + { + "epoch": 0.07059733072916667, + "grad_norm": 12.931517601013184, + "learning_rate": 4.940388495639537e-06, + "loss": 3.4707, + "step": 6940 + }, + { + "epoch": 0.070648193359375, + "grad_norm": 16.401697158813477, + "learning_rate": 4.9403016611105055e-06, + "loss": 3.5479, + "step": 6945 + }, + { + "epoch": 0.07069905598958333, + "grad_norm": 14.115461349487305, + "learning_rate": 4.940214764146935e-06, + "loss": 3.4651, + "step": 6950 + }, + { + "epoch": 0.07074991861979167, + "grad_norm": 13.501302719116211, + "learning_rate": 4.940127804751048e-06, + "loss": 3.2613, + "step": 6955 + }, + { + "epoch": 0.07080078125, + "grad_norm": 9.428580284118652, + "learning_rate": 4.94004078292507e-06, + "loss": 3.528, + "step": 6960 + }, + { + "epoch": 0.07085164388020833, + "grad_norm": 15.635141372680664, + "learning_rate": 4.939953698671227e-06, + "loss": 3.413, + "step": 6965 + }, + { + "epoch": 0.07090250651041667, + "grad_norm": 18.93190574645996, + "learning_rate": 4.939866551991746e-06, + "loss": 3.6146, + "step": 6970 + }, + { + "epoch": 0.070953369140625, + "grad_norm": 13.07038688659668, + "learning_rate": 4.939779342888858e-06, + "loss": 3.6089, + "step": 6975 + }, + { + "epoch": 0.07100423177083333, + "grad_norm": 11.98829174041748, + "learning_rate": 4.939692071364794e-06, + "loss": 3.2366, + "step": 6980 + }, + { + "epoch": 0.07105509440104167, + "grad_norm": 13.00400161743164, + "learning_rate": 4.939604737421787e-06, + "loss": 3.6997, + "step": 6985 + }, + { + "epoch": 0.07110595703125, + "grad_norm": 10.635180473327637, + "learning_rate": 4.9395173410620714e-06, + "loss": 3.5263, + "step": 6990 + }, + { + "epoch": 0.07115681966145833, + "grad_norm": 8.874784469604492, + "learning_rate": 4.939429882287881e-06, + "loss": 3.7435, + "step": 6995 + }, + { + "epoch": 0.07120768229166667, + "grad_norm": 17.223342895507812, + "learning_rate": 4.939342361101457e-06, + "loss": 3.248, + "step": 7000 + }, + { + "epoch": 0.071258544921875, + "grad_norm": 15.709144592285156, + "learning_rate": 4.939254777505037e-06, + "loss": 3.3038, + "step": 7005 + }, + { + "epoch": 0.07130940755208333, + "grad_norm": 10.639139175415039, + "learning_rate": 4.93916713150086e-06, + "loss": 3.433, + "step": 7010 + }, + { + "epoch": 0.07136027018229167, + "grad_norm": 13.915593147277832, + "learning_rate": 4.9390794230911715e-06, + "loss": 3.4058, + "step": 7015 + }, + { + "epoch": 0.0714111328125, + "grad_norm": 8.267925262451172, + "learning_rate": 4.938991652278213e-06, + "loss": 3.5928, + "step": 7020 + }, + { + "epoch": 0.07146199544270833, + "grad_norm": 16.01654624938965, + "learning_rate": 4.938903819064232e-06, + "loss": 3.1889, + "step": 7025 + }, + { + "epoch": 0.07151285807291667, + "grad_norm": 13.366415977478027, + "learning_rate": 4.938815923451476e-06, + "loss": 3.3296, + "step": 7030 + }, + { + "epoch": 0.071563720703125, + "grad_norm": 12.922379493713379, + "learning_rate": 4.9387279654421905e-06, + "loss": 3.4845, + "step": 7035 + }, + { + "epoch": 0.07161458333333333, + "grad_norm": 10.66179084777832, + "learning_rate": 4.938639945038629e-06, + "loss": 3.3688, + "step": 7040 + }, + { + "epoch": 0.07166544596354167, + "grad_norm": 9.114995002746582, + "learning_rate": 4.938551862243042e-06, + "loss": 3.2822, + "step": 7045 + }, + { + "epoch": 0.07171630859375, + "grad_norm": 10.946172714233398, + "learning_rate": 4.9384637170576844e-06, + "loss": 3.5603, + "step": 7050 + }, + { + "epoch": 0.07176717122395833, + "grad_norm": 11.345202445983887, + "learning_rate": 4.93837550948481e-06, + "loss": 3.4276, + "step": 7055 + }, + { + "epoch": 0.07181803385416667, + "grad_norm": 9.533265113830566, + "learning_rate": 4.938287239526676e-06, + "loss": 3.1513, + "step": 7060 + }, + { + "epoch": 0.071868896484375, + "grad_norm": 9.234315872192383, + "learning_rate": 4.938198907185542e-06, + "loss": 3.3517, + "step": 7065 + }, + { + "epoch": 0.07191975911458333, + "grad_norm": 10.09774398803711, + "learning_rate": 4.938110512463666e-06, + "loss": 3.5838, + "step": 7070 + }, + { + "epoch": 0.07197062174479167, + "grad_norm": 14.220784187316895, + "learning_rate": 4.938022055363311e-06, + "loss": 3.4819, + "step": 7075 + }, + { + "epoch": 0.072021484375, + "grad_norm": 10.781584739685059, + "learning_rate": 4.9379335358867384e-06, + "loss": 3.7672, + "step": 7080 + }, + { + "epoch": 0.07207234700520833, + "grad_norm": 9.52526569366455, + "learning_rate": 4.937844954036215e-06, + "loss": 3.3561, + "step": 7085 + }, + { + "epoch": 0.07212320963541667, + "grad_norm": 12.460453033447266, + "learning_rate": 4.9377563098140065e-06, + "loss": 3.4052, + "step": 7090 + }, + { + "epoch": 0.072174072265625, + "grad_norm": 14.343060493469238, + "learning_rate": 4.9376676032223805e-06, + "loss": 3.214, + "step": 7095 + }, + { + "epoch": 0.07222493489583333, + "grad_norm": 12.104986190795898, + "learning_rate": 4.937578834263607e-06, + "loss": 3.472, + "step": 7100 + }, + { + "epoch": 0.07227579752604167, + "grad_norm": 12.275232315063477, + "learning_rate": 4.9374900029399555e-06, + "loss": 3.7637, + "step": 7105 + }, + { + "epoch": 0.07232666015625, + "grad_norm": 12.134095191955566, + "learning_rate": 4.937401109253701e-06, + "loss": 3.6845, + "step": 7110 + }, + { + "epoch": 0.07237752278645833, + "grad_norm": 15.203348159790039, + "learning_rate": 4.937312153207117e-06, + "loss": 3.5803, + "step": 7115 + }, + { + "epoch": 0.07242838541666667, + "grad_norm": 17.58793830871582, + "learning_rate": 4.937223134802478e-06, + "loss": 3.1135, + "step": 7120 + }, + { + "epoch": 0.072479248046875, + "grad_norm": 9.813260078430176, + "learning_rate": 4.937134054042064e-06, + "loss": 3.487, + "step": 7125 + }, + { + "epoch": 0.07253011067708333, + "grad_norm": 12.799726486206055, + "learning_rate": 4.9370449109281524e-06, + "loss": 3.5097, + "step": 7130 + }, + { + "epoch": 0.07258097330729167, + "grad_norm": 10.807867050170898, + "learning_rate": 4.936955705463025e-06, + "loss": 3.5704, + "step": 7135 + }, + { + "epoch": 0.0726318359375, + "grad_norm": 13.212722778320312, + "learning_rate": 4.936866437648963e-06, + "loss": 3.2637, + "step": 7140 + }, + { + "epoch": 0.07268269856770833, + "grad_norm": 10.858726501464844, + "learning_rate": 4.936777107488251e-06, + "loss": 3.3638, + "step": 7145 + }, + { + "epoch": 0.07273356119791667, + "grad_norm": 14.568108558654785, + "learning_rate": 4.936687714983174e-06, + "loss": 3.2916, + "step": 7150 + }, + { + "epoch": 0.072784423828125, + "grad_norm": 12.920387268066406, + "learning_rate": 4.9365982601360194e-06, + "loss": 3.4835, + "step": 7155 + }, + { + "epoch": 0.07283528645833333, + "grad_norm": 13.888976097106934, + "learning_rate": 4.9365087429490765e-06, + "loss": 3.4057, + "step": 7160 + }, + { + "epoch": 0.07288614908854167, + "grad_norm": 8.182246208190918, + "learning_rate": 4.936419163424634e-06, + "loss": 3.4935, + "step": 7165 + }, + { + "epoch": 0.07293701171875, + "grad_norm": 9.68957805633545, + "learning_rate": 4.936329521564986e-06, + "loss": 3.4725, + "step": 7170 + }, + { + "epoch": 0.07298787434895833, + "grad_norm": 8.523253440856934, + "learning_rate": 4.936239817372423e-06, + "loss": 3.3035, + "step": 7175 + }, + { + "epoch": 0.07303873697916667, + "grad_norm": 15.208540916442871, + "learning_rate": 4.936150050849242e-06, + "loss": 3.3124, + "step": 7180 + }, + { + "epoch": 0.073089599609375, + "grad_norm": 9.686799049377441, + "learning_rate": 4.93606022199774e-06, + "loss": 3.1586, + "step": 7185 + }, + { + "epoch": 0.07314046223958333, + "grad_norm": 17.809568405151367, + "learning_rate": 4.935970330820215e-06, + "loss": 3.5217, + "step": 7190 + }, + { + "epoch": 0.07319132486979167, + "grad_norm": 9.103455543518066, + "learning_rate": 4.935880377318965e-06, + "loss": 3.3448, + "step": 7195 + }, + { + "epoch": 0.0732421875, + "grad_norm": 17.965932846069336, + "learning_rate": 4.935790361496295e-06, + "loss": 3.4703, + "step": 7200 + }, + { + "epoch": 0.07329305013020833, + "grad_norm": 17.913076400756836, + "learning_rate": 4.935700283354504e-06, + "loss": 3.7618, + "step": 7205 + }, + { + "epoch": 0.07334391276041667, + "grad_norm": 10.30252456665039, + "learning_rate": 4.9356101428959e-06, + "loss": 3.0109, + "step": 7210 + }, + { + "epoch": 0.073394775390625, + "grad_norm": 9.662969589233398, + "learning_rate": 4.935519940122787e-06, + "loss": 3.4018, + "step": 7215 + }, + { + "epoch": 0.07344563802083333, + "grad_norm": 15.70295524597168, + "learning_rate": 4.935429675037474e-06, + "loss": 3.083, + "step": 7220 + }, + { + "epoch": 0.07349650065104167, + "grad_norm": 8.53459644317627, + "learning_rate": 4.935339347642269e-06, + "loss": 3.0109, + "step": 7225 + }, + { + "epoch": 0.07354736328125, + "grad_norm": 15.32571792602539, + "learning_rate": 4.935248957939486e-06, + "loss": 3.5678, + "step": 7230 + }, + { + "epoch": 0.07359822591145833, + "grad_norm": 15.416007995605469, + "learning_rate": 4.935158505931434e-06, + "loss": 3.2472, + "step": 7235 + }, + { + "epoch": 0.07364908854166667, + "grad_norm": 12.033348083496094, + "learning_rate": 4.93506799162043e-06, + "loss": 3.3393, + "step": 7240 + }, + { + "epoch": 0.073699951171875, + "grad_norm": 12.217211723327637, + "learning_rate": 4.934977415008787e-06, + "loss": 3.7256, + "step": 7245 + }, + { + "epoch": 0.07375081380208333, + "grad_norm": 13.5538969039917, + "learning_rate": 4.934886776098825e-06, + "loss": 3.6471, + "step": 7250 + }, + { + "epoch": 0.07380167643229167, + "grad_norm": 11.107625961303711, + "learning_rate": 4.934796074892862e-06, + "loss": 3.4801, + "step": 7255 + }, + { + "epoch": 0.0738525390625, + "grad_norm": 8.796854972839355, + "learning_rate": 4.934705311393219e-06, + "loss": 2.999, + "step": 7260 + }, + { + "epoch": 0.07390340169270833, + "grad_norm": 15.173271179199219, + "learning_rate": 4.934614485602217e-06, + "loss": 3.2074, + "step": 7265 + }, + { + "epoch": 0.07395426432291667, + "grad_norm": 14.630196571350098, + "learning_rate": 4.9345235975221804e-06, + "loss": 3.2888, + "step": 7270 + }, + { + "epoch": 0.074005126953125, + "grad_norm": 10.86033821105957, + "learning_rate": 4.934432647155435e-06, + "loss": 3.2936, + "step": 7275 + }, + { + "epoch": 0.07405598958333333, + "grad_norm": 13.982941627502441, + "learning_rate": 4.934341634504307e-06, + "loss": 3.7184, + "step": 7280 + }, + { + "epoch": 0.07410685221354167, + "grad_norm": 9.288113594055176, + "learning_rate": 4.934250559571126e-06, + "loss": 3.1865, + "step": 7285 + }, + { + "epoch": 0.07415771484375, + "grad_norm": 11.010740280151367, + "learning_rate": 4.93415942235822e-06, + "loss": 3.442, + "step": 7290 + }, + { + "epoch": 0.07420857747395833, + "grad_norm": 15.357698440551758, + "learning_rate": 4.934068222867923e-06, + "loss": 3.6836, + "step": 7295 + }, + { + "epoch": 0.07425944010416667, + "grad_norm": 14.378852844238281, + "learning_rate": 4.9339769611025675e-06, + "loss": 3.4716, + "step": 7300 + }, + { + "epoch": 0.074310302734375, + "grad_norm": 10.144874572753906, + "learning_rate": 4.933885637064489e-06, + "loss": 3.7662, + "step": 7305 + }, + { + "epoch": 0.07436116536458333, + "grad_norm": 15.959395408630371, + "learning_rate": 4.933794250756022e-06, + "loss": 3.2475, + "step": 7310 + }, + { + "epoch": 0.07441202799479167, + "grad_norm": 15.771014213562012, + "learning_rate": 4.933702802179506e-06, + "loss": 3.5408, + "step": 7315 + }, + { + "epoch": 0.074462890625, + "grad_norm": 16.440584182739258, + "learning_rate": 4.933611291337282e-06, + "loss": 3.5703, + "step": 7320 + }, + { + "epoch": 0.07451375325520833, + "grad_norm": 13.77549934387207, + "learning_rate": 4.933519718231689e-06, + "loss": 3.7564, + "step": 7325 + }, + { + "epoch": 0.07456461588541667, + "grad_norm": 13.803625106811523, + "learning_rate": 4.9334280828650714e-06, + "loss": 3.9605, + "step": 7330 + }, + { + "epoch": 0.074615478515625, + "grad_norm": 16.266210556030273, + "learning_rate": 4.933336385239772e-06, + "loss": 3.6834, + "step": 7335 + }, + { + "epoch": 0.07466634114583333, + "grad_norm": 12.476020812988281, + "learning_rate": 4.933244625358139e-06, + "loss": 3.2462, + "step": 7340 + }, + { + "epoch": 0.07471720377604167, + "grad_norm": 14.898695945739746, + "learning_rate": 4.9331528032225186e-06, + "loss": 4.0511, + "step": 7345 + }, + { + "epoch": 0.07476806640625, + "grad_norm": 13.865912437438965, + "learning_rate": 4.933060918835261e-06, + "loss": 3.5242, + "step": 7350 + }, + { + "epoch": 0.07481892903645833, + "grad_norm": 15.214755058288574, + "learning_rate": 4.932968972198715e-06, + "loss": 3.6132, + "step": 7355 + }, + { + "epoch": 0.07486979166666667, + "grad_norm": 14.871356010437012, + "learning_rate": 4.932876963315236e-06, + "loss": 3.3851, + "step": 7360 + }, + { + "epoch": 0.074920654296875, + "grad_norm": 12.997641563415527, + "learning_rate": 4.932784892187176e-06, + "loss": 3.1629, + "step": 7365 + }, + { + "epoch": 0.07497151692708333, + "grad_norm": 10.09203052520752, + "learning_rate": 4.932692758816892e-06, + "loss": 3.3311, + "step": 7370 + }, + { + "epoch": 0.07502237955729167, + "grad_norm": 8.960355758666992, + "learning_rate": 4.932600563206739e-06, + "loss": 3.1715, + "step": 7375 + }, + { + "epoch": 0.0750732421875, + "grad_norm": 16.078197479248047, + "learning_rate": 4.932508305359078e-06, + "loss": 3.5975, + "step": 7380 + }, + { + "epoch": 0.07512410481770833, + "grad_norm": 15.18477725982666, + "learning_rate": 4.9324159852762685e-06, + "loss": 3.341, + "step": 7385 + }, + { + "epoch": 0.07517496744791667, + "grad_norm": 15.12987995147705, + "learning_rate": 4.932323602960673e-06, + "loss": 3.5689, + "step": 7390 + }, + { + "epoch": 0.075225830078125, + "grad_norm": 13.448582649230957, + "learning_rate": 4.932231158414653e-06, + "loss": 3.6534, + "step": 7395 + }, + { + "epoch": 0.07527669270833333, + "grad_norm": 9.764447212219238, + "learning_rate": 4.932138651640577e-06, + "loss": 3.5477, + "step": 7400 + }, + { + "epoch": 0.07532755533854167, + "grad_norm": 11.549277305603027, + "learning_rate": 4.932046082640809e-06, + "loss": 3.0506, + "step": 7405 + }, + { + "epoch": 0.07537841796875, + "grad_norm": 9.322796821594238, + "learning_rate": 4.9319534514177196e-06, + "loss": 3.2946, + "step": 7410 + }, + { + "epoch": 0.07542928059895833, + "grad_norm": 10.205277442932129, + "learning_rate": 4.931860757973676e-06, + "loss": 3.3928, + "step": 7415 + }, + { + "epoch": 0.07548014322916667, + "grad_norm": 8.97710132598877, + "learning_rate": 4.931768002311052e-06, + "loss": 3.5807, + "step": 7420 + }, + { + "epoch": 0.075531005859375, + "grad_norm": 15.988985061645508, + "learning_rate": 4.931675184432221e-06, + "loss": 3.5203, + "step": 7425 + }, + { + "epoch": 0.07558186848958333, + "grad_norm": 12.848153114318848, + "learning_rate": 4.931582304339556e-06, + "loss": 3.1023, + "step": 7430 + }, + { + "epoch": 0.07563273111979167, + "grad_norm": 14.775106430053711, + "learning_rate": 4.931489362035434e-06, + "loss": 3.6487, + "step": 7435 + }, + { + "epoch": 0.07568359375, + "grad_norm": 15.638808250427246, + "learning_rate": 4.931396357522233e-06, + "loss": 3.4362, + "step": 7440 + }, + { + "epoch": 0.07573445638020833, + "grad_norm": 17.059715270996094, + "learning_rate": 4.931303290802333e-06, + "loss": 4.0533, + "step": 7445 + }, + { + "epoch": 0.07578531901041667, + "grad_norm": 13.491350173950195, + "learning_rate": 4.931210161878114e-06, + "loss": 3.1253, + "step": 7450 + }, + { + "epoch": 0.075836181640625, + "grad_norm": 11.535757064819336, + "learning_rate": 4.93111697075196e-06, + "loss": 3.4529, + "step": 7455 + }, + { + "epoch": 0.07588704427083333, + "grad_norm": 8.205851554870605, + "learning_rate": 4.9310237174262535e-06, + "loss": 3.3011, + "step": 7460 + }, + { + "epoch": 0.07593790690104167, + "grad_norm": 10.262289047241211, + "learning_rate": 4.930930401903382e-06, + "loss": 3.2739, + "step": 7465 + }, + { + "epoch": 0.07598876953125, + "grad_norm": 11.151719093322754, + "learning_rate": 4.930837024185732e-06, + "loss": 3.6304, + "step": 7470 + }, + { + "epoch": 0.07603963216145833, + "grad_norm": 13.860788345336914, + "learning_rate": 4.930743584275694e-06, + "loss": 3.3064, + "step": 7475 + }, + { + "epoch": 0.07609049479166667, + "grad_norm": 12.631816864013672, + "learning_rate": 4.930650082175656e-06, + "loss": 3.3838, + "step": 7480 + }, + { + "epoch": 0.076141357421875, + "grad_norm": 15.496430397033691, + "learning_rate": 4.930556517888013e-06, + "loss": 3.3085, + "step": 7485 + }, + { + "epoch": 0.07619222005208333, + "grad_norm": 15.001185417175293, + "learning_rate": 4.930462891415156e-06, + "loss": 3.1146, + "step": 7490 + }, + { + "epoch": 0.07624308268229167, + "grad_norm": 9.253118515014648, + "learning_rate": 4.930369202759484e-06, + "loss": 3.5206, + "step": 7495 + }, + { + "epoch": 0.0762939453125, + "grad_norm": 9.574434280395508, + "learning_rate": 4.9302754519233905e-06, + "loss": 3.0834, + "step": 7500 + }, + { + "epoch": 0.07634480794270833, + "grad_norm": 8.028079986572266, + "learning_rate": 4.9301816389092775e-06, + "loss": 3.3453, + "step": 7505 + }, + { + "epoch": 0.07639567057291667, + "grad_norm": 15.80912971496582, + "learning_rate": 4.930087763719541e-06, + "loss": 3.3046, + "step": 7510 + }, + { + "epoch": 0.076446533203125, + "grad_norm": 12.27437686920166, + "learning_rate": 4.929993826356586e-06, + "loss": 3.3498, + "step": 7515 + }, + { + "epoch": 0.07649739583333333, + "grad_norm": 8.595794677734375, + "learning_rate": 4.9298998268228154e-06, + "loss": 3.4443, + "step": 7520 + }, + { + "epoch": 0.07654825846354167, + "grad_norm": 14.03144645690918, + "learning_rate": 4.929805765120633e-06, + "loss": 3.4373, + "step": 7525 + }, + { + "epoch": 0.07659912109375, + "grad_norm": 16.364065170288086, + "learning_rate": 4.929711641252446e-06, + "loss": 3.3485, + "step": 7530 + }, + { + "epoch": 0.07664998372395833, + "grad_norm": 13.637460708618164, + "learning_rate": 4.929617455220664e-06, + "loss": 3.1559, + "step": 7535 + }, + { + "epoch": 0.07670084635416667, + "grad_norm": 9.677896499633789, + "learning_rate": 4.929523207027693e-06, + "loss": 3.1611, + "step": 7540 + }, + { + "epoch": 0.076751708984375, + "grad_norm": 11.03842830657959, + "learning_rate": 4.929428896675949e-06, + "loss": 3.468, + "step": 7545 + }, + { + "epoch": 0.07680257161458333, + "grad_norm": 8.96399974822998, + "learning_rate": 4.92933452416784e-06, + "loss": 3.2743, + "step": 7550 + }, + { + "epoch": 0.07685343424479167, + "grad_norm": 14.87559986114502, + "learning_rate": 4.929240089505785e-06, + "loss": 3.4153, + "step": 7555 + }, + { + "epoch": 0.076904296875, + "grad_norm": 16.70171356201172, + "learning_rate": 4.929145592692197e-06, + "loss": 3.3959, + "step": 7560 + }, + { + "epoch": 0.07695515950520833, + "grad_norm": 10.184401512145996, + "learning_rate": 4.929051033729495e-06, + "loss": 3.3099, + "step": 7565 + }, + { + "epoch": 0.07700602213541667, + "grad_norm": 16.423912048339844, + "learning_rate": 4.928956412620098e-06, + "loss": 3.5852, + "step": 7570 + }, + { + "epoch": 0.077056884765625, + "grad_norm": 12.25012493133545, + "learning_rate": 4.928861729366427e-06, + "loss": 3.3323, + "step": 7575 + }, + { + "epoch": 0.07710774739583333, + "grad_norm": 14.227832794189453, + "learning_rate": 4.928766983970905e-06, + "loss": 3.7213, + "step": 7580 + }, + { + "epoch": 0.07715861002604167, + "grad_norm": 9.725828170776367, + "learning_rate": 4.928672176435955e-06, + "loss": 3.8787, + "step": 7585 + }, + { + "epoch": 0.07720947265625, + "grad_norm": 13.200632095336914, + "learning_rate": 4.928577306764003e-06, + "loss": 3.3488, + "step": 7590 + }, + { + "epoch": 0.07726033528645833, + "grad_norm": 16.116952896118164, + "learning_rate": 4.928482374957476e-06, + "loss": 3.5104, + "step": 7595 + }, + { + "epoch": 0.07731119791666667, + "grad_norm": 14.31235408782959, + "learning_rate": 4.928387381018803e-06, + "loss": 3.4802, + "step": 7600 + }, + { + "epoch": 0.077362060546875, + "grad_norm": 14.306567192077637, + "learning_rate": 4.928292324950415e-06, + "loss": 3.1986, + "step": 7605 + }, + { + "epoch": 0.07741292317708333, + "grad_norm": 10.679328918457031, + "learning_rate": 4.9281972067547435e-06, + "loss": 3.4726, + "step": 7610 + }, + { + "epoch": 0.07746378580729167, + "grad_norm": 13.975683212280273, + "learning_rate": 4.928102026434221e-06, + "loss": 3.5155, + "step": 7615 + }, + { + "epoch": 0.0775146484375, + "grad_norm": 11.450372695922852, + "learning_rate": 4.928006783991285e-06, + "loss": 3.26, + "step": 7620 + }, + { + "epoch": 0.07756551106770833, + "grad_norm": 9.844697952270508, + "learning_rate": 4.92791147942837e-06, + "loss": 3.552, + "step": 7625 + }, + { + "epoch": 0.07761637369791667, + "grad_norm": 13.510926246643066, + "learning_rate": 4.927816112747915e-06, + "loss": 3.2648, + "step": 7630 + }, + { + "epoch": 0.077667236328125, + "grad_norm": 16.180030822753906, + "learning_rate": 4.927720683952361e-06, + "loss": 3.8344, + "step": 7635 + }, + { + "epoch": 0.07771809895833333, + "grad_norm": 10.086830139160156, + "learning_rate": 4.9276251930441485e-06, + "loss": 3.5012, + "step": 7640 + }, + { + "epoch": 0.07776896158854167, + "grad_norm": 16.06332778930664, + "learning_rate": 4.927529640025721e-06, + "loss": 3.5488, + "step": 7645 + }, + { + "epoch": 0.07781982421875, + "grad_norm": 9.544757843017578, + "learning_rate": 4.927434024899522e-06, + "loss": 3.3311, + "step": 7650 + }, + { + "epoch": 0.07787068684895833, + "grad_norm": 10.502907752990723, + "learning_rate": 4.927338347668e-06, + "loss": 3.7151, + "step": 7655 + }, + { + "epoch": 0.07792154947916667, + "grad_norm": 21.776485443115234, + "learning_rate": 4.927242608333601e-06, + "loss": 3.6689, + "step": 7660 + }, + { + "epoch": 0.077972412109375, + "grad_norm": 9.892070770263672, + "learning_rate": 4.927146806898776e-06, + "loss": 3.4572, + "step": 7665 + }, + { + "epoch": 0.07802327473958333, + "grad_norm": 10.099709510803223, + "learning_rate": 4.927050943365974e-06, + "loss": 3.1367, + "step": 7670 + }, + { + "epoch": 0.07807413736979167, + "grad_norm": 11.609224319458008, + "learning_rate": 4.92695501773765e-06, + "loss": 3.6545, + "step": 7675 + }, + { + "epoch": 0.078125, + "grad_norm": 12.737848281860352, + "learning_rate": 4.926859030016257e-06, + "loss": 3.4818, + "step": 7680 + }, + { + "epoch": 0.07817586263020833, + "grad_norm": 11.716086387634277, + "learning_rate": 4.926762980204251e-06, + "loss": 3.3399, + "step": 7685 + }, + { + "epoch": 0.07822672526041667, + "grad_norm": 15.350922584533691, + "learning_rate": 4.926666868304089e-06, + "loss": 3.2643, + "step": 7690 + }, + { + "epoch": 0.078277587890625, + "grad_norm": 12.929101943969727, + "learning_rate": 4.9265706943182305e-06, + "loss": 3.1085, + "step": 7695 + }, + { + "epoch": 0.07832845052083333, + "grad_norm": 11.21131706237793, + "learning_rate": 4.926474458249137e-06, + "loss": 3.3425, + "step": 7700 + }, + { + "epoch": 0.07837931315104167, + "grad_norm": 16.155336380004883, + "learning_rate": 4.9263781600992675e-06, + "loss": 3.7774, + "step": 7705 + }, + { + "epoch": 0.07843017578125, + "grad_norm": 16.52438735961914, + "learning_rate": 4.926281799871089e-06, + "loss": 3.6957, + "step": 7710 + }, + { + "epoch": 0.07848103841145833, + "grad_norm": 15.890446662902832, + "learning_rate": 4.926185377567065e-06, + "loss": 3.5257, + "step": 7715 + }, + { + "epoch": 0.07853190104166667, + "grad_norm": 12.618369102478027, + "learning_rate": 4.926088893189665e-06, + "loss": 3.5028, + "step": 7720 + }, + { + "epoch": 0.078582763671875, + "grad_norm": 8.554431915283203, + "learning_rate": 4.925992346741354e-06, + "loss": 3.0891, + "step": 7725 + }, + { + "epoch": 0.07863362630208333, + "grad_norm": 15.57974624633789, + "learning_rate": 4.9258957382246045e-06, + "loss": 3.4668, + "step": 7730 + }, + { + "epoch": 0.07868448893229167, + "grad_norm": 13.180612564086914, + "learning_rate": 4.925799067641888e-06, + "loss": 3.6996, + "step": 7735 + }, + { + "epoch": 0.0787353515625, + "grad_norm": 10.547351837158203, + "learning_rate": 4.9257023349956765e-06, + "loss": 3.8278, + "step": 7740 + }, + { + "epoch": 0.07878621419270833, + "grad_norm": 18.47353744506836, + "learning_rate": 4.925605540288445e-06, + "loss": 3.5644, + "step": 7745 + }, + { + "epoch": 0.07883707682291667, + "grad_norm": 14.555137634277344, + "learning_rate": 4.925508683522673e-06, + "loss": 3.3353, + "step": 7750 + }, + { + "epoch": 0.078887939453125, + "grad_norm": 14.03541088104248, + "learning_rate": 4.925411764700834e-06, + "loss": 3.3763, + "step": 7755 + }, + { + "epoch": 0.07893880208333333, + "grad_norm": 13.177834510803223, + "learning_rate": 4.925314783825411e-06, + "loss": 3.6113, + "step": 7760 + }, + { + "epoch": 0.07898966471354167, + "grad_norm": 12.112380027770996, + "learning_rate": 4.925217740898884e-06, + "loss": 3.3866, + "step": 7765 + }, + { + "epoch": 0.07904052734375, + "grad_norm": 12.616769790649414, + "learning_rate": 4.925120635923736e-06, + "loss": 3.7202, + "step": 7770 + }, + { + "epoch": 0.07909138997395833, + "grad_norm": 12.70201587677002, + "learning_rate": 4.925023468902451e-06, + "loss": 3.316, + "step": 7775 + }, + { + "epoch": 0.07914225260416667, + "grad_norm": 15.856217384338379, + "learning_rate": 4.924926239837515e-06, + "loss": 3.36, + "step": 7780 + }, + { + "epoch": 0.079193115234375, + "grad_norm": 12.738685607910156, + "learning_rate": 4.9248289487314174e-06, + "loss": 3.3463, + "step": 7785 + }, + { + "epoch": 0.07924397786458333, + "grad_norm": 15.38477897644043, + "learning_rate": 4.924731595586645e-06, + "loss": 3.3254, + "step": 7790 + }, + { + "epoch": 0.07929484049479167, + "grad_norm": 12.949041366577148, + "learning_rate": 4.924634180405689e-06, + "loss": 3.2682, + "step": 7795 + }, + { + "epoch": 0.079345703125, + "grad_norm": 17.46603775024414, + "learning_rate": 4.924536703191043e-06, + "loss": 3.5482, + "step": 7800 + }, + { + "epoch": 0.07939656575520833, + "grad_norm": 15.399724960327148, + "learning_rate": 4.9244391639451995e-06, + "loss": 3.5545, + "step": 7805 + }, + { + "epoch": 0.07944742838541667, + "grad_norm": 13.765381813049316, + "learning_rate": 4.924341562670655e-06, + "loss": 3.2777, + "step": 7810 + }, + { + "epoch": 0.079498291015625, + "grad_norm": 13.219882011413574, + "learning_rate": 4.924243899369906e-06, + "loss": 3.4312, + "step": 7815 + }, + { + "epoch": 0.07954915364583333, + "grad_norm": 15.765204429626465, + "learning_rate": 4.924146174045451e-06, + "loss": 3.4005, + "step": 7820 + }, + { + "epoch": 0.07960001627604167, + "grad_norm": 14.722639083862305, + "learning_rate": 4.924048386699792e-06, + "loss": 3.4344, + "step": 7825 + }, + { + "epoch": 0.07965087890625, + "grad_norm": 14.24759292602539, + "learning_rate": 4.923950537335429e-06, + "loss": 3.5187, + "step": 7830 + }, + { + "epoch": 0.07970174153645833, + "grad_norm": 14.9491548538208, + "learning_rate": 4.923852625954866e-06, + "loss": 3.3459, + "step": 7835 + }, + { + "epoch": 0.07975260416666667, + "grad_norm": 10.572219848632812, + "learning_rate": 4.9237546525606075e-06, + "loss": 3.6514, + "step": 7840 + }, + { + "epoch": 0.079803466796875, + "grad_norm": 13.39721393585205, + "learning_rate": 4.923656617155162e-06, + "loss": 3.4159, + "step": 7845 + }, + { + "epoch": 0.07985432942708333, + "grad_norm": 13.185128211975098, + "learning_rate": 4.923558519741035e-06, + "loss": 3.3885, + "step": 7850 + }, + { + "epoch": 0.07990519205729167, + "grad_norm": 10.458941459655762, + "learning_rate": 4.923460360320738e-06, + "loss": 3.8603, + "step": 7855 + }, + { + "epoch": 0.0799560546875, + "grad_norm": 16.00554084777832, + "learning_rate": 4.923362138896782e-06, + "loss": 3.8007, + "step": 7860 + }, + { + "epoch": 0.08000691731770833, + "grad_norm": 19.089359283447266, + "learning_rate": 4.923263855471681e-06, + "loss": 3.4031, + "step": 7865 + }, + { + "epoch": 0.08005777994791667, + "grad_norm": 16.948728561401367, + "learning_rate": 4.923165510047948e-06, + "loss": 3.4621, + "step": 7870 + }, + { + "epoch": 0.080108642578125, + "grad_norm": 15.898715019226074, + "learning_rate": 4.9230671026281e-06, + "loss": 3.2493, + "step": 7875 + }, + { + "epoch": 0.08015950520833333, + "grad_norm": 15.451108932495117, + "learning_rate": 4.922968633214654e-06, + "loss": 3.5485, + "step": 7880 + }, + { + "epoch": 0.08021036783854167, + "grad_norm": 8.44896411895752, + "learning_rate": 4.922870101810131e-06, + "loss": 3.6708, + "step": 7885 + }, + { + "epoch": 0.08026123046875, + "grad_norm": 15.81263542175293, + "learning_rate": 4.92277150841705e-06, + "loss": 3.0769, + "step": 7890 + }, + { + "epoch": 0.08031209309895833, + "grad_norm": 11.589963912963867, + "learning_rate": 4.922672853037934e-06, + "loss": 3.1524, + "step": 7895 + }, + { + "epoch": 0.08036295572916667, + "grad_norm": 11.018848419189453, + "learning_rate": 4.922574135675308e-06, + "loss": 3.4289, + "step": 7900 + }, + { + "epoch": 0.080413818359375, + "grad_norm": 14.702198028564453, + "learning_rate": 4.922475356331696e-06, + "loss": 3.7443, + "step": 7905 + }, + { + "epoch": 0.08046468098958333, + "grad_norm": 7.636882305145264, + "learning_rate": 4.922376515009627e-06, + "loss": 3.2372, + "step": 7910 + }, + { + "epoch": 0.08051554361979167, + "grad_norm": 14.174872398376465, + "learning_rate": 4.922277611711629e-06, + "loss": 3.2774, + "step": 7915 + }, + { + "epoch": 0.08056640625, + "grad_norm": 7.712305068969727, + "learning_rate": 4.922178646440232e-06, + "loss": 2.9999, + "step": 7920 + }, + { + "epoch": 0.08061726888020833, + "grad_norm": 13.299323081970215, + "learning_rate": 4.922079619197968e-06, + "loss": 3.4508, + "step": 7925 + }, + { + "epoch": 0.08066813151041667, + "grad_norm": 12.260127067565918, + "learning_rate": 4.9219805299873715e-06, + "loss": 3.5971, + "step": 7930 + }, + { + "epoch": 0.080718994140625, + "grad_norm": 11.661309242248535, + "learning_rate": 4.9218813788109776e-06, + "loss": 3.2826, + "step": 7935 + }, + { + "epoch": 0.08076985677083333, + "grad_norm": 14.099932670593262, + "learning_rate": 4.921782165671322e-06, + "loss": 3.2441, + "step": 7940 + }, + { + "epoch": 0.08082071940104167, + "grad_norm": 11.960383415222168, + "learning_rate": 4.9216828905709445e-06, + "loss": 3.2706, + "step": 7945 + }, + { + "epoch": 0.08087158203125, + "grad_norm": 14.964332580566406, + "learning_rate": 4.921583553512384e-06, + "loss": 2.997, + "step": 7950 + }, + { + "epoch": 0.08092244466145833, + "grad_norm": 11.150708198547363, + "learning_rate": 4.9214841544981826e-06, + "loss": 3.3165, + "step": 7955 + }, + { + "epoch": 0.08097330729166667, + "grad_norm": 9.034783363342285, + "learning_rate": 4.9213846935308816e-06, + "loss": 3.4451, + "step": 7960 + }, + { + "epoch": 0.081024169921875, + "grad_norm": 12.717357635498047, + "learning_rate": 4.921285170613029e-06, + "loss": 3.4678, + "step": 7965 + }, + { + "epoch": 0.08107503255208333, + "grad_norm": 15.22363567352295, + "learning_rate": 4.921185585747168e-06, + "loss": 3.3836, + "step": 7970 + }, + { + "epoch": 0.08112589518229167, + "grad_norm": 12.619773864746094, + "learning_rate": 4.9210859389358475e-06, + "loss": 3.3497, + "step": 7975 + }, + { + "epoch": 0.0811767578125, + "grad_norm": 10.314453125, + "learning_rate": 4.920986230181618e-06, + "loss": 3.3937, + "step": 7980 + }, + { + "epoch": 0.08122762044270833, + "grad_norm": 16.396963119506836, + "learning_rate": 4.920886459487029e-06, + "loss": 3.2124, + "step": 7985 + }, + { + "epoch": 0.08127848307291667, + "grad_norm": 10.935754776000977, + "learning_rate": 4.920786626854634e-06, + "loss": 3.3134, + "step": 7990 + }, + { + "epoch": 0.081329345703125, + "grad_norm": 11.221820831298828, + "learning_rate": 4.920686732286988e-06, + "loss": 3.3703, + "step": 7995 + }, + { + "epoch": 0.08138020833333333, + "grad_norm": 15.557806968688965, + "learning_rate": 4.9205867757866445e-06, + "loss": 3.2063, + "step": 8000 + }, + { + "epoch": 0.08143107096354167, + "grad_norm": 14.617751121520996, + "learning_rate": 4.920486757356162e-06, + "loss": 3.4361, + "step": 8005 + }, + { + "epoch": 0.08148193359375, + "grad_norm": 7.648261070251465, + "learning_rate": 4.9203866769981e-06, + "loss": 3.1502, + "step": 8010 + }, + { + "epoch": 0.08153279622395833, + "grad_norm": 13.730716705322266, + "learning_rate": 4.920286534715018e-06, + "loss": 3.7119, + "step": 8015 + }, + { + "epoch": 0.08158365885416667, + "grad_norm": 14.1985502243042, + "learning_rate": 4.9201863305094786e-06, + "loss": 3.4887, + "step": 8020 + }, + { + "epoch": 0.081634521484375, + "grad_norm": 13.51031494140625, + "learning_rate": 4.920086064384046e-06, + "loss": 3.0307, + "step": 8025 + }, + { + "epoch": 0.08168538411458333, + "grad_norm": 11.816555976867676, + "learning_rate": 4.919985736341286e-06, + "loss": 3.4726, + "step": 8030 + }, + { + "epoch": 0.08173624674479167, + "grad_norm": 15.351333618164062, + "learning_rate": 4.919885346383764e-06, + "loss": 3.6725, + "step": 8035 + }, + { + "epoch": 0.081787109375, + "grad_norm": 17.565916061401367, + "learning_rate": 4.919784894514048e-06, + "loss": 3.551, + "step": 8040 + }, + { + "epoch": 0.08183797200520833, + "grad_norm": 11.794846534729004, + "learning_rate": 4.91968438073471e-06, + "loss": 3.7001, + "step": 8045 + }, + { + "epoch": 0.08188883463541667, + "grad_norm": 13.469988822937012, + "learning_rate": 4.919583805048321e-06, + "loss": 3.2706, + "step": 8050 + }, + { + "epoch": 0.081939697265625, + "grad_norm": 9.399026870727539, + "learning_rate": 4.919483167457452e-06, + "loss": 3.476, + "step": 8055 + }, + { + "epoch": 0.08199055989583333, + "grad_norm": 10.455305099487305, + "learning_rate": 4.919382467964681e-06, + "loss": 3.78, + "step": 8060 + }, + { + "epoch": 0.08204142252604167, + "grad_norm": 13.494087219238281, + "learning_rate": 4.919281706572583e-06, + "loss": 3.5696, + "step": 8065 + }, + { + "epoch": 0.08209228515625, + "grad_norm": 9.601789474487305, + "learning_rate": 4.919180883283735e-06, + "loss": 3.4697, + "step": 8070 + }, + { + "epoch": 0.08214314778645833, + "grad_norm": 8.843679428100586, + "learning_rate": 4.919079998100719e-06, + "loss": 3.3273, + "step": 8075 + }, + { + "epoch": 0.08219401041666667, + "grad_norm": 17.03862953186035, + "learning_rate": 4.918979051026113e-06, + "loss": 3.3333, + "step": 8080 + }, + { + "epoch": 0.082244873046875, + "grad_norm": 9.111615180969238, + "learning_rate": 4.918878042062503e-06, + "loss": 3.1043, + "step": 8085 + }, + { + "epoch": 0.08229573567708333, + "grad_norm": 9.032851219177246, + "learning_rate": 4.918776971212471e-06, + "loss": 3.1896, + "step": 8090 + }, + { + "epoch": 0.08234659830729167, + "grad_norm": 15.128028869628906, + "learning_rate": 4.918675838478603e-06, + "loss": 3.5589, + "step": 8095 + }, + { + "epoch": 0.0823974609375, + "grad_norm": 9.789177894592285, + "learning_rate": 4.918574643863488e-06, + "loss": 3.4767, + "step": 8100 + }, + { + "epoch": 0.08244832356770833, + "grad_norm": 8.648837089538574, + "learning_rate": 4.918473387369713e-06, + "loss": 3.6547, + "step": 8105 + }, + { + "epoch": 0.08249918619791667, + "grad_norm": 12.925644874572754, + "learning_rate": 4.91837206899987e-06, + "loss": 3.4412, + "step": 8110 + }, + { + "epoch": 0.082550048828125, + "grad_norm": 10.411079406738281, + "learning_rate": 4.918270688756551e-06, + "loss": 3.501, + "step": 8115 + }, + { + "epoch": 0.08260091145833333, + "grad_norm": 16.97960662841797, + "learning_rate": 4.918169246642349e-06, + "loss": 3.9448, + "step": 8120 + }, + { + "epoch": 0.08265177408854167, + "grad_norm": 13.027390480041504, + "learning_rate": 4.91806774265986e-06, + "loss": 3.2829, + "step": 8125 + }, + { + "epoch": 0.08270263671875, + "grad_norm": 11.88217544555664, + "learning_rate": 4.9179661768116815e-06, + "loss": 3.6066, + "step": 8130 + }, + { + "epoch": 0.08275349934895833, + "grad_norm": 9.799572944641113, + "learning_rate": 4.9178645491004115e-06, + "loss": 3.2738, + "step": 8135 + }, + { + "epoch": 0.08280436197916667, + "grad_norm": 12.099909782409668, + "learning_rate": 4.91776285952865e-06, + "loss": 3.2961, + "step": 8140 + }, + { + "epoch": 0.082855224609375, + "grad_norm": 11.036294937133789, + "learning_rate": 4.917661108098999e-06, + "loss": 3.5535, + "step": 8145 + }, + { + "epoch": 0.08290608723958333, + "grad_norm": 16.11235237121582, + "learning_rate": 4.9175592948140614e-06, + "loss": 3.5957, + "step": 8150 + }, + { + "epoch": 0.08295694986979167, + "grad_norm": 12.630952835083008, + "learning_rate": 4.917457419676443e-06, + "loss": 3.1787, + "step": 8155 + }, + { + "epoch": 0.0830078125, + "grad_norm": 10.421707153320312, + "learning_rate": 4.9173554826887485e-06, + "loss": 3.5609, + "step": 8160 + }, + { + "epoch": 0.08305867513020833, + "grad_norm": 14.865351676940918, + "learning_rate": 4.917253483853587e-06, + "loss": 3.4347, + "step": 8165 + }, + { + "epoch": 0.08310953776041667, + "grad_norm": 10.349051475524902, + "learning_rate": 4.917151423173568e-06, + "loss": 3.3103, + "step": 8170 + }, + { + "epoch": 0.083160400390625, + "grad_norm": 12.763856887817383, + "learning_rate": 4.917049300651303e-06, + "loss": 3.2145, + "step": 8175 + }, + { + "epoch": 0.08321126302083333, + "grad_norm": 13.702238082885742, + "learning_rate": 4.916947116289405e-06, + "loss": 3.4389, + "step": 8180 + }, + { + "epoch": 0.08326212565104167, + "grad_norm": 8.125384330749512, + "learning_rate": 4.916844870090487e-06, + "loss": 3.1778, + "step": 8185 + }, + { + "epoch": 0.08331298828125, + "grad_norm": 13.532679557800293, + "learning_rate": 4.916742562057166e-06, + "loss": 3.5005, + "step": 8190 + }, + { + "epoch": 0.08336385091145833, + "grad_norm": 11.276693344116211, + "learning_rate": 4.91664019219206e-06, + "loss": 3.1357, + "step": 8195 + }, + { + "epoch": 0.08341471354166667, + "grad_norm": 10.653390884399414, + "learning_rate": 4.916537760497787e-06, + "loss": 2.9572, + "step": 8200 + }, + { + "epoch": 0.083465576171875, + "grad_norm": 19.044883728027344, + "learning_rate": 4.9164352669769685e-06, + "loss": 3.4532, + "step": 8205 + }, + { + "epoch": 0.08351643880208333, + "grad_norm": 9.898783683776855, + "learning_rate": 4.916332711632227e-06, + "loss": 3.3115, + "step": 8210 + }, + { + "epoch": 0.08356730143229167, + "grad_norm": 7.111556053161621, + "learning_rate": 4.916230094466185e-06, + "loss": 3.5531, + "step": 8215 + }, + { + "epoch": 0.0836181640625, + "grad_norm": 15.385550498962402, + "learning_rate": 4.916127415481469e-06, + "loss": 3.5578, + "step": 8220 + }, + { + "epoch": 0.08366902669270833, + "grad_norm": 9.553897857666016, + "learning_rate": 4.916024674680705e-06, + "loss": 3.4616, + "step": 8225 + }, + { + "epoch": 0.08371988932291667, + "grad_norm": 9.77919864654541, + "learning_rate": 4.915921872066524e-06, + "loss": 3.5612, + "step": 8230 + }, + { + "epoch": 0.083770751953125, + "grad_norm": 13.859053611755371, + "learning_rate": 4.915819007641553e-06, + "loss": 3.3759, + "step": 8235 + }, + { + "epoch": 0.08382161458333333, + "grad_norm": 13.282938003540039, + "learning_rate": 4.915716081408426e-06, + "loss": 3.1999, + "step": 8240 + }, + { + "epoch": 0.08387247721354167, + "grad_norm": 8.742036819458008, + "learning_rate": 4.9156130933697756e-06, + "loss": 3.6131, + "step": 8245 + }, + { + "epoch": 0.08392333984375, + "grad_norm": 10.683076858520508, + "learning_rate": 4.915510043528237e-06, + "loss": 3.6148, + "step": 8250 + }, + { + "epoch": 0.08397420247395833, + "grad_norm": 13.600173950195312, + "learning_rate": 4.915406931886446e-06, + "loss": 3.0754, + "step": 8255 + }, + { + "epoch": 0.08402506510416667, + "grad_norm": 9.838703155517578, + "learning_rate": 4.915303758447041e-06, + "loss": 3.422, + "step": 8260 + }, + { + "epoch": 0.084075927734375, + "grad_norm": 17.08401107788086, + "learning_rate": 4.915200523212662e-06, + "loss": 3.5272, + "step": 8265 + }, + { + "epoch": 0.08412679036458333, + "grad_norm": 11.889626502990723, + "learning_rate": 4.91509722618595e-06, + "loss": 3.244, + "step": 8270 + }, + { + "epoch": 0.08417765299479167, + "grad_norm": 13.750388145446777, + "learning_rate": 4.914993867369549e-06, + "loss": 3.0886, + "step": 8275 + }, + { + "epoch": 0.084228515625, + "grad_norm": 12.008491516113281, + "learning_rate": 4.914890446766101e-06, + "loss": 3.4445, + "step": 8280 + }, + { + "epoch": 0.08427937825520833, + "grad_norm": 8.624137878417969, + "learning_rate": 4.914786964378253e-06, + "loss": 3.1456, + "step": 8285 + }, + { + "epoch": 0.08433024088541667, + "grad_norm": 13.373948097229004, + "learning_rate": 4.914683420208654e-06, + "loss": 3.1974, + "step": 8290 + }, + { + "epoch": 0.084381103515625, + "grad_norm": 11.786771774291992, + "learning_rate": 4.914579814259952e-06, + "loss": 3.7054, + "step": 8295 + }, + { + "epoch": 0.08443196614583333, + "grad_norm": 12.221219062805176, + "learning_rate": 4.914476146534797e-06, + "loss": 3.3905, + "step": 8300 + }, + { + "epoch": 0.08448282877604167, + "grad_norm": 11.495316505432129, + "learning_rate": 4.914372417035843e-06, + "loss": 3.9629, + "step": 8305 + }, + { + "epoch": 0.08453369140625, + "grad_norm": 13.219930648803711, + "learning_rate": 4.914268625765742e-06, + "loss": 3.359, + "step": 8310 + }, + { + "epoch": 0.08458455403645833, + "grad_norm": 11.233851432800293, + "learning_rate": 4.9141647727271515e-06, + "loss": 3.6808, + "step": 8315 + }, + { + "epoch": 0.08463541666666667, + "grad_norm": 10.176496505737305, + "learning_rate": 4.914060857922727e-06, + "loss": 3.3135, + "step": 8320 + }, + { + "epoch": 0.084686279296875, + "grad_norm": 10.016979217529297, + "learning_rate": 4.9139568813551275e-06, + "loss": 3.8656, + "step": 8325 + }, + { + "epoch": 0.08473714192708333, + "grad_norm": 14.729958534240723, + "learning_rate": 4.913852843027013e-06, + "loss": 3.4418, + "step": 8330 + }, + { + "epoch": 0.08478800455729167, + "grad_norm": 16.3724422454834, + "learning_rate": 4.913748742941046e-06, + "loss": 3.3672, + "step": 8335 + }, + { + "epoch": 0.0848388671875, + "grad_norm": 9.82769775390625, + "learning_rate": 4.91364458109989e-06, + "loss": 3.0876, + "step": 8340 + }, + { + "epoch": 0.08488972981770833, + "grad_norm": 12.202662467956543, + "learning_rate": 4.913540357506209e-06, + "loss": 3.3755, + "step": 8345 + }, + { + "epoch": 0.08494059244791667, + "grad_norm": 13.47652530670166, + "learning_rate": 4.913436072162671e-06, + "loss": 3.3207, + "step": 8350 + }, + { + "epoch": 0.084991455078125, + "grad_norm": 13.997584342956543, + "learning_rate": 4.913331725071942e-06, + "loss": 3.4016, + "step": 8355 + }, + { + "epoch": 0.08504231770833333, + "grad_norm": 10.103306770324707, + "learning_rate": 4.9132273162366926e-06, + "loss": 3.9784, + "step": 8360 + }, + { + "epoch": 0.08509318033854167, + "grad_norm": 13.56583023071289, + "learning_rate": 4.913122845659595e-06, + "loss": 3.5657, + "step": 8365 + }, + { + "epoch": 0.08514404296875, + "grad_norm": 16.747276306152344, + "learning_rate": 4.913018313343322e-06, + "loss": 3.3329, + "step": 8370 + }, + { + "epoch": 0.08519490559895833, + "grad_norm": 15.131979942321777, + "learning_rate": 4.912913719290546e-06, + "loss": 3.7669, + "step": 8375 + }, + { + "epoch": 0.08524576822916667, + "grad_norm": 13.406637191772461, + "learning_rate": 4.912809063503945e-06, + "loss": 3.8842, + "step": 8380 + }, + { + "epoch": 0.085296630859375, + "grad_norm": 10.718530654907227, + "learning_rate": 4.912704345986196e-06, + "loss": 3.3867, + "step": 8385 + }, + { + "epoch": 0.08534749348958333, + "grad_norm": 10.112154006958008, + "learning_rate": 4.912599566739979e-06, + "loss": 3.1978, + "step": 8390 + }, + { + "epoch": 0.08539835611979167, + "grad_norm": 9.753875732421875, + "learning_rate": 4.912494725767972e-06, + "loss": 3.2426, + "step": 8395 + }, + { + "epoch": 0.08544921875, + "grad_norm": 13.011728286743164, + "learning_rate": 4.9123898230728616e-06, + "loss": 3.689, + "step": 8400 + }, + { + "epoch": 0.08550008138020833, + "grad_norm": 10.342684745788574, + "learning_rate": 4.912284858657328e-06, + "loss": 3.2583, + "step": 8405 + }, + { + "epoch": 0.08555094401041667, + "grad_norm": 15.096055030822754, + "learning_rate": 4.9121798325240574e-06, + "loss": 3.797, + "step": 8410 + }, + { + "epoch": 0.085601806640625, + "grad_norm": 16.6988582611084, + "learning_rate": 4.912074744675739e-06, + "loss": 3.7339, + "step": 8415 + }, + { + "epoch": 0.08565266927083333, + "grad_norm": 9.644712448120117, + "learning_rate": 4.911969595115059e-06, + "loss": 3.6449, + "step": 8420 + }, + { + "epoch": 0.08570353190104167, + "grad_norm": 9.427045822143555, + "learning_rate": 4.911864383844709e-06, + "loss": 3.7292, + "step": 8425 + }, + { + "epoch": 0.08575439453125, + "grad_norm": 12.242361068725586, + "learning_rate": 4.9117591108673815e-06, + "loss": 3.6643, + "step": 8430 + }, + { + "epoch": 0.08580525716145833, + "grad_norm": 15.966888427734375, + "learning_rate": 4.911653776185768e-06, + "loss": 3.7233, + "step": 8435 + }, + { + "epoch": 0.08585611979166667, + "grad_norm": 13.699934005737305, + "learning_rate": 4.9115483798025635e-06, + "loss": 3.2923, + "step": 8440 + }, + { + "epoch": 0.085906982421875, + "grad_norm": 18.62406349182129, + "learning_rate": 4.911442921720465e-06, + "loss": 3.3712, + "step": 8445 + }, + { + "epoch": 0.08595784505208333, + "grad_norm": 12.284649848937988, + "learning_rate": 4.911337401942172e-06, + "loss": 3.0775, + "step": 8450 + }, + { + "epoch": 0.08600870768229167, + "grad_norm": 15.336729049682617, + "learning_rate": 4.911231820470383e-06, + "loss": 3.1454, + "step": 8455 + }, + { + "epoch": 0.0860595703125, + "grad_norm": 12.336068153381348, + "learning_rate": 4.911126177307799e-06, + "loss": 3.5428, + "step": 8460 + }, + { + "epoch": 0.08611043294270833, + "grad_norm": 9.04693603515625, + "learning_rate": 4.911020472457124e-06, + "loss": 4.0488, + "step": 8465 + }, + { + "epoch": 0.08616129557291667, + "grad_norm": 8.606358528137207, + "learning_rate": 4.91091470592106e-06, + "loss": 3.2326, + "step": 8470 + }, + { + "epoch": 0.086212158203125, + "grad_norm": 16.552690505981445, + "learning_rate": 4.910808877702317e-06, + "loss": 3.6028, + "step": 8475 + }, + { + "epoch": 0.08626302083333333, + "grad_norm": 10.208168029785156, + "learning_rate": 4.910702987803599e-06, + "loss": 3.4335, + "step": 8480 + }, + { + "epoch": 0.08631388346354167, + "grad_norm": 12.139317512512207, + "learning_rate": 4.910597036227617e-06, + "loss": 3.2395, + "step": 8485 + }, + { + "epoch": 0.08636474609375, + "grad_norm": 10.915331840515137, + "learning_rate": 4.91049102297708e-06, + "loss": 3.4295, + "step": 8490 + }, + { + "epoch": 0.08641560872395833, + "grad_norm": 9.047663688659668, + "learning_rate": 4.910384948054703e-06, + "loss": 3.7313, + "step": 8495 + }, + { + "epoch": 0.08646647135416667, + "grad_norm": 11.79930305480957, + "learning_rate": 4.910278811463197e-06, + "loss": 3.8923, + "step": 8500 + }, + { + "epoch": 0.086517333984375, + "grad_norm": 9.152587890625, + "learning_rate": 4.91017261320528e-06, + "loss": 3.4595, + "step": 8505 + }, + { + "epoch": 0.08656819661458333, + "grad_norm": 10.824846267700195, + "learning_rate": 4.910066353283668e-06, + "loss": 3.2886, + "step": 8510 + }, + { + "epoch": 0.08661905924479167, + "grad_norm": 6.95084285736084, + "learning_rate": 4.909960031701079e-06, + "loss": 3.3848, + "step": 8515 + }, + { + "epoch": 0.086669921875, + "grad_norm": 16.21605682373047, + "learning_rate": 4.9098536484602334e-06, + "loss": 3.2412, + "step": 8520 + }, + { + "epoch": 0.08672078450520833, + "grad_norm": 8.85698127746582, + "learning_rate": 4.909747203563855e-06, + "loss": 3.5511, + "step": 8525 + }, + { + "epoch": 0.08677164713541667, + "grad_norm": 16.60267448425293, + "learning_rate": 4.909640697014664e-06, + "loss": 3.9667, + "step": 8530 + }, + { + "epoch": 0.086822509765625, + "grad_norm": 14.518190383911133, + "learning_rate": 4.909534128815387e-06, + "loss": 3.3415, + "step": 8535 + }, + { + "epoch": 0.08687337239583333, + "grad_norm": 10.557718276977539, + "learning_rate": 4.909427498968752e-06, + "loss": 3.4739, + "step": 8540 + }, + { + "epoch": 0.08692423502604167, + "grad_norm": 11.117448806762695, + "learning_rate": 4.909320807477485e-06, + "loss": 3.345, + "step": 8545 + }, + { + "epoch": 0.08697509765625, + "grad_norm": 13.413477897644043, + "learning_rate": 4.9092140543443145e-06, + "loss": 3.8153, + "step": 8550 + }, + { + "epoch": 0.08702596028645833, + "grad_norm": 11.934691429138184, + "learning_rate": 4.909107239571975e-06, + "loss": 3.548, + "step": 8555 + }, + { + "epoch": 0.08707682291666667, + "grad_norm": 10.76050853729248, + "learning_rate": 4.9090003631631975e-06, + "loss": 3.3043, + "step": 8560 + }, + { + "epoch": 0.087127685546875, + "grad_norm": 10.168484687805176, + "learning_rate": 4.9088934251207165e-06, + "loss": 3.8714, + "step": 8565 + }, + { + "epoch": 0.08717854817708333, + "grad_norm": 13.013615608215332, + "learning_rate": 4.908786425447269e-06, + "loss": 3.5501, + "step": 8570 + }, + { + "epoch": 0.08722941080729167, + "grad_norm": 19.23887062072754, + "learning_rate": 4.908679364145591e-06, + "loss": 3.5044, + "step": 8575 + }, + { + "epoch": 0.0872802734375, + "grad_norm": 14.924409866333008, + "learning_rate": 4.908572241218422e-06, + "loss": 3.2358, + "step": 8580 + }, + { + "epoch": 0.08733113606770833, + "grad_norm": 12.832475662231445, + "learning_rate": 4.908465056668504e-06, + "loss": 3.6383, + "step": 8585 + }, + { + "epoch": 0.08738199869791667, + "grad_norm": 15.433663368225098, + "learning_rate": 4.908357810498578e-06, + "loss": 3.4119, + "step": 8590 + }, + { + "epoch": 0.087432861328125, + "grad_norm": 8.936971664428711, + "learning_rate": 4.908250502711388e-06, + "loss": 3.2385, + "step": 8595 + }, + { + "epoch": 0.08748372395833333, + "grad_norm": 15.772163391113281, + "learning_rate": 4.9081431333096805e-06, + "loss": 3.5685, + "step": 8600 + }, + { + "epoch": 0.08753458658854167, + "grad_norm": 11.028748512268066, + "learning_rate": 4.908035702296201e-06, + "loss": 3.4128, + "step": 8605 + }, + { + "epoch": 0.08758544921875, + "grad_norm": 12.614755630493164, + "learning_rate": 4.907928209673699e-06, + "loss": 3.2888, + "step": 8610 + }, + { + "epoch": 0.08763631184895833, + "grad_norm": 12.599555969238281, + "learning_rate": 4.907820655444924e-06, + "loss": 3.1925, + "step": 8615 + }, + { + "epoch": 0.08768717447916667, + "grad_norm": 8.425646781921387, + "learning_rate": 4.907713039612629e-06, + "loss": 3.2201, + "step": 8620 + }, + { + "epoch": 0.087738037109375, + "grad_norm": 11.59953784942627, + "learning_rate": 4.907605362179566e-06, + "loss": 3.2422, + "step": 8625 + }, + { + "epoch": 0.08778889973958333, + "grad_norm": 7.5655131340026855, + "learning_rate": 4.907497623148491e-06, + "loss": 3.5319, + "step": 8630 + }, + { + "epoch": 0.08783976236979167, + "grad_norm": 15.107267379760742, + "learning_rate": 4.90738982252216e-06, + "loss": 3.5978, + "step": 8635 + }, + { + "epoch": 0.087890625, + "grad_norm": 15.67873764038086, + "learning_rate": 4.90728196030333e-06, + "loss": 3.2055, + "step": 8640 + }, + { + "epoch": 0.08794148763020833, + "grad_norm": 13.261436462402344, + "learning_rate": 4.907174036494763e-06, + "loss": 3.0959, + "step": 8645 + }, + { + "epoch": 0.08799235026041667, + "grad_norm": 8.268048286437988, + "learning_rate": 4.907066051099219e-06, + "loss": 4.1154, + "step": 8650 + }, + { + "epoch": 0.088043212890625, + "grad_norm": 10.855912208557129, + "learning_rate": 4.906958004119459e-06, + "loss": 3.7448, + "step": 8655 + }, + { + "epoch": 0.08809407552083333, + "grad_norm": 11.688394546508789, + "learning_rate": 4.90684989555825e-06, + "loss": 3.5907, + "step": 8660 + }, + { + "epoch": 0.08814493815104167, + "grad_norm": 10.814862251281738, + "learning_rate": 4.906741725418357e-06, + "loss": 3.0817, + "step": 8665 + }, + { + "epoch": 0.08819580078125, + "grad_norm": 15.317708969116211, + "learning_rate": 4.906633493702547e-06, + "loss": 3.3432, + "step": 8670 + }, + { + "epoch": 0.08824666341145833, + "grad_norm": 9.729673385620117, + "learning_rate": 4.9065252004135896e-06, + "loss": 3.2816, + "step": 8675 + }, + { + "epoch": 0.08829752604166667, + "grad_norm": 16.496000289916992, + "learning_rate": 4.906416845554255e-06, + "loss": 3.511, + "step": 8680 + }, + { + "epoch": 0.088348388671875, + "grad_norm": 16.304975509643555, + "learning_rate": 4.906308429127317e-06, + "loss": 3.3911, + "step": 8685 + }, + { + "epoch": 0.08839925130208333, + "grad_norm": 9.595266342163086, + "learning_rate": 4.906199951135547e-06, + "loss": 3.5494, + "step": 8690 + }, + { + "epoch": 0.08845011393229167, + "grad_norm": 13.496864318847656, + "learning_rate": 4.906091411581722e-06, + "loss": 3.0696, + "step": 8695 + }, + { + "epoch": 0.0885009765625, + "grad_norm": 9.127147674560547, + "learning_rate": 4.905982810468619e-06, + "loss": 3.5201, + "step": 8700 + }, + { + "epoch": 0.08855183919270833, + "grad_norm": 17.793336868286133, + "learning_rate": 4.905874147799015e-06, + "loss": 3.9616, + "step": 8705 + }, + { + "epoch": 0.08860270182291667, + "grad_norm": 12.619993209838867, + "learning_rate": 4.905765423575692e-06, + "loss": 3.6193, + "step": 8710 + }, + { + "epoch": 0.088653564453125, + "grad_norm": 13.0260648727417, + "learning_rate": 4.90565663780143e-06, + "loss": 3.1035, + "step": 8715 + }, + { + "epoch": 0.08870442708333333, + "grad_norm": 11.968487739562988, + "learning_rate": 4.905547790479015e-06, + "loss": 3.8811, + "step": 8720 + }, + { + "epoch": 0.08875528971354167, + "grad_norm": 14.09224796295166, + "learning_rate": 4.905438881611228e-06, + "loss": 3.2689, + "step": 8725 + }, + { + "epoch": 0.08880615234375, + "grad_norm": 15.479565620422363, + "learning_rate": 4.905329911200858e-06, + "loss": 3.3083, + "step": 8730 + }, + { + "epoch": 0.08885701497395833, + "grad_norm": 12.190298080444336, + "learning_rate": 4.905220879250693e-06, + "loss": 3.0763, + "step": 8735 + }, + { + "epoch": 0.08890787760416667, + "grad_norm": 13.899861335754395, + "learning_rate": 4.905111785763521e-06, + "loss": 3.5266, + "step": 8740 + }, + { + "epoch": 0.088958740234375, + "grad_norm": 11.89075756072998, + "learning_rate": 4.905002630742135e-06, + "loss": 3.9321, + "step": 8745 + }, + { + "epoch": 0.08900960286458333, + "grad_norm": 13.489803314208984, + "learning_rate": 4.904893414189326e-06, + "loss": 3.4766, + "step": 8750 + }, + { + "epoch": 0.08906046549479167, + "grad_norm": 13.605609893798828, + "learning_rate": 4.904784136107888e-06, + "loss": 3.1691, + "step": 8755 + }, + { + "epoch": 0.089111328125, + "grad_norm": 10.71879768371582, + "learning_rate": 4.90467479650062e-06, + "loss": 3.2934, + "step": 8760 + }, + { + "epoch": 0.08916219075520833, + "grad_norm": 10.595026016235352, + "learning_rate": 4.9045653953703156e-06, + "loss": 3.1218, + "step": 8765 + }, + { + "epoch": 0.08921305338541667, + "grad_norm": 11.80540943145752, + "learning_rate": 4.9044559327197764e-06, + "loss": 3.2589, + "step": 8770 + }, + { + "epoch": 0.089263916015625, + "grad_norm": 14.13592529296875, + "learning_rate": 4.9043464085518026e-06, + "loss": 3.3531, + "step": 8775 + }, + { + "epoch": 0.08931477864583333, + "grad_norm": 12.386794090270996, + "learning_rate": 4.904236822869195e-06, + "loss": 3.529, + "step": 8780 + }, + { + "epoch": 0.08936564127604167, + "grad_norm": 15.09712028503418, + "learning_rate": 4.904127175674758e-06, + "loss": 3.5851, + "step": 8785 + }, + { + "epoch": 0.08941650390625, + "grad_norm": 22.025758743286133, + "learning_rate": 4.904017466971297e-06, + "loss": 3.4247, + "step": 8790 + }, + { + "epoch": 0.08946736653645833, + "grad_norm": 12.594602584838867, + "learning_rate": 4.9039076967616196e-06, + "loss": 3.5087, + "step": 8795 + }, + { + "epoch": 0.08951822916666667, + "grad_norm": 12.804073333740234, + "learning_rate": 4.903797865048533e-06, + "loss": 3.3348, + "step": 8800 + }, + { + "epoch": 0.089569091796875, + "grad_norm": 10.385623931884766, + "learning_rate": 4.903687971834848e-06, + "loss": 3.4176, + "step": 8805 + }, + { + "epoch": 0.08961995442708333, + "grad_norm": 9.570121765136719, + "learning_rate": 4.903578017123376e-06, + "loss": 3.3923, + "step": 8810 + }, + { + "epoch": 0.08967081705729167, + "grad_norm": 12.22430419921875, + "learning_rate": 4.90346800091693e-06, + "loss": 3.6975, + "step": 8815 + }, + { + "epoch": 0.0897216796875, + "grad_norm": 14.948432922363281, + "learning_rate": 4.9033579232183256e-06, + "loss": 3.483, + "step": 8820 + }, + { + "epoch": 0.08977254231770833, + "grad_norm": 14.922759056091309, + "learning_rate": 4.903247784030377e-06, + "loss": 3.4358, + "step": 8825 + }, + { + "epoch": 0.08982340494791667, + "grad_norm": 11.016508102416992, + "learning_rate": 4.903137583355905e-06, + "loss": 3.4084, + "step": 8830 + }, + { + "epoch": 0.089874267578125, + "grad_norm": 17.078842163085938, + "learning_rate": 4.903027321197726e-06, + "loss": 3.5926, + "step": 8835 + }, + { + "epoch": 0.08992513020833333, + "grad_norm": 13.634777069091797, + "learning_rate": 4.902916997558665e-06, + "loss": 3.5905, + "step": 8840 + }, + { + "epoch": 0.08997599283854167, + "grad_norm": 8.89342212677002, + "learning_rate": 4.902806612441539e-06, + "loss": 3.2723, + "step": 8845 + }, + { + "epoch": 0.09002685546875, + "grad_norm": 14.009095191955566, + "learning_rate": 4.902696165849178e-06, + "loss": 3.5151, + "step": 8850 + }, + { + "epoch": 0.09007771809895833, + "grad_norm": 16.494112014770508, + "learning_rate": 4.902585657784404e-06, + "loss": 3.229, + "step": 8855 + }, + { + "epoch": 0.09012858072916667, + "grad_norm": 9.716590881347656, + "learning_rate": 4.902475088250045e-06, + "loss": 3.4538, + "step": 8860 + }, + { + "epoch": 0.090179443359375, + "grad_norm": 14.583306312561035, + "learning_rate": 4.90236445724893e-06, + "loss": 3.6892, + "step": 8865 + }, + { + "epoch": 0.09023030598958333, + "grad_norm": 11.482134819030762, + "learning_rate": 4.902253764783891e-06, + "loss": 3.3962, + "step": 8870 + }, + { + "epoch": 0.09028116861979167, + "grad_norm": 14.702147483825684, + "learning_rate": 4.902143010857758e-06, + "loss": 3.2805, + "step": 8875 + }, + { + "epoch": 0.09033203125, + "grad_norm": 16.604808807373047, + "learning_rate": 4.902032195473366e-06, + "loss": 3.2619, + "step": 8880 + }, + { + "epoch": 0.09038289388020833, + "grad_norm": 12.936920166015625, + "learning_rate": 4.901921318633549e-06, + "loss": 3.4328, + "step": 8885 + }, + { + "epoch": 0.09043375651041667, + "grad_norm": 7.750491142272949, + "learning_rate": 4.901810380341145e-06, + "loss": 3.7113, + "step": 8890 + }, + { + "epoch": 0.090484619140625, + "grad_norm": 17.840023040771484, + "learning_rate": 4.901699380598992e-06, + "loss": 3.1851, + "step": 8895 + }, + { + "epoch": 0.09053548177083333, + "grad_norm": 9.541340827941895, + "learning_rate": 4.901588319409929e-06, + "loss": 3.5305, + "step": 8900 + }, + { + "epoch": 0.09058634440104167, + "grad_norm": 15.900449752807617, + "learning_rate": 4.901477196776798e-06, + "loss": 3.084, + "step": 8905 + }, + { + "epoch": 0.09063720703125, + "grad_norm": 11.218430519104004, + "learning_rate": 4.901366012702443e-06, + "loss": 3.5674, + "step": 8910 + }, + { + "epoch": 0.09068806966145833, + "grad_norm": 12.750350952148438, + "learning_rate": 4.901254767189707e-06, + "loss": 3.8215, + "step": 8915 + }, + { + "epoch": 0.09073893229166667, + "grad_norm": 13.617959976196289, + "learning_rate": 4.901143460241437e-06, + "loss": 3.3986, + "step": 8920 + }, + { + "epoch": 0.090789794921875, + "grad_norm": 13.11312484741211, + "learning_rate": 4.90103209186048e-06, + "loss": 3.453, + "step": 8925 + }, + { + "epoch": 0.09084065755208333, + "grad_norm": 9.750372886657715, + "learning_rate": 4.9009206620496875e-06, + "loss": 3.348, + "step": 8930 + }, + { + "epoch": 0.09089152018229167, + "grad_norm": 13.289474487304688, + "learning_rate": 4.900809170811908e-06, + "loss": 3.284, + "step": 8935 + }, + { + "epoch": 0.0909423828125, + "grad_norm": 10.578423500061035, + "learning_rate": 4.900697618149995e-06, + "loss": 3.8172, + "step": 8940 + }, + { + "epoch": 0.09099324544270833, + "grad_norm": 14.269426345825195, + "learning_rate": 4.900586004066803e-06, + "loss": 3.8127, + "step": 8945 + }, + { + "epoch": 0.09104410807291667, + "grad_norm": 11.517721176147461, + "learning_rate": 4.900474328565186e-06, + "loss": 3.2798, + "step": 8950 + }, + { + "epoch": 0.091094970703125, + "grad_norm": 12.702051162719727, + "learning_rate": 4.900362591648003e-06, + "loss": 3.1521, + "step": 8955 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 13.737582206726074, + "learning_rate": 4.900250793318112e-06, + "loss": 3.3586, + "step": 8960 + }, + { + "epoch": 0.09119669596354167, + "grad_norm": 14.892784118652344, + "learning_rate": 4.900138933578373e-06, + "loss": 3.4807, + "step": 8965 + }, + { + "epoch": 0.09124755859375, + "grad_norm": 11.482332229614258, + "learning_rate": 4.9000270124316495e-06, + "loss": 3.0206, + "step": 8970 + }, + { + "epoch": 0.09129842122395833, + "grad_norm": 13.81802749633789, + "learning_rate": 4.899915029880803e-06, + "loss": 3.3766, + "step": 8975 + }, + { + "epoch": 0.09134928385416667, + "grad_norm": 11.666979789733887, + "learning_rate": 4.899802985928699e-06, + "loss": 3.2013, + "step": 8980 + }, + { + "epoch": 0.091400146484375, + "grad_norm": 14.300288200378418, + "learning_rate": 4.899690880578205e-06, + "loss": 3.1857, + "step": 8985 + }, + { + "epoch": 0.09145100911458333, + "grad_norm": 9.281865119934082, + "learning_rate": 4.899578713832188e-06, + "loss": 3.4702, + "step": 8990 + }, + { + "epoch": 0.09150187174479167, + "grad_norm": 13.91549015045166, + "learning_rate": 4.899466485693518e-06, + "loss": 3.9119, + "step": 8995 + }, + { + "epoch": 0.091552734375, + "grad_norm": 8.730186462402344, + "learning_rate": 4.899354196165068e-06, + "loss": 3.136, + "step": 9000 + }, + { + "epoch": 0.09160359700520833, + "grad_norm": 9.029413223266602, + "learning_rate": 4.899241845249708e-06, + "loss": 3.6015, + "step": 9005 + }, + { + "epoch": 0.09165445963541667, + "grad_norm": 10.518607139587402, + "learning_rate": 4.899129432950316e-06, + "loss": 3.5339, + "step": 9010 + }, + { + "epoch": 0.091705322265625, + "grad_norm": 13.978117942810059, + "learning_rate": 4.899016959269764e-06, + "loss": 3.6865, + "step": 9015 + }, + { + "epoch": 0.09175618489583333, + "grad_norm": 13.56008243560791, + "learning_rate": 4.898904424210934e-06, + "loss": 3.1692, + "step": 9020 + }, + { + "epoch": 0.09180704752604167, + "grad_norm": 11.420557975769043, + "learning_rate": 4.898791827776701e-06, + "loss": 3.3184, + "step": 9025 + }, + { + "epoch": 0.09185791015625, + "grad_norm": 11.740954399108887, + "learning_rate": 4.898679169969949e-06, + "loss": 3.5797, + "step": 9030 + }, + { + "epoch": 0.09190877278645833, + "grad_norm": 11.412757873535156, + "learning_rate": 4.898566450793558e-06, + "loss": 3.4047, + "step": 9035 + }, + { + "epoch": 0.09195963541666667, + "grad_norm": 7.887825965881348, + "learning_rate": 4.898453670250413e-06, + "loss": 3.3213, + "step": 9040 + }, + { + "epoch": 0.092010498046875, + "grad_norm": 10.620329856872559, + "learning_rate": 4.8983408283433995e-06, + "loss": 3.3956, + "step": 9045 + }, + { + "epoch": 0.09206136067708333, + "grad_norm": 10.935351371765137, + "learning_rate": 4.898227925075405e-06, + "loss": 3.1719, + "step": 9050 + }, + { + "epoch": 0.09211222330729167, + "grad_norm": 13.198664665222168, + "learning_rate": 4.898114960449317e-06, + "loss": 3.2274, + "step": 9055 + }, + { + "epoch": 0.0921630859375, + "grad_norm": 7.661378383636475, + "learning_rate": 4.8980019344680255e-06, + "loss": 3.4533, + "step": 9060 + }, + { + "epoch": 0.09221394856770833, + "grad_norm": 14.63359546661377, + "learning_rate": 4.897888847134424e-06, + "loss": 3.2218, + "step": 9065 + }, + { + "epoch": 0.09226481119791667, + "grad_norm": 10.786942481994629, + "learning_rate": 4.897775698451404e-06, + "loss": 3.0604, + "step": 9070 + }, + { + "epoch": 0.092315673828125, + "grad_norm": 13.290276527404785, + "learning_rate": 4.897662488421861e-06, + "loss": 3.4856, + "step": 9075 + }, + { + "epoch": 0.09236653645833333, + "grad_norm": 13.736598014831543, + "learning_rate": 4.897549217048692e-06, + "loss": 3.4483, + "step": 9080 + }, + { + "epoch": 0.09241739908854167, + "grad_norm": 13.418255805969238, + "learning_rate": 4.897435884334795e-06, + "loss": 3.3985, + "step": 9085 + }, + { + "epoch": 0.09246826171875, + "grad_norm": 11.04183578491211, + "learning_rate": 4.897322490283069e-06, + "loss": 3.746, + "step": 9090 + }, + { + "epoch": 0.09251912434895833, + "grad_norm": 14.375475883483887, + "learning_rate": 4.897209034896414e-06, + "loss": 3.3322, + "step": 9095 + }, + { + "epoch": 0.09256998697916667, + "grad_norm": 12.415508270263672, + "learning_rate": 4.897095518177735e-06, + "loss": 3.221, + "step": 9100 + }, + { + "epoch": 0.092620849609375, + "grad_norm": 13.276843070983887, + "learning_rate": 4.896981940129935e-06, + "loss": 3.1526, + "step": 9105 + }, + { + "epoch": 0.09267171223958333, + "grad_norm": 16.24017333984375, + "learning_rate": 4.8968683007559204e-06, + "loss": 3.1622, + "step": 9110 + }, + { + "epoch": 0.09272257486979167, + "grad_norm": 13.676992416381836, + "learning_rate": 4.8967546000585985e-06, + "loss": 3.4367, + "step": 9115 + }, + { + "epoch": 0.0927734375, + "grad_norm": 12.518017768859863, + "learning_rate": 4.896640838040878e-06, + "loss": 3.149, + "step": 9120 + }, + { + "epoch": 0.09282430013020833, + "grad_norm": 9.173798561096191, + "learning_rate": 4.89652701470567e-06, + "loss": 3.4939, + "step": 9125 + }, + { + "epoch": 0.09287516276041667, + "grad_norm": 9.206432342529297, + "learning_rate": 4.896413130055887e-06, + "loss": 3.2798, + "step": 9130 + }, + { + "epoch": 0.092926025390625, + "grad_norm": 10.400619506835938, + "learning_rate": 4.896299184094441e-06, + "loss": 3.5009, + "step": 9135 + }, + { + "epoch": 0.09297688802083333, + "grad_norm": 14.5686616897583, + "learning_rate": 4.896185176824249e-06, + "loss": 3.0607, + "step": 9140 + }, + { + "epoch": 0.09302775065104167, + "grad_norm": 15.614046096801758, + "learning_rate": 4.8960711082482275e-06, + "loss": 3.5764, + "step": 9145 + }, + { + "epoch": 0.09307861328125, + "grad_norm": 11.736412048339844, + "learning_rate": 4.895956978369294e-06, + "loss": 3.2503, + "step": 9150 + }, + { + "epoch": 0.09312947591145833, + "grad_norm": 8.38943862915039, + "learning_rate": 4.895842787190369e-06, + "loss": 3.5593, + "step": 9155 + }, + { + "epoch": 0.09318033854166667, + "grad_norm": 10.80936050415039, + "learning_rate": 4.895728534714375e-06, + "loss": 3.7306, + "step": 9160 + }, + { + "epoch": 0.093231201171875, + "grad_norm": 8.9891939163208, + "learning_rate": 4.895614220944233e-06, + "loss": 3.5101, + "step": 9165 + }, + { + "epoch": 0.09328206380208333, + "grad_norm": 10.55257511138916, + "learning_rate": 4.895499845882869e-06, + "loss": 3.642, + "step": 9170 + }, + { + "epoch": 0.09333292643229167, + "grad_norm": 11.787149429321289, + "learning_rate": 4.895385409533211e-06, + "loss": 3.3967, + "step": 9175 + }, + { + "epoch": 0.0933837890625, + "grad_norm": 14.418353080749512, + "learning_rate": 4.895270911898183e-06, + "loss": 3.3822, + "step": 9180 + }, + { + "epoch": 0.09343465169270833, + "grad_norm": 10.53858470916748, + "learning_rate": 4.895156352980718e-06, + "loss": 3.5411, + "step": 9185 + }, + { + "epoch": 0.09348551432291667, + "grad_norm": 14.017937660217285, + "learning_rate": 4.895041732783745e-06, + "loss": 3.2294, + "step": 9190 + }, + { + "epoch": 0.093536376953125, + "grad_norm": 9.317304611206055, + "learning_rate": 4.8949270513101965e-06, + "loss": 3.0859, + "step": 9195 + }, + { + "epoch": 0.09358723958333333, + "grad_norm": 14.010085105895996, + "learning_rate": 4.894812308563007e-06, + "loss": 3.3879, + "step": 9200 + }, + { + "epoch": 0.09363810221354167, + "grad_norm": 14.181611061096191, + "learning_rate": 4.8946975045451125e-06, + "loss": 3.4383, + "step": 9205 + }, + { + "epoch": 0.09368896484375, + "grad_norm": 17.099210739135742, + "learning_rate": 4.894582639259451e-06, + "loss": 3.2286, + "step": 9210 + }, + { + "epoch": 0.09373982747395833, + "grad_norm": 15.619363784790039, + "learning_rate": 4.894467712708959e-06, + "loss": 3.5737, + "step": 9215 + }, + { + "epoch": 0.09379069010416667, + "grad_norm": 8.870353698730469, + "learning_rate": 4.8943527248965786e-06, + "loss": 3.2734, + "step": 9220 + }, + { + "epoch": 0.093841552734375, + "grad_norm": 8.589174270629883, + "learning_rate": 4.894237675825251e-06, + "loss": 3.5581, + "step": 9225 + }, + { + "epoch": 0.09389241536458333, + "grad_norm": 16.112144470214844, + "learning_rate": 4.89412256549792e-06, + "loss": 3.4103, + "step": 9230 + }, + { + "epoch": 0.09394327799479167, + "grad_norm": 10.410552978515625, + "learning_rate": 4.89400739391753e-06, + "loss": 3.4895, + "step": 9235 + }, + { + "epoch": 0.093994140625, + "grad_norm": 13.545750617980957, + "learning_rate": 4.89389216108703e-06, + "loss": 3.2598, + "step": 9240 + }, + { + "epoch": 0.09404500325520833, + "grad_norm": 18.490036010742188, + "learning_rate": 4.893776867009365e-06, + "loss": 3.3705, + "step": 9245 + }, + { + "epoch": 0.09409586588541667, + "grad_norm": 12.499258041381836, + "learning_rate": 4.893661511687487e-06, + "loss": 3.3089, + "step": 9250 + }, + { + "epoch": 0.094146728515625, + "grad_norm": 46.41202163696289, + "learning_rate": 4.893546095124346e-06, + "loss": 3.46, + "step": 9255 + }, + { + "epoch": 0.09419759114583333, + "grad_norm": 13.263053894042969, + "learning_rate": 4.893430617322895e-06, + "loss": 3.4889, + "step": 9260 + }, + { + "epoch": 0.09424845377604167, + "grad_norm": 7.173794269561768, + "learning_rate": 4.8933150782860905e-06, + "loss": 3.4028, + "step": 9265 + }, + { + "epoch": 0.09429931640625, + "grad_norm": 15.863984107971191, + "learning_rate": 4.893199478016886e-06, + "loss": 3.1168, + "step": 9270 + }, + { + "epoch": 0.09435017903645833, + "grad_norm": 9.970428466796875, + "learning_rate": 4.8930838165182405e-06, + "loss": 3.3857, + "step": 9275 + }, + { + "epoch": 0.09440104166666667, + "grad_norm": 6.910197734832764, + "learning_rate": 4.892968093793112e-06, + "loss": 3.6812, + "step": 9280 + }, + { + "epoch": 0.094451904296875, + "grad_norm": 12.154581069946289, + "learning_rate": 4.892852309844462e-06, + "loss": 3.459, + "step": 9285 + }, + { + "epoch": 0.09450276692708333, + "grad_norm": 13.089887619018555, + "learning_rate": 4.892736464675254e-06, + "loss": 3.75, + "step": 9290 + }, + { + "epoch": 0.09455362955729167, + "grad_norm": 12.283695220947266, + "learning_rate": 4.89262055828845e-06, + "loss": 3.5142, + "step": 9295 + }, + { + "epoch": 0.0946044921875, + "grad_norm": 16.34111976623535, + "learning_rate": 4.892504590687016e-06, + "loss": 3.4612, + "step": 9300 + }, + { + "epoch": 0.09465535481770833, + "grad_norm": 14.332940101623535, + "learning_rate": 4.89238856187392e-06, + "loss": 2.9819, + "step": 9305 + }, + { + "epoch": 0.09470621744791667, + "grad_norm": 9.912870407104492, + "learning_rate": 4.892272471852128e-06, + "loss": 3.3375, + "step": 9310 + }, + { + "epoch": 0.094757080078125, + "grad_norm": 18.02761459350586, + "learning_rate": 4.892156320624613e-06, + "loss": 3.5184, + "step": 9315 + }, + { + "epoch": 0.09480794270833333, + "grad_norm": 14.656006813049316, + "learning_rate": 4.892040108194346e-06, + "loss": 3.7168, + "step": 9320 + }, + { + "epoch": 0.09485880533854167, + "grad_norm": 14.194197654724121, + "learning_rate": 4.8919238345643e-06, + "loss": 3.7077, + "step": 9325 + }, + { + "epoch": 0.09490966796875, + "grad_norm": 14.731765747070312, + "learning_rate": 4.891807499737449e-06, + "loss": 3.526, + "step": 9330 + }, + { + "epoch": 0.09496053059895833, + "grad_norm": 11.650579452514648, + "learning_rate": 4.891691103716769e-06, + "loss": 3.2138, + "step": 9335 + }, + { + "epoch": 0.09501139322916667, + "grad_norm": 8.169177055358887, + "learning_rate": 4.89157464650524e-06, + "loss": 3.4209, + "step": 9340 + }, + { + "epoch": 0.095062255859375, + "grad_norm": 13.967549324035645, + "learning_rate": 4.89145812810584e-06, + "loss": 3.523, + "step": 9345 + }, + { + "epoch": 0.09511311848958333, + "grad_norm": 15.903494834899902, + "learning_rate": 4.891341548521552e-06, + "loss": 3.5998, + "step": 9350 + }, + { + "epoch": 0.09516398111979167, + "grad_norm": 18.27981185913086, + "learning_rate": 4.8912249077553566e-06, + "loss": 4.1232, + "step": 9355 + }, + { + "epoch": 0.09521484375, + "grad_norm": 12.90363597869873, + "learning_rate": 4.8911082058102375e-06, + "loss": 3.6707, + "step": 9360 + }, + { + "epoch": 0.09526570638020833, + "grad_norm": 15.968925476074219, + "learning_rate": 4.890991442689184e-06, + "loss": 3.1965, + "step": 9365 + }, + { + "epoch": 0.09531656901041667, + "grad_norm": 12.801103591918945, + "learning_rate": 4.890874618395179e-06, + "loss": 3.2281, + "step": 9370 + }, + { + "epoch": 0.095367431640625, + "grad_norm": 9.642354011535645, + "learning_rate": 4.890757732931215e-06, + "loss": 3.6309, + "step": 9375 + }, + { + "epoch": 0.09541829427083333, + "grad_norm": 18.670930862426758, + "learning_rate": 4.8906407863002805e-06, + "loss": 3.7841, + "step": 9380 + }, + { + "epoch": 0.09546915690104167, + "grad_norm": 12.024273872375488, + "learning_rate": 4.8905237785053675e-06, + "loss": 3.3917, + "step": 9385 + }, + { + "epoch": 0.09552001953125, + "grad_norm": 8.89647102355957, + "learning_rate": 4.8904067095494714e-06, + "loss": 3.703, + "step": 9390 + }, + { + "epoch": 0.09557088216145833, + "grad_norm": 11.106717109680176, + "learning_rate": 4.890289579435585e-06, + "loss": 3.2822, + "step": 9395 + }, + { + "epoch": 0.09562174479166667, + "grad_norm": 14.37429428100586, + "learning_rate": 4.8901723881667075e-06, + "loss": 3.3098, + "step": 9400 + }, + { + "epoch": 0.095672607421875, + "grad_norm": 12.311078071594238, + "learning_rate": 4.890055135745835e-06, + "loss": 3.1674, + "step": 9405 + }, + { + "epoch": 0.09572347005208333, + "grad_norm": 13.66702938079834, + "learning_rate": 4.88993782217597e-06, + "loss": 3.0287, + "step": 9410 + }, + { + "epoch": 0.09577433268229167, + "grad_norm": 11.145536422729492, + "learning_rate": 4.889820447460111e-06, + "loss": 3.3768, + "step": 9415 + }, + { + "epoch": 0.0958251953125, + "grad_norm": 12.304450988769531, + "learning_rate": 4.889703011601262e-06, + "loss": 3.5019, + "step": 9420 + }, + { + "epoch": 0.09587605794270833, + "grad_norm": 9.890800476074219, + "learning_rate": 4.889585514602429e-06, + "loss": 3.1514, + "step": 9425 + }, + { + "epoch": 0.09592692057291667, + "grad_norm": 9.260703086853027, + "learning_rate": 4.889467956466616e-06, + "loss": 3.4432, + "step": 9430 + }, + { + "epoch": 0.095977783203125, + "grad_norm": 13.874801635742188, + "learning_rate": 4.889350337196832e-06, + "loss": 3.2378, + "step": 9435 + }, + { + "epoch": 0.09602864583333333, + "grad_norm": 13.591817855834961, + "learning_rate": 4.889232656796086e-06, + "loss": 3.4383, + "step": 9440 + }, + { + "epoch": 0.09607950846354167, + "grad_norm": 9.10405445098877, + "learning_rate": 4.8891149152673875e-06, + "loss": 3.4487, + "step": 9445 + }, + { + "epoch": 0.09613037109375, + "grad_norm": 8.167961120605469, + "learning_rate": 4.888997112613752e-06, + "loss": 3.6359, + "step": 9450 + }, + { + "epoch": 0.09618123372395833, + "grad_norm": 14.467639923095703, + "learning_rate": 4.888879248838191e-06, + "loss": 3.1424, + "step": 9455 + }, + { + "epoch": 0.09623209635416667, + "grad_norm": 12.446633338928223, + "learning_rate": 4.888761323943721e-06, + "loss": 3.5426, + "step": 9460 + }, + { + "epoch": 0.096282958984375, + "grad_norm": 11.570879936218262, + "learning_rate": 4.888643337933358e-06, + "loss": 3.5324, + "step": 9465 + }, + { + "epoch": 0.09633382161458333, + "grad_norm": 13.078673362731934, + "learning_rate": 4.8885252908101226e-06, + "loss": 3.3334, + "step": 9470 + }, + { + "epoch": 0.09638468424479167, + "grad_norm": 10.936042785644531, + "learning_rate": 4.888407182577032e-06, + "loss": 3.2394, + "step": 9475 + }, + { + "epoch": 0.096435546875, + "grad_norm": 13.004395484924316, + "learning_rate": 4.888289013237112e-06, + "loss": 3.3966, + "step": 9480 + }, + { + "epoch": 0.09648640950520833, + "grad_norm": 11.469902992248535, + "learning_rate": 4.888170782793382e-06, + "loss": 3.717, + "step": 9485 + }, + { + "epoch": 0.09653727213541667, + "grad_norm": 14.799452781677246, + "learning_rate": 4.888052491248869e-06, + "loss": 3.4483, + "step": 9490 + }, + { + "epoch": 0.096588134765625, + "grad_norm": 10.995068550109863, + "learning_rate": 4.887934138606599e-06, + "loss": 3.3911, + "step": 9495 + }, + { + "epoch": 0.09663899739583333, + "grad_norm": 13.323307037353516, + "learning_rate": 4.8878157248696e-06, + "loss": 3.749, + "step": 9500 + }, + { + "epoch": 0.09668986002604167, + "grad_norm": 20.117530822753906, + "learning_rate": 4.887697250040901e-06, + "loss": 3.5721, + "step": 9505 + }, + { + "epoch": 0.09674072265625, + "grad_norm": 10.738547325134277, + "learning_rate": 4.887578714123536e-06, + "loss": 3.3881, + "step": 9510 + }, + { + "epoch": 0.09679158528645833, + "grad_norm": 12.129579544067383, + "learning_rate": 4.887460117120533e-06, + "loss": 3.5233, + "step": 9515 + }, + { + "epoch": 0.09684244791666667, + "grad_norm": 14.902997970581055, + "learning_rate": 4.88734145903493e-06, + "loss": 3.5717, + "step": 9520 + }, + { + "epoch": 0.096893310546875, + "grad_norm": 11.737396240234375, + "learning_rate": 4.887222739869761e-06, + "loss": 3.175, + "step": 9525 + }, + { + "epoch": 0.09694417317708333, + "grad_norm": 6.6605730056762695, + "learning_rate": 4.8871039596280654e-06, + "loss": 3.4191, + "step": 9530 + }, + { + "epoch": 0.09699503580729167, + "grad_norm": 11.67459487915039, + "learning_rate": 4.88698511831288e-06, + "loss": 3.4298, + "step": 9535 + }, + { + "epoch": 0.0970458984375, + "grad_norm": 7.061823844909668, + "learning_rate": 4.886866215927246e-06, + "loss": 3.4257, + "step": 9540 + }, + { + "epoch": 0.09709676106770833, + "grad_norm": 14.894327163696289, + "learning_rate": 4.8867472524742055e-06, + "loss": 3.4476, + "step": 9545 + }, + { + "epoch": 0.09714762369791667, + "grad_norm": 14.296971321105957, + "learning_rate": 4.8866282279568024e-06, + "loss": 3.7858, + "step": 9550 + }, + { + "epoch": 0.097198486328125, + "grad_norm": 12.031661033630371, + "learning_rate": 4.886509142378082e-06, + "loss": 3.5772, + "step": 9555 + }, + { + "epoch": 0.09724934895833333, + "grad_norm": 13.860501289367676, + "learning_rate": 4.88638999574109e-06, + "loss": 3.259, + "step": 9560 + }, + { + "epoch": 0.09730021158854167, + "grad_norm": 12.739351272583008, + "learning_rate": 4.886270788048877e-06, + "loss": 3.358, + "step": 9565 + }, + { + "epoch": 0.09735107421875, + "grad_norm": 8.996005058288574, + "learning_rate": 4.8861515193044905e-06, + "loss": 3.0429, + "step": 9570 + }, + { + "epoch": 0.09740193684895833, + "grad_norm": 10.95101261138916, + "learning_rate": 4.886032189510983e-06, + "loss": 3.8154, + "step": 9575 + }, + { + "epoch": 0.09745279947916667, + "grad_norm": 13.15895938873291, + "learning_rate": 4.885912798671408e-06, + "loss": 3.3369, + "step": 9580 + }, + { + "epoch": 0.097503662109375, + "grad_norm": 13.110836029052734, + "learning_rate": 4.885793346788819e-06, + "loss": 3.5893, + "step": 9585 + }, + { + "epoch": 0.09755452473958333, + "grad_norm": 13.61912727355957, + "learning_rate": 4.885673833866273e-06, + "loss": 3.3841, + "step": 9590 + }, + { + "epoch": 0.09760538736979167, + "grad_norm": 8.887752532958984, + "learning_rate": 4.885554259906827e-06, + "loss": 3.7583, + "step": 9595 + }, + { + "epoch": 0.09765625, + "grad_norm": 14.288346290588379, + "learning_rate": 4.885434624913541e-06, + "loss": 3.0627, + "step": 9600 + }, + { + "epoch": 0.09770711263020833, + "grad_norm": 13.13482666015625, + "learning_rate": 4.8853149288894765e-06, + "loss": 3.6814, + "step": 9605 + }, + { + "epoch": 0.09775797526041667, + "grad_norm": 10.950799942016602, + "learning_rate": 4.885195171837694e-06, + "loss": 3.0417, + "step": 9610 + }, + { + "epoch": 0.097808837890625, + "grad_norm": 15.985264778137207, + "learning_rate": 4.885075353761258e-06, + "loss": 3.4947, + "step": 9615 + }, + { + "epoch": 0.09785970052083333, + "grad_norm": 11.092909812927246, + "learning_rate": 4.884955474663235e-06, + "loss": 3.6053, + "step": 9620 + }, + { + "epoch": 0.09791056315104167, + "grad_norm": 10.567216873168945, + "learning_rate": 4.884835534546692e-06, + "loss": 3.5717, + "step": 9625 + }, + { + "epoch": 0.09796142578125, + "grad_norm": 16.9993896484375, + "learning_rate": 4.884715533414696e-06, + "loss": 3.8297, + "step": 9630 + }, + { + "epoch": 0.09801228841145833, + "grad_norm": 9.653956413269043, + "learning_rate": 4.884595471270319e-06, + "loss": 3.2011, + "step": 9635 + }, + { + "epoch": 0.09806315104166667, + "grad_norm": 7.467728137969971, + "learning_rate": 4.884475348116631e-06, + "loss": 3.3446, + "step": 9640 + }, + { + "epoch": 0.098114013671875, + "grad_norm": 16.218908309936523, + "learning_rate": 4.884355163956708e-06, + "loss": 3.4298, + "step": 9645 + }, + { + "epoch": 0.09816487630208333, + "grad_norm": 8.244502067565918, + "learning_rate": 4.884234918793622e-06, + "loss": 3.3024, + "step": 9650 + }, + { + "epoch": 0.09821573893229167, + "grad_norm": 13.385653495788574, + "learning_rate": 4.884114612630451e-06, + "loss": 3.2548, + "step": 9655 + }, + { + "epoch": 0.0982666015625, + "grad_norm": 12.53419303894043, + "learning_rate": 4.883994245470274e-06, + "loss": 3.0654, + "step": 9660 + }, + { + "epoch": 0.09831746419270833, + "grad_norm": 7.7406005859375, + "learning_rate": 4.883873817316168e-06, + "loss": 3.7867, + "step": 9665 + }, + { + "epoch": 0.09836832682291667, + "grad_norm": 16.32261085510254, + "learning_rate": 4.883753328171216e-06, + "loss": 3.5244, + "step": 9670 + }, + { + "epoch": 0.098419189453125, + "grad_norm": 14.094369888305664, + "learning_rate": 4.8836327780385e-06, + "loss": 3.6854, + "step": 9675 + }, + { + "epoch": 0.09847005208333333, + "grad_norm": 12.182512283325195, + "learning_rate": 4.883512166921104e-06, + "loss": 3.5938, + "step": 9680 + }, + { + "epoch": 0.09852091471354167, + "grad_norm": 11.643973350524902, + "learning_rate": 4.883391494822114e-06, + "loss": 3.3795, + "step": 9685 + }, + { + "epoch": 0.09857177734375, + "grad_norm": 13.109829902648926, + "learning_rate": 4.883270761744617e-06, + "loss": 3.4673, + "step": 9690 + }, + { + "epoch": 0.09862263997395833, + "grad_norm": 9.588884353637695, + "learning_rate": 4.883149967691704e-06, + "loss": 3.4358, + "step": 9695 + }, + { + "epoch": 0.09867350260416667, + "grad_norm": 16.999698638916016, + "learning_rate": 4.883029112666463e-06, + "loss": 3.4918, + "step": 9700 + }, + { + "epoch": 0.098724365234375, + "grad_norm": 14.010063171386719, + "learning_rate": 4.882908196671987e-06, + "loss": 3.5411, + "step": 9705 + }, + { + "epoch": 0.09877522786458333, + "grad_norm": 14.857710838317871, + "learning_rate": 4.88278721971137e-06, + "loss": 3.4874, + "step": 9710 + }, + { + "epoch": 0.09882609049479167, + "grad_norm": 12.407607078552246, + "learning_rate": 4.882666181787707e-06, + "loss": 3.2071, + "step": 9715 + }, + { + "epoch": 0.098876953125, + "grad_norm": 12.295507431030273, + "learning_rate": 4.882545082904094e-06, + "loss": 3.3899, + "step": 9720 + }, + { + "epoch": 0.09892781575520833, + "grad_norm": 12.28351879119873, + "learning_rate": 4.88242392306363e-06, + "loss": 3.8177, + "step": 9725 + }, + { + "epoch": 0.09897867838541667, + "grad_norm": 8.689821243286133, + "learning_rate": 4.882302702269415e-06, + "loss": 3.5002, + "step": 9730 + }, + { + "epoch": 0.099029541015625, + "grad_norm": 10.74190616607666, + "learning_rate": 4.882181420524548e-06, + "loss": 3.1903, + "step": 9735 + }, + { + "epoch": 0.09908040364583333, + "grad_norm": 11.707921981811523, + "learning_rate": 4.882060077832137e-06, + "loss": 3.2921, + "step": 9740 + }, + { + "epoch": 0.09913126627604167, + "grad_norm": 13.249473571777344, + "learning_rate": 4.881938674195282e-06, + "loss": 3.3386, + "step": 9745 + }, + { + "epoch": 0.09918212890625, + "grad_norm": 16.1980037689209, + "learning_rate": 4.88181720961709e-06, + "loss": 3.5874, + "step": 9750 + }, + { + "epoch": 0.09923299153645833, + "grad_norm": 14.430301666259766, + "learning_rate": 4.88169568410067e-06, + "loss": 3.2323, + "step": 9755 + }, + { + "epoch": 0.09928385416666667, + "grad_norm": 8.47429084777832, + "learning_rate": 4.881574097649131e-06, + "loss": 3.2952, + "step": 9760 + }, + { + "epoch": 0.099334716796875, + "grad_norm": 8.88611888885498, + "learning_rate": 4.881452450265583e-06, + "loss": 3.3712, + "step": 9765 + }, + { + "epoch": 0.09938557942708333, + "grad_norm": 10.999044418334961, + "learning_rate": 4.881330741953137e-06, + "loss": 3.504, + "step": 9770 + }, + { + "epoch": 0.09943644205729167, + "grad_norm": 13.003782272338867, + "learning_rate": 4.88120897271491e-06, + "loss": 3.8129, + "step": 9775 + }, + { + "epoch": 0.0994873046875, + "grad_norm": 11.573512077331543, + "learning_rate": 4.881087142554015e-06, + "loss": 3.3195, + "step": 9780 + }, + { + "epoch": 0.09953816731770833, + "grad_norm": 16.704687118530273, + "learning_rate": 4.880965251473571e-06, + "loss": 3.0435, + "step": 9785 + }, + { + "epoch": 0.09958902994791667, + "grad_norm": 13.198221206665039, + "learning_rate": 4.8808432994766944e-06, + "loss": 3.2013, + "step": 9790 + }, + { + "epoch": 0.099639892578125, + "grad_norm": 9.851302146911621, + "learning_rate": 4.880721286566506e-06, + "loss": 3.3261, + "step": 9795 + }, + { + "epoch": 0.09969075520833333, + "grad_norm": 14.158812522888184, + "learning_rate": 4.880599212746128e-06, + "loss": 3.2733, + "step": 9800 + }, + { + "epoch": 0.09974161783854167, + "grad_norm": 15.338472366333008, + "learning_rate": 4.880477078018684e-06, + "loss": 3.3534, + "step": 9805 + }, + { + "epoch": 0.09979248046875, + "grad_norm": 14.599812507629395, + "learning_rate": 4.8803548823872985e-06, + "loss": 3.6244, + "step": 9810 + }, + { + "epoch": 0.09984334309895833, + "grad_norm": 11.007782936096191, + "learning_rate": 4.880232625855096e-06, + "loss": 3.0961, + "step": 9815 + }, + { + "epoch": 0.09989420572916667, + "grad_norm": 13.516772270202637, + "learning_rate": 4.880110308425207e-06, + "loss": 3.3936, + "step": 9820 + }, + { + "epoch": 0.099945068359375, + "grad_norm": 7.3785905838012695, + "learning_rate": 4.8799879301007596e-06, + "loss": 3.7464, + "step": 9825 + }, + { + "epoch": 0.09999593098958333, + "grad_norm": 38.93828582763672, + "learning_rate": 4.879865490884886e-06, + "loss": 3.5595, + "step": 9830 + }, + { + "epoch": 0.10004679361979167, + "grad_norm": 14.068145751953125, + "learning_rate": 4.879742990780717e-06, + "loss": 3.5197, + "step": 9835 + }, + { + "epoch": 0.10009765625, + "grad_norm": 11.864350318908691, + "learning_rate": 4.879620429791387e-06, + "loss": 3.601, + "step": 9840 + }, + { + "epoch": 0.10014851888020833, + "grad_norm": 15.89621639251709, + "learning_rate": 4.879497807920034e-06, + "loss": 3.2539, + "step": 9845 + }, + { + "epoch": 0.10019938151041667, + "grad_norm": 11.07425308227539, + "learning_rate": 4.8793751251697925e-06, + "loss": 3.5963, + "step": 9850 + }, + { + "epoch": 0.100250244140625, + "grad_norm": 10.702958106994629, + "learning_rate": 4.879252381543803e-06, + "loss": 3.6547, + "step": 9855 + }, + { + "epoch": 0.10030110677083333, + "grad_norm": 11.077383995056152, + "learning_rate": 4.879129577045204e-06, + "loss": 3.3366, + "step": 9860 + }, + { + "epoch": 0.10035196940104167, + "grad_norm": 11.844645500183105, + "learning_rate": 4.87900671167714e-06, + "loss": 3.377, + "step": 9865 + }, + { + "epoch": 0.10040283203125, + "grad_norm": 12.209272384643555, + "learning_rate": 4.8788837854427525e-06, + "loss": 4.0542, + "step": 9870 + }, + { + "epoch": 0.10045369466145833, + "grad_norm": 12.173541069030762, + "learning_rate": 4.878760798345188e-06, + "loss": 3.5212, + "step": 9875 + }, + { + "epoch": 0.10050455729166667, + "grad_norm": 12.681471824645996, + "learning_rate": 4.878637750387591e-06, + "loss": 3.8139, + "step": 9880 + }, + { + "epoch": 0.100555419921875, + "grad_norm": 8.942806243896484, + "learning_rate": 4.878514641573112e-06, + "loss": 3.5788, + "step": 9885 + }, + { + "epoch": 0.10060628255208333, + "grad_norm": 11.298273086547852, + "learning_rate": 4.8783914719048995e-06, + "loss": 3.3029, + "step": 9890 + }, + { + "epoch": 0.10065714518229167, + "grad_norm": 9.567761421203613, + "learning_rate": 4.8782682413861046e-06, + "loss": 3.7846, + "step": 9895 + }, + { + "epoch": 0.1007080078125, + "grad_norm": 10.052159309387207, + "learning_rate": 4.8781449500198804e-06, + "loss": 3.3519, + "step": 9900 + }, + { + "epoch": 0.10075887044270833, + "grad_norm": 16.80272674560547, + "learning_rate": 4.878021597809382e-06, + "loss": 3.7498, + "step": 9905 + }, + { + "epoch": 0.10080973307291667, + "grad_norm": 11.9678316116333, + "learning_rate": 4.877898184757765e-06, + "loss": 3.7687, + "step": 9910 + }, + { + "epoch": 0.100860595703125, + "grad_norm": 14.82444953918457, + "learning_rate": 4.877774710868185e-06, + "loss": 3.4121, + "step": 9915 + }, + { + "epoch": 0.10091145833333333, + "grad_norm": 12.413556098937988, + "learning_rate": 4.877651176143804e-06, + "loss": 3.5701, + "step": 9920 + }, + { + "epoch": 0.10096232096354167, + "grad_norm": 16.419466018676758, + "learning_rate": 4.877527580587781e-06, + "loss": 3.5236, + "step": 9925 + }, + { + "epoch": 0.10101318359375, + "grad_norm": 15.198001861572266, + "learning_rate": 4.877403924203278e-06, + "loss": 3.4151, + "step": 9930 + }, + { + "epoch": 0.10106404622395833, + "grad_norm": 11.96876335144043, + "learning_rate": 4.877280206993459e-06, + "loss": 3.1713, + "step": 9935 + }, + { + "epoch": 0.10111490885416667, + "grad_norm": 12.78167724609375, + "learning_rate": 4.8771564289614895e-06, + "loss": 3.3529, + "step": 9940 + }, + { + "epoch": 0.101165771484375, + "grad_norm": 14.186490058898926, + "learning_rate": 4.877032590110536e-06, + "loss": 3.8994, + "step": 9945 + }, + { + "epoch": 0.10121663411458333, + "grad_norm": 9.95617389678955, + "learning_rate": 4.876908690443767e-06, + "loss": 3.402, + "step": 9950 + }, + { + "epoch": 0.10126749674479167, + "grad_norm": 14.145368576049805, + "learning_rate": 4.876784729964353e-06, + "loss": 3.5655, + "step": 9955 + }, + { + "epoch": 0.101318359375, + "grad_norm": 10.90357780456543, + "learning_rate": 4.876660708675465e-06, + "loss": 3.4442, + "step": 9960 + }, + { + "epoch": 0.10136922200520833, + "grad_norm": 13.902193069458008, + "learning_rate": 4.876536626580276e-06, + "loss": 3.3617, + "step": 9965 + }, + { + "epoch": 0.10142008463541667, + "grad_norm": 8.821534156799316, + "learning_rate": 4.876412483681961e-06, + "loss": 3.445, + "step": 9970 + }, + { + "epoch": 0.101470947265625, + "grad_norm": 11.418331146240234, + "learning_rate": 4.8762882799836955e-06, + "loss": 3.4656, + "step": 9975 + }, + { + "epoch": 0.10152180989583333, + "grad_norm": 11.632497787475586, + "learning_rate": 4.876164015488658e-06, + "loss": 3.2188, + "step": 9980 + }, + { + "epoch": 0.10157267252604167, + "grad_norm": 10.307924270629883, + "learning_rate": 4.876039690200027e-06, + "loss": 3.5271, + "step": 9985 + }, + { + "epoch": 0.10162353515625, + "grad_norm": 11.517049789428711, + "learning_rate": 4.875915304120984e-06, + "loss": 3.5669, + "step": 9990 + }, + { + "epoch": 0.10167439778645833, + "grad_norm": 9.514555931091309, + "learning_rate": 4.875790857254711e-06, + "loss": 3.3767, + "step": 9995 + }, + { + "epoch": 0.10172526041666667, + "grad_norm": 13.482763290405273, + "learning_rate": 4.875666349604392e-06, + "loss": 3.5474, + "step": 10000 + }, + { + "epoch": 0.101776123046875, + "grad_norm": 8.935997009277344, + "learning_rate": 4.875541781173212e-06, + "loss": 3.4632, + "step": 10005 + }, + { + "epoch": 0.10182698567708333, + "grad_norm": 8.468302726745605, + "learning_rate": 4.875417151964359e-06, + "loss": 3.4322, + "step": 10010 + }, + { + "epoch": 0.10187784830729167, + "grad_norm": 12.212846755981445, + "learning_rate": 4.875292461981022e-06, + "loss": 4.2413, + "step": 10015 + }, + { + "epoch": 0.1019287109375, + "grad_norm": 11.472529411315918, + "learning_rate": 4.87516771122639e-06, + "loss": 3.2805, + "step": 10020 + }, + { + "epoch": 0.10197957356770833, + "grad_norm": 13.230598449707031, + "learning_rate": 4.875042899703654e-06, + "loss": 3.4256, + "step": 10025 + }, + { + "epoch": 0.10203043619791667, + "grad_norm": 13.139010429382324, + "learning_rate": 4.874918027416009e-06, + "loss": 3.5081, + "step": 10030 + }, + { + "epoch": 0.102081298828125, + "grad_norm": 6.4391703605651855, + "learning_rate": 4.874793094366649e-06, + "loss": 3.3638, + "step": 10035 + }, + { + "epoch": 0.10213216145833333, + "grad_norm": 15.451169967651367, + "learning_rate": 4.8746681005587715e-06, + "loss": 3.6243, + "step": 10040 + }, + { + "epoch": 0.10218302408854167, + "grad_norm": 11.433987617492676, + "learning_rate": 4.874543045995572e-06, + "loss": 3.6848, + "step": 10045 + }, + { + "epoch": 0.10223388671875, + "grad_norm": 8.72944450378418, + "learning_rate": 4.874417930680253e-06, + "loss": 3.3524, + "step": 10050 + }, + { + "epoch": 0.10228474934895833, + "grad_norm": 15.81502628326416, + "learning_rate": 4.874292754616014e-06, + "loss": 3.3923, + "step": 10055 + }, + { + "epoch": 0.10233561197916667, + "grad_norm": 12.636479377746582, + "learning_rate": 4.8741675178060565e-06, + "loss": 3.2582, + "step": 10060 + }, + { + "epoch": 0.102386474609375, + "grad_norm": 10.026837348937988, + "learning_rate": 4.874042220253586e-06, + "loss": 3.5174, + "step": 10065 + }, + { + "epoch": 0.10243733723958333, + "grad_norm": 14.83649730682373, + "learning_rate": 4.8739168619618086e-06, + "loss": 3.5073, + "step": 10070 + }, + { + "epoch": 0.10248819986979167, + "grad_norm": 15.73272705078125, + "learning_rate": 4.873791442933931e-06, + "loss": 3.3724, + "step": 10075 + }, + { + "epoch": 0.1025390625, + "grad_norm": 15.510259628295898, + "learning_rate": 4.873665963173161e-06, + "loss": 3.1763, + "step": 10080 + }, + { + "epoch": 0.10258992513020833, + "grad_norm": 8.296263694763184, + "learning_rate": 4.873540422682711e-06, + "loss": 3.4604, + "step": 10085 + }, + { + "epoch": 0.10264078776041667, + "grad_norm": 11.954290390014648, + "learning_rate": 4.873414821465792e-06, + "loss": 3.4896, + "step": 10090 + }, + { + "epoch": 0.102691650390625, + "grad_norm": 10.191630363464355, + "learning_rate": 4.873289159525617e-06, + "loss": 3.5075, + "step": 10095 + }, + { + "epoch": 0.10274251302083333, + "grad_norm": 14.362115859985352, + "learning_rate": 4.873163436865401e-06, + "loss": 3.0294, + "step": 10100 + }, + { + "epoch": 0.10279337565104167, + "grad_norm": 10.870323181152344, + "learning_rate": 4.873037653488361e-06, + "loss": 3.5388, + "step": 10105 + }, + { + "epoch": 0.10284423828125, + "grad_norm": 14.673710823059082, + "learning_rate": 4.872911809397715e-06, + "loss": 3.2948, + "step": 10110 + }, + { + "epoch": 0.10289510091145833, + "grad_norm": 10.174817085266113, + "learning_rate": 4.872785904596684e-06, + "loss": 3.831, + "step": 10115 + }, + { + "epoch": 0.10294596354166667, + "grad_norm": 13.638348579406738, + "learning_rate": 4.8726599390884866e-06, + "loss": 3.2765, + "step": 10120 + }, + { + "epoch": 0.102996826171875, + "grad_norm": 14.297739028930664, + "learning_rate": 4.872533912876348e-06, + "loss": 3.4857, + "step": 10125 + }, + { + "epoch": 0.10304768880208333, + "grad_norm": 8.859086036682129, + "learning_rate": 4.872407825963491e-06, + "loss": 3.5332, + "step": 10130 + }, + { + "epoch": 0.10309855143229167, + "grad_norm": 17.016996383666992, + "learning_rate": 4.872281678353142e-06, + "loss": 3.858, + "step": 10135 + }, + { + "epoch": 0.1031494140625, + "grad_norm": 7.245146751403809, + "learning_rate": 4.872155470048529e-06, + "loss": 3.3026, + "step": 10140 + }, + { + "epoch": 0.10320027669270833, + "grad_norm": 13.683245658874512, + "learning_rate": 4.87202920105288e-06, + "loss": 3.5045, + "step": 10145 + }, + { + "epoch": 0.10325113932291667, + "grad_norm": 10.41574478149414, + "learning_rate": 4.871902871369427e-06, + "loss": 3.289, + "step": 10150 + }, + { + "epoch": 0.103302001953125, + "grad_norm": 13.628034591674805, + "learning_rate": 4.871776481001401e-06, + "loss": 3.6961, + "step": 10155 + }, + { + "epoch": 0.10335286458333333, + "grad_norm": 9.731292724609375, + "learning_rate": 4.8716500299520356e-06, + "loss": 3.4743, + "step": 10160 + }, + { + "epoch": 0.10340372721354167, + "grad_norm": 15.618063926696777, + "learning_rate": 4.871523518224567e-06, + "loss": 3.0353, + "step": 10165 + }, + { + "epoch": 0.10345458984375, + "grad_norm": 12.420486450195312, + "learning_rate": 4.87139694582223e-06, + "loss": 3.5414, + "step": 10170 + }, + { + "epoch": 0.10350545247395833, + "grad_norm": 9.918728828430176, + "learning_rate": 4.871270312748265e-06, + "loss": 3.6799, + "step": 10175 + }, + { + "epoch": 0.10355631510416667, + "grad_norm": 10.1067533493042, + "learning_rate": 4.871143619005911e-06, + "loss": 3.4345, + "step": 10180 + }, + { + "epoch": 0.103607177734375, + "grad_norm": 6.703737735748291, + "learning_rate": 4.87101686459841e-06, + "loss": 3.2298, + "step": 10185 + }, + { + "epoch": 0.10365804036458333, + "grad_norm": 15.222867012023926, + "learning_rate": 4.8708900495290035e-06, + "loss": 3.6949, + "step": 10190 + }, + { + "epoch": 0.10370890299479167, + "grad_norm": 12.256555557250977, + "learning_rate": 4.8707631738009376e-06, + "loss": 3.4872, + "step": 10195 + }, + { + "epoch": 0.103759765625, + "grad_norm": 8.05500602722168, + "learning_rate": 4.870636237417458e-06, + "loss": 3.5136, + "step": 10200 + }, + { + "epoch": 0.10381062825520833, + "grad_norm": 9.914165496826172, + "learning_rate": 4.870509240381812e-06, + "loss": 3.5153, + "step": 10205 + }, + { + "epoch": 0.10386149088541667, + "grad_norm": 14.44861125946045, + "learning_rate": 4.8703821826972495e-06, + "loss": 3.4795, + "step": 10210 + }, + { + "epoch": 0.103912353515625, + "grad_norm": 12.164118766784668, + "learning_rate": 4.87025506436702e-06, + "loss": 3.4607, + "step": 10215 + }, + { + "epoch": 0.10396321614583333, + "grad_norm": 15.556924819946289, + "learning_rate": 4.8701278853943764e-06, + "loss": 3.3538, + "step": 10220 + }, + { + "epoch": 0.10401407877604167, + "grad_norm": 14.256190299987793, + "learning_rate": 4.870000645782573e-06, + "loss": 3.3421, + "step": 10225 + }, + { + "epoch": 0.10406494140625, + "grad_norm": 13.732766151428223, + "learning_rate": 4.869873345534865e-06, + "loss": 3.0725, + "step": 10230 + }, + { + "epoch": 0.10411580403645833, + "grad_norm": 16.038616180419922, + "learning_rate": 4.869745984654508e-06, + "loss": 3.212, + "step": 10235 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 14.277772903442383, + "learning_rate": 4.8696185631447635e-06, + "loss": 3.6345, + "step": 10240 + }, + { + "epoch": 0.104217529296875, + "grad_norm": 10.199970245361328, + "learning_rate": 4.869491081008889e-06, + "loss": 3.5137, + "step": 10245 + }, + { + "epoch": 0.10426839192708333, + "grad_norm": 12.390953063964844, + "learning_rate": 4.869363538250146e-06, + "loss": 3.3681, + "step": 10250 + }, + { + "epoch": 0.10431925455729167, + "grad_norm": 8.647974967956543, + "learning_rate": 4.869235934871799e-06, + "loss": 3.2399, + "step": 10255 + }, + { + "epoch": 0.1043701171875, + "grad_norm": 11.923579216003418, + "learning_rate": 4.869108270877112e-06, + "loss": 3.6247, + "step": 10260 + }, + { + "epoch": 0.10442097981770833, + "grad_norm": 13.356016159057617, + "learning_rate": 4.868980546269352e-06, + "loss": 3.7096, + "step": 10265 + }, + { + "epoch": 0.10447184244791667, + "grad_norm": 8.816129684448242, + "learning_rate": 4.868852761051787e-06, + "loss": 3.3696, + "step": 10270 + }, + { + "epoch": 0.104522705078125, + "grad_norm": 11.433305740356445, + "learning_rate": 4.868724915227684e-06, + "loss": 3.3735, + "step": 10275 + }, + { + "epoch": 0.10457356770833333, + "grad_norm": 10.5938138961792, + "learning_rate": 4.868597008800315e-06, + "loss": 3.5073, + "step": 10280 + }, + { + "epoch": 0.10462443033854167, + "grad_norm": 13.988579750061035, + "learning_rate": 4.868469041772955e-06, + "loss": 3.6398, + "step": 10285 + }, + { + "epoch": 0.10467529296875, + "grad_norm": 8.426319122314453, + "learning_rate": 4.868341014148875e-06, + "loss": 3.5602, + "step": 10290 + }, + { + "epoch": 0.10472615559895833, + "grad_norm": 12.661907196044922, + "learning_rate": 4.868212925931351e-06, + "loss": 3.3872, + "step": 10295 + }, + { + "epoch": 0.10477701822916667, + "grad_norm": 10.294717788696289, + "learning_rate": 4.868084777123661e-06, + "loss": 3.5918, + "step": 10300 + }, + { + "epoch": 0.104827880859375, + "grad_norm": 16.74044418334961, + "learning_rate": 4.867956567729084e-06, + "loss": 3.4537, + "step": 10305 + }, + { + "epoch": 0.10487874348958333, + "grad_norm": 10.488282203674316, + "learning_rate": 4.867828297750899e-06, + "loss": 3.101, + "step": 10310 + }, + { + "epoch": 0.10492960611979167, + "grad_norm": 10.39643669128418, + "learning_rate": 4.867699967192388e-06, + "loss": 3.1673, + "step": 10315 + }, + { + "epoch": 0.10498046875, + "grad_norm": 13.151554107666016, + "learning_rate": 4.867571576056834e-06, + "loss": 3.6526, + "step": 10320 + }, + { + "epoch": 0.10503133138020833, + "grad_norm": 11.318380355834961, + "learning_rate": 4.867443124347523e-06, + "loss": 3.6569, + "step": 10325 + }, + { + "epoch": 0.10508219401041667, + "grad_norm": 11.30242919921875, + "learning_rate": 4.867314612067741e-06, + "loss": 3.2106, + "step": 10330 + }, + { + "epoch": 0.105133056640625, + "grad_norm": 8.88015079498291, + "learning_rate": 4.867186039220775e-06, + "loss": 3.3928, + "step": 10335 + }, + { + "epoch": 0.10518391927083333, + "grad_norm": 9.858390808105469, + "learning_rate": 4.867057405809916e-06, + "loss": 3.3096, + "step": 10340 + }, + { + "epoch": 0.10523478190104167, + "grad_norm": 14.723819732666016, + "learning_rate": 4.866928711838455e-06, + "loss": 3.3486, + "step": 10345 + }, + { + "epoch": 0.10528564453125, + "grad_norm": 8.097146034240723, + "learning_rate": 4.866799957309682e-06, + "loss": 4.0018, + "step": 10350 + }, + { + "epoch": 0.10533650716145833, + "grad_norm": 14.983758926391602, + "learning_rate": 4.866671142226895e-06, + "loss": 3.5232, + "step": 10355 + }, + { + "epoch": 0.10538736979166667, + "grad_norm": 10.44787311553955, + "learning_rate": 4.866542266593387e-06, + "loss": 3.6857, + "step": 10360 + }, + { + "epoch": 0.105438232421875, + "grad_norm": 13.872020721435547, + "learning_rate": 4.8664133304124555e-06, + "loss": 3.2874, + "step": 10365 + }, + { + "epoch": 0.10548909505208333, + "grad_norm": 11.341287612915039, + "learning_rate": 4.8662843336874e-06, + "loss": 3.202, + "step": 10370 + }, + { + "epoch": 0.10553995768229167, + "grad_norm": 9.611438751220703, + "learning_rate": 4.866155276421522e-06, + "loss": 3.4759, + "step": 10375 + }, + { + "epoch": 0.1055908203125, + "grad_norm": 12.784802436828613, + "learning_rate": 4.8660261586181205e-06, + "loss": 3.3239, + "step": 10380 + }, + { + "epoch": 0.10564168294270833, + "grad_norm": 15.74527645111084, + "learning_rate": 4.865896980280501e-06, + "loss": 3.6173, + "step": 10385 + }, + { + "epoch": 0.10569254557291667, + "grad_norm": 9.02971076965332, + "learning_rate": 4.865767741411969e-06, + "loss": 3.7378, + "step": 10390 + }, + { + "epoch": 0.105743408203125, + "grad_norm": 8.090533256530762, + "learning_rate": 4.8656384420158285e-06, + "loss": 3.2215, + "step": 10395 + }, + { + "epoch": 0.10579427083333333, + "grad_norm": 9.529847145080566, + "learning_rate": 4.86550908209539e-06, + "loss": 3.3138, + "step": 10400 + }, + { + "epoch": 0.10584513346354167, + "grad_norm": 11.145707130432129, + "learning_rate": 4.865379661653963e-06, + "loss": 2.8438, + "step": 10405 + }, + { + "epoch": 0.10589599609375, + "grad_norm": 9.50683879852295, + "learning_rate": 4.8652501806948575e-06, + "loss": 3.4143, + "step": 10410 + }, + { + "epoch": 0.10594685872395833, + "grad_norm": 14.874054908752441, + "learning_rate": 4.865120639221386e-06, + "loss": 3.6896, + "step": 10415 + }, + { + "epoch": 0.10599772135416667, + "grad_norm": 15.502473831176758, + "learning_rate": 4.864991037236864e-06, + "loss": 3.7117, + "step": 10420 + }, + { + "epoch": 0.106048583984375, + "grad_norm": 17.096446990966797, + "learning_rate": 4.864861374744607e-06, + "loss": 3.4389, + "step": 10425 + }, + { + "epoch": 0.10609944661458333, + "grad_norm": 12.414908409118652, + "learning_rate": 4.8647316517479326e-06, + "loss": 3.5158, + "step": 10430 + }, + { + "epoch": 0.10615030924479167, + "grad_norm": 13.846083641052246, + "learning_rate": 4.864601868250159e-06, + "loss": 3.4447, + "step": 10435 + }, + { + "epoch": 0.106201171875, + "grad_norm": 12.67673110961914, + "learning_rate": 4.864472024254607e-06, + "loss": 3.0979, + "step": 10440 + }, + { + "epoch": 0.10625203450520833, + "grad_norm": 13.106204986572266, + "learning_rate": 4.864342119764599e-06, + "loss": 3.4044, + "step": 10445 + }, + { + "epoch": 0.10630289713541667, + "grad_norm": 10.791247367858887, + "learning_rate": 4.864212154783458e-06, + "loss": 3.4244, + "step": 10450 + }, + { + "epoch": 0.106353759765625, + "grad_norm": 15.417497634887695, + "learning_rate": 4.86408212931451e-06, + "loss": 3.8919, + "step": 10455 + }, + { + "epoch": 0.10640462239583333, + "grad_norm": 12.463935852050781, + "learning_rate": 4.86395204336108e-06, + "loss": 3.3458, + "step": 10460 + }, + { + "epoch": 0.10645548502604167, + "grad_norm": 11.733417510986328, + "learning_rate": 4.863821896926498e-06, + "loss": 3.5288, + "step": 10465 + }, + { + "epoch": 0.10650634765625, + "grad_norm": 14.689102172851562, + "learning_rate": 4.863691690014093e-06, + "loss": 3.2967, + "step": 10470 + }, + { + "epoch": 0.10655721028645833, + "grad_norm": 10.164743423461914, + "learning_rate": 4.863561422627197e-06, + "loss": 3.2024, + "step": 10475 + }, + { + "epoch": 0.10660807291666667, + "grad_norm": 10.591276168823242, + "learning_rate": 4.863431094769141e-06, + "loss": 3.1854, + "step": 10480 + }, + { + "epoch": 0.106658935546875, + "grad_norm": 7.265041351318359, + "learning_rate": 4.863300706443261e-06, + "loss": 3.3769, + "step": 10485 + }, + { + "epoch": 0.10670979817708333, + "grad_norm": 9.011372566223145, + "learning_rate": 4.8631702576528924e-06, + "loss": 3.3535, + "step": 10490 + }, + { + "epoch": 0.10676066080729167, + "grad_norm": 12.629626274108887, + "learning_rate": 4.863039748401374e-06, + "loss": 3.1713, + "step": 10495 + }, + { + "epoch": 0.1068115234375, + "grad_norm": 10.04238224029541, + "learning_rate": 4.8629091786920425e-06, + "loss": 3.2648, + "step": 10500 + }, + { + "epoch": 0.10686238606770833, + "grad_norm": 8.861063003540039, + "learning_rate": 4.862778548528239e-06, + "loss": 3.4088, + "step": 10505 + }, + { + "epoch": 0.10691324869791667, + "grad_norm": 10.639623641967773, + "learning_rate": 4.862647857913308e-06, + "loss": 3.1807, + "step": 10510 + }, + { + "epoch": 0.106964111328125, + "grad_norm": 11.753366470336914, + "learning_rate": 4.862517106850592e-06, + "loss": 4.0414, + "step": 10515 + }, + { + "epoch": 0.10701497395833333, + "grad_norm": 6.897514343261719, + "learning_rate": 4.862386295343435e-06, + "loss": 3.1811, + "step": 10520 + }, + { + "epoch": 0.10706583658854167, + "grad_norm": 17.118993759155273, + "learning_rate": 4.862255423395184e-06, + "loss": 3.549, + "step": 10525 + }, + { + "epoch": 0.10711669921875, + "grad_norm": 12.017741203308105, + "learning_rate": 4.862124491009188e-06, + "loss": 3.4445, + "step": 10530 + }, + { + "epoch": 0.10716756184895833, + "grad_norm": 18.230016708374023, + "learning_rate": 4.861993498188798e-06, + "loss": 3.381, + "step": 10535 + }, + { + "epoch": 0.10721842447916667, + "grad_norm": 16.504117965698242, + "learning_rate": 4.861862444937363e-06, + "loss": 4.0049, + "step": 10540 + }, + { + "epoch": 0.107269287109375, + "grad_norm": 13.888938903808594, + "learning_rate": 4.861731331258238e-06, + "loss": 3.5727, + "step": 10545 + }, + { + "epoch": 0.10732014973958333, + "grad_norm": 14.863890647888184, + "learning_rate": 4.8616001571547764e-06, + "loss": 3.4779, + "step": 10550 + }, + { + "epoch": 0.10737101236979167, + "grad_norm": 13.943183898925781, + "learning_rate": 4.8614689226303345e-06, + "loss": 3.602, + "step": 10555 + }, + { + "epoch": 0.107421875, + "grad_norm": 10.1746826171875, + "learning_rate": 4.86133762768827e-06, + "loss": 3.4561, + "step": 10560 + }, + { + "epoch": 0.10747273763020833, + "grad_norm": 12.866034507751465, + "learning_rate": 4.861206272331941e-06, + "loss": 3.2867, + "step": 10565 + }, + { + "epoch": 0.10752360026041667, + "grad_norm": 13.17371654510498, + "learning_rate": 4.86107485656471e-06, + "loss": 3.4148, + "step": 10570 + }, + { + "epoch": 0.107574462890625, + "grad_norm": 8.4532470703125, + "learning_rate": 4.860943380389939e-06, + "loss": 3.1673, + "step": 10575 + }, + { + "epoch": 0.10762532552083333, + "grad_norm": 10.179365158081055, + "learning_rate": 4.86081184381099e-06, + "loss": 3.3304, + "step": 10580 + }, + { + "epoch": 0.10767618815104167, + "grad_norm": 10.048255920410156, + "learning_rate": 4.860680246831231e-06, + "loss": 3.3539, + "step": 10585 + }, + { + "epoch": 0.10772705078125, + "grad_norm": 14.505502700805664, + "learning_rate": 4.860548589454026e-06, + "loss": 3.5862, + "step": 10590 + }, + { + "epoch": 0.10777791341145833, + "grad_norm": 11.778282165527344, + "learning_rate": 4.860416871682746e-06, + "loss": 3.3957, + "step": 10595 + }, + { + "epoch": 0.10782877604166667, + "grad_norm": 10.966447830200195, + "learning_rate": 4.860285093520759e-06, + "loss": 3.2105, + "step": 10600 + }, + { + "epoch": 0.107879638671875, + "grad_norm": 12.947134017944336, + "learning_rate": 4.860153254971437e-06, + "loss": 3.7268, + "step": 10605 + }, + { + "epoch": 0.10793050130208333, + "grad_norm": 12.653708457946777, + "learning_rate": 4.860021356038155e-06, + "loss": 3.4977, + "step": 10610 + }, + { + "epoch": 0.10798136393229167, + "grad_norm": 15.799676895141602, + "learning_rate": 4.859889396724284e-06, + "loss": 3.7318, + "step": 10615 + }, + { + "epoch": 0.1080322265625, + "grad_norm": 10.117684364318848, + "learning_rate": 4.859757377033204e-06, + "loss": 3.8512, + "step": 10620 + }, + { + "epoch": 0.10808308919270833, + "grad_norm": 16.469112396240234, + "learning_rate": 4.85962529696829e-06, + "loss": 3.6864, + "step": 10625 + }, + { + "epoch": 0.10813395182291667, + "grad_norm": 8.12623119354248, + "learning_rate": 4.859493156532922e-06, + "loss": 3.162, + "step": 10630 + }, + { + "epoch": 0.108184814453125, + "grad_norm": 10.442404747009277, + "learning_rate": 4.859360955730481e-06, + "loss": 3.3013, + "step": 10635 + }, + { + "epoch": 0.10823567708333333, + "grad_norm": 13.673596382141113, + "learning_rate": 4.8592286945643485e-06, + "loss": 3.3599, + "step": 10640 + }, + { + "epoch": 0.10828653971354167, + "grad_norm": 8.366315841674805, + "learning_rate": 4.859096373037911e-06, + "loss": 3.2563, + "step": 10645 + }, + { + "epoch": 0.10833740234375, + "grad_norm": 16.115209579467773, + "learning_rate": 4.8589639911545495e-06, + "loss": 3.6421, + "step": 10650 + }, + { + "epoch": 0.10838826497395833, + "grad_norm": 12.849785804748535, + "learning_rate": 4.858831548917655e-06, + "loss": 3.4333, + "step": 10655 + }, + { + "epoch": 0.10843912760416667, + "grad_norm": 12.12121295928955, + "learning_rate": 4.858699046330614e-06, + "loss": 3.8339, + "step": 10660 + }, + { + "epoch": 0.108489990234375, + "grad_norm": 12.373353004455566, + "learning_rate": 4.858566483396816e-06, + "loss": 3.3636, + "step": 10665 + }, + { + "epoch": 0.10854085286458333, + "grad_norm": 13.366467475891113, + "learning_rate": 4.858433860119655e-06, + "loss": 3.1534, + "step": 10670 + }, + { + "epoch": 0.10859171549479167, + "grad_norm": 11.035595893859863, + "learning_rate": 4.858301176502522e-06, + "loss": 3.0366, + "step": 10675 + }, + { + "epoch": 0.108642578125, + "grad_norm": 9.009740829467773, + "learning_rate": 4.858168432548813e-06, + "loss": 3.1213, + "step": 10680 + }, + { + "epoch": 0.10869344075520833, + "grad_norm": 13.181190490722656, + "learning_rate": 4.858035628261924e-06, + "loss": 3.2362, + "step": 10685 + }, + { + "epoch": 0.10874430338541667, + "grad_norm": 12.755681991577148, + "learning_rate": 4.85790276364525e-06, + "loss": 3.493, + "step": 10690 + }, + { + "epoch": 0.108795166015625, + "grad_norm": 16.021320343017578, + "learning_rate": 4.857769838702195e-06, + "loss": 3.3145, + "step": 10695 + }, + { + "epoch": 0.10884602864583333, + "grad_norm": 11.281843185424805, + "learning_rate": 4.857636853436156e-06, + "loss": 3.4385, + "step": 10700 + }, + { + "epoch": 0.10889689127604167, + "grad_norm": 13.89880657196045, + "learning_rate": 4.857503807850538e-06, + "loss": 3.5553, + "step": 10705 + }, + { + "epoch": 0.10894775390625, + "grad_norm": 17.039382934570312, + "learning_rate": 4.857370701948744e-06, + "loss": 3.7388, + "step": 10710 + }, + { + "epoch": 0.10899861653645833, + "grad_norm": 8.904913902282715, + "learning_rate": 4.857237535734179e-06, + "loss": 3.5905, + "step": 10715 + }, + { + "epoch": 0.10904947916666667, + "grad_norm": 10.64655876159668, + "learning_rate": 4.8571043092102496e-06, + "loss": 3.174, + "step": 10720 + }, + { + "epoch": 0.109100341796875, + "grad_norm": 11.749823570251465, + "learning_rate": 4.856971022380366e-06, + "loss": 3.295, + "step": 10725 + }, + { + "epoch": 0.10915120442708333, + "grad_norm": 9.72337532043457, + "learning_rate": 4.856837675247938e-06, + "loss": 3.3855, + "step": 10730 + }, + { + "epoch": 0.10920206705729167, + "grad_norm": 16.54507827758789, + "learning_rate": 4.856704267816375e-06, + "loss": 3.5955, + "step": 10735 + }, + { + "epoch": 0.1092529296875, + "grad_norm": 13.975875854492188, + "learning_rate": 4.856570800089093e-06, + "loss": 3.3142, + "step": 10740 + }, + { + "epoch": 0.10930379231770833, + "grad_norm": 15.316740989685059, + "learning_rate": 4.856437272069506e-06, + "loss": 3.4958, + "step": 10745 + }, + { + "epoch": 0.10935465494791667, + "grad_norm": 10.500741004943848, + "learning_rate": 4.856303683761029e-06, + "loss": 3.2991, + "step": 10750 + }, + { + "epoch": 0.109405517578125, + "grad_norm": 10.17525577545166, + "learning_rate": 4.8561700351670815e-06, + "loss": 3.7226, + "step": 10755 + }, + { + "epoch": 0.10945638020833333, + "grad_norm": 12.4932861328125, + "learning_rate": 4.856036326291082e-06, + "loss": 3.2116, + "step": 10760 + }, + { + "epoch": 0.10950724283854167, + "grad_norm": 9.614534378051758, + "learning_rate": 4.855902557136451e-06, + "loss": 3.351, + "step": 10765 + }, + { + "epoch": 0.10955810546875, + "grad_norm": 14.387618064880371, + "learning_rate": 4.855768727706613e-06, + "loss": 3.5085, + "step": 10770 + }, + { + "epoch": 0.10960896809895833, + "grad_norm": 11.664185523986816, + "learning_rate": 4.855634838004988e-06, + "loss": 3.2219, + "step": 10775 + }, + { + "epoch": 0.10965983072916667, + "grad_norm": 8.379794120788574, + "learning_rate": 4.8555008880350055e-06, + "loss": 3.0816, + "step": 10780 + }, + { + "epoch": 0.109710693359375, + "grad_norm": 14.983808517456055, + "learning_rate": 4.8553668778000905e-06, + "loss": 3.0428, + "step": 10785 + }, + { + "epoch": 0.10976155598958333, + "grad_norm": 10.499786376953125, + "learning_rate": 4.855232807303673e-06, + "loss": 3.1102, + "step": 10790 + }, + { + "epoch": 0.10981241861979167, + "grad_norm": 15.770118713378906, + "learning_rate": 4.8550986765491825e-06, + "loss": 4.0818, + "step": 10795 + }, + { + "epoch": 0.10986328125, + "grad_norm": 13.66915225982666, + "learning_rate": 4.85496448554005e-06, + "loss": 3.7093, + "step": 10800 + }, + { + "epoch": 0.10991414388020833, + "grad_norm": 8.571983337402344, + "learning_rate": 4.85483023427971e-06, + "loss": 3.0368, + "step": 10805 + }, + { + "epoch": 0.10996500651041667, + "grad_norm": 15.759471893310547, + "learning_rate": 4.854695922771595e-06, + "loss": 3.5172, + "step": 10810 + }, + { + "epoch": 0.110015869140625, + "grad_norm": 13.83719539642334, + "learning_rate": 4.854561551019145e-06, + "loss": 3.5774, + "step": 10815 + }, + { + "epoch": 0.11006673177083333, + "grad_norm": 11.690563201904297, + "learning_rate": 4.854427119025794e-06, + "loss": 3.5709, + "step": 10820 + }, + { + "epoch": 0.11011759440104167, + "grad_norm": 16.460613250732422, + "learning_rate": 4.854292626794984e-06, + "loss": 3.0922, + "step": 10825 + }, + { + "epoch": 0.11016845703125, + "grad_norm": 9.609016418457031, + "learning_rate": 4.854158074330155e-06, + "loss": 3.6565, + "step": 10830 + }, + { + "epoch": 0.11021931966145833, + "grad_norm": 8.492650985717773, + "learning_rate": 4.85402346163475e-06, + "loss": 3.0948, + "step": 10835 + }, + { + "epoch": 0.11027018229166667, + "grad_norm": 8.52239990234375, + "learning_rate": 4.853888788712213e-06, + "loss": 3.6836, + "step": 10840 + }, + { + "epoch": 0.110321044921875, + "grad_norm": 14.75137710571289, + "learning_rate": 4.853754055565988e-06, + "loss": 3.5731, + "step": 10845 + }, + { + "epoch": 0.11037190755208333, + "grad_norm": 17.106433868408203, + "learning_rate": 4.853619262199525e-06, + "loss": 3.3211, + "step": 10850 + }, + { + "epoch": 0.11042277018229167, + "grad_norm": 13.21174430847168, + "learning_rate": 4.85348440861627e-06, + "loss": 3.1972, + "step": 10855 + }, + { + "epoch": 0.1104736328125, + "grad_norm": 15.949371337890625, + "learning_rate": 4.8533494948196746e-06, + "loss": 3.7361, + "step": 10860 + }, + { + "epoch": 0.11052449544270833, + "grad_norm": 13.830065727233887, + "learning_rate": 4.8532145208131894e-06, + "loss": 3.4961, + "step": 10865 + }, + { + "epoch": 0.11057535807291667, + "grad_norm": 14.657466888427734, + "learning_rate": 4.85307948660027e-06, + "loss": 3.3326, + "step": 10870 + }, + { + "epoch": 0.110626220703125, + "grad_norm": 18.071182250976562, + "learning_rate": 4.852944392184369e-06, + "loss": 3.5664, + "step": 10875 + }, + { + "epoch": 0.11067708333333333, + "grad_norm": 259.91387939453125, + "learning_rate": 4.852809237568943e-06, + "loss": 3.6806, + "step": 10880 + }, + { + "epoch": 0.11072794596354167, + "grad_norm": 12.160555839538574, + "learning_rate": 4.85267402275745e-06, + "loss": 3.8418, + "step": 10885 + }, + { + "epoch": 0.11077880859375, + "grad_norm": 15.426597595214844, + "learning_rate": 4.852538747753351e-06, + "loss": 3.5979, + "step": 10890 + }, + { + "epoch": 0.11082967122395833, + "grad_norm": 13.01453685760498, + "learning_rate": 4.852403412560105e-06, + "loss": 3.2011, + "step": 10895 + }, + { + "epoch": 0.11088053385416667, + "grad_norm": 10.577211380004883, + "learning_rate": 4.852268017181176e-06, + "loss": 3.727, + "step": 10900 + }, + { + "epoch": 0.110931396484375, + "grad_norm": 14.287973403930664, + "learning_rate": 4.852132561620026e-06, + "loss": 3.758, + "step": 10905 + }, + { + "epoch": 0.11098225911458333, + "grad_norm": 11.887785911560059, + "learning_rate": 4.851997045880123e-06, + "loss": 2.9819, + "step": 10910 + }, + { + "epoch": 0.11103312174479167, + "grad_norm": 7.388796329498291, + "learning_rate": 4.851861469964932e-06, + "loss": 3.0688, + "step": 10915 + }, + { + "epoch": 0.111083984375, + "grad_norm": 11.511301040649414, + "learning_rate": 4.851725833877924e-06, + "loss": 3.3384, + "step": 10920 + }, + { + "epoch": 0.11113484700520833, + "grad_norm": 13.27392864227295, + "learning_rate": 4.851590137622567e-06, + "loss": 3.0957, + "step": 10925 + }, + { + "epoch": 0.11118570963541667, + "grad_norm": 14.255321502685547, + "learning_rate": 4.851454381202334e-06, + "loss": 3.2306, + "step": 10930 + }, + { + "epoch": 0.111236572265625, + "grad_norm": 11.45173168182373, + "learning_rate": 4.851318564620699e-06, + "loss": 3.4344, + "step": 10935 + }, + { + "epoch": 0.11128743489583333, + "grad_norm": 12.557133674621582, + "learning_rate": 4.851182687881134e-06, + "loss": 3.4574, + "step": 10940 + }, + { + "epoch": 0.11133829752604167, + "grad_norm": 12.226534843444824, + "learning_rate": 4.851046750987118e-06, + "loss": 3.8019, + "step": 10945 + }, + { + "epoch": 0.11138916015625, + "grad_norm": 10.195042610168457, + "learning_rate": 4.850910753942129e-06, + "loss": 3.0214, + "step": 10950 + }, + { + "epoch": 0.11144002278645833, + "grad_norm": 14.165132522583008, + "learning_rate": 4.850774696749645e-06, + "loss": 3.5724, + "step": 10955 + }, + { + "epoch": 0.11149088541666667, + "grad_norm": 13.284117698669434, + "learning_rate": 4.850638579413147e-06, + "loss": 4.1911, + "step": 10960 + }, + { + "epoch": 0.111541748046875, + "grad_norm": 16.30782127380371, + "learning_rate": 4.850502401936119e-06, + "loss": 3.1399, + "step": 10965 + }, + { + "epoch": 0.11159261067708333, + "grad_norm": 13.27759838104248, + "learning_rate": 4.850366164322044e-06, + "loss": 3.4511, + "step": 10970 + }, + { + "epoch": 0.11164347330729167, + "grad_norm": 12.456367492675781, + "learning_rate": 4.850229866574407e-06, + "loss": 3.5491, + "step": 10975 + }, + { + "epoch": 0.1116943359375, + "grad_norm": 10.18445873260498, + "learning_rate": 4.850093508696697e-06, + "loss": 3.5397, + "step": 10980 + }, + { + "epoch": 0.11174519856770833, + "grad_norm": 14.981690406799316, + "learning_rate": 4.849957090692401e-06, + "loss": 3.3078, + "step": 10985 + }, + { + "epoch": 0.11179606119791667, + "grad_norm": 18.036338806152344, + "learning_rate": 4.84982061256501e-06, + "loss": 3.2716, + "step": 10990 + }, + { + "epoch": 0.111846923828125, + "grad_norm": 18.29755973815918, + "learning_rate": 4.849684074318015e-06, + "loss": 3.6321, + "step": 10995 + }, + { + "epoch": 0.11189778645833333, + "grad_norm": 13.462907791137695, + "learning_rate": 4.849547475954911e-06, + "loss": 3.3401, + "step": 11000 + }, + { + "epoch": 0.11194864908854167, + "grad_norm": 9.00394058227539, + "learning_rate": 4.849410817479191e-06, + "loss": 3.6376, + "step": 11005 + }, + { + "epoch": 0.11199951171875, + "grad_norm": 7.56473970413208, + "learning_rate": 4.849274098894352e-06, + "loss": 3.5239, + "step": 11010 + }, + { + "epoch": 0.11205037434895833, + "grad_norm": 10.673871994018555, + "learning_rate": 4.849137320203892e-06, + "loss": 3.4489, + "step": 11015 + }, + { + "epoch": 0.11210123697916667, + "grad_norm": 17.68508529663086, + "learning_rate": 4.849000481411312e-06, + "loss": 3.3573, + "step": 11020 + }, + { + "epoch": 0.112152099609375, + "grad_norm": 13.00468921661377, + "learning_rate": 4.84886358252011e-06, + "loss": 3.3832, + "step": 11025 + }, + { + "epoch": 0.11220296223958333, + "grad_norm": 8.050721168518066, + "learning_rate": 4.8487266235337895e-06, + "loss": 3.2107, + "step": 11030 + }, + { + "epoch": 0.11225382486979167, + "grad_norm": 10.355036735534668, + "learning_rate": 4.848589604455856e-06, + "loss": 3.304, + "step": 11035 + }, + { + "epoch": 0.1123046875, + "grad_norm": 19.039920806884766, + "learning_rate": 4.848452525289814e-06, + "loss": 3.7786, + "step": 11040 + }, + { + "epoch": 0.11235555013020833, + "grad_norm": 12.501426696777344, + "learning_rate": 4.8483153860391705e-06, + "loss": 3.5708, + "step": 11045 + }, + { + "epoch": 0.11240641276041667, + "grad_norm": 15.971506118774414, + "learning_rate": 4.848178186707435e-06, + "loss": 3.209, + "step": 11050 + }, + { + "epoch": 0.112457275390625, + "grad_norm": 7.047082424163818, + "learning_rate": 4.8480409272981165e-06, + "loss": 3.6421, + "step": 11055 + }, + { + "epoch": 0.11250813802083333, + "grad_norm": 14.781586647033691, + "learning_rate": 4.847903607814728e-06, + "loss": 3.4269, + "step": 11060 + }, + { + "epoch": 0.11255900065104167, + "grad_norm": 10.393120765686035, + "learning_rate": 4.847766228260781e-06, + "loss": 3.5995, + "step": 11065 + }, + { + "epoch": 0.11260986328125, + "grad_norm": 10.039695739746094, + "learning_rate": 4.847628788639793e-06, + "loss": 3.3204, + "step": 11070 + }, + { + "epoch": 0.11266072591145833, + "grad_norm": 13.824012756347656, + "learning_rate": 4.847491288955279e-06, + "loss": 3.6698, + "step": 11075 + }, + { + "epoch": 0.11271158854166667, + "grad_norm": 11.334912300109863, + "learning_rate": 4.847353729210756e-06, + "loss": 3.5687, + "step": 11080 + }, + { + "epoch": 0.112762451171875, + "grad_norm": 15.515557289123535, + "learning_rate": 4.847216109409744e-06, + "loss": 3.761, + "step": 11085 + }, + { + "epoch": 0.11281331380208333, + "grad_norm": 7.701551914215088, + "learning_rate": 4.847078429555765e-06, + "loss": 3.6052, + "step": 11090 + }, + { + "epoch": 0.11286417643229167, + "grad_norm": 13.438464164733887, + "learning_rate": 4.8469406896523405e-06, + "loss": 3.2666, + "step": 11095 + }, + { + "epoch": 0.1129150390625, + "grad_norm": 11.519120216369629, + "learning_rate": 4.846802889702994e-06, + "loss": 3.1568, + "step": 11100 + }, + { + "epoch": 0.11296590169270833, + "grad_norm": 10.409358978271484, + "learning_rate": 4.8466650297112525e-06, + "loss": 3.5128, + "step": 11105 + }, + { + "epoch": 0.11301676432291667, + "grad_norm": 10.928455352783203, + "learning_rate": 4.846527109680642e-06, + "loss": 3.7998, + "step": 11110 + }, + { + "epoch": 0.113067626953125, + "grad_norm": 13.899455070495605, + "learning_rate": 4.846389129614692e-06, + "loss": 3.1142, + "step": 11115 + }, + { + "epoch": 0.11311848958333333, + "grad_norm": 10.756808280944824, + "learning_rate": 4.846251089516932e-06, + "loss": 3.3781, + "step": 11120 + }, + { + "epoch": 0.11316935221354167, + "grad_norm": 10.080056190490723, + "learning_rate": 4.846112989390894e-06, + "loss": 3.1493, + "step": 11125 + }, + { + "epoch": 0.11322021484375, + "grad_norm": 16.393461227416992, + "learning_rate": 4.845974829240112e-06, + "loss": 3.6096, + "step": 11130 + }, + { + "epoch": 0.11327107747395833, + "grad_norm": 13.437868118286133, + "learning_rate": 4.845836609068119e-06, + "loss": 3.3507, + "step": 11135 + }, + { + "epoch": 0.11332194010416667, + "grad_norm": 12.06811237335205, + "learning_rate": 4.8456983288784535e-06, + "loss": 3.197, + "step": 11140 + }, + { + "epoch": 0.113372802734375, + "grad_norm": 11.709611892700195, + "learning_rate": 4.845559988674651e-06, + "loss": 3.5839, + "step": 11145 + }, + { + "epoch": 0.11342366536458333, + "grad_norm": 17.550220489501953, + "learning_rate": 4.8454215884602525e-06, + "loss": 3.6834, + "step": 11150 + }, + { + "epoch": 0.11347452799479167, + "grad_norm": 10.27842903137207, + "learning_rate": 4.845283128238799e-06, + "loss": 3.5057, + "step": 11155 + }, + { + "epoch": 0.113525390625, + "grad_norm": 16.689462661743164, + "learning_rate": 4.845144608013832e-06, + "loss": 3.3252, + "step": 11160 + }, + { + "epoch": 0.11357625325520833, + "grad_norm": 10.314776420593262, + "learning_rate": 4.845006027788897e-06, + "loss": 4.4668, + "step": 11165 + }, + { + "epoch": 0.11362711588541667, + "grad_norm": 12.564552307128906, + "learning_rate": 4.844867387567538e-06, + "loss": 3.3531, + "step": 11170 + }, + { + "epoch": 0.113677978515625, + "grad_norm": 16.5703182220459, + "learning_rate": 4.8447286873533025e-06, + "loss": 3.3511, + "step": 11175 + }, + { + "epoch": 0.11372884114583333, + "grad_norm": 10.776202201843262, + "learning_rate": 4.84458992714974e-06, + "loss": 3.5057, + "step": 11180 + }, + { + "epoch": 0.11377970377604167, + "grad_norm": 9.926742553710938, + "learning_rate": 4.844451106960399e-06, + "loss": 3.5368, + "step": 11185 + }, + { + "epoch": 0.11383056640625, + "grad_norm": 13.466375350952148, + "learning_rate": 4.844312226788833e-06, + "loss": 3.4744, + "step": 11190 + }, + { + "epoch": 0.11388142903645833, + "grad_norm": 13.84046745300293, + "learning_rate": 4.844173286638593e-06, + "loss": 3.4421, + "step": 11195 + }, + { + "epoch": 0.11393229166666667, + "grad_norm": 16.326255798339844, + "learning_rate": 4.8440342865132365e-06, + "loss": 3.5388, + "step": 11200 + }, + { + "epoch": 0.113983154296875, + "grad_norm": 11.938819885253906, + "learning_rate": 4.843895226416317e-06, + "loss": 3.171, + "step": 11205 + }, + { + "epoch": 0.11403401692708333, + "grad_norm": 8.013636589050293, + "learning_rate": 4.843756106351396e-06, + "loss": 4.0519, + "step": 11210 + }, + { + "epoch": 0.11408487955729167, + "grad_norm": 12.114385604858398, + "learning_rate": 4.843616926322029e-06, + "loss": 3.5346, + "step": 11215 + }, + { + "epoch": 0.1141357421875, + "grad_norm": 14.332465171813965, + "learning_rate": 4.843477686331778e-06, + "loss": 3.3585, + "step": 11220 + }, + { + "epoch": 0.11418660481770833, + "grad_norm": 13.207276344299316, + "learning_rate": 4.8433383863842065e-06, + "loss": 3.2574, + "step": 11225 + }, + { + "epoch": 0.11423746744791667, + "grad_norm": 13.37678050994873, + "learning_rate": 4.8431990264828775e-06, + "loss": 3.0765, + "step": 11230 + }, + { + "epoch": 0.114288330078125, + "grad_norm": 10.288887977600098, + "learning_rate": 4.843059606631358e-06, + "loss": 3.4786, + "step": 11235 + }, + { + "epoch": 0.11433919270833333, + "grad_norm": 9.490957260131836, + "learning_rate": 4.842920126833212e-06, + "loss": 3.0409, + "step": 11240 + }, + { + "epoch": 0.11439005533854167, + "grad_norm": 12.170294761657715, + "learning_rate": 4.842780587092011e-06, + "loss": 3.4692, + "step": 11245 + }, + { + "epoch": 0.11444091796875, + "grad_norm": 8.890890121459961, + "learning_rate": 4.842640987411323e-06, + "loss": 3.1144, + "step": 11250 + }, + { + "epoch": 0.11449178059895833, + "grad_norm": 10.649124145507812, + "learning_rate": 4.842501327794722e-06, + "loss": 3.3058, + "step": 11255 + }, + { + "epoch": 0.11454264322916667, + "grad_norm": 15.376280784606934, + "learning_rate": 4.842361608245779e-06, + "loss": 3.2482, + "step": 11260 + }, + { + "epoch": 0.114593505859375, + "grad_norm": 11.391185760498047, + "learning_rate": 4.84222182876807e-06, + "loss": 3.2936, + "step": 11265 + }, + { + "epoch": 0.11464436848958333, + "grad_norm": 8.210448265075684, + "learning_rate": 4.84208198936517e-06, + "loss": 3.5134, + "step": 11270 + }, + { + "epoch": 0.11469523111979167, + "grad_norm": 15.108521461486816, + "learning_rate": 4.841942090040658e-06, + "loss": 3.4495, + "step": 11275 + }, + { + "epoch": 0.11474609375, + "grad_norm": 10.561015129089355, + "learning_rate": 4.841802130798112e-06, + "loss": 3.3369, + "step": 11280 + }, + { + "epoch": 0.11479695638020833, + "grad_norm": 13.979567527770996, + "learning_rate": 4.841662111641114e-06, + "loss": 3.5918, + "step": 11285 + }, + { + "epoch": 0.11484781901041667, + "grad_norm": 12.601395606994629, + "learning_rate": 4.841522032573246e-06, + "loss": 3.3726, + "step": 11290 + }, + { + "epoch": 0.114898681640625, + "grad_norm": 14.680171966552734, + "learning_rate": 4.841381893598092e-06, + "loss": 3.3161, + "step": 11295 + }, + { + "epoch": 0.11494954427083333, + "grad_norm": 11.404961585998535, + "learning_rate": 4.841241694719236e-06, + "loss": 3.9943, + "step": 11300 + }, + { + "epoch": 0.11500040690104167, + "grad_norm": 7.569732666015625, + "learning_rate": 4.841101435940268e-06, + "loss": 3.2494, + "step": 11305 + }, + { + "epoch": 0.11505126953125, + "grad_norm": 13.507386207580566, + "learning_rate": 4.840961117264773e-06, + "loss": 3.2946, + "step": 11310 + }, + { + "epoch": 0.11510213216145833, + "grad_norm": 8.6294584274292, + "learning_rate": 4.840820738696343e-06, + "loss": 3.6376, + "step": 11315 + }, + { + "epoch": 0.11515299479166667, + "grad_norm": 12.276251792907715, + "learning_rate": 4.8406803002385696e-06, + "loss": 3.5475, + "step": 11320 + }, + { + "epoch": 0.115203857421875, + "grad_norm": 13.95095157623291, + "learning_rate": 4.8405398018950465e-06, + "loss": 3.4318, + "step": 11325 + }, + { + "epoch": 0.11525472005208333, + "grad_norm": 8.981563568115234, + "learning_rate": 4.840399243669366e-06, + "loss": 3.3984, + "step": 11330 + }, + { + "epoch": 0.11530558268229167, + "grad_norm": 17.332687377929688, + "learning_rate": 4.840258625565126e-06, + "loss": 3.4236, + "step": 11335 + }, + { + "epoch": 0.1153564453125, + "grad_norm": 13.317326545715332, + "learning_rate": 4.840117947585924e-06, + "loss": 3.7795, + "step": 11340 + }, + { + "epoch": 0.11540730794270833, + "grad_norm": 7.8791656494140625, + "learning_rate": 4.8399772097353585e-06, + "loss": 3.316, + "step": 11345 + }, + { + "epoch": 0.11545817057291667, + "grad_norm": 11.29205322265625, + "learning_rate": 4.839836412017031e-06, + "loss": 3.5903, + "step": 11350 + }, + { + "epoch": 0.115509033203125, + "grad_norm": 9.628116607666016, + "learning_rate": 4.839695554434543e-06, + "loss": 3.9415, + "step": 11355 + }, + { + "epoch": 0.11555989583333333, + "grad_norm": 13.431588172912598, + "learning_rate": 4.839554636991499e-06, + "loss": 3.3724, + "step": 11360 + }, + { + "epoch": 0.11561075846354167, + "grad_norm": 16.05211639404297, + "learning_rate": 4.8394136596915044e-06, + "loss": 3.6998, + "step": 11365 + }, + { + "epoch": 0.11566162109375, + "grad_norm": 15.686810493469238, + "learning_rate": 4.839272622538166e-06, + "loss": 3.3292, + "step": 11370 + }, + { + "epoch": 0.11571248372395833, + "grad_norm": 10.349496841430664, + "learning_rate": 4.839131525535093e-06, + "loss": 3.7189, + "step": 11375 + }, + { + "epoch": 0.11576334635416667, + "grad_norm": 12.135592460632324, + "learning_rate": 4.838990368685892e-06, + "loss": 4.0048, + "step": 11380 + }, + { + "epoch": 0.115814208984375, + "grad_norm": 16.356788635253906, + "learning_rate": 4.838849151994178e-06, + "loss": 3.6877, + "step": 11385 + }, + { + "epoch": 0.11586507161458333, + "grad_norm": 16.019092559814453, + "learning_rate": 4.838707875463563e-06, + "loss": 3.4608, + "step": 11390 + }, + { + "epoch": 0.11591593424479167, + "grad_norm": 9.560792922973633, + "learning_rate": 4.838566539097661e-06, + "loss": 3.3188, + "step": 11395 + }, + { + "epoch": 0.115966796875, + "grad_norm": 14.858901023864746, + "learning_rate": 4.838425142900089e-06, + "loss": 3.589, + "step": 11400 + }, + { + "epoch": 0.11601765950520833, + "grad_norm": 8.294513702392578, + "learning_rate": 4.8382836868744635e-06, + "loss": 3.3469, + "step": 11405 + }, + { + "epoch": 0.11606852213541667, + "grad_norm": 10.487101554870605, + "learning_rate": 4.838142171024404e-06, + "loss": 3.6045, + "step": 11410 + }, + { + "epoch": 0.116119384765625, + "grad_norm": 12.35394287109375, + "learning_rate": 4.838000595353531e-06, + "loss": 3.4698, + "step": 11415 + }, + { + "epoch": 0.11617024739583333, + "grad_norm": 16.841279983520508, + "learning_rate": 4.8378589598654675e-06, + "loss": 3.3743, + "step": 11420 + }, + { + "epoch": 0.11622111002604167, + "grad_norm": 8.052149772644043, + "learning_rate": 4.837717264563837e-06, + "loss": 3.7649, + "step": 11425 + }, + { + "epoch": 0.11627197265625, + "grad_norm": 8.459037780761719, + "learning_rate": 4.837575509452264e-06, + "loss": 3.1424, + "step": 11430 + }, + { + "epoch": 0.11632283528645833, + "grad_norm": 13.29317855834961, + "learning_rate": 4.837433694534376e-06, + "loss": 3.2511, + "step": 11435 + }, + { + "epoch": 0.11637369791666667, + "grad_norm": 10.47177791595459, + "learning_rate": 4.8372918198138e-06, + "loss": 3.1985, + "step": 11440 + }, + { + "epoch": 0.116424560546875, + "grad_norm": 12.912489891052246, + "learning_rate": 4.837149885294167e-06, + "loss": 3.414, + "step": 11445 + }, + { + "epoch": 0.11647542317708333, + "grad_norm": 11.507333755493164, + "learning_rate": 4.837007890979108e-06, + "loss": 3.362, + "step": 11450 + }, + { + "epoch": 0.11652628580729167, + "grad_norm": 11.616119384765625, + "learning_rate": 4.836865836872257e-06, + "loss": 3.4036, + "step": 11455 + }, + { + "epoch": 0.1165771484375, + "grad_norm": 16.553955078125, + "learning_rate": 4.8367237229772466e-06, + "loss": 3.5742, + "step": 11460 + }, + { + "epoch": 0.11662801106770833, + "grad_norm": 10.673224449157715, + "learning_rate": 4.836581549297715e-06, + "loss": 3.6775, + "step": 11465 + }, + { + "epoch": 0.11667887369791667, + "grad_norm": 12.565034866333008, + "learning_rate": 4.836439315837297e-06, + "loss": 3.8003, + "step": 11470 + }, + { + "epoch": 0.116729736328125, + "grad_norm": 8.820014953613281, + "learning_rate": 4.8362970225996334e-06, + "loss": 3.1671, + "step": 11475 + }, + { + "epoch": 0.11678059895833333, + "grad_norm": 13.71123218536377, + "learning_rate": 4.836154669588363e-06, + "loss": 3.2272, + "step": 11480 + }, + { + "epoch": 0.11683146158854167, + "grad_norm": 12.12161636352539, + "learning_rate": 4.8360122568071304e-06, + "loss": 3.6246, + "step": 11485 + }, + { + "epoch": 0.11688232421875, + "grad_norm": 9.981605529785156, + "learning_rate": 4.835869784259578e-06, + "loss": 3.2711, + "step": 11490 + }, + { + "epoch": 0.11693318684895833, + "grad_norm": 8.769081115722656, + "learning_rate": 4.83572725194935e-06, + "loss": 3.4075, + "step": 11495 + }, + { + "epoch": 0.11698404947916667, + "grad_norm": 9.689691543579102, + "learning_rate": 4.835584659880095e-06, + "loss": 3.2837, + "step": 11500 + }, + { + "epoch": 0.117034912109375, + "grad_norm": 10.834155082702637, + "learning_rate": 4.835442008055459e-06, + "loss": 3.1802, + "step": 11505 + }, + { + "epoch": 0.11708577473958333, + "grad_norm": 13.873346328735352, + "learning_rate": 4.835299296479093e-06, + "loss": 3.8511, + "step": 11510 + }, + { + "epoch": 0.11713663736979167, + "grad_norm": 10.343167304992676, + "learning_rate": 4.835156525154648e-06, + "loss": 3.5516, + "step": 11515 + }, + { + "epoch": 0.1171875, + "grad_norm": 13.721023559570312, + "learning_rate": 4.8350136940857775e-06, + "loss": 3.4011, + "step": 11520 + }, + { + "epoch": 0.11723836263020833, + "grad_norm": 15.717689514160156, + "learning_rate": 4.834870803276134e-06, + "loss": 3.5618, + "step": 11525 + }, + { + "epoch": 0.11728922526041667, + "grad_norm": 16.386823654174805, + "learning_rate": 4.834727852729375e-06, + "loss": 3.3112, + "step": 11530 + }, + { + "epoch": 0.117340087890625, + "grad_norm": 15.418145179748535, + "learning_rate": 4.834584842449158e-06, + "loss": 4.0253, + "step": 11535 + }, + { + "epoch": 0.11739095052083333, + "grad_norm": 8.149100303649902, + "learning_rate": 4.83444177243914e-06, + "loss": 3.5978, + "step": 11540 + }, + { + "epoch": 0.11744181315104167, + "grad_norm": 8.802492141723633, + "learning_rate": 4.834298642702983e-06, + "loss": 3.931, + "step": 11545 + }, + { + "epoch": 0.11749267578125, + "grad_norm": 10.84029483795166, + "learning_rate": 4.834155453244348e-06, + "loss": 3.4902, + "step": 11550 + }, + { + "epoch": 0.11754353841145833, + "grad_norm": 13.952017784118652, + "learning_rate": 4.8340122040669e-06, + "loss": 3.3633, + "step": 11555 + }, + { + "epoch": 0.11759440104166667, + "grad_norm": 9.742794036865234, + "learning_rate": 4.833868895174303e-06, + "loss": 3.46, + "step": 11560 + }, + { + "epoch": 0.117645263671875, + "grad_norm": 9.079483032226562, + "learning_rate": 4.833725526570223e-06, + "loss": 3.2567, + "step": 11565 + }, + { + "epoch": 0.11769612630208333, + "grad_norm": 11.4492826461792, + "learning_rate": 4.833582098258328e-06, + "loss": 3.332, + "step": 11570 + }, + { + "epoch": 0.11774698893229167, + "grad_norm": 13.898246765136719, + "learning_rate": 4.833438610242289e-06, + "loss": 3.6246, + "step": 11575 + }, + { + "epoch": 0.1177978515625, + "grad_norm": 18.6462459564209, + "learning_rate": 4.833295062525775e-06, + "loss": 4.2665, + "step": 11580 + }, + { + "epoch": 0.11784871419270833, + "grad_norm": 15.118053436279297, + "learning_rate": 4.833151455112462e-06, + "loss": 3.4488, + "step": 11585 + }, + { + "epoch": 0.11789957682291667, + "grad_norm": 12.25283432006836, + "learning_rate": 4.833007788006021e-06, + "loss": 3.3655, + "step": 11590 + }, + { + "epoch": 0.117950439453125, + "grad_norm": 12.915300369262695, + "learning_rate": 4.832864061210128e-06, + "loss": 3.2749, + "step": 11595 + }, + { + "epoch": 0.11800130208333333, + "grad_norm": 13.077988624572754, + "learning_rate": 4.832720274728462e-06, + "loss": 3.7166, + "step": 11600 + }, + { + "epoch": 0.11805216471354167, + "grad_norm": 12.226242065429688, + "learning_rate": 4.8325764285647e-06, + "loss": 3.4749, + "step": 11605 + }, + { + "epoch": 0.11810302734375, + "grad_norm": 12.33906078338623, + "learning_rate": 4.832432522722523e-06, + "loss": 3.8167, + "step": 11610 + }, + { + "epoch": 0.11815388997395833, + "grad_norm": 18.989513397216797, + "learning_rate": 4.832288557205612e-06, + "loss": 3.2801, + "step": 11615 + }, + { + "epoch": 0.11820475260416667, + "grad_norm": 13.292867660522461, + "learning_rate": 4.832144532017653e-06, + "loss": 3.4304, + "step": 11620 + }, + { + "epoch": 0.118255615234375, + "grad_norm": 13.21804141998291, + "learning_rate": 4.832000447162328e-06, + "loss": 3.1607, + "step": 11625 + }, + { + "epoch": 0.11830647786458333, + "grad_norm": 9.76976203918457, + "learning_rate": 4.8318563026433244e-06, + "loss": 3.6529, + "step": 11630 + }, + { + "epoch": 0.11835734049479167, + "grad_norm": 14.536739349365234, + "learning_rate": 4.831712098464329e-06, + "loss": 3.216, + "step": 11635 + }, + { + "epoch": 0.118408203125, + "grad_norm": 7.9985880851745605, + "learning_rate": 4.831567834629033e-06, + "loss": 3.0674, + "step": 11640 + }, + { + "epoch": 0.11845906575520833, + "grad_norm": 11.221656799316406, + "learning_rate": 4.831423511141127e-06, + "loss": 3.3163, + "step": 11645 + }, + { + "epoch": 0.11850992838541667, + "grad_norm": 16.915536880493164, + "learning_rate": 4.831279128004303e-06, + "loss": 3.6208, + "step": 11650 + }, + { + "epoch": 0.118560791015625, + "grad_norm": 11.9306058883667, + "learning_rate": 4.831134685222255e-06, + "loss": 3.6851, + "step": 11655 + }, + { + "epoch": 0.11861165364583333, + "grad_norm": 10.024066925048828, + "learning_rate": 4.8309901827986785e-06, + "loss": 3.4386, + "step": 11660 + }, + { + "epoch": 0.11866251627604167, + "grad_norm": 14.097890853881836, + "learning_rate": 4.83084562073727e-06, + "loss": 3.3924, + "step": 11665 + }, + { + "epoch": 0.11871337890625, + "grad_norm": 17.218984603881836, + "learning_rate": 4.83070099904173e-06, + "loss": 3.3288, + "step": 11670 + }, + { + "epoch": 0.11876424153645833, + "grad_norm": 9.430810928344727, + "learning_rate": 4.830556317715757e-06, + "loss": 3.1723, + "step": 11675 + }, + { + "epoch": 0.11881510416666667, + "grad_norm": 16.22243309020996, + "learning_rate": 4.830411576763052e-06, + "loss": 3.2928, + "step": 11680 + }, + { + "epoch": 0.118865966796875, + "grad_norm": 14.999611854553223, + "learning_rate": 4.83026677618732e-06, + "loss": 3.5795, + "step": 11685 + }, + { + "epoch": 0.11891682942708333, + "grad_norm": 13.795806884765625, + "learning_rate": 4.830121915992265e-06, + "loss": 3.6035, + "step": 11690 + }, + { + "epoch": 0.11896769205729167, + "grad_norm": 11.390329360961914, + "learning_rate": 4.829976996181593e-06, + "loss": 3.6227, + "step": 11695 + }, + { + "epoch": 0.1190185546875, + "grad_norm": 13.322566986083984, + "learning_rate": 4.829832016759012e-06, + "loss": 3.2774, + "step": 11700 + }, + { + "epoch": 0.11906941731770833, + "grad_norm": 13.909505844116211, + "learning_rate": 4.829686977728231e-06, + "loss": 3.3357, + "step": 11705 + }, + { + "epoch": 0.11912027994791667, + "grad_norm": 12.06095027923584, + "learning_rate": 4.82954187909296e-06, + "loss": 3.9097, + "step": 11710 + }, + { + "epoch": 0.119171142578125, + "grad_norm": 11.268613815307617, + "learning_rate": 4.829396720856913e-06, + "loss": 3.2764, + "step": 11715 + }, + { + "epoch": 0.11922200520833333, + "grad_norm": 8.79672622680664, + "learning_rate": 4.829251503023803e-06, + "loss": 3.587, + "step": 11720 + }, + { + "epoch": 0.11927286783854167, + "grad_norm": 15.61196231842041, + "learning_rate": 4.8291062255973455e-06, + "loss": 3.2073, + "step": 11725 + }, + { + "epoch": 0.11932373046875, + "grad_norm": 8.96154499053955, + "learning_rate": 4.828960888581256e-06, + "loss": 3.4395, + "step": 11730 + }, + { + "epoch": 0.11937459309895833, + "grad_norm": 13.609565734863281, + "learning_rate": 4.828815491979256e-06, + "loss": 3.3523, + "step": 11735 + }, + { + "epoch": 0.11942545572916667, + "grad_norm": 8.735452651977539, + "learning_rate": 4.828670035795063e-06, + "loss": 3.3582, + "step": 11740 + }, + { + "epoch": 0.119476318359375, + "grad_norm": 13.924631118774414, + "learning_rate": 4.828524520032399e-06, + "loss": 3.3311, + "step": 11745 + }, + { + "epoch": 0.11952718098958333, + "grad_norm": 13.989679336547852, + "learning_rate": 4.828378944694987e-06, + "loss": 3.1395, + "step": 11750 + }, + { + "epoch": 0.11957804361979167, + "grad_norm": 17.6063289642334, + "learning_rate": 4.828233309786552e-06, + "loss": 3.2459, + "step": 11755 + }, + { + "epoch": 0.11962890625, + "grad_norm": 14.077485084533691, + "learning_rate": 4.828087615310819e-06, + "loss": 3.5539, + "step": 11760 + }, + { + "epoch": 0.11967976888020833, + "grad_norm": 11.900781631469727, + "learning_rate": 4.8279418612715165e-06, + "loss": 3.3846, + "step": 11765 + }, + { + "epoch": 0.11973063151041667, + "grad_norm": 9.877041816711426, + "learning_rate": 4.8277960476723726e-06, + "loss": 3.7396, + "step": 11770 + }, + { + "epoch": 0.119781494140625, + "grad_norm": 16.47829818725586, + "learning_rate": 4.82765017451712e-06, + "loss": 3.1649, + "step": 11775 + }, + { + "epoch": 0.11983235677083333, + "grad_norm": 12.586395263671875, + "learning_rate": 4.827504241809488e-06, + "loss": 3.513, + "step": 11780 + }, + { + "epoch": 0.11988321940104167, + "grad_norm": 12.90040111541748, + "learning_rate": 4.827358249553213e-06, + "loss": 3.5154, + "step": 11785 + }, + { + "epoch": 0.11993408203125, + "grad_norm": 7.8959503173828125, + "learning_rate": 4.8272121977520266e-06, + "loss": 3.5876, + "step": 11790 + }, + { + "epoch": 0.11998494466145833, + "grad_norm": 14.199788093566895, + "learning_rate": 4.82706608640967e-06, + "loss": 3.3843, + "step": 11795 + }, + { + "epoch": 0.12003580729166667, + "grad_norm": 14.042448043823242, + "learning_rate": 4.826919915529878e-06, + "loss": 2.9448, + "step": 11800 + }, + { + "epoch": 0.120086669921875, + "grad_norm": 14.99729061126709, + "learning_rate": 4.826773685116392e-06, + "loss": 3.3641, + "step": 11805 + }, + { + "epoch": 0.12013753255208333, + "grad_norm": 8.840888023376465, + "learning_rate": 4.826627395172952e-06, + "loss": 3.1467, + "step": 11810 + }, + { + "epoch": 0.12018839518229167, + "grad_norm": 8.198177337646484, + "learning_rate": 4.8264810457033025e-06, + "loss": 3.5442, + "step": 11815 + }, + { + "epoch": 0.1202392578125, + "grad_norm": 10.527442932128906, + "learning_rate": 4.826334636711186e-06, + "loss": 3.431, + "step": 11820 + }, + { + "epoch": 0.12029012044270833, + "grad_norm": 11.18076229095459, + "learning_rate": 4.82618816820035e-06, + "loss": 3.2887, + "step": 11825 + }, + { + "epoch": 0.12034098307291667, + "grad_norm": 13.812418937683105, + "learning_rate": 4.826041640174542e-06, + "loss": 3.5518, + "step": 11830 + }, + { + "epoch": 0.120391845703125, + "grad_norm": 17.44132423400879, + "learning_rate": 4.825895052637508e-06, + "loss": 3.629, + "step": 11835 + }, + { + "epoch": 0.12044270833333333, + "grad_norm": 13.55534553527832, + "learning_rate": 4.825748405593001e-06, + "loss": 3.4921, + "step": 11840 + }, + { + "epoch": 0.12049357096354167, + "grad_norm": 10.288969993591309, + "learning_rate": 4.825601699044773e-06, + "loss": 3.876, + "step": 11845 + }, + { + "epoch": 0.12054443359375, + "grad_norm": 15.818678855895996, + "learning_rate": 4.825454932996576e-06, + "loss": 3.2189, + "step": 11850 + }, + { + "epoch": 0.12059529622395833, + "grad_norm": 8.30019474029541, + "learning_rate": 4.825308107452166e-06, + "loss": 3.3629, + "step": 11855 + }, + { + "epoch": 0.12064615885416667, + "grad_norm": 8.714160919189453, + "learning_rate": 4.825161222415299e-06, + "loss": 3.2909, + "step": 11860 + }, + { + "epoch": 0.120697021484375, + "grad_norm": 8.198768615722656, + "learning_rate": 4.825014277889733e-06, + "loss": 3.5741, + "step": 11865 + }, + { + "epoch": 0.12074788411458333, + "grad_norm": 14.308550834655762, + "learning_rate": 4.824867273879229e-06, + "loss": 3.5978, + "step": 11870 + }, + { + "epoch": 0.12079874674479167, + "grad_norm": 11.020129203796387, + "learning_rate": 4.8247202103875455e-06, + "loss": 3.3011, + "step": 11875 + }, + { + "epoch": 0.120849609375, + "grad_norm": 12.168553352355957, + "learning_rate": 4.824573087418447e-06, + "loss": 3.4922, + "step": 11880 + }, + { + "epoch": 0.12090047200520833, + "grad_norm": 8.048748016357422, + "learning_rate": 4.824425904975697e-06, + "loss": 3.7659, + "step": 11885 + }, + { + "epoch": 0.12095133463541667, + "grad_norm": 13.554608345031738, + "learning_rate": 4.8242786630630615e-06, + "loss": 3.2647, + "step": 11890 + }, + { + "epoch": 0.121002197265625, + "grad_norm": 14.179122924804688, + "learning_rate": 4.824131361684308e-06, + "loss": 3.3172, + "step": 11895 + }, + { + "epoch": 0.12105305989583333, + "grad_norm": 17.352275848388672, + "learning_rate": 4.823984000843203e-06, + "loss": 3.9405, + "step": 11900 + }, + { + "epoch": 0.12110392252604167, + "grad_norm": 7.687533378601074, + "learning_rate": 4.823836580543519e-06, + "loss": 3.6776, + "step": 11905 + }, + { + "epoch": 0.12115478515625, + "grad_norm": 18.918060302734375, + "learning_rate": 4.823689100789026e-06, + "loss": 3.6594, + "step": 11910 + }, + { + "epoch": 0.12120564778645833, + "grad_norm": 16.174633026123047, + "learning_rate": 4.823541561583499e-06, + "loss": 3.4054, + "step": 11915 + }, + { + "epoch": 0.12125651041666667, + "grad_norm": 12.678868293762207, + "learning_rate": 4.8233939629307115e-06, + "loss": 3.4099, + "step": 11920 + }, + { + "epoch": 0.121307373046875, + "grad_norm": 12.389989852905273, + "learning_rate": 4.82324630483444e-06, + "loss": 3.6306, + "step": 11925 + }, + { + "epoch": 0.12135823567708333, + "grad_norm": 8.188562393188477, + "learning_rate": 4.823098587298463e-06, + "loss": 3.2383, + "step": 11930 + }, + { + "epoch": 0.12140909830729167, + "grad_norm": 8.033101081848145, + "learning_rate": 4.8229508103265595e-06, + "loss": 3.322, + "step": 11935 + }, + { + "epoch": 0.1214599609375, + "grad_norm": 11.021990776062012, + "learning_rate": 4.822802973922509e-06, + "loss": 3.704, + "step": 11940 + }, + { + "epoch": 0.12151082356770833, + "grad_norm": 6.139153003692627, + "learning_rate": 4.822655078090096e-06, + "loss": 3.2763, + "step": 11945 + }, + { + "epoch": 0.12156168619791667, + "grad_norm": 13.162734985351562, + "learning_rate": 4.822507122833104e-06, + "loss": 3.8643, + "step": 11950 + }, + { + "epoch": 0.121612548828125, + "grad_norm": 11.088370323181152, + "learning_rate": 4.8223591081553154e-06, + "loss": 3.5046, + "step": 11955 + }, + { + "epoch": 0.12166341145833333, + "grad_norm": 13.00243091583252, + "learning_rate": 4.822211034060521e-06, + "loss": 4.2186, + "step": 11960 + }, + { + "epoch": 0.12171427408854167, + "grad_norm": 11.786962509155273, + "learning_rate": 4.822062900552507e-06, + "loss": 3.4922, + "step": 11965 + }, + { + "epoch": 0.12176513671875, + "grad_norm": 8.783388137817383, + "learning_rate": 4.821914707635065e-06, + "loss": 3.3294, + "step": 11970 + }, + { + "epoch": 0.12181599934895833, + "grad_norm": 9.021117210388184, + "learning_rate": 4.821766455311986e-06, + "loss": 3.4608, + "step": 11975 + }, + { + "epoch": 0.12186686197916667, + "grad_norm": 11.563819885253906, + "learning_rate": 4.821618143587062e-06, + "loss": 3.4183, + "step": 11980 + }, + { + "epoch": 0.121917724609375, + "grad_norm": 7.746058464050293, + "learning_rate": 4.821469772464087e-06, + "loss": 3.0433, + "step": 11985 + }, + { + "epoch": 0.12196858723958333, + "grad_norm": 16.20210838317871, + "learning_rate": 4.821321341946859e-06, + "loss": 3.6232, + "step": 11990 + }, + { + "epoch": 0.12201944986979167, + "grad_norm": 14.226766586303711, + "learning_rate": 4.821172852039175e-06, + "loss": 3.5745, + "step": 11995 + }, + { + "epoch": 0.1220703125, + "grad_norm": 13.494339942932129, + "learning_rate": 4.821024302744834e-06, + "loss": 3.5401, + "step": 12000 + }, + { + "epoch": 0.12212117513020833, + "grad_norm": 9.108319282531738, + "learning_rate": 4.820875694067635e-06, + "loss": 3.2929, + "step": 12005 + }, + { + "epoch": 0.12217203776041667, + "grad_norm": 9.119816780090332, + "learning_rate": 4.820727026011382e-06, + "loss": 2.9584, + "step": 12010 + }, + { + "epoch": 0.122222900390625, + "grad_norm": 14.945995330810547, + "learning_rate": 4.820578298579879e-06, + "loss": 3.4583, + "step": 12015 + }, + { + "epoch": 0.12227376302083333, + "grad_norm": 12.933895111083984, + "learning_rate": 4.820429511776929e-06, + "loss": 3.5372, + "step": 12020 + }, + { + "epoch": 0.12232462565104167, + "grad_norm": 14.16519546508789, + "learning_rate": 4.820280665606341e-06, + "loss": 3.3085, + "step": 12025 + }, + { + "epoch": 0.12237548828125, + "grad_norm": 14.623350143432617, + "learning_rate": 4.820131760071921e-06, + "loss": 2.9811, + "step": 12030 + }, + { + "epoch": 0.12242635091145833, + "grad_norm": 13.574602127075195, + "learning_rate": 4.8199827951774805e-06, + "loss": 3.5069, + "step": 12035 + }, + { + "epoch": 0.12247721354166667, + "grad_norm": 15.912371635437012, + "learning_rate": 4.8198337709268305e-06, + "loss": 3.6601, + "step": 12040 + }, + { + "epoch": 0.122528076171875, + "grad_norm": 15.12542724609375, + "learning_rate": 4.819684687323783e-06, + "loss": 3.3761, + "step": 12045 + }, + { + "epoch": 0.12257893880208333, + "grad_norm": 8.16356372833252, + "learning_rate": 4.819535544372153e-06, + "loss": 3.474, + "step": 12050 + }, + { + "epoch": 0.12262980143229167, + "grad_norm": 9.369928359985352, + "learning_rate": 4.819386342075755e-06, + "loss": 3.548, + "step": 12055 + }, + { + "epoch": 0.1226806640625, + "grad_norm": 16.48990249633789, + "learning_rate": 4.8192370804384075e-06, + "loss": 3.5765, + "step": 12060 + }, + { + "epoch": 0.12273152669270833, + "grad_norm": 14.442497253417969, + "learning_rate": 4.819087759463929e-06, + "loss": 3.3908, + "step": 12065 + }, + { + "epoch": 0.12278238932291667, + "grad_norm": 15.705107688903809, + "learning_rate": 4.81893837915614e-06, + "loss": 3.6492, + "step": 12070 + }, + { + "epoch": 0.122833251953125, + "grad_norm": 15.82107162475586, + "learning_rate": 4.818788939518863e-06, + "loss": 3.2617, + "step": 12075 + }, + { + "epoch": 0.12288411458333333, + "grad_norm": 16.034608840942383, + "learning_rate": 4.8186394405559186e-06, + "loss": 3.2683, + "step": 12080 + }, + { + "epoch": 0.12293497721354167, + "grad_norm": 10.579744338989258, + "learning_rate": 4.818489882271135e-06, + "loss": 3.2758, + "step": 12085 + }, + { + "epoch": 0.12298583984375, + "grad_norm": 12.461812019348145, + "learning_rate": 4.818340264668337e-06, + "loss": 3.2529, + "step": 12090 + }, + { + "epoch": 0.12303670247395833, + "grad_norm": 17.07436752319336, + "learning_rate": 4.8181905877513535e-06, + "loss": 3.7906, + "step": 12095 + }, + { + "epoch": 0.12308756510416667, + "grad_norm": 11.529945373535156, + "learning_rate": 4.818040851524013e-06, + "loss": 3.5339, + "step": 12100 + }, + { + "epoch": 0.123138427734375, + "grad_norm": 15.79837417602539, + "learning_rate": 4.817891055990146e-06, + "loss": 3.4221, + "step": 12105 + }, + { + "epoch": 0.12318929036458333, + "grad_norm": 15.319779396057129, + "learning_rate": 4.817741201153587e-06, + "loss": 3.4168, + "step": 12110 + }, + { + "epoch": 0.12324015299479167, + "grad_norm": 13.032570838928223, + "learning_rate": 4.817591287018168e-06, + "loss": 3.5855, + "step": 12115 + }, + { + "epoch": 0.123291015625, + "grad_norm": 10.453932762145996, + "learning_rate": 4.817441313587725e-06, + "loss": 3.3555, + "step": 12120 + }, + { + "epoch": 0.12334187825520833, + "grad_norm": 15.642955780029297, + "learning_rate": 4.817291280866096e-06, + "loss": 3.5448, + "step": 12125 + }, + { + "epoch": 0.12339274088541667, + "grad_norm": 10.044322967529297, + "learning_rate": 4.8171411888571185e-06, + "loss": 3.3746, + "step": 12130 + }, + { + "epoch": 0.123443603515625, + "grad_norm": 11.619243621826172, + "learning_rate": 4.816991037564632e-06, + "loss": 3.6049, + "step": 12135 + }, + { + "epoch": 0.12349446614583333, + "grad_norm": 9.354294776916504, + "learning_rate": 4.81684082699248e-06, + "loss": 3.5606, + "step": 12140 + }, + { + "epoch": 0.12354532877604167, + "grad_norm": 15.51186752319336, + "learning_rate": 4.816690557144505e-06, + "loss": 3.6708, + "step": 12145 + }, + { + "epoch": 0.12359619140625, + "grad_norm": 13.874622344970703, + "learning_rate": 4.816540228024551e-06, + "loss": 3.5003, + "step": 12150 + }, + { + "epoch": 0.12364705403645833, + "grad_norm": 11.789558410644531, + "learning_rate": 4.816389839636463e-06, + "loss": 3.2361, + "step": 12155 + }, + { + "epoch": 0.12369791666666667, + "grad_norm": 14.9029541015625, + "learning_rate": 4.816239391984091e-06, + "loss": 3.8518, + "step": 12160 + }, + { + "epoch": 0.123748779296875, + "grad_norm": 16.188743591308594, + "learning_rate": 4.8160888850712835e-06, + "loss": 3.4669, + "step": 12165 + }, + { + "epoch": 0.12379964192708333, + "grad_norm": 16.114046096801758, + "learning_rate": 4.81593831890189e-06, + "loss": 3.329, + "step": 12170 + }, + { + "epoch": 0.12385050455729167, + "grad_norm": 9.05451774597168, + "learning_rate": 4.815787693479764e-06, + "loss": 3.1477, + "step": 12175 + }, + { + "epoch": 0.1239013671875, + "grad_norm": 13.087759971618652, + "learning_rate": 4.815637008808759e-06, + "loss": 3.4289, + "step": 12180 + }, + { + "epoch": 0.12395222981770833, + "grad_norm": 11.878509521484375, + "learning_rate": 4.81548626489273e-06, + "loss": 3.3253, + "step": 12185 + }, + { + "epoch": 0.12400309244791667, + "grad_norm": 23.86294937133789, + "learning_rate": 4.815335461735534e-06, + "loss": 3.7135, + "step": 12190 + }, + { + "epoch": 0.124053955078125, + "grad_norm": 12.333020210266113, + "learning_rate": 4.815184599341029e-06, + "loss": 3.5888, + "step": 12195 + }, + { + "epoch": 0.12410481770833333, + "grad_norm": 11.767260551452637, + "learning_rate": 4.8150336777130736e-06, + "loss": 3.2373, + "step": 12200 + }, + { + "epoch": 0.12415568033854167, + "grad_norm": 11.746440887451172, + "learning_rate": 4.8148826968555306e-06, + "loss": 3.3153, + "step": 12205 + }, + { + "epoch": 0.12420654296875, + "grad_norm": 12.361739158630371, + "learning_rate": 4.814731656772263e-06, + "loss": 3.4058, + "step": 12210 + }, + { + "epoch": 0.12425740559895833, + "grad_norm": 10.730027198791504, + "learning_rate": 4.8145805574671346e-06, + "loss": 3.4855, + "step": 12215 + }, + { + "epoch": 0.12430826822916667, + "grad_norm": 16.28338623046875, + "learning_rate": 4.814429398944011e-06, + "loss": 3.5275, + "step": 12220 + }, + { + "epoch": 0.124359130859375, + "grad_norm": 16.588144302368164, + "learning_rate": 4.81427818120676e-06, + "loss": 3.6741, + "step": 12225 + }, + { + "epoch": 0.12440999348958333, + "grad_norm": 15.339471817016602, + "learning_rate": 4.81412690425925e-06, + "loss": 3.9718, + "step": 12230 + }, + { + "epoch": 0.12446085611979167, + "grad_norm": 8.622764587402344, + "learning_rate": 4.8139755681053526e-06, + "loss": 3.5536, + "step": 12235 + }, + { + "epoch": 0.12451171875, + "grad_norm": 13.108068466186523, + "learning_rate": 4.813824172748938e-06, + "loss": 3.283, + "step": 12240 + }, + { + "epoch": 0.12456258138020833, + "grad_norm": 12.999624252319336, + "learning_rate": 4.8136727181938804e-06, + "loss": 3.7082, + "step": 12245 + }, + { + "epoch": 0.12461344401041667, + "grad_norm": 9.05246639251709, + "learning_rate": 4.813521204444055e-06, + "loss": 3.655, + "step": 12250 + }, + { + "epoch": 0.124664306640625, + "grad_norm": 11.840641021728516, + "learning_rate": 4.8133696315033375e-06, + "loss": 3.4432, + "step": 12255 + }, + { + "epoch": 0.12471516927083333, + "grad_norm": 12.203230857849121, + "learning_rate": 4.813217999375606e-06, + "loss": 3.1917, + "step": 12260 + }, + { + "epoch": 0.12476603190104167, + "grad_norm": 10.946849822998047, + "learning_rate": 4.813066308064741e-06, + "loss": 3.2396, + "step": 12265 + }, + { + "epoch": 0.12481689453125, + "grad_norm": 9.767146110534668, + "learning_rate": 4.812914557574622e-06, + "loss": 3.337, + "step": 12270 + }, + { + "epoch": 0.12486775716145833, + "grad_norm": 10.88211727142334, + "learning_rate": 4.8127627479091336e-06, + "loss": 3.2582, + "step": 12275 + }, + { + "epoch": 0.12491861979166667, + "grad_norm": 9.982248306274414, + "learning_rate": 4.812610879072157e-06, + "loss": 3.2663, + "step": 12280 + }, + { + "epoch": 0.124969482421875, + "grad_norm": 8.785655975341797, + "learning_rate": 4.8124589510675805e-06, + "loss": 3.1978, + "step": 12285 + }, + { + "epoch": 0.12502034505208334, + "grad_norm": 13.370884895324707, + "learning_rate": 4.812306963899289e-06, + "loss": 3.5219, + "step": 12290 + }, + { + "epoch": 0.12507120768229166, + "grad_norm": 14.19118595123291, + "learning_rate": 4.812154917571172e-06, + "loss": 3.1948, + "step": 12295 + }, + { + "epoch": 0.1251220703125, + "grad_norm": 15.058856010437012, + "learning_rate": 4.81200281208712e-06, + "loss": 3.4093, + "step": 12300 + }, + { + "epoch": 0.12517293294270834, + "grad_norm": 17.146102905273438, + "learning_rate": 4.811850647451024e-06, + "loss": 3.3073, + "step": 12305 + }, + { + "epoch": 0.12522379557291666, + "grad_norm": 10.474042892456055, + "learning_rate": 4.811698423666777e-06, + "loss": 3.3589, + "step": 12310 + }, + { + "epoch": 0.125274658203125, + "grad_norm": 12.146330833435059, + "learning_rate": 4.811546140738273e-06, + "loss": 3.4111, + "step": 12315 + }, + { + "epoch": 0.12532552083333334, + "grad_norm": 8.893861770629883, + "learning_rate": 4.811393798669409e-06, + "loss": 3.4912, + "step": 12320 + }, + { + "epoch": 0.12537638346354166, + "grad_norm": 14.769695281982422, + "learning_rate": 4.811241397464083e-06, + "loss": 3.459, + "step": 12325 + }, + { + "epoch": 0.12542724609375, + "grad_norm": 16.10647964477539, + "learning_rate": 4.811088937126194e-06, + "loss": 3.3882, + "step": 12330 + }, + { + "epoch": 0.12547810872395834, + "grad_norm": 14.999397277832031, + "learning_rate": 4.8109364176596416e-06, + "loss": 3.3708, + "step": 12335 + }, + { + "epoch": 0.12552897135416666, + "grad_norm": 11.39329719543457, + "learning_rate": 4.810783839068329e-06, + "loss": 3.5732, + "step": 12340 + }, + { + "epoch": 0.125579833984375, + "grad_norm": 10.63199234008789, + "learning_rate": 4.81063120135616e-06, + "loss": 3.5412, + "step": 12345 + }, + { + "epoch": 0.12563069661458334, + "grad_norm": 9.905284881591797, + "learning_rate": 4.81047850452704e-06, + "loss": 3.1996, + "step": 12350 + }, + { + "epoch": 0.12568155924479166, + "grad_norm": 15.250255584716797, + "learning_rate": 4.810325748584873e-06, + "loss": 3.3639, + "step": 12355 + }, + { + "epoch": 0.125732421875, + "grad_norm": 12.014054298400879, + "learning_rate": 4.8101729335335716e-06, + "loss": 3.2804, + "step": 12360 + }, + { + "epoch": 0.12578328450520834, + "grad_norm": 15.268871307373047, + "learning_rate": 4.810020059377042e-06, + "loss": 3.4151, + "step": 12365 + }, + { + "epoch": 0.12583414713541666, + "grad_norm": 15.318325996398926, + "learning_rate": 4.809867126119197e-06, + "loss": 3.6018, + "step": 12370 + }, + { + "epoch": 0.125885009765625, + "grad_norm": 13.372442245483398, + "learning_rate": 4.8097141337639485e-06, + "loss": 3.8433, + "step": 12375 + }, + { + "epoch": 0.12593587239583334, + "grad_norm": 15.433815956115723, + "learning_rate": 4.809561082315212e-06, + "loss": 3.2409, + "step": 12380 + }, + { + "epoch": 0.12598673502604166, + "grad_norm": 9.874568939208984, + "learning_rate": 4.809407971776902e-06, + "loss": 3.5737, + "step": 12385 + }, + { + "epoch": 0.12603759765625, + "grad_norm": 13.617438316345215, + "learning_rate": 4.809254802152937e-06, + "loss": 3.598, + "step": 12390 + }, + { + "epoch": 0.12608846028645834, + "grad_norm": 17.306371688842773, + "learning_rate": 4.809101573447236e-06, + "loss": 3.9022, + "step": 12395 + }, + { + "epoch": 0.12613932291666666, + "grad_norm": 12.497845649719238, + "learning_rate": 4.808948285663717e-06, + "loss": 3.292, + "step": 12400 + }, + { + "epoch": 0.126190185546875, + "grad_norm": 11.501578330993652, + "learning_rate": 4.808794938806305e-06, + "loss": 3.4564, + "step": 12405 + }, + { + "epoch": 0.12624104817708334, + "grad_norm": 15.333441734313965, + "learning_rate": 4.808641532878921e-06, + "loss": 3.257, + "step": 12410 + }, + { + "epoch": 0.12629191080729166, + "grad_norm": 9.874608993530273, + "learning_rate": 4.80848806788549e-06, + "loss": 3.2448, + "step": 12415 + }, + { + "epoch": 0.1263427734375, + "grad_norm": 9.775317192077637, + "learning_rate": 4.808334543829939e-06, + "loss": 3.4979, + "step": 12420 + }, + { + "epoch": 0.12639363606770834, + "grad_norm": 9.998175621032715, + "learning_rate": 4.808180960716196e-06, + "loss": 3.1983, + "step": 12425 + }, + { + "epoch": 0.12644449869791666, + "grad_norm": 10.001486778259277, + "learning_rate": 4.808027318548191e-06, + "loss": 3.2788, + "step": 12430 + }, + { + "epoch": 0.126495361328125, + "grad_norm": 15.605623245239258, + "learning_rate": 4.807873617329854e-06, + "loss": 3.7982, + "step": 12435 + }, + { + "epoch": 0.12654622395833334, + "grad_norm": 16.90433692932129, + "learning_rate": 4.807719857065117e-06, + "loss": 3.7252, + "step": 12440 + }, + { + "epoch": 0.12659708658854166, + "grad_norm": 9.86048412322998, + "learning_rate": 4.807566037757914e-06, + "loss": 3.3411, + "step": 12445 + }, + { + "epoch": 0.12664794921875, + "grad_norm": 9.026899337768555, + "learning_rate": 4.807412159412181e-06, + "loss": 3.1362, + "step": 12450 + }, + { + "epoch": 0.12669881184895834, + "grad_norm": 11.268509864807129, + "learning_rate": 4.807258222031855e-06, + "loss": 3.2253, + "step": 12455 + }, + { + "epoch": 0.12674967447916666, + "grad_norm": 14.036666870117188, + "learning_rate": 4.807104225620875e-06, + "loss": 3.1731, + "step": 12460 + }, + { + "epoch": 0.126800537109375, + "grad_norm": 9.418083190917969, + "learning_rate": 4.8069501701831795e-06, + "loss": 3.1714, + "step": 12465 + }, + { + "epoch": 0.12685139973958334, + "grad_norm": 12.404669761657715, + "learning_rate": 4.8067960557227114e-06, + "loss": 3.6047, + "step": 12470 + }, + { + "epoch": 0.12690226236979166, + "grad_norm": 12.391560554504395, + "learning_rate": 4.806641882243412e-06, + "loss": 3.4368, + "step": 12475 + }, + { + "epoch": 0.126953125, + "grad_norm": 13.154583930969238, + "learning_rate": 4.806487649749228e-06, + "loss": 4.0829, + "step": 12480 + }, + { + "epoch": 0.12700398763020834, + "grad_norm": 13.884321212768555, + "learning_rate": 4.806333358244103e-06, + "loss": 3.1453, + "step": 12485 + }, + { + "epoch": 0.12705485026041666, + "grad_norm": 11.85291862487793, + "learning_rate": 4.806179007731986e-06, + "loss": 3.4844, + "step": 12490 + }, + { + "epoch": 0.127105712890625, + "grad_norm": 12.22746753692627, + "learning_rate": 4.806024598216826e-06, + "loss": 3.7238, + "step": 12495 + }, + { + "epoch": 0.12715657552083334, + "grad_norm": 14.834999084472656, + "learning_rate": 4.805870129702573e-06, + "loss": 3.7607, + "step": 12500 + }, + { + "epoch": 0.12720743815104166, + "grad_norm": 18.713239669799805, + "learning_rate": 4.8057156021931795e-06, + "loss": 3.5314, + "step": 12505 + }, + { + "epoch": 0.12725830078125, + "grad_norm": 15.553114891052246, + "learning_rate": 4.8055610156925984e-06, + "loss": 3.9413, + "step": 12510 + }, + { + "epoch": 0.12730916341145834, + "grad_norm": 12.916542053222656, + "learning_rate": 4.805406370204785e-06, + "loss": 3.3368, + "step": 12515 + }, + { + "epoch": 0.12736002604166666, + "grad_norm": 13.694195747375488, + "learning_rate": 4.805251665733696e-06, + "loss": 3.2709, + "step": 12520 + }, + { + "epoch": 0.127410888671875, + "grad_norm": 10.602334976196289, + "learning_rate": 4.805096902283291e-06, + "loss": 3.3952, + "step": 12525 + }, + { + "epoch": 0.12746175130208334, + "grad_norm": 14.200652122497559, + "learning_rate": 4.804942079857527e-06, + "loss": 3.5215, + "step": 12530 + }, + { + "epoch": 0.12751261393229166, + "grad_norm": 7.785111427307129, + "learning_rate": 4.804787198460366e-06, + "loss": 3.3147, + "step": 12535 + }, + { + "epoch": 0.1275634765625, + "grad_norm": 12.794201850891113, + "learning_rate": 4.804632258095772e-06, + "loss": 3.4677, + "step": 12540 + }, + { + "epoch": 0.12761433919270834, + "grad_norm": 12.663290977478027, + "learning_rate": 4.804477258767707e-06, + "loss": 3.6271, + "step": 12545 + }, + { + "epoch": 0.12766520182291666, + "grad_norm": 10.109106063842773, + "learning_rate": 4.804322200480138e-06, + "loss": 3.5064, + "step": 12550 + }, + { + "epoch": 0.127716064453125, + "grad_norm": 15.36133861541748, + "learning_rate": 4.804167083237031e-06, + "loss": 3.4629, + "step": 12555 + }, + { + "epoch": 0.12776692708333334, + "grad_norm": 12.881776809692383, + "learning_rate": 4.804011907042356e-06, + "loss": 3.4292, + "step": 12560 + }, + { + "epoch": 0.12781778971354166, + "grad_norm": 8.816761016845703, + "learning_rate": 4.8038566719000825e-06, + "loss": 3.0355, + "step": 12565 + }, + { + "epoch": 0.12786865234375, + "grad_norm": 15.458996772766113, + "learning_rate": 4.803701377814181e-06, + "loss": 4.0244, + "step": 12570 + }, + { + "epoch": 0.12791951497395834, + "grad_norm": 9.55286979675293, + "learning_rate": 4.803546024788628e-06, + "loss": 3.4125, + "step": 12575 + }, + { + "epoch": 0.12797037760416666, + "grad_norm": 15.313224792480469, + "learning_rate": 4.803390612827394e-06, + "loss": 3.4685, + "step": 12580 + }, + { + "epoch": 0.128021240234375, + "grad_norm": 13.189021110534668, + "learning_rate": 4.803235141934458e-06, + "loss": 3.3217, + "step": 12585 + }, + { + "epoch": 0.12807210286458334, + "grad_norm": 16.103500366210938, + "learning_rate": 4.803079612113796e-06, + "loss": 3.1009, + "step": 12590 + }, + { + "epoch": 0.12812296549479166, + "grad_norm": 16.239477157592773, + "learning_rate": 4.802924023369388e-06, + "loss": 3.4193, + "step": 12595 + }, + { + "epoch": 0.128173828125, + "grad_norm": 11.9988374710083, + "learning_rate": 4.802768375705216e-06, + "loss": 3.6531, + "step": 12600 + }, + { + "epoch": 0.12822469075520834, + "grad_norm": 11.093536376953125, + "learning_rate": 4.802612669125261e-06, + "loss": 3.2401, + "step": 12605 + }, + { + "epoch": 0.12827555338541666, + "grad_norm": 10.944393157958984, + "learning_rate": 4.8024569036335055e-06, + "loss": 3.2432, + "step": 12610 + }, + { + "epoch": 0.128326416015625, + "grad_norm": 14.31828498840332, + "learning_rate": 4.802301079233936e-06, + "loss": 3.6411, + "step": 12615 + }, + { + "epoch": 0.12837727864583334, + "grad_norm": 9.36100959777832, + "learning_rate": 4.802145195930539e-06, + "loss": 3.4192, + "step": 12620 + }, + { + "epoch": 0.12842814127604166, + "grad_norm": 13.963410377502441, + "learning_rate": 4.801989253727303e-06, + "loss": 3.1725, + "step": 12625 + }, + { + "epoch": 0.12847900390625, + "grad_norm": 16.763933181762695, + "learning_rate": 4.801833252628218e-06, + "loss": 3.3481, + "step": 12630 + }, + { + "epoch": 0.12852986653645834, + "grad_norm": 12.801989555358887, + "learning_rate": 4.801677192637275e-06, + "loss": 3.5976, + "step": 12635 + }, + { + "epoch": 0.12858072916666666, + "grad_norm": 81.20991516113281, + "learning_rate": 4.801521073758466e-06, + "loss": 3.3127, + "step": 12640 + }, + { + "epoch": 0.128631591796875, + "grad_norm": 13.878232955932617, + "learning_rate": 4.801364895995786e-06, + "loss": 3.313, + "step": 12645 + }, + { + "epoch": 0.12868245442708334, + "grad_norm": 10.5735445022583, + "learning_rate": 4.8012086593532306e-06, + "loss": 3.298, + "step": 12650 + }, + { + "epoch": 0.12873331705729166, + "grad_norm": 10.607198715209961, + "learning_rate": 4.8010523638347965e-06, + "loss": 3.4635, + "step": 12655 + }, + { + "epoch": 0.1287841796875, + "grad_norm": 10.28955078125, + "learning_rate": 4.800896009444484e-06, + "loss": 3.5608, + "step": 12660 + }, + { + "epoch": 0.12883504231770834, + "grad_norm": 12.767330169677734, + "learning_rate": 4.800739596186293e-06, + "loss": 3.222, + "step": 12665 + }, + { + "epoch": 0.12888590494791666, + "grad_norm": 15.003509521484375, + "learning_rate": 4.800583124064223e-06, + "loss": 3.2579, + "step": 12670 + }, + { + "epoch": 0.128936767578125, + "grad_norm": 14.754562377929688, + "learning_rate": 4.80042659308228e-06, + "loss": 3.2074, + "step": 12675 + }, + { + "epoch": 0.12898763020833334, + "grad_norm": 15.33542251586914, + "learning_rate": 4.800270003244467e-06, + "loss": 3.2303, + "step": 12680 + }, + { + "epoch": 0.12903849283854166, + "grad_norm": 14.25710391998291, + "learning_rate": 4.800113354554793e-06, + "loss": 3.8232, + "step": 12685 + }, + { + "epoch": 0.12908935546875, + "grad_norm": 16.291183471679688, + "learning_rate": 4.799956647017262e-06, + "loss": 3.3102, + "step": 12690 + }, + { + "epoch": 0.12914021809895834, + "grad_norm": 13.827568054199219, + "learning_rate": 4.799799880635887e-06, + "loss": 3.8675, + "step": 12695 + }, + { + "epoch": 0.12919108072916666, + "grad_norm": 13.914209365844727, + "learning_rate": 4.799643055414677e-06, + "loss": 3.0558, + "step": 12700 + }, + { + "epoch": 0.129241943359375, + "grad_norm": 10.144859313964844, + "learning_rate": 4.799486171357644e-06, + "loss": 3.4235, + "step": 12705 + }, + { + "epoch": 0.12929280598958334, + "grad_norm": 10.29310417175293, + "learning_rate": 4.799329228468802e-06, + "loss": 3.4347, + "step": 12710 + }, + { + "epoch": 0.12934366861979166, + "grad_norm": 13.560129165649414, + "learning_rate": 4.7991722267521665e-06, + "loss": 3.3288, + "step": 12715 + }, + { + "epoch": 0.12939453125, + "grad_norm": 13.470560073852539, + "learning_rate": 4.799015166211756e-06, + "loss": 3.2676, + "step": 12720 + }, + { + "epoch": 0.12944539388020834, + "grad_norm": 12.207547187805176, + "learning_rate": 4.798858046851587e-06, + "loss": 3.0864, + "step": 12725 + }, + { + "epoch": 0.12949625651041666, + "grad_norm": 15.827969551086426, + "learning_rate": 4.798700868675679e-06, + "loss": 3.3807, + "step": 12730 + }, + { + "epoch": 0.129547119140625, + "grad_norm": 14.415461540222168, + "learning_rate": 4.798543631688054e-06, + "loss": 3.0016, + "step": 12735 + }, + { + "epoch": 0.12959798177083334, + "grad_norm": 7.998185157775879, + "learning_rate": 4.798386335892735e-06, + "loss": 3.2422, + "step": 12740 + }, + { + "epoch": 0.12964884440104166, + "grad_norm": 7.472536087036133, + "learning_rate": 4.798228981293747e-06, + "loss": 3.4627, + "step": 12745 + }, + { + "epoch": 0.12969970703125, + "grad_norm": 7.803865909576416, + "learning_rate": 4.798071567895115e-06, + "loss": 3.4333, + "step": 12750 + }, + { + "epoch": 0.12975056966145834, + "grad_norm": 11.206137657165527, + "learning_rate": 4.797914095700867e-06, + "loss": 3.2908, + "step": 12755 + }, + { + "epoch": 0.12980143229166666, + "grad_norm": 13.730634689331055, + "learning_rate": 4.797756564715031e-06, + "loss": 3.4254, + "step": 12760 + }, + { + "epoch": 0.129852294921875, + "grad_norm": 13.240988731384277, + "learning_rate": 4.797598974941638e-06, + "loss": 3.8927, + "step": 12765 + }, + { + "epoch": 0.12990315755208334, + "grad_norm": 14.986083030700684, + "learning_rate": 4.79744132638472e-06, + "loss": 2.9281, + "step": 12770 + }, + { + "epoch": 0.12995402018229166, + "grad_norm": 12.508495330810547, + "learning_rate": 4.79728361904831e-06, + "loss": 3.2039, + "step": 12775 + }, + { + "epoch": 0.1300048828125, + "grad_norm": 13.931479454040527, + "learning_rate": 4.797125852936444e-06, + "loss": 3.3995, + "step": 12780 + }, + { + "epoch": 0.13005574544270834, + "grad_norm": 15.499288558959961, + "learning_rate": 4.796968028053156e-06, + "loss": 3.5259, + "step": 12785 + }, + { + "epoch": 0.13010660807291666, + "grad_norm": 15.569916725158691, + "learning_rate": 4.796810144402486e-06, + "loss": 3.5487, + "step": 12790 + }, + { + "epoch": 0.130157470703125, + "grad_norm": 11.951336860656738, + "learning_rate": 4.796652201988474e-06, + "loss": 3.1815, + "step": 12795 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 10.30989933013916, + "learning_rate": 4.796494200815158e-06, + "loss": 3.4612, + "step": 12800 + }, + { + "epoch": 0.13025919596354166, + "grad_norm": 10.12043571472168, + "learning_rate": 4.796336140886584e-06, + "loss": 3.9442, + "step": 12805 + }, + { + "epoch": 0.13031005859375, + "grad_norm": 16.965572357177734, + "learning_rate": 4.796178022206793e-06, + "loss": 3.603, + "step": 12810 + }, + { + "epoch": 0.13036092122395834, + "grad_norm": 12.111398696899414, + "learning_rate": 4.796019844779831e-06, + "loss": 3.6743, + "step": 12815 + }, + { + "epoch": 0.13041178385416666, + "grad_norm": 14.302387237548828, + "learning_rate": 4.795861608609747e-06, + "loss": 3.2853, + "step": 12820 + }, + { + "epoch": 0.130462646484375, + "grad_norm": 12.947281837463379, + "learning_rate": 4.795703313700587e-06, + "loss": 3.3062, + "step": 12825 + }, + { + "epoch": 0.13051350911458334, + "grad_norm": 12.033870697021484, + "learning_rate": 4.795544960056402e-06, + "loss": 3.1838, + "step": 12830 + }, + { + "epoch": 0.13056437174479166, + "grad_norm": 9.891314506530762, + "learning_rate": 4.7953865476812435e-06, + "loss": 3.4533, + "step": 12835 + }, + { + "epoch": 0.130615234375, + "grad_norm": 11.173048973083496, + "learning_rate": 4.795228076579164e-06, + "loss": 3.3019, + "step": 12840 + }, + { + "epoch": 0.13066609700520834, + "grad_norm": 58.573089599609375, + "learning_rate": 4.795069546754219e-06, + "loss": 3.2771, + "step": 12845 + }, + { + "epoch": 0.13071695963541666, + "grad_norm": 10.534260749816895, + "learning_rate": 4.794910958210463e-06, + "loss": 3.5155, + "step": 12850 + }, + { + "epoch": 0.130767822265625, + "grad_norm": 10.294408798217773, + "learning_rate": 4.7947523109519535e-06, + "loss": 3.1176, + "step": 12855 + }, + { + "epoch": 0.13081868489583334, + "grad_norm": 8.581644058227539, + "learning_rate": 4.79459360498275e-06, + "loss": 3.6988, + "step": 12860 + }, + { + "epoch": 0.13086954752604166, + "grad_norm": 7.670634746551514, + "learning_rate": 4.794434840306914e-06, + "loss": 3.4395, + "step": 12865 + }, + { + "epoch": 0.13092041015625, + "grad_norm": 8.824413299560547, + "learning_rate": 4.794276016928506e-06, + "loss": 3.7669, + "step": 12870 + }, + { + "epoch": 0.13097127278645834, + "grad_norm": 16.448566436767578, + "learning_rate": 4.794117134851589e-06, + "loss": 3.6758, + "step": 12875 + }, + { + "epoch": 0.13102213541666666, + "grad_norm": 17.62189483642578, + "learning_rate": 4.79395819408023e-06, + "loss": 3.4432, + "step": 12880 + }, + { + "epoch": 0.131072998046875, + "grad_norm": 13.63729190826416, + "learning_rate": 4.793799194618495e-06, + "loss": 3.1668, + "step": 12885 + }, + { + "epoch": 0.13112386067708334, + "grad_norm": 9.173359870910645, + "learning_rate": 4.79364013647045e-06, + "loss": 3.28, + "step": 12890 + }, + { + "epoch": 0.13117472330729166, + "grad_norm": 10.53978157043457, + "learning_rate": 4.793481019640166e-06, + "loss": 3.2492, + "step": 12895 + }, + { + "epoch": 0.1312255859375, + "grad_norm": 11.081829071044922, + "learning_rate": 4.793321844131714e-06, + "loss": 3.3987, + "step": 12900 + }, + { + "epoch": 0.13127644856770834, + "grad_norm": 8.339278221130371, + "learning_rate": 4.793162609949166e-06, + "loss": 3.1725, + "step": 12905 + }, + { + "epoch": 0.13132731119791666, + "grad_norm": 9.031716346740723, + "learning_rate": 4.793003317096596e-06, + "loss": 3.2547, + "step": 12910 + }, + { + "epoch": 0.131378173828125, + "grad_norm": 12.638387680053711, + "learning_rate": 4.79284396557808e-06, + "loss": 3.367, + "step": 12915 + }, + { + "epoch": 0.13142903645833334, + "grad_norm": 12.951153755187988, + "learning_rate": 4.7926845553976945e-06, + "loss": 3.2392, + "step": 12920 + }, + { + "epoch": 0.13147989908854166, + "grad_norm": 15.467037200927734, + "learning_rate": 4.792525086559518e-06, + "loss": 3.5068, + "step": 12925 + }, + { + "epoch": 0.13153076171875, + "grad_norm": 8.698431015014648, + "learning_rate": 4.792365559067631e-06, + "loss": 3.3362, + "step": 12930 + }, + { + "epoch": 0.13158162434895834, + "grad_norm": 14.218523979187012, + "learning_rate": 4.792205972926114e-06, + "loss": 3.0097, + "step": 12935 + }, + { + "epoch": 0.13163248697916666, + "grad_norm": 7.461666584014893, + "learning_rate": 4.792046328139051e-06, + "loss": 3.3224, + "step": 12940 + }, + { + "epoch": 0.131683349609375, + "grad_norm": 15.739773750305176, + "learning_rate": 4.791886624710525e-06, + "loss": 3.8441, + "step": 12945 + }, + { + "epoch": 0.13173421223958334, + "grad_norm": 10.866748809814453, + "learning_rate": 4.791726862644623e-06, + "loss": 3.861, + "step": 12950 + }, + { + "epoch": 0.13178507486979166, + "grad_norm": 9.433417320251465, + "learning_rate": 4.791567041945433e-06, + "loss": 3.0838, + "step": 12955 + }, + { + "epoch": 0.1318359375, + "grad_norm": 16.90796661376953, + "learning_rate": 4.791407162617043e-06, + "loss": 3.7086, + "step": 12960 + }, + { + "epoch": 0.13188680013020834, + "grad_norm": 14.038589477539062, + "learning_rate": 4.791247224663545e-06, + "loss": 3.4241, + "step": 12965 + }, + { + "epoch": 0.13193766276041666, + "grad_norm": 8.530431747436523, + "learning_rate": 4.791087228089029e-06, + "loss": 3.4557, + "step": 12970 + }, + { + "epoch": 0.131988525390625, + "grad_norm": 16.40488624572754, + "learning_rate": 4.790927172897589e-06, + "loss": 3.7428, + "step": 12975 + }, + { + "epoch": 0.13203938802083334, + "grad_norm": 10.48965072631836, + "learning_rate": 4.790767059093321e-06, + "loss": 3.8246, + "step": 12980 + }, + { + "epoch": 0.13209025065104166, + "grad_norm": 11.309000968933105, + "learning_rate": 4.79060688668032e-06, + "loss": 3.3004, + "step": 12985 + }, + { + "epoch": 0.13214111328125, + "grad_norm": 13.044082641601562, + "learning_rate": 4.790446655662686e-06, + "loss": 3.4936, + "step": 12990 + }, + { + "epoch": 0.13219197591145834, + "grad_norm": 14.110028266906738, + "learning_rate": 4.790286366044516e-06, + "loss": 2.9904, + "step": 12995 + }, + { + "epoch": 0.13224283854166666, + "grad_norm": 16.38824462890625, + "learning_rate": 4.790126017829913e-06, + "loss": 3.1172, + "step": 13000 + }, + { + "epoch": 0.132293701171875, + "grad_norm": 12.276939392089844, + "learning_rate": 4.789965611022977e-06, + "loss": 3.588, + "step": 13005 + }, + { + "epoch": 0.13234456380208334, + "grad_norm": 12.605525016784668, + "learning_rate": 4.7898051456278155e-06, + "loss": 3.1167, + "step": 13010 + }, + { + "epoch": 0.13239542643229166, + "grad_norm": 10.54214096069336, + "learning_rate": 4.7896446216485314e-06, + "loss": 3.496, + "step": 13015 + }, + { + "epoch": 0.1324462890625, + "grad_norm": 11.415157318115234, + "learning_rate": 4.789484039089232e-06, + "loss": 3.5018, + "step": 13020 + }, + { + "epoch": 0.13249715169270834, + "grad_norm": 10.655110359191895, + "learning_rate": 4.789323397954027e-06, + "loss": 3.5284, + "step": 13025 + }, + { + "epoch": 0.13254801432291666, + "grad_norm": 12.17111873626709, + "learning_rate": 4.789162698247024e-06, + "loss": 3.0777, + "step": 13030 + }, + { + "epoch": 0.132598876953125, + "grad_norm": 14.725519180297852, + "learning_rate": 4.789001939972338e-06, + "loss": 3.358, + "step": 13035 + }, + { + "epoch": 0.13264973958333334, + "grad_norm": 15.81676959991455, + "learning_rate": 4.7888411231340785e-06, + "loss": 3.2394, + "step": 13040 + }, + { + "epoch": 0.13270060221354166, + "grad_norm": 9.43869686126709, + "learning_rate": 4.788680247736362e-06, + "loss": 3.3947, + "step": 13045 + }, + { + "epoch": 0.13275146484375, + "grad_norm": 14.853139877319336, + "learning_rate": 4.788519313783303e-06, + "loss": 3.0722, + "step": 13050 + }, + { + "epoch": 0.13280232747395834, + "grad_norm": 12.241186141967773, + "learning_rate": 4.788358321279021e-06, + "loss": 3.8822, + "step": 13055 + }, + { + "epoch": 0.13285319010416666, + "grad_norm": 8.0377779006958, + "learning_rate": 4.788197270227633e-06, + "loss": 3.4792, + "step": 13060 + }, + { + "epoch": 0.132904052734375, + "grad_norm": 13.722543716430664, + "learning_rate": 4.78803616063326e-06, + "loss": 3.3762, + "step": 13065 + }, + { + "epoch": 0.13295491536458334, + "grad_norm": 9.478768348693848, + "learning_rate": 4.787874992500024e-06, + "loss": 3.1809, + "step": 13070 + }, + { + "epoch": 0.13300577799479166, + "grad_norm": 9.255359649658203, + "learning_rate": 4.7877137658320496e-06, + "loss": 3.4478, + "step": 13075 + }, + { + "epoch": 0.133056640625, + "grad_norm": 14.590103149414062, + "learning_rate": 4.7875524806334605e-06, + "loss": 3.4376, + "step": 13080 + }, + { + "epoch": 0.13310750325520834, + "grad_norm": 9.905108451843262, + "learning_rate": 4.787391136908383e-06, + "loss": 3.4683, + "step": 13085 + }, + { + "epoch": 0.13315836588541666, + "grad_norm": 14.800880432128906, + "learning_rate": 4.787229734660945e-06, + "loss": 3.3281, + "step": 13090 + }, + { + "epoch": 0.133209228515625, + "grad_norm": 15.413259506225586, + "learning_rate": 4.787068273895278e-06, + "loss": 3.222, + "step": 13095 + }, + { + "epoch": 0.13326009114583334, + "grad_norm": 14.764119148254395, + "learning_rate": 4.7869067546155105e-06, + "loss": 3.3957, + "step": 13100 + }, + { + "epoch": 0.13331095377604166, + "grad_norm": 10.476200103759766, + "learning_rate": 4.786745176825775e-06, + "loss": 3.2024, + "step": 13105 + }, + { + "epoch": 0.13336181640625, + "grad_norm": 6.172470569610596, + "learning_rate": 4.786583540530206e-06, + "loss": 3.0385, + "step": 13110 + }, + { + "epoch": 0.13341267903645834, + "grad_norm": 16.662702560424805, + "learning_rate": 4.78642184573294e-06, + "loss": 3.3144, + "step": 13115 + }, + { + "epoch": 0.13346354166666666, + "grad_norm": 14.16555404663086, + "learning_rate": 4.786260092438113e-06, + "loss": 3.4183, + "step": 13120 + }, + { + "epoch": 0.133514404296875, + "grad_norm": 13.170214653015137, + "learning_rate": 4.7860982806498635e-06, + "loss": 3.2948, + "step": 13125 + }, + { + "epoch": 0.13356526692708334, + "grad_norm": 14.692107200622559, + "learning_rate": 4.78593641037233e-06, + "loss": 3.583, + "step": 13130 + }, + { + "epoch": 0.13361612955729166, + "grad_norm": 14.330788612365723, + "learning_rate": 4.785774481609657e-06, + "loss": 3.5935, + "step": 13135 + }, + { + "epoch": 0.1336669921875, + "grad_norm": 10.425037384033203, + "learning_rate": 4.785612494365985e-06, + "loss": 3.4952, + "step": 13140 + }, + { + "epoch": 0.13371785481770834, + "grad_norm": 9.754368782043457, + "learning_rate": 4.785450448645459e-06, + "loss": 3.1403, + "step": 13145 + }, + { + "epoch": 0.13376871744791666, + "grad_norm": 12.570056915283203, + "learning_rate": 4.785288344452226e-06, + "loss": 4.0699, + "step": 13150 + }, + { + "epoch": 0.133819580078125, + "grad_norm": 9.117902755737305, + "learning_rate": 4.78512618179043e-06, + "loss": 3.8218, + "step": 13155 + }, + { + "epoch": 0.13387044270833334, + "grad_norm": 13.497501373291016, + "learning_rate": 4.784963960664224e-06, + "loss": 3.5526, + "step": 13160 + }, + { + "epoch": 0.13392130533854166, + "grad_norm": 12.687576293945312, + "learning_rate": 4.784801681077757e-06, + "loss": 3.5295, + "step": 13165 + }, + { + "epoch": 0.13397216796875, + "grad_norm": 9.852676391601562, + "learning_rate": 4.78463934303518e-06, + "loss": 3.4353, + "step": 13170 + }, + { + "epoch": 0.13402303059895834, + "grad_norm": 11.296305656433105, + "learning_rate": 4.7844769465406464e-06, + "loss": 3.3819, + "step": 13175 + }, + { + "epoch": 0.13407389322916666, + "grad_norm": 9.368551254272461, + "learning_rate": 4.784314491598312e-06, + "loss": 3.1461, + "step": 13180 + }, + { + "epoch": 0.134124755859375, + "grad_norm": 9.357016563415527, + "learning_rate": 4.784151978212333e-06, + "loss": 3.582, + "step": 13185 + }, + { + "epoch": 0.13417561848958334, + "grad_norm": 11.293736457824707, + "learning_rate": 4.783989406386867e-06, + "loss": 3.5203, + "step": 13190 + }, + { + "epoch": 0.13422648111979166, + "grad_norm": 13.790380477905273, + "learning_rate": 4.783826776126073e-06, + "loss": 3.3007, + "step": 13195 + }, + { + "epoch": 0.13427734375, + "grad_norm": 14.679352760314941, + "learning_rate": 4.783664087434112e-06, + "loss": 3.8487, + "step": 13200 + }, + { + "epoch": 0.13432820638020834, + "grad_norm": 15.090688705444336, + "learning_rate": 4.783501340315147e-06, + "loss": 2.9251, + "step": 13205 + }, + { + "epoch": 0.13437906901041666, + "grad_norm": 15.12507152557373, + "learning_rate": 4.783338534773343e-06, + "loss": 3.2318, + "step": 13210 + }, + { + "epoch": 0.134429931640625, + "grad_norm": 10.328474044799805, + "learning_rate": 4.783175670812862e-06, + "loss": 3.3133, + "step": 13215 + }, + { + "epoch": 0.13448079427083334, + "grad_norm": 9.897954940795898, + "learning_rate": 4.783012748437873e-06, + "loss": 3.3901, + "step": 13220 + }, + { + "epoch": 0.13453165690104166, + "grad_norm": 12.634456634521484, + "learning_rate": 4.782849767652544e-06, + "loss": 3.4324, + "step": 13225 + }, + { + "epoch": 0.13458251953125, + "grad_norm": 13.496397972106934, + "learning_rate": 4.782686728461044e-06, + "loss": 3.4852, + "step": 13230 + }, + { + "epoch": 0.13463338216145834, + "grad_norm": 9.415319442749023, + "learning_rate": 4.782523630867546e-06, + "loss": 3.8193, + "step": 13235 + }, + { + "epoch": 0.13468424479166666, + "grad_norm": 14.183247566223145, + "learning_rate": 4.782360474876222e-06, + "loss": 3.7154, + "step": 13240 + }, + { + "epoch": 0.134735107421875, + "grad_norm": 9.120186805725098, + "learning_rate": 4.7821972604912464e-06, + "loss": 3.0765, + "step": 13245 + }, + { + "epoch": 0.13478597005208334, + "grad_norm": 15.911162376403809, + "learning_rate": 4.782033987716794e-06, + "loss": 3.2823, + "step": 13250 + }, + { + "epoch": 0.13483683268229166, + "grad_norm": 12.479145050048828, + "learning_rate": 4.781870656557044e-06, + "loss": 3.3999, + "step": 13255 + }, + { + "epoch": 0.1348876953125, + "grad_norm": 10.130742073059082, + "learning_rate": 4.781707267016174e-06, + "loss": 3.7453, + "step": 13260 + }, + { + "epoch": 0.13493855794270834, + "grad_norm": 13.885787963867188, + "learning_rate": 4.781543819098363e-06, + "loss": 3.348, + "step": 13265 + }, + { + "epoch": 0.13498942057291666, + "grad_norm": 9.897067070007324, + "learning_rate": 4.781380312807795e-06, + "loss": 3.6482, + "step": 13270 + }, + { + "epoch": 0.135040283203125, + "grad_norm": 10.115368843078613, + "learning_rate": 4.781216748148653e-06, + "loss": 3.5157, + "step": 13275 + }, + { + "epoch": 0.13509114583333334, + "grad_norm": 11.32646369934082, + "learning_rate": 4.78105312512512e-06, + "loss": 3.3044, + "step": 13280 + }, + { + "epoch": 0.13514200846354166, + "grad_norm": 12.162534713745117, + "learning_rate": 4.780889443741384e-06, + "loss": 3.3909, + "step": 13285 + }, + { + "epoch": 0.13519287109375, + "grad_norm": 14.136061668395996, + "learning_rate": 4.780725704001633e-06, + "loss": 3.5301, + "step": 13290 + }, + { + "epoch": 0.13524373372395834, + "grad_norm": 12.348136901855469, + "learning_rate": 4.780561905910055e-06, + "loss": 3.979, + "step": 13295 + }, + { + "epoch": 0.13529459635416666, + "grad_norm": 12.501129150390625, + "learning_rate": 4.780398049470841e-06, + "loss": 3.6263, + "step": 13300 + }, + { + "epoch": 0.135345458984375, + "grad_norm": 16.70313835144043, + "learning_rate": 4.780234134688184e-06, + "loss": 3.584, + "step": 13305 + }, + { + "epoch": 0.13539632161458334, + "grad_norm": 12.98574447631836, + "learning_rate": 4.780070161566276e-06, + "loss": 2.9994, + "step": 13310 + }, + { + "epoch": 0.13544718424479166, + "grad_norm": 9.226192474365234, + "learning_rate": 4.7799061301093144e-06, + "loss": 3.1362, + "step": 13315 + }, + { + "epoch": 0.135498046875, + "grad_norm": 7.743569850921631, + "learning_rate": 4.779742040321494e-06, + "loss": 3.789, + "step": 13320 + }, + { + "epoch": 0.13554890950520834, + "grad_norm": 10.638326644897461, + "learning_rate": 4.779577892207015e-06, + "loss": 3.5861, + "step": 13325 + }, + { + "epoch": 0.13559977213541666, + "grad_norm": 10.221707344055176, + "learning_rate": 4.779413685770075e-06, + "loss": 3.1357, + "step": 13330 + }, + { + "epoch": 0.135650634765625, + "grad_norm": 12.546625137329102, + "learning_rate": 4.779249421014876e-06, + "loss": 3.2787, + "step": 13335 + }, + { + "epoch": 0.13570149739583334, + "grad_norm": 15.528656959533691, + "learning_rate": 4.779085097945621e-06, + "loss": 3.4499, + "step": 13340 + }, + { + "epoch": 0.13575236002604166, + "grad_norm": 15.490991592407227, + "learning_rate": 4.778920716566514e-06, + "loss": 3.7281, + "step": 13345 + }, + { + "epoch": 0.13580322265625, + "grad_norm": 12.92549991607666, + "learning_rate": 4.7787562768817605e-06, + "loss": 3.3561, + "step": 13350 + }, + { + "epoch": 0.13585408528645834, + "grad_norm": 14.30997371673584, + "learning_rate": 4.778591778895568e-06, + "loss": 3.4102, + "step": 13355 + }, + { + "epoch": 0.13590494791666666, + "grad_norm": 9.108956336975098, + "learning_rate": 4.778427222612145e-06, + "loss": 3.2839, + "step": 13360 + }, + { + "epoch": 0.135955810546875, + "grad_norm": 15.586246490478516, + "learning_rate": 4.778262608035702e-06, + "loss": 3.6318, + "step": 13365 + }, + { + "epoch": 0.13600667317708334, + "grad_norm": 12.77320671081543, + "learning_rate": 4.778097935170449e-06, + "loss": 3.4799, + "step": 13370 + }, + { + "epoch": 0.13605753580729166, + "grad_norm": 8.657517433166504, + "learning_rate": 4.777933204020602e-06, + "loss": 3.2848, + "step": 13375 + }, + { + "epoch": 0.1361083984375, + "grad_norm": 13.57066822052002, + "learning_rate": 4.777768414590372e-06, + "loss": 3.0883, + "step": 13380 + }, + { + "epoch": 0.13615926106770834, + "grad_norm": 15.412737846374512, + "learning_rate": 4.777603566883978e-06, + "loss": 3.3734, + "step": 13385 + }, + { + "epoch": 0.13621012369791666, + "grad_norm": 7.619217395782471, + "learning_rate": 4.777438660905637e-06, + "loss": 3.2357, + "step": 13390 + }, + { + "epoch": 0.136260986328125, + "grad_norm": 8.894469261169434, + "learning_rate": 4.777273696659567e-06, + "loss": 3.3946, + "step": 13395 + }, + { + "epoch": 0.13631184895833334, + "grad_norm": 11.534381866455078, + "learning_rate": 4.7771086741499895e-06, + "loss": 3.3128, + "step": 13400 + }, + { + "epoch": 0.13636271158854166, + "grad_norm": 9.838509559631348, + "learning_rate": 4.776943593381126e-06, + "loss": 3.5254, + "step": 13405 + }, + { + "epoch": 0.13641357421875, + "grad_norm": 12.967144012451172, + "learning_rate": 4.7767784543572e-06, + "loss": 3.1434, + "step": 13410 + }, + { + "epoch": 0.13646443684895834, + "grad_norm": 9.366642951965332, + "learning_rate": 4.776613257082439e-06, + "loss": 2.9998, + "step": 13415 + }, + { + "epoch": 0.13651529947916666, + "grad_norm": 10.693907737731934, + "learning_rate": 4.776448001561065e-06, + "loss": 3.9294, + "step": 13420 + }, + { + "epoch": 0.136566162109375, + "grad_norm": 14.118185997009277, + "learning_rate": 4.7762826877973095e-06, + "loss": 3.4184, + "step": 13425 + }, + { + "epoch": 0.13661702473958334, + "grad_norm": 16.79452133178711, + "learning_rate": 4.776117315795401e-06, + "loss": 3.4651, + "step": 13430 + }, + { + "epoch": 0.13666788736979166, + "grad_norm": 7.560031890869141, + "learning_rate": 4.77595188555957e-06, + "loss": 3.3598, + "step": 13435 + }, + { + "epoch": 0.13671875, + "grad_norm": 8.069879531860352, + "learning_rate": 4.77578639709405e-06, + "loss": 3.3903, + "step": 13440 + }, + { + "epoch": 0.13676961263020834, + "grad_norm": 14.968077659606934, + "learning_rate": 4.775620850403075e-06, + "loss": 3.1777, + "step": 13445 + }, + { + "epoch": 0.13682047526041666, + "grad_norm": 11.37788200378418, + "learning_rate": 4.775455245490879e-06, + "loss": 3.1365, + "step": 13450 + }, + { + "epoch": 0.136871337890625, + "grad_norm": 11.77219295501709, + "learning_rate": 4.7752895823616995e-06, + "loss": 3.397, + "step": 13455 + }, + { + "epoch": 0.13692220052083334, + "grad_norm": 11.196371078491211, + "learning_rate": 4.775123861019776e-06, + "loss": 3.146, + "step": 13460 + }, + { + "epoch": 0.13697306315104166, + "grad_norm": 19.208072662353516, + "learning_rate": 4.774958081469348e-06, + "loss": 3.2435, + "step": 13465 + }, + { + "epoch": 0.13702392578125, + "grad_norm": 13.371184349060059, + "learning_rate": 4.774792243714656e-06, + "loss": 3.4511, + "step": 13470 + }, + { + "epoch": 0.13707478841145834, + "grad_norm": 7.495774745941162, + "learning_rate": 4.774626347759944e-06, + "loss": 3.174, + "step": 13475 + }, + { + "epoch": 0.13712565104166666, + "grad_norm": 9.855815887451172, + "learning_rate": 4.774460393609456e-06, + "loss": 3.4519, + "step": 13480 + }, + { + "epoch": 0.137176513671875, + "grad_norm": 15.752311706542969, + "learning_rate": 4.774294381267438e-06, + "loss": 3.3032, + "step": 13485 + }, + { + "epoch": 0.13722737630208334, + "grad_norm": 10.933402061462402, + "learning_rate": 4.774128310738137e-06, + "loss": 3.5641, + "step": 13490 + }, + { + "epoch": 0.13727823893229166, + "grad_norm": 10.031787872314453, + "learning_rate": 4.773962182025803e-06, + "loss": 3.4982, + "step": 13495 + }, + { + "epoch": 0.1373291015625, + "grad_norm": 10.56356430053711, + "learning_rate": 4.773795995134685e-06, + "loss": 3.2439, + "step": 13500 + }, + { + "epoch": 0.13737996419270834, + "grad_norm": 7.9167585372924805, + "learning_rate": 4.773629750069036e-06, + "loss": 2.9149, + "step": 13505 + }, + { + "epoch": 0.13743082682291666, + "grad_norm": 11.75800609588623, + "learning_rate": 4.773463446833108e-06, + "loss": 3.3936, + "step": 13510 + }, + { + "epoch": 0.137481689453125, + "grad_norm": 10.611553192138672, + "learning_rate": 4.773297085431156e-06, + "loss": 3.3874, + "step": 13515 + }, + { + "epoch": 0.13753255208333334, + "grad_norm": 7.607089996337891, + "learning_rate": 4.773130665867438e-06, + "loss": 3.1678, + "step": 13520 + }, + { + "epoch": 0.13758341471354166, + "grad_norm": 13.853635787963867, + "learning_rate": 4.7729641881462106e-06, + "loss": 3.5818, + "step": 13525 + }, + { + "epoch": 0.13763427734375, + "grad_norm": 14.964798927307129, + "learning_rate": 4.772797652271732e-06, + "loss": 3.722, + "step": 13530 + }, + { + "epoch": 0.13768513997395834, + "grad_norm": 13.768731117248535, + "learning_rate": 4.772631058248266e-06, + "loss": 4.1596, + "step": 13535 + }, + { + "epoch": 0.13773600260416666, + "grad_norm": 14.737395286560059, + "learning_rate": 4.772464406080072e-06, + "loss": 3.3029, + "step": 13540 + }, + { + "epoch": 0.137786865234375, + "grad_norm": 13.914802551269531, + "learning_rate": 4.772297695771415e-06, + "loss": 3.4859, + "step": 13545 + }, + { + "epoch": 0.13783772786458334, + "grad_norm": 11.838445663452148, + "learning_rate": 4.7721309273265605e-06, + "loss": 3.3564, + "step": 13550 + }, + { + "epoch": 0.13788859049479166, + "grad_norm": 12.323065757751465, + "learning_rate": 4.771964100749774e-06, + "loss": 3.3043, + "step": 13555 + }, + { + "epoch": 0.137939453125, + "grad_norm": 13.693683624267578, + "learning_rate": 4.771797216045325e-06, + "loss": 3.4427, + "step": 13560 + }, + { + "epoch": 0.13799031575520834, + "grad_norm": 14.236191749572754, + "learning_rate": 4.771630273217483e-06, + "loss": 3.3801, + "step": 13565 + }, + { + "epoch": 0.13804117838541666, + "grad_norm": 12.197969436645508, + "learning_rate": 4.7714632722705175e-06, + "loss": 3.6923, + "step": 13570 + }, + { + "epoch": 0.138092041015625, + "grad_norm": 12.962698936462402, + "learning_rate": 4.771296213208704e-06, + "loss": 3.3643, + "step": 13575 + }, + { + "epoch": 0.13814290364583334, + "grad_norm": 15.476211547851562, + "learning_rate": 4.7711290960363145e-06, + "loss": 3.3381, + "step": 13580 + }, + { + "epoch": 0.13819376627604166, + "grad_norm": 12.055511474609375, + "learning_rate": 4.770961920757626e-06, + "loss": 3.1077, + "step": 13585 + }, + { + "epoch": 0.13824462890625, + "grad_norm": 11.437824249267578, + "learning_rate": 4.7707946873769144e-06, + "loss": 3.4872, + "step": 13590 + }, + { + "epoch": 0.13829549153645834, + "grad_norm": 14.684785842895508, + "learning_rate": 4.77062739589846e-06, + "loss": 3.1012, + "step": 13595 + }, + { + "epoch": 0.13834635416666666, + "grad_norm": 11.902538299560547, + "learning_rate": 4.77046004632654e-06, + "loss": 3.3424, + "step": 13600 + }, + { + "epoch": 0.138397216796875, + "grad_norm": 13.112920761108398, + "learning_rate": 4.770292638665439e-06, + "loss": 3.6353, + "step": 13605 + }, + { + "epoch": 0.13844807942708334, + "grad_norm": 12.018083572387695, + "learning_rate": 4.7701251729194396e-06, + "loss": 3.7497, + "step": 13610 + }, + { + "epoch": 0.13849894205729166, + "grad_norm": 12.88236141204834, + "learning_rate": 4.769957649092825e-06, + "loss": 3.9322, + "step": 13615 + }, + { + "epoch": 0.1385498046875, + "grad_norm": 16.120954513549805, + "learning_rate": 4.769790067189882e-06, + "loss": 3.4474, + "step": 13620 + }, + { + "epoch": 0.13860066731770834, + "grad_norm": 9.990036010742188, + "learning_rate": 4.769622427214898e-06, + "loss": 3.1569, + "step": 13625 + }, + { + "epoch": 0.13865152994791666, + "grad_norm": 9.095274925231934, + "learning_rate": 4.769454729172163e-06, + "loss": 3.3547, + "step": 13630 + }, + { + "epoch": 0.138702392578125, + "grad_norm": 12.9636812210083, + "learning_rate": 4.7692869730659655e-06, + "loss": 3.3197, + "step": 13635 + }, + { + "epoch": 0.13875325520833334, + "grad_norm": 14.65504264831543, + "learning_rate": 4.769119158900599e-06, + "loss": 3.4425, + "step": 13640 + }, + { + "epoch": 0.13880411783854166, + "grad_norm": 9.957447052001953, + "learning_rate": 4.768951286680357e-06, + "loss": 3.4952, + "step": 13645 + }, + { + "epoch": 0.13885498046875, + "grad_norm": 12.655380249023438, + "learning_rate": 4.768783356409535e-06, + "loss": 3.4519, + "step": 13650 + }, + { + "epoch": 0.13890584309895834, + "grad_norm": 14.230171203613281, + "learning_rate": 4.768615368092427e-06, + "loss": 3.2948, + "step": 13655 + }, + { + "epoch": 0.13895670572916666, + "grad_norm": 15.258803367614746, + "learning_rate": 4.768447321733332e-06, + "loss": 3.5278, + "step": 13660 + }, + { + "epoch": 0.139007568359375, + "grad_norm": 8.109949111938477, + "learning_rate": 4.7682792173365525e-06, + "loss": 3.2576, + "step": 13665 + }, + { + "epoch": 0.13905843098958334, + "grad_norm": 12.126956939697266, + "learning_rate": 4.768111054906384e-06, + "loss": 3.5308, + "step": 13670 + }, + { + "epoch": 0.13910929361979166, + "grad_norm": 12.557209014892578, + "learning_rate": 4.767942834447134e-06, + "loss": 3.405, + "step": 13675 + }, + { + "epoch": 0.13916015625, + "grad_norm": 8.383021354675293, + "learning_rate": 4.767774555963103e-06, + "loss": 3.3727, + "step": 13680 + }, + { + "epoch": 0.13921101888020834, + "grad_norm": 9.561806678771973, + "learning_rate": 4.767606219458598e-06, + "loss": 3.7015, + "step": 13685 + }, + { + "epoch": 0.13926188151041666, + "grad_norm": 11.462172508239746, + "learning_rate": 4.767437824937926e-06, + "loss": 3.6486, + "step": 13690 + }, + { + "epoch": 0.139312744140625, + "grad_norm": 16.29766273498535, + "learning_rate": 4.767269372405393e-06, + "loss": 3.0515, + "step": 13695 + }, + { + "epoch": 0.13936360677083334, + "grad_norm": 8.491412162780762, + "learning_rate": 4.767100861865311e-06, + "loss": 3.3933, + "step": 13700 + }, + { + "epoch": 0.13941446940104166, + "grad_norm": 8.013467788696289, + "learning_rate": 4.766932293321992e-06, + "loss": 3.3615, + "step": 13705 + }, + { + "epoch": 0.13946533203125, + "grad_norm": 8.9276704788208, + "learning_rate": 4.766763666779747e-06, + "loss": 3.4722, + "step": 13710 + }, + { + "epoch": 0.13951619466145834, + "grad_norm": 8.328059196472168, + "learning_rate": 4.76659498224289e-06, + "loss": 3.6743, + "step": 13715 + }, + { + "epoch": 0.13956705729166666, + "grad_norm": 10.622822761535645, + "learning_rate": 4.766426239715739e-06, + "loss": 3.3682, + "step": 13720 + }, + { + "epoch": 0.139617919921875, + "grad_norm": 13.057267189025879, + "learning_rate": 4.766257439202609e-06, + "loss": 3.41, + "step": 13725 + }, + { + "epoch": 0.13966878255208334, + "grad_norm": 13.841679573059082, + "learning_rate": 4.766088580707819e-06, + "loss": 3.2495, + "step": 13730 + }, + { + "epoch": 0.13971964518229166, + "grad_norm": 16.03135108947754, + "learning_rate": 4.765919664235691e-06, + "loss": 3.3903, + "step": 13735 + }, + { + "epoch": 0.1397705078125, + "grad_norm": 8.109480857849121, + "learning_rate": 4.765750689790545e-06, + "loss": 3.2099, + "step": 13740 + }, + { + "epoch": 0.13982137044270834, + "grad_norm": 15.432792663574219, + "learning_rate": 4.765581657376705e-06, + "loss": 3.2156, + "step": 13745 + }, + { + "epoch": 0.13987223307291666, + "grad_norm": 13.207000732421875, + "learning_rate": 4.7654125669984945e-06, + "loss": 3.3425, + "step": 13750 + }, + { + "epoch": 0.139923095703125, + "grad_norm": 10.853780746459961, + "learning_rate": 4.765243418660241e-06, + "loss": 3.5909, + "step": 13755 + }, + { + "epoch": 0.13997395833333334, + "grad_norm": 11.465790748596191, + "learning_rate": 4.765074212366271e-06, + "loss": 3.7235, + "step": 13760 + }, + { + "epoch": 0.14002482096354166, + "grad_norm": 14.370993614196777, + "learning_rate": 4.764904948120915e-06, + "loss": 3.6259, + "step": 13765 + }, + { + "epoch": 0.14007568359375, + "grad_norm": 11.535296440124512, + "learning_rate": 4.7647356259285025e-06, + "loss": 3.5197, + "step": 13770 + }, + { + "epoch": 0.14012654622395834, + "grad_norm": 11.556644439697266, + "learning_rate": 4.764566245793365e-06, + "loss": 3.5663, + "step": 13775 + }, + { + "epoch": 0.14017740885416666, + "grad_norm": 14.494634628295898, + "learning_rate": 4.764396807719838e-06, + "loss": 3.4005, + "step": 13780 + }, + { + "epoch": 0.140228271484375, + "grad_norm": 14.56704330444336, + "learning_rate": 4.764227311712255e-06, + "loss": 3.335, + "step": 13785 + }, + { + "epoch": 0.14027913411458334, + "grad_norm": 10.961151123046875, + "learning_rate": 4.764057757774953e-06, + "loss": 3.7573, + "step": 13790 + }, + { + "epoch": 0.14032999674479166, + "grad_norm": 8.999013900756836, + "learning_rate": 4.76388814591227e-06, + "loss": 3.4393, + "step": 13795 + }, + { + "epoch": 0.140380859375, + "grad_norm": 85.62931823730469, + "learning_rate": 4.763718476128545e-06, + "loss": 3.9256, + "step": 13800 + }, + { + "epoch": 0.14043172200520834, + "grad_norm": 11.35169792175293, + "learning_rate": 4.763548748428119e-06, + "loss": 3.6844, + "step": 13805 + }, + { + "epoch": 0.14048258463541666, + "grad_norm": 10.133095741271973, + "learning_rate": 4.763378962815335e-06, + "loss": 3.3458, + "step": 13810 + }, + { + "epoch": 0.140533447265625, + "grad_norm": 14.392997741699219, + "learning_rate": 4.763209119294537e-06, + "loss": 3.4801, + "step": 13815 + }, + { + "epoch": 0.14058430989583334, + "grad_norm": 13.355161666870117, + "learning_rate": 4.76303921787007e-06, + "loss": 3.98, + "step": 13820 + }, + { + "epoch": 0.14063517252604166, + "grad_norm": 15.28199291229248, + "learning_rate": 4.762869258546281e-06, + "loss": 3.565, + "step": 13825 + }, + { + "epoch": 0.14068603515625, + "grad_norm": 15.826837539672852, + "learning_rate": 4.762699241327518e-06, + "loss": 3.9634, + "step": 13830 + }, + { + "epoch": 0.14073689778645834, + "grad_norm": 19.135751724243164, + "learning_rate": 4.762529166218133e-06, + "loss": 3.298, + "step": 13835 + }, + { + "epoch": 0.14078776041666666, + "grad_norm": 14.346761703491211, + "learning_rate": 4.7623590332224735e-06, + "loss": 3.3069, + "step": 13840 + }, + { + "epoch": 0.140838623046875, + "grad_norm": 18.030282974243164, + "learning_rate": 4.762188842344896e-06, + "loss": 3.2781, + "step": 13845 + }, + { + "epoch": 0.14088948567708334, + "grad_norm": 11.988079071044922, + "learning_rate": 4.762018593589752e-06, + "loss": 3.7506, + "step": 13850 + }, + { + "epoch": 0.14094034830729166, + "grad_norm": 12.060202598571777, + "learning_rate": 4.761848286961398e-06, + "loss": 3.3371, + "step": 13855 + }, + { + "epoch": 0.1409912109375, + "grad_norm": 10.818347930908203, + "learning_rate": 4.7616779224641925e-06, + "loss": 3.528, + "step": 13860 + }, + { + "epoch": 0.14104207356770834, + "grad_norm": 14.578571319580078, + "learning_rate": 4.761507500102493e-06, + "loss": 3.3091, + "step": 13865 + }, + { + "epoch": 0.14109293619791666, + "grad_norm": 14.782280921936035, + "learning_rate": 4.761337019880661e-06, + "loss": 3.382, + "step": 13870 + }, + { + "epoch": 0.141143798828125, + "grad_norm": 16.304611206054688, + "learning_rate": 4.761166481803057e-06, + "loss": 3.3789, + "step": 13875 + }, + { + "epoch": 0.14119466145833334, + "grad_norm": 17.154775619506836, + "learning_rate": 4.760995885874045e-06, + "loss": 4.1372, + "step": 13880 + }, + { + "epoch": 0.14124552408854166, + "grad_norm": 7.446081161499023, + "learning_rate": 4.760825232097988e-06, + "loss": 3.44, + "step": 13885 + }, + { + "epoch": 0.14129638671875, + "grad_norm": 15.864761352539062, + "learning_rate": 4.760654520479254e-06, + "loss": 3.7214, + "step": 13890 + }, + { + "epoch": 0.14134724934895834, + "grad_norm": 10.105338096618652, + "learning_rate": 4.76048375102221e-06, + "loss": 3.0877, + "step": 13895 + }, + { + "epoch": 0.14139811197916666, + "grad_norm": 11.060812950134277, + "learning_rate": 4.760312923731224e-06, + "loss": 3.406, + "step": 13900 + }, + { + "epoch": 0.141448974609375, + "grad_norm": 17.411237716674805, + "learning_rate": 4.760142038610669e-06, + "loss": 3.4056, + "step": 13905 + }, + { + "epoch": 0.14149983723958334, + "grad_norm": 12.01961898803711, + "learning_rate": 4.759971095664915e-06, + "loss": 3.6575, + "step": 13910 + }, + { + "epoch": 0.14155069986979166, + "grad_norm": 10.522672653198242, + "learning_rate": 4.7598000948983355e-06, + "loss": 3.462, + "step": 13915 + }, + { + "epoch": 0.1416015625, + "grad_norm": 14.614590644836426, + "learning_rate": 4.759629036315307e-06, + "loss": 3.5353, + "step": 13920 + }, + { + "epoch": 0.14165242513020834, + "grad_norm": 13.122455596923828, + "learning_rate": 4.759457919920206e-06, + "loss": 3.2238, + "step": 13925 + }, + { + "epoch": 0.14170328776041666, + "grad_norm": 11.244791984558105, + "learning_rate": 4.759286745717409e-06, + "loss": 3.8902, + "step": 13930 + }, + { + "epoch": 0.141754150390625, + "grad_norm": 13.832063674926758, + "learning_rate": 4.759115513711296e-06, + "loss": 3.2644, + "step": 13935 + }, + { + "epoch": 0.14180501302083334, + "grad_norm": 18.43947982788086, + "learning_rate": 4.758944223906248e-06, + "loss": 3.3131, + "step": 13940 + }, + { + "epoch": 0.14185587565104166, + "grad_norm": 15.581222534179688, + "learning_rate": 4.758772876306647e-06, + "loss": 3.2354, + "step": 13945 + }, + { + "epoch": 0.14190673828125, + "grad_norm": 7.942548751831055, + "learning_rate": 4.758601470916878e-06, + "loss": 3.2157, + "step": 13950 + }, + { + "epoch": 0.14195760091145834, + "grad_norm": 15.951170921325684, + "learning_rate": 4.758430007741325e-06, + "loss": 3.3562, + "step": 13955 + }, + { + "epoch": 0.14200846354166666, + "grad_norm": 14.815132141113281, + "learning_rate": 4.7582584867843764e-06, + "loss": 2.939, + "step": 13960 + }, + { + "epoch": 0.142059326171875, + "grad_norm": 12.236970901489258, + "learning_rate": 4.7580869080504185e-06, + "loss": 3.2991, + "step": 13965 + }, + { + "epoch": 0.14211018880208334, + "grad_norm": 12.710593223571777, + "learning_rate": 4.757915271543844e-06, + "loss": 3.2584, + "step": 13970 + }, + { + "epoch": 0.14216105143229166, + "grad_norm": 12.963603019714355, + "learning_rate": 4.757743577269042e-06, + "loss": 3.1463, + "step": 13975 + }, + { + "epoch": 0.1422119140625, + "grad_norm": 10.430787086486816, + "learning_rate": 4.7575718252304046e-06, + "loss": 4.2165, + "step": 13980 + }, + { + "epoch": 0.14226277669270834, + "grad_norm": 10.955326080322266, + "learning_rate": 4.7574000154323274e-06, + "loss": 3.195, + "step": 13985 + }, + { + "epoch": 0.14231363932291666, + "grad_norm": 8.364349365234375, + "learning_rate": 4.757228147879207e-06, + "loss": 3.1683, + "step": 13990 + }, + { + "epoch": 0.142364501953125, + "grad_norm": 14.662646293640137, + "learning_rate": 4.757056222575438e-06, + "loss": 3.2215, + "step": 13995 + }, + { + "epoch": 0.14241536458333334, + "grad_norm": 14.448378562927246, + "learning_rate": 4.756884239525422e-06, + "loss": 3.3889, + "step": 14000 + }, + { + "epoch": 0.14246622721354166, + "grad_norm": 12.963875770568848, + "learning_rate": 4.756712198733557e-06, + "loss": 3.2769, + "step": 14005 + }, + { + "epoch": 0.14251708984375, + "grad_norm": 11.409318923950195, + "learning_rate": 4.756540100204245e-06, + "loss": 3.3303, + "step": 14010 + }, + { + "epoch": 0.14256795247395834, + "grad_norm": 9.132806777954102, + "learning_rate": 4.75636794394189e-06, + "loss": 3.2503, + "step": 14015 + }, + { + "epoch": 0.14261881510416666, + "grad_norm": 12.652349472045898, + "learning_rate": 4.756195729950896e-06, + "loss": 3.2854, + "step": 14020 + }, + { + "epoch": 0.142669677734375, + "grad_norm": 11.568374633789062, + "learning_rate": 4.756023458235668e-06, + "loss": 3.2795, + "step": 14025 + }, + { + "epoch": 0.14272054036458334, + "grad_norm": 14.18939208984375, + "learning_rate": 4.755851128800616e-06, + "loss": 3.2294, + "step": 14030 + }, + { + "epoch": 0.14277140299479166, + "grad_norm": 12.417285919189453, + "learning_rate": 4.755678741650146e-06, + "loss": 3.3423, + "step": 14035 + }, + { + "epoch": 0.142822265625, + "grad_norm": 12.653460502624512, + "learning_rate": 4.755506296788671e-06, + "loss": 3.4815, + "step": 14040 + }, + { + "epoch": 0.14287312825520834, + "grad_norm": 10.841574668884277, + "learning_rate": 4.7553337942206025e-06, + "loss": 3.3619, + "step": 14045 + }, + { + "epoch": 0.14292399088541666, + "grad_norm": 9.72189712524414, + "learning_rate": 4.7551612339503524e-06, + "loss": 3.0766, + "step": 14050 + }, + { + "epoch": 0.142974853515625, + "grad_norm": 10.824090003967285, + "learning_rate": 4.754988615982336e-06, + "loss": 3.2932, + "step": 14055 + }, + { + "epoch": 0.14302571614583334, + "grad_norm": 14.650367736816406, + "learning_rate": 4.7548159403209725e-06, + "loss": 3.3557, + "step": 14060 + }, + { + "epoch": 0.14307657877604166, + "grad_norm": 13.419211387634277, + "learning_rate": 4.7546432069706765e-06, + "loss": 3.135, + "step": 14065 + }, + { + "epoch": 0.14312744140625, + "grad_norm": 10.86098861694336, + "learning_rate": 4.754470415935868e-06, + "loss": 3.7383, + "step": 14070 + }, + { + "epoch": 0.14317830403645834, + "grad_norm": 16.1043758392334, + "learning_rate": 4.754297567220969e-06, + "loss": 3.5326, + "step": 14075 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 11.805451393127441, + "learning_rate": 4.754124660830401e-06, + "loss": 3.2176, + "step": 14080 + }, + { + "epoch": 0.143280029296875, + "grad_norm": 10.292030334472656, + "learning_rate": 4.753951696768587e-06, + "loss": 3.5026, + "step": 14085 + }, + { + "epoch": 0.14333089192708334, + "grad_norm": 9.801261901855469, + "learning_rate": 4.753778675039954e-06, + "loss": 3.5177, + "step": 14090 + }, + { + "epoch": 0.14338175455729166, + "grad_norm": 14.867837905883789, + "learning_rate": 4.753605595648928e-06, + "loss": 3.1356, + "step": 14095 + }, + { + "epoch": 0.1434326171875, + "grad_norm": 19.2530460357666, + "learning_rate": 4.753432458599936e-06, + "loss": 3.4818, + "step": 14100 + }, + { + "epoch": 0.14348347981770834, + "grad_norm": 11.164162635803223, + "learning_rate": 4.753259263897409e-06, + "loss": 3.65, + "step": 14105 + }, + { + "epoch": 0.14353434244791666, + "grad_norm": 7.864550590515137, + "learning_rate": 4.7530860115457785e-06, + "loss": 3.1865, + "step": 14110 + }, + { + "epoch": 0.143585205078125, + "grad_norm": 7.927250862121582, + "learning_rate": 4.7529127015494754e-06, + "loss": 3.2852, + "step": 14115 + }, + { + "epoch": 0.14363606770833334, + "grad_norm": 15.011672973632812, + "learning_rate": 4.752739333912936e-06, + "loss": 3.2359, + "step": 14120 + }, + { + "epoch": 0.14368693033854166, + "grad_norm": 10.188263893127441, + "learning_rate": 4.752565908640594e-06, + "loss": 3.4599, + "step": 14125 + }, + { + "epoch": 0.14373779296875, + "grad_norm": 13.60891056060791, + "learning_rate": 4.752392425736888e-06, + "loss": 3.5074, + "step": 14130 + }, + { + "epoch": 0.14378865559895834, + "grad_norm": 7.990344524383545, + "learning_rate": 4.752218885206255e-06, + "loss": 3.1402, + "step": 14135 + }, + { + "epoch": 0.14383951822916666, + "grad_norm": 13.563754081726074, + "learning_rate": 4.752045287053135e-06, + "loss": 3.6083, + "step": 14140 + }, + { + "epoch": 0.143890380859375, + "grad_norm": 7.188718795776367, + "learning_rate": 4.751871631281971e-06, + "loss": 3.6155, + "step": 14145 + }, + { + "epoch": 0.14394124348958334, + "grad_norm": 11.118795394897461, + "learning_rate": 4.751697917897204e-06, + "loss": 3.2539, + "step": 14150 + }, + { + "epoch": 0.14399210611979166, + "grad_norm": 11.786707878112793, + "learning_rate": 4.7515241469032805e-06, + "loss": 3.4012, + "step": 14155 + }, + { + "epoch": 0.14404296875, + "grad_norm": 15.172300338745117, + "learning_rate": 4.751350318304645e-06, + "loss": 3.4998, + "step": 14160 + }, + { + "epoch": 0.14409383138020834, + "grad_norm": 14.242692947387695, + "learning_rate": 4.751176432105746e-06, + "loss": 3.374, + "step": 14165 + }, + { + "epoch": 0.14414469401041666, + "grad_norm": 12.410831451416016, + "learning_rate": 4.751002488311031e-06, + "loss": 3.1663, + "step": 14170 + }, + { + "epoch": 0.144195556640625, + "grad_norm": 16.139020919799805, + "learning_rate": 4.75082848692495e-06, + "loss": 3.6473, + "step": 14175 + }, + { + "epoch": 0.14424641927083334, + "grad_norm": 9.563576698303223, + "learning_rate": 4.750654427951957e-06, + "loss": 3.133, + "step": 14180 + }, + { + "epoch": 0.14429728190104166, + "grad_norm": 9.554606437683105, + "learning_rate": 4.750480311396503e-06, + "loss": 3.2635, + "step": 14185 + }, + { + "epoch": 0.14434814453125, + "grad_norm": 16.014537811279297, + "learning_rate": 4.750306137263044e-06, + "loss": 3.4213, + "step": 14190 + }, + { + "epoch": 0.14439900716145834, + "grad_norm": 15.562359809875488, + "learning_rate": 4.750131905556036e-06, + "loss": 3.2832, + "step": 14195 + }, + { + "epoch": 0.14444986979166666, + "grad_norm": 15.114102363586426, + "learning_rate": 4.749957616279937e-06, + "loss": 3.3687, + "step": 14200 + }, + { + "epoch": 0.144500732421875, + "grad_norm": 15.910360336303711, + "learning_rate": 4.749783269439205e-06, + "loss": 3.1607, + "step": 14205 + }, + { + "epoch": 0.14455159505208334, + "grad_norm": 8.769811630249023, + "learning_rate": 4.749608865038301e-06, + "loss": 3.3552, + "step": 14210 + }, + { + "epoch": 0.14460245768229166, + "grad_norm": 9.91917896270752, + "learning_rate": 4.749434403081688e-06, + "loss": 3.2902, + "step": 14215 + }, + { + "epoch": 0.1446533203125, + "grad_norm": 9.88044261932373, + "learning_rate": 4.749259883573829e-06, + "loss": 3.4576, + "step": 14220 + }, + { + "epoch": 0.14470418294270834, + "grad_norm": 15.199368476867676, + "learning_rate": 4.749085306519189e-06, + "loss": 3.2871, + "step": 14225 + }, + { + "epoch": 0.14475504557291666, + "grad_norm": 15.931928634643555, + "learning_rate": 4.748910671922234e-06, + "loss": 3.5535, + "step": 14230 + }, + { + "epoch": 0.144805908203125, + "grad_norm": 11.521714210510254, + "learning_rate": 4.748735979787433e-06, + "loss": 3.3337, + "step": 14235 + }, + { + "epoch": 0.14485677083333334, + "grad_norm": 9.132640838623047, + "learning_rate": 4.7485612301192545e-06, + "loss": 3.599, + "step": 14240 + }, + { + "epoch": 0.14490763346354166, + "grad_norm": 14.95056438446045, + "learning_rate": 4.74838642292217e-06, + "loss": 3.2524, + "step": 14245 + }, + { + "epoch": 0.14495849609375, + "grad_norm": 13.88601303100586, + "learning_rate": 4.748211558200653e-06, + "loss": 3.3675, + "step": 14250 + }, + { + "epoch": 0.14500935872395834, + "grad_norm": 11.538806915283203, + "learning_rate": 4.748036635959174e-06, + "loss": 3.4054, + "step": 14255 + }, + { + "epoch": 0.14506022135416666, + "grad_norm": 12.001321792602539, + "learning_rate": 4.747861656202212e-06, + "loss": 3.481, + "step": 14260 + }, + { + "epoch": 0.145111083984375, + "grad_norm": 13.24527359008789, + "learning_rate": 4.747686618934242e-06, + "loss": 2.8579, + "step": 14265 + }, + { + "epoch": 0.14516194661458334, + "grad_norm": 13.960221290588379, + "learning_rate": 4.747511524159743e-06, + "loss": 3.6163, + "step": 14270 + }, + { + "epoch": 0.14521280924479166, + "grad_norm": 14.137311935424805, + "learning_rate": 4.747336371883194e-06, + "loss": 3.2439, + "step": 14275 + }, + { + "epoch": 0.145263671875, + "grad_norm": 13.414327621459961, + "learning_rate": 4.747161162109076e-06, + "loss": 3.6854, + "step": 14280 + }, + { + "epoch": 0.14531453450520834, + "grad_norm": 7.808047294616699, + "learning_rate": 4.746985894841873e-06, + "loss": 3.4737, + "step": 14285 + }, + { + "epoch": 0.14536539713541666, + "grad_norm": 7.56891393661499, + "learning_rate": 4.746810570086069e-06, + "loss": 3.2434, + "step": 14290 + }, + { + "epoch": 0.145416259765625, + "grad_norm": 15.081562042236328, + "learning_rate": 4.746635187846148e-06, + "loss": 3.3065, + "step": 14295 + }, + { + "epoch": 0.14546712239583334, + "grad_norm": 12.434776306152344, + "learning_rate": 4.746459748126599e-06, + "loss": 3.2885, + "step": 14300 + }, + { + "epoch": 0.14551798502604166, + "grad_norm": 14.816580772399902, + "learning_rate": 4.74628425093191e-06, + "loss": 3.3603, + "step": 14305 + }, + { + "epoch": 0.14556884765625, + "grad_norm": 10.694483757019043, + "learning_rate": 4.74610869626657e-06, + "loss": 3.4447, + "step": 14310 + }, + { + "epoch": 0.14561971028645834, + "grad_norm": 11.717911720275879, + "learning_rate": 4.745933084135071e-06, + "loss": 3.7955, + "step": 14315 + }, + { + "epoch": 0.14567057291666666, + "grad_norm": 12.842635154724121, + "learning_rate": 4.745757414541908e-06, + "loss": 3.63, + "step": 14320 + }, + { + "epoch": 0.145721435546875, + "grad_norm": 13.658503532409668, + "learning_rate": 4.745581687491573e-06, + "loss": 3.5957, + "step": 14325 + }, + { + "epoch": 0.14577229817708334, + "grad_norm": 8.604068756103516, + "learning_rate": 4.745405902988563e-06, + "loss": 3.3326, + "step": 14330 + }, + { + "epoch": 0.14582316080729166, + "grad_norm": 10.351202964782715, + "learning_rate": 4.745230061037375e-06, + "loss": 3.2391, + "step": 14335 + }, + { + "epoch": 0.1458740234375, + "grad_norm": 9.471860885620117, + "learning_rate": 4.745054161642508e-06, + "loss": 3.3626, + "step": 14340 + }, + { + "epoch": 0.14592488606770834, + "grad_norm": 10.003438949584961, + "learning_rate": 4.744878204808463e-06, + "loss": 3.5786, + "step": 14345 + }, + { + "epoch": 0.14597574869791666, + "grad_norm": 10.829023361206055, + "learning_rate": 4.744702190539741e-06, + "loss": 3.7278, + "step": 14350 + }, + { + "epoch": 0.146026611328125, + "grad_norm": 9.213700294494629, + "learning_rate": 4.744526118840844e-06, + "loss": 2.9384, + "step": 14355 + }, + { + "epoch": 0.14607747395833334, + "grad_norm": 12.245752334594727, + "learning_rate": 4.7443499897162794e-06, + "loss": 3.4541, + "step": 14360 + }, + { + "epoch": 0.14612833658854166, + "grad_norm": 9.579618453979492, + "learning_rate": 4.744173803170553e-06, + "loss": 3.2422, + "step": 14365 + }, + { + "epoch": 0.14617919921875, + "grad_norm": 7.907454967498779, + "learning_rate": 4.743997559208171e-06, + "loss": 3.2129, + "step": 14370 + }, + { + "epoch": 0.14623006184895834, + "grad_norm": 15.023747444152832, + "learning_rate": 4.743821257833644e-06, + "loss": 3.3496, + "step": 14375 + }, + { + "epoch": 0.14628092447916666, + "grad_norm": 8.13438892364502, + "learning_rate": 4.743644899051481e-06, + "loss": 3.4805, + "step": 14380 + }, + { + "epoch": 0.146331787109375, + "grad_norm": 10.506806373596191, + "learning_rate": 4.743468482866196e-06, + "loss": 3.653, + "step": 14385 + }, + { + "epoch": 0.14638264973958334, + "grad_norm": 13.162545204162598, + "learning_rate": 4.743292009282301e-06, + "loss": 3.4791, + "step": 14390 + }, + { + "epoch": 0.14643351236979166, + "grad_norm": 14.953128814697266, + "learning_rate": 4.743115478304312e-06, + "loss": 3.3943, + "step": 14395 + }, + { + "epoch": 0.146484375, + "grad_norm": 8.338605880737305, + "learning_rate": 4.742938889936745e-06, + "loss": 3.484, + "step": 14400 + }, + { + "epoch": 0.14653523763020834, + "grad_norm": 11.624462127685547, + "learning_rate": 4.742762244184117e-06, + "loss": 3.1865, + "step": 14405 + }, + { + "epoch": 0.14658610026041666, + "grad_norm": 9.61160945892334, + "learning_rate": 4.74258554105095e-06, + "loss": 3.518, + "step": 14410 + }, + { + "epoch": 0.146636962890625, + "grad_norm": 10.283778190612793, + "learning_rate": 4.742408780541763e-06, + "loss": 3.4033, + "step": 14415 + }, + { + "epoch": 0.14668782552083334, + "grad_norm": 14.608953475952148, + "learning_rate": 4.742231962661079e-06, + "loss": 3.0345, + "step": 14420 + }, + { + "epoch": 0.14673868815104166, + "grad_norm": 13.46399974822998, + "learning_rate": 4.742055087413422e-06, + "loss": 3.3489, + "step": 14425 + }, + { + "epoch": 0.14678955078125, + "grad_norm": 9.469012260437012, + "learning_rate": 4.741878154803316e-06, + "loss": 2.9819, + "step": 14430 + }, + { + "epoch": 0.14684041341145834, + "grad_norm": 10.287275314331055, + "learning_rate": 4.741701164835291e-06, + "loss": 3.191, + "step": 14435 + }, + { + "epoch": 0.14689127604166666, + "grad_norm": 12.825785636901855, + "learning_rate": 4.741524117513871e-06, + "loss": 2.9444, + "step": 14440 + }, + { + "epoch": 0.146942138671875, + "grad_norm": 11.87830924987793, + "learning_rate": 4.741347012843588e-06, + "loss": 3.3462, + "step": 14445 + }, + { + "epoch": 0.14699300130208334, + "grad_norm": 9.253486633300781, + "learning_rate": 4.7411698508289735e-06, + "loss": 3.7622, + "step": 14450 + }, + { + "epoch": 0.14704386393229166, + "grad_norm": 11.679107666015625, + "learning_rate": 4.740992631474559e-06, + "loss": 3.2838, + "step": 14455 + }, + { + "epoch": 0.1470947265625, + "grad_norm": 13.79211711883545, + "learning_rate": 4.740815354784879e-06, + "loss": 3.2918, + "step": 14460 + }, + { + "epoch": 0.14714558919270834, + "grad_norm": 14.639359474182129, + "learning_rate": 4.74063802076447e-06, + "loss": 3.4211, + "step": 14465 + }, + { + "epoch": 0.14719645182291666, + "grad_norm": 12.899956703186035, + "learning_rate": 4.7404606294178684e-06, + "loss": 3.6037, + "step": 14470 + }, + { + "epoch": 0.147247314453125, + "grad_norm": 15.321117401123047, + "learning_rate": 4.740283180749613e-06, + "loss": 3.5764, + "step": 14475 + }, + { + "epoch": 0.14729817708333334, + "grad_norm": 12.093023300170898, + "learning_rate": 4.740105674764243e-06, + "loss": 3.4525, + "step": 14480 + }, + { + "epoch": 0.14734903971354166, + "grad_norm": 12.851874351501465, + "learning_rate": 4.739928111466299e-06, + "loss": 3.7187, + "step": 14485 + }, + { + "epoch": 0.14739990234375, + "grad_norm": 11.185502052307129, + "learning_rate": 4.739750490860327e-06, + "loss": 3.6987, + "step": 14490 + }, + { + "epoch": 0.14745076497395834, + "grad_norm": 10.547276496887207, + "learning_rate": 4.7395728129508686e-06, + "loss": 3.746, + "step": 14495 + }, + { + "epoch": 0.14750162760416666, + "grad_norm": 14.449246406555176, + "learning_rate": 4.739395077742471e-06, + "loss": 3.6518, + "step": 14500 + }, + { + "epoch": 0.147552490234375, + "grad_norm": 14.639830589294434, + "learning_rate": 4.739217285239681e-06, + "loss": 4.1111, + "step": 14505 + }, + { + "epoch": 0.14760335286458334, + "grad_norm": 13.559803009033203, + "learning_rate": 4.739039435447047e-06, + "loss": 3.5169, + "step": 14510 + }, + { + "epoch": 0.14765421549479166, + "grad_norm": 17.011873245239258, + "learning_rate": 4.738861528369121e-06, + "loss": 3.3476, + "step": 14515 + }, + { + "epoch": 0.147705078125, + "grad_norm": 10.405207633972168, + "learning_rate": 4.7386835640104525e-06, + "loss": 3.1015, + "step": 14520 + }, + { + "epoch": 0.14775594075520834, + "grad_norm": 15.131099700927734, + "learning_rate": 4.738505542375595e-06, + "loss": 3.7291, + "step": 14525 + }, + { + "epoch": 0.14780680338541666, + "grad_norm": 17.359588623046875, + "learning_rate": 4.738327463469105e-06, + "loss": 3.3012, + "step": 14530 + }, + { + "epoch": 0.147857666015625, + "grad_norm": 8.4977445602417, + "learning_rate": 4.738149327295537e-06, + "loss": 4.0538, + "step": 14535 + }, + { + "epoch": 0.14790852864583334, + "grad_norm": 14.337766647338867, + "learning_rate": 4.737971133859449e-06, + "loss": 2.9475, + "step": 14540 + }, + { + "epoch": 0.14795939127604166, + "grad_norm": 6.989287853240967, + "learning_rate": 4.7377928831654e-06, + "loss": 3.5471, + "step": 14545 + }, + { + "epoch": 0.14801025390625, + "grad_norm": 8.3816499710083, + "learning_rate": 4.73761457521795e-06, + "loss": 3.4316, + "step": 14550 + }, + { + "epoch": 0.14806111653645834, + "grad_norm": 10.17712116241455, + "learning_rate": 4.7374362100216625e-06, + "loss": 3.5723, + "step": 14555 + }, + { + "epoch": 0.14811197916666666, + "grad_norm": 13.229191780090332, + "learning_rate": 4.737257787581099e-06, + "loss": 3.5133, + "step": 14560 + }, + { + "epoch": 0.148162841796875, + "grad_norm": 13.506834983825684, + "learning_rate": 4.737079307900826e-06, + "loss": 3.5203, + "step": 14565 + }, + { + "epoch": 0.14821370442708334, + "grad_norm": 13.098067283630371, + "learning_rate": 4.736900770985409e-06, + "loss": 3.1275, + "step": 14570 + }, + { + "epoch": 0.14826456705729166, + "grad_norm": 15.300044059753418, + "learning_rate": 4.7367221768394155e-06, + "loss": 3.3876, + "step": 14575 + }, + { + "epoch": 0.1483154296875, + "grad_norm": 11.651641845703125, + "learning_rate": 4.736543525467415e-06, + "loss": 3.1862, + "step": 14580 + }, + { + "epoch": 0.14836629231770834, + "grad_norm": 17.858064651489258, + "learning_rate": 4.736364816873979e-06, + "loss": 3.2058, + "step": 14585 + }, + { + "epoch": 0.14841715494791666, + "grad_norm": 11.784101486206055, + "learning_rate": 4.73618605106368e-06, + "loss": 3.3627, + "step": 14590 + }, + { + "epoch": 0.148468017578125, + "grad_norm": 16.125946044921875, + "learning_rate": 4.73600722804109e-06, + "loss": 3.2323, + "step": 14595 + }, + { + "epoch": 0.14851888020833334, + "grad_norm": 17.72896385192871, + "learning_rate": 4.735828347810785e-06, + "loss": 3.542, + "step": 14600 + }, + { + "epoch": 0.14856974283854166, + "grad_norm": 13.747530937194824, + "learning_rate": 4.735649410377342e-06, + "loss": 3.0246, + "step": 14605 + }, + { + "epoch": 0.14862060546875, + "grad_norm": 10.233755111694336, + "learning_rate": 4.735470415745339e-06, + "loss": 3.6899, + "step": 14610 + }, + { + "epoch": 0.14867146809895834, + "grad_norm": 13.699902534484863, + "learning_rate": 4.735291363919355e-06, + "loss": 3.1592, + "step": 14615 + }, + { + "epoch": 0.14872233072916666, + "grad_norm": 11.987502098083496, + "learning_rate": 4.735112254903971e-06, + "loss": 3.3525, + "step": 14620 + }, + { + "epoch": 0.148773193359375, + "grad_norm": 13.890175819396973, + "learning_rate": 4.73493308870377e-06, + "loss": 3.3058, + "step": 14625 + }, + { + "epoch": 0.14882405598958334, + "grad_norm": 14.703634262084961, + "learning_rate": 4.734753865323336e-06, + "loss": 3.3691, + "step": 14630 + }, + { + "epoch": 0.14887491861979166, + "grad_norm": 11.793313980102539, + "learning_rate": 4.734574584767253e-06, + "loss": 3.1809, + "step": 14635 + }, + { + "epoch": 0.14892578125, + "grad_norm": 13.151676177978516, + "learning_rate": 4.73439524704011e-06, + "loss": 3.4036, + "step": 14640 + }, + { + "epoch": 0.14897664388020834, + "grad_norm": 10.763456344604492, + "learning_rate": 4.734215852146493e-06, + "loss": 3.2028, + "step": 14645 + }, + { + "epoch": 0.14902750651041666, + "grad_norm": 15.11712646484375, + "learning_rate": 4.734036400090994e-06, + "loss": 3.5627, + "step": 14650 + }, + { + "epoch": 0.149078369140625, + "grad_norm": 7.594333171844482, + "learning_rate": 4.7338568908782036e-06, + "loss": 3.526, + "step": 14655 + }, + { + "epoch": 0.14912923177083334, + "grad_norm": 16.161596298217773, + "learning_rate": 4.733677324512713e-06, + "loss": 3.3709, + "step": 14660 + }, + { + "epoch": 0.14918009440104166, + "grad_norm": 9.044793128967285, + "learning_rate": 4.733497700999119e-06, + "loss": 3.2466, + "step": 14665 + }, + { + "epoch": 0.14923095703125, + "grad_norm": 9.249896049499512, + "learning_rate": 4.733318020342014e-06, + "loss": 3.7076, + "step": 14670 + }, + { + "epoch": 0.14928181966145834, + "grad_norm": 9.653711318969727, + "learning_rate": 4.7331382825459985e-06, + "loss": 3.3792, + "step": 14675 + }, + { + "epoch": 0.14933268229166666, + "grad_norm": 10.778152465820312, + "learning_rate": 4.732958487615668e-06, + "loss": 3.2468, + "step": 14680 + }, + { + "epoch": 0.149383544921875, + "grad_norm": 12.605219841003418, + "learning_rate": 4.7327786355556235e-06, + "loss": 3.717, + "step": 14685 + }, + { + "epoch": 0.14943440755208334, + "grad_norm": 11.11141300201416, + "learning_rate": 4.7325987263704685e-06, + "loss": 3.1906, + "step": 14690 + }, + { + "epoch": 0.14948527018229166, + "grad_norm": 13.047835350036621, + "learning_rate": 4.732418760064803e-06, + "loss": 3.5693, + "step": 14695 + }, + { + "epoch": 0.1495361328125, + "grad_norm": 12.675945281982422, + "learning_rate": 4.7322387366432335e-06, + "loss": 3.2765, + "step": 14700 + }, + { + "epoch": 0.14958699544270834, + "grad_norm": 11.04984188079834, + "learning_rate": 4.732058656110364e-06, + "loss": 3.0329, + "step": 14705 + }, + { + "epoch": 0.14963785807291666, + "grad_norm": 14.509688377380371, + "learning_rate": 4.7318785184708035e-06, + "loss": 3.5102, + "step": 14710 + }, + { + "epoch": 0.149688720703125, + "grad_norm": 12.124781608581543, + "learning_rate": 4.731698323729161e-06, + "loss": 3.2282, + "step": 14715 + }, + { + "epoch": 0.14973958333333334, + "grad_norm": 10.267120361328125, + "learning_rate": 4.731518071890045e-06, + "loss": 3.3788, + "step": 14720 + }, + { + "epoch": 0.14979044596354166, + "grad_norm": 9.034321784973145, + "learning_rate": 4.731337762958067e-06, + "loss": 3.4097, + "step": 14725 + }, + { + "epoch": 0.14984130859375, + "grad_norm": 13.264888763427734, + "learning_rate": 4.731157396937842e-06, + "loss": 3.5671, + "step": 14730 + }, + { + "epoch": 0.14989217122395834, + "grad_norm": 12.145549774169922, + "learning_rate": 4.730976973833984e-06, + "loss": 3.1394, + "step": 14735 + }, + { + "epoch": 0.14994303385416666, + "grad_norm": 10.380547523498535, + "learning_rate": 4.7307964936511095e-06, + "loss": 3.4537, + "step": 14740 + }, + { + "epoch": 0.149993896484375, + "grad_norm": 10.687427520751953, + "learning_rate": 4.730615956393835e-06, + "loss": 3.2803, + "step": 14745 + }, + { + "epoch": 0.15004475911458334, + "grad_norm": 9.110136985778809, + "learning_rate": 4.730435362066779e-06, + "loss": 3.1083, + "step": 14750 + }, + { + "epoch": 0.15009562174479166, + "grad_norm": 27.618864059448242, + "learning_rate": 4.730254710674564e-06, + "loss": 3.4778, + "step": 14755 + }, + { + "epoch": 0.150146484375, + "grad_norm": 9.159721374511719, + "learning_rate": 4.73007400222181e-06, + "loss": 3.5116, + "step": 14760 + }, + { + "epoch": 0.15019734700520834, + "grad_norm": 13.149566650390625, + "learning_rate": 4.729893236713142e-06, + "loss": 3.3386, + "step": 14765 + }, + { + "epoch": 0.15024820963541666, + "grad_norm": 10.829029083251953, + "learning_rate": 4.729712414153184e-06, + "loss": 3.862, + "step": 14770 + }, + { + "epoch": 0.150299072265625, + "grad_norm": 11.96367073059082, + "learning_rate": 4.729531534546563e-06, + "loss": 3.66, + "step": 14775 + }, + { + "epoch": 0.15034993489583334, + "grad_norm": 14.411954879760742, + "learning_rate": 4.729350597897905e-06, + "loss": 3.2965, + "step": 14780 + }, + { + "epoch": 0.15040079752604166, + "grad_norm": 12.512883186340332, + "learning_rate": 4.729169604211841e-06, + "loss": 3.2615, + "step": 14785 + }, + { + "epoch": 0.15045166015625, + "grad_norm": 11.264093399047852, + "learning_rate": 4.728988553493001e-06, + "loss": 3.1227, + "step": 14790 + }, + { + "epoch": 0.15050252278645834, + "grad_norm": 7.70527458190918, + "learning_rate": 4.728807445746018e-06, + "loss": 3.3477, + "step": 14795 + }, + { + "epoch": 0.15055338541666666, + "grad_norm": 14.769386291503906, + "learning_rate": 4.728626280975523e-06, + "loss": 3.6628, + "step": 14800 + }, + { + "epoch": 0.150604248046875, + "grad_norm": 13.030516624450684, + "learning_rate": 4.7284450591861545e-06, + "loss": 3.377, + "step": 14805 + }, + { + "epoch": 0.15065511067708334, + "grad_norm": 12.844939231872559, + "learning_rate": 4.728263780382546e-06, + "loss": 3.8196, + "step": 14810 + }, + { + "epoch": 0.15070597330729166, + "grad_norm": 10.097814559936523, + "learning_rate": 4.728082444569337e-06, + "loss": 3.8156, + "step": 14815 + }, + { + "epoch": 0.1507568359375, + "grad_norm": 10.388051986694336, + "learning_rate": 4.727901051751167e-06, + "loss": 3.4716, + "step": 14820 + }, + { + "epoch": 0.15080769856770834, + "grad_norm": 13.543767929077148, + "learning_rate": 4.727719601932678e-06, + "loss": 3.4144, + "step": 14825 + }, + { + "epoch": 0.15085856119791666, + "grad_norm": 11.783435821533203, + "learning_rate": 4.727538095118509e-06, + "loss": 3.2369, + "step": 14830 + }, + { + "epoch": 0.150909423828125, + "grad_norm": 14.941376686096191, + "learning_rate": 4.727356531313307e-06, + "loss": 3.5681, + "step": 14835 + }, + { + "epoch": 0.15096028645833334, + "grad_norm": 13.9028959274292, + "learning_rate": 4.727174910521716e-06, + "loss": 3.2229, + "step": 14840 + }, + { + "epoch": 0.15101114908854166, + "grad_norm": 13.589739799499512, + "learning_rate": 4.726993232748382e-06, + "loss": 3.3788, + "step": 14845 + }, + { + "epoch": 0.15106201171875, + "grad_norm": 11.502520561218262, + "learning_rate": 4.7268114979979555e-06, + "loss": 3.4142, + "step": 14850 + }, + { + "epoch": 0.15111287434895834, + "grad_norm": 12.786849975585938, + "learning_rate": 4.726629706275083e-06, + "loss": 3.5127, + "step": 14855 + }, + { + "epoch": 0.15116373697916666, + "grad_norm": 7.40986967086792, + "learning_rate": 4.7264478575844185e-06, + "loss": 3.127, + "step": 14860 + }, + { + "epoch": 0.151214599609375, + "grad_norm": 10.638337135314941, + "learning_rate": 4.726265951930612e-06, + "loss": 3.0998, + "step": 14865 + }, + { + "epoch": 0.15126546223958334, + "grad_norm": 10.695724487304688, + "learning_rate": 4.72608398931832e-06, + "loss": 3.5446, + "step": 14870 + }, + { + "epoch": 0.15131632486979166, + "grad_norm": 11.744098663330078, + "learning_rate": 4.7259019697521955e-06, + "loss": 2.9158, + "step": 14875 + }, + { + "epoch": 0.1513671875, + "grad_norm": 9.356399536132812, + "learning_rate": 4.725719893236898e-06, + "loss": 3.2486, + "step": 14880 + }, + { + "epoch": 0.15141805013020834, + "grad_norm": 12.832469940185547, + "learning_rate": 4.725537759777084e-06, + "loss": 3.2114, + "step": 14885 + }, + { + "epoch": 0.15146891276041666, + "grad_norm": 12.14478588104248, + "learning_rate": 4.725355569377415e-06, + "loss": 2.9888, + "step": 14890 + }, + { + "epoch": 0.151519775390625, + "grad_norm": 8.157812118530273, + "learning_rate": 4.72517332204255e-06, + "loss": 3.3431, + "step": 14895 + }, + { + "epoch": 0.15157063802083334, + "grad_norm": 12.803070068359375, + "learning_rate": 4.724991017777153e-06, + "loss": 3.5178, + "step": 14900 + }, + { + "epoch": 0.15162150065104166, + "grad_norm": 8.299994468688965, + "learning_rate": 4.7248086565858886e-06, + "loss": 3.5268, + "step": 14905 + }, + { + "epoch": 0.15167236328125, + "grad_norm": 12.789700508117676, + "learning_rate": 4.724626238473421e-06, + "loss": 3.2294, + "step": 14910 + }, + { + "epoch": 0.15172322591145834, + "grad_norm": 9.37491226196289, + "learning_rate": 4.724443763444419e-06, + "loss": 3.22, + "step": 14915 + }, + { + "epoch": 0.15177408854166666, + "grad_norm": 14.288036346435547, + "learning_rate": 4.724261231503552e-06, + "loss": 3.3765, + "step": 14920 + }, + { + "epoch": 0.151824951171875, + "grad_norm": 17.946796417236328, + "learning_rate": 4.724078642655487e-06, + "loss": 3.1767, + "step": 14925 + }, + { + "epoch": 0.15187581380208334, + "grad_norm": 9.992767333984375, + "learning_rate": 4.723895996904897e-06, + "loss": 4.5134, + "step": 14930 + }, + { + "epoch": 0.15192667643229166, + "grad_norm": 12.649903297424316, + "learning_rate": 4.7237132942564565e-06, + "loss": 3.2216, + "step": 14935 + }, + { + "epoch": 0.1519775390625, + "grad_norm": 11.577804565429688, + "learning_rate": 4.723530534714837e-06, + "loss": 3.3525, + "step": 14940 + }, + { + "epoch": 0.15202840169270834, + "grad_norm": 11.609172821044922, + "learning_rate": 4.723347718284716e-06, + "loss": 3.2994, + "step": 14945 + }, + { + "epoch": 0.15207926432291666, + "grad_norm": 11.450626373291016, + "learning_rate": 4.723164844970771e-06, + "loss": 3.1224, + "step": 14950 + }, + { + "epoch": 0.152130126953125, + "grad_norm": 12.139467239379883, + "learning_rate": 4.722981914777681e-06, + "loss": 2.981, + "step": 14955 + }, + { + "epoch": 0.15218098958333334, + "grad_norm": 14.975263595581055, + "learning_rate": 4.722798927710124e-06, + "loss": 3.2868, + "step": 14960 + }, + { + "epoch": 0.15223185221354166, + "grad_norm": 14.647977828979492, + "learning_rate": 4.722615883772785e-06, + "loss": 3.3901, + "step": 14965 + }, + { + "epoch": 0.15228271484375, + "grad_norm": 9.550837516784668, + "learning_rate": 4.7224327829703444e-06, + "loss": 3.8862, + "step": 14970 + }, + { + "epoch": 0.15233357747395834, + "grad_norm": 10.410728454589844, + "learning_rate": 4.7222496253074876e-06, + "loss": 3.3335, + "step": 14975 + }, + { + "epoch": 0.15238444010416666, + "grad_norm": 9.198328018188477, + "learning_rate": 4.722066410788902e-06, + "loss": 3.7736, + "step": 14980 + }, + { + "epoch": 0.152435302734375, + "grad_norm": 15.138815879821777, + "learning_rate": 4.721883139419273e-06, + "loss": 3.2838, + "step": 14985 + }, + { + "epoch": 0.15248616536458334, + "grad_norm": 10.338898658752441, + "learning_rate": 4.721699811203291e-06, + "loss": 3.3141, + "step": 14990 + }, + { + "epoch": 0.15253702799479166, + "grad_norm": 11.960433959960938, + "learning_rate": 4.721516426145646e-06, + "loss": 3.3508, + "step": 14995 + }, + { + "epoch": 0.152587890625, + "grad_norm": 12.279603958129883, + "learning_rate": 4.7213329842510295e-06, + "loss": 3.3861, + "step": 15000 + }, + { + "epoch": 0.15263875325520834, + "grad_norm": 11.51431941986084, + "learning_rate": 4.721149485524135e-06, + "loss": 3.3131, + "step": 15005 + }, + { + "epoch": 0.15268961588541666, + "grad_norm": 8.839967727661133, + "learning_rate": 4.720965929969658e-06, + "loss": 3.2533, + "step": 15010 + }, + { + "epoch": 0.152740478515625, + "grad_norm": 8.14935302734375, + "learning_rate": 4.720782317592293e-06, + "loss": 3.6221, + "step": 15015 + }, + { + "epoch": 0.15279134114583334, + "grad_norm": 12.631562232971191, + "learning_rate": 4.7205986483967396e-06, + "loss": 3.6434, + "step": 15020 + }, + { + "epoch": 0.15284220377604166, + "grad_norm": 12.91220760345459, + "learning_rate": 4.720414922387696e-06, + "loss": 3.1931, + "step": 15025 + }, + { + "epoch": 0.15289306640625, + "grad_norm": 15.801246643066406, + "learning_rate": 4.720231139569863e-06, + "loss": 3.4381, + "step": 15030 + }, + { + "epoch": 0.15294392903645834, + "grad_norm": 11.53238296508789, + "learning_rate": 4.720047299947943e-06, + "loss": 3.7497, + "step": 15035 + }, + { + "epoch": 0.15299479166666666, + "grad_norm": 12.755319595336914, + "learning_rate": 4.7198634035266375e-06, + "loss": 3.2225, + "step": 15040 + }, + { + "epoch": 0.153045654296875, + "grad_norm": 9.745579719543457, + "learning_rate": 4.719679450310654e-06, + "loss": 3.5779, + "step": 15045 + }, + { + "epoch": 0.15309651692708334, + "grad_norm": 12.108492851257324, + "learning_rate": 4.719495440304698e-06, + "loss": 3.9278, + "step": 15050 + }, + { + "epoch": 0.15314737955729166, + "grad_norm": 13.5195951461792, + "learning_rate": 4.719311373513477e-06, + "loss": 3.3074, + "step": 15055 + }, + { + "epoch": 0.1531982421875, + "grad_norm": 14.34796142578125, + "learning_rate": 4.719127249941701e-06, + "loss": 3.2327, + "step": 15060 + }, + { + "epoch": 0.15324910481770834, + "grad_norm": 10.067496299743652, + "learning_rate": 4.718943069594079e-06, + "loss": 3.8594, + "step": 15065 + }, + { + "epoch": 0.15329996744791666, + "grad_norm": 9.026041984558105, + "learning_rate": 4.718758832475326e-06, + "loss": 3.4053, + "step": 15070 + }, + { + "epoch": 0.153350830078125, + "grad_norm": 14.132732391357422, + "learning_rate": 4.718574538590154e-06, + "loss": 3.3339, + "step": 15075 + }, + { + "epoch": 0.15340169270833334, + "grad_norm": 13.146659851074219, + "learning_rate": 4.718390187943278e-06, + "loss": 4.1499, + "step": 15080 + }, + { + "epoch": 0.15345255533854166, + "grad_norm": 12.891654014587402, + "learning_rate": 4.7182057805394145e-06, + "loss": 3.1477, + "step": 15085 + }, + { + "epoch": 0.15350341796875, + "grad_norm": 8.704354286193848, + "learning_rate": 4.718021316383282e-06, + "loss": 3.4241, + "step": 15090 + }, + { + "epoch": 0.15355428059895834, + "grad_norm": 8.283037185668945, + "learning_rate": 4.7178367954796e-06, + "loss": 3.4241, + "step": 15095 + }, + { + "epoch": 0.15360514322916666, + "grad_norm": 14.645727157592773, + "learning_rate": 4.7176522178330895e-06, + "loss": 4.3103, + "step": 15100 + }, + { + "epoch": 0.153656005859375, + "grad_norm": 8.679193496704102, + "learning_rate": 4.717467583448472e-06, + "loss": 4.3254, + "step": 15105 + }, + { + "epoch": 0.15370686848958334, + "grad_norm": 22.80760383605957, + "learning_rate": 4.717282892330472e-06, + "loss": 3.3115, + "step": 15110 + }, + { + "epoch": 0.15375773111979166, + "grad_norm": 10.638278007507324, + "learning_rate": 4.717098144483815e-06, + "loss": 3.4555, + "step": 15115 + }, + { + "epoch": 0.15380859375, + "grad_norm": 16.177385330200195, + "learning_rate": 4.7169133399132285e-06, + "loss": 3.2353, + "step": 15120 + }, + { + "epoch": 0.15385945638020834, + "grad_norm": 13.588946342468262, + "learning_rate": 4.7167284786234385e-06, + "loss": 3.512, + "step": 15125 + }, + { + "epoch": 0.15391031901041666, + "grad_norm": 14.394001960754395, + "learning_rate": 4.716543560619175e-06, + "loss": 3.0573, + "step": 15130 + }, + { + "epoch": 0.153961181640625, + "grad_norm": 9.461201667785645, + "learning_rate": 4.716358585905172e-06, + "loss": 3.2514, + "step": 15135 + }, + { + "epoch": 0.15401204427083334, + "grad_norm": 13.264630317687988, + "learning_rate": 4.716173554486159e-06, + "loss": 3.2544, + "step": 15140 + }, + { + "epoch": 0.15406290690104166, + "grad_norm": 11.358147621154785, + "learning_rate": 4.71598846636687e-06, + "loss": 3.5706, + "step": 15145 + }, + { + "epoch": 0.15411376953125, + "grad_norm": 14.053950309753418, + "learning_rate": 4.715803321552043e-06, + "loss": 3.4664, + "step": 15150 + }, + { + "epoch": 0.15416463216145834, + "grad_norm": 14.992704391479492, + "learning_rate": 4.715618120046412e-06, + "loss": 3.9273, + "step": 15155 + }, + { + "epoch": 0.15421549479166666, + "grad_norm": 13.31781005859375, + "learning_rate": 4.715432861854717e-06, + "loss": 3.5456, + "step": 15160 + }, + { + "epoch": 0.154266357421875, + "grad_norm": 8.325743675231934, + "learning_rate": 4.715247546981697e-06, + "loss": 3.3918, + "step": 15165 + }, + { + "epoch": 0.15431722005208334, + "grad_norm": 11.262593269348145, + "learning_rate": 4.715062175432093e-06, + "loss": 3.5529, + "step": 15170 + }, + { + "epoch": 0.15436808268229166, + "grad_norm": 10.394039154052734, + "learning_rate": 4.71487674721065e-06, + "loss": 3.3751, + "step": 15175 + }, + { + "epoch": 0.1544189453125, + "grad_norm": 16.578327178955078, + "learning_rate": 4.7146912623221095e-06, + "loss": 3.1776, + "step": 15180 + }, + { + "epoch": 0.15446980794270834, + "grad_norm": 11.776280403137207, + "learning_rate": 4.7145057207712175e-06, + "loss": 3.5786, + "step": 15185 + }, + { + "epoch": 0.15452067057291666, + "grad_norm": 12.497096061706543, + "learning_rate": 4.714320122562722e-06, + "loss": 3.0066, + "step": 15190 + }, + { + "epoch": 0.154571533203125, + "grad_norm": 9.880024909973145, + "learning_rate": 4.714134467701371e-06, + "loss": 3.3478, + "step": 15195 + }, + { + "epoch": 0.15462239583333334, + "grad_norm": 9.223662376403809, + "learning_rate": 4.713948756191915e-06, + "loss": 3.3263, + "step": 15200 + }, + { + "epoch": 0.15467325846354166, + "grad_norm": 7.955392837524414, + "learning_rate": 4.713762988039105e-06, + "loss": 3.4025, + "step": 15205 + }, + { + "epoch": 0.15472412109375, + "grad_norm": 10.604644775390625, + "learning_rate": 4.713577163247692e-06, + "loss": 2.9697, + "step": 15210 + }, + { + "epoch": 0.15477498372395834, + "grad_norm": 7.452188014984131, + "learning_rate": 4.713391281822433e-06, + "loss": 3.6369, + "step": 15215 + }, + { + "epoch": 0.15482584635416666, + "grad_norm": 12.423209190368652, + "learning_rate": 4.713205343768082e-06, + "loss": 3.3861, + "step": 15220 + }, + { + "epoch": 0.154876708984375, + "grad_norm": 17.55681610107422, + "learning_rate": 4.713019349089399e-06, + "loss": 3.6858, + "step": 15225 + }, + { + "epoch": 0.15492757161458334, + "grad_norm": 14.02679443359375, + "learning_rate": 4.712833297791138e-06, + "loss": 3.167, + "step": 15230 + }, + { + "epoch": 0.15497843424479166, + "grad_norm": 11.839001655578613, + "learning_rate": 4.712647189878063e-06, + "loss": 2.955, + "step": 15235 + }, + { + "epoch": 0.155029296875, + "grad_norm": 11.20364761352539, + "learning_rate": 4.712461025354933e-06, + "loss": 3.0909, + "step": 15240 + }, + { + "epoch": 0.15508015950520834, + "grad_norm": 18.19971466064453, + "learning_rate": 4.712274804226513e-06, + "loss": 3.2367, + "step": 15245 + }, + { + "epoch": 0.15513102213541666, + "grad_norm": 7.526338577270508, + "learning_rate": 4.712088526497566e-06, + "loss": 3.3709, + "step": 15250 + }, + { + "epoch": 0.155181884765625, + "grad_norm": 15.513593673706055, + "learning_rate": 4.711902192172858e-06, + "loss": 3.9162, + "step": 15255 + }, + { + "epoch": 0.15523274739583334, + "grad_norm": 10.80632495880127, + "learning_rate": 4.7117158012571585e-06, + "loss": 3.1795, + "step": 15260 + }, + { + "epoch": 0.15528361002604166, + "grad_norm": 13.773353576660156, + "learning_rate": 4.711529353755233e-06, + "loss": 3.7235, + "step": 15265 + }, + { + "epoch": 0.15533447265625, + "grad_norm": 9.968823432922363, + "learning_rate": 4.711342849671853e-06, + "loss": 3.4381, + "step": 15270 + }, + { + "epoch": 0.15538533528645834, + "grad_norm": 7.348340034484863, + "learning_rate": 4.711156289011792e-06, + "loss": 2.9837, + "step": 15275 + }, + { + "epoch": 0.15543619791666666, + "grad_norm": 19.31353187561035, + "learning_rate": 4.710969671779819e-06, + "loss": 3.2548, + "step": 15280 + }, + { + "epoch": 0.155487060546875, + "grad_norm": 13.9642915725708, + "learning_rate": 4.7107829979807124e-06, + "loss": 3.1202, + "step": 15285 + }, + { + "epoch": 0.15553792317708334, + "grad_norm": 12.23745346069336, + "learning_rate": 4.710596267619247e-06, + "loss": 3.3532, + "step": 15290 + }, + { + "epoch": 0.15558878580729166, + "grad_norm": 10.830820083618164, + "learning_rate": 4.710409480700199e-06, + "loss": 3.5973, + "step": 15295 + }, + { + "epoch": 0.1556396484375, + "grad_norm": 10.229269981384277, + "learning_rate": 4.710222637228349e-06, + "loss": 3.5623, + "step": 15300 + }, + { + "epoch": 0.15569051106770834, + "grad_norm": 11.168558120727539, + "learning_rate": 4.710035737208477e-06, + "loss": 3.0141, + "step": 15305 + }, + { + "epoch": 0.15574137369791666, + "grad_norm": 7.080033302307129, + "learning_rate": 4.709848780645364e-06, + "loss": 3.4462, + "step": 15310 + }, + { + "epoch": 0.155792236328125, + "grad_norm": 7.330209255218506, + "learning_rate": 4.709661767543794e-06, + "loss": 3.2099, + "step": 15315 + }, + { + "epoch": 0.15584309895833334, + "grad_norm": 7.501708030700684, + "learning_rate": 4.709474697908552e-06, + "loss": 3.3722, + "step": 15320 + }, + { + "epoch": 0.15589396158854166, + "grad_norm": 9.475503921508789, + "learning_rate": 4.709287571744423e-06, + "loss": 3.6584, + "step": 15325 + }, + { + "epoch": 0.15594482421875, + "grad_norm": 14.18191146850586, + "learning_rate": 4.709100389056195e-06, + "loss": 3.3899, + "step": 15330 + }, + { + "epoch": 0.15599568684895834, + "grad_norm": 9.33660888671875, + "learning_rate": 4.708913149848658e-06, + "loss": 3.455, + "step": 15335 + }, + { + "epoch": 0.15604654947916666, + "grad_norm": 10.5362548828125, + "learning_rate": 4.7087258541266e-06, + "loss": 3.5219, + "step": 15340 + }, + { + "epoch": 0.156097412109375, + "grad_norm": 13.641423225402832, + "learning_rate": 4.7085385018948155e-06, + "loss": 3.8866, + "step": 15345 + }, + { + "epoch": 0.15614827473958334, + "grad_norm": 14.66014289855957, + "learning_rate": 4.708351093158097e-06, + "loss": 3.6575, + "step": 15350 + }, + { + "epoch": 0.15619913736979166, + "grad_norm": 10.71635627746582, + "learning_rate": 4.708163627921239e-06, + "loss": 3.5451, + "step": 15355 + }, + { + "epoch": 0.15625, + "grad_norm": 12.844642639160156, + "learning_rate": 4.7079761061890374e-06, + "loss": 3.3541, + "step": 15360 + }, + { + "epoch": 0.15630086263020834, + "grad_norm": 9.12993335723877, + "learning_rate": 4.707788527966291e-06, + "loss": 3.3004, + "step": 15365 + }, + { + "epoch": 0.15635172526041666, + "grad_norm": 8.735359191894531, + "learning_rate": 4.707600893257799e-06, + "loss": 3.6416, + "step": 15370 + }, + { + "epoch": 0.156402587890625, + "grad_norm": 17.718563079833984, + "learning_rate": 4.707413202068361e-06, + "loss": 3.4838, + "step": 15375 + }, + { + "epoch": 0.15645345052083334, + "grad_norm": 13.61154556274414, + "learning_rate": 4.707225454402779e-06, + "loss": 3.3291, + "step": 15380 + }, + { + "epoch": 0.15650431315104166, + "grad_norm": 10.323596954345703, + "learning_rate": 4.707037650265857e-06, + "loss": 3.44, + "step": 15385 + }, + { + "epoch": 0.15655517578125, + "grad_norm": 12.949178695678711, + "learning_rate": 4.7068497896624014e-06, + "loss": 3.6229, + "step": 15390 + }, + { + "epoch": 0.15660603841145834, + "grad_norm": 10.059599876403809, + "learning_rate": 4.706661872597215e-06, + "loss": 3.1794, + "step": 15395 + }, + { + "epoch": 0.15665690104166666, + "grad_norm": 13.678911209106445, + "learning_rate": 4.706473899075108e-06, + "loss": 3.9318, + "step": 15400 + }, + { + "epoch": 0.156707763671875, + "grad_norm": 11.830682754516602, + "learning_rate": 4.7062858691008906e-06, + "loss": 3.383, + "step": 15405 + }, + { + "epoch": 0.15675862630208334, + "grad_norm": 17.724088668823242, + "learning_rate": 4.706097782679371e-06, + "loss": 3.9705, + "step": 15410 + }, + { + "epoch": 0.15680948893229166, + "grad_norm": 7.41434907913208, + "learning_rate": 4.7059096398153624e-06, + "loss": 3.3684, + "step": 15415 + }, + { + "epoch": 0.1568603515625, + "grad_norm": 10.549930572509766, + "learning_rate": 4.705721440513679e-06, + "loss": 3.5617, + "step": 15420 + }, + { + "epoch": 0.15691121419270834, + "grad_norm": 13.3803071975708, + "learning_rate": 4.705533184779135e-06, + "loss": 3.4956, + "step": 15425 + }, + { + "epoch": 0.15696207682291666, + "grad_norm": 9.431879043579102, + "learning_rate": 4.705344872616548e-06, + "loss": 3.3123, + "step": 15430 + }, + { + "epoch": 0.157012939453125, + "grad_norm": 13.48584270477295, + "learning_rate": 4.705156504030735e-06, + "loss": 3.5462, + "step": 15435 + }, + { + "epoch": 0.15706380208333334, + "grad_norm": 12.505586624145508, + "learning_rate": 4.7049680790265145e-06, + "loss": 3.5568, + "step": 15440 + }, + { + "epoch": 0.15711466471354166, + "grad_norm": 14.382122993469238, + "learning_rate": 4.704779597608709e-06, + "loss": 3.2458, + "step": 15445 + }, + { + "epoch": 0.15716552734375, + "grad_norm": 17.052955627441406, + "learning_rate": 4.70459105978214e-06, + "loss": 3.3895, + "step": 15450 + }, + { + "epoch": 0.15721638997395834, + "grad_norm": 9.036126136779785, + "learning_rate": 4.704402465551632e-06, + "loss": 3.3079, + "step": 15455 + }, + { + "epoch": 0.15726725260416666, + "grad_norm": 10.451210021972656, + "learning_rate": 4.704213814922008e-06, + "loss": 3.5565, + "step": 15460 + }, + { + "epoch": 0.157318115234375, + "grad_norm": 9.082598686218262, + "learning_rate": 4.704025107898097e-06, + "loss": 3.1954, + "step": 15465 + }, + { + "epoch": 0.15736897786458334, + "grad_norm": 15.115447044372559, + "learning_rate": 4.703836344484726e-06, + "loss": 3.3543, + "step": 15470 + }, + { + "epoch": 0.15741984049479166, + "grad_norm": 11.461282730102539, + "learning_rate": 4.7036475246867245e-06, + "loss": 3.7555, + "step": 15475 + }, + { + "epoch": 0.157470703125, + "grad_norm": 10.950669288635254, + "learning_rate": 4.703458648508923e-06, + "loss": 3.5753, + "step": 15480 + }, + { + "epoch": 0.15752156575520834, + "grad_norm": 10.564921379089355, + "learning_rate": 4.703269715956154e-06, + "loss": 3.0771, + "step": 15485 + }, + { + "epoch": 0.15757242838541666, + "grad_norm": 11.57642650604248, + "learning_rate": 4.703080727033252e-06, + "loss": 3.1755, + "step": 15490 + }, + { + "epoch": 0.157623291015625, + "grad_norm": 10.133732795715332, + "learning_rate": 4.702891681745052e-06, + "loss": 3.6182, + "step": 15495 + }, + { + "epoch": 0.15767415364583334, + "grad_norm": 11.367161750793457, + "learning_rate": 4.70270258009639e-06, + "loss": 3.2131, + "step": 15500 + }, + { + "epoch": 0.15772501627604166, + "grad_norm": 10.937525749206543, + "learning_rate": 4.702513422092106e-06, + "loss": 3.6436, + "step": 15505 + }, + { + "epoch": 0.15777587890625, + "grad_norm": 7.845228672027588, + "learning_rate": 4.7023242077370365e-06, + "loss": 3.2949, + "step": 15510 + }, + { + "epoch": 0.15782674153645834, + "grad_norm": 10.833147048950195, + "learning_rate": 4.7021349370360246e-06, + "loss": 3.5691, + "step": 15515 + }, + { + "epoch": 0.15787760416666666, + "grad_norm": 10.04218578338623, + "learning_rate": 4.701945609993912e-06, + "loss": 3.6244, + "step": 15520 + }, + { + "epoch": 0.157928466796875, + "grad_norm": 15.010988235473633, + "learning_rate": 4.701756226615544e-06, + "loss": 3.5053, + "step": 15525 + }, + { + "epoch": 0.15797932942708334, + "grad_norm": 12.318516731262207, + "learning_rate": 4.701566786905763e-06, + "loss": 3.2215, + "step": 15530 + }, + { + "epoch": 0.15803019205729166, + "grad_norm": 12.219952583312988, + "learning_rate": 4.701377290869419e-06, + "loss": 3.5105, + "step": 15535 + }, + { + "epoch": 0.1580810546875, + "grad_norm": 15.182580947875977, + "learning_rate": 4.701187738511358e-06, + "loss": 3.6047, + "step": 15540 + }, + { + "epoch": 0.15813191731770834, + "grad_norm": 8.09954833984375, + "learning_rate": 4.700998129836431e-06, + "loss": 3.2873, + "step": 15545 + }, + { + "epoch": 0.15818277994791666, + "grad_norm": 12.153461456298828, + "learning_rate": 4.700808464849489e-06, + "loss": 3.2927, + "step": 15550 + }, + { + "epoch": 0.158233642578125, + "grad_norm": 16.8814754486084, + "learning_rate": 4.700618743555384e-06, + "loss": 3.6992, + "step": 15555 + }, + { + "epoch": 0.15828450520833334, + "grad_norm": 8.98434066772461, + "learning_rate": 4.700428965958968e-06, + "loss": 3.2662, + "step": 15560 + }, + { + "epoch": 0.15833536783854166, + "grad_norm": 14.116239547729492, + "learning_rate": 4.700239132065101e-06, + "loss": 3.6816, + "step": 15565 + }, + { + "epoch": 0.15838623046875, + "grad_norm": 14.018820762634277, + "learning_rate": 4.700049241878637e-06, + "loss": 3.4, + "step": 15570 + }, + { + "epoch": 0.15843709309895834, + "grad_norm": 9.490670204162598, + "learning_rate": 4.699859295404433e-06, + "loss": 3.3853, + "step": 15575 + }, + { + "epoch": 0.15848795572916666, + "grad_norm": 12.291707992553711, + "learning_rate": 4.699669292647352e-06, + "loss": 3.4993, + "step": 15580 + }, + { + "epoch": 0.158538818359375, + "grad_norm": 11.097095489501953, + "learning_rate": 4.699479233612252e-06, + "loss": 3.4258, + "step": 15585 + }, + { + "epoch": 0.15858968098958334, + "grad_norm": 16.145360946655273, + "learning_rate": 4.699289118303998e-06, + "loss": 3.6462, + "step": 15590 + }, + { + "epoch": 0.15864054361979166, + "grad_norm": 13.295560836791992, + "learning_rate": 4.699098946727454e-06, + "loss": 3.7865, + "step": 15595 + }, + { + "epoch": 0.15869140625, + "grad_norm": 7.7286152839660645, + "learning_rate": 4.6989087188874835e-06, + "loss": 3.2336, + "step": 15600 + }, + { + "epoch": 0.15874226888020834, + "grad_norm": 8.864896774291992, + "learning_rate": 4.698718434788955e-06, + "loss": 3.5222, + "step": 15605 + }, + { + "epoch": 0.15879313151041666, + "grad_norm": 16.60683822631836, + "learning_rate": 4.698528094436737e-06, + "loss": 3.5871, + "step": 15610 + }, + { + "epoch": 0.158843994140625, + "grad_norm": 13.247583389282227, + "learning_rate": 4.698337697835697e-06, + "loss": 3.3722, + "step": 15615 + }, + { + "epoch": 0.15889485677083334, + "grad_norm": 10.77502155303955, + "learning_rate": 4.69814724499071e-06, + "loss": 3.3955, + "step": 15620 + }, + { + "epoch": 0.15894571940104166, + "grad_norm": 12.344950675964355, + "learning_rate": 4.697956735906646e-06, + "loss": 3.197, + "step": 15625 + }, + { + "epoch": 0.15899658203125, + "grad_norm": 14.187060356140137, + "learning_rate": 4.6977661705883805e-06, + "loss": 3.3216, + "step": 15630 + }, + { + "epoch": 0.15904744466145834, + "grad_norm": 14.362591743469238, + "learning_rate": 4.697575549040788e-06, + "loss": 3.4256, + "step": 15635 + }, + { + "epoch": 0.15909830729166666, + "grad_norm": 10.191631317138672, + "learning_rate": 4.697384871268745e-06, + "loss": 3.4145, + "step": 15640 + }, + { + "epoch": 0.159149169921875, + "grad_norm": 15.049766540527344, + "learning_rate": 4.697194137277132e-06, + "loss": 3.4378, + "step": 15645 + }, + { + "epoch": 0.15920003255208334, + "grad_norm": 16.25983428955078, + "learning_rate": 4.697003347070828e-06, + "loss": 3.3361, + "step": 15650 + }, + { + "epoch": 0.15925089518229166, + "grad_norm": 17.122467041015625, + "learning_rate": 4.696812500654714e-06, + "loss": 3.2949, + "step": 15655 + }, + { + "epoch": 0.1593017578125, + "grad_norm": 11.814437866210938, + "learning_rate": 4.696621598033673e-06, + "loss": 3.1715, + "step": 15660 + }, + { + "epoch": 0.15935262044270834, + "grad_norm": 9.952800750732422, + "learning_rate": 4.696430639212588e-06, + "loss": 3.3326, + "step": 15665 + }, + { + "epoch": 0.15940348307291666, + "grad_norm": 13.430285453796387, + "learning_rate": 4.696239624196346e-06, + "loss": 3.542, + "step": 15670 + }, + { + "epoch": 0.159454345703125, + "grad_norm": 8.788776397705078, + "learning_rate": 4.696048552989835e-06, + "loss": 3.3412, + "step": 15675 + }, + { + "epoch": 0.15950520833333334, + "grad_norm": 11.158987045288086, + "learning_rate": 4.69585742559794e-06, + "loss": 3.1511, + "step": 15680 + }, + { + "epoch": 0.15955607096354166, + "grad_norm": 8.167466163635254, + "learning_rate": 4.695666242025556e-06, + "loss": 3.7305, + "step": 15685 + }, + { + "epoch": 0.15960693359375, + "grad_norm": 13.274105072021484, + "learning_rate": 4.69547500227757e-06, + "loss": 3.3312, + "step": 15690 + }, + { + "epoch": 0.15965779622395834, + "grad_norm": 10.515645027160645, + "learning_rate": 4.6952837063588766e-06, + "loss": 3.07, + "step": 15695 + }, + { + "epoch": 0.15970865885416666, + "grad_norm": 12.83730411529541, + "learning_rate": 4.69509235427437e-06, + "loss": 3.5047, + "step": 15700 + }, + { + "epoch": 0.159759521484375, + "grad_norm": 12.311369895935059, + "learning_rate": 4.694900946028946e-06, + "loss": 3.6497, + "step": 15705 + }, + { + "epoch": 0.15981038411458334, + "grad_norm": 12.914471626281738, + "learning_rate": 4.694709481627502e-06, + "loss": 3.2395, + "step": 15710 + }, + { + "epoch": 0.15986124674479166, + "grad_norm": 14.76524543762207, + "learning_rate": 4.694517961074934e-06, + "loss": 3.2397, + "step": 15715 + }, + { + "epoch": 0.159912109375, + "grad_norm": 18.405723571777344, + "learning_rate": 4.694326384376146e-06, + "loss": 3.3388, + "step": 15720 + }, + { + "epoch": 0.15996297200520834, + "grad_norm": 9.726785659790039, + "learning_rate": 4.694134751536038e-06, + "loss": 3.287, + "step": 15725 + }, + { + "epoch": 0.16001383463541666, + "grad_norm": 12.435685157775879, + "learning_rate": 4.693943062559512e-06, + "loss": 3.2259, + "step": 15730 + }, + { + "epoch": 0.160064697265625, + "grad_norm": 11.35924243927002, + "learning_rate": 4.693751317451472e-06, + "loss": 3.1574, + "step": 15735 + }, + { + "epoch": 0.16011555989583334, + "grad_norm": 10.276074409484863, + "learning_rate": 4.693559516216825e-06, + "loss": 3.3365, + "step": 15740 + }, + { + "epoch": 0.16016642252604166, + "grad_norm": 8.467253684997559, + "learning_rate": 4.693367658860478e-06, + "loss": 3.1872, + "step": 15745 + }, + { + "epoch": 0.16021728515625, + "grad_norm": 11.348249435424805, + "learning_rate": 4.693175745387339e-06, + "loss": 3.3335, + "step": 15750 + }, + { + "epoch": 0.16026814778645834, + "grad_norm": 12.088251113891602, + "learning_rate": 4.692983775802318e-06, + "loss": 3.4162, + "step": 15755 + }, + { + "epoch": 0.16031901041666666, + "grad_norm": 10.288962364196777, + "learning_rate": 4.692791750110327e-06, + "loss": 3.4439, + "step": 15760 + }, + { + "epoch": 0.160369873046875, + "grad_norm": 15.410057067871094, + "learning_rate": 4.692599668316279e-06, + "loss": 3.4407, + "step": 15765 + }, + { + "epoch": 0.16042073567708334, + "grad_norm": 13.116768836975098, + "learning_rate": 4.692407530425089e-06, + "loss": 3.4639, + "step": 15770 + }, + { + "epoch": 0.16047159830729166, + "grad_norm": 9.727956771850586, + "learning_rate": 4.692215336441671e-06, + "loss": 3.3015, + "step": 15775 + }, + { + "epoch": 0.1605224609375, + "grad_norm": 11.078639030456543, + "learning_rate": 4.692023086370944e-06, + "loss": 3.265, + "step": 15780 + }, + { + "epoch": 0.16057332356770834, + "grad_norm": 7.883986473083496, + "learning_rate": 4.6918307802178255e-06, + "loss": 3.4122, + "step": 15785 + }, + { + "epoch": 0.16062418619791666, + "grad_norm": 12.443114280700684, + "learning_rate": 4.6916384179872356e-06, + "loss": 3.0951, + "step": 15790 + }, + { + "epoch": 0.160675048828125, + "grad_norm": 9.635088920593262, + "learning_rate": 4.691445999684097e-06, + "loss": 3.8282, + "step": 15795 + }, + { + "epoch": 0.16072591145833334, + "grad_norm": 8.927238464355469, + "learning_rate": 4.69125352531333e-06, + "loss": 3.4864, + "step": 15800 + }, + { + "epoch": 0.16077677408854166, + "grad_norm": 10.879573822021484, + "learning_rate": 4.6910609948798636e-06, + "loss": 3.7555, + "step": 15805 + }, + { + "epoch": 0.16082763671875, + "grad_norm": 8.361101150512695, + "learning_rate": 4.69086840838862e-06, + "loss": 3.5791, + "step": 15810 + }, + { + "epoch": 0.16087849934895834, + "grad_norm": 10.21220588684082, + "learning_rate": 4.6906757658445265e-06, + "loss": 3.2996, + "step": 15815 + }, + { + "epoch": 0.16092936197916666, + "grad_norm": 12.77525520324707, + "learning_rate": 4.690483067252514e-06, + "loss": 3.4817, + "step": 15820 + }, + { + "epoch": 0.160980224609375, + "grad_norm": 12.158894538879395, + "learning_rate": 4.690290312617512e-06, + "loss": 3.3044, + "step": 15825 + }, + { + "epoch": 0.16103108723958334, + "grad_norm": 15.93702507019043, + "learning_rate": 4.69009750194445e-06, + "loss": 3.3049, + "step": 15830 + }, + { + "epoch": 0.16108194986979166, + "grad_norm": 14.75804615020752, + "learning_rate": 4.6899046352382625e-06, + "loss": 3.2458, + "step": 15835 + }, + { + "epoch": 0.1611328125, + "grad_norm": 7.818066596984863, + "learning_rate": 4.689711712503885e-06, + "loss": 3.3629, + "step": 15840 + }, + { + "epoch": 0.16118367513020834, + "grad_norm": 11.566434860229492, + "learning_rate": 4.689518733746251e-06, + "loss": 3.3139, + "step": 15845 + }, + { + "epoch": 0.16123453776041666, + "grad_norm": 11.620870590209961, + "learning_rate": 4.689325698970301e-06, + "loss": 3.3736, + "step": 15850 + }, + { + "epoch": 0.161285400390625, + "grad_norm": 9.550261497497559, + "learning_rate": 4.6891326081809705e-06, + "loss": 3.1503, + "step": 15855 + }, + { + "epoch": 0.16133626302083334, + "grad_norm": 14.424015045166016, + "learning_rate": 4.688939461383202e-06, + "loss": 3.3043, + "step": 15860 + }, + { + "epoch": 0.16138712565104166, + "grad_norm": 14.749124526977539, + "learning_rate": 4.688746258581936e-06, + "loss": 3.2469, + "step": 15865 + }, + { + "epoch": 0.16143798828125, + "grad_norm": 14.39970588684082, + "learning_rate": 4.688552999782114e-06, + "loss": 3.4534, + "step": 15870 + }, + { + "epoch": 0.16148885091145834, + "grad_norm": 11.76413631439209, + "learning_rate": 4.6883596849886845e-06, + "loss": 3.1851, + "step": 15875 + }, + { + "epoch": 0.16153971354166666, + "grad_norm": 14.800198554992676, + "learning_rate": 4.68816631420659e-06, + "loss": 3.7561, + "step": 15880 + }, + { + "epoch": 0.161590576171875, + "grad_norm": 13.527456283569336, + "learning_rate": 4.68797288744078e-06, + "loss": 3.4993, + "step": 15885 + }, + { + "epoch": 0.16164143880208334, + "grad_norm": 13.89983081817627, + "learning_rate": 4.6877794046962014e-06, + "loss": 3.5607, + "step": 15890 + }, + { + "epoch": 0.16169230143229166, + "grad_norm": 12.58095932006836, + "learning_rate": 4.687585865977806e-06, + "loss": 3.2699, + "step": 15895 + }, + { + "epoch": 0.1617431640625, + "grad_norm": 14.336568832397461, + "learning_rate": 4.687392271290544e-06, + "loss": 3.5323, + "step": 15900 + }, + { + "epoch": 0.16179402669270834, + "grad_norm": 12.0953950881958, + "learning_rate": 4.6871986206393695e-06, + "loss": 3.549, + "step": 15905 + }, + { + "epoch": 0.16184488932291666, + "grad_norm": 10.101968765258789, + "learning_rate": 4.687004914029237e-06, + "loss": 3.3465, + "step": 15910 + }, + { + "epoch": 0.161895751953125, + "grad_norm": 14.160582542419434, + "learning_rate": 4.6868111514651025e-06, + "loss": 3.3592, + "step": 15915 + }, + { + "epoch": 0.16194661458333334, + "grad_norm": 8.53564453125, + "learning_rate": 4.686617332951922e-06, + "loss": 3.3656, + "step": 15920 + }, + { + "epoch": 0.16199747721354166, + "grad_norm": 13.714225769042969, + "learning_rate": 4.686423458494655e-06, + "loss": 2.912, + "step": 15925 + }, + { + "epoch": 0.16204833984375, + "grad_norm": 13.133865356445312, + "learning_rate": 4.686229528098263e-06, + "loss": 3.5427, + "step": 15930 + }, + { + "epoch": 0.16209920247395834, + "grad_norm": 10.839924812316895, + "learning_rate": 4.686035541767707e-06, + "loss": 3.1262, + "step": 15935 + }, + { + "epoch": 0.16215006510416666, + "grad_norm": 16.54444694519043, + "learning_rate": 4.6858414995079495e-06, + "loss": 3.5354, + "step": 15940 + }, + { + "epoch": 0.162200927734375, + "grad_norm": 14.630023002624512, + "learning_rate": 4.685647401323955e-06, + "loss": 3.9413, + "step": 15945 + }, + { + "epoch": 0.16225179036458334, + "grad_norm": 9.191962242126465, + "learning_rate": 4.68545324722069e-06, + "loss": 3.011, + "step": 15950 + }, + { + "epoch": 0.16230265299479166, + "grad_norm": 9.845999717712402, + "learning_rate": 4.685259037203121e-06, + "loss": 3.433, + "step": 15955 + }, + { + "epoch": 0.162353515625, + "grad_norm": 10.282391548156738, + "learning_rate": 4.685064771276219e-06, + "loss": 4.0526, + "step": 15960 + }, + { + "epoch": 0.16240437825520834, + "grad_norm": 11.311758995056152, + "learning_rate": 4.684870449444951e-06, + "loss": 3.3342, + "step": 15965 + }, + { + "epoch": 0.16245524088541666, + "grad_norm": 10.48366641998291, + "learning_rate": 4.684676071714292e-06, + "loss": 3.3553, + "step": 15970 + }, + { + "epoch": 0.162506103515625, + "grad_norm": 14.828400611877441, + "learning_rate": 4.684481638089212e-06, + "loss": 3.3099, + "step": 15975 + }, + { + "epoch": 0.16255696614583334, + "grad_norm": 9.731256484985352, + "learning_rate": 4.684287148574689e-06, + "loss": 3.4463, + "step": 15980 + }, + { + "epoch": 0.16260782877604166, + "grad_norm": 11.136251449584961, + "learning_rate": 4.684092603175696e-06, + "loss": 3.5473, + "step": 15985 + }, + { + "epoch": 0.16265869140625, + "grad_norm": 13.161229133605957, + "learning_rate": 4.683898001897211e-06, + "loss": 3.3273, + "step": 15990 + }, + { + "epoch": 0.16270955403645834, + "grad_norm": 9.826051712036133, + "learning_rate": 4.683703344744213e-06, + "loss": 3.3483, + "step": 15995 + }, + { + "epoch": 0.16276041666666666, + "grad_norm": 7.0690460205078125, + "learning_rate": 4.683508631721684e-06, + "loss": 3.0126, + "step": 16000 + }, + { + "epoch": 0.162811279296875, + "grad_norm": 13.985572814941406, + "learning_rate": 4.683313862834603e-06, + "loss": 3.6359, + "step": 16005 + }, + { + "epoch": 0.16286214192708334, + "grad_norm": 10.578701972961426, + "learning_rate": 4.683119038087955e-06, + "loss": 3.2651, + "step": 16010 + }, + { + "epoch": 0.16291300455729166, + "grad_norm": 11.11215591430664, + "learning_rate": 4.682924157486724e-06, + "loss": 3.1562, + "step": 16015 + }, + { + "epoch": 0.1629638671875, + "grad_norm": 11.88314151763916, + "learning_rate": 4.682729221035895e-06, + "loss": 3.677, + "step": 16020 + }, + { + "epoch": 0.16301472981770834, + "grad_norm": 8.311817169189453, + "learning_rate": 4.6825342287404564e-06, + "loss": 3.6961, + "step": 16025 + }, + { + "epoch": 0.16306559244791666, + "grad_norm": 13.549260139465332, + "learning_rate": 4.682339180605397e-06, + "loss": 3.7595, + "step": 16030 + }, + { + "epoch": 0.163116455078125, + "grad_norm": 9.020939826965332, + "learning_rate": 4.682144076635707e-06, + "loss": 3.2208, + "step": 16035 + }, + { + "epoch": 0.16316731770833334, + "grad_norm": 16.133804321289062, + "learning_rate": 4.681948916836378e-06, + "loss": 3.4582, + "step": 16040 + }, + { + "epoch": 0.16321818033854166, + "grad_norm": 9.808793067932129, + "learning_rate": 4.681753701212404e-06, + "loss": 3.3957, + "step": 16045 + }, + { + "epoch": 0.16326904296875, + "grad_norm": 13.307278633117676, + "learning_rate": 4.681558429768777e-06, + "loss": 3.2849, + "step": 16050 + }, + { + "epoch": 0.16331990559895834, + "grad_norm": 17.242029190063477, + "learning_rate": 4.681363102510496e-06, + "loss": 3.2799, + "step": 16055 + }, + { + "epoch": 0.16337076822916666, + "grad_norm": 9.080634117126465, + "learning_rate": 4.6811677194425566e-06, + "loss": 2.9969, + "step": 16060 + }, + { + "epoch": 0.163421630859375, + "grad_norm": 8.223971366882324, + "learning_rate": 4.680972280569958e-06, + "loss": 3.3398, + "step": 16065 + }, + { + "epoch": 0.16347249348958334, + "grad_norm": 13.515527725219727, + "learning_rate": 4.680776785897701e-06, + "loss": 3.1698, + "step": 16070 + }, + { + "epoch": 0.16352335611979166, + "grad_norm": 10.615397453308105, + "learning_rate": 4.680581235430786e-06, + "loss": 3.644, + "step": 16075 + }, + { + "epoch": 0.16357421875, + "grad_norm": 13.023798942565918, + "learning_rate": 4.680385629174218e-06, + "loss": 3.8601, + "step": 16080 + }, + { + "epoch": 0.16362508138020834, + "grad_norm": 9.510238647460938, + "learning_rate": 4.680189967133e-06, + "loss": 3.2448, + "step": 16085 + }, + { + "epoch": 0.16367594401041666, + "grad_norm": 10.635534286499023, + "learning_rate": 4.6799942493121374e-06, + "loss": 3.7123, + "step": 16090 + }, + { + "epoch": 0.163726806640625, + "grad_norm": 8.131168365478516, + "learning_rate": 4.67979847571664e-06, + "loss": 3.4378, + "step": 16095 + }, + { + "epoch": 0.16377766927083334, + "grad_norm": 9.373008728027344, + "learning_rate": 4.679602646351515e-06, + "loss": 3.5492, + "step": 16100 + }, + { + "epoch": 0.16382853190104166, + "grad_norm": 15.981472969055176, + "learning_rate": 4.6794067612217734e-06, + "loss": 3.7413, + "step": 16105 + }, + { + "epoch": 0.16387939453125, + "grad_norm": 9.636930465698242, + "learning_rate": 4.679210820332425e-06, + "loss": 3.5977, + "step": 16110 + }, + { + "epoch": 0.16393025716145834, + "grad_norm": 13.382928848266602, + "learning_rate": 4.679014823688485e-06, + "loss": 3.3873, + "step": 16115 + }, + { + "epoch": 0.16398111979166666, + "grad_norm": 11.43319320678711, + "learning_rate": 4.678818771294967e-06, + "loss": 3.4517, + "step": 16120 + }, + { + "epoch": 0.164031982421875, + "grad_norm": 10.521183013916016, + "learning_rate": 4.678622663156888e-06, + "loss": 3.4823, + "step": 16125 + }, + { + "epoch": 0.16408284505208334, + "grad_norm": 10.852574348449707, + "learning_rate": 4.678426499279264e-06, + "loss": 3.4299, + "step": 16130 + }, + { + "epoch": 0.16413370768229166, + "grad_norm": 13.135746955871582, + "learning_rate": 4.6782302796671145e-06, + "loss": 3.4938, + "step": 16135 + }, + { + "epoch": 0.1641845703125, + "grad_norm": 16.104429244995117, + "learning_rate": 4.678034004325459e-06, + "loss": 3.4273, + "step": 16140 + }, + { + "epoch": 0.16423543294270834, + "grad_norm": 14.600593566894531, + "learning_rate": 4.677837673259321e-06, + "loss": 3.4873, + "step": 16145 + }, + { + "epoch": 0.16428629557291666, + "grad_norm": 10.380234718322754, + "learning_rate": 4.677641286473722e-06, + "loss": 3.4697, + "step": 16150 + }, + { + "epoch": 0.164337158203125, + "grad_norm": 13.653000831604004, + "learning_rate": 4.677444843973685e-06, + "loss": 3.2568, + "step": 16155 + }, + { + "epoch": 0.16438802083333334, + "grad_norm": 10.491547584533691, + "learning_rate": 4.67724834576424e-06, + "loss": 3.7773, + "step": 16160 + }, + { + "epoch": 0.16443888346354166, + "grad_norm": 18.804861068725586, + "learning_rate": 4.677051791850411e-06, + "loss": 3.1534, + "step": 16165 + }, + { + "epoch": 0.16448974609375, + "grad_norm": 10.031415939331055, + "learning_rate": 4.676855182237229e-06, + "loss": 3.1366, + "step": 16170 + }, + { + "epoch": 0.16454060872395834, + "grad_norm": 11.652718544006348, + "learning_rate": 4.6766585169297215e-06, + "loss": 3.302, + "step": 16175 + }, + { + "epoch": 0.16459147135416666, + "grad_norm": 12.849843978881836, + "learning_rate": 4.6764617959329226e-06, + "loss": 3.7761, + "step": 16180 + }, + { + "epoch": 0.164642333984375, + "grad_norm": 12.61086368560791, + "learning_rate": 4.676265019251865e-06, + "loss": 3.3563, + "step": 16185 + }, + { + "epoch": 0.16469319661458334, + "grad_norm": 10.435957908630371, + "learning_rate": 4.676068186891582e-06, + "loss": 3.4891, + "step": 16190 + }, + { + "epoch": 0.16474405924479166, + "grad_norm": 13.85044002532959, + "learning_rate": 4.675871298857111e-06, + "loss": 3.0696, + "step": 16195 + }, + { + "epoch": 0.164794921875, + "grad_norm": 9.022653579711914, + "learning_rate": 4.675674355153488e-06, + "loss": 3.4813, + "step": 16200 + }, + { + "epoch": 0.16484578450520834, + "grad_norm": 11.870932579040527, + "learning_rate": 4.675477355785752e-06, + "loss": 3.7666, + "step": 16205 + }, + { + "epoch": 0.16489664713541666, + "grad_norm": 12.24802303314209, + "learning_rate": 4.675280300758944e-06, + "loss": 3.3968, + "step": 16210 + }, + { + "epoch": 0.164947509765625, + "grad_norm": 12.894149780273438, + "learning_rate": 4.6750831900781055e-06, + "loss": 3.4708, + "step": 16215 + }, + { + "epoch": 0.16499837239583334, + "grad_norm": 9.178655624389648, + "learning_rate": 4.674886023748279e-06, + "loss": 3.3961, + "step": 16220 + }, + { + "epoch": 0.16504923502604166, + "grad_norm": 15.221061706542969, + "learning_rate": 4.674688801774508e-06, + "loss": 3.5445, + "step": 16225 + }, + { + "epoch": 0.16510009765625, + "grad_norm": 12.730554580688477, + "learning_rate": 4.67449152416184e-06, + "loss": 3.3576, + "step": 16230 + }, + { + "epoch": 0.16515096028645834, + "grad_norm": 10.138298988342285, + "learning_rate": 4.674294190915321e-06, + "loss": 3.3352, + "step": 16235 + }, + { + "epoch": 0.16520182291666666, + "grad_norm": 9.869894981384277, + "learning_rate": 4.674096802040003e-06, + "loss": 3.3375, + "step": 16240 + }, + { + "epoch": 0.165252685546875, + "grad_norm": 9.15701961517334, + "learning_rate": 4.673899357540932e-06, + "loss": 3.2079, + "step": 16245 + }, + { + "epoch": 0.16530354817708334, + "grad_norm": 13.204168319702148, + "learning_rate": 4.673701857423161e-06, + "loss": 3.2676, + "step": 16250 + }, + { + "epoch": 0.16535441080729166, + "grad_norm": 15.898265838623047, + "learning_rate": 4.6735043016917435e-06, + "loss": 3.3132, + "step": 16255 + }, + { + "epoch": 0.1654052734375, + "grad_norm": 10.086653709411621, + "learning_rate": 4.673306690351733e-06, + "loss": 3.2754, + "step": 16260 + }, + { + "epoch": 0.16545613606770834, + "grad_norm": 15.631150245666504, + "learning_rate": 4.6731090234081865e-06, + "loss": 3.6231, + "step": 16265 + }, + { + "epoch": 0.16550699869791666, + "grad_norm": 10.62215518951416, + "learning_rate": 4.672911300866161e-06, + "loss": 3.2062, + "step": 16270 + }, + { + "epoch": 0.165557861328125, + "grad_norm": 8.654640197753906, + "learning_rate": 4.672713522730715e-06, + "loss": 3.2674, + "step": 16275 + }, + { + "epoch": 0.16560872395833334, + "grad_norm": 11.249765396118164, + "learning_rate": 4.672515689006908e-06, + "loss": 3.0822, + "step": 16280 + }, + { + "epoch": 0.16565958658854166, + "grad_norm": 14.134060859680176, + "learning_rate": 4.6723177996998025e-06, + "loss": 3.58, + "step": 16285 + }, + { + "epoch": 0.16571044921875, + "grad_norm": 16.133581161499023, + "learning_rate": 4.672119854814461e-06, + "loss": 3.6814, + "step": 16290 + }, + { + "epoch": 0.16576131184895834, + "grad_norm": 13.389992713928223, + "learning_rate": 4.671921854355947e-06, + "loss": 3.4922, + "step": 16295 + }, + { + "epoch": 0.16581217447916666, + "grad_norm": 13.663773536682129, + "learning_rate": 4.671723798329328e-06, + "loss": 3.1055, + "step": 16300 + }, + { + "epoch": 0.165863037109375, + "grad_norm": 13.669440269470215, + "learning_rate": 4.671525686739669e-06, + "loss": 3.1514, + "step": 16305 + }, + { + "epoch": 0.16591389973958334, + "grad_norm": 12.217931747436523, + "learning_rate": 4.671327519592042e-06, + "loss": 3.6507, + "step": 16310 + }, + { + "epoch": 0.16596476236979166, + "grad_norm": 15.413629531860352, + "learning_rate": 4.6711292968915145e-06, + "loss": 3.4689, + "step": 16315 + }, + { + "epoch": 0.166015625, + "grad_norm": 11.545214653015137, + "learning_rate": 4.670931018643158e-06, + "loss": 3.2486, + "step": 16320 + }, + { + "epoch": 0.16606648763020834, + "grad_norm": 12.391217231750488, + "learning_rate": 4.670732684852046e-06, + "loss": 3.3009, + "step": 16325 + }, + { + "epoch": 0.16611735026041666, + "grad_norm": 10.62708854675293, + "learning_rate": 4.670534295523253e-06, + "loss": 3.3732, + "step": 16330 + }, + { + "epoch": 0.166168212890625, + "grad_norm": 8.105783462524414, + "learning_rate": 4.670335850661855e-06, + "loss": 3.3016, + "step": 16335 + }, + { + "epoch": 0.16621907552083334, + "grad_norm": 7.761030197143555, + "learning_rate": 4.670137350272927e-06, + "loss": 3.2436, + "step": 16340 + }, + { + "epoch": 0.16626993815104166, + "grad_norm": 12.776176452636719, + "learning_rate": 4.669938794361552e-06, + "loss": 3.3074, + "step": 16345 + }, + { + "epoch": 0.16632080078125, + "grad_norm": 7.695553302764893, + "learning_rate": 4.669740182932805e-06, + "loss": 3.0618, + "step": 16350 + }, + { + "epoch": 0.16637166341145834, + "grad_norm": 14.618805885314941, + "learning_rate": 4.66954151599177e-06, + "loss": 3.4561, + "step": 16355 + }, + { + "epoch": 0.16642252604166666, + "grad_norm": 14.894763946533203, + "learning_rate": 4.66934279354353e-06, + "loss": 3.492, + "step": 16360 + }, + { + "epoch": 0.166473388671875, + "grad_norm": 9.516297340393066, + "learning_rate": 4.669144015593169e-06, + "loss": 3.3012, + "step": 16365 + }, + { + "epoch": 0.16652425130208334, + "grad_norm": 15.990934371948242, + "learning_rate": 4.668945182145773e-06, + "loss": 3.4032, + "step": 16370 + }, + { + "epoch": 0.16657511393229166, + "grad_norm": 11.829225540161133, + "learning_rate": 4.668746293206428e-06, + "loss": 3.57, + "step": 16375 + }, + { + "epoch": 0.1666259765625, + "grad_norm": 9.242964744567871, + "learning_rate": 4.668547348780222e-06, + "loss": 3.43, + "step": 16380 + }, + { + "epoch": 0.16667683919270834, + "grad_norm": 15.611289024353027, + "learning_rate": 4.668348348872248e-06, + "loss": 3.2432, + "step": 16385 + }, + { + "epoch": 0.16672770182291666, + "grad_norm": 12.005697250366211, + "learning_rate": 4.668149293487595e-06, + "loss": 2.9898, + "step": 16390 + }, + { + "epoch": 0.166778564453125, + "grad_norm": 14.01111888885498, + "learning_rate": 4.6679501826313554e-06, + "loss": 3.5077, + "step": 16395 + }, + { + "epoch": 0.16682942708333334, + "grad_norm": 17.95404052734375, + "learning_rate": 4.667751016308624e-06, + "loss": 3.3168, + "step": 16400 + }, + { + "epoch": 0.16688028971354166, + "grad_norm": 14.67130184173584, + "learning_rate": 4.6675517945244975e-06, + "loss": 3.0814, + "step": 16405 + }, + { + "epoch": 0.16693115234375, + "grad_norm": 9.70960521697998, + "learning_rate": 4.667352517284072e-06, + "loss": 3.3854, + "step": 16410 + }, + { + "epoch": 0.16698201497395834, + "grad_norm": 10.68616008758545, + "learning_rate": 4.667153184592446e-06, + "loss": 3.375, + "step": 16415 + }, + { + "epoch": 0.16703287760416666, + "grad_norm": 12.836880683898926, + "learning_rate": 4.6669537964547195e-06, + "loss": 3.2576, + "step": 16420 + }, + { + "epoch": 0.167083740234375, + "grad_norm": 10.322628021240234, + "learning_rate": 4.666754352875994e-06, + "loss": 3.5553, + "step": 16425 + }, + { + "epoch": 0.16713460286458334, + "grad_norm": 13.36998176574707, + "learning_rate": 4.6665548538613715e-06, + "loss": 3.5801, + "step": 16430 + }, + { + "epoch": 0.16718546549479166, + "grad_norm": 11.6786527633667, + "learning_rate": 4.666355299415956e-06, + "loss": 3.1604, + "step": 16435 + }, + { + "epoch": 0.167236328125, + "grad_norm": 15.699369430541992, + "learning_rate": 4.666155689544855e-06, + "loss": 3.2281, + "step": 16440 + }, + { + "epoch": 0.16728719075520834, + "grad_norm": 23.027801513671875, + "learning_rate": 4.6659560242531735e-06, + "loss": 3.0434, + "step": 16445 + }, + { + "epoch": 0.16733805338541666, + "grad_norm": 10.911322593688965, + "learning_rate": 4.665756303546021e-06, + "loss": 3.5173, + "step": 16450 + }, + { + "epoch": 0.167388916015625, + "grad_norm": 9.32667064666748, + "learning_rate": 4.665556527428506e-06, + "loss": 3.5175, + "step": 16455 + }, + { + "epoch": 0.16743977864583334, + "grad_norm": 13.066845893859863, + "learning_rate": 4.66535669590574e-06, + "loss": 3.7572, + "step": 16460 + }, + { + "epoch": 0.16749064127604166, + "grad_norm": 14.599390029907227, + "learning_rate": 4.6651568089828384e-06, + "loss": 3.1729, + "step": 16465 + }, + { + "epoch": 0.16754150390625, + "grad_norm": 10.783397674560547, + "learning_rate": 4.664956866664912e-06, + "loss": 3.1904, + "step": 16470 + }, + { + "epoch": 0.16759236653645834, + "grad_norm": 15.606278419494629, + "learning_rate": 4.664756868957076e-06, + "loss": 3.3966, + "step": 16475 + }, + { + "epoch": 0.16764322916666666, + "grad_norm": 11.634224891662598, + "learning_rate": 4.6645568158644496e-06, + "loss": 3.4924, + "step": 16480 + }, + { + "epoch": 0.167694091796875, + "grad_norm": 8.687528610229492, + "learning_rate": 4.66435670739215e-06, + "loss": 3.4183, + "step": 16485 + }, + { + "epoch": 0.16774495442708334, + "grad_norm": 13.002685546875, + "learning_rate": 4.6641565435452975e-06, + "loss": 3.3721, + "step": 16490 + }, + { + "epoch": 0.16779581705729166, + "grad_norm": 13.268308639526367, + "learning_rate": 4.663956324329012e-06, + "loss": 3.7302, + "step": 16495 + }, + { + "epoch": 0.1678466796875, + "grad_norm": 9.55381965637207, + "learning_rate": 4.663756049748418e-06, + "loss": 3.2204, + "step": 16500 + }, + { + "epoch": 0.16789754231770834, + "grad_norm": 13.7435302734375, + "learning_rate": 4.6635557198086375e-06, + "loss": 3.6933, + "step": 16505 + }, + { + "epoch": 0.16794840494791666, + "grad_norm": 13.537063598632812, + "learning_rate": 4.663355334514796e-06, + "loss": 3.0507, + "step": 16510 + }, + { + "epoch": 0.167999267578125, + "grad_norm": 13.002664566040039, + "learning_rate": 4.663154893872023e-06, + "loss": 3.1599, + "step": 16515 + }, + { + "epoch": 0.16805013020833334, + "grad_norm": 8.084352493286133, + "learning_rate": 4.662954397885443e-06, + "loss": 3.5544, + "step": 16520 + }, + { + "epoch": 0.16810099283854166, + "grad_norm": 13.704971313476562, + "learning_rate": 4.662753846560189e-06, + "loss": 3.5446, + "step": 16525 + }, + { + "epoch": 0.16815185546875, + "grad_norm": 13.047155380249023, + "learning_rate": 4.662553239901389e-06, + "loss": 3.6407, + "step": 16530 + }, + { + "epoch": 0.16820271809895834, + "grad_norm": 10.460413932800293, + "learning_rate": 4.662352577914178e-06, + "loss": 3.5399, + "step": 16535 + }, + { + "epoch": 0.16825358072916666, + "grad_norm": 12.304898262023926, + "learning_rate": 4.6621518606036875e-06, + "loss": 3.3801, + "step": 16540 + }, + { + "epoch": 0.168304443359375, + "grad_norm": 10.110926628112793, + "learning_rate": 4.661951087975055e-06, + "loss": 3.2155, + "step": 16545 + }, + { + "epoch": 0.16835530598958334, + "grad_norm": 7.640111446380615, + "learning_rate": 4.661750260033417e-06, + "loss": 3.1164, + "step": 16550 + }, + { + "epoch": 0.16840616861979166, + "grad_norm": 8.465399742126465, + "learning_rate": 4.66154937678391e-06, + "loss": 3.7351, + "step": 16555 + }, + { + "epoch": 0.16845703125, + "grad_norm": 11.797218322753906, + "learning_rate": 4.661348438231675e-06, + "loss": 3.6523, + "step": 16560 + }, + { + "epoch": 0.16850789388020834, + "grad_norm": 9.346522331237793, + "learning_rate": 4.6611474443818525e-06, + "loss": 3.34, + "step": 16565 + }, + { + "epoch": 0.16855875651041666, + "grad_norm": 14.899089813232422, + "learning_rate": 4.660946395239584e-06, + "loss": 3.5477, + "step": 16570 + }, + { + "epoch": 0.168609619140625, + "grad_norm": 11.063288688659668, + "learning_rate": 4.660745290810015e-06, + "loss": 3.333, + "step": 16575 + }, + { + "epoch": 0.16866048177083334, + "grad_norm": 9.561201095581055, + "learning_rate": 4.66054413109829e-06, + "loss": 3.8984, + "step": 16580 + }, + { + "epoch": 0.16871134440104166, + "grad_norm": 13.455229759216309, + "learning_rate": 4.6603429161095556e-06, + "loss": 3.2126, + "step": 16585 + }, + { + "epoch": 0.16876220703125, + "grad_norm": 10.709653854370117, + "learning_rate": 4.660141645848959e-06, + "loss": 3.2361, + "step": 16590 + }, + { + "epoch": 0.16881306966145834, + "grad_norm": 12.909401893615723, + "learning_rate": 4.659940320321651e-06, + "loss": 3.3619, + "step": 16595 + }, + { + "epoch": 0.16886393229166666, + "grad_norm": 15.584907531738281, + "learning_rate": 4.6597389395327816e-06, + "loss": 3.1329, + "step": 16600 + }, + { + "epoch": 0.168914794921875, + "grad_norm": 14.770988464355469, + "learning_rate": 4.659537503487503e-06, + "loss": 3.9151, + "step": 16605 + }, + { + "epoch": 0.16896565755208334, + "grad_norm": 7.884305953979492, + "learning_rate": 4.6593360121909706e-06, + "loss": 3.2376, + "step": 16610 + }, + { + "epoch": 0.16901652018229166, + "grad_norm": 14.378878593444824, + "learning_rate": 4.659134465648338e-06, + "loss": 3.4527, + "step": 16615 + }, + { + "epoch": 0.1690673828125, + "grad_norm": 9.519623756408691, + "learning_rate": 4.65893286386476e-06, + "loss": 3.445, + "step": 16620 + }, + { + "epoch": 0.16911824544270834, + "grad_norm": 9.221073150634766, + "learning_rate": 4.658731206845398e-06, + "loss": 3.3477, + "step": 16625 + }, + { + "epoch": 0.16916910807291666, + "grad_norm": 10.170306205749512, + "learning_rate": 4.658529494595408e-06, + "loss": 3.3417, + "step": 16630 + }, + { + "epoch": 0.169219970703125, + "grad_norm": 15.229034423828125, + "learning_rate": 4.6583277271199545e-06, + "loss": 3.2686, + "step": 16635 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 12.785039901733398, + "learning_rate": 4.658125904424197e-06, + "loss": 3.4329, + "step": 16640 + }, + { + "epoch": 0.16932169596354166, + "grad_norm": 9.296720504760742, + "learning_rate": 4.6579240265133e-06, + "loss": 3.3711, + "step": 16645 + }, + { + "epoch": 0.16937255859375, + "grad_norm": 10.980588912963867, + "learning_rate": 4.657722093392428e-06, + "loss": 3.3667, + "step": 16650 + }, + { + "epoch": 0.16942342122395834, + "grad_norm": 11.73270034790039, + "learning_rate": 4.657520105066747e-06, + "loss": 3.4216, + "step": 16655 + }, + { + "epoch": 0.16947428385416666, + "grad_norm": 18.23244857788086, + "learning_rate": 4.6573180615414265e-06, + "loss": 3.6996, + "step": 16660 + }, + { + "epoch": 0.169525146484375, + "grad_norm": 13.225893020629883, + "learning_rate": 4.657115962821635e-06, + "loss": 3.5311, + "step": 16665 + }, + { + "epoch": 0.16957600911458334, + "grad_norm": 12.947046279907227, + "learning_rate": 4.656913808912542e-06, + "loss": 3.2338, + "step": 16670 + }, + { + "epoch": 0.16962687174479166, + "grad_norm": 16.09568977355957, + "learning_rate": 4.65671159981932e-06, + "loss": 3.3158, + "step": 16675 + }, + { + "epoch": 0.169677734375, + "grad_norm": 9.442344665527344, + "learning_rate": 4.656509335547144e-06, + "loss": 3.5821, + "step": 16680 + }, + { + "epoch": 0.16972859700520834, + "grad_norm": 8.947610855102539, + "learning_rate": 4.656307016101187e-06, + "loss": 3.2935, + "step": 16685 + }, + { + "epoch": 0.16977945963541666, + "grad_norm": 15.548479080200195, + "learning_rate": 4.656104641486628e-06, + "loss": 3.6593, + "step": 16690 + }, + { + "epoch": 0.169830322265625, + "grad_norm": 9.9256591796875, + "learning_rate": 4.655902211708641e-06, + "loss": 3.4071, + "step": 16695 + }, + { + "epoch": 0.16988118489583334, + "grad_norm": 11.299330711364746, + "learning_rate": 4.655699726772407e-06, + "loss": 3.4039, + "step": 16700 + }, + { + "epoch": 0.16993204752604166, + "grad_norm": 10.457588195800781, + "learning_rate": 4.655497186683107e-06, + "loss": 3.4151, + "step": 16705 + }, + { + "epoch": 0.16998291015625, + "grad_norm": 9.101759910583496, + "learning_rate": 4.655294591445921e-06, + "loss": 3.0531, + "step": 16710 + }, + { + "epoch": 0.17003377278645834, + "grad_norm": 9.755556106567383, + "learning_rate": 4.6550919410660355e-06, + "loss": 3.7921, + "step": 16715 + }, + { + "epoch": 0.17008463541666666, + "grad_norm": 6.866489410400391, + "learning_rate": 4.654889235548633e-06, + "loss": 3.3934, + "step": 16720 + }, + { + "epoch": 0.170135498046875, + "grad_norm": 11.931754112243652, + "learning_rate": 4.6546864748989e-06, + "loss": 3.2464, + "step": 16725 + }, + { + "epoch": 0.17018636067708334, + "grad_norm": 12.620525360107422, + "learning_rate": 4.654483659122025e-06, + "loss": 3.1906, + "step": 16730 + }, + { + "epoch": 0.17023722330729166, + "grad_norm": 9.603962898254395, + "learning_rate": 4.654280788223195e-06, + "loss": 3.1315, + "step": 16735 + }, + { + "epoch": 0.1702880859375, + "grad_norm": 10.157326698303223, + "learning_rate": 4.654077862207601e-06, + "loss": 3.4375, + "step": 16740 + }, + { + "epoch": 0.17033894856770834, + "grad_norm": 6.509634971618652, + "learning_rate": 4.653874881080437e-06, + "loss": 3.2071, + "step": 16745 + }, + { + "epoch": 0.17038981119791666, + "grad_norm": 11.346307754516602, + "learning_rate": 4.653671844846895e-06, + "loss": 3.4308, + "step": 16750 + }, + { + "epoch": 0.170440673828125, + "grad_norm": 10.316970825195312, + "learning_rate": 4.653468753512168e-06, + "loss": 3.5133, + "step": 16755 + }, + { + "epoch": 0.17049153645833334, + "grad_norm": 12.79603099822998, + "learning_rate": 4.653265607081454e-06, + "loss": 3.2712, + "step": 16760 + }, + { + "epoch": 0.17054239908854166, + "grad_norm": 12.55677604675293, + "learning_rate": 4.653062405559951e-06, + "loss": 3.2636, + "step": 16765 + }, + { + "epoch": 0.17059326171875, + "grad_norm": 15.563427925109863, + "learning_rate": 4.652859148952855e-06, + "loss": 3.2278, + "step": 16770 + }, + { + "epoch": 0.17064412434895834, + "grad_norm": 13.07026481628418, + "learning_rate": 4.652655837265369e-06, + "loss": 3.3884, + "step": 16775 + }, + { + "epoch": 0.17069498697916666, + "grad_norm": 13.581730842590332, + "learning_rate": 4.6524524705026925e-06, + "loss": 3.6175, + "step": 16780 + }, + { + "epoch": 0.170745849609375, + "grad_norm": 8.78980827331543, + "learning_rate": 4.65224904867003e-06, + "loss": 3.4371, + "step": 16785 + }, + { + "epoch": 0.17079671223958334, + "grad_norm": 11.52972412109375, + "learning_rate": 4.652045571772586e-06, + "loss": 3.2436, + "step": 16790 + }, + { + "epoch": 0.17084757486979166, + "grad_norm": 17.321304321289062, + "learning_rate": 4.651842039815566e-06, + "loss": 3.393, + "step": 16795 + }, + { + "epoch": 0.1708984375, + "grad_norm": 12.183976173400879, + "learning_rate": 4.651638452804178e-06, + "loss": 3.3443, + "step": 16800 + }, + { + "epoch": 0.17094930013020834, + "grad_norm": 11.435518264770508, + "learning_rate": 4.6514348107436305e-06, + "loss": 3.4001, + "step": 16805 + }, + { + "epoch": 0.17100016276041666, + "grad_norm": 15.479721069335938, + "learning_rate": 4.651231113639132e-06, + "loss": 3.0751, + "step": 16810 + }, + { + "epoch": 0.171051025390625, + "grad_norm": 12.146183013916016, + "learning_rate": 4.651027361495896e-06, + "loss": 3.238, + "step": 16815 + }, + { + "epoch": 0.17110188802083334, + "grad_norm": 12.965581893920898, + "learning_rate": 4.650823554319135e-06, + "loss": 3.8735, + "step": 16820 + }, + { + "epoch": 0.17115275065104166, + "grad_norm": 9.760518074035645, + "learning_rate": 4.650619692114063e-06, + "loss": 3.2039, + "step": 16825 + }, + { + "epoch": 0.17120361328125, + "grad_norm": 12.903510093688965, + "learning_rate": 4.650415774885896e-06, + "loss": 3.5834, + "step": 16830 + }, + { + "epoch": 0.17125447591145834, + "grad_norm": 13.00151538848877, + "learning_rate": 4.650211802639851e-06, + "loss": 3.243, + "step": 16835 + }, + { + "epoch": 0.17130533854166666, + "grad_norm": 15.11430549621582, + "learning_rate": 4.6500077753811465e-06, + "loss": 3.4178, + "step": 16840 + }, + { + "epoch": 0.171356201171875, + "grad_norm": 10.01813793182373, + "learning_rate": 4.649803693115003e-06, + "loss": 3.2601, + "step": 16845 + }, + { + "epoch": 0.17140706380208334, + "grad_norm": 11.622364044189453, + "learning_rate": 4.649599555846641e-06, + "loss": 3.6725, + "step": 16850 + }, + { + "epoch": 0.17145792643229166, + "grad_norm": 15.42918872833252, + "learning_rate": 4.649395363581285e-06, + "loss": 3.3048, + "step": 16855 + }, + { + "epoch": 0.1715087890625, + "grad_norm": 11.032341957092285, + "learning_rate": 4.649191116324158e-06, + "loss": 3.4853, + "step": 16860 + }, + { + "epoch": 0.17155965169270834, + "grad_norm": 12.54845142364502, + "learning_rate": 4.648986814080485e-06, + "loss": 3.2918, + "step": 16865 + }, + { + "epoch": 0.17161051432291666, + "grad_norm": 8.605419158935547, + "learning_rate": 4.648782456855493e-06, + "loss": 3.4214, + "step": 16870 + }, + { + "epoch": 0.171661376953125, + "grad_norm": 9.82323169708252, + "learning_rate": 4.648578044654412e-06, + "loss": 3.3439, + "step": 16875 + }, + { + "epoch": 0.17171223958333334, + "grad_norm": 15.435731887817383, + "learning_rate": 4.648373577482471e-06, + "loss": 3.2196, + "step": 16880 + }, + { + "epoch": 0.17176310221354166, + "grad_norm": 9.784239768981934, + "learning_rate": 4.6481690553449015e-06, + "loss": 4.0164, + "step": 16885 + }, + { + "epoch": 0.17181396484375, + "grad_norm": 11.91480541229248, + "learning_rate": 4.647964478246936e-06, + "loss": 3.2925, + "step": 16890 + }, + { + "epoch": 0.17186482747395834, + "grad_norm": 7.702728271484375, + "learning_rate": 4.647759846193808e-06, + "loss": 3.2802, + "step": 16895 + }, + { + "epoch": 0.17191569010416666, + "grad_norm": 7.844374656677246, + "learning_rate": 4.647555159190753e-06, + "loss": 3.3919, + "step": 16900 + }, + { + "epoch": 0.171966552734375, + "grad_norm": 9.673126220703125, + "learning_rate": 4.647350417243009e-06, + "loss": 3.7028, + "step": 16905 + }, + { + "epoch": 0.17201741536458334, + "grad_norm": 13.436027526855469, + "learning_rate": 4.647145620355813e-06, + "loss": 3.0947, + "step": 16910 + }, + { + "epoch": 0.17206827799479166, + "grad_norm": 9.890853881835938, + "learning_rate": 4.646940768534406e-06, + "loss": 3.3502, + "step": 16915 + }, + { + "epoch": 0.172119140625, + "grad_norm": 12.191718101501465, + "learning_rate": 4.6467358617840275e-06, + "loss": 3.1166, + "step": 16920 + }, + { + "epoch": 0.17217000325520834, + "grad_norm": 10.807548522949219, + "learning_rate": 4.646530900109921e-06, + "loss": 3.3743, + "step": 16925 + }, + { + "epoch": 0.17222086588541666, + "grad_norm": 15.573385238647461, + "learning_rate": 4.646325883517331e-06, + "loss": 3.4527, + "step": 16930 + }, + { + "epoch": 0.172271728515625, + "grad_norm": 12.284563064575195, + "learning_rate": 4.646120812011501e-06, + "loss": 3.1113, + "step": 16935 + }, + { + "epoch": 0.17232259114583334, + "grad_norm": 15.29260540008545, + "learning_rate": 4.645915685597679e-06, + "loss": 3.4474, + "step": 16940 + }, + { + "epoch": 0.17237345377604166, + "grad_norm": 12.571720123291016, + "learning_rate": 4.645710504281113e-06, + "loss": 3.1863, + "step": 16945 + }, + { + "epoch": 0.17242431640625, + "grad_norm": 14.381819725036621, + "learning_rate": 4.645505268067052e-06, + "loss": 3.3417, + "step": 16950 + }, + { + "epoch": 0.17247517903645834, + "grad_norm": 9.561775207519531, + "learning_rate": 4.645299976960747e-06, + "loss": 3.1955, + "step": 16955 + }, + { + "epoch": 0.17252604166666666, + "grad_norm": 12.148945808410645, + "learning_rate": 4.645094630967451e-06, + "loss": 3.7219, + "step": 16960 + }, + { + "epoch": 0.172576904296875, + "grad_norm": 10.793240547180176, + "learning_rate": 4.644889230092418e-06, + "loss": 3.5109, + "step": 16965 + }, + { + "epoch": 0.17262776692708334, + "grad_norm": 13.821535110473633, + "learning_rate": 4.644683774340902e-06, + "loss": 3.3895, + "step": 16970 + }, + { + "epoch": 0.17267862955729166, + "grad_norm": 10.016497611999512, + "learning_rate": 4.64447826371816e-06, + "loss": 3.317, + "step": 16975 + }, + { + "epoch": 0.1727294921875, + "grad_norm": 12.0796480178833, + "learning_rate": 4.64427269822945e-06, + "loss": 3.1043, + "step": 16980 + }, + { + "epoch": 0.17278035481770834, + "grad_norm": 11.921280860900879, + "learning_rate": 4.644067077880031e-06, + "loss": 3.4567, + "step": 16985 + }, + { + "epoch": 0.17283121744791666, + "grad_norm": 10.839923858642578, + "learning_rate": 4.643861402675164e-06, + "loss": 3.4736, + "step": 16990 + }, + { + "epoch": 0.172882080078125, + "grad_norm": 13.740301132202148, + "learning_rate": 4.643655672620111e-06, + "loss": 3.6096, + "step": 16995 + }, + { + "epoch": 0.17293294270833334, + "grad_norm": 14.093925476074219, + "learning_rate": 4.643449887720136e-06, + "loss": 3.2615, + "step": 17000 + }, + { + "epoch": 0.17298380533854166, + "grad_norm": 10.022769927978516, + "learning_rate": 4.643244047980503e-06, + "loss": 3.0516, + "step": 17005 + }, + { + "epoch": 0.17303466796875, + "grad_norm": 9.807087898254395, + "learning_rate": 4.64303815340648e-06, + "loss": 3.6837, + "step": 17010 + }, + { + "epoch": 0.17308553059895834, + "grad_norm": 10.774398803710938, + "learning_rate": 4.642832204003333e-06, + "loss": 3.046, + "step": 17015 + }, + { + "epoch": 0.17313639322916666, + "grad_norm": 8.116972923278809, + "learning_rate": 4.642626199776333e-06, + "loss": 3.071, + "step": 17020 + }, + { + "epoch": 0.173187255859375, + "grad_norm": 13.305663108825684, + "learning_rate": 4.642420140730749e-06, + "loss": 3.8665, + "step": 17025 + }, + { + "epoch": 0.17323811848958334, + "grad_norm": 16.738006591796875, + "learning_rate": 4.642214026871853e-06, + "loss": 3.6563, + "step": 17030 + }, + { + "epoch": 0.17328898111979166, + "grad_norm": 10.710490226745605, + "learning_rate": 4.642007858204919e-06, + "loss": 3.3579, + "step": 17035 + }, + { + "epoch": 0.17333984375, + "grad_norm": 17.46071434020996, + "learning_rate": 4.641801634735222e-06, + "loss": 3.2495, + "step": 17040 + }, + { + "epoch": 0.17339070638020834, + "grad_norm": 8.741815567016602, + "learning_rate": 4.6415953564680385e-06, + "loss": 3.0387, + "step": 17045 + }, + { + "epoch": 0.17344156901041666, + "grad_norm": 14.374468803405762, + "learning_rate": 4.641389023408644e-06, + "loss": 3.3034, + "step": 17050 + }, + { + "epoch": 0.173492431640625, + "grad_norm": 10.53064250946045, + "learning_rate": 4.64118263556232e-06, + "loss": 3.3958, + "step": 17055 + }, + { + "epoch": 0.17354329427083334, + "grad_norm": 16.45563316345215, + "learning_rate": 4.640976192934345e-06, + "loss": 3.6537, + "step": 17060 + }, + { + "epoch": 0.17359415690104166, + "grad_norm": 13.669546127319336, + "learning_rate": 4.6407696955300025e-06, + "loss": 3.136, + "step": 17065 + }, + { + "epoch": 0.17364501953125, + "grad_norm": 15.326542854309082, + "learning_rate": 4.640563143354574e-06, + "loss": 3.4866, + "step": 17070 + }, + { + "epoch": 0.17369588216145834, + "grad_norm": 7.852635860443115, + "learning_rate": 4.640356536413345e-06, + "loss": 3.4082, + "step": 17075 + }, + { + "epoch": 0.17374674479166666, + "grad_norm": 15.086888313293457, + "learning_rate": 4.640149874711601e-06, + "loss": 3.3045, + "step": 17080 + }, + { + "epoch": 0.173797607421875, + "grad_norm": 13.305411338806152, + "learning_rate": 4.63994315825463e-06, + "loss": 2.9298, + "step": 17085 + }, + { + "epoch": 0.17384847005208334, + "grad_norm": 6.963002681732178, + "learning_rate": 4.639736387047722e-06, + "loss": 3.5608, + "step": 17090 + }, + { + "epoch": 0.17389933268229166, + "grad_norm": 13.062210083007812, + "learning_rate": 4.639529561096164e-06, + "loss": 3.1996, + "step": 17095 + }, + { + "epoch": 0.1739501953125, + "grad_norm": 9.515623092651367, + "learning_rate": 4.639322680405249e-06, + "loss": 3.3422, + "step": 17100 + }, + { + "epoch": 0.17400105794270834, + "grad_norm": 11.610623359680176, + "learning_rate": 4.639115744980272e-06, + "loss": 3.3988, + "step": 17105 + }, + { + "epoch": 0.17405192057291666, + "grad_norm": 11.479599952697754, + "learning_rate": 4.6389087548265245e-06, + "loss": 3.2056, + "step": 17110 + }, + { + "epoch": 0.174102783203125, + "grad_norm": 11.929533004760742, + "learning_rate": 4.638701709949303e-06, + "loss": 3.0456, + "step": 17115 + }, + { + "epoch": 0.17415364583333334, + "grad_norm": 12.549202919006348, + "learning_rate": 4.638494610353907e-06, + "loss": 3.3503, + "step": 17120 + }, + { + "epoch": 0.17420450846354166, + "grad_norm": 14.711870193481445, + "learning_rate": 4.638287456045632e-06, + "loss": 3.4214, + "step": 17125 + }, + { + "epoch": 0.17425537109375, + "grad_norm": 14.64809513092041, + "learning_rate": 4.638080247029779e-06, + "loss": 3.4133, + "step": 17130 + }, + { + "epoch": 0.17430623372395834, + "grad_norm": 16.768775939941406, + "learning_rate": 4.63787298331165e-06, + "loss": 3.3163, + "step": 17135 + }, + { + "epoch": 0.17435709635416666, + "grad_norm": 13.071782112121582, + "learning_rate": 4.637665664896547e-06, + "loss": 3.8572, + "step": 17140 + }, + { + "epoch": 0.174407958984375, + "grad_norm": 11.809013366699219, + "learning_rate": 4.637458291789776e-06, + "loss": 3.4033, + "step": 17145 + }, + { + "epoch": 0.17445882161458334, + "grad_norm": 11.942599296569824, + "learning_rate": 4.63725086399664e-06, + "loss": 3.3937, + "step": 17150 + }, + { + "epoch": 0.17450968424479166, + "grad_norm": 11.510283470153809, + "learning_rate": 4.637043381522447e-06, + "loss": 3.2646, + "step": 17155 + }, + { + "epoch": 0.174560546875, + "grad_norm": 15.842909812927246, + "learning_rate": 4.636835844372507e-06, + "loss": 3.2227, + "step": 17160 + }, + { + "epoch": 0.17461140950520834, + "grad_norm": 9.673988342285156, + "learning_rate": 4.636628252552128e-06, + "loss": 3.5092, + "step": 17165 + }, + { + "epoch": 0.17466227213541666, + "grad_norm": 12.802469253540039, + "learning_rate": 4.636420606066621e-06, + "loss": 3.3495, + "step": 17170 + }, + { + "epoch": 0.174713134765625, + "grad_norm": 9.603645324707031, + "learning_rate": 4.636212904921299e-06, + "loss": 3.404, + "step": 17175 + }, + { + "epoch": 0.17476399739583334, + "grad_norm": 11.273427963256836, + "learning_rate": 4.6360051491214765e-06, + "loss": 3.2583, + "step": 17180 + }, + { + "epoch": 0.17481486002604166, + "grad_norm": 9.479177474975586, + "learning_rate": 4.635797338672469e-06, + "loss": 3.086, + "step": 17185 + }, + { + "epoch": 0.17486572265625, + "grad_norm": 15.164192199707031, + "learning_rate": 4.635589473579592e-06, + "loss": 3.7945, + "step": 17190 + }, + { + "epoch": 0.17491658528645834, + "grad_norm": 13.359272003173828, + "learning_rate": 4.635381553848165e-06, + "loss": 3.3775, + "step": 17195 + }, + { + "epoch": 0.17496744791666666, + "grad_norm": 7.546788692474365, + "learning_rate": 4.635173579483507e-06, + "loss": 3.2542, + "step": 17200 + }, + { + "epoch": 0.175018310546875, + "grad_norm": 13.577391624450684, + "learning_rate": 4.634965550490939e-06, + "loss": 3.4331, + "step": 17205 + }, + { + "epoch": 0.17506917317708334, + "grad_norm": 14.945005416870117, + "learning_rate": 4.6347574668757835e-06, + "loss": 3.2556, + "step": 17210 + }, + { + "epoch": 0.17512003580729166, + "grad_norm": 13.663776397705078, + "learning_rate": 4.634549328643364e-06, + "loss": 3.4574, + "step": 17215 + }, + { + "epoch": 0.1751708984375, + "grad_norm": 8.942784309387207, + "learning_rate": 4.634341135799007e-06, + "loss": 3.4193, + "step": 17220 + }, + { + "epoch": 0.17522176106770834, + "grad_norm": 15.326900482177734, + "learning_rate": 4.634132888348037e-06, + "loss": 3.6722, + "step": 17225 + }, + { + "epoch": 0.17527262369791666, + "grad_norm": 13.63649845123291, + "learning_rate": 4.633924586295782e-06, + "loss": 3.4126, + "step": 17230 + }, + { + "epoch": 0.175323486328125, + "grad_norm": 12.05754280090332, + "learning_rate": 4.633716229647573e-06, + "loss": 3.5295, + "step": 17235 + }, + { + "epoch": 0.17537434895833334, + "grad_norm": 14.691134452819824, + "learning_rate": 4.633507818408741e-06, + "loss": 3.2017, + "step": 17240 + }, + { + "epoch": 0.17542521158854166, + "grad_norm": 12.102783203125, + "learning_rate": 4.633299352584616e-06, + "loss": 3.3013, + "step": 17245 + }, + { + "epoch": 0.17547607421875, + "grad_norm": 8.052931785583496, + "learning_rate": 4.6330908321805336e-06, + "loss": 3.4158, + "step": 17250 + }, + { + "epoch": 0.17552693684895834, + "grad_norm": 11.040821075439453, + "learning_rate": 4.632882257201826e-06, + "loss": 3.1246, + "step": 17255 + }, + { + "epoch": 0.17557779947916666, + "grad_norm": 11.392463684082031, + "learning_rate": 4.632673627653833e-06, + "loss": 3.4227, + "step": 17260 + }, + { + "epoch": 0.175628662109375, + "grad_norm": 14.34813404083252, + "learning_rate": 4.6324649435418916e-06, + "loss": 3.2329, + "step": 17265 + }, + { + "epoch": 0.17567952473958334, + "grad_norm": 9.003851890563965, + "learning_rate": 4.632256204871338e-06, + "loss": 3.5315, + "step": 17270 + }, + { + "epoch": 0.17573038736979166, + "grad_norm": 11.924999237060547, + "learning_rate": 4.632047411647516e-06, + "loss": 3.4919, + "step": 17275 + }, + { + "epoch": 0.17578125, + "grad_norm": 11.398469924926758, + "learning_rate": 4.6318385638757665e-06, + "loss": 3.9053, + "step": 17280 + }, + { + "epoch": 0.17583211263020834, + "grad_norm": 10.507608413696289, + "learning_rate": 4.631629661561432e-06, + "loss": 3.8949, + "step": 17285 + }, + { + "epoch": 0.17588297526041666, + "grad_norm": 6.944844722747803, + "learning_rate": 4.6314207047098585e-06, + "loss": 3.6582, + "step": 17290 + }, + { + "epoch": 0.175933837890625, + "grad_norm": 15.985193252563477, + "learning_rate": 4.63121169332639e-06, + "loss": 3.1175, + "step": 17295 + }, + { + "epoch": 0.17598470052083334, + "grad_norm": 14.670384407043457, + "learning_rate": 4.6310026274163765e-06, + "loss": 3.547, + "step": 17300 + }, + { + "epoch": 0.17603556315104166, + "grad_norm": 12.856901168823242, + "learning_rate": 4.630793506985166e-06, + "loss": 3.3646, + "step": 17305 + }, + { + "epoch": 0.17608642578125, + "grad_norm": 14.06156063079834, + "learning_rate": 4.6305843320381085e-06, + "loss": 3.4701, + "step": 17310 + }, + { + "epoch": 0.17613728841145834, + "grad_norm": 18.088773727416992, + "learning_rate": 4.630375102580557e-06, + "loss": 3.5199, + "step": 17315 + }, + { + "epoch": 0.17618815104166666, + "grad_norm": 16.56032943725586, + "learning_rate": 4.630165818617862e-06, + "loss": 3.3376, + "step": 17320 + }, + { + "epoch": 0.176239013671875, + "grad_norm": 13.883744239807129, + "learning_rate": 4.62995648015538e-06, + "loss": 3.3735, + "step": 17325 + }, + { + "epoch": 0.17628987630208334, + "grad_norm": 9.11486530303955, + "learning_rate": 4.629747087198466e-06, + "loss": 3.2933, + "step": 17330 + }, + { + "epoch": 0.17634073893229166, + "grad_norm": 12.755553245544434, + "learning_rate": 4.629537639752477e-06, + "loss": 3.5813, + "step": 17335 + }, + { + "epoch": 0.1763916015625, + "grad_norm": 9.951680183410645, + "learning_rate": 4.629328137822774e-06, + "loss": 3.4514, + "step": 17340 + }, + { + "epoch": 0.17644246419270834, + "grad_norm": 14.455367088317871, + "learning_rate": 4.629118581414713e-06, + "loss": 2.6122, + "step": 17345 + }, + { + "epoch": 0.17649332682291666, + "grad_norm": 8.975110054016113, + "learning_rate": 4.6289089705336595e-06, + "loss": 3.1559, + "step": 17350 + }, + { + "epoch": 0.176544189453125, + "grad_norm": 17.275592803955078, + "learning_rate": 4.628699305184974e-06, + "loss": 3.3324, + "step": 17355 + }, + { + "epoch": 0.17659505208333334, + "grad_norm": 14.314291954040527, + "learning_rate": 4.628489585374022e-06, + "loss": 3.4651, + "step": 17360 + }, + { + "epoch": 0.17664591471354166, + "grad_norm": 15.956419944763184, + "learning_rate": 4.628279811106168e-06, + "loss": 3.6316, + "step": 17365 + }, + { + "epoch": 0.17669677734375, + "grad_norm": 13.444526672363281, + "learning_rate": 4.628069982386779e-06, + "loss": 3.4095, + "step": 17370 + }, + { + "epoch": 0.17674763997395834, + "grad_norm": 14.760951042175293, + "learning_rate": 4.627860099221224e-06, + "loss": 3.2458, + "step": 17375 + }, + { + "epoch": 0.17679850260416666, + "grad_norm": 9.031475067138672, + "learning_rate": 4.627650161614873e-06, + "loss": 3.1318, + "step": 17380 + }, + { + "epoch": 0.176849365234375, + "grad_norm": 9.814698219299316, + "learning_rate": 4.627440169573098e-06, + "loss": 3.8271, + "step": 17385 + }, + { + "epoch": 0.17690022786458334, + "grad_norm": 12.851936340332031, + "learning_rate": 4.627230123101268e-06, + "loss": 3.5598, + "step": 17390 + }, + { + "epoch": 0.17695109049479166, + "grad_norm": 44.61574935913086, + "learning_rate": 4.627020022204761e-06, + "loss": 3.6756, + "step": 17395 + }, + { + "epoch": 0.177001953125, + "grad_norm": 11.95059871673584, + "learning_rate": 4.626809866888951e-06, + "loss": 3.3316, + "step": 17400 + }, + { + "epoch": 0.17705281575520834, + "grad_norm": 16.417972564697266, + "learning_rate": 4.626599657159216e-06, + "loss": 3.3807, + "step": 17405 + }, + { + "epoch": 0.17710367838541666, + "grad_norm": 13.880332946777344, + "learning_rate": 4.6263893930209304e-06, + "loss": 3.3911, + "step": 17410 + }, + { + "epoch": 0.177154541015625, + "grad_norm": 15.084425926208496, + "learning_rate": 4.6261790744794765e-06, + "loss": 3.4722, + "step": 17415 + }, + { + "epoch": 0.17720540364583334, + "grad_norm": 15.803872108459473, + "learning_rate": 4.625968701540236e-06, + "loss": 3.1724, + "step": 17420 + }, + { + "epoch": 0.17725626627604166, + "grad_norm": 7.609067440032959, + "learning_rate": 4.62575827420859e-06, + "loss": 3.2543, + "step": 17425 + }, + { + "epoch": 0.17730712890625, + "grad_norm": 11.47111701965332, + "learning_rate": 4.625547792489922e-06, + "loss": 3.3642, + "step": 17430 + }, + { + "epoch": 0.17735799153645834, + "grad_norm": 9.357483863830566, + "learning_rate": 4.625337256389618e-06, + "loss": 3.5617, + "step": 17435 + }, + { + "epoch": 0.17740885416666666, + "grad_norm": 12.891478538513184, + "learning_rate": 4.625126665913063e-06, + "loss": 3.2682, + "step": 17440 + }, + { + "epoch": 0.177459716796875, + "grad_norm": 14.151500701904297, + "learning_rate": 4.6249160210656476e-06, + "loss": 3.3943, + "step": 17445 + }, + { + "epoch": 0.17751057942708334, + "grad_norm": 11.056961059570312, + "learning_rate": 4.624705321852758e-06, + "loss": 3.2693, + "step": 17450 + }, + { + "epoch": 0.17756144205729166, + "grad_norm": 16.99146270751953, + "learning_rate": 4.624494568279787e-06, + "loss": 3.5405, + "step": 17455 + }, + { + "epoch": 0.1776123046875, + "grad_norm": 9.272522926330566, + "learning_rate": 4.624283760352126e-06, + "loss": 3.0549, + "step": 17460 + }, + { + "epoch": 0.17766316731770834, + "grad_norm": 10.904102325439453, + "learning_rate": 4.624072898075168e-06, + "loss": 3.0915, + "step": 17465 + }, + { + "epoch": 0.17771402994791666, + "grad_norm": 15.731610298156738, + "learning_rate": 4.6238619814543094e-06, + "loss": 3.3617, + "step": 17470 + }, + { + "epoch": 0.177764892578125, + "grad_norm": 15.29787540435791, + "learning_rate": 4.623651010494945e-06, + "loss": 3.4183, + "step": 17475 + }, + { + "epoch": 0.17781575520833334, + "grad_norm": 13.272435188293457, + "learning_rate": 4.623439985202472e-06, + "loss": 3.3338, + "step": 17480 + }, + { + "epoch": 0.17786661783854166, + "grad_norm": 10.852397918701172, + "learning_rate": 4.623228905582292e-06, + "loss": 3.6102, + "step": 17485 + }, + { + "epoch": 0.17791748046875, + "grad_norm": 10.077887535095215, + "learning_rate": 4.623017771639803e-06, + "loss": 3.4454, + "step": 17490 + }, + { + "epoch": 0.17796834309895834, + "grad_norm": 13.571096420288086, + "learning_rate": 4.622806583380407e-06, + "loss": 3.3273, + "step": 17495 + }, + { + "epoch": 0.17801920572916666, + "grad_norm": 9.822858810424805, + "learning_rate": 4.622595340809508e-06, + "loss": 3.4383, + "step": 17500 + }, + { + "epoch": 0.178070068359375, + "grad_norm": 16.22844123840332, + "learning_rate": 4.622384043932509e-06, + "loss": 3.0372, + "step": 17505 + }, + { + "epoch": 0.17812093098958334, + "grad_norm": 7.383034706115723, + "learning_rate": 4.622172692754819e-06, + "loss": 3.1181, + "step": 17510 + }, + { + "epoch": 0.17817179361979166, + "grad_norm": 9.609219551086426, + "learning_rate": 4.621961287281843e-06, + "loss": 3.3651, + "step": 17515 + }, + { + "epoch": 0.17822265625, + "grad_norm": 9.730359077453613, + "learning_rate": 4.621749827518991e-06, + "loss": 3.4125, + "step": 17520 + }, + { + "epoch": 0.17827351888020834, + "grad_norm": 7.237141132354736, + "learning_rate": 4.621538313471673e-06, + "loss": 3.2425, + "step": 17525 + }, + { + "epoch": 0.17832438151041666, + "grad_norm": 10.888049125671387, + "learning_rate": 4.621326745145299e-06, + "loss": 3.5376, + "step": 17530 + }, + { + "epoch": 0.178375244140625, + "grad_norm": 12.970224380493164, + "learning_rate": 4.6211151225452835e-06, + "loss": 3.5308, + "step": 17535 + }, + { + "epoch": 0.17842610677083334, + "grad_norm": 12.810026168823242, + "learning_rate": 4.62090344567704e-06, + "loss": 3.1474, + "step": 17540 + }, + { + "epoch": 0.17847696940104166, + "grad_norm": 8.826539993286133, + "learning_rate": 4.6206917145459855e-06, + "loss": 3.2885, + "step": 17545 + }, + { + "epoch": 0.17852783203125, + "grad_norm": 10.239234924316406, + "learning_rate": 4.620479929157535e-06, + "loss": 3.1573, + "step": 17550 + }, + { + "epoch": 0.17857869466145834, + "grad_norm": 14.187760353088379, + "learning_rate": 4.620268089517108e-06, + "loss": 3.2755, + "step": 17555 + }, + { + "epoch": 0.17862955729166666, + "grad_norm": 11.916472434997559, + "learning_rate": 4.620056195630125e-06, + "loss": 3.2658, + "step": 17560 + }, + { + "epoch": 0.178680419921875, + "grad_norm": 18.4185791015625, + "learning_rate": 4.619844247502007e-06, + "loss": 3.3846, + "step": 17565 + }, + { + "epoch": 0.17873128255208334, + "grad_norm": 10.282975196838379, + "learning_rate": 4.619632245138176e-06, + "loss": 3.497, + "step": 17570 + }, + { + "epoch": 0.17878214518229166, + "grad_norm": 15.2913179397583, + "learning_rate": 4.619420188544057e-06, + "loss": 3.5962, + "step": 17575 + }, + { + "epoch": 0.1788330078125, + "grad_norm": 8.401836395263672, + "learning_rate": 4.619208077725075e-06, + "loss": 3.2448, + "step": 17580 + }, + { + "epoch": 0.17888387044270834, + "grad_norm": 11.083022117614746, + "learning_rate": 4.6189959126866555e-06, + "loss": 3.0683, + "step": 17585 + }, + { + "epoch": 0.17893473307291666, + "grad_norm": 11.012330055236816, + "learning_rate": 4.618783693434229e-06, + "loss": 3.3922, + "step": 17590 + }, + { + "epoch": 0.178985595703125, + "grad_norm": 10.7522611618042, + "learning_rate": 4.618571419973222e-06, + "loss": 3.1457, + "step": 17595 + }, + { + "epoch": 0.17903645833333334, + "grad_norm": 9.602813720703125, + "learning_rate": 4.6183590923090696e-06, + "loss": 3.2299, + "step": 17600 + }, + { + "epoch": 0.17908732096354166, + "grad_norm": 8.291756629943848, + "learning_rate": 4.6181467104472005e-06, + "loss": 3.0146, + "step": 17605 + }, + { + "epoch": 0.17913818359375, + "grad_norm": 13.860746383666992, + "learning_rate": 4.61793427439305e-06, + "loss": 3.1955, + "step": 17610 + }, + { + "epoch": 0.17918904622395834, + "grad_norm": 13.45958423614502, + "learning_rate": 4.6177217841520535e-06, + "loss": 2.8876, + "step": 17615 + }, + { + "epoch": 0.17923990885416666, + "grad_norm": 12.669150352478027, + "learning_rate": 4.617509239729647e-06, + "loss": 3.6825, + "step": 17620 + }, + { + "epoch": 0.179290771484375, + "grad_norm": 11.674629211425781, + "learning_rate": 4.61729664113127e-06, + "loss": 3.4646, + "step": 17625 + }, + { + "epoch": 0.17934163411458334, + "grad_norm": 14.575140953063965, + "learning_rate": 4.617083988362358e-06, + "loss": 2.9819, + "step": 17630 + }, + { + "epoch": 0.17939249674479166, + "grad_norm": 13.144431114196777, + "learning_rate": 4.616871281428355e-06, + "loss": 3.5436, + "step": 17635 + }, + { + "epoch": 0.179443359375, + "grad_norm": 14.929411888122559, + "learning_rate": 4.616658520334701e-06, + "loss": 3.3415, + "step": 17640 + }, + { + "epoch": 0.17949422200520834, + "grad_norm": 13.124006271362305, + "learning_rate": 4.616445705086842e-06, + "loss": 3.3565, + "step": 17645 + }, + { + "epoch": 0.17954508463541666, + "grad_norm": 8.569463729858398, + "learning_rate": 4.616232835690221e-06, + "loss": 4.0458, + "step": 17650 + }, + { + "epoch": 0.179595947265625, + "grad_norm": 11.597271919250488, + "learning_rate": 4.616019912150284e-06, + "loss": 3.5618, + "step": 17655 + }, + { + "epoch": 0.17964680989583334, + "grad_norm": 12.50623607635498, + "learning_rate": 4.615806934472479e-06, + "loss": 3.4437, + "step": 17660 + }, + { + "epoch": 0.17969767252604166, + "grad_norm": 13.184673309326172, + "learning_rate": 4.615593902662256e-06, + "loss": 3.4312, + "step": 17665 + }, + { + "epoch": 0.17974853515625, + "grad_norm": 10.000298500061035, + "learning_rate": 4.615380816725063e-06, + "loss": 3.693, + "step": 17670 + }, + { + "epoch": 0.17979939778645834, + "grad_norm": 15.589776039123535, + "learning_rate": 4.6151676766663536e-06, + "loss": 3.4443, + "step": 17675 + }, + { + "epoch": 0.17985026041666666, + "grad_norm": 12.44705867767334, + "learning_rate": 4.614954482491581e-06, + "loss": 3.2575, + "step": 17680 + }, + { + "epoch": 0.179901123046875, + "grad_norm": 16.3400821685791, + "learning_rate": 4.6147412342061995e-06, + "loss": 3.1107, + "step": 17685 + }, + { + "epoch": 0.17995198567708334, + "grad_norm": 9.2102689743042, + "learning_rate": 4.614527931815664e-06, + "loss": 3.2749, + "step": 17690 + }, + { + "epoch": 0.18000284830729166, + "grad_norm": 9.545188903808594, + "learning_rate": 4.6143145753254335e-06, + "loss": 3.5115, + "step": 17695 + }, + { + "epoch": 0.1800537109375, + "grad_norm": 13.04957389831543, + "learning_rate": 4.614101164740965e-06, + "loss": 3.5562, + "step": 17700 + }, + { + "epoch": 0.18010457356770834, + "grad_norm": 10.874368667602539, + "learning_rate": 4.613887700067719e-06, + "loss": 3.3832, + "step": 17705 + }, + { + "epoch": 0.18015543619791666, + "grad_norm": 11.728689193725586, + "learning_rate": 4.613674181311158e-06, + "loss": 3.2963, + "step": 17710 + }, + { + "epoch": 0.180206298828125, + "grad_norm": 10.86733341217041, + "learning_rate": 4.613460608476744e-06, + "loss": 3.2448, + "step": 17715 + }, + { + "epoch": 0.18025716145833334, + "grad_norm": 13.16111946105957, + "learning_rate": 4.613246981569941e-06, + "loss": 3.5583, + "step": 17720 + }, + { + "epoch": 0.18030802408854166, + "grad_norm": 15.626137733459473, + "learning_rate": 4.6130333005962144e-06, + "loss": 3.4084, + "step": 17725 + }, + { + "epoch": 0.18035888671875, + "grad_norm": 10.255508422851562, + "learning_rate": 4.612819565561033e-06, + "loss": 3.2747, + "step": 17730 + }, + { + "epoch": 0.18040974934895834, + "grad_norm": 10.121077537536621, + "learning_rate": 4.612605776469863e-06, + "loss": 3.3058, + "step": 17735 + }, + { + "epoch": 0.18046061197916666, + "grad_norm": 12.807663917541504, + "learning_rate": 4.612391933328175e-06, + "loss": 3.1888, + "step": 17740 + }, + { + "epoch": 0.180511474609375, + "grad_norm": 7.487052917480469, + "learning_rate": 4.61217803614144e-06, + "loss": 3.2892, + "step": 17745 + }, + { + "epoch": 0.18056233723958334, + "grad_norm": 13.956525802612305, + "learning_rate": 4.61196408491513e-06, + "loss": 3.7043, + "step": 17750 + }, + { + "epoch": 0.18061319986979166, + "grad_norm": 9.125615119934082, + "learning_rate": 4.611750079654721e-06, + "loss": 3.4562, + "step": 17755 + }, + { + "epoch": 0.1806640625, + "grad_norm": 14.65468692779541, + "learning_rate": 4.611536020365686e-06, + "loss": 3.3539, + "step": 17760 + }, + { + "epoch": 0.18071492513020834, + "grad_norm": 10.214896202087402, + "learning_rate": 4.611321907053502e-06, + "loss": 3.2788, + "step": 17765 + }, + { + "epoch": 0.18076578776041666, + "grad_norm": 13.62902545928955, + "learning_rate": 4.611107739723647e-06, + "loss": 3.3931, + "step": 17770 + }, + { + "epoch": 0.180816650390625, + "grad_norm": 12.189156532287598, + "learning_rate": 4.610893518381602e-06, + "loss": 3.4467, + "step": 17775 + }, + { + "epoch": 0.18086751302083334, + "grad_norm": 17.752410888671875, + "learning_rate": 4.610679243032846e-06, + "loss": 3.1061, + "step": 17780 + }, + { + "epoch": 0.18091837565104166, + "grad_norm": 9.7039794921875, + "learning_rate": 4.610464913682863e-06, + "loss": 3.3899, + "step": 17785 + }, + { + "epoch": 0.18096923828125, + "grad_norm": 18.238666534423828, + "learning_rate": 4.610250530337134e-06, + "loss": 3.4644, + "step": 17790 + }, + { + "epoch": 0.18102010091145834, + "grad_norm": 8.860824584960938, + "learning_rate": 4.6100360930011455e-06, + "loss": 3.4091, + "step": 17795 + }, + { + "epoch": 0.18107096354166666, + "grad_norm": 10.903741836547852, + "learning_rate": 4.6098216016803845e-06, + "loss": 3.276, + "step": 17800 + }, + { + "epoch": 0.181121826171875, + "grad_norm": 13.30379581451416, + "learning_rate": 4.609607056380337e-06, + "loss": 3.2626, + "step": 17805 + }, + { + "epoch": 0.18117268880208334, + "grad_norm": 12.02137565612793, + "learning_rate": 4.609392457106494e-06, + "loss": 3.6866, + "step": 17810 + }, + { + "epoch": 0.18122355143229166, + "grad_norm": 10.083224296569824, + "learning_rate": 4.6091778038643445e-06, + "loss": 3.2943, + "step": 17815 + }, + { + "epoch": 0.1812744140625, + "grad_norm": 13.689177513122559, + "learning_rate": 4.608963096659381e-06, + "loss": 3.4036, + "step": 17820 + }, + { + "epoch": 0.18132527669270834, + "grad_norm": 10.178418159484863, + "learning_rate": 4.608748335497096e-06, + "loss": 3.3081, + "step": 17825 + }, + { + "epoch": 0.18137613932291666, + "grad_norm": 13.94970417022705, + "learning_rate": 4.608533520382985e-06, + "loss": 3.3372, + "step": 17830 + }, + { + "epoch": 0.181427001953125, + "grad_norm": 10.877111434936523, + "learning_rate": 4.608318651322543e-06, + "loss": 3.6664, + "step": 17835 + }, + { + "epoch": 0.18147786458333334, + "grad_norm": 11.025904655456543, + "learning_rate": 4.608103728321269e-06, + "loss": 3.7866, + "step": 17840 + }, + { + "epoch": 0.18152872721354166, + "grad_norm": 14.711570739746094, + "learning_rate": 4.6078887513846605e-06, + "loss": 3.4579, + "step": 17845 + }, + { + "epoch": 0.18157958984375, + "grad_norm": 9.070608139038086, + "learning_rate": 4.607673720518218e-06, + "loss": 3.3378, + "step": 17850 + }, + { + "epoch": 0.18163045247395834, + "grad_norm": 10.735955238342285, + "learning_rate": 4.607458635727443e-06, + "loss": 3.4461, + "step": 17855 + }, + { + "epoch": 0.18168131510416666, + "grad_norm": 16.661575317382812, + "learning_rate": 4.607243497017838e-06, + "loss": 3.3425, + "step": 17860 + }, + { + "epoch": 0.181732177734375, + "grad_norm": 12.659605979919434, + "learning_rate": 4.607028304394907e-06, + "loss": 3.9381, + "step": 17865 + }, + { + "epoch": 0.18178304036458334, + "grad_norm": 16.774187088012695, + "learning_rate": 4.606813057864158e-06, + "loss": 3.5132, + "step": 17870 + }, + { + "epoch": 0.18183390299479166, + "grad_norm": 6.721248149871826, + "learning_rate": 4.606597757431095e-06, + "loss": 3.3477, + "step": 17875 + }, + { + "epoch": 0.181884765625, + "grad_norm": 11.525874137878418, + "learning_rate": 4.606382403101228e-06, + "loss": 3.5922, + "step": 17880 + }, + { + "epoch": 0.18193562825520834, + "grad_norm": 13.829561233520508, + "learning_rate": 4.606166994880067e-06, + "loss": 3.2248, + "step": 17885 + }, + { + "epoch": 0.18198649088541666, + "grad_norm": 8.705430030822754, + "learning_rate": 4.605951532773122e-06, + "loss": 3.2562, + "step": 17890 + }, + { + "epoch": 0.182037353515625, + "grad_norm": 8.117154121398926, + "learning_rate": 4.605736016785905e-06, + "loss": 3.2657, + "step": 17895 + }, + { + "epoch": 0.18208821614583334, + "grad_norm": 12.32487964630127, + "learning_rate": 4.605520446923933e-06, + "loss": 3.1516, + "step": 17900 + }, + { + "epoch": 0.18213907877604166, + "grad_norm": 14.840004920959473, + "learning_rate": 4.605304823192719e-06, + "loss": 3.4167, + "step": 17905 + }, + { + "epoch": 0.18218994140625, + "grad_norm": 14.861167907714844, + "learning_rate": 4.60508914559778e-06, + "loss": 3.729, + "step": 17910 + }, + { + "epoch": 0.18224080403645834, + "grad_norm": 12.168790817260742, + "learning_rate": 4.6048734141446335e-06, + "loss": 3.2407, + "step": 17915 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 7.106325149536133, + "learning_rate": 4.604657628838801e-06, + "loss": 3.2299, + "step": 17920 + }, + { + "epoch": 0.182342529296875, + "grad_norm": 7.858881950378418, + "learning_rate": 4.604441789685801e-06, + "loss": 3.477, + "step": 17925 + }, + { + "epoch": 0.18239339192708334, + "grad_norm": 11.167740821838379, + "learning_rate": 4.604225896691157e-06, + "loss": 3.673, + "step": 17930 + }, + { + "epoch": 0.18244425455729166, + "grad_norm": 14.516616821289062, + "learning_rate": 4.604009949860392e-06, + "loss": 3.0951, + "step": 17935 + }, + { + "epoch": 0.1824951171875, + "grad_norm": 11.77841567993164, + "learning_rate": 4.603793949199031e-06, + "loss": 3.4499, + "step": 17940 + }, + { + "epoch": 0.18254597981770834, + "grad_norm": 12.027185440063477, + "learning_rate": 4.603577894712601e-06, + "loss": 3.8324, + "step": 17945 + }, + { + "epoch": 0.18259684244791666, + "grad_norm": 13.794437408447266, + "learning_rate": 4.603361786406628e-06, + "loss": 3.6594, + "step": 17950 + }, + { + "epoch": 0.182647705078125, + "grad_norm": 13.343639373779297, + "learning_rate": 4.603145624286643e-06, + "loss": 3.3471, + "step": 17955 + }, + { + "epoch": 0.18269856770833334, + "grad_norm": 10.639410018920898, + "learning_rate": 4.602929408358176e-06, + "loss": 3.3093, + "step": 17960 + }, + { + "epoch": 0.18274943033854166, + "grad_norm": 15.507131576538086, + "learning_rate": 4.602713138626758e-06, + "loss": 3.3536, + "step": 17965 + }, + { + "epoch": 0.18280029296875, + "grad_norm": 13.404929161071777, + "learning_rate": 4.602496815097923e-06, + "loss": 3.244, + "step": 17970 + }, + { + "epoch": 0.18285115559895834, + "grad_norm": 13.34079647064209, + "learning_rate": 4.602280437777205e-06, + "loss": 3.4758, + "step": 17975 + }, + { + "epoch": 0.18290201822916666, + "grad_norm": 10.848614692687988, + "learning_rate": 4.602064006670141e-06, + "loss": 3.2688, + "step": 17980 + }, + { + "epoch": 0.182952880859375, + "grad_norm": 11.669103622436523, + "learning_rate": 4.601847521782268e-06, + "loss": 3.51, + "step": 17985 + }, + { + "epoch": 0.18300374348958334, + "grad_norm": 9.459100723266602, + "learning_rate": 4.601630983119123e-06, + "loss": 3.3047, + "step": 17990 + }, + { + "epoch": 0.18305460611979166, + "grad_norm": 10.783151626586914, + "learning_rate": 4.601414390686248e-06, + "loss": 3.1554, + "step": 17995 + }, + { + "epoch": 0.18310546875, + "grad_norm": 7.823272705078125, + "learning_rate": 4.601197744489184e-06, + "loss": 3.2805, + "step": 18000 + }, + { + "epoch": 0.18315633138020834, + "grad_norm": 10.413420677185059, + "learning_rate": 4.600981044533473e-06, + "loss": 3.1569, + "step": 18005 + }, + { + "epoch": 0.18320719401041666, + "grad_norm": 18.30168342590332, + "learning_rate": 4.600764290824661e-06, + "loss": 3.2039, + "step": 18010 + }, + { + "epoch": 0.183258056640625, + "grad_norm": 13.457409858703613, + "learning_rate": 4.600547483368292e-06, + "loss": 3.5678, + "step": 18015 + }, + { + "epoch": 0.18330891927083334, + "grad_norm": 12.043107032775879, + "learning_rate": 4.600330622169914e-06, + "loss": 3.2696, + "step": 18020 + }, + { + "epoch": 0.18335978190104166, + "grad_norm": 10.646010398864746, + "learning_rate": 4.600113707235075e-06, + "loss": 3.2272, + "step": 18025 + }, + { + "epoch": 0.18341064453125, + "grad_norm": 12.23196792602539, + "learning_rate": 4.5998967385693235e-06, + "loss": 3.4313, + "step": 18030 + }, + { + "epoch": 0.18346150716145834, + "grad_norm": 9.132007598876953, + "learning_rate": 4.599679716178212e-06, + "loss": 3.5767, + "step": 18035 + }, + { + "epoch": 0.18351236979166666, + "grad_norm": 15.052911758422852, + "learning_rate": 4.599462640067294e-06, + "loss": 3.5414, + "step": 18040 + }, + { + "epoch": 0.183563232421875, + "grad_norm": 12.69869327545166, + "learning_rate": 4.599245510242121e-06, + "loss": 3.2965, + "step": 18045 + }, + { + "epoch": 0.18361409505208334, + "grad_norm": 10.353547096252441, + "learning_rate": 4.599028326708248e-06, + "loss": 2.9671, + "step": 18050 + }, + { + "epoch": 0.18366495768229166, + "grad_norm": 8.513712882995605, + "learning_rate": 4.598811089471235e-06, + "loss": 3.4682, + "step": 18055 + }, + { + "epoch": 0.1837158203125, + "grad_norm": 10.49561882019043, + "learning_rate": 4.598593798536636e-06, + "loss": 3.1158, + "step": 18060 + }, + { + "epoch": 0.18376668294270834, + "grad_norm": 7.227719306945801, + "learning_rate": 4.598376453910013e-06, + "loss": 3.4783, + "step": 18065 + }, + { + "epoch": 0.18381754557291666, + "grad_norm": 8.991704940795898, + "learning_rate": 4.598159055596926e-06, + "loss": 3.258, + "step": 18070 + }, + { + "epoch": 0.183868408203125, + "grad_norm": 11.315898895263672, + "learning_rate": 4.5979416036029366e-06, + "loss": 3.3756, + "step": 18075 + }, + { + "epoch": 0.18391927083333334, + "grad_norm": 10.222885131835938, + "learning_rate": 4.597724097933608e-06, + "loss": 3.6465, + "step": 18080 + }, + { + "epoch": 0.18397013346354166, + "grad_norm": 15.640301704406738, + "learning_rate": 4.597506538594506e-06, + "loss": 3.247, + "step": 18085 + }, + { + "epoch": 0.18402099609375, + "grad_norm": 14.560544967651367, + "learning_rate": 4.597288925591196e-06, + "loss": 3.6574, + "step": 18090 + }, + { + "epoch": 0.18407185872395834, + "grad_norm": 11.333292007446289, + "learning_rate": 4.597071258929247e-06, + "loss": 3.6407, + "step": 18095 + }, + { + "epoch": 0.18412272135416666, + "grad_norm": 16.446786880493164, + "learning_rate": 4.596853538614226e-06, + "loss": 3.505, + "step": 18100 + }, + { + "epoch": 0.184173583984375, + "grad_norm": 8.610262870788574, + "learning_rate": 4.596635764651704e-06, + "loss": 3.867, + "step": 18105 + }, + { + "epoch": 0.18422444661458334, + "grad_norm": 11.611706733703613, + "learning_rate": 4.596417937047253e-06, + "loss": 3.4687, + "step": 18110 + }, + { + "epoch": 0.18427530924479166, + "grad_norm": 9.320916175842285, + "learning_rate": 4.5962000558064465e-06, + "loss": 3.6369, + "step": 18115 + }, + { + "epoch": 0.184326171875, + "grad_norm": 10.020218849182129, + "learning_rate": 4.5959821209348585e-06, + "loss": 3.2122, + "step": 18120 + }, + { + "epoch": 0.18437703450520834, + "grad_norm": 15.03809642791748, + "learning_rate": 4.595764132438064e-06, + "loss": 3.3018, + "step": 18125 + }, + { + "epoch": 0.18442789713541666, + "grad_norm": 13.933128356933594, + "learning_rate": 4.595546090321642e-06, + "loss": 3.4161, + "step": 18130 + }, + { + "epoch": 0.184478759765625, + "grad_norm": 15.626864433288574, + "learning_rate": 4.595327994591169e-06, + "loss": 3.1919, + "step": 18135 + }, + { + "epoch": 0.18452962239583334, + "grad_norm": 15.305878639221191, + "learning_rate": 4.595109845252226e-06, + "loss": 3.7762, + "step": 18140 + }, + { + "epoch": 0.18458048502604166, + "grad_norm": 12.688610076904297, + "learning_rate": 4.594891642310395e-06, + "loss": 3.5695, + "step": 18145 + }, + { + "epoch": 0.18463134765625, + "grad_norm": 12.089020729064941, + "learning_rate": 4.594673385771257e-06, + "loss": 3.651, + "step": 18150 + }, + { + "epoch": 0.18468221028645834, + "grad_norm": 14.751028060913086, + "learning_rate": 4.594455075640397e-06, + "loss": 3.3362, + "step": 18155 + }, + { + "epoch": 0.18473307291666666, + "grad_norm": 14.168632507324219, + "learning_rate": 4.594236711923401e-06, + "loss": 3.5103, + "step": 18160 + }, + { + "epoch": 0.184783935546875, + "grad_norm": 11.68898868560791, + "learning_rate": 4.594018294625855e-06, + "loss": 3.0502, + "step": 18165 + }, + { + "epoch": 0.18483479817708334, + "grad_norm": 11.675402641296387, + "learning_rate": 4.593799823753347e-06, + "loss": 3.2437, + "step": 18170 + }, + { + "epoch": 0.18488566080729166, + "grad_norm": 14.426863670349121, + "learning_rate": 4.593581299311467e-06, + "loss": 3.5249, + "step": 18175 + }, + { + "epoch": 0.1849365234375, + "grad_norm": 14.298029899597168, + "learning_rate": 4.593362721305805e-06, + "loss": 3.3278, + "step": 18180 + }, + { + "epoch": 0.18498738606770834, + "grad_norm": 13.843632698059082, + "learning_rate": 4.5931440897419546e-06, + "loss": 3.3659, + "step": 18185 + }, + { + "epoch": 0.18503824869791666, + "grad_norm": 15.963536262512207, + "learning_rate": 4.592925404625509e-06, + "loss": 3.2535, + "step": 18190 + }, + { + "epoch": 0.185089111328125, + "grad_norm": 13.099442481994629, + "learning_rate": 4.592706665962063e-06, + "loss": 3.4257, + "step": 18195 + }, + { + "epoch": 0.18513997395833334, + "grad_norm": 10.886616706848145, + "learning_rate": 4.592487873757212e-06, + "loss": 3.3174, + "step": 18200 + }, + { + "epoch": 0.18519083658854166, + "grad_norm": 14.132744789123535, + "learning_rate": 4.592269028016555e-06, + "loss": 3.5966, + "step": 18205 + }, + { + "epoch": 0.18524169921875, + "grad_norm": 12.172066688537598, + "learning_rate": 4.5920501287456905e-06, + "loss": 3.25, + "step": 18210 + }, + { + "epoch": 0.18529256184895834, + "grad_norm": 9.599930763244629, + "learning_rate": 4.591831175950221e-06, + "loss": 3.3482, + "step": 18215 + }, + { + "epoch": 0.18534342447916666, + "grad_norm": 14.642792701721191, + "learning_rate": 4.5916121696357454e-06, + "loss": 3.195, + "step": 18220 + }, + { + "epoch": 0.185394287109375, + "grad_norm": 17.15027618408203, + "learning_rate": 4.591393109807868e-06, + "loss": 3.6213, + "step": 18225 + }, + { + "epoch": 0.18544514973958334, + "grad_norm": 10.881146430969238, + "learning_rate": 4.591173996472195e-06, + "loss": 3.054, + "step": 18230 + }, + { + "epoch": 0.18549601236979166, + "grad_norm": 16.243993759155273, + "learning_rate": 4.5909548296343295e-06, + "loss": 3.2491, + "step": 18235 + }, + { + "epoch": 0.185546875, + "grad_norm": 12.273772239685059, + "learning_rate": 4.590735609299881e-06, + "loss": 3.208, + "step": 18240 + }, + { + "epoch": 0.18559773763020834, + "grad_norm": 12.737349510192871, + "learning_rate": 4.590516335474458e-06, + "loss": 3.0281, + "step": 18245 + }, + { + "epoch": 0.18564860026041666, + "grad_norm": 10.880949020385742, + "learning_rate": 4.590297008163669e-06, + "loss": 3.3972, + "step": 18250 + }, + { + "epoch": 0.185699462890625, + "grad_norm": 10.571752548217773, + "learning_rate": 4.590077627373126e-06, + "loss": 3.5009, + "step": 18255 + }, + { + "epoch": 0.18575032552083334, + "grad_norm": 17.717178344726562, + "learning_rate": 4.589858193108444e-06, + "loss": 3.5101, + "step": 18260 + }, + { + "epoch": 0.18580118815104166, + "grad_norm": 12.134099006652832, + "learning_rate": 4.589638705375234e-06, + "loss": 3.3066, + "step": 18265 + }, + { + "epoch": 0.18585205078125, + "grad_norm": 15.478253364562988, + "learning_rate": 4.5894191641791145e-06, + "loss": 3.329, + "step": 18270 + }, + { + "epoch": 0.18590291341145834, + "grad_norm": 8.206337928771973, + "learning_rate": 4.5891995695257e-06, + "loss": 3.2733, + "step": 18275 + }, + { + "epoch": 0.18595377604166666, + "grad_norm": 10.964665412902832, + "learning_rate": 4.58897992142061e-06, + "loss": 3.5635, + "step": 18280 + }, + { + "epoch": 0.186004638671875, + "grad_norm": 12.019591331481934, + "learning_rate": 4.588760219869463e-06, + "loss": 3.3073, + "step": 18285 + }, + { + "epoch": 0.18605550130208334, + "grad_norm": 12.096267700195312, + "learning_rate": 4.588540464877882e-06, + "loss": 3.4862, + "step": 18290 + }, + { + "epoch": 0.18610636393229166, + "grad_norm": 10.249032020568848, + "learning_rate": 4.588320656451487e-06, + "loss": 3.6171, + "step": 18295 + }, + { + "epoch": 0.1861572265625, + "grad_norm": 16.42626190185547, + "learning_rate": 4.588100794595904e-06, + "loss": 3.5845, + "step": 18300 + }, + { + "epoch": 0.18620808919270834, + "grad_norm": 16.055395126342773, + "learning_rate": 4.587880879316758e-06, + "loss": 3.5212, + "step": 18305 + }, + { + "epoch": 0.18625895182291666, + "grad_norm": 14.237022399902344, + "learning_rate": 4.587660910619672e-06, + "loss": 3.1991, + "step": 18310 + }, + { + "epoch": 0.186309814453125, + "grad_norm": 7.7949652671813965, + "learning_rate": 4.5874408885102785e-06, + "loss": 3.4076, + "step": 18315 + }, + { + "epoch": 0.18636067708333334, + "grad_norm": 8.902517318725586, + "learning_rate": 4.5872208129942045e-06, + "loss": 3.2999, + "step": 18320 + }, + { + "epoch": 0.18641153971354166, + "grad_norm": 11.778844833374023, + "learning_rate": 4.58700068407708e-06, + "loss": 3.2865, + "step": 18325 + }, + { + "epoch": 0.18646240234375, + "grad_norm": 8.537171363830566, + "learning_rate": 4.586780501764538e-06, + "loss": 3.2332, + "step": 18330 + }, + { + "epoch": 0.18651326497395834, + "grad_norm": 15.2643461227417, + "learning_rate": 4.586560266062211e-06, + "loss": 3.3204, + "step": 18335 + }, + { + "epoch": 0.18656412760416666, + "grad_norm": 8.688283920288086, + "learning_rate": 4.586339976975735e-06, + "loss": 3.3692, + "step": 18340 + }, + { + "epoch": 0.186614990234375, + "grad_norm": 12.671655654907227, + "learning_rate": 4.586119634510745e-06, + "loss": 3.3349, + "step": 18345 + }, + { + "epoch": 0.18666585286458334, + "grad_norm": 12.594867706298828, + "learning_rate": 4.585899238672878e-06, + "loss": 3.4412, + "step": 18350 + }, + { + "epoch": 0.18671671549479166, + "grad_norm": 16.03032875061035, + "learning_rate": 4.585678789467774e-06, + "loss": 3.474, + "step": 18355 + }, + { + "epoch": 0.186767578125, + "grad_norm": 13.385184288024902, + "learning_rate": 4.585458286901072e-06, + "loss": 3.2519, + "step": 18360 + }, + { + "epoch": 0.18681844075520834, + "grad_norm": 9.913110733032227, + "learning_rate": 4.5852377309784146e-06, + "loss": 3.3587, + "step": 18365 + }, + { + "epoch": 0.18686930338541666, + "grad_norm": 10.111922264099121, + "learning_rate": 4.585017121705444e-06, + "loss": 3.1416, + "step": 18370 + }, + { + "epoch": 0.186920166015625, + "grad_norm": 9.840933799743652, + "learning_rate": 4.584796459087805e-06, + "loss": 3.5775, + "step": 18375 + }, + { + "epoch": 0.18697102864583334, + "grad_norm": 12.728557586669922, + "learning_rate": 4.584575743131142e-06, + "loss": 3.6974, + "step": 18380 + }, + { + "epoch": 0.18702189127604166, + "grad_norm": 12.671290397644043, + "learning_rate": 4.584354973841103e-06, + "loss": 3.343, + "step": 18385 + }, + { + "epoch": 0.18707275390625, + "grad_norm": 14.869614601135254, + "learning_rate": 4.584134151223335e-06, + "loss": 3.4718, + "step": 18390 + }, + { + "epoch": 0.18712361653645834, + "grad_norm": 11.287585258483887, + "learning_rate": 4.58391327528349e-06, + "loss": 3.4128, + "step": 18395 + }, + { + "epoch": 0.18717447916666666, + "grad_norm": 16.449617385864258, + "learning_rate": 4.5836923460272175e-06, + "loss": 3.7161, + "step": 18400 + }, + { + "epoch": 0.187225341796875, + "grad_norm": 16.260473251342773, + "learning_rate": 4.58347136346017e-06, + "loss": 3.3264, + "step": 18405 + }, + { + "epoch": 0.18727620442708334, + "grad_norm": 13.514466285705566, + "learning_rate": 4.5832503275880015e-06, + "loss": 3.294, + "step": 18410 + }, + { + "epoch": 0.18732706705729166, + "grad_norm": 13.021100044250488, + "learning_rate": 4.583029238416368e-06, + "loss": 3.3275, + "step": 18415 + }, + { + "epoch": 0.1873779296875, + "grad_norm": 10.005925178527832, + "learning_rate": 4.582808095950924e-06, + "loss": 3.308, + "step": 18420 + }, + { + "epoch": 0.18742879231770834, + "grad_norm": 11.48652172088623, + "learning_rate": 4.5825869001973285e-06, + "loss": 3.5553, + "step": 18425 + }, + { + "epoch": 0.18747965494791666, + "grad_norm": 15.618109703063965, + "learning_rate": 4.582365651161242e-06, + "loss": 3.3904, + "step": 18430 + }, + { + "epoch": 0.187530517578125, + "grad_norm": 14.050233840942383, + "learning_rate": 4.582144348848323e-06, + "loss": 3.7672, + "step": 18435 + }, + { + "epoch": 0.18758138020833334, + "grad_norm": 10.15687370300293, + "learning_rate": 4.581922993264235e-06, + "loss": 3.2861, + "step": 18440 + }, + { + "epoch": 0.18763224283854166, + "grad_norm": 10.453628540039062, + "learning_rate": 4.58170158441464e-06, + "loss": 3.1252, + "step": 18445 + }, + { + "epoch": 0.18768310546875, + "grad_norm": 16.22777557373047, + "learning_rate": 4.5814801223052035e-06, + "loss": 3.43, + "step": 18450 + }, + { + "epoch": 0.18773396809895834, + "grad_norm": 9.1096830368042, + "learning_rate": 4.581258606941592e-06, + "loss": 3.3274, + "step": 18455 + }, + { + "epoch": 0.18778483072916666, + "grad_norm": 16.2208251953125, + "learning_rate": 4.581037038329472e-06, + "loss": 3.8525, + "step": 18460 + }, + { + "epoch": 0.187835693359375, + "grad_norm": 16.626689910888672, + "learning_rate": 4.580815416474512e-06, + "loss": 3.2766, + "step": 18465 + }, + { + "epoch": 0.18788655598958334, + "grad_norm": 16.821229934692383, + "learning_rate": 4.580593741382384e-06, + "loss": 3.6567, + "step": 18470 + }, + { + "epoch": 0.18793741861979166, + "grad_norm": 11.31299877166748, + "learning_rate": 4.580372013058757e-06, + "loss": 3.4805, + "step": 18475 + }, + { + "epoch": 0.18798828125, + "grad_norm": 16.890230178833008, + "learning_rate": 4.580150231509306e-06, + "loss": 3.2733, + "step": 18480 + }, + { + "epoch": 0.18803914388020834, + "grad_norm": 8.345052719116211, + "learning_rate": 4.579928396739704e-06, + "loss": 3.7072, + "step": 18485 + }, + { + "epoch": 0.18809000651041666, + "grad_norm": 7.271616458892822, + "learning_rate": 4.579706508755627e-06, + "loss": 3.4539, + "step": 18490 + }, + { + "epoch": 0.188140869140625, + "grad_norm": 14.4603910446167, + "learning_rate": 4.579484567562752e-06, + "loss": 3.8369, + "step": 18495 + }, + { + "epoch": 0.18819173177083334, + "grad_norm": 8.192587852478027, + "learning_rate": 4.579262573166757e-06, + "loss": 2.9722, + "step": 18500 + }, + { + "epoch": 0.18824259440104166, + "grad_norm": 8.657429695129395, + "learning_rate": 4.579040525573323e-06, + "loss": 3.7435, + "step": 18505 + }, + { + "epoch": 0.18829345703125, + "grad_norm": 11.86208724975586, + "learning_rate": 4.578818424788129e-06, + "loss": 3.3249, + "step": 18510 + }, + { + "epoch": 0.18834431966145834, + "grad_norm": 11.226592063903809, + "learning_rate": 4.578596270816858e-06, + "loss": 3.4405, + "step": 18515 + }, + { + "epoch": 0.18839518229166666, + "grad_norm": 7.9588727951049805, + "learning_rate": 4.578374063665195e-06, + "loss": 3.378, + "step": 18520 + }, + { + "epoch": 0.188446044921875, + "grad_norm": 10.29086971282959, + "learning_rate": 4.578151803338824e-06, + "loss": 3.6522, + "step": 18525 + }, + { + "epoch": 0.18849690755208334, + "grad_norm": 12.614090919494629, + "learning_rate": 4.577929489843431e-06, + "loss": 3.0822, + "step": 18530 + }, + { + "epoch": 0.18854777018229166, + "grad_norm": 14.052781105041504, + "learning_rate": 4.577707123184705e-06, + "loss": 3.3621, + "step": 18535 + }, + { + "epoch": 0.1885986328125, + "grad_norm": 13.460591316223145, + "learning_rate": 4.577484703368335e-06, + "loss": 3.5421, + "step": 18540 + }, + { + "epoch": 0.18864949544270834, + "grad_norm": 10.110848426818848, + "learning_rate": 4.57726223040001e-06, + "loss": 3.5516, + "step": 18545 + }, + { + "epoch": 0.18870035807291666, + "grad_norm": 11.632022857666016, + "learning_rate": 4.577039704285424e-06, + "loss": 3.5707, + "step": 18550 + }, + { + "epoch": 0.188751220703125, + "grad_norm": 8.971404075622559, + "learning_rate": 4.5768171250302706e-06, + "loss": 3.4157, + "step": 18555 + }, + { + "epoch": 0.18880208333333334, + "grad_norm": 14.906126976013184, + "learning_rate": 4.576594492640242e-06, + "loss": 3.7568, + "step": 18560 + }, + { + "epoch": 0.18885294596354166, + "grad_norm": 12.4305419921875, + "learning_rate": 4.576371807121036e-06, + "loss": 3.5414, + "step": 18565 + }, + { + "epoch": 0.18890380859375, + "grad_norm": 13.76634407043457, + "learning_rate": 4.57614906847835e-06, + "loss": 3.4253, + "step": 18570 + }, + { + "epoch": 0.18895467122395834, + "grad_norm": 14.682574272155762, + "learning_rate": 4.5759262767178805e-06, + "loss": 3.4316, + "step": 18575 + }, + { + "epoch": 0.18900553385416666, + "grad_norm": 12.575339317321777, + "learning_rate": 4.575703431845331e-06, + "loss": 3.6921, + "step": 18580 + }, + { + "epoch": 0.189056396484375, + "grad_norm": 7.278709411621094, + "learning_rate": 4.5754805338664e-06, + "loss": 3.1521, + "step": 18585 + }, + { + "epoch": 0.18910725911458334, + "grad_norm": 12.746679306030273, + "learning_rate": 4.575257582786792e-06, + "loss": 3.2062, + "step": 18590 + }, + { + "epoch": 0.18915812174479166, + "grad_norm": 15.597989082336426, + "learning_rate": 4.575034578612211e-06, + "loss": 3.5262, + "step": 18595 + }, + { + "epoch": 0.189208984375, + "grad_norm": 7.950192928314209, + "learning_rate": 4.574811521348361e-06, + "loss": 3.6785, + "step": 18600 + }, + { + "epoch": 0.18925984700520834, + "grad_norm": 10.346031188964844, + "learning_rate": 4.574588411000951e-06, + "loss": 3.3503, + "step": 18605 + }, + { + "epoch": 0.18931070963541666, + "grad_norm": 11.723124504089355, + "learning_rate": 4.574365247575688e-06, + "loss": 3.3964, + "step": 18610 + }, + { + "epoch": 0.189361572265625, + "grad_norm": 8.930464744567871, + "learning_rate": 4.574142031078282e-06, + "loss": 3.2352, + "step": 18615 + }, + { + "epoch": 0.18941243489583334, + "grad_norm": 13.8888521194458, + "learning_rate": 4.573918761514443e-06, + "loss": 3.315, + "step": 18620 + }, + { + "epoch": 0.18946329752604166, + "grad_norm": 7.327556133270264, + "learning_rate": 4.573695438889885e-06, + "loss": 3.1141, + "step": 18625 + }, + { + "epoch": 0.18951416015625, + "grad_norm": 8.570147514343262, + "learning_rate": 4.57347206321032e-06, + "loss": 3.1851, + "step": 18630 + }, + { + "epoch": 0.18956502278645834, + "grad_norm": 9.34282112121582, + "learning_rate": 4.573248634481464e-06, + "loss": 3.5862, + "step": 18635 + }, + { + "epoch": 0.18961588541666666, + "grad_norm": 12.512747764587402, + "learning_rate": 4.573025152709033e-06, + "loss": 3.5737, + "step": 18640 + }, + { + "epoch": 0.189666748046875, + "grad_norm": 12.564814567565918, + "learning_rate": 4.572801617898746e-06, + "loss": 3.4582, + "step": 18645 + }, + { + "epoch": 0.18971761067708334, + "grad_norm": 14.594040870666504, + "learning_rate": 4.572578030056319e-06, + "loss": 3.0915, + "step": 18650 + }, + { + "epoch": 0.18976847330729166, + "grad_norm": 11.229549407958984, + "learning_rate": 4.572354389187476e-06, + "loss": 3.6208, + "step": 18655 + }, + { + "epoch": 0.1898193359375, + "grad_norm": 13.392086029052734, + "learning_rate": 4.572130695297936e-06, + "loss": 3.5334, + "step": 18660 + }, + { + "epoch": 0.18987019856770834, + "grad_norm": 9.601693153381348, + "learning_rate": 4.571906948393424e-06, + "loss": 3.3134, + "step": 18665 + }, + { + "epoch": 0.18992106119791666, + "grad_norm": 13.067540168762207, + "learning_rate": 4.571683148479663e-06, + "loss": 3.0544, + "step": 18670 + }, + { + "epoch": 0.189971923828125, + "grad_norm": 10.653081893920898, + "learning_rate": 4.57145929556238e-06, + "loss": 3.7998, + "step": 18675 + }, + { + "epoch": 0.19002278645833334, + "grad_norm": 12.351947784423828, + "learning_rate": 4.5712353896473025e-06, + "loss": 3.3205, + "step": 18680 + }, + { + "epoch": 0.19007364908854166, + "grad_norm": 9.99809455871582, + "learning_rate": 4.571011430740158e-06, + "loss": 3.577, + "step": 18685 + }, + { + "epoch": 0.19012451171875, + "grad_norm": 11.64851188659668, + "learning_rate": 4.5707874188466774e-06, + "loss": 3.3207, + "step": 18690 + }, + { + "epoch": 0.19017537434895834, + "grad_norm": 10.789388656616211, + "learning_rate": 4.5705633539725915e-06, + "loss": 3.5073, + "step": 18695 + }, + { + "epoch": 0.19022623697916666, + "grad_norm": 11.41800308227539, + "learning_rate": 4.570339236123632e-06, + "loss": 3.4615, + "step": 18700 + }, + { + "epoch": 0.190277099609375, + "grad_norm": 12.321640968322754, + "learning_rate": 4.5701150653055345e-06, + "loss": 3.3763, + "step": 18705 + }, + { + "epoch": 0.19032796223958334, + "grad_norm": 14.531020164489746, + "learning_rate": 4.569890841524034e-06, + "loss": 3.5657, + "step": 18710 + }, + { + "epoch": 0.19037882486979166, + "grad_norm": 10.604043960571289, + "learning_rate": 4.569666564784867e-06, + "loss": 3.4327, + "step": 18715 + }, + { + "epoch": 0.1904296875, + "grad_norm": 9.537259101867676, + "learning_rate": 4.569442235093771e-06, + "loss": 3.3424, + "step": 18720 + }, + { + "epoch": 0.19048055013020834, + "grad_norm": 10.841087341308594, + "learning_rate": 4.569217852456486e-06, + "loss": 3.4501, + "step": 18725 + }, + { + "epoch": 0.19053141276041666, + "grad_norm": 10.725885391235352, + "learning_rate": 4.568993416878753e-06, + "loss": 3.6736, + "step": 18730 + }, + { + "epoch": 0.190582275390625, + "grad_norm": 13.705768585205078, + "learning_rate": 4.568768928366313e-06, + "loss": 3.7702, + "step": 18735 + }, + { + "epoch": 0.19063313802083334, + "grad_norm": 8.565932273864746, + "learning_rate": 4.568544386924911e-06, + "loss": 3.6213, + "step": 18740 + }, + { + "epoch": 0.19068400065104166, + "grad_norm": 14.157207489013672, + "learning_rate": 4.568319792560292e-06, + "loss": 3.4415, + "step": 18745 + }, + { + "epoch": 0.19073486328125, + "grad_norm": 12.56330680847168, + "learning_rate": 4.5680951452782e-06, + "loss": 3.1235, + "step": 18750 + }, + { + "epoch": 0.19078572591145834, + "grad_norm": 7.816888809204102, + "learning_rate": 4.567870445084385e-06, + "loss": 3.1856, + "step": 18755 + }, + { + "epoch": 0.19083658854166666, + "grad_norm": 13.05587387084961, + "learning_rate": 4.567645691984594e-06, + "loss": 3.1723, + "step": 18760 + }, + { + "epoch": 0.190887451171875, + "grad_norm": 13.687058448791504, + "learning_rate": 4.567420885984578e-06, + "loss": 3.8625, + "step": 18765 + }, + { + "epoch": 0.19093831380208334, + "grad_norm": 12.832854270935059, + "learning_rate": 4.567196027090088e-06, + "loss": 3.1376, + "step": 18770 + }, + { + "epoch": 0.19098917643229166, + "grad_norm": 13.068547248840332, + "learning_rate": 4.56697111530688e-06, + "loss": 3.3375, + "step": 18775 + }, + { + "epoch": 0.1910400390625, + "grad_norm": 10.62330150604248, + "learning_rate": 4.566746150640704e-06, + "loss": 3.4257, + "step": 18780 + }, + { + "epoch": 0.19109090169270834, + "grad_norm": 13.381546020507812, + "learning_rate": 4.566521133097318e-06, + "loss": 3.2436, + "step": 18785 + }, + { + "epoch": 0.19114176432291666, + "grad_norm": 9.422579765319824, + "learning_rate": 4.566296062682478e-06, + "loss": 3.4788, + "step": 18790 + }, + { + "epoch": 0.191192626953125, + "grad_norm": 16.37440299987793, + "learning_rate": 4.566070939401944e-06, + "loss": 3.5952, + "step": 18795 + }, + { + "epoch": 0.19124348958333334, + "grad_norm": 11.154594421386719, + "learning_rate": 4.565845763261475e-06, + "loss": 3.1788, + "step": 18800 + }, + { + "epoch": 0.19129435221354166, + "grad_norm": 13.181290626525879, + "learning_rate": 4.565620534266831e-06, + "loss": 3.2884, + "step": 18805 + }, + { + "epoch": 0.19134521484375, + "grad_norm": 10.758809089660645, + "learning_rate": 4.565395252423775e-06, + "loss": 3.2639, + "step": 18810 + }, + { + "epoch": 0.19139607747395834, + "grad_norm": 11.981339454650879, + "learning_rate": 4.5651699177380714e-06, + "loss": 3.2688, + "step": 18815 + }, + { + "epoch": 0.19144694010416666, + "grad_norm": 9.572751998901367, + "learning_rate": 4.564944530215486e-06, + "loss": 3.5481, + "step": 18820 + }, + { + "epoch": 0.191497802734375, + "grad_norm": 9.87020492553711, + "learning_rate": 4.564719089861783e-06, + "loss": 3.4662, + "step": 18825 + }, + { + "epoch": 0.19154866536458334, + "grad_norm": 9.3651123046875, + "learning_rate": 4.564493596682732e-06, + "loss": 3.2547, + "step": 18830 + }, + { + "epoch": 0.19159952799479166, + "grad_norm": 9.596661567687988, + "learning_rate": 4.564268050684101e-06, + "loss": 3.3904, + "step": 18835 + }, + { + "epoch": 0.191650390625, + "grad_norm": 15.922765731811523, + "learning_rate": 4.564042451871662e-06, + "loss": 3.2157, + "step": 18840 + }, + { + "epoch": 0.19170125325520834, + "grad_norm": 11.004732131958008, + "learning_rate": 4.563816800251185e-06, + "loss": 3.3153, + "step": 18845 + }, + { + "epoch": 0.19175211588541666, + "grad_norm": 8.292093276977539, + "learning_rate": 4.563591095828446e-06, + "loss": 3.2079, + "step": 18850 + }, + { + "epoch": 0.191802978515625, + "grad_norm": 8.698326110839844, + "learning_rate": 4.563365338609216e-06, + "loss": 3.404, + "step": 18855 + }, + { + "epoch": 0.19185384114583334, + "grad_norm": 11.080509185791016, + "learning_rate": 4.563139528599274e-06, + "loss": 3.2749, + "step": 18860 + }, + { + "epoch": 0.19190470377604166, + "grad_norm": 14.920134544372559, + "learning_rate": 4.562913665804397e-06, + "loss": 3.455, + "step": 18865 + }, + { + "epoch": 0.19195556640625, + "grad_norm": 10.069123268127441, + "learning_rate": 4.562687750230361e-06, + "loss": 3.6829, + "step": 18870 + }, + { + "epoch": 0.19200642903645834, + "grad_norm": 8.710797309875488, + "learning_rate": 4.562461781882949e-06, + "loss": 3.3883, + "step": 18875 + }, + { + "epoch": 0.19205729166666666, + "grad_norm": 13.156225204467773, + "learning_rate": 4.5622357607679415e-06, + "loss": 3.1545, + "step": 18880 + }, + { + "epoch": 0.192108154296875, + "grad_norm": 12.595132827758789, + "learning_rate": 4.5620096868911205e-06, + "loss": 3.4499, + "step": 18885 + }, + { + "epoch": 0.19215901692708334, + "grad_norm": 12.80722713470459, + "learning_rate": 4.561783560258269e-06, + "loss": 3.6053, + "step": 18890 + }, + { + "epoch": 0.19220987955729166, + "grad_norm": 10.552962303161621, + "learning_rate": 4.561557380875175e-06, + "loss": 3.4, + "step": 18895 + }, + { + "epoch": 0.1922607421875, + "grad_norm": 12.450384140014648, + "learning_rate": 4.561331148747624e-06, + "loss": 3.4466, + "step": 18900 + }, + { + "epoch": 0.19231160481770834, + "grad_norm": 9.195076942443848, + "learning_rate": 4.5611048638814045e-06, + "loss": 3.0326, + "step": 18905 + }, + { + "epoch": 0.19236246744791666, + "grad_norm": 10.586387634277344, + "learning_rate": 4.560878526282305e-06, + "loss": 3.1759, + "step": 18910 + } + ], + "logging_steps": 5, + "max_steps": 98304, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 394, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.328444652552192e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}