diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 49.441786283891545, + "eval_steps": 500, + "global_step": 31000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 3.6600120067596436, + "learning_rate": 0.0019993620414673046, + "loss": 2.7717, + "step": 10 + }, + { + "epoch": 0.03, + "grad_norm": 1.797737717628479, + "learning_rate": 0.0019987240829346096, + "loss": 2.2538, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 1.503915548324585, + "learning_rate": 0.001998086124401914, + "loss": 2.5162, + "step": 30 + }, + { + "epoch": 0.06, + "grad_norm": 2.3732383251190186, + "learning_rate": 0.0019974481658692187, + "loss": 2.7087, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 4.7273030281066895, + "learning_rate": 0.0019968102073365233, + "loss": 2.3868, + "step": 50 + }, + { + "epoch": 0.1, + "grad_norm": 4.214552402496338, + "learning_rate": 0.001996172248803828, + "loss": 2.4794, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 5.231570243835449, + "learning_rate": 0.0019955342902711324, + "loss": 2.5851, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 1.2388306856155396, + "learning_rate": 0.001994896331738437, + "loss": 2.3979, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 5.343308448791504, + "learning_rate": 0.001994258373205742, + "loss": 2.5801, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 5.010150909423828, + "learning_rate": 0.0019936204146730465, + "loss": 2.782, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 4.422647953033447, + "learning_rate": 0.001992982456140351, + "loss": 2.5693, + "step": 110 + }, + { + "epoch": 0.19, + "grad_norm": 2.5217037200927734, + "learning_rate": 0.0019923444976076557, + "loss": 2.6707, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 4.945374011993408, + "learning_rate": 0.0019917065390749602, + "loss": 2.7236, + "step": 130 + }, + { + "epoch": 0.22, + "grad_norm": 1.9850760698318481, + "learning_rate": 0.001991068580542265, + "loss": 2.5799, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 5.4692206382751465, + "learning_rate": 0.0019904306220095693, + "loss": 2.6595, + "step": 150 + }, + { + "epoch": 0.26, + "grad_norm": 2.3442928791046143, + "learning_rate": 0.0019897926634768743, + "loss": 2.8178, + "step": 160 + }, + { + "epoch": 0.27, + "grad_norm": 1.8789024353027344, + "learning_rate": 0.001989154704944179, + "loss": 2.5928, + "step": 170 + }, + { + "epoch": 0.29, + "grad_norm": 4.694440841674805, + "learning_rate": 0.0019885167464114835, + "loss": 2.7751, + "step": 180 + }, + { + "epoch": 0.3, + "grad_norm": 6.304182529449463, + "learning_rate": 0.001987878787878788, + "loss": 2.5837, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 5.747090816497803, + "learning_rate": 0.0019872408293460926, + "loss": 2.7569, + "step": 200 + }, + { + "epoch": 0.33, + "grad_norm": 5.950802803039551, + "learning_rate": 0.001986602870813397, + "loss": 2.5465, + "step": 210 + }, + { + "epoch": 0.35, + "grad_norm": 3.991403102874756, + "learning_rate": 0.0019859649122807017, + "loss": 2.3429, + "step": 220 + }, + { + "epoch": 0.37, + "grad_norm": 15.909507751464844, + "learning_rate": 0.0019853269537480063, + "loss": 2.6886, + "step": 230 + }, + { + "epoch": 0.38, + "grad_norm": 3.464792490005493, + "learning_rate": 0.0019846889952153113, + "loss": 2.6745, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.647952079772949, + "learning_rate": 0.001984051036682616, + "loss": 2.666, + "step": 250 + }, + { + "epoch": 0.41, + "grad_norm": 5.060638427734375, + "learning_rate": 0.0019834130781499204, + "loss": 2.502, + "step": 260 + }, + { + "epoch": 0.43, + "grad_norm": 2.046036720275879, + "learning_rate": 0.001982775119617225, + "loss": 2.6536, + "step": 270 + }, + { + "epoch": 0.45, + "grad_norm": 3.072054147720337, + "learning_rate": 0.0019821371610845295, + "loss": 2.7964, + "step": 280 + }, + { + "epoch": 0.46, + "grad_norm": 4.118267059326172, + "learning_rate": 0.001981499202551834, + "loss": 2.7141, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 6.113500118255615, + "learning_rate": 0.0019808612440191387, + "loss": 2.6802, + "step": 300 + }, + { + "epoch": 0.49, + "grad_norm": 4.462850570678711, + "learning_rate": 0.0019802232854864437, + "loss": 2.611, + "step": 310 + }, + { + "epoch": 0.51, + "grad_norm": 4.859583854675293, + "learning_rate": 0.001979585326953748, + "loss": 2.954, + "step": 320 + }, + { + "epoch": 0.53, + "grad_norm": 2.9786384105682373, + "learning_rate": 0.0019789473684210528, + "loss": 2.7084, + "step": 330 + }, + { + "epoch": 0.54, + "grad_norm": 2.122506618499756, + "learning_rate": 0.0019783094098883573, + "loss": 2.5917, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 2.5119547843933105, + "learning_rate": 0.001977671451355662, + "loss": 2.7874, + "step": 350 + }, + { + "epoch": 0.57, + "grad_norm": 5.032376289367676, + "learning_rate": 0.0019770334928229665, + "loss": 2.7288, + "step": 360 + }, + { + "epoch": 0.59, + "grad_norm": 6.186531066894531, + "learning_rate": 0.001976395534290271, + "loss": 2.7858, + "step": 370 + }, + { + "epoch": 0.61, + "grad_norm": 1.5598961114883423, + "learning_rate": 0.001975757575757576, + "loss": 2.8022, + "step": 380 + }, + { + "epoch": 0.62, + "grad_norm": 3.7895145416259766, + "learning_rate": 0.0019751196172248806, + "loss": 2.9117, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 9.171380043029785, + "learning_rate": 0.001974481658692185, + "loss": 2.8098, + "step": 400 + }, + { + "epoch": 0.65, + "grad_norm": 3.090906858444214, + "learning_rate": 0.0019738437001594897, + "loss": 2.7351, + "step": 410 + }, + { + "epoch": 0.67, + "grad_norm": 8.468884468078613, + "learning_rate": 0.0019732057416267943, + "loss": 2.868, + "step": 420 + }, + { + "epoch": 0.69, + "grad_norm": 3.8941705226898193, + "learning_rate": 0.001972567783094099, + "loss": 2.9424, + "step": 430 + }, + { + "epoch": 0.7, + "grad_norm": 2.694938898086548, + "learning_rate": 0.0019719298245614034, + "loss": 2.7966, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 3.2257883548736572, + "learning_rate": 0.0019712918660287084, + "loss": 2.8685, + "step": 450 + }, + { + "epoch": 0.73, + "grad_norm": 2.8044159412384033, + "learning_rate": 0.001970653907496013, + "loss": 2.9069, + "step": 460 + }, + { + "epoch": 0.75, + "grad_norm": 3.731559991836548, + "learning_rate": 0.0019700159489633175, + "loss": 2.9604, + "step": 470 + }, + { + "epoch": 0.77, + "grad_norm": 3.8493754863739014, + "learning_rate": 0.001969377990430622, + "loss": 3.1143, + "step": 480 + }, + { + "epoch": 0.78, + "grad_norm": 2.380948781967163, + "learning_rate": 0.0019687400318979266, + "loss": 2.8833, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 2.755617141723633, + "learning_rate": 0.001968102073365231, + "loss": 2.8669, + "step": 500 + }, + { + "epoch": 0.81, + "grad_norm": 3.0759685039520264, + "learning_rate": 0.0019674641148325358, + "loss": 2.659, + "step": 510 + }, + { + "epoch": 0.83, + "grad_norm": 2.367964744567871, + "learning_rate": 0.0019668261562998408, + "loss": 3.0106, + "step": 520 + }, + { + "epoch": 0.85, + "grad_norm": 4.058963775634766, + "learning_rate": 0.0019661881977671453, + "loss": 2.8824, + "step": 530 + }, + { + "epoch": 0.86, + "grad_norm": 1.9852606058120728, + "learning_rate": 0.00196555023923445, + "loss": 2.8171, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 3.7825193405151367, + "learning_rate": 0.0019649122807017545, + "loss": 2.9357, + "step": 550 + }, + { + "epoch": 0.89, + "grad_norm": 4.315491676330566, + "learning_rate": 0.001964274322169059, + "loss": 2.8921, + "step": 560 + }, + { + "epoch": 0.91, + "grad_norm": 2.1023871898651123, + "learning_rate": 0.0019636363636363636, + "loss": 2.8568, + "step": 570 + }, + { + "epoch": 0.93, + "grad_norm": 2.552720785140991, + "learning_rate": 0.001962998405103668, + "loss": 2.9858, + "step": 580 + }, + { + "epoch": 0.94, + "grad_norm": 9.927117347717285, + "learning_rate": 0.001962360446570973, + "loss": 2.8938, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 5.051787853240967, + "learning_rate": 0.0019617224880382777, + "loss": 2.7605, + "step": 600 + }, + { + "epoch": 0.97, + "grad_norm": 3.3584322929382324, + "learning_rate": 0.0019610845295055823, + "loss": 2.7514, + "step": 610 + }, + { + "epoch": 0.99, + "grad_norm": 1.7937440872192383, + "learning_rate": 0.001960446570972887, + "loss": 2.9748, + "step": 620 + }, + { + "epoch": 1.0, + "grad_norm": 5.3826003074646, + "learning_rate": 0.0019598086124401914, + "loss": 2.7608, + "step": 630 + }, + { + "epoch": 1.02, + "grad_norm": 4.8360724449157715, + "learning_rate": 0.001959170653907496, + "loss": 2.2607, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 2.8146324157714844, + "learning_rate": 0.0019585326953748005, + "loss": 2.2973, + "step": 650 + }, + { + "epoch": 1.05, + "grad_norm": 2.535956859588623, + "learning_rate": 0.0019578947368421055, + "loss": 2.2598, + "step": 660 + }, + { + "epoch": 1.07, + "grad_norm": 4.664738655090332, + "learning_rate": 0.00195725677830941, + "loss": 2.3387, + "step": 670 + }, + { + "epoch": 1.08, + "grad_norm": 3.6893537044525146, + "learning_rate": 0.0019566188197767146, + "loss": 2.249, + "step": 680 + }, + { + "epoch": 1.1, + "grad_norm": 4.679712295532227, + "learning_rate": 0.001955980861244019, + "loss": 2.3068, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 2.374504327774048, + "learning_rate": 0.0019553429027113238, + "loss": 2.5318, + "step": 700 + }, + { + "epoch": 1.13, + "grad_norm": 3.3196609020233154, + "learning_rate": 0.0019547049441786283, + "loss": 2.4066, + "step": 710 + }, + { + "epoch": 1.15, + "grad_norm": 5.714052200317383, + "learning_rate": 0.001954066985645933, + "loss": 2.4376, + "step": 720 + }, + { + "epoch": 1.16, + "grad_norm": 6.177249431610107, + "learning_rate": 0.001953429027113238, + "loss": 2.2004, + "step": 730 + }, + { + "epoch": 1.18, + "grad_norm": 2.936424970626831, + "learning_rate": 0.0019527910685805422, + "loss": 2.3424, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 3.634345769882202, + "learning_rate": 0.0019521531100478468, + "loss": 2.5118, + "step": 750 + }, + { + "epoch": 1.21, + "grad_norm": 1.8677217960357666, + "learning_rate": 0.0019515151515151514, + "loss": 2.3253, + "step": 760 + }, + { + "epoch": 1.23, + "grad_norm": 2.1149327754974365, + "learning_rate": 0.0019508771929824564, + "loss": 2.3353, + "step": 770 + }, + { + "epoch": 1.24, + "grad_norm": 4.144554138183594, + "learning_rate": 0.001950239234449761, + "loss": 2.6115, + "step": 780 + }, + { + "epoch": 1.26, + "grad_norm": 4.8128814697265625, + "learning_rate": 0.0019496012759170655, + "loss": 2.5473, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 2.2847745418548584, + "learning_rate": 0.0019489633173843703, + "loss": 2.4678, + "step": 800 + }, + { + "epoch": 1.29, + "grad_norm": 4.829673767089844, + "learning_rate": 0.0019483253588516748, + "loss": 2.638, + "step": 810 + }, + { + "epoch": 1.31, + "grad_norm": 2.4837145805358887, + "learning_rate": 0.0019476874003189794, + "loss": 2.4926, + "step": 820 + }, + { + "epoch": 1.32, + "grad_norm": 2.7193591594696045, + "learning_rate": 0.001947049441786284, + "loss": 2.4601, + "step": 830 + }, + { + "epoch": 1.34, + "grad_norm": 3.0565848350524902, + "learning_rate": 0.0019464114832535887, + "loss": 2.3951, + "step": 840 + }, + { + "epoch": 1.36, + "grad_norm": 3.4434733390808105, + "learning_rate": 0.0019457735247208933, + "loss": 2.4247, + "step": 850 + }, + { + "epoch": 1.37, + "grad_norm": 2.9921035766601562, + "learning_rate": 0.0019451355661881979, + "loss": 2.223, + "step": 860 + }, + { + "epoch": 1.39, + "grad_norm": 4.521476745605469, + "learning_rate": 0.0019444976076555026, + "loss": 2.371, + "step": 870 + }, + { + "epoch": 1.4, + "grad_norm": 1.987562656402588, + "learning_rate": 0.0019438596491228072, + "loss": 2.3996, + "step": 880 + }, + { + "epoch": 1.42, + "grad_norm": 2.5876095294952393, + "learning_rate": 0.0019432216905901118, + "loss": 2.6735, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 3.727102279663086, + "learning_rate": 0.0019425837320574163, + "loss": 2.3431, + "step": 900 + }, + { + "epoch": 1.45, + "grad_norm": 2.775712728500366, + "learning_rate": 0.001941945773524721, + "loss": 2.5217, + "step": 910 + }, + { + "epoch": 1.47, + "grad_norm": 4.316661357879639, + "learning_rate": 0.0019413078149920257, + "loss": 2.5654, + "step": 920 + }, + { + "epoch": 1.48, + "grad_norm": 5.313731670379639, + "learning_rate": 0.0019406698564593302, + "loss": 2.5676, + "step": 930 + }, + { + "epoch": 1.5, + "grad_norm": 3.3875491619110107, + "learning_rate": 0.0019400318979266348, + "loss": 2.701, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 5.12388801574707, + "learning_rate": 0.0019393939393939396, + "loss": 2.6469, + "step": 950 + }, + { + "epoch": 1.53, + "grad_norm": 5.213893890380859, + "learning_rate": 0.0019387559808612441, + "loss": 2.5527, + "step": 960 + }, + { + "epoch": 1.55, + "grad_norm": 2.5714313983917236, + "learning_rate": 0.0019381180223285487, + "loss": 2.5112, + "step": 970 + }, + { + "epoch": 1.56, + "grad_norm": 3.034376859664917, + "learning_rate": 0.0019374800637958535, + "loss": 2.7126, + "step": 980 + }, + { + "epoch": 1.58, + "grad_norm": 4.801724910736084, + "learning_rate": 0.001936842105263158, + "loss": 2.6247, + "step": 990 + }, + { + "epoch": 1.59, + "grad_norm": 2.8160829544067383, + "learning_rate": 0.0019362041467304626, + "loss": 2.5434, + "step": 1000 + }, + { + "epoch": 1.61, + "grad_norm": 5.186509132385254, + "learning_rate": 0.0019355661881977672, + "loss": 2.6724, + "step": 1010 + }, + { + "epoch": 1.63, + "grad_norm": 4.558096408843994, + "learning_rate": 0.001934928229665072, + "loss": 2.6487, + "step": 1020 + }, + { + "epoch": 1.64, + "grad_norm": 4.698276519775391, + "learning_rate": 0.0019342902711323765, + "loss": 2.5392, + "step": 1030 + }, + { + "epoch": 1.66, + "grad_norm": 3.624025821685791, + "learning_rate": 0.001933652312599681, + "loss": 2.6413, + "step": 1040 + }, + { + "epoch": 1.67, + "grad_norm": 2.634162664413452, + "learning_rate": 0.0019330143540669858, + "loss": 2.7082, + "step": 1050 + }, + { + "epoch": 1.69, + "grad_norm": 2.483462333679199, + "learning_rate": 0.0019323763955342904, + "loss": 2.716, + "step": 1060 + }, + { + "epoch": 1.71, + "grad_norm": 3.256911277770996, + "learning_rate": 0.001931738437001595, + "loss": 2.7508, + "step": 1070 + }, + { + "epoch": 1.72, + "grad_norm": 2.233299970626831, + "learning_rate": 0.0019311004784688995, + "loss": 2.7591, + "step": 1080 + }, + { + "epoch": 1.74, + "grad_norm": 3.583534002304077, + "learning_rate": 0.0019304625199362043, + "loss": 2.5285, + "step": 1090 + }, + { + "epoch": 1.75, + "grad_norm": 2.7138407230377197, + "learning_rate": 0.0019298245614035089, + "loss": 2.4167, + "step": 1100 + }, + { + "epoch": 1.77, + "grad_norm": 4.423559665679932, + "learning_rate": 0.0019291866028708134, + "loss": 2.6601, + "step": 1110 + }, + { + "epoch": 1.79, + "grad_norm": 4.424483776092529, + "learning_rate": 0.0019285486443381182, + "loss": 2.6824, + "step": 1120 + }, + { + "epoch": 1.8, + "grad_norm": 1.5624890327453613, + "learning_rate": 0.0019279106858054228, + "loss": 2.7301, + "step": 1130 + }, + { + "epoch": 1.82, + "grad_norm": 3.3539106845855713, + "learning_rate": 0.0019272727272727273, + "loss": 2.6942, + "step": 1140 + }, + { + "epoch": 1.83, + "grad_norm": 3.6363375186920166, + "learning_rate": 0.001926634768740032, + "loss": 2.6089, + "step": 1150 + }, + { + "epoch": 1.85, + "grad_norm": 4.147191047668457, + "learning_rate": 0.0019259968102073367, + "loss": 2.7383, + "step": 1160 + }, + { + "epoch": 1.87, + "grad_norm": 2.932065486907959, + "learning_rate": 0.0019253588516746412, + "loss": 2.6189, + "step": 1170 + }, + { + "epoch": 1.88, + "grad_norm": 4.373189449310303, + "learning_rate": 0.0019247208931419458, + "loss": 2.8137, + "step": 1180 + }, + { + "epoch": 1.9, + "grad_norm": 3.7591166496276855, + "learning_rate": 0.0019240829346092506, + "loss": 2.7852, + "step": 1190 + }, + { + "epoch": 1.91, + "grad_norm": 4.965326309204102, + "learning_rate": 0.0019234449760765552, + "loss": 2.6627, + "step": 1200 + }, + { + "epoch": 1.93, + "grad_norm": 3.0201761722564697, + "learning_rate": 0.0019228070175438597, + "loss": 2.8458, + "step": 1210 + }, + { + "epoch": 1.95, + "grad_norm": 4.8068695068359375, + "learning_rate": 0.0019221690590111643, + "loss": 2.6854, + "step": 1220 + }, + { + "epoch": 1.96, + "grad_norm": 2.76481032371521, + "learning_rate": 0.001921531100478469, + "loss": 2.721, + "step": 1230 + }, + { + "epoch": 1.98, + "grad_norm": 4.103845596313477, + "learning_rate": 0.0019208931419457736, + "loss": 2.7458, + "step": 1240 + }, + { + "epoch": 1.99, + "grad_norm": 2.848653793334961, + "learning_rate": 0.0019202551834130782, + "loss": 2.8408, + "step": 1250 + }, + { + "epoch": 2.01, + "grad_norm": 2.1055376529693604, + "learning_rate": 0.0019196172248803827, + "loss": 2.2232, + "step": 1260 + }, + { + "epoch": 2.03, + "grad_norm": 2.4950308799743652, + "learning_rate": 0.0019189792663476875, + "loss": 1.9928, + "step": 1270 + }, + { + "epoch": 2.04, + "grad_norm": 4.026719093322754, + "learning_rate": 0.001918341307814992, + "loss": 1.8887, + "step": 1280 + }, + { + "epoch": 2.06, + "grad_norm": 2.439951181411743, + "learning_rate": 0.0019177033492822966, + "loss": 1.7856, + "step": 1290 + }, + { + "epoch": 2.07, + "grad_norm": 3.8327765464782715, + "learning_rate": 0.0019170653907496014, + "loss": 1.9734, + "step": 1300 + }, + { + "epoch": 2.09, + "grad_norm": 4.497558116912842, + "learning_rate": 0.001916427432216906, + "loss": 1.9576, + "step": 1310 + }, + { + "epoch": 2.11, + "grad_norm": 4.017326831817627, + "learning_rate": 0.0019157894736842106, + "loss": 2.0879, + "step": 1320 + }, + { + "epoch": 2.12, + "grad_norm": 5.959986209869385, + "learning_rate": 0.0019151515151515151, + "loss": 2.0042, + "step": 1330 + }, + { + "epoch": 2.14, + "grad_norm": 2.2927639484405518, + "learning_rate": 0.00191451355661882, + "loss": 1.8132, + "step": 1340 + }, + { + "epoch": 2.15, + "grad_norm": 6.273167133331299, + "learning_rate": 0.0019138755980861245, + "loss": 2.0617, + "step": 1350 + }, + { + "epoch": 2.17, + "grad_norm": 2.9032981395721436, + "learning_rate": 0.001913237639553429, + "loss": 2.0173, + "step": 1360 + }, + { + "epoch": 2.19, + "grad_norm": 5.651817798614502, + "learning_rate": 0.0019125996810207338, + "loss": 2.0464, + "step": 1370 + }, + { + "epoch": 2.2, + "grad_norm": 4.344000339508057, + "learning_rate": 0.0019119617224880384, + "loss": 1.9377, + "step": 1380 + }, + { + "epoch": 2.22, + "grad_norm": 3.4183313846588135, + "learning_rate": 0.001911323763955343, + "loss": 2.1948, + "step": 1390 + }, + { + "epoch": 2.23, + "grad_norm": 8.772147178649902, + "learning_rate": 0.0019106858054226475, + "loss": 2.0348, + "step": 1400 + }, + { + "epoch": 2.25, + "grad_norm": 2.020637273788452, + "learning_rate": 0.0019100478468899523, + "loss": 2.0723, + "step": 1410 + }, + { + "epoch": 2.26, + "grad_norm": 4.4110565185546875, + "learning_rate": 0.0019094098883572568, + "loss": 1.9483, + "step": 1420 + }, + { + "epoch": 2.28, + "grad_norm": 4.694215774536133, + "learning_rate": 0.0019087719298245614, + "loss": 2.0805, + "step": 1430 + }, + { + "epoch": 2.3, + "grad_norm": 4.042151927947998, + "learning_rate": 0.0019081339712918662, + "loss": 2.0796, + "step": 1440 + }, + { + "epoch": 2.31, + "grad_norm": 3.466386318206787, + "learning_rate": 0.0019074960127591707, + "loss": 2.049, + "step": 1450 + }, + { + "epoch": 2.33, + "grad_norm": 5.676107406616211, + "learning_rate": 0.0019068580542264753, + "loss": 2.183, + "step": 1460 + }, + { + "epoch": 2.34, + "grad_norm": 2.662849187850952, + "learning_rate": 0.0019062200956937799, + "loss": 2.0795, + "step": 1470 + }, + { + "epoch": 2.36, + "grad_norm": 2.9790804386138916, + "learning_rate": 0.0019055821371610846, + "loss": 2.0264, + "step": 1480 + }, + { + "epoch": 2.38, + "grad_norm": 5.430638790130615, + "learning_rate": 0.0019049441786283892, + "loss": 2.0712, + "step": 1490 + }, + { + "epoch": 2.39, + "grad_norm": 3.5230486392974854, + "learning_rate": 0.0019043062200956938, + "loss": 2.1533, + "step": 1500 + }, + { + "epoch": 2.41, + "grad_norm": 3.2345664501190186, + "learning_rate": 0.0019036682615629985, + "loss": 2.325, + "step": 1510 + }, + { + "epoch": 2.42, + "grad_norm": 2.0495986938476562, + "learning_rate": 0.001903030303030303, + "loss": 2.225, + "step": 1520 + }, + { + "epoch": 2.44, + "grad_norm": 4.178987979888916, + "learning_rate": 0.0019023923444976077, + "loss": 2.3576, + "step": 1530 + }, + { + "epoch": 2.46, + "grad_norm": 4.338198184967041, + "learning_rate": 0.0019017543859649122, + "loss": 2.1247, + "step": 1540 + }, + { + "epoch": 2.47, + "grad_norm": 2.644819736480713, + "learning_rate": 0.001901116427432217, + "loss": 2.3011, + "step": 1550 + }, + { + "epoch": 2.49, + "grad_norm": 2.83943772315979, + "learning_rate": 0.0019004784688995216, + "loss": 2.3449, + "step": 1560 + }, + { + "epoch": 2.5, + "grad_norm": 5.573853492736816, + "learning_rate": 0.0018998405103668261, + "loss": 2.1616, + "step": 1570 + }, + { + "epoch": 2.52, + "grad_norm": 5.958674430847168, + "learning_rate": 0.001899202551834131, + "loss": 2.2724, + "step": 1580 + }, + { + "epoch": 2.54, + "grad_norm": 4.136911392211914, + "learning_rate": 0.0018985645933014355, + "loss": 2.1089, + "step": 1590 + }, + { + "epoch": 2.55, + "grad_norm": 2.5898241996765137, + "learning_rate": 0.00189792663476874, + "loss": 2.3466, + "step": 1600 + }, + { + "epoch": 2.57, + "grad_norm": 3.101346015930176, + "learning_rate": 0.0018972886762360446, + "loss": 2.2233, + "step": 1610 + }, + { + "epoch": 2.58, + "grad_norm": 3.3297476768493652, + "learning_rate": 0.0018966507177033494, + "loss": 2.1519, + "step": 1620 + }, + { + "epoch": 2.6, + "grad_norm": 5.8525848388671875, + "learning_rate": 0.001896012759170654, + "loss": 2.4774, + "step": 1630 + }, + { + "epoch": 2.62, + "grad_norm": 5.049089431762695, + "learning_rate": 0.0018953748006379585, + "loss": 2.2551, + "step": 1640 + }, + { + "epoch": 2.63, + "grad_norm": 3.721668004989624, + "learning_rate": 0.001894736842105263, + "loss": 2.1791, + "step": 1650 + }, + { + "epoch": 2.65, + "grad_norm": 3.234546661376953, + "learning_rate": 0.0018940988835725679, + "loss": 2.3578, + "step": 1660 + }, + { + "epoch": 2.66, + "grad_norm": 3.65110445022583, + "learning_rate": 0.0018934609250398724, + "loss": 2.4003, + "step": 1670 + }, + { + "epoch": 2.68, + "grad_norm": 4.681038856506348, + "learning_rate": 0.001892822966507177, + "loss": 2.3949, + "step": 1680 + }, + { + "epoch": 2.7, + "grad_norm": 4.4321136474609375, + "learning_rate": 0.0018921850079744818, + "loss": 2.3568, + "step": 1690 + }, + { + "epoch": 2.71, + "grad_norm": 3.075857162475586, + "learning_rate": 0.0018915470494417863, + "loss": 2.3458, + "step": 1700 + }, + { + "epoch": 2.73, + "grad_norm": 2.5896382331848145, + "learning_rate": 0.0018909090909090909, + "loss": 2.4374, + "step": 1710 + }, + { + "epoch": 2.74, + "grad_norm": 4.7238006591796875, + "learning_rate": 0.0018902711323763954, + "loss": 2.5829, + "step": 1720 + }, + { + "epoch": 2.76, + "grad_norm": 2.7794413566589355, + "learning_rate": 0.0018896331738437002, + "loss": 2.2883, + "step": 1730 + }, + { + "epoch": 2.78, + "grad_norm": 2.269745349884033, + "learning_rate": 0.0018889952153110048, + "loss": 2.1972, + "step": 1740 + }, + { + "epoch": 2.79, + "grad_norm": 4.9795918464660645, + "learning_rate": 0.0018883572567783093, + "loss": 2.3769, + "step": 1750 + }, + { + "epoch": 2.81, + "grad_norm": 2.1848745346069336, + "learning_rate": 0.0018877192982456141, + "loss": 2.3916, + "step": 1760 + }, + { + "epoch": 2.82, + "grad_norm": 3.246695041656494, + "learning_rate": 0.0018870813397129187, + "loss": 2.3303, + "step": 1770 + }, + { + "epoch": 2.84, + "grad_norm": 4.342026710510254, + "learning_rate": 0.0018864433811802233, + "loss": 2.3351, + "step": 1780 + }, + { + "epoch": 2.85, + "grad_norm": 3.7110838890075684, + "learning_rate": 0.0018858054226475278, + "loss": 2.2379, + "step": 1790 + }, + { + "epoch": 2.87, + "grad_norm": 3.2227859497070312, + "learning_rate": 0.0018851674641148326, + "loss": 2.3734, + "step": 1800 + }, + { + "epoch": 2.89, + "grad_norm": 3.257556915283203, + "learning_rate": 0.0018845295055821372, + "loss": 2.533, + "step": 1810 + }, + { + "epoch": 2.9, + "grad_norm": 3.287235975265503, + "learning_rate": 0.0018838915470494417, + "loss": 2.4149, + "step": 1820 + }, + { + "epoch": 2.92, + "grad_norm": 5.332248210906982, + "learning_rate": 0.0018832535885167465, + "loss": 2.4045, + "step": 1830 + }, + { + "epoch": 2.93, + "grad_norm": 6.954147815704346, + "learning_rate": 0.001882615629984051, + "loss": 2.4621, + "step": 1840 + }, + { + "epoch": 2.95, + "grad_norm": 3.5278656482696533, + "learning_rate": 0.0018819776714513556, + "loss": 2.3869, + "step": 1850 + }, + { + "epoch": 2.97, + "grad_norm": 5.101337909698486, + "learning_rate": 0.0018813397129186602, + "loss": 2.507, + "step": 1860 + }, + { + "epoch": 2.98, + "grad_norm": 3.60355544090271, + "learning_rate": 0.001880701754385965, + "loss": 2.409, + "step": 1870 + }, + { + "epoch": 3.0, + "grad_norm": 4.093048572540283, + "learning_rate": 0.0018800637958532695, + "loss": 2.2993, + "step": 1880 + }, + { + "epoch": 3.01, + "grad_norm": 2.8062260150909424, + "learning_rate": 0.001879425837320574, + "loss": 1.7359, + "step": 1890 + }, + { + "epoch": 3.03, + "grad_norm": 6.747288227081299, + "learning_rate": 0.0018787878787878789, + "loss": 1.6095, + "step": 1900 + }, + { + "epoch": 3.05, + "grad_norm": 4.082904815673828, + "learning_rate": 0.0018781499202551834, + "loss": 1.69, + "step": 1910 + }, + { + "epoch": 3.06, + "grad_norm": 1.8365858793258667, + "learning_rate": 0.001877511961722488, + "loss": 1.5815, + "step": 1920 + }, + { + "epoch": 3.08, + "grad_norm": 2.940593719482422, + "learning_rate": 0.0018768740031897926, + "loss": 1.6635, + "step": 1930 + }, + { + "epoch": 3.09, + "grad_norm": 2.660888433456421, + "learning_rate": 0.0018762360446570973, + "loss": 1.632, + "step": 1940 + }, + { + "epoch": 3.11, + "grad_norm": 3.061300039291382, + "learning_rate": 0.001875598086124402, + "loss": 1.6964, + "step": 1950 + }, + { + "epoch": 3.13, + "grad_norm": 3.134197235107422, + "learning_rate": 0.0018749601275917065, + "loss": 1.6702, + "step": 1960 + }, + { + "epoch": 3.14, + "grad_norm": 5.188543796539307, + "learning_rate": 0.001874322169059011, + "loss": 1.7615, + "step": 1970 + }, + { + "epoch": 3.16, + "grad_norm": 3.115239381790161, + "learning_rate": 0.0018736842105263158, + "loss": 1.8079, + "step": 1980 + }, + { + "epoch": 3.17, + "grad_norm": 4.397618770599365, + "learning_rate": 0.0018730462519936204, + "loss": 1.8256, + "step": 1990 + }, + { + "epoch": 3.19, + "grad_norm": 6.745879650115967, + "learning_rate": 0.001872408293460925, + "loss": 1.6285, + "step": 2000 + }, + { + "epoch": 3.21, + "grad_norm": 4.609273433685303, + "learning_rate": 0.0018717703349282297, + "loss": 1.8885, + "step": 2010 + }, + { + "epoch": 3.22, + "grad_norm": 2.650247097015381, + "learning_rate": 0.0018711323763955343, + "loss": 1.6762, + "step": 2020 + }, + { + "epoch": 3.24, + "grad_norm": 5.857548713684082, + "learning_rate": 0.0018704944178628388, + "loss": 1.7824, + "step": 2030 + }, + { + "epoch": 3.25, + "grad_norm": 3.2646751403808594, + "learning_rate": 0.0018698564593301434, + "loss": 1.8078, + "step": 2040 + }, + { + "epoch": 3.27, + "grad_norm": 3.6167776584625244, + "learning_rate": 0.0018692185007974482, + "loss": 1.7395, + "step": 2050 + }, + { + "epoch": 3.29, + "grad_norm": 3.98301100730896, + "learning_rate": 0.0018685805422647527, + "loss": 1.811, + "step": 2060 + }, + { + "epoch": 3.3, + "grad_norm": 5.3117594718933105, + "learning_rate": 0.0018679425837320573, + "loss": 1.7647, + "step": 2070 + }, + { + "epoch": 3.32, + "grad_norm": 6.290541172027588, + "learning_rate": 0.001867304625199362, + "loss": 1.8698, + "step": 2080 + }, + { + "epoch": 3.33, + "grad_norm": 6.5661091804504395, + "learning_rate": 0.0018666666666666666, + "loss": 2.0072, + "step": 2090 + }, + { + "epoch": 3.35, + "grad_norm": 6.150557994842529, + "learning_rate": 0.0018660287081339712, + "loss": 1.8055, + "step": 2100 + }, + { + "epoch": 3.37, + "grad_norm": 3.677581310272217, + "learning_rate": 0.0018653907496012758, + "loss": 1.8711, + "step": 2110 + }, + { + "epoch": 3.38, + "grad_norm": 2.2296063899993896, + "learning_rate": 0.0018647527910685806, + "loss": 1.8763, + "step": 2120 + }, + { + "epoch": 3.4, + "grad_norm": 3.410414695739746, + "learning_rate": 0.0018641148325358851, + "loss": 1.8947, + "step": 2130 + }, + { + "epoch": 3.41, + "grad_norm": 3.566406726837158, + "learning_rate": 0.0018634768740031897, + "loss": 1.9423, + "step": 2140 + }, + { + "epoch": 3.43, + "grad_norm": 5.5341668128967285, + "learning_rate": 0.0018628389154704945, + "loss": 2.0945, + "step": 2150 + }, + { + "epoch": 3.44, + "grad_norm": 4.542388439178467, + "learning_rate": 0.001862200956937799, + "loss": 1.9816, + "step": 2160 + }, + { + "epoch": 3.46, + "grad_norm": 3.3940858840942383, + "learning_rate": 0.0018615629984051036, + "loss": 1.9789, + "step": 2170 + }, + { + "epoch": 3.48, + "grad_norm": 3.9412808418273926, + "learning_rate": 0.0018609250398724081, + "loss": 2.1294, + "step": 2180 + }, + { + "epoch": 3.49, + "grad_norm": 2.695256233215332, + "learning_rate": 0.001860287081339713, + "loss": 2.0233, + "step": 2190 + }, + { + "epoch": 3.51, + "grad_norm": 3.1621010303497314, + "learning_rate": 0.0018596491228070175, + "loss": 1.8246, + "step": 2200 + }, + { + "epoch": 3.52, + "grad_norm": 5.293850898742676, + "learning_rate": 0.001859011164274322, + "loss": 2.0004, + "step": 2210 + }, + { + "epoch": 3.54, + "grad_norm": 3.9184532165527344, + "learning_rate": 0.0018583732057416268, + "loss": 2.1011, + "step": 2220 + }, + { + "epoch": 3.56, + "grad_norm": 2.1356756687164307, + "learning_rate": 0.0018577352472089314, + "loss": 2.1129, + "step": 2230 + }, + { + "epoch": 3.57, + "grad_norm": 3.8817296028137207, + "learning_rate": 0.001857097288676236, + "loss": 2.0047, + "step": 2240 + }, + { + "epoch": 3.59, + "grad_norm": 3.2533388137817383, + "learning_rate": 0.0018564593301435405, + "loss": 1.9137, + "step": 2250 + }, + { + "epoch": 3.6, + "grad_norm": 3.3586273193359375, + "learning_rate": 0.0018558213716108455, + "loss": 2.065, + "step": 2260 + }, + { + "epoch": 3.62, + "grad_norm": 4.144857406616211, + "learning_rate": 0.00185518341307815, + "loss": 2.0735, + "step": 2270 + }, + { + "epoch": 3.64, + "grad_norm": 3.9639623165130615, + "learning_rate": 0.0018545454545454546, + "loss": 2.118, + "step": 2280 + }, + { + "epoch": 3.65, + "grad_norm": 3.5141801834106445, + "learning_rate": 0.0018539074960127592, + "loss": 2.0006, + "step": 2290 + }, + { + "epoch": 3.67, + "grad_norm": 3.2397677898406982, + "learning_rate": 0.001853269537480064, + "loss": 2.1261, + "step": 2300 + }, + { + "epoch": 3.68, + "grad_norm": 5.273965835571289, + "learning_rate": 0.0018526315789473685, + "loss": 2.0134, + "step": 2310 + }, + { + "epoch": 3.7, + "grad_norm": 4.7644805908203125, + "learning_rate": 0.001851993620414673, + "loss": 1.9781, + "step": 2320 + }, + { + "epoch": 3.72, + "grad_norm": 3.042400598526001, + "learning_rate": 0.0018513556618819779, + "loss": 2.0429, + "step": 2330 + }, + { + "epoch": 3.73, + "grad_norm": 4.666615009307861, + "learning_rate": 0.0018507177033492824, + "loss": 2.052, + "step": 2340 + }, + { + "epoch": 3.75, + "grad_norm": 2.8000500202178955, + "learning_rate": 0.001850079744816587, + "loss": 1.8426, + "step": 2350 + }, + { + "epoch": 3.76, + "grad_norm": 4.616471767425537, + "learning_rate": 0.0018494417862838916, + "loss": 2.1656, + "step": 2360 + }, + { + "epoch": 3.78, + "grad_norm": 4.575398921966553, + "learning_rate": 0.0018488038277511964, + "loss": 2.176, + "step": 2370 + }, + { + "epoch": 3.8, + "grad_norm": 4.790685176849365, + "learning_rate": 0.001848165869218501, + "loss": 2.2702, + "step": 2380 + }, + { + "epoch": 3.81, + "grad_norm": 4.581923007965088, + "learning_rate": 0.0018475279106858055, + "loss": 2.1888, + "step": 2390 + }, + { + "epoch": 3.83, + "grad_norm": 4.997535705566406, + "learning_rate": 0.0018468899521531103, + "loss": 2.0964, + "step": 2400 + }, + { + "epoch": 3.84, + "grad_norm": 3.024472951889038, + "learning_rate": 0.0018462519936204148, + "loss": 1.9286, + "step": 2410 + }, + { + "epoch": 3.86, + "grad_norm": 3.9244346618652344, + "learning_rate": 0.0018456140350877194, + "loss": 2.0429, + "step": 2420 + }, + { + "epoch": 3.88, + "grad_norm": 5.021399974822998, + "learning_rate": 0.001844976076555024, + "loss": 2.0872, + "step": 2430 + }, + { + "epoch": 3.89, + "grad_norm": 2.4256746768951416, + "learning_rate": 0.0018443381180223287, + "loss": 2.0974, + "step": 2440 + }, + { + "epoch": 3.91, + "grad_norm": 1.7723888158798218, + "learning_rate": 0.0018437001594896333, + "loss": 2.1741, + "step": 2450 + }, + { + "epoch": 3.92, + "grad_norm": 2.9281272888183594, + "learning_rate": 0.0018430622009569379, + "loss": 2.0566, + "step": 2460 + }, + { + "epoch": 3.94, + "grad_norm": 3.0242364406585693, + "learning_rate": 0.0018424242424242426, + "loss": 2.2248, + "step": 2470 + }, + { + "epoch": 3.96, + "grad_norm": 3.027165651321411, + "learning_rate": 0.0018417862838915472, + "loss": 2.0762, + "step": 2480 + }, + { + "epoch": 3.97, + "grad_norm": 4.249027729034424, + "learning_rate": 0.0018411483253588518, + "loss": 2.1017, + "step": 2490 + }, + { + "epoch": 3.99, + "grad_norm": 3.3154234886169434, + "learning_rate": 0.0018405103668261563, + "loss": 2.0766, + "step": 2500 + }, + { + "epoch": 4.0, + "grad_norm": 1.4245625734329224, + "learning_rate": 0.001839872408293461, + "loss": 1.9103, + "step": 2510 + }, + { + "epoch": 4.02, + "grad_norm": 1.513168454170227, + "learning_rate": 0.0018392344497607657, + "loss": 1.4412, + "step": 2520 + }, + { + "epoch": 4.04, + "grad_norm": 4.4338507652282715, + "learning_rate": 0.0018385964912280702, + "loss": 1.3431, + "step": 2530 + }, + { + "epoch": 4.05, + "grad_norm": 4.030521869659424, + "learning_rate": 0.001837958532695375, + "loss": 1.4325, + "step": 2540 + }, + { + "epoch": 4.07, + "grad_norm": 4.0168137550354, + "learning_rate": 0.0018373205741626796, + "loss": 1.4083, + "step": 2550 + }, + { + "epoch": 4.08, + "grad_norm": 5.304862022399902, + "learning_rate": 0.0018366826156299841, + "loss": 1.5336, + "step": 2560 + }, + { + "epoch": 4.1, + "grad_norm": 3.5825703144073486, + "learning_rate": 0.0018360446570972887, + "loss": 1.4663, + "step": 2570 + }, + { + "epoch": 4.11, + "grad_norm": 3.8972997665405273, + "learning_rate": 0.0018354066985645935, + "loss": 1.5203, + "step": 2580 + }, + { + "epoch": 4.13, + "grad_norm": 5.68231725692749, + "learning_rate": 0.001834768740031898, + "loss": 1.6814, + "step": 2590 + }, + { + "epoch": 4.15, + "grad_norm": 3.8971197605133057, + "learning_rate": 0.0018341307814992026, + "loss": 1.3574, + "step": 2600 + }, + { + "epoch": 4.16, + "grad_norm": 3.819286346435547, + "learning_rate": 0.0018334928229665074, + "loss": 1.4937, + "step": 2610 + }, + { + "epoch": 4.18, + "grad_norm": 3.3106937408447266, + "learning_rate": 0.001832854864433812, + "loss": 1.5814, + "step": 2620 + }, + { + "epoch": 4.19, + "grad_norm": 5.2803754806518555, + "learning_rate": 0.0018322169059011165, + "loss": 1.641, + "step": 2630 + }, + { + "epoch": 4.21, + "grad_norm": 4.728196620941162, + "learning_rate": 0.001831578947368421, + "loss": 1.5647, + "step": 2640 + }, + { + "epoch": 4.23, + "grad_norm": 3.2671823501586914, + "learning_rate": 0.0018309409888357258, + "loss": 1.7673, + "step": 2650 + }, + { + "epoch": 4.24, + "grad_norm": 2.539050579071045, + "learning_rate": 0.0018303030303030304, + "loss": 1.5397, + "step": 2660 + }, + { + "epoch": 4.26, + "grad_norm": 2.7646982669830322, + "learning_rate": 0.001829665071770335, + "loss": 1.4788, + "step": 2670 + }, + { + "epoch": 4.27, + "grad_norm": 3.103675603866577, + "learning_rate": 0.0018290271132376395, + "loss": 1.5428, + "step": 2680 + }, + { + "epoch": 4.29, + "grad_norm": 5.560327053070068, + "learning_rate": 0.0018283891547049443, + "loss": 1.5389, + "step": 2690 + }, + { + "epoch": 4.31, + "grad_norm": 3.355659246444702, + "learning_rate": 0.0018277511961722489, + "loss": 1.6061, + "step": 2700 + }, + { + "epoch": 4.32, + "grad_norm": 5.579101085662842, + "learning_rate": 0.0018271132376395534, + "loss": 1.6554, + "step": 2710 + }, + { + "epoch": 4.34, + "grad_norm": 4.465839862823486, + "learning_rate": 0.0018264752791068582, + "loss": 1.7019, + "step": 2720 + }, + { + "epoch": 4.35, + "grad_norm": 2.9531333446502686, + "learning_rate": 0.0018258373205741628, + "loss": 1.6395, + "step": 2730 + }, + { + "epoch": 4.37, + "grad_norm": 3.912163257598877, + "learning_rate": 0.0018251993620414673, + "loss": 1.7232, + "step": 2740 + }, + { + "epoch": 4.39, + "grad_norm": 3.955035924911499, + "learning_rate": 0.001824561403508772, + "loss": 1.6601, + "step": 2750 + }, + { + "epoch": 4.4, + "grad_norm": 5.796784400939941, + "learning_rate": 0.0018239234449760767, + "loss": 1.7742, + "step": 2760 + }, + { + "epoch": 4.42, + "grad_norm": 3.470076322555542, + "learning_rate": 0.0018232854864433812, + "loss": 1.7226, + "step": 2770 + }, + { + "epoch": 4.43, + "grad_norm": 4.716192245483398, + "learning_rate": 0.0018226475279106858, + "loss": 1.6537, + "step": 2780 + }, + { + "epoch": 4.45, + "grad_norm": 4.586126327514648, + "learning_rate": 0.0018220095693779906, + "loss": 1.6319, + "step": 2790 + }, + { + "epoch": 4.47, + "grad_norm": 4.049830913543701, + "learning_rate": 0.0018213716108452952, + "loss": 1.8295, + "step": 2800 + }, + { + "epoch": 4.48, + "grad_norm": 2.4487478733062744, + "learning_rate": 0.0018207336523125997, + "loss": 1.9085, + "step": 2810 + }, + { + "epoch": 4.5, + "grad_norm": 3.3505730628967285, + "learning_rate": 0.0018200956937799043, + "loss": 1.7341, + "step": 2820 + }, + { + "epoch": 4.51, + "grad_norm": 3.655205011367798, + "learning_rate": 0.001819457735247209, + "loss": 1.7667, + "step": 2830 + }, + { + "epoch": 4.53, + "grad_norm": 4.730507850646973, + "learning_rate": 0.0018188197767145136, + "loss": 1.6444, + "step": 2840 + }, + { + "epoch": 4.55, + "grad_norm": 3.635011911392212, + "learning_rate": 0.0018181818181818182, + "loss": 1.7706, + "step": 2850 + }, + { + "epoch": 4.56, + "grad_norm": 2.98230242729187, + "learning_rate": 0.001817543859649123, + "loss": 1.7443, + "step": 2860 + }, + { + "epoch": 4.58, + "grad_norm": 2.706557035446167, + "learning_rate": 0.0018169059011164275, + "loss": 1.738, + "step": 2870 + }, + { + "epoch": 4.59, + "grad_norm": 5.715457439422607, + "learning_rate": 0.001816267942583732, + "loss": 1.9244, + "step": 2880 + }, + { + "epoch": 4.61, + "grad_norm": 4.379674911499023, + "learning_rate": 0.0018156299840510366, + "loss": 1.8655, + "step": 2890 + }, + { + "epoch": 4.63, + "grad_norm": 3.1540908813476562, + "learning_rate": 0.0018149920255183414, + "loss": 1.8485, + "step": 2900 + }, + { + "epoch": 4.64, + "grad_norm": 4.2252373695373535, + "learning_rate": 0.001814354066985646, + "loss": 1.8581, + "step": 2910 + }, + { + "epoch": 4.66, + "grad_norm": 2.00207781791687, + "learning_rate": 0.0018137161084529506, + "loss": 2.0877, + "step": 2920 + }, + { + "epoch": 4.67, + "grad_norm": 2.710052013397217, + "learning_rate": 0.0018130781499202553, + "loss": 1.6488, + "step": 2930 + }, + { + "epoch": 4.69, + "grad_norm": 5.69435453414917, + "learning_rate": 0.00181244019138756, + "loss": 1.8438, + "step": 2940 + }, + { + "epoch": 4.7, + "grad_norm": 3.163170576095581, + "learning_rate": 0.0018118022328548645, + "loss": 1.8168, + "step": 2950 + }, + { + "epoch": 4.72, + "grad_norm": 2.5819127559661865, + "learning_rate": 0.001811164274322169, + "loss": 1.8733, + "step": 2960 + }, + { + "epoch": 4.74, + "grad_norm": 3.780280351638794, + "learning_rate": 0.0018105263157894738, + "loss": 1.7007, + "step": 2970 + }, + { + "epoch": 4.75, + "grad_norm": 4.294229030609131, + "learning_rate": 0.0018098883572567784, + "loss": 1.838, + "step": 2980 + }, + { + "epoch": 4.77, + "grad_norm": 4.328463077545166, + "learning_rate": 0.001809250398724083, + "loss": 1.8721, + "step": 2990 + }, + { + "epoch": 4.78, + "grad_norm": 3.204005241394043, + "learning_rate": 0.0018086124401913875, + "loss": 1.8796, + "step": 3000 + }, + { + "epoch": 4.8, + "grad_norm": 6.089762210845947, + "learning_rate": 0.0018079744816586923, + "loss": 1.8764, + "step": 3010 + }, + { + "epoch": 4.82, + "grad_norm": 5.21134090423584, + "learning_rate": 0.0018073365231259968, + "loss": 1.8615, + "step": 3020 + }, + { + "epoch": 4.83, + "grad_norm": 5.567359924316406, + "learning_rate": 0.0018066985645933014, + "loss": 1.9394, + "step": 3030 + }, + { + "epoch": 4.85, + "grad_norm": 3.8925669193267822, + "learning_rate": 0.0018060606060606062, + "loss": 1.9556, + "step": 3040 + }, + { + "epoch": 4.86, + "grad_norm": 3.123612642288208, + "learning_rate": 0.0018054226475279107, + "loss": 1.8685, + "step": 3050 + }, + { + "epoch": 4.88, + "grad_norm": 3.970958709716797, + "learning_rate": 0.0018047846889952153, + "loss": 1.8955, + "step": 3060 + }, + { + "epoch": 4.9, + "grad_norm": 4.519131660461426, + "learning_rate": 0.0018041467304625199, + "loss": 1.9885, + "step": 3070 + }, + { + "epoch": 4.91, + "grad_norm": 3.834430456161499, + "learning_rate": 0.0018035087719298246, + "loss": 1.9572, + "step": 3080 + }, + { + "epoch": 4.93, + "grad_norm": 5.614201068878174, + "learning_rate": 0.0018028708133971292, + "loss": 1.9366, + "step": 3090 + }, + { + "epoch": 4.94, + "grad_norm": 3.8492119312286377, + "learning_rate": 0.0018022328548644338, + "loss": 1.8337, + "step": 3100 + }, + { + "epoch": 4.96, + "grad_norm": 5.122296333312988, + "learning_rate": 0.0018015948963317385, + "loss": 1.9093, + "step": 3110 + }, + { + "epoch": 4.98, + "grad_norm": 3.0235679149627686, + "learning_rate": 0.001800956937799043, + "loss": 1.8341, + "step": 3120 + }, + { + "epoch": 4.99, + "grad_norm": 3.4031426906585693, + "learning_rate": 0.0018003189792663477, + "loss": 1.8851, + "step": 3130 + }, + { + "epoch": 5.01, + "grad_norm": 3.6404995918273926, + "learning_rate": 0.0017996810207336522, + "loss": 1.4797, + "step": 3140 + }, + { + "epoch": 5.02, + "grad_norm": 3.4057798385620117, + "learning_rate": 0.001799043062200957, + "loss": 1.1585, + "step": 3150 + }, + { + "epoch": 5.04, + "grad_norm": 3.314164161682129, + "learning_rate": 0.0017984051036682616, + "loss": 1.3921, + "step": 3160 + }, + { + "epoch": 5.06, + "grad_norm": 4.028993606567383, + "learning_rate": 0.0017977671451355661, + "loss": 1.2938, + "step": 3170 + }, + { + "epoch": 5.07, + "grad_norm": 4.128094673156738, + "learning_rate": 0.001797129186602871, + "loss": 1.2979, + "step": 3180 + }, + { + "epoch": 5.09, + "grad_norm": 3.079228639602661, + "learning_rate": 0.0017964912280701755, + "loss": 1.292, + "step": 3190 + }, + { + "epoch": 5.1, + "grad_norm": 4.176467418670654, + "learning_rate": 0.00179585326953748, + "loss": 1.528, + "step": 3200 + }, + { + "epoch": 5.12, + "grad_norm": 3.689857244491577, + "learning_rate": 0.0017952153110047846, + "loss": 1.2832, + "step": 3210 + }, + { + "epoch": 5.14, + "grad_norm": 3.580005645751953, + "learning_rate": 0.0017945773524720894, + "loss": 1.3447, + "step": 3220 + }, + { + "epoch": 5.15, + "grad_norm": 3.8672592639923096, + "learning_rate": 0.001793939393939394, + "loss": 1.234, + "step": 3230 + }, + { + "epoch": 5.17, + "grad_norm": 3.5929276943206787, + "learning_rate": 0.0017933014354066985, + "loss": 1.2326, + "step": 3240 + }, + { + "epoch": 5.18, + "grad_norm": 3.1610376834869385, + "learning_rate": 0.0017926634768740033, + "loss": 1.3224, + "step": 3250 + }, + { + "epoch": 5.2, + "grad_norm": 3.908184289932251, + "learning_rate": 0.0017920255183413079, + "loss": 1.4446, + "step": 3260 + }, + { + "epoch": 5.22, + "grad_norm": 4.269443511962891, + "learning_rate": 0.0017913875598086124, + "loss": 1.4286, + "step": 3270 + }, + { + "epoch": 5.23, + "grad_norm": 3.0032732486724854, + "learning_rate": 0.001790749601275917, + "loss": 1.3768, + "step": 3280 + }, + { + "epoch": 5.25, + "grad_norm": 3.841958522796631, + "learning_rate": 0.0017901116427432218, + "loss": 1.3441, + "step": 3290 + }, + { + "epoch": 5.26, + "grad_norm": 3.0527617931365967, + "learning_rate": 0.0017894736842105263, + "loss": 1.4091, + "step": 3300 + }, + { + "epoch": 5.28, + "grad_norm": 3.266508102416992, + "learning_rate": 0.0017888357256778309, + "loss": 1.3933, + "step": 3310 + }, + { + "epoch": 5.3, + "grad_norm": 4.250580310821533, + "learning_rate": 0.0017881977671451357, + "loss": 1.5105, + "step": 3320 + }, + { + "epoch": 5.31, + "grad_norm": 3.375892162322998, + "learning_rate": 0.0017875598086124402, + "loss": 1.4668, + "step": 3330 + }, + { + "epoch": 5.33, + "grad_norm": 4.1522297859191895, + "learning_rate": 0.0017869218500797448, + "loss": 1.4022, + "step": 3340 + }, + { + "epoch": 5.34, + "grad_norm": 5.130900859832764, + "learning_rate": 0.0017862838915470493, + "loss": 1.4457, + "step": 3350 + }, + { + "epoch": 5.36, + "grad_norm": 3.176265239715576, + "learning_rate": 0.0017856459330143541, + "loss": 1.5113, + "step": 3360 + }, + { + "epoch": 5.37, + "grad_norm": 5.0800557136535645, + "learning_rate": 0.0017850079744816587, + "loss": 1.5258, + "step": 3370 + }, + { + "epoch": 5.39, + "grad_norm": 4.9642534255981445, + "learning_rate": 0.0017843700159489633, + "loss": 1.4101, + "step": 3380 + }, + { + "epoch": 5.41, + "grad_norm": 4.7204270362854, + "learning_rate": 0.0017837320574162678, + "loss": 1.5812, + "step": 3390 + }, + { + "epoch": 5.42, + "grad_norm": 3.163360834121704, + "learning_rate": 0.0017830940988835726, + "loss": 1.5647, + "step": 3400 + }, + { + "epoch": 5.44, + "grad_norm": 5.122838973999023, + "learning_rate": 0.0017824561403508772, + "loss": 1.5151, + "step": 3410 + }, + { + "epoch": 5.45, + "grad_norm": 3.543826103210449, + "learning_rate": 0.0017818181818181817, + "loss": 1.4134, + "step": 3420 + }, + { + "epoch": 5.47, + "grad_norm": 3.4644534587860107, + "learning_rate": 0.0017811802232854865, + "loss": 1.6267, + "step": 3430 + }, + { + "epoch": 5.49, + "grad_norm": 4.8260040283203125, + "learning_rate": 0.001780542264752791, + "loss": 1.4154, + "step": 3440 + }, + { + "epoch": 5.5, + "grad_norm": 2.8876681327819824, + "learning_rate": 0.0017799043062200956, + "loss": 1.675, + "step": 3450 + }, + { + "epoch": 5.52, + "grad_norm": 2.8691539764404297, + "learning_rate": 0.0017792663476874002, + "loss": 1.6627, + "step": 3460 + }, + { + "epoch": 5.53, + "grad_norm": 4.810047626495361, + "learning_rate": 0.001778628389154705, + "loss": 1.4778, + "step": 3470 + }, + { + "epoch": 5.55, + "grad_norm": 6.102086067199707, + "learning_rate": 0.0017779904306220095, + "loss": 1.597, + "step": 3480 + }, + { + "epoch": 5.57, + "grad_norm": 2.5562939643859863, + "learning_rate": 0.001777352472089314, + "loss": 1.6155, + "step": 3490 + }, + { + "epoch": 5.58, + "grad_norm": 4.305008888244629, + "learning_rate": 0.0017767145135566189, + "loss": 1.6084, + "step": 3500 + }, + { + "epoch": 5.6, + "grad_norm": 3.545440673828125, + "learning_rate": 0.0017760765550239234, + "loss": 1.7705, + "step": 3510 + }, + { + "epoch": 5.61, + "grad_norm": 3.9225101470947266, + "learning_rate": 0.001775438596491228, + "loss": 1.7765, + "step": 3520 + }, + { + "epoch": 5.63, + "grad_norm": 3.6406924724578857, + "learning_rate": 0.0017748006379585326, + "loss": 1.5835, + "step": 3530 + }, + { + "epoch": 5.65, + "grad_norm": 3.9222354888916016, + "learning_rate": 0.0017741626794258373, + "loss": 1.625, + "step": 3540 + }, + { + "epoch": 5.66, + "grad_norm": 3.7696895599365234, + "learning_rate": 0.001773524720893142, + "loss": 1.7595, + "step": 3550 + }, + { + "epoch": 5.68, + "grad_norm": 3.927811622619629, + "learning_rate": 0.0017728867623604465, + "loss": 1.5485, + "step": 3560 + }, + { + "epoch": 5.69, + "grad_norm": 5.417560577392578, + "learning_rate": 0.0017722488038277512, + "loss": 1.7188, + "step": 3570 + }, + { + "epoch": 5.71, + "grad_norm": 3.9058241844177246, + "learning_rate": 0.0017716108452950558, + "loss": 1.5654, + "step": 3580 + }, + { + "epoch": 5.73, + "grad_norm": 3.2079246044158936, + "learning_rate": 0.0017709728867623604, + "loss": 1.8261, + "step": 3590 + }, + { + "epoch": 5.74, + "grad_norm": 3.8155903816223145, + "learning_rate": 0.001770334928229665, + "loss": 1.5338, + "step": 3600 + }, + { + "epoch": 5.76, + "grad_norm": 4.09771203994751, + "learning_rate": 0.0017696969696969697, + "loss": 1.7654, + "step": 3610 + }, + { + "epoch": 5.77, + "grad_norm": 3.3263423442840576, + "learning_rate": 0.0017690590111642743, + "loss": 1.5809, + "step": 3620 + }, + { + "epoch": 5.79, + "grad_norm": 4.113112926483154, + "learning_rate": 0.0017684210526315788, + "loss": 1.7932, + "step": 3630 + }, + { + "epoch": 5.81, + "grad_norm": 4.0192694664001465, + "learning_rate": 0.0017677830940988836, + "loss": 1.7115, + "step": 3640 + }, + { + "epoch": 5.82, + "grad_norm": 3.165609121322632, + "learning_rate": 0.0017671451355661882, + "loss": 1.4939, + "step": 3650 + }, + { + "epoch": 5.84, + "grad_norm": 3.859196186065674, + "learning_rate": 0.0017665071770334927, + "loss": 1.7767, + "step": 3660 + }, + { + "epoch": 5.85, + "grad_norm": 4.11074686050415, + "learning_rate": 0.0017658692185007973, + "loss": 1.6754, + "step": 3670 + }, + { + "epoch": 5.87, + "grad_norm": 2.926147937774658, + "learning_rate": 0.001765231259968102, + "loss": 1.6904, + "step": 3680 + }, + { + "epoch": 5.89, + "grad_norm": 4.121160507202148, + "learning_rate": 0.0017645933014354066, + "loss": 1.575, + "step": 3690 + }, + { + "epoch": 5.9, + "grad_norm": 4.0827131271362305, + "learning_rate": 0.0017639553429027112, + "loss": 1.6392, + "step": 3700 + }, + { + "epoch": 5.92, + "grad_norm": 4.13917875289917, + "learning_rate": 0.0017633173843700158, + "loss": 1.6323, + "step": 3710 + }, + { + "epoch": 5.93, + "grad_norm": 3.052493095397949, + "learning_rate": 0.0017626794258373206, + "loss": 1.8932, + "step": 3720 + }, + { + "epoch": 5.95, + "grad_norm": 5.432674407958984, + "learning_rate": 0.0017620414673046251, + "loss": 1.58, + "step": 3730 + }, + { + "epoch": 5.96, + "grad_norm": 6.6524505615234375, + "learning_rate": 0.0017614035087719297, + "loss": 1.6361, + "step": 3740 + }, + { + "epoch": 5.98, + "grad_norm": 7.488154888153076, + "learning_rate": 0.0017607655502392347, + "loss": 1.823, + "step": 3750 + }, + { + "epoch": 6.0, + "grad_norm": 3.211604118347168, + "learning_rate": 0.0017601275917065392, + "loss": 1.8265, + "step": 3760 + }, + { + "epoch": 6.01, + "grad_norm": 2.5021958351135254, + "learning_rate": 0.0017594896331738438, + "loss": 1.3958, + "step": 3770 + }, + { + "epoch": 6.03, + "grad_norm": 3.7297511100769043, + "learning_rate": 0.0017588516746411484, + "loss": 1.1193, + "step": 3780 + }, + { + "epoch": 6.04, + "grad_norm": 4.050276279449463, + "learning_rate": 0.0017582137161084531, + "loss": 1.2042, + "step": 3790 + }, + { + "epoch": 6.06, + "grad_norm": 4.484896659851074, + "learning_rate": 0.0017575757575757577, + "loss": 1.2483, + "step": 3800 + }, + { + "epoch": 6.08, + "grad_norm": 7.920963764190674, + "learning_rate": 0.0017569377990430623, + "loss": 1.2832, + "step": 3810 + }, + { + "epoch": 6.09, + "grad_norm": 2.772211790084839, + "learning_rate": 0.001756299840510367, + "loss": 1.1111, + "step": 3820 + }, + { + "epoch": 6.11, + "grad_norm": 3.2087087631225586, + "learning_rate": 0.0017556618819776716, + "loss": 1.108, + "step": 3830 + }, + { + "epoch": 6.12, + "grad_norm": 3.650775194168091, + "learning_rate": 0.0017550239234449762, + "loss": 1.2609, + "step": 3840 + }, + { + "epoch": 6.14, + "grad_norm": 3.6753830909729004, + "learning_rate": 0.0017543859649122807, + "loss": 1.1581, + "step": 3850 + }, + { + "epoch": 6.16, + "grad_norm": 3.568274974822998, + "learning_rate": 0.0017537480063795855, + "loss": 1.2661, + "step": 3860 + }, + { + "epoch": 6.17, + "grad_norm": 3.6179471015930176, + "learning_rate": 0.00175311004784689, + "loss": 1.092, + "step": 3870 + }, + { + "epoch": 6.19, + "grad_norm": 2.885768413543701, + "learning_rate": 0.0017524720893141946, + "loss": 1.1903, + "step": 3880 + }, + { + "epoch": 6.2, + "grad_norm": 2.389308214187622, + "learning_rate": 0.0017518341307814994, + "loss": 1.1977, + "step": 3890 + }, + { + "epoch": 6.22, + "grad_norm": 4.820352554321289, + "learning_rate": 0.001751196172248804, + "loss": 1.3104, + "step": 3900 + }, + { + "epoch": 6.24, + "grad_norm": 2.8304367065429688, + "learning_rate": 0.0017505582137161085, + "loss": 1.3522, + "step": 3910 + }, + { + "epoch": 6.25, + "grad_norm": 4.1842732429504395, + "learning_rate": 0.001749920255183413, + "loss": 1.2901, + "step": 3920 + }, + { + "epoch": 6.27, + "grad_norm": 4.698485851287842, + "learning_rate": 0.0017492822966507179, + "loss": 1.2261, + "step": 3930 + }, + { + "epoch": 6.28, + "grad_norm": 5.434518814086914, + "learning_rate": 0.0017486443381180224, + "loss": 1.3348, + "step": 3940 + }, + { + "epoch": 6.3, + "grad_norm": 4.726064682006836, + "learning_rate": 0.001748006379585327, + "loss": 1.308, + "step": 3950 + }, + { + "epoch": 6.32, + "grad_norm": 3.2794930934906006, + "learning_rate": 0.0017473684210526318, + "loss": 1.232, + "step": 3960 + }, + { + "epoch": 6.33, + "grad_norm": 4.248810768127441, + "learning_rate": 0.0017467304625199364, + "loss": 1.2118, + "step": 3970 + }, + { + "epoch": 6.35, + "grad_norm": 5.226914882659912, + "learning_rate": 0.001746092503987241, + "loss": 1.273, + "step": 3980 + }, + { + "epoch": 6.36, + "grad_norm": 4.917492866516113, + "learning_rate": 0.0017454545454545455, + "loss": 1.2566, + "step": 3990 + }, + { + "epoch": 6.38, + "grad_norm": 6.164140224456787, + "learning_rate": 0.0017448165869218503, + "loss": 1.4062, + "step": 4000 + }, + { + "epoch": 6.4, + "grad_norm": 4.235147953033447, + "learning_rate": 0.0017441786283891548, + "loss": 1.2668, + "step": 4010 + }, + { + "epoch": 6.41, + "grad_norm": 4.627178192138672, + "learning_rate": 0.0017435406698564594, + "loss": 1.2448, + "step": 4020 + }, + { + "epoch": 6.43, + "grad_norm": 5.603235721588135, + "learning_rate": 0.0017429027113237642, + "loss": 1.4769, + "step": 4030 + }, + { + "epoch": 6.44, + "grad_norm": 2.3861303329467773, + "learning_rate": 0.0017422647527910687, + "loss": 1.6294, + "step": 4040 + }, + { + "epoch": 6.46, + "grad_norm": 3.891209840774536, + "learning_rate": 0.0017416267942583733, + "loss": 1.3206, + "step": 4050 + }, + { + "epoch": 6.48, + "grad_norm": 2.741506576538086, + "learning_rate": 0.0017409888357256779, + "loss": 1.3517, + "step": 4060 + }, + { + "epoch": 6.49, + "grad_norm": 3.152433156967163, + "learning_rate": 0.0017403508771929826, + "loss": 1.2853, + "step": 4070 + }, + { + "epoch": 6.51, + "grad_norm": 3.4589314460754395, + "learning_rate": 0.0017397129186602872, + "loss": 1.4094, + "step": 4080 + }, + { + "epoch": 6.52, + "grad_norm": 6.630537033081055, + "learning_rate": 0.0017390749601275918, + "loss": 1.3614, + "step": 4090 + }, + { + "epoch": 6.54, + "grad_norm": 3.220771551132202, + "learning_rate": 0.0017384370015948963, + "loss": 1.3354, + "step": 4100 + }, + { + "epoch": 6.56, + "grad_norm": 2.8003170490264893, + "learning_rate": 0.001737799043062201, + "loss": 1.398, + "step": 4110 + }, + { + "epoch": 6.57, + "grad_norm": 5.145318984985352, + "learning_rate": 0.0017371610845295057, + "loss": 1.4861, + "step": 4120 + }, + { + "epoch": 6.59, + "grad_norm": 3.6889803409576416, + "learning_rate": 0.0017365231259968102, + "loss": 1.3687, + "step": 4130 + }, + { + "epoch": 6.6, + "grad_norm": 3.3676440715789795, + "learning_rate": 0.001735885167464115, + "loss": 1.3656, + "step": 4140 + }, + { + "epoch": 6.62, + "grad_norm": 4.406673431396484, + "learning_rate": 0.0017352472089314196, + "loss": 1.4002, + "step": 4150 + }, + { + "epoch": 6.63, + "grad_norm": 4.088317394256592, + "learning_rate": 0.0017346092503987241, + "loss": 1.4265, + "step": 4160 + }, + { + "epoch": 6.65, + "grad_norm": 4.677865982055664, + "learning_rate": 0.0017339712918660287, + "loss": 1.5231, + "step": 4170 + }, + { + "epoch": 6.67, + "grad_norm": 4.6024322509765625, + "learning_rate": 0.0017333333333333335, + "loss": 1.5746, + "step": 4180 + }, + { + "epoch": 6.68, + "grad_norm": 4.752773284912109, + "learning_rate": 0.001732695374800638, + "loss": 1.3851, + "step": 4190 + }, + { + "epoch": 6.7, + "grad_norm": 3.870704412460327, + "learning_rate": 0.0017320574162679426, + "loss": 1.4281, + "step": 4200 + }, + { + "epoch": 6.71, + "grad_norm": 3.5712807178497314, + "learning_rate": 0.0017314194577352474, + "loss": 1.6449, + "step": 4210 + }, + { + "epoch": 6.73, + "grad_norm": 3.4372332096099854, + "learning_rate": 0.001730781499202552, + "loss": 1.5439, + "step": 4220 + }, + { + "epoch": 6.75, + "grad_norm": 5.638207912445068, + "learning_rate": 0.0017301435406698565, + "loss": 1.5426, + "step": 4230 + }, + { + "epoch": 6.76, + "grad_norm": 5.095453262329102, + "learning_rate": 0.001729505582137161, + "loss": 1.4252, + "step": 4240 + }, + { + "epoch": 6.78, + "grad_norm": 2.4728281497955322, + "learning_rate": 0.0017288676236044658, + "loss": 1.4374, + "step": 4250 + }, + { + "epoch": 6.79, + "grad_norm": 3.4558870792388916, + "learning_rate": 0.0017282296650717704, + "loss": 1.4457, + "step": 4260 + }, + { + "epoch": 6.81, + "grad_norm": 3.5765767097473145, + "learning_rate": 0.001727591706539075, + "loss": 1.5364, + "step": 4270 + }, + { + "epoch": 6.83, + "grad_norm": 4.479535102844238, + "learning_rate": 0.0017269537480063797, + "loss": 1.561, + "step": 4280 + }, + { + "epoch": 6.84, + "grad_norm": 3.1493709087371826, + "learning_rate": 0.0017263157894736843, + "loss": 1.4131, + "step": 4290 + }, + { + "epoch": 6.86, + "grad_norm": 4.1836042404174805, + "learning_rate": 0.0017256778309409889, + "loss": 1.5031, + "step": 4300 + }, + { + "epoch": 6.87, + "grad_norm": 3.2860119342803955, + "learning_rate": 0.0017250398724082934, + "loss": 1.6286, + "step": 4310 + }, + { + "epoch": 6.89, + "grad_norm": 2.8824214935302734, + "learning_rate": 0.0017244019138755982, + "loss": 1.5281, + "step": 4320 + }, + { + "epoch": 6.91, + "grad_norm": 5.243397235870361, + "learning_rate": 0.0017237639553429028, + "loss": 1.5868, + "step": 4330 + }, + { + "epoch": 6.92, + "grad_norm": 2.8732147216796875, + "learning_rate": 0.0017231259968102073, + "loss": 1.5813, + "step": 4340 + }, + { + "epoch": 6.94, + "grad_norm": 4.3689494132995605, + "learning_rate": 0.0017224880382775121, + "loss": 1.5706, + "step": 4350 + }, + { + "epoch": 6.95, + "grad_norm": 4.520773887634277, + "learning_rate": 0.0017218500797448167, + "loss": 1.5769, + "step": 4360 + }, + { + "epoch": 6.97, + "grad_norm": 3.988919734954834, + "learning_rate": 0.0017212121212121212, + "loss": 1.6688, + "step": 4370 + }, + { + "epoch": 6.99, + "grad_norm": 3.1639842987060547, + "learning_rate": 0.0017205741626794258, + "loss": 1.5045, + "step": 4380 + }, + { + "epoch": 7.0, + "grad_norm": 1.8472672700881958, + "learning_rate": 0.0017199362041467306, + "loss": 1.4904, + "step": 4390 + }, + { + "epoch": 7.02, + "grad_norm": 3.472080945968628, + "learning_rate": 0.0017192982456140352, + "loss": 1.0386, + "step": 4400 + }, + { + "epoch": 7.03, + "grad_norm": 3.553772211074829, + "learning_rate": 0.0017186602870813397, + "loss": 0.9889, + "step": 4410 + }, + { + "epoch": 7.05, + "grad_norm": 4.417268753051758, + "learning_rate": 0.0017180223285486443, + "loss": 0.9838, + "step": 4420 + }, + { + "epoch": 7.07, + "grad_norm": 5.340514659881592, + "learning_rate": 0.001717384370015949, + "loss": 1.0057, + "step": 4430 + }, + { + "epoch": 7.08, + "grad_norm": 3.2239015102386475, + "learning_rate": 0.0017167464114832536, + "loss": 1.1003, + "step": 4440 + }, + { + "epoch": 7.1, + "grad_norm": 3.5991039276123047, + "learning_rate": 0.0017161084529505582, + "loss": 1.0384, + "step": 4450 + }, + { + "epoch": 7.11, + "grad_norm": 3.8520448207855225, + "learning_rate": 0.001715470494417863, + "loss": 0.9872, + "step": 4460 + }, + { + "epoch": 7.13, + "grad_norm": 3.489706516265869, + "learning_rate": 0.0017148325358851675, + "loss": 1.1452, + "step": 4470 + }, + { + "epoch": 7.15, + "grad_norm": 2.60661244392395, + "learning_rate": 0.001714194577352472, + "loss": 1.0007, + "step": 4480 + }, + { + "epoch": 7.16, + "grad_norm": 5.66582727432251, + "learning_rate": 0.0017135566188197766, + "loss": 1.1072, + "step": 4490 + }, + { + "epoch": 7.18, + "grad_norm": 4.794973373413086, + "learning_rate": 0.0017129186602870814, + "loss": 1.1994, + "step": 4500 + }, + { + "epoch": 7.19, + "grad_norm": 5.310514450073242, + "learning_rate": 0.001712280701754386, + "loss": 1.104, + "step": 4510 + }, + { + "epoch": 7.21, + "grad_norm": 3.0956227779388428, + "learning_rate": 0.0017116427432216906, + "loss": 1.0067, + "step": 4520 + }, + { + "epoch": 7.22, + "grad_norm": 4.637990474700928, + "learning_rate": 0.0017110047846889953, + "loss": 1.0235, + "step": 4530 + }, + { + "epoch": 7.24, + "grad_norm": 2.7805848121643066, + "learning_rate": 0.0017103668261563, + "loss": 1.2051, + "step": 4540 + }, + { + "epoch": 7.26, + "grad_norm": 4.313024520874023, + "learning_rate": 0.0017097288676236045, + "loss": 1.1176, + "step": 4550 + }, + { + "epoch": 7.27, + "grad_norm": 3.135601282119751, + "learning_rate": 0.001709090909090909, + "loss": 1.2627, + "step": 4560 + }, + { + "epoch": 7.29, + "grad_norm": 3.0150604248046875, + "learning_rate": 0.0017084529505582138, + "loss": 1.0905, + "step": 4570 + }, + { + "epoch": 7.3, + "grad_norm": 3.5915615558624268, + "learning_rate": 0.0017078149920255184, + "loss": 1.0766, + "step": 4580 + }, + { + "epoch": 7.32, + "grad_norm": 3.981519937515259, + "learning_rate": 0.001707177033492823, + "loss": 1.2256, + "step": 4590 + }, + { + "epoch": 7.34, + "grad_norm": 3.5225601196289062, + "learning_rate": 0.0017065390749601277, + "loss": 1.2533, + "step": 4600 + }, + { + "epoch": 7.35, + "grad_norm": 4.971328258514404, + "learning_rate": 0.0017059011164274323, + "loss": 1.2287, + "step": 4610 + }, + { + "epoch": 7.37, + "grad_norm": 3.6815969944000244, + "learning_rate": 0.0017052631578947368, + "loss": 1.1224, + "step": 4620 + }, + { + "epoch": 7.38, + "grad_norm": 2.50472354888916, + "learning_rate": 0.0017046251993620414, + "loss": 1.3059, + "step": 4630 + }, + { + "epoch": 7.4, + "grad_norm": 2.376018524169922, + "learning_rate": 0.0017039872408293462, + "loss": 1.3294, + "step": 4640 + }, + { + "epoch": 7.42, + "grad_norm": 3.935692548751831, + "learning_rate": 0.0017033492822966507, + "loss": 1.3172, + "step": 4650 + }, + { + "epoch": 7.43, + "grad_norm": 2.974992513656616, + "learning_rate": 0.0017027113237639553, + "loss": 1.2235, + "step": 4660 + }, + { + "epoch": 7.45, + "grad_norm": 6.238065242767334, + "learning_rate": 0.00170207336523126, + "loss": 1.2355, + "step": 4670 + }, + { + "epoch": 7.46, + "grad_norm": 4.15529727935791, + "learning_rate": 0.0017014354066985646, + "loss": 1.1975, + "step": 4680 + }, + { + "epoch": 7.48, + "grad_norm": 3.069063663482666, + "learning_rate": 0.0017007974481658692, + "loss": 1.0709, + "step": 4690 + }, + { + "epoch": 7.5, + "grad_norm": 3.6260762214660645, + "learning_rate": 0.0017001594896331738, + "loss": 1.2178, + "step": 4700 + }, + { + "epoch": 7.51, + "grad_norm": 3.0013301372528076, + "learning_rate": 0.0016995215311004785, + "loss": 1.1398, + "step": 4710 + }, + { + "epoch": 7.53, + "grad_norm": 2.0015666484832764, + "learning_rate": 0.001698883572567783, + "loss": 1.3866, + "step": 4720 + }, + { + "epoch": 7.54, + "grad_norm": 3.997130870819092, + "learning_rate": 0.0016982456140350877, + "loss": 1.3066, + "step": 4730 + }, + { + "epoch": 7.56, + "grad_norm": 3.5671958923339844, + "learning_rate": 0.0016976076555023924, + "loss": 1.0524, + "step": 4740 + }, + { + "epoch": 7.58, + "grad_norm": 2.9513649940490723, + "learning_rate": 0.001696969696969697, + "loss": 1.173, + "step": 4750 + }, + { + "epoch": 7.59, + "grad_norm": 3.9709384441375732, + "learning_rate": 0.0016963317384370016, + "loss": 1.4574, + "step": 4760 + }, + { + "epoch": 7.61, + "grad_norm": 4.372689723968506, + "learning_rate": 0.0016956937799043061, + "loss": 1.4686, + "step": 4770 + }, + { + "epoch": 7.62, + "grad_norm": 3.748054265975952, + "learning_rate": 0.001695055821371611, + "loss": 1.4393, + "step": 4780 + }, + { + "epoch": 7.64, + "grad_norm": 3.7790236473083496, + "learning_rate": 0.0016944178628389155, + "loss": 1.2965, + "step": 4790 + }, + { + "epoch": 7.66, + "grad_norm": 4.572340965270996, + "learning_rate": 0.00169377990430622, + "loss": 1.5793, + "step": 4800 + }, + { + "epoch": 7.67, + "grad_norm": 3.838794231414795, + "learning_rate": 0.0016931419457735246, + "loss": 1.2758, + "step": 4810 + }, + { + "epoch": 7.69, + "grad_norm": 3.9073917865753174, + "learning_rate": 0.0016925039872408294, + "loss": 1.2657, + "step": 4820 + }, + { + "epoch": 7.7, + "grad_norm": 3.6725800037384033, + "learning_rate": 0.001691866028708134, + "loss": 1.1664, + "step": 4830 + }, + { + "epoch": 7.72, + "grad_norm": 2.742488384246826, + "learning_rate": 0.0016912280701754385, + "loss": 1.3705, + "step": 4840 + }, + { + "epoch": 7.74, + "grad_norm": 5.307029724121094, + "learning_rate": 0.0016905901116427433, + "loss": 1.3447, + "step": 4850 + }, + { + "epoch": 7.75, + "grad_norm": 3.2814066410064697, + "learning_rate": 0.0016899521531100479, + "loss": 1.2705, + "step": 4860 + }, + { + "epoch": 7.77, + "grad_norm": 4.674114227294922, + "learning_rate": 0.0016893141945773524, + "loss": 1.3062, + "step": 4870 + }, + { + "epoch": 7.78, + "grad_norm": 3.455000638961792, + "learning_rate": 0.001688676236044657, + "loss": 1.3788, + "step": 4880 + }, + { + "epoch": 7.8, + "grad_norm": 3.7969977855682373, + "learning_rate": 0.0016880382775119618, + "loss": 1.4107, + "step": 4890 + }, + { + "epoch": 7.81, + "grad_norm": 4.002437591552734, + "learning_rate": 0.0016874003189792663, + "loss": 1.3215, + "step": 4900 + }, + { + "epoch": 7.83, + "grad_norm": 2.509416103363037, + "learning_rate": 0.0016867623604465709, + "loss": 1.2652, + "step": 4910 + }, + { + "epoch": 7.85, + "grad_norm": 2.7716715335845947, + "learning_rate": 0.0016861244019138757, + "loss": 1.4095, + "step": 4920 + }, + { + "epoch": 7.86, + "grad_norm": 5.537817001342773, + "learning_rate": 0.0016854864433811802, + "loss": 1.2998, + "step": 4930 + }, + { + "epoch": 7.88, + "grad_norm": 3.2739720344543457, + "learning_rate": 0.0016848484848484848, + "loss": 1.3311, + "step": 4940 + }, + { + "epoch": 7.89, + "grad_norm": 3.1102712154388428, + "learning_rate": 0.0016842105263157893, + "loss": 1.2811, + "step": 4950 + }, + { + "epoch": 7.91, + "grad_norm": 4.807369709014893, + "learning_rate": 0.0016835725677830941, + "loss": 1.4339, + "step": 4960 + }, + { + "epoch": 7.93, + "grad_norm": 8.400796890258789, + "learning_rate": 0.0016829346092503987, + "loss": 1.5323, + "step": 4970 + }, + { + "epoch": 7.94, + "grad_norm": 3.6073365211486816, + "learning_rate": 0.0016822966507177033, + "loss": 1.5019, + "step": 4980 + }, + { + "epoch": 7.96, + "grad_norm": 3.30039644241333, + "learning_rate": 0.001681658692185008, + "loss": 1.4698, + "step": 4990 + }, + { + "epoch": 7.97, + "grad_norm": 3.7990474700927734, + "learning_rate": 0.0016810207336523126, + "loss": 1.5408, + "step": 5000 + }, + { + "epoch": 7.99, + "grad_norm": 4.0094499588012695, + "learning_rate": 0.0016803827751196172, + "loss": 1.4195, + "step": 5010 + }, + { + "epoch": 8.01, + "grad_norm": 2.0265750885009766, + "learning_rate": 0.0016797448165869217, + "loss": 1.181, + "step": 5020 + }, + { + "epoch": 8.02, + "grad_norm": 2.840583086013794, + "learning_rate": 0.0016791068580542265, + "loss": 0.8481, + "step": 5030 + }, + { + "epoch": 8.04, + "grad_norm": 2.409465789794922, + "learning_rate": 0.001678468899521531, + "loss": 0.9815, + "step": 5040 + }, + { + "epoch": 8.05, + "grad_norm": 5.790297031402588, + "learning_rate": 0.0016778309409888356, + "loss": 0.9621, + "step": 5050 + }, + { + "epoch": 8.07, + "grad_norm": 2.9008841514587402, + "learning_rate": 0.0016771929824561404, + "loss": 0.9734, + "step": 5060 + }, + { + "epoch": 8.09, + "grad_norm": 2.495950937271118, + "learning_rate": 0.001676555023923445, + "loss": 0.9353, + "step": 5070 + }, + { + "epoch": 8.1, + "grad_norm": 3.870645523071289, + "learning_rate": 0.0016759170653907495, + "loss": 1.023, + "step": 5080 + }, + { + "epoch": 8.12, + "grad_norm": 2.352860450744629, + "learning_rate": 0.001675279106858054, + "loss": 0.9064, + "step": 5090 + }, + { + "epoch": 8.13, + "grad_norm": 3.9795637130737305, + "learning_rate": 0.0016746411483253589, + "loss": 1.0378, + "step": 5100 + }, + { + "epoch": 8.15, + "grad_norm": 3.728628396987915, + "learning_rate": 0.0016740031897926634, + "loss": 0.8394, + "step": 5110 + }, + { + "epoch": 8.17, + "grad_norm": 4.232802391052246, + "learning_rate": 0.001673365231259968, + "loss": 1.0115, + "step": 5120 + }, + { + "epoch": 8.18, + "grad_norm": 4.09517765045166, + "learning_rate": 0.0016727272727272726, + "loss": 1.0139, + "step": 5130 + }, + { + "epoch": 8.2, + "grad_norm": 2.101757287979126, + "learning_rate": 0.0016720893141945773, + "loss": 1.0698, + "step": 5140 + }, + { + "epoch": 8.21, + "grad_norm": 4.124992370605469, + "learning_rate": 0.001671451355661882, + "loss": 1.0214, + "step": 5150 + }, + { + "epoch": 8.23, + "grad_norm": 4.000357151031494, + "learning_rate": 0.0016708133971291865, + "loss": 1.0302, + "step": 5160 + }, + { + "epoch": 8.25, + "grad_norm": 4.556628704071045, + "learning_rate": 0.0016701754385964912, + "loss": 1.1185, + "step": 5170 + }, + { + "epoch": 8.26, + "grad_norm": 4.288385391235352, + "learning_rate": 0.0016695374800637958, + "loss": 1.1112, + "step": 5180 + }, + { + "epoch": 8.28, + "grad_norm": 3.5157744884490967, + "learning_rate": 0.0016688995215311004, + "loss": 1.0085, + "step": 5190 + }, + { + "epoch": 8.29, + "grad_norm": 4.492936611175537, + "learning_rate": 0.001668261562998405, + "loss": 0.9928, + "step": 5200 + }, + { + "epoch": 8.31, + "grad_norm": 2.415928363800049, + "learning_rate": 0.0016676236044657097, + "loss": 1.0586, + "step": 5210 + }, + { + "epoch": 8.33, + "grad_norm": 4.508685111999512, + "learning_rate": 0.0016669856459330143, + "loss": 1.1178, + "step": 5220 + }, + { + "epoch": 8.34, + "grad_norm": 6.090748310089111, + "learning_rate": 0.0016663476874003188, + "loss": 1.0743, + "step": 5230 + }, + { + "epoch": 8.36, + "grad_norm": 4.638314723968506, + "learning_rate": 0.0016657097288676238, + "loss": 1.1763, + "step": 5240 + }, + { + "epoch": 8.37, + "grad_norm": 3.5104875564575195, + "learning_rate": 0.0016650717703349284, + "loss": 1.0157, + "step": 5250 + }, + { + "epoch": 8.39, + "grad_norm": 3.11543869972229, + "learning_rate": 0.001664433811802233, + "loss": 1.1719, + "step": 5260 + }, + { + "epoch": 8.41, + "grad_norm": 3.698253631591797, + "learning_rate": 0.0016637958532695375, + "loss": 1.3834, + "step": 5270 + }, + { + "epoch": 8.42, + "grad_norm": 4.070870876312256, + "learning_rate": 0.0016631578947368423, + "loss": 1.2485, + "step": 5280 + }, + { + "epoch": 8.44, + "grad_norm": 3.328082323074341, + "learning_rate": 0.0016625199362041469, + "loss": 1.0835, + "step": 5290 + }, + { + "epoch": 8.45, + "grad_norm": 4.319711208343506, + "learning_rate": 0.0016618819776714514, + "loss": 1.1718, + "step": 5300 + }, + { + "epoch": 8.47, + "grad_norm": 3.040421485900879, + "learning_rate": 0.0016612440191387562, + "loss": 1.1825, + "step": 5310 + }, + { + "epoch": 8.48, + "grad_norm": 3.8478896617889404, + "learning_rate": 0.0016606060606060608, + "loss": 1.2003, + "step": 5320 + }, + { + "epoch": 8.5, + "grad_norm": 3.5098345279693604, + "learning_rate": 0.0016599681020733653, + "loss": 1.3229, + "step": 5330 + }, + { + "epoch": 8.52, + "grad_norm": 1.7316700220108032, + "learning_rate": 0.00165933014354067, + "loss": 1.1779, + "step": 5340 + }, + { + "epoch": 8.53, + "grad_norm": 2.3097381591796875, + "learning_rate": 0.0016586921850079747, + "loss": 1.0794, + "step": 5350 + }, + { + "epoch": 8.55, + "grad_norm": 2.1922430992126465, + "learning_rate": 0.0016580542264752792, + "loss": 1.2068, + "step": 5360 + }, + { + "epoch": 8.56, + "grad_norm": 5.0043864250183105, + "learning_rate": 0.0016574162679425838, + "loss": 1.2837, + "step": 5370 + }, + { + "epoch": 8.58, + "grad_norm": 4.01829195022583, + "learning_rate": 0.0016567783094098886, + "loss": 1.0479, + "step": 5380 + }, + { + "epoch": 8.6, + "grad_norm": 3.4249794483184814, + "learning_rate": 0.0016561403508771931, + "loss": 1.3172, + "step": 5390 + }, + { + "epoch": 8.61, + "grad_norm": 3.52347993850708, + "learning_rate": 0.0016555023923444977, + "loss": 1.2796, + "step": 5400 + }, + { + "epoch": 8.63, + "grad_norm": 3.3641083240509033, + "learning_rate": 0.0016548644338118023, + "loss": 1.2373, + "step": 5410 + }, + { + "epoch": 8.64, + "grad_norm": 4.422176361083984, + "learning_rate": 0.001654226475279107, + "loss": 1.1601, + "step": 5420 + }, + { + "epoch": 8.66, + "grad_norm": 4.531619071960449, + "learning_rate": 0.0016535885167464116, + "loss": 1.1254, + "step": 5430 + }, + { + "epoch": 8.68, + "grad_norm": 2.5183939933776855, + "learning_rate": 0.0016529505582137162, + "loss": 1.2257, + "step": 5440 + }, + { + "epoch": 8.69, + "grad_norm": 2.493967294692993, + "learning_rate": 0.001652312599681021, + "loss": 1.1369, + "step": 5450 + }, + { + "epoch": 8.71, + "grad_norm": 4.203963279724121, + "learning_rate": 0.0016516746411483255, + "loss": 1.0819, + "step": 5460 + }, + { + "epoch": 8.72, + "grad_norm": 4.204017639160156, + "learning_rate": 0.00165103668261563, + "loss": 1.2126, + "step": 5470 + }, + { + "epoch": 8.74, + "grad_norm": 5.039621353149414, + "learning_rate": 0.0016503987240829346, + "loss": 1.265, + "step": 5480 + }, + { + "epoch": 8.76, + "grad_norm": 2.5682952404022217, + "learning_rate": 0.0016497607655502394, + "loss": 1.2403, + "step": 5490 + }, + { + "epoch": 8.77, + "grad_norm": 2.821531057357788, + "learning_rate": 0.001649122807017544, + "loss": 1.3182, + "step": 5500 + }, + { + "epoch": 8.79, + "grad_norm": 3.4752848148345947, + "learning_rate": 0.0016484848484848485, + "loss": 1.264, + "step": 5510 + }, + { + "epoch": 8.8, + "grad_norm": 2.5011346340179443, + "learning_rate": 0.001647846889952153, + "loss": 1.1792, + "step": 5520 + }, + { + "epoch": 8.82, + "grad_norm": 4.323322772979736, + "learning_rate": 0.0016472089314194579, + "loss": 1.1745, + "step": 5530 + }, + { + "epoch": 8.84, + "grad_norm": 4.5369768142700195, + "learning_rate": 0.0016465709728867624, + "loss": 1.2837, + "step": 5540 + }, + { + "epoch": 8.85, + "grad_norm": 4.292603492736816, + "learning_rate": 0.001645933014354067, + "loss": 1.3317, + "step": 5550 + }, + { + "epoch": 8.87, + "grad_norm": 2.312387228012085, + "learning_rate": 0.0016452950558213718, + "loss": 1.2081, + "step": 5560 + }, + { + "epoch": 8.88, + "grad_norm": 3.759363889694214, + "learning_rate": 0.0016446570972886764, + "loss": 1.2549, + "step": 5570 + }, + { + "epoch": 8.9, + "grad_norm": 4.08116340637207, + "learning_rate": 0.001644019138755981, + "loss": 1.2823, + "step": 5580 + }, + { + "epoch": 8.92, + "grad_norm": 3.29032039642334, + "learning_rate": 0.0016433811802232855, + "loss": 1.2339, + "step": 5590 + }, + { + "epoch": 8.93, + "grad_norm": 4.082303524017334, + "learning_rate": 0.0016427432216905903, + "loss": 1.2537, + "step": 5600 + }, + { + "epoch": 8.95, + "grad_norm": 3.470620632171631, + "learning_rate": 0.0016421052631578948, + "loss": 1.2605, + "step": 5610 + }, + { + "epoch": 8.96, + "grad_norm": 5.008780002593994, + "learning_rate": 0.0016414673046251994, + "loss": 1.3909, + "step": 5620 + }, + { + "epoch": 8.98, + "grad_norm": 3.3247218132019043, + "learning_rate": 0.0016408293460925042, + "loss": 1.2504, + "step": 5630 + }, + { + "epoch": 9.0, + "grad_norm": 3.653365135192871, + "learning_rate": 0.0016401913875598087, + "loss": 1.3043, + "step": 5640 + }, + { + "epoch": 9.01, + "grad_norm": 2.061579942703247, + "learning_rate": 0.0016395534290271133, + "loss": 1.0127, + "step": 5650 + }, + { + "epoch": 9.03, + "grad_norm": 3.8204243183135986, + "learning_rate": 0.0016389154704944179, + "loss": 0.8594, + "step": 5660 + }, + { + "epoch": 9.04, + "grad_norm": 3.1755354404449463, + "learning_rate": 0.0016382775119617226, + "loss": 0.8146, + "step": 5670 + }, + { + "epoch": 9.06, + "grad_norm": 6.688543319702148, + "learning_rate": 0.0016376395534290272, + "loss": 0.8972, + "step": 5680 + }, + { + "epoch": 9.07, + "grad_norm": 2.6439781188964844, + "learning_rate": 0.0016370015948963318, + "loss": 0.8627, + "step": 5690 + }, + { + "epoch": 9.09, + "grad_norm": 2.962597131729126, + "learning_rate": 0.0016363636363636365, + "loss": 0.8263, + "step": 5700 + }, + { + "epoch": 9.11, + "grad_norm": 4.008563995361328, + "learning_rate": 0.001635725677830941, + "loss": 0.849, + "step": 5710 + }, + { + "epoch": 9.12, + "grad_norm": 3.4718000888824463, + "learning_rate": 0.0016350877192982457, + "loss": 0.873, + "step": 5720 + }, + { + "epoch": 9.14, + "grad_norm": 3.4607927799224854, + "learning_rate": 0.0016344497607655502, + "loss": 0.9231, + "step": 5730 + }, + { + "epoch": 9.15, + "grad_norm": 3.6140999794006348, + "learning_rate": 0.001633811802232855, + "loss": 0.8654, + "step": 5740 + }, + { + "epoch": 9.17, + "grad_norm": 4.108109474182129, + "learning_rate": 0.0016331738437001596, + "loss": 1.1229, + "step": 5750 + }, + { + "epoch": 9.19, + "grad_norm": 3.7185311317443848, + "learning_rate": 0.0016325358851674641, + "loss": 0.8597, + "step": 5760 + }, + { + "epoch": 9.2, + "grad_norm": 2.866516351699829, + "learning_rate": 0.001631897926634769, + "loss": 0.9815, + "step": 5770 + }, + { + "epoch": 9.22, + "grad_norm": 3.452366590499878, + "learning_rate": 0.0016312599681020735, + "loss": 0.8871, + "step": 5780 + }, + { + "epoch": 9.23, + "grad_norm": 3.129293203353882, + "learning_rate": 0.001630622009569378, + "loss": 0.9316, + "step": 5790 + }, + { + "epoch": 9.25, + "grad_norm": 3.3350937366485596, + "learning_rate": 0.0016299840510366826, + "loss": 1.0546, + "step": 5800 + }, + { + "epoch": 9.27, + "grad_norm": 3.9579129219055176, + "learning_rate": 0.0016293460925039874, + "loss": 0.8929, + "step": 5810 + }, + { + "epoch": 9.28, + "grad_norm": 2.4661436080932617, + "learning_rate": 0.001628708133971292, + "loss": 0.8749, + "step": 5820 + }, + { + "epoch": 9.3, + "grad_norm": 5.519815444946289, + "learning_rate": 0.0016280701754385965, + "loss": 1.0323, + "step": 5830 + }, + { + "epoch": 9.31, + "grad_norm": 4.336925983428955, + "learning_rate": 0.001627432216905901, + "loss": 0.9915, + "step": 5840 + }, + { + "epoch": 9.33, + "grad_norm": 3.4402873516082764, + "learning_rate": 0.0016267942583732058, + "loss": 1.0824, + "step": 5850 + }, + { + "epoch": 9.35, + "grad_norm": 2.801079034805298, + "learning_rate": 0.0016261562998405104, + "loss": 1.0676, + "step": 5860 + }, + { + "epoch": 9.36, + "grad_norm": 4.0579729080200195, + "learning_rate": 0.001625518341307815, + "loss": 1.1761, + "step": 5870 + }, + { + "epoch": 9.38, + "grad_norm": 3.309401035308838, + "learning_rate": 0.0016248803827751197, + "loss": 0.9767, + "step": 5880 + }, + { + "epoch": 9.39, + "grad_norm": 2.3733794689178467, + "learning_rate": 0.0016242424242424243, + "loss": 0.874, + "step": 5890 + }, + { + "epoch": 9.41, + "grad_norm": 2.4089362621307373, + "learning_rate": 0.0016236044657097289, + "loss": 0.9179, + "step": 5900 + }, + { + "epoch": 9.43, + "grad_norm": 4.605165004730225, + "learning_rate": 0.0016229665071770334, + "loss": 1.1248, + "step": 5910 + }, + { + "epoch": 9.44, + "grad_norm": 5.862342357635498, + "learning_rate": 0.0016223285486443382, + "loss": 1.1098, + "step": 5920 + }, + { + "epoch": 9.46, + "grad_norm": 4.282538890838623, + "learning_rate": 0.0016216905901116428, + "loss": 0.9369, + "step": 5930 + }, + { + "epoch": 9.47, + "grad_norm": 4.155124187469482, + "learning_rate": 0.0016210526315789473, + "loss": 1.0002, + "step": 5940 + }, + { + "epoch": 9.49, + "grad_norm": 2.9133784770965576, + "learning_rate": 0.0016204146730462521, + "loss": 0.9377, + "step": 5950 + }, + { + "epoch": 9.51, + "grad_norm": 3.7865607738494873, + "learning_rate": 0.0016197767145135567, + "loss": 1.1844, + "step": 5960 + }, + { + "epoch": 9.52, + "grad_norm": 4.1235270500183105, + "learning_rate": 0.0016191387559808612, + "loss": 1.0376, + "step": 5970 + }, + { + "epoch": 9.54, + "grad_norm": 2.8695414066314697, + "learning_rate": 0.0016185007974481658, + "loss": 1.1122, + "step": 5980 + }, + { + "epoch": 9.55, + "grad_norm": 4.424979209899902, + "learning_rate": 0.0016178628389154706, + "loss": 1.0555, + "step": 5990 + }, + { + "epoch": 9.57, + "grad_norm": 4.249617576599121, + "learning_rate": 0.0016172248803827752, + "loss": 1.0784, + "step": 6000 + }, + { + "epoch": 9.59, + "grad_norm": 3.7470569610595703, + "learning_rate": 0.0016165869218500797, + "loss": 1.295, + "step": 6010 + }, + { + "epoch": 9.6, + "grad_norm": 3.8228983879089355, + "learning_rate": 0.0016159489633173845, + "loss": 1.005, + "step": 6020 + }, + { + "epoch": 9.62, + "grad_norm": 3.3642499446868896, + "learning_rate": 0.001615311004784689, + "loss": 1.002, + "step": 6030 + }, + { + "epoch": 9.63, + "grad_norm": 3.2379348278045654, + "learning_rate": 0.0016146730462519936, + "loss": 0.9958, + "step": 6040 + }, + { + "epoch": 9.65, + "grad_norm": 2.8118715286254883, + "learning_rate": 0.0016140350877192982, + "loss": 1.2711, + "step": 6050 + }, + { + "epoch": 9.67, + "grad_norm": 4.146730899810791, + "learning_rate": 0.001613397129186603, + "loss": 1.2352, + "step": 6060 + }, + { + "epoch": 9.68, + "grad_norm": 3.6763010025024414, + "learning_rate": 0.0016127591706539075, + "loss": 1.1935, + "step": 6070 + }, + { + "epoch": 9.7, + "grad_norm": 2.510589838027954, + "learning_rate": 0.001612121212121212, + "loss": 1.2449, + "step": 6080 + }, + { + "epoch": 9.71, + "grad_norm": 3.556995153427124, + "learning_rate": 0.0016114832535885169, + "loss": 1.3257, + "step": 6090 + }, + { + "epoch": 9.73, + "grad_norm": 3.670929193496704, + "learning_rate": 0.0016108452950558214, + "loss": 1.0892, + "step": 6100 + }, + { + "epoch": 9.74, + "grad_norm": 3.0864908695220947, + "learning_rate": 0.001610207336523126, + "loss": 1.0478, + "step": 6110 + }, + { + "epoch": 9.76, + "grad_norm": 2.65902042388916, + "learning_rate": 0.0016095693779904306, + "loss": 1.1211, + "step": 6120 + }, + { + "epoch": 9.78, + "grad_norm": 2.6973979473114014, + "learning_rate": 0.0016089314194577353, + "loss": 1.0727, + "step": 6130 + }, + { + "epoch": 9.79, + "grad_norm": 4.574107646942139, + "learning_rate": 0.00160829346092504, + "loss": 1.0997, + "step": 6140 + }, + { + "epoch": 9.81, + "grad_norm": 3.178717613220215, + "learning_rate": 0.0016076555023923445, + "loss": 1.0726, + "step": 6150 + }, + { + "epoch": 9.82, + "grad_norm": 3.5710108280181885, + "learning_rate": 0.0016070175438596492, + "loss": 1.0341, + "step": 6160 + }, + { + "epoch": 9.84, + "grad_norm": 3.28791880607605, + "learning_rate": 0.0016063795853269538, + "loss": 1.2142, + "step": 6170 + }, + { + "epoch": 9.86, + "grad_norm": 2.811490535736084, + "learning_rate": 0.0016057416267942584, + "loss": 1.2218, + "step": 6180 + }, + { + "epoch": 9.87, + "grad_norm": 2.8246653079986572, + "learning_rate": 0.001605103668261563, + "loss": 1.0756, + "step": 6190 + }, + { + "epoch": 9.89, + "grad_norm": 4.228902339935303, + "learning_rate": 0.0016044657097288677, + "loss": 1.133, + "step": 6200 + }, + { + "epoch": 9.9, + "grad_norm": 3.8225128650665283, + "learning_rate": 0.0016038277511961723, + "loss": 1.2428, + "step": 6210 + }, + { + "epoch": 9.92, + "grad_norm": 4.282769680023193, + "learning_rate": 0.0016031897926634768, + "loss": 1.2566, + "step": 6220 + }, + { + "epoch": 9.94, + "grad_norm": 4.843967437744141, + "learning_rate": 0.0016025518341307814, + "loss": 1.1299, + "step": 6230 + }, + { + "epoch": 9.95, + "grad_norm": 3.592618227005005, + "learning_rate": 0.0016019138755980862, + "loss": 1.2274, + "step": 6240 + }, + { + "epoch": 9.97, + "grad_norm": 4.132793426513672, + "learning_rate": 0.0016012759170653907, + "loss": 1.1314, + "step": 6250 + }, + { + "epoch": 9.98, + "grad_norm": 4.065629005432129, + "learning_rate": 0.0016006379585326953, + "loss": 1.2704, + "step": 6260 + }, + { + "epoch": 10.0, + "grad_norm": 4.359400749206543, + "learning_rate": 0.0016, + "loss": 1.1157, + "step": 6270 + }, + { + "epoch": 10.02, + "grad_norm": 3.357485771179199, + "learning_rate": 0.0015993620414673046, + "loss": 0.8463, + "step": 6280 + }, + { + "epoch": 10.03, + "grad_norm": 3.480729579925537, + "learning_rate": 0.0015987240829346092, + "loss": 0.7863, + "step": 6290 + }, + { + "epoch": 10.05, + "grad_norm": 2.2290802001953125, + "learning_rate": 0.0015980861244019138, + "loss": 0.7799, + "step": 6300 + }, + { + "epoch": 10.06, + "grad_norm": 2.3109190464019775, + "learning_rate": 0.0015974481658692185, + "loss": 0.6763, + "step": 6310 + }, + { + "epoch": 10.08, + "grad_norm": 4.066445350646973, + "learning_rate": 0.001596810207336523, + "loss": 0.8017, + "step": 6320 + }, + { + "epoch": 10.1, + "grad_norm": 3.2394723892211914, + "learning_rate": 0.0015961722488038277, + "loss": 0.9444, + "step": 6330 + }, + { + "epoch": 10.11, + "grad_norm": 3.250441551208496, + "learning_rate": 0.0015955342902711324, + "loss": 0.8472, + "step": 6340 + }, + { + "epoch": 10.13, + "grad_norm": 2.4321706295013428, + "learning_rate": 0.001594896331738437, + "loss": 0.784, + "step": 6350 + }, + { + "epoch": 10.14, + "grad_norm": 2.8150861263275146, + "learning_rate": 0.0015942583732057416, + "loss": 0.8903, + "step": 6360 + }, + { + "epoch": 10.16, + "grad_norm": 2.4516990184783936, + "learning_rate": 0.0015936204146730461, + "loss": 0.7983, + "step": 6370 + }, + { + "epoch": 10.18, + "grad_norm": 4.499327659606934, + "learning_rate": 0.001592982456140351, + "loss": 0.8822, + "step": 6380 + }, + { + "epoch": 10.19, + "grad_norm": 3.4871532917022705, + "learning_rate": 0.0015923444976076555, + "loss": 0.8933, + "step": 6390 + }, + { + "epoch": 10.21, + "grad_norm": 2.1855833530426025, + "learning_rate": 0.00159170653907496, + "loss": 0.7781, + "step": 6400 + }, + { + "epoch": 10.22, + "grad_norm": 2.45394229888916, + "learning_rate": 0.0015910685805422648, + "loss": 0.7879, + "step": 6410 + }, + { + "epoch": 10.24, + "grad_norm": 4.465210914611816, + "learning_rate": 0.0015904306220095694, + "loss": 0.9825, + "step": 6420 + }, + { + "epoch": 10.26, + "grad_norm": 4.026763916015625, + "learning_rate": 0.001589792663476874, + "loss": 0.9108, + "step": 6430 + }, + { + "epoch": 10.27, + "grad_norm": 3.5490238666534424, + "learning_rate": 0.0015891547049441785, + "loss": 0.8742, + "step": 6440 + }, + { + "epoch": 10.29, + "grad_norm": 3.857203960418701, + "learning_rate": 0.0015885167464114833, + "loss": 0.9243, + "step": 6450 + }, + { + "epoch": 10.3, + "grad_norm": 5.296510696411133, + "learning_rate": 0.0015878787878787879, + "loss": 1.0742, + "step": 6460 + }, + { + "epoch": 10.32, + "grad_norm": 3.6743974685668945, + "learning_rate": 0.0015872408293460924, + "loss": 1.0541, + "step": 6470 + }, + { + "epoch": 10.33, + "grad_norm": 3.527785301208496, + "learning_rate": 0.0015866028708133972, + "loss": 0.958, + "step": 6480 + }, + { + "epoch": 10.35, + "grad_norm": 2.7961020469665527, + "learning_rate": 0.0015859649122807018, + "loss": 1.0794, + "step": 6490 + }, + { + "epoch": 10.37, + "grad_norm": 5.286695957183838, + "learning_rate": 0.0015853269537480063, + "loss": 1.0691, + "step": 6500 + }, + { + "epoch": 10.38, + "grad_norm": 2.792459011077881, + "learning_rate": 0.0015846889952153109, + "loss": 1.005, + "step": 6510 + }, + { + "epoch": 10.4, + "grad_norm": 4.287434101104736, + "learning_rate": 0.0015840510366826157, + "loss": 1.0826, + "step": 6520 + }, + { + "epoch": 10.41, + "grad_norm": 3.291612148284912, + "learning_rate": 0.0015834130781499202, + "loss": 0.9482, + "step": 6530 + }, + { + "epoch": 10.43, + "grad_norm": 2.1570498943328857, + "learning_rate": 0.0015827751196172248, + "loss": 0.8198, + "step": 6540 + }, + { + "epoch": 10.45, + "grad_norm": 5.364358901977539, + "learning_rate": 0.0015821371610845293, + "loss": 0.922, + "step": 6550 + }, + { + "epoch": 10.46, + "grad_norm": 2.493326187133789, + "learning_rate": 0.0015814992025518341, + "loss": 1.0143, + "step": 6560 + }, + { + "epoch": 10.48, + "grad_norm": 4.336993217468262, + "learning_rate": 0.0015808612440191387, + "loss": 0.9091, + "step": 6570 + }, + { + "epoch": 10.49, + "grad_norm": 3.2809929847717285, + "learning_rate": 0.0015802232854864433, + "loss": 0.947, + "step": 6580 + }, + { + "epoch": 10.51, + "grad_norm": 3.941453456878662, + "learning_rate": 0.001579585326953748, + "loss": 1.0424, + "step": 6590 + }, + { + "epoch": 10.53, + "grad_norm": 2.2481088638305664, + "learning_rate": 0.0015789473684210526, + "loss": 0.9591, + "step": 6600 + }, + { + "epoch": 10.54, + "grad_norm": 2.889963388442993, + "learning_rate": 0.0015783094098883572, + "loss": 1.0191, + "step": 6610 + }, + { + "epoch": 10.56, + "grad_norm": 3.2319366931915283, + "learning_rate": 0.0015776714513556617, + "loss": 1.1833, + "step": 6620 + }, + { + "epoch": 10.57, + "grad_norm": 2.6110410690307617, + "learning_rate": 0.0015770334928229665, + "loss": 1.082, + "step": 6630 + }, + { + "epoch": 10.59, + "grad_norm": 2.9207470417022705, + "learning_rate": 0.001576395534290271, + "loss": 0.9628, + "step": 6640 + }, + { + "epoch": 10.61, + "grad_norm": 4.300070285797119, + "learning_rate": 0.0015757575757575756, + "loss": 0.9607, + "step": 6650 + }, + { + "epoch": 10.62, + "grad_norm": 3.435377597808838, + "learning_rate": 0.0015751196172248804, + "loss": 0.9709, + "step": 6660 + }, + { + "epoch": 10.64, + "grad_norm": 3.129941940307617, + "learning_rate": 0.001574481658692185, + "loss": 1.0623, + "step": 6670 + }, + { + "epoch": 10.65, + "grad_norm": 3.273089647293091, + "learning_rate": 0.0015738437001594895, + "loss": 1.0186, + "step": 6680 + }, + { + "epoch": 10.67, + "grad_norm": 3.147507667541504, + "learning_rate": 0.001573205741626794, + "loss": 0.9143, + "step": 6690 + }, + { + "epoch": 10.69, + "grad_norm": 3.1906449794769287, + "learning_rate": 0.0015725677830940989, + "loss": 1.0698, + "step": 6700 + }, + { + "epoch": 10.7, + "grad_norm": 2.52282452583313, + "learning_rate": 0.0015719298245614034, + "loss": 1.2058, + "step": 6710 + }, + { + "epoch": 10.72, + "grad_norm": 3.526111602783203, + "learning_rate": 0.001571291866028708, + "loss": 1.0494, + "step": 6720 + }, + { + "epoch": 10.73, + "grad_norm": 4.391296863555908, + "learning_rate": 0.001570653907496013, + "loss": 1.0639, + "step": 6730 + }, + { + "epoch": 10.75, + "grad_norm": 3.623323678970337, + "learning_rate": 0.0015700159489633176, + "loss": 1.1105, + "step": 6740 + }, + { + "epoch": 10.77, + "grad_norm": 3.705646514892578, + "learning_rate": 0.0015693779904306221, + "loss": 0.964, + "step": 6750 + }, + { + "epoch": 10.78, + "grad_norm": 2.726846694946289, + "learning_rate": 0.0015687400318979267, + "loss": 1.1207, + "step": 6760 + }, + { + "epoch": 10.8, + "grad_norm": 2.3796093463897705, + "learning_rate": 0.0015681020733652315, + "loss": 1.038, + "step": 6770 + }, + { + "epoch": 10.81, + "grad_norm": 2.321793794631958, + "learning_rate": 0.001567464114832536, + "loss": 1.1098, + "step": 6780 + }, + { + "epoch": 10.83, + "grad_norm": 4.951314926147461, + "learning_rate": 0.0015668261562998406, + "loss": 1.0766, + "step": 6790 + }, + { + "epoch": 10.85, + "grad_norm": 2.3192481994628906, + "learning_rate": 0.0015661881977671454, + "loss": 1.1615, + "step": 6800 + }, + { + "epoch": 10.86, + "grad_norm": 3.576709508895874, + "learning_rate": 0.00156555023923445, + "loss": 1.0031, + "step": 6810 + }, + { + "epoch": 10.88, + "grad_norm": 3.2805440425872803, + "learning_rate": 0.0015649122807017545, + "loss": 0.9738, + "step": 6820 + }, + { + "epoch": 10.89, + "grad_norm": 3.367990016937256, + "learning_rate": 0.001564274322169059, + "loss": 0.9948, + "step": 6830 + }, + { + "epoch": 10.91, + "grad_norm": 4.358039379119873, + "learning_rate": 0.0015636363636363638, + "loss": 1.0906, + "step": 6840 + }, + { + "epoch": 10.93, + "grad_norm": 3.0704123973846436, + "learning_rate": 0.0015629984051036684, + "loss": 1.1433, + "step": 6850 + }, + { + "epoch": 10.94, + "grad_norm": 3.6105406284332275, + "learning_rate": 0.001562360446570973, + "loss": 1.0975, + "step": 6860 + }, + { + "epoch": 10.96, + "grad_norm": 2.0646121501922607, + "learning_rate": 0.0015617224880382775, + "loss": 1.1646, + "step": 6870 + }, + { + "epoch": 10.97, + "grad_norm": 3.911951780319214, + "learning_rate": 0.0015610845295055823, + "loss": 1.0135, + "step": 6880 + }, + { + "epoch": 10.99, + "grad_norm": 3.6417315006256104, + "learning_rate": 0.0015604465709728869, + "loss": 1.1479, + "step": 6890 + }, + { + "epoch": 11.0, + "grad_norm": 2.3251378536224365, + "learning_rate": 0.0015598086124401914, + "loss": 1.058, + "step": 6900 + }, + { + "epoch": 11.02, + "grad_norm": 2.8822855949401855, + "learning_rate": 0.0015591706539074962, + "loss": 0.7655, + "step": 6910 + }, + { + "epoch": 11.04, + "grad_norm": 3.3327693939208984, + "learning_rate": 0.0015585326953748008, + "loss": 0.8605, + "step": 6920 + }, + { + "epoch": 11.05, + "grad_norm": 2.5779995918273926, + "learning_rate": 0.0015578947368421053, + "loss": 0.8452, + "step": 6930 + }, + { + "epoch": 11.07, + "grad_norm": 3.0843000411987305, + "learning_rate": 0.00155725677830941, + "loss": 0.7461, + "step": 6940 + }, + { + "epoch": 11.08, + "grad_norm": 3.354552984237671, + "learning_rate": 0.0015566188197767147, + "loss": 0.8311, + "step": 6950 + }, + { + "epoch": 11.1, + "grad_norm": 2.7717132568359375, + "learning_rate": 0.0015559808612440192, + "loss": 0.8472, + "step": 6960 + }, + { + "epoch": 11.12, + "grad_norm": 2.2292239665985107, + "learning_rate": 0.0015553429027113238, + "loss": 0.759, + "step": 6970 + }, + { + "epoch": 11.13, + "grad_norm": 4.184642791748047, + "learning_rate": 0.0015547049441786286, + "loss": 0.7796, + "step": 6980 + }, + { + "epoch": 11.15, + "grad_norm": 3.987525463104248, + "learning_rate": 0.0015540669856459331, + "loss": 0.7885, + "step": 6990 + }, + { + "epoch": 11.16, + "grad_norm": 2.9014410972595215, + "learning_rate": 0.0015534290271132377, + "loss": 0.8425, + "step": 7000 + }, + { + "epoch": 11.18, + "grad_norm": 2.4290761947631836, + "learning_rate": 0.0015527910685805423, + "loss": 0.7046, + "step": 7010 + }, + { + "epoch": 11.2, + "grad_norm": 4.5738701820373535, + "learning_rate": 0.001552153110047847, + "loss": 0.794, + "step": 7020 + }, + { + "epoch": 11.21, + "grad_norm": 3.999741792678833, + "learning_rate": 0.0015515151515151516, + "loss": 0.8193, + "step": 7030 + }, + { + "epoch": 11.23, + "grad_norm": 3.0708727836608887, + "learning_rate": 0.0015508771929824562, + "loss": 0.8712, + "step": 7040 + }, + { + "epoch": 11.24, + "grad_norm": 3.396559715270996, + "learning_rate": 0.001550239234449761, + "loss": 0.8411, + "step": 7050 + }, + { + "epoch": 11.26, + "grad_norm": 3.517340898513794, + "learning_rate": 0.0015496012759170655, + "loss": 0.7907, + "step": 7060 + }, + { + "epoch": 11.28, + "grad_norm": 2.170309066772461, + "learning_rate": 0.00154896331738437, + "loss": 0.7761, + "step": 7070 + }, + { + "epoch": 11.29, + "grad_norm": 4.765143871307373, + "learning_rate": 0.0015483253588516746, + "loss": 1.0072, + "step": 7080 + }, + { + "epoch": 11.31, + "grad_norm": 2.595566749572754, + "learning_rate": 0.0015476874003189794, + "loss": 0.9483, + "step": 7090 + }, + { + "epoch": 11.32, + "grad_norm": 3.8784263134002686, + "learning_rate": 0.001547049441786284, + "loss": 0.8049, + "step": 7100 + }, + { + "epoch": 11.34, + "grad_norm": 3.033404588699341, + "learning_rate": 0.0015464114832535885, + "loss": 0.8574, + "step": 7110 + }, + { + "epoch": 11.36, + "grad_norm": 3.059054136276245, + "learning_rate": 0.0015457735247208933, + "loss": 0.8477, + "step": 7120 + }, + { + "epoch": 11.37, + "grad_norm": 4.744221210479736, + "learning_rate": 0.0015451355661881979, + "loss": 1.0191, + "step": 7130 + }, + { + "epoch": 11.39, + "grad_norm": 2.8809046745300293, + "learning_rate": 0.0015444976076555024, + "loss": 0.8683, + "step": 7140 + }, + { + "epoch": 11.4, + "grad_norm": 2.913546323776245, + "learning_rate": 0.001543859649122807, + "loss": 0.9145, + "step": 7150 + }, + { + "epoch": 11.42, + "grad_norm": 3.83941650390625, + "learning_rate": 0.0015432216905901118, + "loss": 0.8532, + "step": 7160 + }, + { + "epoch": 11.44, + "grad_norm": 3.471904754638672, + "learning_rate": 0.0015425837320574164, + "loss": 0.8605, + "step": 7170 + }, + { + "epoch": 11.45, + "grad_norm": 3.6713290214538574, + "learning_rate": 0.001541945773524721, + "loss": 1.013, + "step": 7180 + }, + { + "epoch": 11.47, + "grad_norm": 3.537461996078491, + "learning_rate": 0.0015413078149920257, + "loss": 0.9493, + "step": 7190 + }, + { + "epoch": 11.48, + "grad_norm": 3.101954460144043, + "learning_rate": 0.0015406698564593303, + "loss": 0.892, + "step": 7200 + }, + { + "epoch": 11.5, + "grad_norm": 4.835020542144775, + "learning_rate": 0.0015400318979266348, + "loss": 0.9719, + "step": 7210 + }, + { + "epoch": 11.52, + "grad_norm": 3.35196852684021, + "learning_rate": 0.0015393939393939394, + "loss": 0.84, + "step": 7220 + }, + { + "epoch": 11.53, + "grad_norm": 3.0783281326293945, + "learning_rate": 0.0015387559808612442, + "loss": 1.0172, + "step": 7230 + }, + { + "epoch": 11.55, + "grad_norm": 3.5924274921417236, + "learning_rate": 0.0015381180223285487, + "loss": 0.9569, + "step": 7240 + }, + { + "epoch": 11.56, + "grad_norm": 4.351842403411865, + "learning_rate": 0.0015374800637958533, + "loss": 0.9124, + "step": 7250 + }, + { + "epoch": 11.58, + "grad_norm": 5.1138200759887695, + "learning_rate": 0.0015368421052631579, + "loss": 1.0265, + "step": 7260 + }, + { + "epoch": 11.59, + "grad_norm": 4.592616558074951, + "learning_rate": 0.0015362041467304626, + "loss": 0.8753, + "step": 7270 + }, + { + "epoch": 11.61, + "grad_norm": 2.198404550552368, + "learning_rate": 0.0015355661881977672, + "loss": 0.9964, + "step": 7280 + }, + { + "epoch": 11.63, + "grad_norm": 3.718247175216675, + "learning_rate": 0.0015349282296650718, + "loss": 0.9982, + "step": 7290 + }, + { + "epoch": 11.64, + "grad_norm": 2.973299980163574, + "learning_rate": 0.0015342902711323765, + "loss": 0.9022, + "step": 7300 + }, + { + "epoch": 11.66, + "grad_norm": 3.1553690433502197, + "learning_rate": 0.001533652312599681, + "loss": 0.8882, + "step": 7310 + }, + { + "epoch": 11.67, + "grad_norm": 5.204711437225342, + "learning_rate": 0.0015330143540669857, + "loss": 0.9439, + "step": 7320 + }, + { + "epoch": 11.69, + "grad_norm": 2.575793981552124, + "learning_rate": 0.0015323763955342902, + "loss": 0.9633, + "step": 7330 + }, + { + "epoch": 11.71, + "grad_norm": 3.682734251022339, + "learning_rate": 0.001531738437001595, + "loss": 0.8831, + "step": 7340 + }, + { + "epoch": 11.72, + "grad_norm": 4.238563060760498, + "learning_rate": 0.0015311004784688996, + "loss": 1.2109, + "step": 7350 + }, + { + "epoch": 11.74, + "grad_norm": 4.091822147369385, + "learning_rate": 0.0015304625199362041, + "loss": 0.9663, + "step": 7360 + }, + { + "epoch": 11.75, + "grad_norm": 4.6950154304504395, + "learning_rate": 0.001529824561403509, + "loss": 0.9541, + "step": 7370 + }, + { + "epoch": 11.77, + "grad_norm": 2.6994404792785645, + "learning_rate": 0.0015291866028708135, + "loss": 0.938, + "step": 7380 + }, + { + "epoch": 11.79, + "grad_norm": 3.632509708404541, + "learning_rate": 0.001528548644338118, + "loss": 1.0444, + "step": 7390 + }, + { + "epoch": 11.8, + "grad_norm": 3.1459712982177734, + "learning_rate": 0.0015279106858054226, + "loss": 1.0143, + "step": 7400 + }, + { + "epoch": 11.82, + "grad_norm": 3.5480315685272217, + "learning_rate": 0.0015272727272727274, + "loss": 1.001, + "step": 7410 + }, + { + "epoch": 11.83, + "grad_norm": 2.908008575439453, + "learning_rate": 0.001526634768740032, + "loss": 1.0674, + "step": 7420 + }, + { + "epoch": 11.85, + "grad_norm": 3.147965431213379, + "learning_rate": 0.0015259968102073365, + "loss": 1.0779, + "step": 7430 + }, + { + "epoch": 11.87, + "grad_norm": 3.2961347103118896, + "learning_rate": 0.0015253588516746413, + "loss": 0.9077, + "step": 7440 + }, + { + "epoch": 11.88, + "grad_norm": 3.34252667427063, + "learning_rate": 0.0015247208931419458, + "loss": 1.1163, + "step": 7450 + }, + { + "epoch": 11.9, + "grad_norm": 3.7476675510406494, + "learning_rate": 0.0015240829346092504, + "loss": 1.0136, + "step": 7460 + }, + { + "epoch": 11.91, + "grad_norm": 3.686720609664917, + "learning_rate": 0.001523444976076555, + "loss": 1.0083, + "step": 7470 + }, + { + "epoch": 11.93, + "grad_norm": 3.023853302001953, + "learning_rate": 0.0015228070175438597, + "loss": 1.1149, + "step": 7480 + }, + { + "epoch": 11.95, + "grad_norm": 2.11389422416687, + "learning_rate": 0.0015221690590111643, + "loss": 0.9332, + "step": 7490 + }, + { + "epoch": 11.96, + "grad_norm": 2.868576765060425, + "learning_rate": 0.0015215311004784689, + "loss": 1.0661, + "step": 7500 + }, + { + "epoch": 11.98, + "grad_norm": 2.1617486476898193, + "learning_rate": 0.0015208931419457737, + "loss": 1.0098, + "step": 7510 + }, + { + "epoch": 11.99, + "grad_norm": 3.540294647216797, + "learning_rate": 0.0015202551834130782, + "loss": 1.032, + "step": 7520 + }, + { + "epoch": 12.01, + "grad_norm": 3.1346607208251953, + "learning_rate": 0.0015196172248803828, + "loss": 0.9046, + "step": 7530 + }, + { + "epoch": 12.03, + "grad_norm": 2.131230115890503, + "learning_rate": 0.0015189792663476873, + "loss": 0.6433, + "step": 7540 + }, + { + "epoch": 12.04, + "grad_norm": 1.7812432050704956, + "learning_rate": 0.0015183413078149921, + "loss": 0.6029, + "step": 7550 + }, + { + "epoch": 12.06, + "grad_norm": 3.244680643081665, + "learning_rate": 0.0015177033492822967, + "loss": 0.7685, + "step": 7560 + }, + { + "epoch": 12.07, + "grad_norm": 2.641512393951416, + "learning_rate": 0.0015170653907496012, + "loss": 0.5811, + "step": 7570 + }, + { + "epoch": 12.09, + "grad_norm": 2.1574976444244385, + "learning_rate": 0.0015164274322169058, + "loss": 0.7704, + "step": 7580 + }, + { + "epoch": 12.11, + "grad_norm": 2.7403526306152344, + "learning_rate": 0.0015157894736842106, + "loss": 0.822, + "step": 7590 + }, + { + "epoch": 12.12, + "grad_norm": 4.00333309173584, + "learning_rate": 0.0015151515151515152, + "loss": 0.6791, + "step": 7600 + }, + { + "epoch": 12.14, + "grad_norm": 3.1871447563171387, + "learning_rate": 0.0015145135566188197, + "loss": 0.6589, + "step": 7610 + }, + { + "epoch": 12.15, + "grad_norm": 2.847644567489624, + "learning_rate": 0.0015138755980861245, + "loss": 0.7128, + "step": 7620 + }, + { + "epoch": 12.17, + "grad_norm": 2.5338680744171143, + "learning_rate": 0.001513237639553429, + "loss": 0.9216, + "step": 7630 + }, + { + "epoch": 12.19, + "grad_norm": 2.299643039703369, + "learning_rate": 0.0015125996810207336, + "loss": 0.8705, + "step": 7640 + }, + { + "epoch": 12.2, + "grad_norm": 2.6167166233062744, + "learning_rate": 0.0015119617224880382, + "loss": 0.7226, + "step": 7650 + }, + { + "epoch": 12.22, + "grad_norm": 1.9708589315414429, + "learning_rate": 0.001511323763955343, + "loss": 0.7894, + "step": 7660 + }, + { + "epoch": 12.23, + "grad_norm": 2.8870623111724854, + "learning_rate": 0.0015106858054226475, + "loss": 0.7716, + "step": 7670 + }, + { + "epoch": 12.25, + "grad_norm": 2.571887493133545, + "learning_rate": 0.001510047846889952, + "loss": 0.6934, + "step": 7680 + }, + { + "epoch": 12.26, + "grad_norm": 3.059251070022583, + "learning_rate": 0.0015094098883572569, + "loss": 0.7272, + "step": 7690 + }, + { + "epoch": 12.28, + "grad_norm": 2.94647216796875, + "learning_rate": 0.0015087719298245614, + "loss": 0.7966, + "step": 7700 + }, + { + "epoch": 12.3, + "grad_norm": 2.6510915756225586, + "learning_rate": 0.001508133971291866, + "loss": 0.735, + "step": 7710 + }, + { + "epoch": 12.31, + "grad_norm": 2.9655959606170654, + "learning_rate": 0.0015074960127591706, + "loss": 0.8989, + "step": 7720 + }, + { + "epoch": 12.33, + "grad_norm": 2.72773814201355, + "learning_rate": 0.0015068580542264753, + "loss": 0.8239, + "step": 7730 + }, + { + "epoch": 12.34, + "grad_norm": 2.8079593181610107, + "learning_rate": 0.00150622009569378, + "loss": 0.7945, + "step": 7740 + }, + { + "epoch": 12.36, + "grad_norm": 2.3012099266052246, + "learning_rate": 0.0015055821371610845, + "loss": 0.8224, + "step": 7750 + }, + { + "epoch": 12.38, + "grad_norm": 3.559399127960205, + "learning_rate": 0.0015049441786283892, + "loss": 0.7912, + "step": 7760 + }, + { + "epoch": 12.39, + "grad_norm": 2.993138551712036, + "learning_rate": 0.0015043062200956938, + "loss": 0.8127, + "step": 7770 + }, + { + "epoch": 12.41, + "grad_norm": 3.5749433040618896, + "learning_rate": 0.0015036682615629984, + "loss": 0.8071, + "step": 7780 + }, + { + "epoch": 12.42, + "grad_norm": 2.879560947418213, + "learning_rate": 0.001503030303030303, + "loss": 0.8794, + "step": 7790 + }, + { + "epoch": 12.44, + "grad_norm": 3.648130416870117, + "learning_rate": 0.0015023923444976077, + "loss": 0.8795, + "step": 7800 + }, + { + "epoch": 12.46, + "grad_norm": 5.283175468444824, + "learning_rate": 0.0015017543859649123, + "loss": 1.0504, + "step": 7810 + }, + { + "epoch": 12.47, + "grad_norm": 3.602062940597534, + "learning_rate": 0.0015011164274322168, + "loss": 0.8283, + "step": 7820 + }, + { + "epoch": 12.49, + "grad_norm": 2.755488872528076, + "learning_rate": 0.0015004784688995216, + "loss": 0.8203, + "step": 7830 + }, + { + "epoch": 12.5, + "grad_norm": 3.24674654006958, + "learning_rate": 0.0014998405103668262, + "loss": 0.7843, + "step": 7840 + }, + { + "epoch": 12.52, + "grad_norm": 2.072895050048828, + "learning_rate": 0.0014992025518341307, + "loss": 0.7738, + "step": 7850 + }, + { + "epoch": 12.54, + "grad_norm": 4.0108208656311035, + "learning_rate": 0.0014985645933014353, + "loss": 0.8728, + "step": 7860 + }, + { + "epoch": 12.55, + "grad_norm": 2.896224021911621, + "learning_rate": 0.00149792663476874, + "loss": 0.823, + "step": 7870 + }, + { + "epoch": 12.57, + "grad_norm": 3.3562960624694824, + "learning_rate": 0.0014972886762360446, + "loss": 1.0389, + "step": 7880 + }, + { + "epoch": 12.58, + "grad_norm": 3.14931058883667, + "learning_rate": 0.0014966507177033492, + "loss": 0.9448, + "step": 7890 + }, + { + "epoch": 12.6, + "grad_norm": 6.942476272583008, + "learning_rate": 0.001496012759170654, + "loss": 0.8526, + "step": 7900 + }, + { + "epoch": 12.62, + "grad_norm": 2.516266107559204, + "learning_rate": 0.0014953748006379585, + "loss": 0.8342, + "step": 7910 + }, + { + "epoch": 12.63, + "grad_norm": 2.6325111389160156, + "learning_rate": 0.001494736842105263, + "loss": 0.9933, + "step": 7920 + }, + { + "epoch": 12.65, + "grad_norm": 3.630423069000244, + "learning_rate": 0.0014940988835725677, + "loss": 0.8403, + "step": 7930 + }, + { + "epoch": 12.66, + "grad_norm": 3.6334409713745117, + "learning_rate": 0.0014934609250398724, + "loss": 1.0628, + "step": 7940 + }, + { + "epoch": 12.68, + "grad_norm": 3.110170841217041, + "learning_rate": 0.001492822966507177, + "loss": 0.8604, + "step": 7950 + }, + { + "epoch": 12.7, + "grad_norm": 3.0557703971862793, + "learning_rate": 0.0014921850079744816, + "loss": 0.9121, + "step": 7960 + }, + { + "epoch": 12.71, + "grad_norm": 3.6271071434020996, + "learning_rate": 0.0014915470494417861, + "loss": 0.9177, + "step": 7970 + }, + { + "epoch": 12.73, + "grad_norm": 3.5513288974761963, + "learning_rate": 0.001490909090909091, + "loss": 0.8542, + "step": 7980 + }, + { + "epoch": 12.74, + "grad_norm": 4.270805358886719, + "learning_rate": 0.0014902711323763955, + "loss": 0.9907, + "step": 7990 + }, + { + "epoch": 12.76, + "grad_norm": 2.8084616661071777, + "learning_rate": 0.0014896331738437, + "loss": 0.9405, + "step": 8000 + }, + { + "epoch": 12.78, + "grad_norm": 5.405944347381592, + "learning_rate": 0.0014889952153110048, + "loss": 0.8483, + "step": 8010 + }, + { + "epoch": 12.79, + "grad_norm": 3.2791013717651367, + "learning_rate": 0.0014883572567783094, + "loss": 0.9408, + "step": 8020 + }, + { + "epoch": 12.81, + "grad_norm": 3.3789143562316895, + "learning_rate": 0.001487719298245614, + "loss": 1.0307, + "step": 8030 + }, + { + "epoch": 12.82, + "grad_norm": 3.513697624206543, + "learning_rate": 0.0014870813397129185, + "loss": 0.8657, + "step": 8040 + }, + { + "epoch": 12.84, + "grad_norm": 3.4501123428344727, + "learning_rate": 0.0014864433811802233, + "loss": 0.9202, + "step": 8050 + }, + { + "epoch": 12.85, + "grad_norm": 3.0335283279418945, + "learning_rate": 0.0014858054226475279, + "loss": 0.941, + "step": 8060 + }, + { + "epoch": 12.87, + "grad_norm": 3.0770187377929688, + "learning_rate": 0.0014851674641148324, + "loss": 0.9562, + "step": 8070 + }, + { + "epoch": 12.89, + "grad_norm": 2.967750310897827, + "learning_rate": 0.0014845295055821372, + "loss": 0.9318, + "step": 8080 + }, + { + "epoch": 12.9, + "grad_norm": 4.517429828643799, + "learning_rate": 0.0014838915470494418, + "loss": 0.9225, + "step": 8090 + }, + { + "epoch": 12.92, + "grad_norm": 4.639514923095703, + "learning_rate": 0.0014832535885167463, + "loss": 0.8997, + "step": 8100 + }, + { + "epoch": 12.93, + "grad_norm": 4.017191410064697, + "learning_rate": 0.0014826156299840509, + "loss": 1.0325, + "step": 8110 + }, + { + "epoch": 12.95, + "grad_norm": 4.688587188720703, + "learning_rate": 0.0014819776714513557, + "loss": 0.8542, + "step": 8120 + }, + { + "epoch": 12.97, + "grad_norm": 5.4787821769714355, + "learning_rate": 0.0014813397129186602, + "loss": 1.2567, + "step": 8130 + }, + { + "epoch": 12.98, + "grad_norm": 3.8270418643951416, + "learning_rate": 0.0014807017543859648, + "loss": 0.9071, + "step": 8140 + }, + { + "epoch": 13.0, + "grad_norm": 5.171020984649658, + "learning_rate": 0.0014800637958532696, + "loss": 0.9396, + "step": 8150 + }, + { + "epoch": 13.01, + "grad_norm": 2.2651660442352295, + "learning_rate": 0.0014794258373205741, + "loss": 0.6466, + "step": 8160 + }, + { + "epoch": 13.03, + "grad_norm": 1.7244137525558472, + "learning_rate": 0.0014787878787878787, + "loss": 0.5765, + "step": 8170 + }, + { + "epoch": 13.05, + "grad_norm": 2.143556833267212, + "learning_rate": 0.0014781499202551833, + "loss": 0.6964, + "step": 8180 + }, + { + "epoch": 13.06, + "grad_norm": 3.048412561416626, + "learning_rate": 0.001477511961722488, + "loss": 0.5957, + "step": 8190 + }, + { + "epoch": 13.08, + "grad_norm": 3.002617120742798, + "learning_rate": 0.0014768740031897926, + "loss": 0.6172, + "step": 8200 + }, + { + "epoch": 13.09, + "grad_norm": 2.4327642917633057, + "learning_rate": 0.0014762360446570972, + "loss": 0.6952, + "step": 8210 + }, + { + "epoch": 13.11, + "grad_norm": 3.3259124755859375, + "learning_rate": 0.0014755980861244022, + "loss": 0.7637, + "step": 8220 + }, + { + "epoch": 13.13, + "grad_norm": 2.1302742958068848, + "learning_rate": 0.0014749601275917067, + "loss": 0.8759, + "step": 8230 + }, + { + "epoch": 13.14, + "grad_norm": 2.8593993186950684, + "learning_rate": 0.0014743221690590113, + "loss": 0.6421, + "step": 8240 + }, + { + "epoch": 13.16, + "grad_norm": 3.1945838928222656, + "learning_rate": 0.0014736842105263158, + "loss": 0.8016, + "step": 8250 + }, + { + "epoch": 13.17, + "grad_norm": 2.6106722354888916, + "learning_rate": 0.0014730462519936206, + "loss": 0.9062, + "step": 8260 + }, + { + "epoch": 13.19, + "grad_norm": 2.938920021057129, + "learning_rate": 0.0014724082934609252, + "loss": 0.6848, + "step": 8270 + }, + { + "epoch": 13.21, + "grad_norm": 2.4809677600860596, + "learning_rate": 0.0014717703349282297, + "loss": 0.7019, + "step": 8280 + }, + { + "epoch": 13.22, + "grad_norm": 3.0914158821105957, + "learning_rate": 0.0014711323763955343, + "loss": 0.8232, + "step": 8290 + }, + { + "epoch": 13.24, + "grad_norm": 3.0564115047454834, + "learning_rate": 0.001470494417862839, + "loss": 0.7435, + "step": 8300 + }, + { + "epoch": 13.25, + "grad_norm": 3.3561959266662598, + "learning_rate": 0.0014698564593301437, + "loss": 0.7295, + "step": 8310 + }, + { + "epoch": 13.27, + "grad_norm": 1.9883933067321777, + "learning_rate": 0.0014692185007974482, + "loss": 0.7224, + "step": 8320 + }, + { + "epoch": 13.29, + "grad_norm": 2.7677059173583984, + "learning_rate": 0.001468580542264753, + "loss": 0.7052, + "step": 8330 + }, + { + "epoch": 13.3, + "grad_norm": 2.8097822666168213, + "learning_rate": 0.0014679425837320576, + "loss": 0.8795, + "step": 8340 + }, + { + "epoch": 13.32, + "grad_norm": 2.9403786659240723, + "learning_rate": 0.0014673046251993621, + "loss": 0.8753, + "step": 8350 + }, + { + "epoch": 13.33, + "grad_norm": 2.103468179702759, + "learning_rate": 0.0014666666666666667, + "loss": 0.7013, + "step": 8360 + }, + { + "epoch": 13.35, + "grad_norm": 4.1119489669799805, + "learning_rate": 0.0014660287081339715, + "loss": 0.7666, + "step": 8370 + }, + { + "epoch": 13.37, + "grad_norm": 2.627279758453369, + "learning_rate": 0.001465390749601276, + "loss": 0.912, + "step": 8380 + }, + { + "epoch": 13.38, + "grad_norm": 3.824855327606201, + "learning_rate": 0.0014647527910685806, + "loss": 0.8233, + "step": 8390 + }, + { + "epoch": 13.4, + "grad_norm": 2.9254772663116455, + "learning_rate": 0.0014641148325358854, + "loss": 0.7541, + "step": 8400 + }, + { + "epoch": 13.41, + "grad_norm": 3.6978065967559814, + "learning_rate": 0.00146347687400319, + "loss": 0.7604, + "step": 8410 + }, + { + "epoch": 13.43, + "grad_norm": 2.875696897506714, + "learning_rate": 0.0014628389154704945, + "loss": 0.7459, + "step": 8420 + }, + { + "epoch": 13.44, + "grad_norm": 3.1799988746643066, + "learning_rate": 0.001462200956937799, + "loss": 0.7081, + "step": 8430 + }, + { + "epoch": 13.46, + "grad_norm": 1.9684711694717407, + "learning_rate": 0.0014615629984051038, + "loss": 0.7558, + "step": 8440 + }, + { + "epoch": 13.48, + "grad_norm": 2.5012054443359375, + "learning_rate": 0.0014609250398724084, + "loss": 0.8078, + "step": 8450 + }, + { + "epoch": 13.49, + "grad_norm": 2.650980234146118, + "learning_rate": 0.001460287081339713, + "loss": 0.7813, + "step": 8460 + }, + { + "epoch": 13.51, + "grad_norm": 2.5888872146606445, + "learning_rate": 0.0014596491228070177, + "loss": 0.8004, + "step": 8470 + }, + { + "epoch": 13.52, + "grad_norm": 4.472870349884033, + "learning_rate": 0.0014590111642743223, + "loss": 0.868, + "step": 8480 + }, + { + "epoch": 13.54, + "grad_norm": 2.7261528968811035, + "learning_rate": 0.0014583732057416269, + "loss": 0.7751, + "step": 8490 + }, + { + "epoch": 13.56, + "grad_norm": 5.15382194519043, + "learning_rate": 0.0014577352472089314, + "loss": 0.8288, + "step": 8500 + }, + { + "epoch": 13.57, + "grad_norm": 3.0572078227996826, + "learning_rate": 0.0014570972886762362, + "loss": 0.9282, + "step": 8510 + }, + { + "epoch": 13.59, + "grad_norm": 2.779832363128662, + "learning_rate": 0.0014564593301435408, + "loss": 0.8, + "step": 8520 + }, + { + "epoch": 13.6, + "grad_norm": 3.26220965385437, + "learning_rate": 0.0014558213716108453, + "loss": 0.7932, + "step": 8530 + }, + { + "epoch": 13.62, + "grad_norm": 5.765030384063721, + "learning_rate": 0.0014551834130781501, + "loss": 0.7697, + "step": 8540 + }, + { + "epoch": 13.64, + "grad_norm": 3.393489122390747, + "learning_rate": 0.0014545454545454547, + "loss": 0.7436, + "step": 8550 + }, + { + "epoch": 13.65, + "grad_norm": 3.4582221508026123, + "learning_rate": 0.0014539074960127592, + "loss": 0.8099, + "step": 8560 + }, + { + "epoch": 13.67, + "grad_norm": 2.931617498397827, + "learning_rate": 0.0014532695374800638, + "loss": 0.8833, + "step": 8570 + }, + { + "epoch": 13.68, + "grad_norm": 3.149649143218994, + "learning_rate": 0.0014526315789473686, + "loss": 0.826, + "step": 8580 + }, + { + "epoch": 13.7, + "grad_norm": 2.6606719493865967, + "learning_rate": 0.0014519936204146731, + "loss": 0.9458, + "step": 8590 + }, + { + "epoch": 13.72, + "grad_norm": 3.6608333587646484, + "learning_rate": 0.0014513556618819777, + "loss": 0.8182, + "step": 8600 + }, + { + "epoch": 13.73, + "grad_norm": 4.276224136352539, + "learning_rate": 0.0014507177033492825, + "loss": 0.8656, + "step": 8610 + }, + { + "epoch": 13.75, + "grad_norm": 3.306110382080078, + "learning_rate": 0.001450079744816587, + "loss": 0.8356, + "step": 8620 + }, + { + "epoch": 13.76, + "grad_norm": 3.0018744468688965, + "learning_rate": 0.0014494417862838916, + "loss": 0.8602, + "step": 8630 + }, + { + "epoch": 13.78, + "grad_norm": 3.3632960319519043, + "learning_rate": 0.0014488038277511962, + "loss": 1.0159, + "step": 8640 + }, + { + "epoch": 13.8, + "grad_norm": 2.006432056427002, + "learning_rate": 0.001448165869218501, + "loss": 0.8172, + "step": 8650 + }, + { + "epoch": 13.81, + "grad_norm": 3.5842230319976807, + "learning_rate": 0.0014475279106858055, + "loss": 0.8222, + "step": 8660 + }, + { + "epoch": 13.83, + "grad_norm": 3.855170488357544, + "learning_rate": 0.00144688995215311, + "loss": 0.8993, + "step": 8670 + }, + { + "epoch": 13.84, + "grad_norm": 3.3235816955566406, + "learning_rate": 0.0014462519936204146, + "loss": 0.8374, + "step": 8680 + }, + { + "epoch": 13.86, + "grad_norm": 3.43414568901062, + "learning_rate": 0.0014456140350877194, + "loss": 0.9525, + "step": 8690 + }, + { + "epoch": 13.88, + "grad_norm": 3.4128949642181396, + "learning_rate": 0.001444976076555024, + "loss": 0.8527, + "step": 8700 + }, + { + "epoch": 13.89, + "grad_norm": 5.165436744689941, + "learning_rate": 0.0014443381180223285, + "loss": 0.8698, + "step": 8710 + }, + { + "epoch": 13.91, + "grad_norm": 3.940591812133789, + "learning_rate": 0.0014437001594896333, + "loss": 0.9243, + "step": 8720 + }, + { + "epoch": 13.92, + "grad_norm": 3.3081157207489014, + "learning_rate": 0.0014430622009569379, + "loss": 0.9268, + "step": 8730 + }, + { + "epoch": 13.94, + "grad_norm": 3.6998980045318604, + "learning_rate": 0.0014424242424242424, + "loss": 0.9113, + "step": 8740 + }, + { + "epoch": 13.96, + "grad_norm": 3.386359214782715, + "learning_rate": 0.001441786283891547, + "loss": 0.9067, + "step": 8750 + }, + { + "epoch": 13.97, + "grad_norm": 2.559299945831299, + "learning_rate": 0.0014411483253588518, + "loss": 0.9951, + "step": 8760 + }, + { + "epoch": 13.99, + "grad_norm": 2.8027663230895996, + "learning_rate": 0.0014405103668261564, + "loss": 1.0755, + "step": 8770 + }, + { + "epoch": 14.0, + "grad_norm": 2.2676618099212646, + "learning_rate": 0.001439872408293461, + "loss": 0.7299, + "step": 8780 + }, + { + "epoch": 14.02, + "grad_norm": 3.9087278842926025, + "learning_rate": 0.0014392344497607657, + "loss": 0.8103, + "step": 8790 + }, + { + "epoch": 14.04, + "grad_norm": 3.4694509506225586, + "learning_rate": 0.0014385964912280703, + "loss": 0.6432, + "step": 8800 + }, + { + "epoch": 14.05, + "grad_norm": 2.3048603534698486, + "learning_rate": 0.0014379585326953748, + "loss": 0.5185, + "step": 8810 + }, + { + "epoch": 14.07, + "grad_norm": 3.251046895980835, + "learning_rate": 0.0014373205741626794, + "loss": 0.6668, + "step": 8820 + }, + { + "epoch": 14.08, + "grad_norm": 1.8965840339660645, + "learning_rate": 0.0014366826156299842, + "loss": 0.5914, + "step": 8830 + }, + { + "epoch": 14.1, + "grad_norm": 4.089531421661377, + "learning_rate": 0.0014360446570972887, + "loss": 0.6342, + "step": 8840 + }, + { + "epoch": 14.11, + "grad_norm": 2.4392364025115967, + "learning_rate": 0.0014354066985645933, + "loss": 0.6801, + "step": 8850 + }, + { + "epoch": 14.13, + "grad_norm": 2.7692840099334717, + "learning_rate": 0.001434768740031898, + "loss": 0.7158, + "step": 8860 + }, + { + "epoch": 14.15, + "grad_norm": 2.2414801120758057, + "learning_rate": 0.0014341307814992026, + "loss": 0.6251, + "step": 8870 + }, + { + "epoch": 14.16, + "grad_norm": 2.941929578781128, + "learning_rate": 0.0014334928229665072, + "loss": 0.7273, + "step": 8880 + }, + { + "epoch": 14.18, + "grad_norm": 2.245312452316284, + "learning_rate": 0.0014328548644338118, + "loss": 0.6797, + "step": 8890 + }, + { + "epoch": 14.19, + "grad_norm": 2.1441662311553955, + "learning_rate": 0.0014322169059011165, + "loss": 0.6244, + "step": 8900 + }, + { + "epoch": 14.21, + "grad_norm": 3.0492477416992188, + "learning_rate": 0.001431578947368421, + "loss": 0.6268, + "step": 8910 + }, + { + "epoch": 14.23, + "grad_norm": 2.6444950103759766, + "learning_rate": 0.0014309409888357257, + "loss": 0.6373, + "step": 8920 + }, + { + "epoch": 14.24, + "grad_norm": 2.9322099685668945, + "learning_rate": 0.0014303030303030304, + "loss": 0.7353, + "step": 8930 + }, + { + "epoch": 14.26, + "grad_norm": 2.753868341445923, + "learning_rate": 0.001429665071770335, + "loss": 0.6592, + "step": 8940 + }, + { + "epoch": 14.27, + "grad_norm": 3.1307361125946045, + "learning_rate": 0.0014290271132376396, + "loss": 0.6678, + "step": 8950 + }, + { + "epoch": 14.29, + "grad_norm": 2.1127524375915527, + "learning_rate": 0.0014283891547049441, + "loss": 0.6473, + "step": 8960 + }, + { + "epoch": 14.31, + "grad_norm": 2.359909772872925, + "learning_rate": 0.001427751196172249, + "loss": 0.8442, + "step": 8970 + }, + { + "epoch": 14.32, + "grad_norm": 3.395587205886841, + "learning_rate": 0.0014271132376395535, + "loss": 0.6634, + "step": 8980 + }, + { + "epoch": 14.34, + "grad_norm": 3.500505208969116, + "learning_rate": 0.001426475279106858, + "loss": 0.6735, + "step": 8990 + }, + { + "epoch": 14.35, + "grad_norm": 1.948743224143982, + "learning_rate": 0.0014258373205741626, + "loss": 0.8773, + "step": 9000 + }, + { + "epoch": 14.37, + "grad_norm": 4.593191146850586, + "learning_rate": 0.0014251993620414674, + "loss": 0.7344, + "step": 9010 + }, + { + "epoch": 14.39, + "grad_norm": 2.9138360023498535, + "learning_rate": 0.001424561403508772, + "loss": 0.7962, + "step": 9020 + }, + { + "epoch": 14.4, + "grad_norm": 2.7665469646453857, + "learning_rate": 0.0014239234449760765, + "loss": 0.7066, + "step": 9030 + }, + { + "epoch": 14.42, + "grad_norm": 2.5287930965423584, + "learning_rate": 0.0014232854864433813, + "loss": 0.799, + "step": 9040 + }, + { + "epoch": 14.43, + "grad_norm": 1.9143520593643188, + "learning_rate": 0.0014226475279106858, + "loss": 0.877, + "step": 9050 + }, + { + "epoch": 14.45, + "grad_norm": 3.114867925643921, + "learning_rate": 0.0014220095693779904, + "loss": 0.7229, + "step": 9060 + }, + { + "epoch": 14.47, + "grad_norm": 4.132133960723877, + "learning_rate": 0.001421371610845295, + "loss": 0.7723, + "step": 9070 + }, + { + "epoch": 14.48, + "grad_norm": 2.8847928047180176, + "learning_rate": 0.0014207336523125997, + "loss": 0.8349, + "step": 9080 + }, + { + "epoch": 14.5, + "grad_norm": 4.3192009925842285, + "learning_rate": 0.0014200956937799043, + "loss": 0.7454, + "step": 9090 + }, + { + "epoch": 14.51, + "grad_norm": 2.5490753650665283, + "learning_rate": 0.0014194577352472089, + "loss": 0.8047, + "step": 9100 + }, + { + "epoch": 14.53, + "grad_norm": 3.995173215866089, + "learning_rate": 0.0014188197767145137, + "loss": 0.7209, + "step": 9110 + }, + { + "epoch": 14.55, + "grad_norm": 3.334613084793091, + "learning_rate": 0.0014181818181818182, + "loss": 0.9342, + "step": 9120 + }, + { + "epoch": 14.56, + "grad_norm": 2.7369375228881836, + "learning_rate": 0.0014175438596491228, + "loss": 0.7473, + "step": 9130 + }, + { + "epoch": 14.58, + "grad_norm": 4.180137634277344, + "learning_rate": 0.0014169059011164273, + "loss": 0.8705, + "step": 9140 + }, + { + "epoch": 14.59, + "grad_norm": 3.7026357650756836, + "learning_rate": 0.0014162679425837321, + "loss": 0.7836, + "step": 9150 + }, + { + "epoch": 14.61, + "grad_norm": 1.8971599340438843, + "learning_rate": 0.0014156299840510367, + "loss": 0.7062, + "step": 9160 + }, + { + "epoch": 14.63, + "grad_norm": 2.8083083629608154, + "learning_rate": 0.0014149920255183412, + "loss": 0.7808, + "step": 9170 + }, + { + "epoch": 14.64, + "grad_norm": 2.1013123989105225, + "learning_rate": 0.001414354066985646, + "loss": 0.919, + "step": 9180 + }, + { + "epoch": 14.66, + "grad_norm": 2.5876877307891846, + "learning_rate": 0.0014137161084529506, + "loss": 0.7824, + "step": 9190 + }, + { + "epoch": 14.67, + "grad_norm": 2.3595352172851562, + "learning_rate": 0.0014130781499202552, + "loss": 0.8545, + "step": 9200 + }, + { + "epoch": 14.69, + "grad_norm": 6.161678314208984, + "learning_rate": 0.0014124401913875597, + "loss": 0.764, + "step": 9210 + }, + { + "epoch": 14.7, + "grad_norm": 2.7124509811401367, + "learning_rate": 0.0014118022328548645, + "loss": 0.7967, + "step": 9220 + }, + { + "epoch": 14.72, + "grad_norm": 3.200411081314087, + "learning_rate": 0.001411164274322169, + "loss": 0.8589, + "step": 9230 + }, + { + "epoch": 14.74, + "grad_norm": 1.9819875955581665, + "learning_rate": 0.0014105263157894736, + "loss": 0.809, + "step": 9240 + }, + { + "epoch": 14.75, + "grad_norm": 3.223145008087158, + "learning_rate": 0.0014098883572567784, + "loss": 0.739, + "step": 9250 + }, + { + "epoch": 14.77, + "grad_norm": 3.0328469276428223, + "learning_rate": 0.001409250398724083, + "loss": 0.8469, + "step": 9260 + }, + { + "epoch": 14.78, + "grad_norm": 2.144221305847168, + "learning_rate": 0.0014086124401913875, + "loss": 0.9141, + "step": 9270 + }, + { + "epoch": 14.8, + "grad_norm": 2.3607845306396484, + "learning_rate": 0.001407974481658692, + "loss": 0.8812, + "step": 9280 + }, + { + "epoch": 14.82, + "grad_norm": 2.356010913848877, + "learning_rate": 0.0014073365231259969, + "loss": 0.7773, + "step": 9290 + }, + { + "epoch": 14.83, + "grad_norm": 3.326063394546509, + "learning_rate": 0.0014066985645933014, + "loss": 0.9775, + "step": 9300 + }, + { + "epoch": 14.85, + "grad_norm": 3.0373737812042236, + "learning_rate": 0.001406060606060606, + "loss": 0.8173, + "step": 9310 + }, + { + "epoch": 14.86, + "grad_norm": 3.7840776443481445, + "learning_rate": 0.0014054226475279108, + "loss": 0.8301, + "step": 9320 + }, + { + "epoch": 14.88, + "grad_norm": 3.13913893699646, + "learning_rate": 0.0014047846889952153, + "loss": 0.7767, + "step": 9330 + }, + { + "epoch": 14.9, + "grad_norm": 4.028443813323975, + "learning_rate": 0.00140414673046252, + "loss": 0.7516, + "step": 9340 + }, + { + "epoch": 14.91, + "grad_norm": 3.6890182495117188, + "learning_rate": 0.0014035087719298245, + "loss": 0.769, + "step": 9350 + }, + { + "epoch": 14.93, + "grad_norm": 4.084263801574707, + "learning_rate": 0.0014028708133971292, + "loss": 0.8712, + "step": 9360 + }, + { + "epoch": 14.94, + "grad_norm": 2.6253440380096436, + "learning_rate": 0.0014022328548644338, + "loss": 0.9161, + "step": 9370 + }, + { + "epoch": 14.96, + "grad_norm": 3.6379435062408447, + "learning_rate": 0.0014015948963317384, + "loss": 0.9507, + "step": 9380 + }, + { + "epoch": 14.98, + "grad_norm": 3.1507678031921387, + "learning_rate": 0.001400956937799043, + "loss": 0.9133, + "step": 9390 + }, + { + "epoch": 14.99, + "grad_norm": 2.170366048812866, + "learning_rate": 0.0014003189792663477, + "loss": 0.7949, + "step": 9400 + }, + { + "epoch": 15.01, + "grad_norm": 1.886562705039978, + "learning_rate": 0.0013996810207336523, + "loss": 0.7133, + "step": 9410 + }, + { + "epoch": 15.02, + "grad_norm": 2.3615992069244385, + "learning_rate": 0.0013990430622009568, + "loss": 0.598, + "step": 9420 + }, + { + "epoch": 15.04, + "grad_norm": 2.0564517974853516, + "learning_rate": 0.0013984051036682616, + "loss": 0.6673, + "step": 9430 + }, + { + "epoch": 15.06, + "grad_norm": 2.599745273590088, + "learning_rate": 0.0013977671451355662, + "loss": 0.5725, + "step": 9440 + }, + { + "epoch": 15.07, + "grad_norm": 2.5613441467285156, + "learning_rate": 0.0013971291866028707, + "loss": 0.5816, + "step": 9450 + }, + { + "epoch": 15.09, + "grad_norm": 2.8341970443725586, + "learning_rate": 0.0013964912280701753, + "loss": 0.5968, + "step": 9460 + }, + { + "epoch": 15.1, + "grad_norm": 3.303835391998291, + "learning_rate": 0.00139585326953748, + "loss": 0.6412, + "step": 9470 + }, + { + "epoch": 15.12, + "grad_norm": 3.2321808338165283, + "learning_rate": 0.0013952153110047846, + "loss": 0.6451, + "step": 9480 + }, + { + "epoch": 15.14, + "grad_norm": 2.747515916824341, + "learning_rate": 0.0013945773524720892, + "loss": 0.6491, + "step": 9490 + }, + { + "epoch": 15.15, + "grad_norm": 2.1695239543914795, + "learning_rate": 0.001393939393939394, + "loss": 0.6463, + "step": 9500 + }, + { + "epoch": 15.17, + "grad_norm": 2.5514535903930664, + "learning_rate": 0.0013933014354066985, + "loss": 0.696, + "step": 9510 + }, + { + "epoch": 15.18, + "grad_norm": 2.224310874938965, + "learning_rate": 0.001392663476874003, + "loss": 0.6733, + "step": 9520 + }, + { + "epoch": 15.2, + "grad_norm": 3.0674171447753906, + "learning_rate": 0.0013920255183413077, + "loss": 0.655, + "step": 9530 + }, + { + "epoch": 15.22, + "grad_norm": 1.9924139976501465, + "learning_rate": 0.0013913875598086124, + "loss": 0.6315, + "step": 9540 + }, + { + "epoch": 15.23, + "grad_norm": 3.7744829654693604, + "learning_rate": 0.001390749601275917, + "loss": 0.6991, + "step": 9550 + }, + { + "epoch": 15.25, + "grad_norm": 3.4672529697418213, + "learning_rate": 0.0013901116427432216, + "loss": 0.7007, + "step": 9560 + }, + { + "epoch": 15.26, + "grad_norm": 3.2644975185394287, + "learning_rate": 0.0013894736842105264, + "loss": 0.6403, + "step": 9570 + }, + { + "epoch": 15.28, + "grad_norm": 2.8029818534851074, + "learning_rate": 0.001388835725677831, + "loss": 0.7316, + "step": 9580 + }, + { + "epoch": 15.3, + "grad_norm": 1.8042049407958984, + "learning_rate": 0.0013881977671451355, + "loss": 0.6332, + "step": 9590 + }, + { + "epoch": 15.31, + "grad_norm": 2.2891921997070312, + "learning_rate": 0.00138755980861244, + "loss": 0.6348, + "step": 9600 + }, + { + "epoch": 15.33, + "grad_norm": 2.8570570945739746, + "learning_rate": 0.0013869218500797448, + "loss": 0.6607, + "step": 9610 + }, + { + "epoch": 15.34, + "grad_norm": 2.2186977863311768, + "learning_rate": 0.0013862838915470494, + "loss": 0.627, + "step": 9620 + }, + { + "epoch": 15.36, + "grad_norm": 2.3791275024414062, + "learning_rate": 0.001385645933014354, + "loss": 0.6721, + "step": 9630 + }, + { + "epoch": 15.37, + "grad_norm": 2.992490530014038, + "learning_rate": 0.0013850079744816587, + "loss": 0.6734, + "step": 9640 + }, + { + "epoch": 15.39, + "grad_norm": 5.538806438446045, + "learning_rate": 0.0013843700159489633, + "loss": 0.678, + "step": 9650 + }, + { + "epoch": 15.41, + "grad_norm": 2.7970008850097656, + "learning_rate": 0.0013837320574162679, + "loss": 0.6576, + "step": 9660 + }, + { + "epoch": 15.42, + "grad_norm": 5.550447940826416, + "learning_rate": 0.0013830940988835724, + "loss": 0.7144, + "step": 9670 + }, + { + "epoch": 15.44, + "grad_norm": 2.3102047443389893, + "learning_rate": 0.0013824561403508772, + "loss": 0.669, + "step": 9680 + }, + { + "epoch": 15.45, + "grad_norm": 3.720393419265747, + "learning_rate": 0.0013818181818181818, + "loss": 0.7442, + "step": 9690 + }, + { + "epoch": 15.47, + "grad_norm": 2.284290075302124, + "learning_rate": 0.0013811802232854863, + "loss": 0.7835, + "step": 9700 + }, + { + "epoch": 15.49, + "grad_norm": 3.2873239517211914, + "learning_rate": 0.0013805422647527909, + "loss": 0.6662, + "step": 9710 + }, + { + "epoch": 15.5, + "grad_norm": 2.7117483615875244, + "learning_rate": 0.0013799043062200959, + "loss": 0.7348, + "step": 9720 + }, + { + "epoch": 15.52, + "grad_norm": 3.2797791957855225, + "learning_rate": 0.0013792663476874004, + "loss": 0.8664, + "step": 9730 + }, + { + "epoch": 15.53, + "grad_norm": 3.7056384086608887, + "learning_rate": 0.001378628389154705, + "loss": 0.7128, + "step": 9740 + }, + { + "epoch": 15.55, + "grad_norm": 2.3162360191345215, + "learning_rate": 0.0013779904306220098, + "loss": 0.6596, + "step": 9750 + }, + { + "epoch": 15.57, + "grad_norm": 2.1081748008728027, + "learning_rate": 0.0013773524720893143, + "loss": 0.7492, + "step": 9760 + }, + { + "epoch": 15.58, + "grad_norm": 3.5717201232910156, + "learning_rate": 0.001376714513556619, + "loss": 0.8277, + "step": 9770 + }, + { + "epoch": 15.6, + "grad_norm": 3.751756429672241, + "learning_rate": 0.0013760765550239235, + "loss": 0.7028, + "step": 9780 + }, + { + "epoch": 15.61, + "grad_norm": 3.4455363750457764, + "learning_rate": 0.0013754385964912283, + "loss": 0.7339, + "step": 9790 + }, + { + "epoch": 15.63, + "grad_norm": 2.6450400352478027, + "learning_rate": 0.0013748006379585328, + "loss": 0.8658, + "step": 9800 + }, + { + "epoch": 15.65, + "grad_norm": 2.7757487297058105, + "learning_rate": 0.0013741626794258374, + "loss": 0.7462, + "step": 9810 + }, + { + "epoch": 15.66, + "grad_norm": 2.791318416595459, + "learning_rate": 0.0013735247208931422, + "loss": 0.7724, + "step": 9820 + }, + { + "epoch": 15.68, + "grad_norm": 2.722747802734375, + "learning_rate": 0.0013728867623604467, + "loss": 0.8533, + "step": 9830 + }, + { + "epoch": 15.69, + "grad_norm": 2.778831958770752, + "learning_rate": 0.0013722488038277513, + "loss": 0.7243, + "step": 9840 + }, + { + "epoch": 15.71, + "grad_norm": 2.3124783039093018, + "learning_rate": 0.0013716108452950558, + "loss": 0.87, + "step": 9850 + }, + { + "epoch": 15.73, + "grad_norm": 2.3077304363250732, + "learning_rate": 0.0013709728867623606, + "loss": 0.8462, + "step": 9860 + }, + { + "epoch": 15.74, + "grad_norm": 4.141488552093506, + "learning_rate": 0.0013703349282296652, + "loss": 0.8513, + "step": 9870 + }, + { + "epoch": 15.76, + "grad_norm": 2.998544454574585, + "learning_rate": 0.0013696969696969697, + "loss": 0.7472, + "step": 9880 + }, + { + "epoch": 15.77, + "grad_norm": 2.3463869094848633, + "learning_rate": 0.0013690590111642745, + "loss": 0.8363, + "step": 9890 + }, + { + "epoch": 15.79, + "grad_norm": 2.782196521759033, + "learning_rate": 0.001368421052631579, + "loss": 0.7076, + "step": 9900 + }, + { + "epoch": 15.81, + "grad_norm": 3.6227550506591797, + "learning_rate": 0.0013677830940988837, + "loss": 0.677, + "step": 9910 + }, + { + "epoch": 15.82, + "grad_norm": 3.1042935848236084, + "learning_rate": 0.0013671451355661882, + "loss": 0.8747, + "step": 9920 + }, + { + "epoch": 15.84, + "grad_norm": 2.9278554916381836, + "learning_rate": 0.001366507177033493, + "loss": 0.8876, + "step": 9930 + }, + { + "epoch": 15.85, + "grad_norm": 2.6750121116638184, + "learning_rate": 0.0013658692185007976, + "loss": 0.7374, + "step": 9940 + }, + { + "epoch": 15.87, + "grad_norm": 2.563796043395996, + "learning_rate": 0.0013652312599681021, + "loss": 0.6978, + "step": 9950 + }, + { + "epoch": 15.89, + "grad_norm": 2.839409112930298, + "learning_rate": 0.001364593301435407, + "loss": 0.7307, + "step": 9960 + }, + { + "epoch": 15.9, + "grad_norm": 2.651336908340454, + "learning_rate": 0.0013639553429027115, + "loss": 0.6775, + "step": 9970 + }, + { + "epoch": 15.92, + "grad_norm": 3.166914701461792, + "learning_rate": 0.001363317384370016, + "loss": 0.9131, + "step": 9980 + }, + { + "epoch": 15.93, + "grad_norm": 2.0613489151000977, + "learning_rate": 0.0013626794258373206, + "loss": 0.8707, + "step": 9990 + }, + { + "epoch": 15.95, + "grad_norm": 3.3213391304016113, + "learning_rate": 0.0013620414673046254, + "loss": 0.7631, + "step": 10000 + }, + { + "epoch": 15.96, + "grad_norm": 3.0203397274017334, + "learning_rate": 0.00136140350877193, + "loss": 0.8512, + "step": 10010 + }, + { + "epoch": 15.98, + "grad_norm": 2.070725202560425, + "learning_rate": 0.0013607655502392345, + "loss": 0.8031, + "step": 10020 + }, + { + "epoch": 16.0, + "grad_norm": 2.3090660572052, + "learning_rate": 0.0013601275917065393, + "loss": 0.7327, + "step": 10030 + }, + { + "epoch": 16.01, + "grad_norm": 2.2141530513763428, + "learning_rate": 0.0013594896331738438, + "loss": 0.6028, + "step": 10040 + }, + { + "epoch": 16.03, + "grad_norm": 2.6139416694641113, + "learning_rate": 0.0013588516746411484, + "loss": 0.5008, + "step": 10050 + }, + { + "epoch": 16.04, + "grad_norm": 4.001714706420898, + "learning_rate": 0.001358213716108453, + "loss": 0.5758, + "step": 10060 + }, + { + "epoch": 16.06, + "grad_norm": 3.021545886993408, + "learning_rate": 0.0013575757575757577, + "loss": 0.5445, + "step": 10070 + }, + { + "epoch": 16.08, + "grad_norm": 3.3995559215545654, + "learning_rate": 0.0013569377990430623, + "loss": 0.6745, + "step": 10080 + }, + { + "epoch": 16.09, + "grad_norm": 1.990356683731079, + "learning_rate": 0.0013562998405103669, + "loss": 0.57, + "step": 10090 + }, + { + "epoch": 16.11, + "grad_norm": 2.0073490142822266, + "learning_rate": 0.0013556618819776714, + "loss": 0.532, + "step": 10100 + }, + { + "epoch": 16.12, + "grad_norm": 3.74519681930542, + "learning_rate": 0.0013550239234449762, + "loss": 0.5446, + "step": 10110 + }, + { + "epoch": 16.14, + "grad_norm": 3.367241144180298, + "learning_rate": 0.0013543859649122808, + "loss": 0.6287, + "step": 10120 + }, + { + "epoch": 16.16, + "grad_norm": 2.2096097469329834, + "learning_rate": 0.0013537480063795853, + "loss": 0.6978, + "step": 10130 + }, + { + "epoch": 16.17, + "grad_norm": 1.8970123529434204, + "learning_rate": 0.0013531100478468901, + "loss": 0.6808, + "step": 10140 + }, + { + "epoch": 16.19, + "grad_norm": 3.6559560298919678, + "learning_rate": 0.0013524720893141947, + "loss": 0.6263, + "step": 10150 + }, + { + "epoch": 16.2, + "grad_norm": 2.3549561500549316, + "learning_rate": 0.0013518341307814992, + "loss": 0.6148, + "step": 10160 + }, + { + "epoch": 16.22, + "grad_norm": 1.73717200756073, + "learning_rate": 0.0013511961722488038, + "loss": 0.5918, + "step": 10170 + }, + { + "epoch": 16.24, + "grad_norm": 2.160614252090454, + "learning_rate": 0.0013505582137161086, + "loss": 0.6008, + "step": 10180 + }, + { + "epoch": 16.25, + "grad_norm": 3.5011887550354004, + "learning_rate": 0.0013499202551834131, + "loss": 0.7248, + "step": 10190 + }, + { + "epoch": 16.27, + "grad_norm": 2.6921088695526123, + "learning_rate": 0.0013492822966507177, + "loss": 0.7165, + "step": 10200 + }, + { + "epoch": 16.28, + "grad_norm": 1.8108500242233276, + "learning_rate": 0.0013486443381180225, + "loss": 0.5676, + "step": 10210 + }, + { + "epoch": 16.3, + "grad_norm": 2.7839293479919434, + "learning_rate": 0.001348006379585327, + "loss": 0.65, + "step": 10220 + }, + { + "epoch": 16.32, + "grad_norm": 2.6478052139282227, + "learning_rate": 0.0013473684210526316, + "loss": 0.7421, + "step": 10230 + }, + { + "epoch": 16.33, + "grad_norm": 2.5701370239257812, + "learning_rate": 0.0013467304625199362, + "loss": 0.7106, + "step": 10240 + }, + { + "epoch": 16.35, + "grad_norm": 2.3916168212890625, + "learning_rate": 0.001346092503987241, + "loss": 0.5939, + "step": 10250 + }, + { + "epoch": 16.36, + "grad_norm": 2.6145966053009033, + "learning_rate": 0.0013454545454545455, + "loss": 0.6589, + "step": 10260 + }, + { + "epoch": 16.38, + "grad_norm": 2.416173219680786, + "learning_rate": 0.00134481658692185, + "loss": 0.5956, + "step": 10270 + }, + { + "epoch": 16.4, + "grad_norm": 3.0522923469543457, + "learning_rate": 0.0013441786283891549, + "loss": 0.7672, + "step": 10280 + }, + { + "epoch": 16.41, + "grad_norm": 3.9606542587280273, + "learning_rate": 0.0013435406698564594, + "loss": 0.6298, + "step": 10290 + }, + { + "epoch": 16.43, + "grad_norm": 2.6333351135253906, + "learning_rate": 0.001342902711323764, + "loss": 0.758, + "step": 10300 + }, + { + "epoch": 16.44, + "grad_norm": 3.0208117961883545, + "learning_rate": 0.0013422647527910685, + "loss": 0.7356, + "step": 10310 + }, + { + "epoch": 16.46, + "grad_norm": 2.344989776611328, + "learning_rate": 0.0013416267942583733, + "loss": 0.6947, + "step": 10320 + }, + { + "epoch": 16.48, + "grad_norm": 1.7995355129241943, + "learning_rate": 0.0013409888357256779, + "loss": 0.7519, + "step": 10330 + }, + { + "epoch": 16.49, + "grad_norm": 1.8002946376800537, + "learning_rate": 0.0013403508771929824, + "loss": 0.5739, + "step": 10340 + }, + { + "epoch": 16.51, + "grad_norm": 2.094810962677002, + "learning_rate": 0.0013397129186602872, + "loss": 0.6613, + "step": 10350 + }, + { + "epoch": 16.52, + "grad_norm": 3.815561294555664, + "learning_rate": 0.0013390749601275918, + "loss": 0.6778, + "step": 10360 + }, + { + "epoch": 16.54, + "grad_norm": 2.7428698539733887, + "learning_rate": 0.0013384370015948964, + "loss": 0.6428, + "step": 10370 + }, + { + "epoch": 16.56, + "grad_norm": 2.3527848720550537, + "learning_rate": 0.001337799043062201, + "loss": 0.703, + "step": 10380 + }, + { + "epoch": 16.57, + "grad_norm": 2.305804967880249, + "learning_rate": 0.0013371610845295057, + "loss": 0.6954, + "step": 10390 + }, + { + "epoch": 16.59, + "grad_norm": 2.0628771781921387, + "learning_rate": 0.0013365231259968103, + "loss": 0.6499, + "step": 10400 + }, + { + "epoch": 16.6, + "grad_norm": 3.61171555519104, + "learning_rate": 0.0013358851674641148, + "loss": 0.767, + "step": 10410 + }, + { + "epoch": 16.62, + "grad_norm": 1.9354444742202759, + "learning_rate": 0.0013352472089314194, + "loss": 0.7736, + "step": 10420 + }, + { + "epoch": 16.63, + "grad_norm": 2.2509772777557373, + "learning_rate": 0.0013346092503987242, + "loss": 0.6941, + "step": 10430 + }, + { + "epoch": 16.65, + "grad_norm": 3.0013530254364014, + "learning_rate": 0.0013339712918660287, + "loss": 0.7728, + "step": 10440 + }, + { + "epoch": 16.67, + "grad_norm": 2.995089292526245, + "learning_rate": 0.0013333333333333333, + "loss": 0.6111, + "step": 10450 + }, + { + "epoch": 16.68, + "grad_norm": 2.9852423667907715, + "learning_rate": 0.001332695374800638, + "loss": 0.7604, + "step": 10460 + }, + { + "epoch": 16.7, + "grad_norm": 1.8430482149124146, + "learning_rate": 0.0013320574162679426, + "loss": 0.6248, + "step": 10470 + }, + { + "epoch": 16.71, + "grad_norm": 2.271106481552124, + "learning_rate": 0.0013314194577352472, + "loss": 0.6576, + "step": 10480 + }, + { + "epoch": 16.73, + "grad_norm": 3.168851852416992, + "learning_rate": 0.0013307814992025518, + "loss": 0.7078, + "step": 10490 + }, + { + "epoch": 16.75, + "grad_norm": 3.591390371322632, + "learning_rate": 0.0013301435406698565, + "loss": 0.7218, + "step": 10500 + }, + { + "epoch": 16.76, + "grad_norm": 2.9601821899414062, + "learning_rate": 0.001329505582137161, + "loss": 0.6973, + "step": 10510 + }, + { + "epoch": 16.78, + "grad_norm": 2.4465489387512207, + "learning_rate": 0.0013288676236044657, + "loss": 0.6793, + "step": 10520 + }, + { + "epoch": 16.79, + "grad_norm": 3.1582698822021484, + "learning_rate": 0.0013282296650717704, + "loss": 0.6457, + "step": 10530 + }, + { + "epoch": 16.81, + "grad_norm": 2.704655408859253, + "learning_rate": 0.001327591706539075, + "loss": 0.7386, + "step": 10540 + }, + { + "epoch": 16.83, + "grad_norm": 2.8489794731140137, + "learning_rate": 0.0013269537480063796, + "loss": 0.7504, + "step": 10550 + }, + { + "epoch": 16.84, + "grad_norm": 3.1505606174468994, + "learning_rate": 0.0013263157894736841, + "loss": 0.7292, + "step": 10560 + }, + { + "epoch": 16.86, + "grad_norm": 2.3454043865203857, + "learning_rate": 0.001325677830940989, + "loss": 0.7907, + "step": 10570 + }, + { + "epoch": 16.87, + "grad_norm": 3.128525972366333, + "learning_rate": 0.0013250398724082935, + "loss": 0.6484, + "step": 10580 + }, + { + "epoch": 16.89, + "grad_norm": 3.890327215194702, + "learning_rate": 0.001324401913875598, + "loss": 0.7387, + "step": 10590 + }, + { + "epoch": 16.91, + "grad_norm": 3.827643394470215, + "learning_rate": 0.0013237639553429028, + "loss": 0.8169, + "step": 10600 + }, + { + "epoch": 16.92, + "grad_norm": 2.757068395614624, + "learning_rate": 0.0013231259968102074, + "loss": 0.7266, + "step": 10610 + }, + { + "epoch": 16.94, + "grad_norm": 2.3636882305145264, + "learning_rate": 0.001322488038277512, + "loss": 0.8095, + "step": 10620 + }, + { + "epoch": 16.95, + "grad_norm": 2.3341169357299805, + "learning_rate": 0.0013218500797448165, + "loss": 0.757, + "step": 10630 + }, + { + "epoch": 16.97, + "grad_norm": 3.235461950302124, + "learning_rate": 0.0013212121212121213, + "loss": 0.7138, + "step": 10640 + }, + { + "epoch": 16.99, + "grad_norm": 3.797213315963745, + "learning_rate": 0.0013205741626794258, + "loss": 0.7479, + "step": 10650 + }, + { + "epoch": 17.0, + "grad_norm": 2.1070356369018555, + "learning_rate": 0.0013199362041467304, + "loss": 0.7187, + "step": 10660 + }, + { + "epoch": 17.02, + "grad_norm": 2.2326266765594482, + "learning_rate": 0.0013192982456140352, + "loss": 0.4728, + "step": 10670 + }, + { + "epoch": 17.03, + "grad_norm": 2.8324732780456543, + "learning_rate": 0.0013186602870813397, + "loss": 0.5915, + "step": 10680 + }, + { + "epoch": 17.05, + "grad_norm": 2.2015562057495117, + "learning_rate": 0.0013180223285486443, + "loss": 0.6181, + "step": 10690 + }, + { + "epoch": 17.07, + "grad_norm": 1.9790899753570557, + "learning_rate": 0.0013173843700159489, + "loss": 0.5131, + "step": 10700 + }, + { + "epoch": 17.08, + "grad_norm": 2.4350438117980957, + "learning_rate": 0.0013167464114832537, + "loss": 0.5058, + "step": 10710 + }, + { + "epoch": 17.1, + "grad_norm": 2.701519250869751, + "learning_rate": 0.0013161084529505582, + "loss": 0.5272, + "step": 10720 + }, + { + "epoch": 17.11, + "grad_norm": 2.316878318786621, + "learning_rate": 0.0013154704944178628, + "loss": 0.5489, + "step": 10730 + }, + { + "epoch": 17.13, + "grad_norm": 2.2858500480651855, + "learning_rate": 0.0013148325358851676, + "loss": 0.6087, + "step": 10740 + }, + { + "epoch": 17.15, + "grad_norm": 1.5047816038131714, + "learning_rate": 0.0013141945773524721, + "loss": 0.5726, + "step": 10750 + }, + { + "epoch": 17.16, + "grad_norm": 2.081256628036499, + "learning_rate": 0.0013135566188197767, + "loss": 0.5417, + "step": 10760 + }, + { + "epoch": 17.18, + "grad_norm": 1.8512243032455444, + "learning_rate": 0.0013129186602870812, + "loss": 0.7781, + "step": 10770 + }, + { + "epoch": 17.19, + "grad_norm": 2.651259422302246, + "learning_rate": 0.001312280701754386, + "loss": 0.6324, + "step": 10780 + }, + { + "epoch": 17.21, + "grad_norm": 1.8741660118103027, + "learning_rate": 0.0013116427432216906, + "loss": 0.5242, + "step": 10790 + }, + { + "epoch": 17.22, + "grad_norm": 2.223308801651001, + "learning_rate": 0.0013110047846889952, + "loss": 0.5837, + "step": 10800 + }, + { + "epoch": 17.24, + "grad_norm": 2.954585552215576, + "learning_rate": 0.0013103668261562997, + "loss": 0.7762, + "step": 10810 + }, + { + "epoch": 17.26, + "grad_norm": 2.075242519378662, + "learning_rate": 0.0013097288676236045, + "loss": 0.6981, + "step": 10820 + }, + { + "epoch": 17.27, + "grad_norm": 1.9512617588043213, + "learning_rate": 0.001309090909090909, + "loss": 0.5878, + "step": 10830 + }, + { + "epoch": 17.29, + "grad_norm": 2.4567389488220215, + "learning_rate": 0.0013084529505582136, + "loss": 0.5861, + "step": 10840 + }, + { + "epoch": 17.3, + "grad_norm": 2.4589033126831055, + "learning_rate": 0.0013078149920255184, + "loss": 0.5776, + "step": 10850 + }, + { + "epoch": 17.32, + "grad_norm": 3.0933573246002197, + "learning_rate": 0.001307177033492823, + "loss": 0.6265, + "step": 10860 + }, + { + "epoch": 17.34, + "grad_norm": 2.9563870429992676, + "learning_rate": 0.0013065390749601275, + "loss": 0.688, + "step": 10870 + }, + { + "epoch": 17.35, + "grad_norm": 2.6502304077148438, + "learning_rate": 0.001305901116427432, + "loss": 0.6685, + "step": 10880 + }, + { + "epoch": 17.37, + "grad_norm": 2.815063238143921, + "learning_rate": 0.0013052631578947369, + "loss": 0.651, + "step": 10890 + }, + { + "epoch": 17.38, + "grad_norm": 2.2861077785491943, + "learning_rate": 0.0013046251993620414, + "loss": 0.588, + "step": 10900 + }, + { + "epoch": 17.4, + "grad_norm": 2.0195345878601074, + "learning_rate": 0.001303987240829346, + "loss": 0.6129, + "step": 10910 + }, + { + "epoch": 17.42, + "grad_norm": 2.192063331604004, + "learning_rate": 0.0013033492822966508, + "loss": 0.6819, + "step": 10920 + }, + { + "epoch": 17.43, + "grad_norm": 3.0410258769989014, + "learning_rate": 0.0013027113237639553, + "loss": 0.6265, + "step": 10930 + }, + { + "epoch": 17.45, + "grad_norm": 1.9278006553649902, + "learning_rate": 0.00130207336523126, + "loss": 0.5889, + "step": 10940 + }, + { + "epoch": 17.46, + "grad_norm": 2.2657618522644043, + "learning_rate": 0.0013014354066985645, + "loss": 0.6477, + "step": 10950 + }, + { + "epoch": 17.48, + "grad_norm": 3.8989851474761963, + "learning_rate": 0.0013007974481658692, + "loss": 0.6831, + "step": 10960 + }, + { + "epoch": 17.5, + "grad_norm": 2.630307197570801, + "learning_rate": 0.0013001594896331738, + "loss": 0.7138, + "step": 10970 + }, + { + "epoch": 17.51, + "grad_norm": 2.4029276371002197, + "learning_rate": 0.0012995215311004784, + "loss": 0.6426, + "step": 10980 + }, + { + "epoch": 17.53, + "grad_norm": 2.127747058868408, + "learning_rate": 0.0012988835725677831, + "loss": 0.7466, + "step": 10990 + }, + { + "epoch": 17.54, + "grad_norm": 1.9066559076309204, + "learning_rate": 0.0012982456140350877, + "loss": 0.578, + "step": 11000 + }, + { + "epoch": 17.56, + "grad_norm": 2.585181713104248, + "learning_rate": 0.0012976076555023923, + "loss": 0.6634, + "step": 11010 + }, + { + "epoch": 17.58, + "grad_norm": 1.7290911674499512, + "learning_rate": 0.0012969696969696968, + "loss": 0.6455, + "step": 11020 + }, + { + "epoch": 17.59, + "grad_norm": 3.596162796020508, + "learning_rate": 0.0012963317384370016, + "loss": 0.6049, + "step": 11030 + }, + { + "epoch": 17.61, + "grad_norm": 1.8785613775253296, + "learning_rate": 0.0012956937799043062, + "loss": 0.6151, + "step": 11040 + }, + { + "epoch": 17.62, + "grad_norm": 3.3277037143707275, + "learning_rate": 0.0012950558213716107, + "loss": 0.5931, + "step": 11050 + }, + { + "epoch": 17.64, + "grad_norm": 2.3854615688323975, + "learning_rate": 0.0012944178628389155, + "loss": 0.6332, + "step": 11060 + }, + { + "epoch": 17.66, + "grad_norm": 2.8721373081207275, + "learning_rate": 0.00129377990430622, + "loss": 0.731, + "step": 11070 + }, + { + "epoch": 17.67, + "grad_norm": 3.060612440109253, + "learning_rate": 0.0012931419457735246, + "loss": 0.7137, + "step": 11080 + }, + { + "epoch": 17.69, + "grad_norm": 2.5586180686950684, + "learning_rate": 0.0012925039872408292, + "loss": 0.5945, + "step": 11090 + }, + { + "epoch": 17.7, + "grad_norm": 2.5695533752441406, + "learning_rate": 0.001291866028708134, + "loss": 0.7658, + "step": 11100 + }, + { + "epoch": 17.72, + "grad_norm": 4.104732036590576, + "learning_rate": 0.0012912280701754385, + "loss": 0.7704, + "step": 11110 + }, + { + "epoch": 17.74, + "grad_norm": 3.1808011531829834, + "learning_rate": 0.001290590111642743, + "loss": 0.6476, + "step": 11120 + }, + { + "epoch": 17.75, + "grad_norm": 2.210597038269043, + "learning_rate": 0.0012899521531100477, + "loss": 0.722, + "step": 11130 + }, + { + "epoch": 17.77, + "grad_norm": 2.6710522174835205, + "learning_rate": 0.0012893141945773524, + "loss": 0.6275, + "step": 11140 + }, + { + "epoch": 17.78, + "grad_norm": 2.2379961013793945, + "learning_rate": 0.001288676236044657, + "loss": 0.6321, + "step": 11150 + }, + { + "epoch": 17.8, + "grad_norm": 2.719963312149048, + "learning_rate": 0.0012880382775119616, + "loss": 0.6912, + "step": 11160 + }, + { + "epoch": 17.81, + "grad_norm": 2.1712732315063477, + "learning_rate": 0.0012874003189792664, + "loss": 0.6567, + "step": 11170 + }, + { + "epoch": 17.83, + "grad_norm": 4.386512279510498, + "learning_rate": 0.001286762360446571, + "loss": 0.6933, + "step": 11180 + }, + { + "epoch": 17.85, + "grad_norm": 4.8913373947143555, + "learning_rate": 0.0012861244019138755, + "loss": 0.6339, + "step": 11190 + }, + { + "epoch": 17.86, + "grad_norm": 3.154282569885254, + "learning_rate": 0.00128548644338118, + "loss": 0.6558, + "step": 11200 + }, + { + "epoch": 17.88, + "grad_norm": 4.05545711517334, + "learning_rate": 0.001284848484848485, + "loss": 0.7379, + "step": 11210 + }, + { + "epoch": 17.89, + "grad_norm": 2.9737448692321777, + "learning_rate": 0.0012842105263157896, + "loss": 0.7563, + "step": 11220 + }, + { + "epoch": 17.91, + "grad_norm": 4.375244617462158, + "learning_rate": 0.0012835725677830942, + "loss": 0.7779, + "step": 11230 + }, + { + "epoch": 17.93, + "grad_norm": 2.775324821472168, + "learning_rate": 0.001282934609250399, + "loss": 0.6884, + "step": 11240 + }, + { + "epoch": 17.94, + "grad_norm": 2.1626110076904297, + "learning_rate": 0.0012822966507177035, + "loss": 0.7123, + "step": 11250 + }, + { + "epoch": 17.96, + "grad_norm": 3.951596260070801, + "learning_rate": 0.001281658692185008, + "loss": 0.7645, + "step": 11260 + }, + { + "epoch": 17.97, + "grad_norm": 2.271362066268921, + "learning_rate": 0.0012810207336523126, + "loss": 0.8049, + "step": 11270 + }, + { + "epoch": 17.99, + "grad_norm": 3.3153727054595947, + "learning_rate": 0.0012803827751196174, + "loss": 0.7484, + "step": 11280 + }, + { + "epoch": 18.01, + "grad_norm": 2.451831340789795, + "learning_rate": 0.001279744816586922, + "loss": 0.6549, + "step": 11290 + }, + { + "epoch": 18.02, + "grad_norm": 2.1044692993164062, + "learning_rate": 0.0012791068580542265, + "loss": 0.5569, + "step": 11300 + }, + { + "epoch": 18.04, + "grad_norm": 5.281918525695801, + "learning_rate": 0.0012784688995215313, + "loss": 0.5877, + "step": 11310 + }, + { + "epoch": 18.05, + "grad_norm": 2.706597328186035, + "learning_rate": 0.0012778309409888359, + "loss": 0.541, + "step": 11320 + }, + { + "epoch": 18.07, + "grad_norm": 2.2525746822357178, + "learning_rate": 0.0012771929824561404, + "loss": 0.5092, + "step": 11330 + }, + { + "epoch": 18.09, + "grad_norm": 2.823735475540161, + "learning_rate": 0.001276555023923445, + "loss": 0.5205, + "step": 11340 + }, + { + "epoch": 18.1, + "grad_norm": 2.758739948272705, + "learning_rate": 0.0012759170653907498, + "loss": 0.5012, + "step": 11350 + }, + { + "epoch": 18.12, + "grad_norm": 3.0362417697906494, + "learning_rate": 0.0012752791068580543, + "loss": 0.6093, + "step": 11360 + }, + { + "epoch": 18.13, + "grad_norm": 1.967462182044983, + "learning_rate": 0.001274641148325359, + "loss": 0.6387, + "step": 11370 + }, + { + "epoch": 18.15, + "grad_norm": 2.352168083190918, + "learning_rate": 0.0012740031897926637, + "loss": 0.6707, + "step": 11380 + }, + { + "epoch": 18.17, + "grad_norm": 1.6705100536346436, + "learning_rate": 0.0012733652312599683, + "loss": 0.6675, + "step": 11390 + }, + { + "epoch": 18.18, + "grad_norm": 2.1992321014404297, + "learning_rate": 0.0012727272727272728, + "loss": 0.7195, + "step": 11400 + }, + { + "epoch": 18.2, + "grad_norm": 1.7198143005371094, + "learning_rate": 0.0012720893141945774, + "loss": 0.6287, + "step": 11410 + }, + { + "epoch": 18.21, + "grad_norm": 2.229097604751587, + "learning_rate": 0.0012714513556618822, + "loss": 0.5207, + "step": 11420 + }, + { + "epoch": 18.23, + "grad_norm": 3.074547529220581, + "learning_rate": 0.0012708133971291867, + "loss": 0.5929, + "step": 11430 + }, + { + "epoch": 18.25, + "grad_norm": 2.6688926219940186, + "learning_rate": 0.0012701754385964913, + "loss": 0.5927, + "step": 11440 + }, + { + "epoch": 18.26, + "grad_norm": 2.6187679767608643, + "learning_rate": 0.0012695374800637958, + "loss": 0.5733, + "step": 11450 + }, + { + "epoch": 18.28, + "grad_norm": 2.056699752807617, + "learning_rate": 0.0012688995215311006, + "loss": 0.6502, + "step": 11460 + }, + { + "epoch": 18.29, + "grad_norm": 3.4182140827178955, + "learning_rate": 0.0012682615629984052, + "loss": 0.5932, + "step": 11470 + }, + { + "epoch": 18.31, + "grad_norm": 2.9311532974243164, + "learning_rate": 0.0012676236044657097, + "loss": 0.5568, + "step": 11480 + }, + { + "epoch": 18.33, + "grad_norm": 1.7414332628250122, + "learning_rate": 0.0012669856459330145, + "loss": 0.5494, + "step": 11490 + }, + { + "epoch": 18.34, + "grad_norm": 2.6820008754730225, + "learning_rate": 0.001266347687400319, + "loss": 0.7878, + "step": 11500 + }, + { + "epoch": 18.36, + "grad_norm": 2.811760663986206, + "learning_rate": 0.0012657097288676237, + "loss": 0.5324, + "step": 11510 + }, + { + "epoch": 18.37, + "grad_norm": 3.026895046234131, + "learning_rate": 0.0012650717703349282, + "loss": 0.7353, + "step": 11520 + }, + { + "epoch": 18.39, + "grad_norm": 2.6072068214416504, + "learning_rate": 0.001264433811802233, + "loss": 0.6509, + "step": 11530 + }, + { + "epoch": 18.41, + "grad_norm": 2.0730879306793213, + "learning_rate": 0.0012637958532695376, + "loss": 0.5353, + "step": 11540 + }, + { + "epoch": 18.42, + "grad_norm": 3.863426923751831, + "learning_rate": 0.0012631578947368421, + "loss": 0.6443, + "step": 11550 + }, + { + "epoch": 18.44, + "grad_norm": 1.9193871021270752, + "learning_rate": 0.001262519936204147, + "loss": 0.5827, + "step": 11560 + }, + { + "epoch": 18.45, + "grad_norm": 3.126490354537964, + "learning_rate": 0.0012618819776714515, + "loss": 0.6556, + "step": 11570 + }, + { + "epoch": 18.47, + "grad_norm": 3.189641237258911, + "learning_rate": 0.001261244019138756, + "loss": 0.693, + "step": 11580 + }, + { + "epoch": 18.48, + "grad_norm": 3.374671220779419, + "learning_rate": 0.0012606060606060606, + "loss": 0.6156, + "step": 11590 + }, + { + "epoch": 18.5, + "grad_norm": 1.7221401929855347, + "learning_rate": 0.0012599681020733654, + "loss": 0.6865, + "step": 11600 + }, + { + "epoch": 18.52, + "grad_norm": 1.6494935750961304, + "learning_rate": 0.00125933014354067, + "loss": 0.6058, + "step": 11610 + }, + { + "epoch": 18.53, + "grad_norm": 2.8912765979766846, + "learning_rate": 0.0012586921850079745, + "loss": 0.6022, + "step": 11620 + }, + { + "epoch": 18.55, + "grad_norm": 2.1293585300445557, + "learning_rate": 0.0012580542264752793, + "loss": 0.5817, + "step": 11630 + }, + { + "epoch": 18.56, + "grad_norm": 3.3972530364990234, + "learning_rate": 0.0012574162679425838, + "loss": 0.6279, + "step": 11640 + }, + { + "epoch": 18.58, + "grad_norm": 4.464833736419678, + "learning_rate": 0.0012567783094098884, + "loss": 0.6584, + "step": 11650 + }, + { + "epoch": 18.6, + "grad_norm": 3.3168396949768066, + "learning_rate": 0.001256140350877193, + "loss": 0.7492, + "step": 11660 + }, + { + "epoch": 18.61, + "grad_norm": 1.7018378973007202, + "learning_rate": 0.0012555023923444977, + "loss": 0.6064, + "step": 11670 + }, + { + "epoch": 18.63, + "grad_norm": 2.8935000896453857, + "learning_rate": 0.0012548644338118023, + "loss": 0.6793, + "step": 11680 + }, + { + "epoch": 18.64, + "grad_norm": 3.3293614387512207, + "learning_rate": 0.0012542264752791069, + "loss": 0.6478, + "step": 11690 + }, + { + "epoch": 18.66, + "grad_norm": 2.4878737926483154, + "learning_rate": 0.0012535885167464116, + "loss": 0.5137, + "step": 11700 + }, + { + "epoch": 18.68, + "grad_norm": 2.662574529647827, + "learning_rate": 0.0012529505582137162, + "loss": 0.6051, + "step": 11710 + }, + { + "epoch": 18.69, + "grad_norm": 2.5218799114227295, + "learning_rate": 0.0012523125996810208, + "loss": 0.6176, + "step": 11720 + }, + { + "epoch": 18.71, + "grad_norm": 2.6172173023223877, + "learning_rate": 0.0012516746411483253, + "loss": 0.627, + "step": 11730 + }, + { + "epoch": 18.72, + "grad_norm": 2.4706501960754395, + "learning_rate": 0.0012510366826156301, + "loss": 0.6826, + "step": 11740 + }, + { + "epoch": 18.74, + "grad_norm": 1.9907801151275635, + "learning_rate": 0.0012503987240829347, + "loss": 0.6528, + "step": 11750 + }, + { + "epoch": 18.76, + "grad_norm": 4.803826808929443, + "learning_rate": 0.0012497607655502392, + "loss": 0.599, + "step": 11760 + }, + { + "epoch": 18.77, + "grad_norm": 2.5642504692077637, + "learning_rate": 0.001249122807017544, + "loss": 0.6657, + "step": 11770 + }, + { + "epoch": 18.79, + "grad_norm": 4.334081649780273, + "learning_rate": 0.0012484848484848486, + "loss": 0.6764, + "step": 11780 + }, + { + "epoch": 18.8, + "grad_norm": 2.7521369457244873, + "learning_rate": 0.0012478468899521531, + "loss": 0.6906, + "step": 11790 + }, + { + "epoch": 18.82, + "grad_norm": 2.13214373588562, + "learning_rate": 0.0012472089314194577, + "loss": 0.6823, + "step": 11800 + }, + { + "epoch": 18.84, + "grad_norm": 3.1697006225585938, + "learning_rate": 0.0012465709728867625, + "loss": 0.6231, + "step": 11810 + }, + { + "epoch": 18.85, + "grad_norm": 2.6898703575134277, + "learning_rate": 0.001245933014354067, + "loss": 0.8077, + "step": 11820 + }, + { + "epoch": 18.87, + "grad_norm": 3.177943706512451, + "learning_rate": 0.0012452950558213716, + "loss": 0.634, + "step": 11830 + }, + { + "epoch": 18.88, + "grad_norm": 2.5923023223876953, + "learning_rate": 0.0012446570972886762, + "loss": 0.5839, + "step": 11840 + }, + { + "epoch": 18.9, + "grad_norm": 1.8359884023666382, + "learning_rate": 0.001244019138755981, + "loss": 0.5992, + "step": 11850 + }, + { + "epoch": 18.92, + "grad_norm": 2.252401828765869, + "learning_rate": 0.0012433811802232855, + "loss": 0.5877, + "step": 11860 + }, + { + "epoch": 18.93, + "grad_norm": 2.945974588394165, + "learning_rate": 0.00124274322169059, + "loss": 0.8981, + "step": 11870 + }, + { + "epoch": 18.95, + "grad_norm": 2.5869786739349365, + "learning_rate": 0.0012421052631578949, + "loss": 0.7204, + "step": 11880 + }, + { + "epoch": 18.96, + "grad_norm": 2.0073652267456055, + "learning_rate": 0.0012414673046251994, + "loss": 0.7112, + "step": 11890 + }, + { + "epoch": 18.98, + "grad_norm": 2.726731777191162, + "learning_rate": 0.001240829346092504, + "loss": 0.6721, + "step": 11900 + }, + { + "epoch": 19.0, + "grad_norm": 2.646214246749878, + "learning_rate": 0.0012401913875598085, + "loss": 0.6353, + "step": 11910 + }, + { + "epoch": 19.01, + "grad_norm": 1.7647764682769775, + "learning_rate": 0.0012395534290271133, + "loss": 0.5528, + "step": 11920 + }, + { + "epoch": 19.03, + "grad_norm": 1.6511797904968262, + "learning_rate": 0.0012389154704944179, + "loss": 0.5311, + "step": 11930 + }, + { + "epoch": 19.04, + "grad_norm": 3.120816707611084, + "learning_rate": 0.0012382775119617224, + "loss": 0.4873, + "step": 11940 + }, + { + "epoch": 19.06, + "grad_norm": 1.2211092710494995, + "learning_rate": 0.0012376395534290272, + "loss": 0.5031, + "step": 11950 + }, + { + "epoch": 19.07, + "grad_norm": 2.295135021209717, + "learning_rate": 0.0012370015948963318, + "loss": 0.5656, + "step": 11960 + }, + { + "epoch": 19.09, + "grad_norm": 1.805337905883789, + "learning_rate": 0.0012363636363636364, + "loss": 0.5322, + "step": 11970 + }, + { + "epoch": 19.11, + "grad_norm": 1.8517502546310425, + "learning_rate": 0.001235725677830941, + "loss": 0.5287, + "step": 11980 + }, + { + "epoch": 19.12, + "grad_norm": 2.464036464691162, + "learning_rate": 0.0012350877192982457, + "loss": 0.5865, + "step": 11990 + }, + { + "epoch": 19.14, + "grad_norm": 1.964254379272461, + "learning_rate": 0.0012344497607655503, + "loss": 0.512, + "step": 12000 + }, + { + "epoch": 19.15, + "grad_norm": 2.386060953140259, + "learning_rate": 0.0012338118022328548, + "loss": 0.6416, + "step": 12010 + }, + { + "epoch": 19.17, + "grad_norm": 2.4723477363586426, + "learning_rate": 0.0012331738437001596, + "loss": 0.5147, + "step": 12020 + }, + { + "epoch": 19.19, + "grad_norm": 1.7513999938964844, + "learning_rate": 0.0012325358851674642, + "loss": 0.5413, + "step": 12030 + }, + { + "epoch": 19.2, + "grad_norm": 1.3666512966156006, + "learning_rate": 0.0012318979266347687, + "loss": 0.619, + "step": 12040 + }, + { + "epoch": 19.22, + "grad_norm": 2.0821938514709473, + "learning_rate": 0.0012312599681020733, + "loss": 0.5351, + "step": 12050 + }, + { + "epoch": 19.23, + "grad_norm": 2.403721570968628, + "learning_rate": 0.001230622009569378, + "loss": 0.5107, + "step": 12060 + }, + { + "epoch": 19.25, + "grad_norm": 2.3420348167419434, + "learning_rate": 0.0012299840510366826, + "loss": 0.5017, + "step": 12070 + }, + { + "epoch": 19.27, + "grad_norm": 1.8931384086608887, + "learning_rate": 0.0012293460925039872, + "loss": 0.5291, + "step": 12080 + }, + { + "epoch": 19.28, + "grad_norm": 1.815537691116333, + "learning_rate": 0.001228708133971292, + "loss": 0.6106, + "step": 12090 + }, + { + "epoch": 19.3, + "grad_norm": 2.327855348587036, + "learning_rate": 0.0012280701754385965, + "loss": 0.4726, + "step": 12100 + }, + { + "epoch": 19.31, + "grad_norm": 2.388517141342163, + "learning_rate": 0.001227432216905901, + "loss": 0.5529, + "step": 12110 + }, + { + "epoch": 19.33, + "grad_norm": 3.0234811305999756, + "learning_rate": 0.0012267942583732057, + "loss": 0.6617, + "step": 12120 + }, + { + "epoch": 19.35, + "grad_norm": 3.1139323711395264, + "learning_rate": 0.0012261562998405104, + "loss": 0.5682, + "step": 12130 + }, + { + "epoch": 19.36, + "grad_norm": 3.9127554893493652, + "learning_rate": 0.001225518341307815, + "loss": 0.6042, + "step": 12140 + }, + { + "epoch": 19.38, + "grad_norm": 3.9032232761383057, + "learning_rate": 0.0012248803827751196, + "loss": 0.6022, + "step": 12150 + }, + { + "epoch": 19.39, + "grad_norm": 1.7738832235336304, + "learning_rate": 0.0012242424242424241, + "loss": 0.5483, + "step": 12160 + }, + { + "epoch": 19.41, + "grad_norm": 3.865807294845581, + "learning_rate": 0.001223604465709729, + "loss": 0.6323, + "step": 12170 + }, + { + "epoch": 19.43, + "grad_norm": 2.653740406036377, + "learning_rate": 0.0012229665071770335, + "loss": 0.6286, + "step": 12180 + }, + { + "epoch": 19.44, + "grad_norm": 1.727924108505249, + "learning_rate": 0.001222328548644338, + "loss": 0.5744, + "step": 12190 + }, + { + "epoch": 19.46, + "grad_norm": 2.1040127277374268, + "learning_rate": 0.0012216905901116428, + "loss": 0.5941, + "step": 12200 + }, + { + "epoch": 19.47, + "grad_norm": 2.8161518573760986, + "learning_rate": 0.0012210526315789474, + "loss": 0.542, + "step": 12210 + }, + { + "epoch": 19.49, + "grad_norm": 2.4196929931640625, + "learning_rate": 0.001220414673046252, + "loss": 0.5612, + "step": 12220 + }, + { + "epoch": 19.51, + "grad_norm": 2.2649526596069336, + "learning_rate": 0.0012197767145135565, + "loss": 0.4941, + "step": 12230 + }, + { + "epoch": 19.52, + "grad_norm": 3.1256422996520996, + "learning_rate": 0.0012191387559808613, + "loss": 0.551, + "step": 12240 + }, + { + "epoch": 19.54, + "grad_norm": 2.1946921348571777, + "learning_rate": 0.0012185007974481658, + "loss": 0.5294, + "step": 12250 + }, + { + "epoch": 19.55, + "grad_norm": 2.897484064102173, + "learning_rate": 0.0012178628389154704, + "loss": 0.7128, + "step": 12260 + }, + { + "epoch": 19.57, + "grad_norm": 2.024834156036377, + "learning_rate": 0.0012172248803827752, + "loss": 0.5942, + "step": 12270 + }, + { + "epoch": 19.59, + "grad_norm": 1.7685123682022095, + "learning_rate": 0.0012165869218500797, + "loss": 0.5827, + "step": 12280 + }, + { + "epoch": 19.6, + "grad_norm": 2.0234525203704834, + "learning_rate": 0.0012159489633173843, + "loss": 0.6377, + "step": 12290 + }, + { + "epoch": 19.62, + "grad_norm": 4.358128070831299, + "learning_rate": 0.0012153110047846889, + "loss": 0.7277, + "step": 12300 + }, + { + "epoch": 19.63, + "grad_norm": 2.4699137210845947, + "learning_rate": 0.0012146730462519937, + "loss": 0.5023, + "step": 12310 + }, + { + "epoch": 19.65, + "grad_norm": 2.4853904247283936, + "learning_rate": 0.0012140350877192982, + "loss": 0.585, + "step": 12320 + }, + { + "epoch": 19.67, + "grad_norm": 2.560833215713501, + "learning_rate": 0.0012133971291866028, + "loss": 0.6899, + "step": 12330 + }, + { + "epoch": 19.68, + "grad_norm": 3.2827863693237305, + "learning_rate": 0.0012127591706539076, + "loss": 0.6114, + "step": 12340 + }, + { + "epoch": 19.7, + "grad_norm": 2.643315553665161, + "learning_rate": 0.0012121212121212121, + "loss": 0.6714, + "step": 12350 + }, + { + "epoch": 19.71, + "grad_norm": 2.2757856845855713, + "learning_rate": 0.0012114832535885167, + "loss": 0.5607, + "step": 12360 + }, + { + "epoch": 19.73, + "grad_norm": 2.054987668991089, + "learning_rate": 0.0012108452950558212, + "loss": 0.5481, + "step": 12370 + }, + { + "epoch": 19.74, + "grad_norm": 2.3429064750671387, + "learning_rate": 0.001210207336523126, + "loss": 0.6629, + "step": 12380 + }, + { + "epoch": 19.76, + "grad_norm": 1.6089274883270264, + "learning_rate": 0.0012095693779904306, + "loss": 0.6237, + "step": 12390 + }, + { + "epoch": 19.78, + "grad_norm": 4.483922004699707, + "learning_rate": 0.0012089314194577352, + "loss": 0.6287, + "step": 12400 + }, + { + "epoch": 19.79, + "grad_norm": 2.133923292160034, + "learning_rate": 0.00120829346092504, + "loss": 0.6648, + "step": 12410 + }, + { + "epoch": 19.81, + "grad_norm": 2.3778302669525146, + "learning_rate": 0.0012076555023923445, + "loss": 0.639, + "step": 12420 + }, + { + "epoch": 19.82, + "grad_norm": 2.589620351791382, + "learning_rate": 0.001207017543859649, + "loss": 0.6414, + "step": 12430 + }, + { + "epoch": 19.84, + "grad_norm": 1.5527355670928955, + "learning_rate": 0.0012063795853269536, + "loss": 0.6587, + "step": 12440 + }, + { + "epoch": 19.86, + "grad_norm": 3.8891091346740723, + "learning_rate": 0.0012057416267942584, + "loss": 0.6672, + "step": 12450 + }, + { + "epoch": 19.87, + "grad_norm": 2.5779592990875244, + "learning_rate": 0.001205103668261563, + "loss": 0.6224, + "step": 12460 + }, + { + "epoch": 19.89, + "grad_norm": 2.218827486038208, + "learning_rate": 0.0012044657097288675, + "loss": 0.6988, + "step": 12470 + }, + { + "epoch": 19.9, + "grad_norm": 3.827039957046509, + "learning_rate": 0.0012038277511961723, + "loss": 0.5554, + "step": 12480 + }, + { + "epoch": 19.92, + "grad_norm": 3.635878562927246, + "learning_rate": 0.0012031897926634769, + "loss": 0.6447, + "step": 12490 + }, + { + "epoch": 19.94, + "grad_norm": 1.9988211393356323, + "learning_rate": 0.0012025518341307814, + "loss": 0.5721, + "step": 12500 + }, + { + "epoch": 19.95, + "grad_norm": 4.294229984283447, + "learning_rate": 0.001201913875598086, + "loss": 0.6425, + "step": 12510 + }, + { + "epoch": 19.97, + "grad_norm": 2.2810208797454834, + "learning_rate": 0.0012012759170653908, + "loss": 0.609, + "step": 12520 + }, + { + "epoch": 19.98, + "grad_norm": 2.6013190746307373, + "learning_rate": 0.0012006379585326953, + "loss": 0.615, + "step": 12530 + }, + { + "epoch": 20.0, + "grad_norm": 3.9176077842712402, + "learning_rate": 0.0012, + "loss": 0.6626, + "step": 12540 + }, + { + "epoch": 20.02, + "grad_norm": 1.4916435480117798, + "learning_rate": 0.0011993620414673045, + "loss": 0.4802, + "step": 12550 + }, + { + "epoch": 20.03, + "grad_norm": 1.8869787454605103, + "learning_rate": 0.0011987240829346092, + "loss": 0.4899, + "step": 12560 + }, + { + "epoch": 20.05, + "grad_norm": 1.645322561264038, + "learning_rate": 0.0011980861244019138, + "loss": 0.4875, + "step": 12570 + }, + { + "epoch": 20.06, + "grad_norm": 3.0053963661193848, + "learning_rate": 0.0011974481658692184, + "loss": 0.5344, + "step": 12580 + }, + { + "epoch": 20.08, + "grad_norm": 1.9125926494598389, + "learning_rate": 0.0011968102073365231, + "loss": 0.5512, + "step": 12590 + }, + { + "epoch": 20.1, + "grad_norm": 2.4130938053131104, + "learning_rate": 0.0011961722488038277, + "loss": 0.6046, + "step": 12600 + }, + { + "epoch": 20.11, + "grad_norm": 2.648345947265625, + "learning_rate": 0.0011955342902711323, + "loss": 0.5085, + "step": 12610 + }, + { + "epoch": 20.13, + "grad_norm": 3.288292646408081, + "learning_rate": 0.0011948963317384368, + "loss": 0.5101, + "step": 12620 + }, + { + "epoch": 20.14, + "grad_norm": 2.3620495796203613, + "learning_rate": 0.0011942583732057416, + "loss": 0.502, + "step": 12630 + }, + { + "epoch": 20.16, + "grad_norm": 2.2232260704040527, + "learning_rate": 0.0011936204146730462, + "loss": 0.5246, + "step": 12640 + }, + { + "epoch": 20.18, + "grad_norm": 3.120986223220825, + "learning_rate": 0.0011929824561403507, + "loss": 0.621, + "step": 12650 + }, + { + "epoch": 20.19, + "grad_norm": 1.5366686582565308, + "learning_rate": 0.0011923444976076555, + "loss": 0.4844, + "step": 12660 + }, + { + "epoch": 20.21, + "grad_norm": 2.0947461128234863, + "learning_rate": 0.00119170653907496, + "loss": 0.6025, + "step": 12670 + }, + { + "epoch": 20.22, + "grad_norm": 1.5178321599960327, + "learning_rate": 0.0011910685805422646, + "loss": 0.5421, + "step": 12680 + }, + { + "epoch": 20.24, + "grad_norm": 2.9309802055358887, + "learning_rate": 0.0011904306220095692, + "loss": 0.4816, + "step": 12690 + }, + { + "epoch": 20.26, + "grad_norm": 1.6734910011291504, + "learning_rate": 0.0011897926634768742, + "loss": 0.668, + "step": 12700 + }, + { + "epoch": 20.27, + "grad_norm": 1.755245327949524, + "learning_rate": 0.0011891547049441788, + "loss": 0.5436, + "step": 12710 + }, + { + "epoch": 20.29, + "grad_norm": 2.458543062210083, + "learning_rate": 0.0011885167464114833, + "loss": 0.5392, + "step": 12720 + }, + { + "epoch": 20.3, + "grad_norm": 2.2478575706481934, + "learning_rate": 0.001187878787878788, + "loss": 0.5686, + "step": 12730 + }, + { + "epoch": 20.32, + "grad_norm": 1.2976596355438232, + "learning_rate": 0.0011872408293460927, + "loss": 0.4879, + "step": 12740 + }, + { + "epoch": 20.33, + "grad_norm": 2.250114917755127, + "learning_rate": 0.0011866028708133972, + "loss": 0.6064, + "step": 12750 + }, + { + "epoch": 20.35, + "grad_norm": 2.2818796634674072, + "learning_rate": 0.0011859649122807018, + "loss": 0.5619, + "step": 12760 + }, + { + "epoch": 20.37, + "grad_norm": 2.3820178508758545, + "learning_rate": 0.0011853269537480066, + "loss": 0.5126, + "step": 12770 + }, + { + "epoch": 20.38, + "grad_norm": 3.2392003536224365, + "learning_rate": 0.0011846889952153111, + "loss": 0.544, + "step": 12780 + }, + { + "epoch": 20.4, + "grad_norm": 3.075946092605591, + "learning_rate": 0.0011840510366826157, + "loss": 0.6301, + "step": 12790 + }, + { + "epoch": 20.41, + "grad_norm": 4.1104230880737305, + "learning_rate": 0.0011834130781499205, + "loss": 0.6136, + "step": 12800 + }, + { + "epoch": 20.43, + "grad_norm": 1.715682029724121, + "learning_rate": 0.001182775119617225, + "loss": 0.5667, + "step": 12810 + }, + { + "epoch": 20.45, + "grad_norm": 1.5427650213241577, + "learning_rate": 0.0011821371610845296, + "loss": 0.6001, + "step": 12820 + }, + { + "epoch": 20.46, + "grad_norm": 2.109271764755249, + "learning_rate": 0.0011814992025518342, + "loss": 0.5762, + "step": 12830 + }, + { + "epoch": 20.48, + "grad_norm": 1.6117897033691406, + "learning_rate": 0.001180861244019139, + "loss": 0.5072, + "step": 12840 + }, + { + "epoch": 20.49, + "grad_norm": 2.525860071182251, + "learning_rate": 0.0011802232854864435, + "loss": 0.6302, + "step": 12850 + }, + { + "epoch": 20.51, + "grad_norm": 1.8826050758361816, + "learning_rate": 0.001179585326953748, + "loss": 0.5483, + "step": 12860 + }, + { + "epoch": 20.53, + "grad_norm": 1.7801835536956787, + "learning_rate": 0.0011789473684210526, + "loss": 0.5239, + "step": 12870 + }, + { + "epoch": 20.54, + "grad_norm": 3.124882459640503, + "learning_rate": 0.0011783094098883574, + "loss": 0.4724, + "step": 12880 + }, + { + "epoch": 20.56, + "grad_norm": 2.8056161403656006, + "learning_rate": 0.001177671451355662, + "loss": 0.5681, + "step": 12890 + }, + { + "epoch": 20.57, + "grad_norm": 2.165199041366577, + "learning_rate": 0.0011770334928229665, + "loss": 0.6012, + "step": 12900 + }, + { + "epoch": 20.59, + "grad_norm": 2.297102451324463, + "learning_rate": 0.0011763955342902713, + "loss": 0.517, + "step": 12910 + }, + { + "epoch": 20.61, + "grad_norm": 2.99562668800354, + "learning_rate": 0.0011757575757575759, + "loss": 0.5696, + "step": 12920 + }, + { + "epoch": 20.62, + "grad_norm": 2.0757791996002197, + "learning_rate": 0.0011751196172248804, + "loss": 0.6102, + "step": 12930 + }, + { + "epoch": 20.64, + "grad_norm": 2.441718816757202, + "learning_rate": 0.001174481658692185, + "loss": 0.6066, + "step": 12940 + }, + { + "epoch": 20.65, + "grad_norm": 1.4816184043884277, + "learning_rate": 0.0011738437001594898, + "loss": 0.5083, + "step": 12950 + }, + { + "epoch": 20.67, + "grad_norm": 2.349161386489868, + "learning_rate": 0.0011732057416267943, + "loss": 0.5705, + "step": 12960 + }, + { + "epoch": 20.69, + "grad_norm": 2.0626585483551025, + "learning_rate": 0.001172567783094099, + "loss": 0.5551, + "step": 12970 + }, + { + "epoch": 20.7, + "grad_norm": 2.3144423961639404, + "learning_rate": 0.0011719298245614037, + "loss": 0.7276, + "step": 12980 + }, + { + "epoch": 20.72, + "grad_norm": 2.9268980026245117, + "learning_rate": 0.0011712918660287083, + "loss": 0.6176, + "step": 12990 + }, + { + "epoch": 20.73, + "grad_norm": 2.339564323425293, + "learning_rate": 0.0011706539074960128, + "loss": 0.6206, + "step": 13000 + }, + { + "epoch": 20.75, + "grad_norm": 2.358088493347168, + "learning_rate": 0.0011700159489633174, + "loss": 0.5862, + "step": 13010 + }, + { + "epoch": 20.77, + "grad_norm": 2.127462863922119, + "learning_rate": 0.0011693779904306222, + "loss": 0.5488, + "step": 13020 + }, + { + "epoch": 20.78, + "grad_norm": 3.3488762378692627, + "learning_rate": 0.0011687400318979267, + "loss": 0.6345, + "step": 13030 + }, + { + "epoch": 20.8, + "grad_norm": 3.2236621379852295, + "learning_rate": 0.0011681020733652313, + "loss": 0.5897, + "step": 13040 + }, + { + "epoch": 20.81, + "grad_norm": 3.0065135955810547, + "learning_rate": 0.001167464114832536, + "loss": 0.6214, + "step": 13050 + }, + { + "epoch": 20.83, + "grad_norm": 1.797853946685791, + "learning_rate": 0.0011668261562998406, + "loss": 0.6224, + "step": 13060 + }, + { + "epoch": 20.85, + "grad_norm": 1.6769222021102905, + "learning_rate": 0.0011661881977671452, + "loss": 0.533, + "step": 13070 + }, + { + "epoch": 20.86, + "grad_norm": 2.111424207687378, + "learning_rate": 0.0011655502392344497, + "loss": 0.5674, + "step": 13080 + }, + { + "epoch": 20.88, + "grad_norm": 1.9882782697677612, + "learning_rate": 0.0011649122807017545, + "loss": 0.6407, + "step": 13090 + }, + { + "epoch": 20.89, + "grad_norm": 2.0077192783355713, + "learning_rate": 0.001164274322169059, + "loss": 0.4901, + "step": 13100 + }, + { + "epoch": 20.91, + "grad_norm": 1.3955817222595215, + "learning_rate": 0.0011636363636363637, + "loss": 0.6638, + "step": 13110 + }, + { + "epoch": 20.93, + "grad_norm": 2.236403226852417, + "learning_rate": 0.0011629984051036684, + "loss": 0.6267, + "step": 13120 + }, + { + "epoch": 20.94, + "grad_norm": 2.0299949645996094, + "learning_rate": 0.001162360446570973, + "loss": 0.6097, + "step": 13130 + }, + { + "epoch": 20.96, + "grad_norm": 3.4427030086517334, + "learning_rate": 0.0011617224880382776, + "loss": 0.6125, + "step": 13140 + }, + { + "epoch": 20.97, + "grad_norm": 2.427687168121338, + "learning_rate": 0.0011610845295055821, + "loss": 0.6464, + "step": 13150 + }, + { + "epoch": 20.99, + "grad_norm": 2.512589454650879, + "learning_rate": 0.001160446570972887, + "loss": 0.636, + "step": 13160 + }, + { + "epoch": 21.0, + "grad_norm": 1.6817240715026855, + "learning_rate": 0.0011598086124401915, + "loss": 0.5498, + "step": 13170 + }, + { + "epoch": 21.02, + "grad_norm": 2.4378724098205566, + "learning_rate": 0.001159170653907496, + "loss": 0.4912, + "step": 13180 + }, + { + "epoch": 21.04, + "grad_norm": 1.7349364757537842, + "learning_rate": 0.0011585326953748008, + "loss": 0.4135, + "step": 13190 + }, + { + "epoch": 21.05, + "grad_norm": 2.0071072578430176, + "learning_rate": 0.0011578947368421054, + "loss": 0.4892, + "step": 13200 + }, + { + "epoch": 21.07, + "grad_norm": 2.1567165851593018, + "learning_rate": 0.00115725677830941, + "loss": 0.4291, + "step": 13210 + }, + { + "epoch": 21.08, + "grad_norm": 1.5533453226089478, + "learning_rate": 0.0011566188197767145, + "loss": 0.4515, + "step": 13220 + }, + { + "epoch": 21.1, + "grad_norm": 1.481789231300354, + "learning_rate": 0.0011559808612440193, + "loss": 0.4933, + "step": 13230 + }, + { + "epoch": 21.12, + "grad_norm": 1.985859990119934, + "learning_rate": 0.0011553429027113238, + "loss": 0.6186, + "step": 13240 + }, + { + "epoch": 21.13, + "grad_norm": 0.9559075236320496, + "learning_rate": 0.0011547049441786284, + "loss": 0.4367, + "step": 13250 + }, + { + "epoch": 21.15, + "grad_norm": 3.953303575515747, + "learning_rate": 0.001154066985645933, + "loss": 0.5434, + "step": 13260 + }, + { + "epoch": 21.16, + "grad_norm": 1.7408164739608765, + "learning_rate": 0.0011534290271132377, + "loss": 0.5025, + "step": 13270 + }, + { + "epoch": 21.18, + "grad_norm": 2.5240061283111572, + "learning_rate": 0.0011527910685805423, + "loss": 0.5206, + "step": 13280 + }, + { + "epoch": 21.2, + "grad_norm": 1.7967180013656616, + "learning_rate": 0.0011521531100478469, + "loss": 0.4679, + "step": 13290 + }, + { + "epoch": 21.21, + "grad_norm": 1.5482749938964844, + "learning_rate": 0.0011515151515151516, + "loss": 0.5074, + "step": 13300 + }, + { + "epoch": 21.23, + "grad_norm": 2.0703771114349365, + "learning_rate": 0.0011508771929824562, + "loss": 0.5012, + "step": 13310 + }, + { + "epoch": 21.24, + "grad_norm": 1.4565823078155518, + "learning_rate": 0.0011502392344497608, + "loss": 0.5855, + "step": 13320 + }, + { + "epoch": 21.26, + "grad_norm": 2.0159592628479004, + "learning_rate": 0.0011496012759170653, + "loss": 0.5762, + "step": 13330 + }, + { + "epoch": 21.28, + "grad_norm": 1.8826504945755005, + "learning_rate": 0.0011489633173843701, + "loss": 0.5274, + "step": 13340 + }, + { + "epoch": 21.29, + "grad_norm": 1.7150112390518188, + "learning_rate": 0.0011483253588516747, + "loss": 0.5659, + "step": 13350 + }, + { + "epoch": 21.31, + "grad_norm": 2.8087666034698486, + "learning_rate": 0.0011476874003189792, + "loss": 0.4906, + "step": 13360 + }, + { + "epoch": 21.32, + "grad_norm": 1.7748334407806396, + "learning_rate": 0.001147049441786284, + "loss": 0.4622, + "step": 13370 + }, + { + "epoch": 21.34, + "grad_norm": 2.2863359451293945, + "learning_rate": 0.0011464114832535886, + "loss": 0.5015, + "step": 13380 + }, + { + "epoch": 21.36, + "grad_norm": 2.4490015506744385, + "learning_rate": 0.0011457735247208931, + "loss": 0.5743, + "step": 13390 + }, + { + "epoch": 21.37, + "grad_norm": 1.4806760549545288, + "learning_rate": 0.0011451355661881977, + "loss": 0.516, + "step": 13400 + }, + { + "epoch": 21.39, + "grad_norm": 1.909926176071167, + "learning_rate": 0.0011444976076555025, + "loss": 0.516, + "step": 13410 + }, + { + "epoch": 21.4, + "grad_norm": 2.3129677772521973, + "learning_rate": 0.001143859649122807, + "loss": 0.5169, + "step": 13420 + }, + { + "epoch": 21.42, + "grad_norm": 2.589088201522827, + "learning_rate": 0.0011432216905901116, + "loss": 0.5535, + "step": 13430 + }, + { + "epoch": 21.44, + "grad_norm": 2.4051127433776855, + "learning_rate": 0.0011425837320574164, + "loss": 0.5236, + "step": 13440 + }, + { + "epoch": 21.45, + "grad_norm": 2.466587781906128, + "learning_rate": 0.001141945773524721, + "loss": 0.5304, + "step": 13450 + }, + { + "epoch": 21.47, + "grad_norm": 1.5987040996551514, + "learning_rate": 0.0011413078149920255, + "loss": 0.4916, + "step": 13460 + }, + { + "epoch": 21.48, + "grad_norm": 3.281262159347534, + "learning_rate": 0.00114066985645933, + "loss": 0.5194, + "step": 13470 + }, + { + "epoch": 21.5, + "grad_norm": 2.3112425804138184, + "learning_rate": 0.0011400318979266349, + "loss": 0.5282, + "step": 13480 + }, + { + "epoch": 21.52, + "grad_norm": 1.8901677131652832, + "learning_rate": 0.0011393939393939394, + "loss": 0.5697, + "step": 13490 + }, + { + "epoch": 21.53, + "grad_norm": 2.8748323917388916, + "learning_rate": 0.001138755980861244, + "loss": 0.5219, + "step": 13500 + }, + { + "epoch": 21.55, + "grad_norm": 2.162447690963745, + "learning_rate": 0.0011381180223285488, + "loss": 0.5469, + "step": 13510 + }, + { + "epoch": 21.56, + "grad_norm": 2.3993029594421387, + "learning_rate": 0.0011374800637958533, + "loss": 0.5083, + "step": 13520 + }, + { + "epoch": 21.58, + "grad_norm": 2.262704372406006, + "learning_rate": 0.0011368421052631579, + "loss": 0.5152, + "step": 13530 + }, + { + "epoch": 21.59, + "grad_norm": 1.8415032625198364, + "learning_rate": 0.0011362041467304624, + "loss": 0.6413, + "step": 13540 + }, + { + "epoch": 21.61, + "grad_norm": 1.7143352031707764, + "learning_rate": 0.0011355661881977672, + "loss": 0.5897, + "step": 13550 + }, + { + "epoch": 21.63, + "grad_norm": 1.9199092388153076, + "learning_rate": 0.0011349282296650718, + "loss": 0.5259, + "step": 13560 + }, + { + "epoch": 21.64, + "grad_norm": 1.6734964847564697, + "learning_rate": 0.0011342902711323764, + "loss": 0.5658, + "step": 13570 + }, + { + "epoch": 21.66, + "grad_norm": 2.817392349243164, + "learning_rate": 0.001133652312599681, + "loss": 0.6579, + "step": 13580 + }, + { + "epoch": 21.67, + "grad_norm": 3.3291382789611816, + "learning_rate": 0.0011330143540669857, + "loss": 0.603, + "step": 13590 + }, + { + "epoch": 21.69, + "grad_norm": 2.5923471450805664, + "learning_rate": 0.0011323763955342903, + "loss": 0.659, + "step": 13600 + }, + { + "epoch": 21.71, + "grad_norm": 2.979832410812378, + "learning_rate": 0.0011317384370015948, + "loss": 0.6129, + "step": 13610 + }, + { + "epoch": 21.72, + "grad_norm": 3.666498899459839, + "learning_rate": 0.0011311004784688996, + "loss": 0.7636, + "step": 13620 + }, + { + "epoch": 21.74, + "grad_norm": 1.8010962009429932, + "learning_rate": 0.0011304625199362042, + "loss": 0.5779, + "step": 13630 + }, + { + "epoch": 21.75, + "grad_norm": 2.430271625518799, + "learning_rate": 0.0011298245614035087, + "loss": 0.5292, + "step": 13640 + }, + { + "epoch": 21.77, + "grad_norm": 2.2051026821136475, + "learning_rate": 0.0011291866028708133, + "loss": 0.5498, + "step": 13650 + }, + { + "epoch": 21.79, + "grad_norm": 3.7122042179107666, + "learning_rate": 0.001128548644338118, + "loss": 0.5772, + "step": 13660 + }, + { + "epoch": 21.8, + "grad_norm": 2.4475326538085938, + "learning_rate": 0.0011279106858054226, + "loss": 0.5221, + "step": 13670 + }, + { + "epoch": 21.82, + "grad_norm": 2.862783193588257, + "learning_rate": 0.0011272727272727272, + "loss": 0.5799, + "step": 13680 + }, + { + "epoch": 21.83, + "grad_norm": 2.2433278560638428, + "learning_rate": 0.001126634768740032, + "loss": 0.5334, + "step": 13690 + }, + { + "epoch": 21.85, + "grad_norm": 2.5554163455963135, + "learning_rate": 0.0011259968102073365, + "loss": 0.5254, + "step": 13700 + }, + { + "epoch": 21.87, + "grad_norm": 2.6535990238189697, + "learning_rate": 0.001125358851674641, + "loss": 0.6575, + "step": 13710 + }, + { + "epoch": 21.88, + "grad_norm": 2.348066806793213, + "learning_rate": 0.0011247208931419457, + "loss": 0.5326, + "step": 13720 + }, + { + "epoch": 21.9, + "grad_norm": 2.7629575729370117, + "learning_rate": 0.0011240829346092504, + "loss": 0.5429, + "step": 13730 + }, + { + "epoch": 21.91, + "grad_norm": 1.7561380863189697, + "learning_rate": 0.001123444976076555, + "loss": 0.612, + "step": 13740 + }, + { + "epoch": 21.93, + "grad_norm": 1.6795223951339722, + "learning_rate": 0.0011228070175438596, + "loss": 0.5954, + "step": 13750 + }, + { + "epoch": 21.95, + "grad_norm": 2.316612958908081, + "learning_rate": 0.0011221690590111643, + "loss": 0.6165, + "step": 13760 + }, + { + "epoch": 21.96, + "grad_norm": 2.91849422454834, + "learning_rate": 0.001121531100478469, + "loss": 0.5407, + "step": 13770 + }, + { + "epoch": 21.98, + "grad_norm": 1.6966789960861206, + "learning_rate": 0.0011208931419457735, + "loss": 0.5186, + "step": 13780 + }, + { + "epoch": 21.99, + "grad_norm": 2.0186002254486084, + "learning_rate": 0.001120255183413078, + "loss": 0.6633, + "step": 13790 + }, + { + "epoch": 22.01, + "grad_norm": 1.466770052909851, + "learning_rate": 0.0011196172248803828, + "loss": 0.4628, + "step": 13800 + }, + { + "epoch": 22.03, + "grad_norm": 1.2927073240280151, + "learning_rate": 0.0011189792663476874, + "loss": 0.3833, + "step": 13810 + }, + { + "epoch": 22.04, + "grad_norm": 1.5075204372406006, + "learning_rate": 0.001118341307814992, + "loss": 0.4408, + "step": 13820 + }, + { + "epoch": 22.06, + "grad_norm": 1.5921709537506104, + "learning_rate": 0.0011177033492822967, + "loss": 0.4546, + "step": 13830 + }, + { + "epoch": 22.07, + "grad_norm": 1.4669833183288574, + "learning_rate": 0.0011170653907496013, + "loss": 0.4423, + "step": 13840 + }, + { + "epoch": 22.09, + "grad_norm": 2.846984624862671, + "learning_rate": 0.0011164274322169058, + "loss": 0.5419, + "step": 13850 + }, + { + "epoch": 22.11, + "grad_norm": 1.1254881620407104, + "learning_rate": 0.0011157894736842104, + "loss": 0.4977, + "step": 13860 + }, + { + "epoch": 22.12, + "grad_norm": 1.3367946147918701, + "learning_rate": 0.0011151515151515152, + "loss": 0.4496, + "step": 13870 + }, + { + "epoch": 22.14, + "grad_norm": 1.633335828781128, + "learning_rate": 0.0011145135566188197, + "loss": 0.4433, + "step": 13880 + }, + { + "epoch": 22.15, + "grad_norm": 2.3413655757904053, + "learning_rate": 0.0011138755980861243, + "loss": 0.5003, + "step": 13890 + }, + { + "epoch": 22.17, + "grad_norm": 2.336428642272949, + "learning_rate": 0.001113237639553429, + "loss": 0.532, + "step": 13900 + }, + { + "epoch": 22.19, + "grad_norm": 1.713782787322998, + "learning_rate": 0.0011125996810207337, + "loss": 0.4474, + "step": 13910 + }, + { + "epoch": 22.2, + "grad_norm": 2.55415678024292, + "learning_rate": 0.0011119617224880382, + "loss": 0.4168, + "step": 13920 + }, + { + "epoch": 22.22, + "grad_norm": 1.7358187437057495, + "learning_rate": 0.0011113237639553428, + "loss": 0.5048, + "step": 13930 + }, + { + "epoch": 22.23, + "grad_norm": 1.6725515127182007, + "learning_rate": 0.0011106858054226476, + "loss": 0.508, + "step": 13940 + }, + { + "epoch": 22.25, + "grad_norm": 1.3164896965026855, + "learning_rate": 0.0011100478468899521, + "loss": 0.463, + "step": 13950 + }, + { + "epoch": 22.26, + "grad_norm": 1.724993348121643, + "learning_rate": 0.0011094098883572567, + "loss": 0.5165, + "step": 13960 + }, + { + "epoch": 22.28, + "grad_norm": 3.1152279376983643, + "learning_rate": 0.0011087719298245612, + "loss": 0.4615, + "step": 13970 + }, + { + "epoch": 22.3, + "grad_norm": 2.1127662658691406, + "learning_rate": 0.001108133971291866, + "loss": 0.4926, + "step": 13980 + }, + { + "epoch": 22.31, + "grad_norm": 2.012160062789917, + "learning_rate": 0.0011074960127591706, + "loss": 0.4331, + "step": 13990 + }, + { + "epoch": 22.33, + "grad_norm": 1.761988639831543, + "learning_rate": 0.0011068580542264752, + "loss": 0.4927, + "step": 14000 + }, + { + "epoch": 22.34, + "grad_norm": 1.8735899925231934, + "learning_rate": 0.00110622009569378, + "loss": 0.5162, + "step": 14010 + }, + { + "epoch": 22.36, + "grad_norm": 1.9117660522460938, + "learning_rate": 0.0011055821371610845, + "loss": 0.4583, + "step": 14020 + }, + { + "epoch": 22.38, + "grad_norm": 1.598494291305542, + "learning_rate": 0.001104944178628389, + "loss": 0.4943, + "step": 14030 + }, + { + "epoch": 22.39, + "grad_norm": 1.6611143350601196, + "learning_rate": 0.0011043062200956936, + "loss": 0.4889, + "step": 14040 + }, + { + "epoch": 22.41, + "grad_norm": 2.4984424114227295, + "learning_rate": 0.0011036682615629984, + "loss": 0.4838, + "step": 14050 + }, + { + "epoch": 22.42, + "grad_norm": 2.082078695297241, + "learning_rate": 0.001103030303030303, + "loss": 0.6166, + "step": 14060 + }, + { + "epoch": 22.44, + "grad_norm": 2.6350715160369873, + "learning_rate": 0.0011023923444976075, + "loss": 0.5082, + "step": 14070 + }, + { + "epoch": 22.46, + "grad_norm": 1.6463345289230347, + "learning_rate": 0.0011017543859649123, + "loss": 0.4486, + "step": 14080 + }, + { + "epoch": 22.47, + "grad_norm": 2.0142619609832764, + "learning_rate": 0.0011011164274322169, + "loss": 0.5573, + "step": 14090 + }, + { + "epoch": 22.49, + "grad_norm": 2.3120744228363037, + "learning_rate": 0.0011004784688995214, + "loss": 0.5478, + "step": 14100 + }, + { + "epoch": 22.5, + "grad_norm": 1.7484601736068726, + "learning_rate": 0.001099840510366826, + "loss": 0.5557, + "step": 14110 + }, + { + "epoch": 22.52, + "grad_norm": 2.2994306087493896, + "learning_rate": 0.0010992025518341308, + "loss": 0.507, + "step": 14120 + }, + { + "epoch": 22.54, + "grad_norm": 3.1111643314361572, + "learning_rate": 0.0010985645933014353, + "loss": 0.783, + "step": 14130 + }, + { + "epoch": 22.55, + "grad_norm": 2.3941569328308105, + "learning_rate": 0.00109792663476874, + "loss": 0.6618, + "step": 14140 + }, + { + "epoch": 22.57, + "grad_norm": 1.893367052078247, + "learning_rate": 0.0010972886762360447, + "loss": 0.5318, + "step": 14150 + }, + { + "epoch": 22.58, + "grad_norm": 2.1536896228790283, + "learning_rate": 0.0010966507177033492, + "loss": 0.5581, + "step": 14160 + }, + { + "epoch": 22.6, + "grad_norm": 2.7636032104492188, + "learning_rate": 0.0010960127591706538, + "loss": 0.5706, + "step": 14170 + }, + { + "epoch": 22.62, + "grad_norm": 2.516028642654419, + "learning_rate": 0.0010953748006379584, + "loss": 0.5557, + "step": 14180 + }, + { + "epoch": 22.63, + "grad_norm": 1.5299115180969238, + "learning_rate": 0.0010947368421052634, + "loss": 0.5118, + "step": 14190 + }, + { + "epoch": 22.65, + "grad_norm": 2.1962053775787354, + "learning_rate": 0.001094098883572568, + "loss": 0.5678, + "step": 14200 + }, + { + "epoch": 22.66, + "grad_norm": 4.639540195465088, + "learning_rate": 0.0010934609250398725, + "loss": 0.4226, + "step": 14210 + }, + { + "epoch": 22.68, + "grad_norm": 3.8349008560180664, + "learning_rate": 0.0010928229665071773, + "loss": 0.5659, + "step": 14220 + }, + { + "epoch": 22.7, + "grad_norm": 2.3924553394317627, + "learning_rate": 0.0010921850079744818, + "loss": 0.5681, + "step": 14230 + }, + { + "epoch": 22.71, + "grad_norm": 3.5269014835357666, + "learning_rate": 0.0010915470494417864, + "loss": 0.5181, + "step": 14240 + }, + { + "epoch": 22.73, + "grad_norm": 2.35038685798645, + "learning_rate": 0.001090909090909091, + "loss": 0.4825, + "step": 14250 + }, + { + "epoch": 22.74, + "grad_norm": 2.1526710987091064, + "learning_rate": 0.0010902711323763957, + "loss": 0.5347, + "step": 14260 + }, + { + "epoch": 22.76, + "grad_norm": 2.3087081909179688, + "learning_rate": 0.0010896331738437003, + "loss": 0.6681, + "step": 14270 + }, + { + "epoch": 22.78, + "grad_norm": 1.9781696796417236, + "learning_rate": 0.0010889952153110049, + "loss": 0.5765, + "step": 14280 + }, + { + "epoch": 22.79, + "grad_norm": 2.716538429260254, + "learning_rate": 0.0010883572567783094, + "loss": 0.6403, + "step": 14290 + }, + { + "epoch": 22.81, + "grad_norm": 2.0449490547180176, + "learning_rate": 0.0010877192982456142, + "loss": 0.4586, + "step": 14300 + }, + { + "epoch": 22.82, + "grad_norm": 2.0720322132110596, + "learning_rate": 0.0010870813397129188, + "loss": 0.597, + "step": 14310 + }, + { + "epoch": 22.84, + "grad_norm": 1.6174436807632446, + "learning_rate": 0.0010864433811802233, + "loss": 0.4503, + "step": 14320 + }, + { + "epoch": 22.85, + "grad_norm": 2.284149169921875, + "learning_rate": 0.001085805422647528, + "loss": 0.554, + "step": 14330 + }, + { + "epoch": 22.87, + "grad_norm": 1.6513159275054932, + "learning_rate": 0.0010851674641148327, + "loss": 0.4929, + "step": 14340 + }, + { + "epoch": 22.89, + "grad_norm": 3.105323076248169, + "learning_rate": 0.0010845295055821372, + "loss": 0.601, + "step": 14350 + }, + { + "epoch": 22.9, + "grad_norm": 1.6782584190368652, + "learning_rate": 0.0010838915470494418, + "loss": 0.5175, + "step": 14360 + }, + { + "epoch": 22.92, + "grad_norm": 2.065708875656128, + "learning_rate": 0.0010832535885167466, + "loss": 0.5765, + "step": 14370 + }, + { + "epoch": 22.93, + "grad_norm": 3.1577556133270264, + "learning_rate": 0.0010826156299840511, + "loss": 0.6024, + "step": 14380 + }, + { + "epoch": 22.95, + "grad_norm": 3.8669426441192627, + "learning_rate": 0.0010819776714513557, + "loss": 0.5703, + "step": 14390 + }, + { + "epoch": 22.97, + "grad_norm": 2.084577798843384, + "learning_rate": 0.0010813397129186605, + "loss": 0.5736, + "step": 14400 + }, + { + "epoch": 22.98, + "grad_norm": 2.3322348594665527, + "learning_rate": 0.001080701754385965, + "loss": 0.4955, + "step": 14410 + }, + { + "epoch": 23.0, + "grad_norm": 2.981834650039673, + "learning_rate": 0.0010800637958532696, + "loss": 0.5941, + "step": 14420 + }, + { + "epoch": 23.01, + "grad_norm": 1.649495244026184, + "learning_rate": 0.0010794258373205742, + "loss": 0.4623, + "step": 14430 + }, + { + "epoch": 23.03, + "grad_norm": 2.4361202716827393, + "learning_rate": 0.001078787878787879, + "loss": 0.5178, + "step": 14440 + }, + { + "epoch": 23.05, + "grad_norm": 1.9195847511291504, + "learning_rate": 0.0010781499202551835, + "loss": 0.5056, + "step": 14450 + }, + { + "epoch": 23.06, + "grad_norm": 1.472584843635559, + "learning_rate": 0.001077511961722488, + "loss": 0.422, + "step": 14460 + }, + { + "epoch": 23.08, + "grad_norm": 1.9220826625823975, + "learning_rate": 0.0010768740031897928, + "loss": 0.4461, + "step": 14470 + }, + { + "epoch": 23.09, + "grad_norm": 2.0163981914520264, + "learning_rate": 0.0010762360446570974, + "loss": 0.405, + "step": 14480 + }, + { + "epoch": 23.11, + "grad_norm": 2.0835061073303223, + "learning_rate": 0.001075598086124402, + "loss": 0.5177, + "step": 14490 + }, + { + "epoch": 23.13, + "grad_norm": 0.9891412258148193, + "learning_rate": 0.0010749601275917065, + "loss": 0.5585, + "step": 14500 + }, + { + "epoch": 23.14, + "grad_norm": 1.3112674951553345, + "learning_rate": 0.0010743221690590113, + "loss": 0.5358, + "step": 14510 + }, + { + "epoch": 23.16, + "grad_norm": 1.358392357826233, + "learning_rate": 0.0010736842105263159, + "loss": 0.3928, + "step": 14520 + }, + { + "epoch": 23.17, + "grad_norm": 1.7104527950286865, + "learning_rate": 0.0010730462519936204, + "loss": 0.4469, + "step": 14530 + }, + { + "epoch": 23.19, + "grad_norm": 2.007497787475586, + "learning_rate": 0.0010724082934609252, + "loss": 0.4123, + "step": 14540 + }, + { + "epoch": 23.21, + "grad_norm": 2.2213757038116455, + "learning_rate": 0.0010717703349282298, + "loss": 0.4139, + "step": 14550 + }, + { + "epoch": 23.22, + "grad_norm": 1.6128385066986084, + "learning_rate": 0.0010711323763955343, + "loss": 0.4715, + "step": 14560 + }, + { + "epoch": 23.24, + "grad_norm": 1.6998387575149536, + "learning_rate": 0.001070494417862839, + "loss": 0.433, + "step": 14570 + }, + { + "epoch": 23.25, + "grad_norm": 1.7560913562774658, + "learning_rate": 0.0010698564593301437, + "loss": 0.5458, + "step": 14580 + }, + { + "epoch": 23.27, + "grad_norm": 1.0924944877624512, + "learning_rate": 0.0010692185007974483, + "loss": 0.4552, + "step": 14590 + }, + { + "epoch": 23.29, + "grad_norm": 1.2721997499465942, + "learning_rate": 0.0010685805422647528, + "loss": 0.4142, + "step": 14600 + }, + { + "epoch": 23.3, + "grad_norm": 1.5277657508850098, + "learning_rate": 0.0010679425837320576, + "loss": 0.4749, + "step": 14610 + }, + { + "epoch": 23.32, + "grad_norm": 1.4912691116333008, + "learning_rate": 0.0010673046251993622, + "loss": 0.4994, + "step": 14620 + }, + { + "epoch": 23.33, + "grad_norm": 2.7884340286254883, + "learning_rate": 0.0010666666666666667, + "loss": 0.4654, + "step": 14630 + }, + { + "epoch": 23.35, + "grad_norm": 3.288153886795044, + "learning_rate": 0.0010660287081339713, + "loss": 0.4726, + "step": 14640 + }, + { + "epoch": 23.37, + "grad_norm": 1.869439721107483, + "learning_rate": 0.001065390749601276, + "loss": 0.5029, + "step": 14650 + }, + { + "epoch": 23.38, + "grad_norm": 1.9574953317642212, + "learning_rate": 0.0010647527910685806, + "loss": 0.5279, + "step": 14660 + }, + { + "epoch": 23.4, + "grad_norm": 3.001887321472168, + "learning_rate": 0.0010641148325358852, + "loss": 0.5283, + "step": 14670 + }, + { + "epoch": 23.41, + "grad_norm": 3.0924551486968994, + "learning_rate": 0.0010634768740031897, + "loss": 0.4474, + "step": 14680 + }, + { + "epoch": 23.43, + "grad_norm": 1.803222894668579, + "learning_rate": 0.0010628389154704945, + "loss": 0.4873, + "step": 14690 + }, + { + "epoch": 23.44, + "grad_norm": 2.564887762069702, + "learning_rate": 0.001062200956937799, + "loss": 0.4802, + "step": 14700 + }, + { + "epoch": 23.46, + "grad_norm": 2.3837051391601562, + "learning_rate": 0.0010615629984051037, + "loss": 0.5515, + "step": 14710 + }, + { + "epoch": 23.48, + "grad_norm": 2.775334358215332, + "learning_rate": 0.0010609250398724084, + "loss": 0.5124, + "step": 14720 + }, + { + "epoch": 23.49, + "grad_norm": 2.805455207824707, + "learning_rate": 0.001060287081339713, + "loss": 0.5103, + "step": 14730 + }, + { + "epoch": 23.51, + "grad_norm": 4.685495376586914, + "learning_rate": 0.0010596491228070176, + "loss": 0.5481, + "step": 14740 + }, + { + "epoch": 23.52, + "grad_norm": 1.6772174835205078, + "learning_rate": 0.0010590111642743221, + "loss": 0.5023, + "step": 14750 + }, + { + "epoch": 23.54, + "grad_norm": 3.1417901515960693, + "learning_rate": 0.001058373205741627, + "loss": 0.4909, + "step": 14760 + }, + { + "epoch": 23.56, + "grad_norm": 2.6341207027435303, + "learning_rate": 0.0010577352472089315, + "loss": 0.577, + "step": 14770 + }, + { + "epoch": 23.57, + "grad_norm": 1.981137990951538, + "learning_rate": 0.001057097288676236, + "loss": 0.527, + "step": 14780 + }, + { + "epoch": 23.59, + "grad_norm": 1.8690191507339478, + "learning_rate": 0.0010564593301435408, + "loss": 0.6524, + "step": 14790 + }, + { + "epoch": 23.6, + "grad_norm": 2.470585584640503, + "learning_rate": 0.0010558213716108454, + "loss": 0.5171, + "step": 14800 + }, + { + "epoch": 23.62, + "grad_norm": 1.5225473642349243, + "learning_rate": 0.00105518341307815, + "loss": 0.574, + "step": 14810 + }, + { + "epoch": 23.64, + "grad_norm": 1.244357705116272, + "learning_rate": 0.0010545454545454545, + "loss": 0.513, + "step": 14820 + }, + { + "epoch": 23.65, + "grad_norm": 3.7984049320220947, + "learning_rate": 0.0010539074960127593, + "loss": 0.4659, + "step": 14830 + }, + { + "epoch": 23.67, + "grad_norm": 2.2695350646972656, + "learning_rate": 0.0010532695374800638, + "loss": 0.5469, + "step": 14840 + }, + { + "epoch": 23.68, + "grad_norm": 2.1727049350738525, + "learning_rate": 0.0010526315789473684, + "loss": 0.4998, + "step": 14850 + }, + { + "epoch": 23.7, + "grad_norm": 2.2124183177948, + "learning_rate": 0.0010519936204146732, + "loss": 0.4202, + "step": 14860 + }, + { + "epoch": 23.72, + "grad_norm": 1.9910480976104736, + "learning_rate": 0.0010513556618819777, + "loss": 0.4944, + "step": 14870 + }, + { + "epoch": 23.73, + "grad_norm": 2.623316526412964, + "learning_rate": 0.0010507177033492823, + "loss": 0.5959, + "step": 14880 + }, + { + "epoch": 23.75, + "grad_norm": 3.7587718963623047, + "learning_rate": 0.0010500797448165869, + "loss": 0.5584, + "step": 14890 + }, + { + "epoch": 23.76, + "grad_norm": 2.0342280864715576, + "learning_rate": 0.0010494417862838916, + "loss": 0.5426, + "step": 14900 + }, + { + "epoch": 23.78, + "grad_norm": 2.8675320148468018, + "learning_rate": 0.0010488038277511962, + "loss": 0.6461, + "step": 14910 + }, + { + "epoch": 23.8, + "grad_norm": 2.026543617248535, + "learning_rate": 0.0010481658692185008, + "loss": 0.5184, + "step": 14920 + }, + { + "epoch": 23.81, + "grad_norm": 2.560939073562622, + "learning_rate": 0.0010475279106858055, + "loss": 0.5449, + "step": 14930 + }, + { + "epoch": 23.83, + "grad_norm": 2.085392951965332, + "learning_rate": 0.0010468899521531101, + "loss": 0.5984, + "step": 14940 + }, + { + "epoch": 23.84, + "grad_norm": 2.2556986808776855, + "learning_rate": 0.0010462519936204147, + "loss": 0.4763, + "step": 14950 + }, + { + "epoch": 23.86, + "grad_norm": 1.4370797872543335, + "learning_rate": 0.0010456140350877192, + "loss": 0.5311, + "step": 14960 + }, + { + "epoch": 23.88, + "grad_norm": 1.252243161201477, + "learning_rate": 0.001044976076555024, + "loss": 0.4934, + "step": 14970 + }, + { + "epoch": 23.89, + "grad_norm": 2.0001296997070312, + "learning_rate": 0.0010443381180223286, + "loss": 0.4836, + "step": 14980 + }, + { + "epoch": 23.91, + "grad_norm": 2.259216070175171, + "learning_rate": 0.0010437001594896331, + "loss": 0.5749, + "step": 14990 + }, + { + "epoch": 23.92, + "grad_norm": 1.5871505737304688, + "learning_rate": 0.0010430622009569377, + "loss": 0.5055, + "step": 15000 + }, + { + "epoch": 23.94, + "grad_norm": 3.0217132568359375, + "learning_rate": 0.0010424242424242425, + "loss": 0.4674, + "step": 15010 + }, + { + "epoch": 23.96, + "grad_norm": 2.425215482711792, + "learning_rate": 0.001041786283891547, + "loss": 0.4853, + "step": 15020 + }, + { + "epoch": 23.97, + "grad_norm": 2.7950572967529297, + "learning_rate": 0.0010411483253588516, + "loss": 0.5244, + "step": 15030 + }, + { + "epoch": 23.99, + "grad_norm": 1.8970431089401245, + "learning_rate": 0.0010405103668261564, + "loss": 0.538, + "step": 15040 + }, + { + "epoch": 24.0, + "grad_norm": 0.7786374688148499, + "learning_rate": 0.001039872408293461, + "loss": 0.4562, + "step": 15050 + }, + { + "epoch": 24.02, + "grad_norm": 1.385309100151062, + "learning_rate": 0.0010392344497607655, + "loss": 0.4353, + "step": 15060 + }, + { + "epoch": 24.04, + "grad_norm": 3.768200397491455, + "learning_rate": 0.00103859649122807, + "loss": 0.4483, + "step": 15070 + }, + { + "epoch": 24.05, + "grad_norm": 3.530329704284668, + "learning_rate": 0.0010379585326953749, + "loss": 0.4374, + "step": 15080 + }, + { + "epoch": 24.07, + "grad_norm": 1.2706865072250366, + "learning_rate": 0.0010373205741626794, + "loss": 0.3817, + "step": 15090 + }, + { + "epoch": 24.08, + "grad_norm": 0.9349244832992554, + "learning_rate": 0.001036682615629984, + "loss": 0.5057, + "step": 15100 + }, + { + "epoch": 24.1, + "grad_norm": 3.0068447589874268, + "learning_rate": 0.0010360446570972888, + "loss": 0.4718, + "step": 15110 + }, + { + "epoch": 24.11, + "grad_norm": 2.423353672027588, + "learning_rate": 0.0010354066985645933, + "loss": 0.4668, + "step": 15120 + }, + { + "epoch": 24.13, + "grad_norm": 1.5053311586380005, + "learning_rate": 0.0010347687400318979, + "loss": 0.4525, + "step": 15130 + }, + { + "epoch": 24.15, + "grad_norm": 1.660056710243225, + "learning_rate": 0.0010341307814992024, + "loss": 0.4734, + "step": 15140 + }, + { + "epoch": 24.16, + "grad_norm": 1.5876003503799438, + "learning_rate": 0.0010334928229665072, + "loss": 0.3713, + "step": 15150 + }, + { + "epoch": 24.18, + "grad_norm": 1.1910775899887085, + "learning_rate": 0.0010328548644338118, + "loss": 0.4542, + "step": 15160 + }, + { + "epoch": 24.19, + "grad_norm": 2.1305978298187256, + "learning_rate": 0.0010322169059011164, + "loss": 0.4496, + "step": 15170 + }, + { + "epoch": 24.21, + "grad_norm": 1.9429091215133667, + "learning_rate": 0.0010315789473684211, + "loss": 0.4965, + "step": 15180 + }, + { + "epoch": 24.23, + "grad_norm": 1.6149272918701172, + "learning_rate": 0.0010309409888357257, + "loss": 0.4369, + "step": 15190 + }, + { + "epoch": 24.24, + "grad_norm": 1.5995999574661255, + "learning_rate": 0.0010303030303030303, + "loss": 0.4579, + "step": 15200 + }, + { + "epoch": 24.26, + "grad_norm": 1.6771583557128906, + "learning_rate": 0.0010296650717703348, + "loss": 0.4289, + "step": 15210 + }, + { + "epoch": 24.27, + "grad_norm": 2.6575920581817627, + "learning_rate": 0.0010290271132376396, + "loss": 0.5622, + "step": 15220 + }, + { + "epoch": 24.29, + "grad_norm": 1.5423036813735962, + "learning_rate": 0.0010283891547049442, + "loss": 0.4558, + "step": 15230 + }, + { + "epoch": 24.31, + "grad_norm": 1.731204628944397, + "learning_rate": 0.0010277511961722487, + "loss": 0.4732, + "step": 15240 + }, + { + "epoch": 24.32, + "grad_norm": 2.5990333557128906, + "learning_rate": 0.0010271132376395535, + "loss": 0.5179, + "step": 15250 + }, + { + "epoch": 24.34, + "grad_norm": 1.5724194049835205, + "learning_rate": 0.001026475279106858, + "loss": 0.4896, + "step": 15260 + }, + { + "epoch": 24.35, + "grad_norm": 3.0556674003601074, + "learning_rate": 0.0010258373205741626, + "loss": 0.522, + "step": 15270 + }, + { + "epoch": 24.37, + "grad_norm": 2.603013515472412, + "learning_rate": 0.0010251993620414672, + "loss": 0.4626, + "step": 15280 + }, + { + "epoch": 24.39, + "grad_norm": 1.3041783571243286, + "learning_rate": 0.001024561403508772, + "loss": 0.3713, + "step": 15290 + }, + { + "epoch": 24.4, + "grad_norm": 1.5249779224395752, + "learning_rate": 0.0010239234449760765, + "loss": 0.4899, + "step": 15300 + }, + { + "epoch": 24.42, + "grad_norm": 1.814285159111023, + "learning_rate": 0.001023285486443381, + "loss": 0.534, + "step": 15310 + }, + { + "epoch": 24.43, + "grad_norm": 2.138099431991577, + "learning_rate": 0.0010226475279106859, + "loss": 0.4634, + "step": 15320 + }, + { + "epoch": 24.45, + "grad_norm": 1.3936606645584106, + "learning_rate": 0.0010220095693779904, + "loss": 0.4094, + "step": 15330 + }, + { + "epoch": 24.47, + "grad_norm": 1.609049677848816, + "learning_rate": 0.001021371610845295, + "loss": 0.4397, + "step": 15340 + }, + { + "epoch": 24.48, + "grad_norm": 1.38874351978302, + "learning_rate": 0.0010207336523125996, + "loss": 0.5197, + "step": 15350 + }, + { + "epoch": 24.5, + "grad_norm": 2.1596977710723877, + "learning_rate": 0.0010200956937799043, + "loss": 0.4791, + "step": 15360 + }, + { + "epoch": 24.51, + "grad_norm": 1.4566435813903809, + "learning_rate": 0.001019457735247209, + "loss": 0.4453, + "step": 15370 + }, + { + "epoch": 24.53, + "grad_norm": 1.784945011138916, + "learning_rate": 0.0010188197767145135, + "loss": 0.4974, + "step": 15380 + }, + { + "epoch": 24.55, + "grad_norm": 1.9153186082839966, + "learning_rate": 0.001018181818181818, + "loss": 0.4708, + "step": 15390 + }, + { + "epoch": 24.56, + "grad_norm": 1.7097647190093994, + "learning_rate": 0.0010175438596491228, + "loss": 0.4766, + "step": 15400 + }, + { + "epoch": 24.58, + "grad_norm": 1.6198031902313232, + "learning_rate": 0.0010169059011164274, + "loss": 0.4857, + "step": 15410 + }, + { + "epoch": 24.59, + "grad_norm": 2.2390496730804443, + "learning_rate": 0.001016267942583732, + "loss": 0.5341, + "step": 15420 + }, + { + "epoch": 24.61, + "grad_norm": 2.7094318866729736, + "learning_rate": 0.0010156299840510367, + "loss": 0.5316, + "step": 15430 + }, + { + "epoch": 24.63, + "grad_norm": 1.7831966876983643, + "learning_rate": 0.0010149920255183413, + "loss": 0.5638, + "step": 15440 + }, + { + "epoch": 24.64, + "grad_norm": 1.7682468891143799, + "learning_rate": 0.0010143540669856458, + "loss": 0.4232, + "step": 15450 + }, + { + "epoch": 24.66, + "grad_norm": 3.382634401321411, + "learning_rate": 0.0010137161084529504, + "loss": 0.5211, + "step": 15460 + }, + { + "epoch": 24.67, + "grad_norm": 1.6117042303085327, + "learning_rate": 0.0010130781499202552, + "loss": 0.5442, + "step": 15470 + }, + { + "epoch": 24.69, + "grad_norm": 2.2903084754943848, + "learning_rate": 0.0010124401913875597, + "loss": 0.5145, + "step": 15480 + }, + { + "epoch": 24.7, + "grad_norm": 1.3082456588745117, + "learning_rate": 0.0010118022328548643, + "loss": 0.5143, + "step": 15490 + }, + { + "epoch": 24.72, + "grad_norm": 1.9928056001663208, + "learning_rate": 0.001011164274322169, + "loss": 0.5222, + "step": 15500 + }, + { + "epoch": 24.74, + "grad_norm": 1.8907785415649414, + "learning_rate": 0.0010105263157894737, + "loss": 0.4507, + "step": 15510 + }, + { + "epoch": 24.75, + "grad_norm": 1.9465861320495605, + "learning_rate": 0.0010098883572567782, + "loss": 0.4636, + "step": 15520 + }, + { + "epoch": 24.77, + "grad_norm": 1.3651535511016846, + "learning_rate": 0.0010092503987240828, + "loss": 0.4484, + "step": 15530 + }, + { + "epoch": 24.78, + "grad_norm": 1.8174107074737549, + "learning_rate": 0.0010086124401913876, + "loss": 0.4479, + "step": 15540 + }, + { + "epoch": 24.8, + "grad_norm": 1.7005228996276855, + "learning_rate": 0.0010079744816586921, + "loss": 0.5416, + "step": 15550 + }, + { + "epoch": 24.82, + "grad_norm": 1.941279649734497, + "learning_rate": 0.0010073365231259967, + "loss": 0.5404, + "step": 15560 + }, + { + "epoch": 24.83, + "grad_norm": 2.1660587787628174, + "learning_rate": 0.0010066985645933015, + "loss": 0.5335, + "step": 15570 + }, + { + "epoch": 24.85, + "grad_norm": 2.4644267559051514, + "learning_rate": 0.001006060606060606, + "loss": 0.6635, + "step": 15580 + }, + { + "epoch": 24.86, + "grad_norm": 1.596439242362976, + "learning_rate": 0.0010054226475279106, + "loss": 0.4986, + "step": 15590 + }, + { + "epoch": 24.88, + "grad_norm": 1.4329978227615356, + "learning_rate": 0.0010047846889952152, + "loss": 0.4721, + "step": 15600 + }, + { + "epoch": 24.9, + "grad_norm": 1.454533338546753, + "learning_rate": 0.00100414673046252, + "loss": 0.4796, + "step": 15610 + }, + { + "epoch": 24.91, + "grad_norm": 2.5839779376983643, + "learning_rate": 0.0010035087719298245, + "loss": 0.5149, + "step": 15620 + }, + { + "epoch": 24.93, + "grad_norm": 2.21061110496521, + "learning_rate": 0.001002870813397129, + "loss": 0.5191, + "step": 15630 + }, + { + "epoch": 24.94, + "grad_norm": 1.6350433826446533, + "learning_rate": 0.0010022328548644338, + "loss": 0.4748, + "step": 15640 + }, + { + "epoch": 24.96, + "grad_norm": 1.638689637184143, + "learning_rate": 0.0010015948963317384, + "loss": 0.4639, + "step": 15650 + }, + { + "epoch": 24.98, + "grad_norm": 1.925967812538147, + "learning_rate": 0.001000956937799043, + "loss": 0.6168, + "step": 15660 + }, + { + "epoch": 24.99, + "grad_norm": 1.7674167156219482, + "learning_rate": 0.0010003189792663475, + "loss": 0.5219, + "step": 15670 + }, + { + "epoch": 25.01, + "grad_norm": 0.9835655689239502, + "learning_rate": 0.0009996810207336523, + "loss": 0.391, + "step": 15680 + }, + { + "epoch": 25.02, + "grad_norm": 1.7107539176940918, + "learning_rate": 0.000999043062200957, + "loss": 0.3556, + "step": 15690 + }, + { + "epoch": 25.04, + "grad_norm": 2.7259128093719482, + "learning_rate": 0.0009984051036682616, + "loss": 0.5067, + "step": 15700 + }, + { + "epoch": 25.06, + "grad_norm": 1.4780336618423462, + "learning_rate": 0.0009977671451355662, + "loss": 0.4045, + "step": 15710 + }, + { + "epoch": 25.07, + "grad_norm": 1.699403166770935, + "learning_rate": 0.000997129186602871, + "loss": 0.4629, + "step": 15720 + }, + { + "epoch": 25.09, + "grad_norm": 2.0610368251800537, + "learning_rate": 0.0009964912280701755, + "loss": 0.4457, + "step": 15730 + }, + { + "epoch": 25.1, + "grad_norm": 1.1959340572357178, + "learning_rate": 0.0009958532695374801, + "loss": 0.435, + "step": 15740 + }, + { + "epoch": 25.12, + "grad_norm": 2.4365720748901367, + "learning_rate": 0.0009952153110047847, + "loss": 0.444, + "step": 15750 + }, + { + "epoch": 25.14, + "grad_norm": 1.2574375867843628, + "learning_rate": 0.0009945773524720895, + "loss": 0.4484, + "step": 15760 + }, + { + "epoch": 25.15, + "grad_norm": 3.0744266510009766, + "learning_rate": 0.000993939393939394, + "loss": 0.5128, + "step": 15770 + }, + { + "epoch": 25.17, + "grad_norm": 1.3673443794250488, + "learning_rate": 0.0009933014354066986, + "loss": 0.528, + "step": 15780 + }, + { + "epoch": 25.18, + "grad_norm": 1.0166288614273071, + "learning_rate": 0.0009926634768740031, + "loss": 0.4576, + "step": 15790 + }, + { + "epoch": 25.2, + "grad_norm": 2.5745012760162354, + "learning_rate": 0.000992025518341308, + "loss": 0.4262, + "step": 15800 + }, + { + "epoch": 25.22, + "grad_norm": 1.265143871307373, + "learning_rate": 0.0009913875598086125, + "loss": 0.4196, + "step": 15810 + }, + { + "epoch": 25.23, + "grad_norm": 2.3100552558898926, + "learning_rate": 0.000990749601275917, + "loss": 0.5067, + "step": 15820 + }, + { + "epoch": 25.25, + "grad_norm": 1.1458524465560913, + "learning_rate": 0.0009901116427432218, + "loss": 0.4197, + "step": 15830 + }, + { + "epoch": 25.26, + "grad_norm": 1.4825867414474487, + "learning_rate": 0.0009894736842105264, + "loss": 0.4208, + "step": 15840 + }, + { + "epoch": 25.28, + "grad_norm": 2.505919933319092, + "learning_rate": 0.000988835725677831, + "loss": 0.5082, + "step": 15850 + }, + { + "epoch": 25.3, + "grad_norm": 1.547998070716858, + "learning_rate": 0.0009881977671451355, + "loss": 0.4517, + "step": 15860 + }, + { + "epoch": 25.31, + "grad_norm": 1.6311086416244507, + "learning_rate": 0.0009875598086124403, + "loss": 0.4928, + "step": 15870 + }, + { + "epoch": 25.33, + "grad_norm": 1.7544368505477905, + "learning_rate": 0.0009869218500797449, + "loss": 0.4175, + "step": 15880 + }, + { + "epoch": 25.34, + "grad_norm": 1.1133722066879272, + "learning_rate": 0.0009862838915470494, + "loss": 0.4709, + "step": 15890 + }, + { + "epoch": 25.36, + "grad_norm": 1.8425043821334839, + "learning_rate": 0.0009856459330143542, + "loss": 0.425, + "step": 15900 + }, + { + "epoch": 25.37, + "grad_norm": 1.6408649682998657, + "learning_rate": 0.0009850079744816588, + "loss": 0.4856, + "step": 15910 + }, + { + "epoch": 25.39, + "grad_norm": 2.6448709964752197, + "learning_rate": 0.0009843700159489633, + "loss": 0.5066, + "step": 15920 + }, + { + "epoch": 25.41, + "grad_norm": 3.6012330055236816, + "learning_rate": 0.0009837320574162679, + "loss": 0.4534, + "step": 15930 + }, + { + "epoch": 25.42, + "grad_norm": 3.10849666595459, + "learning_rate": 0.0009830940988835727, + "loss": 0.458, + "step": 15940 + }, + { + "epoch": 25.44, + "grad_norm": 1.3097262382507324, + "learning_rate": 0.0009824561403508772, + "loss": 0.4695, + "step": 15950 + }, + { + "epoch": 25.45, + "grad_norm": 1.4666467905044556, + "learning_rate": 0.0009818181818181818, + "loss": 0.4955, + "step": 15960 + }, + { + "epoch": 25.47, + "grad_norm": 2.279972791671753, + "learning_rate": 0.0009811802232854866, + "loss": 0.4531, + "step": 15970 + }, + { + "epoch": 25.49, + "grad_norm": 1.8388824462890625, + "learning_rate": 0.0009805422647527911, + "loss": 0.5209, + "step": 15980 + }, + { + "epoch": 25.5, + "grad_norm": 1.2906782627105713, + "learning_rate": 0.0009799043062200957, + "loss": 0.5712, + "step": 15990 + }, + { + "epoch": 25.52, + "grad_norm": 1.1561537981033325, + "learning_rate": 0.0009792663476874003, + "loss": 0.4498, + "step": 16000 + }, + { + "epoch": 25.53, + "grad_norm": 0.9394503831863403, + "learning_rate": 0.000978628389154705, + "loss": 0.4817, + "step": 16010 + }, + { + "epoch": 25.55, + "grad_norm": 1.3297114372253418, + "learning_rate": 0.0009779904306220096, + "loss": 0.4238, + "step": 16020 + }, + { + "epoch": 25.57, + "grad_norm": 1.832533597946167, + "learning_rate": 0.0009773524720893142, + "loss": 0.4492, + "step": 16030 + }, + { + "epoch": 25.58, + "grad_norm": 1.8517677783966064, + "learning_rate": 0.000976714513556619, + "loss": 0.4209, + "step": 16040 + }, + { + "epoch": 25.6, + "grad_norm": 1.57618248462677, + "learning_rate": 0.0009760765550239234, + "loss": 0.4237, + "step": 16050 + }, + { + "epoch": 25.61, + "grad_norm": 2.6795618534088135, + "learning_rate": 0.0009754385964912282, + "loss": 0.4727, + "step": 16060 + }, + { + "epoch": 25.63, + "grad_norm": 1.7043702602386475, + "learning_rate": 0.0009748006379585327, + "loss": 0.5071, + "step": 16070 + }, + { + "epoch": 25.65, + "grad_norm": 2.142303228378296, + "learning_rate": 0.0009741626794258374, + "loss": 0.5109, + "step": 16080 + }, + { + "epoch": 25.66, + "grad_norm": 1.5886117219924927, + "learning_rate": 0.000973524720893142, + "loss": 0.4755, + "step": 16090 + }, + { + "epoch": 25.68, + "grad_norm": 1.767467975616455, + "learning_rate": 0.0009728867623604466, + "loss": 0.455, + "step": 16100 + }, + { + "epoch": 25.69, + "grad_norm": 1.6067595481872559, + "learning_rate": 0.0009722488038277513, + "loss": 0.4701, + "step": 16110 + }, + { + "epoch": 25.71, + "grad_norm": 1.5716664791107178, + "learning_rate": 0.0009716108452950559, + "loss": 0.4849, + "step": 16120 + }, + { + "epoch": 25.73, + "grad_norm": 1.2191548347473145, + "learning_rate": 0.0009709728867623605, + "loss": 0.4129, + "step": 16130 + }, + { + "epoch": 25.74, + "grad_norm": 1.7351710796356201, + "learning_rate": 0.0009703349282296651, + "loss": 0.5078, + "step": 16140 + }, + { + "epoch": 25.76, + "grad_norm": 1.1435052156448364, + "learning_rate": 0.0009696969696969698, + "loss": 0.4075, + "step": 16150 + }, + { + "epoch": 25.77, + "grad_norm": 2.094747304916382, + "learning_rate": 0.0009690590111642743, + "loss": 0.3737, + "step": 16160 + }, + { + "epoch": 25.79, + "grad_norm": 2.186330556869507, + "learning_rate": 0.000968421052631579, + "loss": 0.4566, + "step": 16170 + }, + { + "epoch": 25.81, + "grad_norm": 2.0006825923919678, + "learning_rate": 0.0009677830940988836, + "loss": 0.458, + "step": 16180 + }, + { + "epoch": 25.82, + "grad_norm": 1.7449229955673218, + "learning_rate": 0.0009671451355661883, + "loss": 0.4487, + "step": 16190 + }, + { + "epoch": 25.84, + "grad_norm": 1.6336495876312256, + "learning_rate": 0.0009665071770334929, + "loss": 0.4743, + "step": 16200 + }, + { + "epoch": 25.85, + "grad_norm": 2.408162832260132, + "learning_rate": 0.0009658692185007975, + "loss": 0.5951, + "step": 16210 + }, + { + "epoch": 25.87, + "grad_norm": 1.5623067617416382, + "learning_rate": 0.0009652312599681022, + "loss": 0.4353, + "step": 16220 + }, + { + "epoch": 25.89, + "grad_norm": 1.187019944190979, + "learning_rate": 0.0009645933014354067, + "loss": 0.4622, + "step": 16230 + }, + { + "epoch": 25.9, + "grad_norm": 1.6125158071517944, + "learning_rate": 0.0009639553429027114, + "loss": 0.4263, + "step": 16240 + }, + { + "epoch": 25.92, + "grad_norm": 2.758575677871704, + "learning_rate": 0.000963317384370016, + "loss": 0.4074, + "step": 16250 + }, + { + "epoch": 25.93, + "grad_norm": 1.477206826210022, + "learning_rate": 0.0009626794258373206, + "loss": 0.5476, + "step": 16260 + }, + { + "epoch": 25.95, + "grad_norm": 2.584649085998535, + "learning_rate": 0.0009620414673046253, + "loss": 0.4626, + "step": 16270 + }, + { + "epoch": 25.96, + "grad_norm": 1.4972593784332275, + "learning_rate": 0.0009614035087719299, + "loss": 0.4884, + "step": 16280 + }, + { + "epoch": 25.98, + "grad_norm": 1.7186070680618286, + "learning_rate": 0.0009607655502392345, + "loss": 0.4263, + "step": 16290 + }, + { + "epoch": 26.0, + "grad_norm": 2.3209738731384277, + "learning_rate": 0.0009601275917065391, + "loss": 0.5385, + "step": 16300 + }, + { + "epoch": 26.01, + "grad_norm": 1.8050909042358398, + "learning_rate": 0.0009594896331738438, + "loss": 0.4111, + "step": 16310 + }, + { + "epoch": 26.03, + "grad_norm": 1.729257345199585, + "learning_rate": 0.0009588516746411483, + "loss": 0.3666, + "step": 16320 + }, + { + "epoch": 26.04, + "grad_norm": 2.2084038257598877, + "learning_rate": 0.000958213716108453, + "loss": 0.3862, + "step": 16330 + }, + { + "epoch": 26.06, + "grad_norm": 1.1707019805908203, + "learning_rate": 0.0009575757575757576, + "loss": 0.4629, + "step": 16340 + }, + { + "epoch": 26.08, + "grad_norm": 3.4062771797180176, + "learning_rate": 0.0009569377990430622, + "loss": 0.3774, + "step": 16350 + }, + { + "epoch": 26.09, + "grad_norm": 1.5490500926971436, + "learning_rate": 0.0009562998405103669, + "loss": 0.4169, + "step": 16360 + }, + { + "epoch": 26.11, + "grad_norm": 1.3803966045379639, + "learning_rate": 0.0009556618819776715, + "loss": 0.4544, + "step": 16370 + }, + { + "epoch": 26.12, + "grad_norm": 1.512718915939331, + "learning_rate": 0.0009550239234449761, + "loss": 0.4163, + "step": 16380 + }, + { + "epoch": 26.14, + "grad_norm": 1.462695837020874, + "learning_rate": 0.0009543859649122807, + "loss": 0.4556, + "step": 16390 + }, + { + "epoch": 26.16, + "grad_norm": 1.237164855003357, + "learning_rate": 0.0009537480063795854, + "loss": 0.3972, + "step": 16400 + }, + { + "epoch": 26.17, + "grad_norm": 1.3175599575042725, + "learning_rate": 0.0009531100478468899, + "loss": 0.4673, + "step": 16410 + }, + { + "epoch": 26.19, + "grad_norm": 1.3138153553009033, + "learning_rate": 0.0009524720893141946, + "loss": 0.3655, + "step": 16420 + }, + { + "epoch": 26.2, + "grad_norm": 1.4665474891662598, + "learning_rate": 0.0009518341307814993, + "loss": 0.4013, + "step": 16430 + }, + { + "epoch": 26.22, + "grad_norm": 1.0699955224990845, + "learning_rate": 0.0009511961722488038, + "loss": 0.4483, + "step": 16440 + }, + { + "epoch": 26.24, + "grad_norm": 1.7993961572647095, + "learning_rate": 0.0009505582137161085, + "loss": 0.5006, + "step": 16450 + }, + { + "epoch": 26.25, + "grad_norm": 2.075788736343384, + "learning_rate": 0.0009499202551834131, + "loss": 0.4183, + "step": 16460 + }, + { + "epoch": 26.27, + "grad_norm": 1.428053379058838, + "learning_rate": 0.0009492822966507177, + "loss": 0.4499, + "step": 16470 + }, + { + "epoch": 26.28, + "grad_norm": 0.9592264294624329, + "learning_rate": 0.0009486443381180223, + "loss": 0.4191, + "step": 16480 + }, + { + "epoch": 26.3, + "grad_norm": 1.708006739616394, + "learning_rate": 0.000948006379585327, + "loss": 0.4259, + "step": 16490 + }, + { + "epoch": 26.32, + "grad_norm": 2.1585805416107178, + "learning_rate": 0.0009473684210526315, + "loss": 0.5309, + "step": 16500 + }, + { + "epoch": 26.33, + "grad_norm": 1.7619798183441162, + "learning_rate": 0.0009467304625199362, + "loss": 0.4947, + "step": 16510 + }, + { + "epoch": 26.35, + "grad_norm": 2.408426523208618, + "learning_rate": 0.0009460925039872409, + "loss": 0.4576, + "step": 16520 + }, + { + "epoch": 26.36, + "grad_norm": 1.5698516368865967, + "learning_rate": 0.0009454545454545454, + "loss": 0.4161, + "step": 16530 + }, + { + "epoch": 26.38, + "grad_norm": 3.033655881881714, + "learning_rate": 0.0009448165869218501, + "loss": 0.5235, + "step": 16540 + }, + { + "epoch": 26.4, + "grad_norm": 2.2422995567321777, + "learning_rate": 0.0009441786283891547, + "loss": 0.5035, + "step": 16550 + }, + { + "epoch": 26.41, + "grad_norm": 2.3441011905670166, + "learning_rate": 0.0009435406698564593, + "loss": 0.4001, + "step": 16560 + }, + { + "epoch": 26.43, + "grad_norm": 1.529283881187439, + "learning_rate": 0.0009429027113237639, + "loss": 0.3795, + "step": 16570 + }, + { + "epoch": 26.44, + "grad_norm": 1.2047476768493652, + "learning_rate": 0.0009422647527910686, + "loss": 0.4504, + "step": 16580 + }, + { + "epoch": 26.46, + "grad_norm": 2.407144069671631, + "learning_rate": 0.0009416267942583733, + "loss": 0.3977, + "step": 16590 + }, + { + "epoch": 26.48, + "grad_norm": 1.3065524101257324, + "learning_rate": 0.0009409888357256778, + "loss": 0.4534, + "step": 16600 + }, + { + "epoch": 26.49, + "grad_norm": 2.119401693344116, + "learning_rate": 0.0009403508771929825, + "loss": 0.513, + "step": 16610 + }, + { + "epoch": 26.51, + "grad_norm": 1.1828601360321045, + "learning_rate": 0.000939712918660287, + "loss": 0.4113, + "step": 16620 + }, + { + "epoch": 26.52, + "grad_norm": 3.7420921325683594, + "learning_rate": 0.0009390749601275917, + "loss": 0.4948, + "step": 16630 + }, + { + "epoch": 26.54, + "grad_norm": 1.524720549583435, + "learning_rate": 0.0009384370015948963, + "loss": 0.4233, + "step": 16640 + }, + { + "epoch": 26.56, + "grad_norm": 2.136596918106079, + "learning_rate": 0.000937799043062201, + "loss": 0.472, + "step": 16650 + }, + { + "epoch": 26.57, + "grad_norm": 2.395744800567627, + "learning_rate": 0.0009371610845295055, + "loss": 0.4629, + "step": 16660 + }, + { + "epoch": 26.59, + "grad_norm": 1.2973766326904297, + "learning_rate": 0.0009365231259968102, + "loss": 0.4332, + "step": 16670 + }, + { + "epoch": 26.6, + "grad_norm": 2.164285659790039, + "learning_rate": 0.0009358851674641149, + "loss": 0.4451, + "step": 16680 + }, + { + "epoch": 26.62, + "grad_norm": 1.284764051437378, + "learning_rate": 0.0009352472089314194, + "loss": 0.5361, + "step": 16690 + }, + { + "epoch": 26.63, + "grad_norm": 1.187538743019104, + "learning_rate": 0.0009346092503987241, + "loss": 0.4414, + "step": 16700 + }, + { + "epoch": 26.65, + "grad_norm": 2.0321905612945557, + "learning_rate": 0.0009339712918660287, + "loss": 0.3823, + "step": 16710 + }, + { + "epoch": 26.67, + "grad_norm": 2.023181676864624, + "learning_rate": 0.0009333333333333333, + "loss": 0.5097, + "step": 16720 + }, + { + "epoch": 26.68, + "grad_norm": 1.6229287385940552, + "learning_rate": 0.0009326953748006379, + "loss": 0.4747, + "step": 16730 + }, + { + "epoch": 26.7, + "grad_norm": 1.848752498626709, + "learning_rate": 0.0009320574162679426, + "loss": 0.4416, + "step": 16740 + }, + { + "epoch": 26.71, + "grad_norm": 1.674248218536377, + "learning_rate": 0.0009314194577352472, + "loss": 0.5362, + "step": 16750 + }, + { + "epoch": 26.73, + "grad_norm": 0.7888638973236084, + "learning_rate": 0.0009307814992025518, + "loss": 0.4482, + "step": 16760 + }, + { + "epoch": 26.75, + "grad_norm": 1.2110415697097778, + "learning_rate": 0.0009301435406698565, + "loss": 0.4771, + "step": 16770 + }, + { + "epoch": 26.76, + "grad_norm": 2.884260654449463, + "learning_rate": 0.000929505582137161, + "loss": 0.44, + "step": 16780 + }, + { + "epoch": 26.78, + "grad_norm": 1.4633077383041382, + "learning_rate": 0.0009288676236044657, + "loss": 0.4565, + "step": 16790 + }, + { + "epoch": 26.79, + "grad_norm": 1.6688116788864136, + "learning_rate": 0.0009282296650717703, + "loss": 0.4524, + "step": 16800 + }, + { + "epoch": 26.81, + "grad_norm": 1.4576424360275269, + "learning_rate": 0.000927591706539075, + "loss": 0.4253, + "step": 16810 + }, + { + "epoch": 26.83, + "grad_norm": 1.47834312915802, + "learning_rate": 0.0009269537480063796, + "loss": 0.4857, + "step": 16820 + }, + { + "epoch": 26.84, + "grad_norm": 1.2933154106140137, + "learning_rate": 0.0009263157894736843, + "loss": 0.4483, + "step": 16830 + }, + { + "epoch": 26.86, + "grad_norm": 2.671135663986206, + "learning_rate": 0.0009256778309409889, + "loss": 0.4321, + "step": 16840 + }, + { + "epoch": 26.87, + "grad_norm": 0.900836169719696, + "learning_rate": 0.0009250398724082935, + "loss": 0.4142, + "step": 16850 + }, + { + "epoch": 26.89, + "grad_norm": 1.467921257019043, + "learning_rate": 0.0009244019138755982, + "loss": 0.4381, + "step": 16860 + }, + { + "epoch": 26.91, + "grad_norm": 1.2465593814849854, + "learning_rate": 0.0009237639553429027, + "loss": 0.4075, + "step": 16870 + }, + { + "epoch": 26.92, + "grad_norm": 1.7828130722045898, + "learning_rate": 0.0009231259968102074, + "loss": 0.4855, + "step": 16880 + }, + { + "epoch": 26.94, + "grad_norm": 2.368098735809326, + "learning_rate": 0.000922488038277512, + "loss": 0.5569, + "step": 16890 + }, + { + "epoch": 26.95, + "grad_norm": 1.8269487619400024, + "learning_rate": 0.0009218500797448166, + "loss": 0.5397, + "step": 16900 + }, + { + "epoch": 26.97, + "grad_norm": 1.5869868993759155, + "learning_rate": 0.0009212121212121213, + "loss": 0.447, + "step": 16910 + }, + { + "epoch": 26.99, + "grad_norm": 2.1379966735839844, + "learning_rate": 0.0009205741626794259, + "loss": 0.3943, + "step": 16920 + }, + { + "epoch": 27.0, + "grad_norm": 0.7937178611755371, + "learning_rate": 0.0009199362041467305, + "loss": 0.4762, + "step": 16930 + }, + { + "epoch": 27.02, + "grad_norm": 0.7434051036834717, + "learning_rate": 0.0009192982456140351, + "loss": 0.3192, + "step": 16940 + }, + { + "epoch": 27.03, + "grad_norm": 1.9126826524734497, + "learning_rate": 0.0009186602870813398, + "loss": 0.3383, + "step": 16950 + }, + { + "epoch": 27.05, + "grad_norm": 0.8884724378585815, + "learning_rate": 0.0009180223285486443, + "loss": 0.335, + "step": 16960 + }, + { + "epoch": 27.07, + "grad_norm": 0.7101998329162598, + "learning_rate": 0.000917384370015949, + "loss": 0.355, + "step": 16970 + }, + { + "epoch": 27.08, + "grad_norm": 1.6486220359802246, + "learning_rate": 0.0009167464114832537, + "loss": 0.4237, + "step": 16980 + }, + { + "epoch": 27.1, + "grad_norm": 1.5720986127853394, + "learning_rate": 0.0009161084529505583, + "loss": 0.3649, + "step": 16990 + }, + { + "epoch": 27.11, + "grad_norm": 1.331430196762085, + "learning_rate": 0.0009154704944178629, + "loss": 0.4673, + "step": 17000 + }, + { + "epoch": 27.13, + "grad_norm": 1.1665971279144287, + "learning_rate": 0.0009148325358851675, + "loss": 0.3722, + "step": 17010 + }, + { + "epoch": 27.15, + "grad_norm": 0.784977376461029, + "learning_rate": 0.0009141945773524722, + "loss": 0.3737, + "step": 17020 + }, + { + "epoch": 27.16, + "grad_norm": 1.2977066040039062, + "learning_rate": 0.0009135566188197767, + "loss": 0.4048, + "step": 17030 + }, + { + "epoch": 27.18, + "grad_norm": 0.9560506343841553, + "learning_rate": 0.0009129186602870814, + "loss": 0.4475, + "step": 17040 + }, + { + "epoch": 27.19, + "grad_norm": 1.2674915790557861, + "learning_rate": 0.000912280701754386, + "loss": 0.4099, + "step": 17050 + }, + { + "epoch": 27.21, + "grad_norm": 1.7103983163833618, + "learning_rate": 0.0009116427432216906, + "loss": 0.3881, + "step": 17060 + }, + { + "epoch": 27.22, + "grad_norm": 1.464312195777893, + "learning_rate": 0.0009110047846889953, + "loss": 0.3751, + "step": 17070 + }, + { + "epoch": 27.24, + "grad_norm": 1.2396901845932007, + "learning_rate": 0.0009103668261562999, + "loss": 0.3781, + "step": 17080 + }, + { + "epoch": 27.26, + "grad_norm": 1.704807162284851, + "learning_rate": 0.0009097288676236045, + "loss": 0.3462, + "step": 17090 + }, + { + "epoch": 27.27, + "grad_norm": 1.6080540418624878, + "learning_rate": 0.0009090909090909091, + "loss": 0.4836, + "step": 17100 + }, + { + "epoch": 27.29, + "grad_norm": 2.0571115016937256, + "learning_rate": 0.0009084529505582138, + "loss": 0.424, + "step": 17110 + }, + { + "epoch": 27.3, + "grad_norm": 1.9051276445388794, + "learning_rate": 0.0009078149920255183, + "loss": 0.4107, + "step": 17120 + }, + { + "epoch": 27.32, + "grad_norm": 1.0614899396896362, + "learning_rate": 0.000907177033492823, + "loss": 0.4258, + "step": 17130 + }, + { + "epoch": 27.34, + "grad_norm": 0.771443247795105, + "learning_rate": 0.0009065390749601277, + "loss": 0.4893, + "step": 17140 + }, + { + "epoch": 27.35, + "grad_norm": 1.2591562271118164, + "learning_rate": 0.0009059011164274322, + "loss": 0.3847, + "step": 17150 + }, + { + "epoch": 27.37, + "grad_norm": 2.423963785171509, + "learning_rate": 0.0009052631578947369, + "loss": 0.4034, + "step": 17160 + }, + { + "epoch": 27.38, + "grad_norm": 1.5385760068893433, + "learning_rate": 0.0009046251993620415, + "loss": 0.4347, + "step": 17170 + }, + { + "epoch": 27.4, + "grad_norm": 1.886620044708252, + "learning_rate": 0.0009039872408293461, + "loss": 0.5058, + "step": 17180 + }, + { + "epoch": 27.42, + "grad_norm": 1.3259475231170654, + "learning_rate": 0.0009033492822966507, + "loss": 0.4189, + "step": 17190 + }, + { + "epoch": 27.43, + "grad_norm": 2.392594814300537, + "learning_rate": 0.0009027113237639554, + "loss": 0.3627, + "step": 17200 + }, + { + "epoch": 27.45, + "grad_norm": 2.2240548133850098, + "learning_rate": 0.0009020733652312599, + "loss": 0.4367, + "step": 17210 + }, + { + "epoch": 27.46, + "grad_norm": 1.4467096328735352, + "learning_rate": 0.0009014354066985646, + "loss": 0.4677, + "step": 17220 + }, + { + "epoch": 27.48, + "grad_norm": 1.5662921667099, + "learning_rate": 0.0009007974481658693, + "loss": 0.508, + "step": 17230 + }, + { + "epoch": 27.5, + "grad_norm": 1.6414707899093628, + "learning_rate": 0.0009001594896331738, + "loss": 0.3679, + "step": 17240 + }, + { + "epoch": 27.51, + "grad_norm": 1.0565104484558105, + "learning_rate": 0.0008995215311004785, + "loss": 0.3557, + "step": 17250 + }, + { + "epoch": 27.53, + "grad_norm": 1.353499174118042, + "learning_rate": 0.0008988835725677831, + "loss": 0.4911, + "step": 17260 + }, + { + "epoch": 27.54, + "grad_norm": 3.16988205909729, + "learning_rate": 0.0008982456140350877, + "loss": 0.5339, + "step": 17270 + }, + { + "epoch": 27.56, + "grad_norm": 2.6091325283050537, + "learning_rate": 0.0008976076555023923, + "loss": 0.5084, + "step": 17280 + }, + { + "epoch": 27.58, + "grad_norm": 1.3425127267837524, + "learning_rate": 0.000896969696969697, + "loss": 0.4548, + "step": 17290 + }, + { + "epoch": 27.59, + "grad_norm": 1.9189682006835938, + "learning_rate": 0.0008963317384370016, + "loss": 0.4727, + "step": 17300 + }, + { + "epoch": 27.61, + "grad_norm": 3.497046709060669, + "learning_rate": 0.0008956937799043062, + "loss": 0.5446, + "step": 17310 + }, + { + "epoch": 27.62, + "grad_norm": 1.4161769151687622, + "learning_rate": 0.0008950558213716109, + "loss": 0.362, + "step": 17320 + }, + { + "epoch": 27.64, + "grad_norm": 1.7099406719207764, + "learning_rate": 0.0008944178628389154, + "loss": 0.3934, + "step": 17330 + }, + { + "epoch": 27.66, + "grad_norm": 2.129094362258911, + "learning_rate": 0.0008937799043062201, + "loss": 0.4169, + "step": 17340 + }, + { + "epoch": 27.67, + "grad_norm": 1.5544568300247192, + "learning_rate": 0.0008931419457735247, + "loss": 0.4201, + "step": 17350 + }, + { + "epoch": 27.69, + "grad_norm": 1.1022940874099731, + "learning_rate": 0.0008925039872408293, + "loss": 0.4958, + "step": 17360 + }, + { + "epoch": 27.7, + "grad_norm": 1.771380066871643, + "learning_rate": 0.0008918660287081339, + "loss": 0.3859, + "step": 17370 + }, + { + "epoch": 27.72, + "grad_norm": 2.647625207901001, + "learning_rate": 0.0008912280701754386, + "loss": 0.4444, + "step": 17380 + }, + { + "epoch": 27.74, + "grad_norm": 1.4977500438690186, + "learning_rate": 0.0008905901116427433, + "loss": 0.4672, + "step": 17390 + }, + { + "epoch": 27.75, + "grad_norm": 1.1140875816345215, + "learning_rate": 0.0008899521531100478, + "loss": 0.4828, + "step": 17400 + }, + { + "epoch": 27.77, + "grad_norm": 2.1960301399230957, + "learning_rate": 0.0008893141945773525, + "loss": 0.5277, + "step": 17410 + }, + { + "epoch": 27.78, + "grad_norm": 1.2357120513916016, + "learning_rate": 0.000888676236044657, + "loss": 0.3831, + "step": 17420 + }, + { + "epoch": 27.8, + "grad_norm": 2.183209180831909, + "learning_rate": 0.0008880382775119617, + "loss": 0.4646, + "step": 17430 + }, + { + "epoch": 27.81, + "grad_norm": 1.4991573095321655, + "learning_rate": 0.0008874003189792663, + "loss": 0.4445, + "step": 17440 + }, + { + "epoch": 27.83, + "grad_norm": 2.547933340072632, + "learning_rate": 0.000886762360446571, + "loss": 0.609, + "step": 17450 + }, + { + "epoch": 27.85, + "grad_norm": 1.751570224761963, + "learning_rate": 0.0008861244019138756, + "loss": 0.4723, + "step": 17460 + }, + { + "epoch": 27.86, + "grad_norm": 1.5204071998596191, + "learning_rate": 0.0008854864433811802, + "loss": 0.3944, + "step": 17470 + }, + { + "epoch": 27.88, + "grad_norm": 0.8941110372543335, + "learning_rate": 0.0008848484848484849, + "loss": 0.4384, + "step": 17480 + }, + { + "epoch": 27.89, + "grad_norm": 3.1257965564727783, + "learning_rate": 0.0008842105263157894, + "loss": 0.403, + "step": 17490 + }, + { + "epoch": 27.91, + "grad_norm": 1.1965994834899902, + "learning_rate": 0.0008835725677830941, + "loss": 0.4476, + "step": 17500 + }, + { + "epoch": 27.93, + "grad_norm": 2.3756983280181885, + "learning_rate": 0.0008829346092503987, + "loss": 0.5574, + "step": 17510 + }, + { + "epoch": 27.94, + "grad_norm": 2.028165578842163, + "learning_rate": 0.0008822966507177033, + "loss": 0.5269, + "step": 17520 + }, + { + "epoch": 27.96, + "grad_norm": 2.061138868331909, + "learning_rate": 0.0008816586921850079, + "loss": 0.3863, + "step": 17530 + }, + { + "epoch": 27.97, + "grad_norm": 1.6647778749465942, + "learning_rate": 0.0008810207336523126, + "loss": 0.4803, + "step": 17540 + }, + { + "epoch": 27.99, + "grad_norm": 1.8999004364013672, + "learning_rate": 0.0008803827751196173, + "loss": 0.4465, + "step": 17550 + }, + { + "epoch": 28.01, + "grad_norm": 1.4546337127685547, + "learning_rate": 0.0008797448165869219, + "loss": 0.4054, + "step": 17560 + }, + { + "epoch": 28.02, + "grad_norm": 0.5947902798652649, + "learning_rate": 0.0008791068580542266, + "loss": 0.3905, + "step": 17570 + }, + { + "epoch": 28.04, + "grad_norm": 1.231417179107666, + "learning_rate": 0.0008784688995215311, + "loss": 0.3693, + "step": 17580 + }, + { + "epoch": 28.05, + "grad_norm": 1.068305253982544, + "learning_rate": 0.0008778309409888358, + "loss": 0.3525, + "step": 17590 + }, + { + "epoch": 28.07, + "grad_norm": 1.7087610960006714, + "learning_rate": 0.0008771929824561404, + "loss": 0.3472, + "step": 17600 + }, + { + "epoch": 28.09, + "grad_norm": 1.412925362586975, + "learning_rate": 0.000876555023923445, + "loss": 0.469, + "step": 17610 + }, + { + "epoch": 28.1, + "grad_norm": 1.3570494651794434, + "learning_rate": 0.0008759170653907497, + "loss": 0.4222, + "step": 17620 + }, + { + "epoch": 28.12, + "grad_norm": 0.9123827219009399, + "learning_rate": 0.0008752791068580543, + "loss": 0.3517, + "step": 17630 + }, + { + "epoch": 28.13, + "grad_norm": 1.3093185424804688, + "learning_rate": 0.0008746411483253589, + "loss": 0.4117, + "step": 17640 + }, + { + "epoch": 28.15, + "grad_norm": 1.1676615476608276, + "learning_rate": 0.0008740031897926635, + "loss": 0.3839, + "step": 17650 + }, + { + "epoch": 28.17, + "grad_norm": 0.8572595119476318, + "learning_rate": 0.0008733652312599682, + "loss": 0.38, + "step": 17660 + }, + { + "epoch": 28.18, + "grad_norm": 1.9796086549758911, + "learning_rate": 0.0008727272727272727, + "loss": 0.4742, + "step": 17670 + }, + { + "epoch": 28.2, + "grad_norm": 1.599166989326477, + "learning_rate": 0.0008720893141945774, + "loss": 0.4556, + "step": 17680 + }, + { + "epoch": 28.21, + "grad_norm": 1.9437137842178345, + "learning_rate": 0.0008714513556618821, + "loss": 0.4952, + "step": 17690 + }, + { + "epoch": 28.23, + "grad_norm": 1.6551004648208618, + "learning_rate": 0.0008708133971291866, + "loss": 0.3986, + "step": 17700 + }, + { + "epoch": 28.25, + "grad_norm": 1.8391096591949463, + "learning_rate": 0.0008701754385964913, + "loss": 0.4194, + "step": 17710 + }, + { + "epoch": 28.26, + "grad_norm": 0.9920051097869873, + "learning_rate": 0.0008695374800637959, + "loss": 0.3206, + "step": 17720 + }, + { + "epoch": 28.28, + "grad_norm": 1.8732203245162964, + "learning_rate": 0.0008688995215311005, + "loss": 0.4823, + "step": 17730 + }, + { + "epoch": 28.29, + "grad_norm": 1.4714813232421875, + "learning_rate": 0.0008682615629984051, + "loss": 0.4007, + "step": 17740 + }, + { + "epoch": 28.31, + "grad_norm": 1.8994234800338745, + "learning_rate": 0.0008676236044657098, + "loss": 0.4203, + "step": 17750 + }, + { + "epoch": 28.33, + "grad_norm": 1.9376466274261475, + "learning_rate": 0.0008669856459330143, + "loss": 0.4652, + "step": 17760 + }, + { + "epoch": 28.34, + "grad_norm": 2.3434700965881348, + "learning_rate": 0.000866347687400319, + "loss": 0.3574, + "step": 17770 + }, + { + "epoch": 28.36, + "grad_norm": 1.5705221891403198, + "learning_rate": 0.0008657097288676237, + "loss": 0.3634, + "step": 17780 + }, + { + "epoch": 28.37, + "grad_norm": 2.1308560371398926, + "learning_rate": 0.0008650717703349283, + "loss": 0.4423, + "step": 17790 + }, + { + "epoch": 28.39, + "grad_norm": 0.966135561466217, + "learning_rate": 0.0008644338118022329, + "loss": 0.4221, + "step": 17800 + }, + { + "epoch": 28.41, + "grad_norm": 1.37132728099823, + "learning_rate": 0.0008637958532695375, + "loss": 0.4348, + "step": 17810 + }, + { + "epoch": 28.42, + "grad_norm": 1.676096796989441, + "learning_rate": 0.0008631578947368422, + "loss": 0.4338, + "step": 17820 + }, + { + "epoch": 28.44, + "grad_norm": 1.1030077934265137, + "learning_rate": 0.0008625199362041467, + "loss": 0.4399, + "step": 17830 + }, + { + "epoch": 28.45, + "grad_norm": 0.8978865146636963, + "learning_rate": 0.0008618819776714514, + "loss": 0.4306, + "step": 17840 + }, + { + "epoch": 28.47, + "grad_norm": 1.170512080192566, + "learning_rate": 0.0008612440191387561, + "loss": 0.4347, + "step": 17850 + }, + { + "epoch": 28.48, + "grad_norm": 1.0260136127471924, + "learning_rate": 0.0008606060606060606, + "loss": 0.3928, + "step": 17860 + }, + { + "epoch": 28.5, + "grad_norm": 1.04338800907135, + "learning_rate": 0.0008599681020733653, + "loss": 0.4193, + "step": 17870 + }, + { + "epoch": 28.52, + "grad_norm": 0.9068986177444458, + "learning_rate": 0.0008593301435406699, + "loss": 0.3889, + "step": 17880 + }, + { + "epoch": 28.53, + "grad_norm": 1.3259004354476929, + "learning_rate": 0.0008586921850079745, + "loss": 0.3873, + "step": 17890 + }, + { + "epoch": 28.55, + "grad_norm": 1.3916800022125244, + "learning_rate": 0.0008580542264752791, + "loss": 0.4631, + "step": 17900 + }, + { + "epoch": 28.56, + "grad_norm": 2.1619112491607666, + "learning_rate": 0.0008574162679425838, + "loss": 0.4275, + "step": 17910 + }, + { + "epoch": 28.58, + "grad_norm": 1.750162959098816, + "learning_rate": 0.0008567783094098883, + "loss": 0.4469, + "step": 17920 + }, + { + "epoch": 28.6, + "grad_norm": 1.2156579494476318, + "learning_rate": 0.000856140350877193, + "loss": 0.3898, + "step": 17930 + }, + { + "epoch": 28.61, + "grad_norm": 1.1427280902862549, + "learning_rate": 0.0008555023923444977, + "loss": 0.4424, + "step": 17940 + }, + { + "epoch": 28.63, + "grad_norm": 1.6410181522369385, + "learning_rate": 0.0008548644338118022, + "loss": 0.4153, + "step": 17950 + }, + { + "epoch": 28.64, + "grad_norm": 1.1331639289855957, + "learning_rate": 0.0008542264752791069, + "loss": 0.4638, + "step": 17960 + }, + { + "epoch": 28.66, + "grad_norm": 0.9264315366744995, + "learning_rate": 0.0008535885167464115, + "loss": 0.4634, + "step": 17970 + }, + { + "epoch": 28.68, + "grad_norm": 1.4615089893341064, + "learning_rate": 0.0008529505582137161, + "loss": 0.383, + "step": 17980 + }, + { + "epoch": 28.69, + "grad_norm": 1.291256070137024, + "learning_rate": 0.0008523125996810207, + "loss": 0.4415, + "step": 17990 + }, + { + "epoch": 28.71, + "grad_norm": 1.3759894371032715, + "learning_rate": 0.0008516746411483254, + "loss": 0.4275, + "step": 18000 + }, + { + "epoch": 28.72, + "grad_norm": 2.605381488800049, + "learning_rate": 0.00085103668261563, + "loss": 0.4614, + "step": 18010 + }, + { + "epoch": 28.74, + "grad_norm": 1.3442084789276123, + "learning_rate": 0.0008503987240829346, + "loss": 0.4276, + "step": 18020 + }, + { + "epoch": 28.76, + "grad_norm": 1.7800729274749756, + "learning_rate": 0.0008497607655502393, + "loss": 0.5137, + "step": 18030 + }, + { + "epoch": 28.77, + "grad_norm": 1.6473747491836548, + "learning_rate": 0.0008491228070175438, + "loss": 0.4381, + "step": 18040 + }, + { + "epoch": 28.79, + "grad_norm": 1.2551579475402832, + "learning_rate": 0.0008484848484848485, + "loss": 0.3784, + "step": 18050 + }, + { + "epoch": 28.8, + "grad_norm": 1.7706053256988525, + "learning_rate": 0.0008478468899521531, + "loss": 0.4137, + "step": 18060 + }, + { + "epoch": 28.82, + "grad_norm": 1.2189148664474487, + "learning_rate": 0.0008472089314194577, + "loss": 0.3987, + "step": 18070 + }, + { + "epoch": 28.84, + "grad_norm": 2.0609757900238037, + "learning_rate": 0.0008465709728867623, + "loss": 0.4321, + "step": 18080 + }, + { + "epoch": 28.85, + "grad_norm": 3.152968406677246, + "learning_rate": 0.000845933014354067, + "loss": 0.4866, + "step": 18090 + }, + { + "epoch": 28.87, + "grad_norm": 1.9931256771087646, + "learning_rate": 0.0008452950558213716, + "loss": 0.3915, + "step": 18100 + }, + { + "epoch": 28.88, + "grad_norm": 1.5088871717453003, + "learning_rate": 0.0008446570972886762, + "loss": 0.4172, + "step": 18110 + }, + { + "epoch": 28.9, + "grad_norm": 0.9786420464515686, + "learning_rate": 0.0008440191387559809, + "loss": 0.392, + "step": 18120 + }, + { + "epoch": 28.92, + "grad_norm": 1.9202160835266113, + "learning_rate": 0.0008433811802232854, + "loss": 0.3735, + "step": 18130 + }, + { + "epoch": 28.93, + "grad_norm": 1.7300411462783813, + "learning_rate": 0.0008427432216905901, + "loss": 0.493, + "step": 18140 + }, + { + "epoch": 28.95, + "grad_norm": 1.3852993249893188, + "learning_rate": 0.0008421052631578947, + "loss": 0.447, + "step": 18150 + }, + { + "epoch": 28.96, + "grad_norm": 1.5973821878433228, + "learning_rate": 0.0008414673046251993, + "loss": 0.4247, + "step": 18160 + }, + { + "epoch": 28.98, + "grad_norm": 1.521041989326477, + "learning_rate": 0.000840829346092504, + "loss": 0.514, + "step": 18170 + }, + { + "epoch": 29.0, + "grad_norm": 1.2108961343765259, + "learning_rate": 0.0008401913875598086, + "loss": 0.4258, + "step": 18180 + }, + { + "epoch": 29.01, + "grad_norm": 1.1732271909713745, + "learning_rate": 0.0008395534290271133, + "loss": 0.391, + "step": 18190 + }, + { + "epoch": 29.03, + "grad_norm": 2.4832112789154053, + "learning_rate": 0.0008389154704944178, + "loss": 0.4139, + "step": 18200 + }, + { + "epoch": 29.04, + "grad_norm": 1.2037804126739502, + "learning_rate": 0.0008382775119617225, + "loss": 0.3189, + "step": 18210 + }, + { + "epoch": 29.06, + "grad_norm": 1.1315257549285889, + "learning_rate": 0.000837639553429027, + "loss": 0.3455, + "step": 18220 + }, + { + "epoch": 29.07, + "grad_norm": 0.7903701663017273, + "learning_rate": 0.0008370015948963317, + "loss": 0.3511, + "step": 18230 + }, + { + "epoch": 29.09, + "grad_norm": 1.1077697277069092, + "learning_rate": 0.0008363636363636363, + "loss": 0.4795, + "step": 18240 + }, + { + "epoch": 29.11, + "grad_norm": 0.752619206905365, + "learning_rate": 0.000835725677830941, + "loss": 0.3194, + "step": 18250 + }, + { + "epoch": 29.12, + "grad_norm": 2.1113548278808594, + "learning_rate": 0.0008350877192982456, + "loss": 0.3937, + "step": 18260 + }, + { + "epoch": 29.14, + "grad_norm": 1.051826000213623, + "learning_rate": 0.0008344497607655502, + "loss": 0.3966, + "step": 18270 + }, + { + "epoch": 29.15, + "grad_norm": 1.329938530921936, + "learning_rate": 0.0008338118022328549, + "loss": 0.4212, + "step": 18280 + }, + { + "epoch": 29.17, + "grad_norm": 1.7144334316253662, + "learning_rate": 0.0008331738437001594, + "loss": 0.4333, + "step": 18290 + }, + { + "epoch": 29.19, + "grad_norm": 1.252589464187622, + "learning_rate": 0.0008325358851674642, + "loss": 0.379, + "step": 18300 + }, + { + "epoch": 29.2, + "grad_norm": 0.7238291501998901, + "learning_rate": 0.0008318979266347688, + "loss": 0.353, + "step": 18310 + }, + { + "epoch": 29.22, + "grad_norm": 1.5246005058288574, + "learning_rate": 0.0008312599681020734, + "loss": 0.361, + "step": 18320 + }, + { + "epoch": 29.23, + "grad_norm": 1.488550066947937, + "learning_rate": 0.0008306220095693781, + "loss": 0.4171, + "step": 18330 + }, + { + "epoch": 29.25, + "grad_norm": 1.30950129032135, + "learning_rate": 0.0008299840510366827, + "loss": 0.37, + "step": 18340 + }, + { + "epoch": 29.27, + "grad_norm": 2.5584652423858643, + "learning_rate": 0.0008293460925039873, + "loss": 0.4683, + "step": 18350 + }, + { + "epoch": 29.28, + "grad_norm": 1.2544807195663452, + "learning_rate": 0.0008287081339712919, + "loss": 0.3517, + "step": 18360 + }, + { + "epoch": 29.3, + "grad_norm": 1.9312729835510254, + "learning_rate": 0.0008280701754385966, + "loss": 0.3754, + "step": 18370 + }, + { + "epoch": 29.31, + "grad_norm": 1.9227901697158813, + "learning_rate": 0.0008274322169059011, + "loss": 0.353, + "step": 18380 + }, + { + "epoch": 29.33, + "grad_norm": 0.7560509443283081, + "learning_rate": 0.0008267942583732058, + "loss": 0.4141, + "step": 18390 + }, + { + "epoch": 29.35, + "grad_norm": 1.5966806411743164, + "learning_rate": 0.0008261562998405105, + "loss": 0.419, + "step": 18400 + }, + { + "epoch": 29.36, + "grad_norm": 1.8788731098175049, + "learning_rate": 0.000825518341307815, + "loss": 0.4134, + "step": 18410 + }, + { + "epoch": 29.38, + "grad_norm": 0.7582562565803528, + "learning_rate": 0.0008248803827751197, + "loss": 0.3092, + "step": 18420 + }, + { + "epoch": 29.39, + "grad_norm": 1.155375599861145, + "learning_rate": 0.0008242424242424243, + "loss": 0.4677, + "step": 18430 + }, + { + "epoch": 29.41, + "grad_norm": 0.581142246723175, + "learning_rate": 0.0008236044657097289, + "loss": 0.4426, + "step": 18440 + }, + { + "epoch": 29.43, + "grad_norm": 1.6078975200653076, + "learning_rate": 0.0008229665071770335, + "loss": 0.3892, + "step": 18450 + }, + { + "epoch": 29.44, + "grad_norm": 1.3083795309066772, + "learning_rate": 0.0008223285486443382, + "loss": 0.3718, + "step": 18460 + }, + { + "epoch": 29.46, + "grad_norm": 1.41934072971344, + "learning_rate": 0.0008216905901116427, + "loss": 0.3701, + "step": 18470 + }, + { + "epoch": 29.47, + "grad_norm": 1.7969826459884644, + "learning_rate": 0.0008210526315789474, + "loss": 0.44, + "step": 18480 + }, + { + "epoch": 29.49, + "grad_norm": 1.134151577949524, + "learning_rate": 0.0008204146730462521, + "loss": 0.3765, + "step": 18490 + }, + { + "epoch": 29.51, + "grad_norm": 1.9421136379241943, + "learning_rate": 0.0008197767145135566, + "loss": 0.3526, + "step": 18500 + }, + { + "epoch": 29.52, + "grad_norm": 1.0447088479995728, + "learning_rate": 0.0008191387559808613, + "loss": 0.3929, + "step": 18510 + }, + { + "epoch": 29.54, + "grad_norm": 2.2842037677764893, + "learning_rate": 0.0008185007974481659, + "loss": 0.428, + "step": 18520 + }, + { + "epoch": 29.55, + "grad_norm": 1.4780536890029907, + "learning_rate": 0.0008178628389154705, + "loss": 0.4503, + "step": 18530 + }, + { + "epoch": 29.57, + "grad_norm": 1.1551343202590942, + "learning_rate": 0.0008172248803827751, + "loss": 0.3608, + "step": 18540 + }, + { + "epoch": 29.59, + "grad_norm": 0.9097251892089844, + "learning_rate": 0.0008165869218500798, + "loss": 0.3289, + "step": 18550 + }, + { + "epoch": 29.6, + "grad_norm": 1.372117042541504, + "learning_rate": 0.0008159489633173845, + "loss": 0.4009, + "step": 18560 + }, + { + "epoch": 29.62, + "grad_norm": 1.3583626747131348, + "learning_rate": 0.000815311004784689, + "loss": 0.4491, + "step": 18570 + }, + { + "epoch": 29.63, + "grad_norm": 1.208733320236206, + "learning_rate": 0.0008146730462519937, + "loss": 0.4185, + "step": 18580 + }, + { + "epoch": 29.65, + "grad_norm": 1.2088313102722168, + "learning_rate": 0.0008140350877192983, + "loss": 0.3733, + "step": 18590 + }, + { + "epoch": 29.67, + "grad_norm": 1.3854396343231201, + "learning_rate": 0.0008133971291866029, + "loss": 0.4189, + "step": 18600 + }, + { + "epoch": 29.68, + "grad_norm": 2.3625354766845703, + "learning_rate": 0.0008127591706539075, + "loss": 0.4158, + "step": 18610 + }, + { + "epoch": 29.7, + "grad_norm": 1.1244155168533325, + "learning_rate": 0.0008121212121212122, + "loss": 0.4763, + "step": 18620 + }, + { + "epoch": 29.71, + "grad_norm": 1.6580774784088135, + "learning_rate": 0.0008114832535885167, + "loss": 0.441, + "step": 18630 + }, + { + "epoch": 29.73, + "grad_norm": 1.9566985368728638, + "learning_rate": 0.0008108452950558214, + "loss": 0.5087, + "step": 18640 + }, + { + "epoch": 29.74, + "grad_norm": 1.5847853422164917, + "learning_rate": 0.0008102073365231261, + "loss": 0.4876, + "step": 18650 + }, + { + "epoch": 29.76, + "grad_norm": 1.6287412643432617, + "learning_rate": 0.0008095693779904306, + "loss": 0.3967, + "step": 18660 + }, + { + "epoch": 29.78, + "grad_norm": 1.0776193141937256, + "learning_rate": 0.0008089314194577353, + "loss": 0.3406, + "step": 18670 + }, + { + "epoch": 29.79, + "grad_norm": 1.6697405576705933, + "learning_rate": 0.0008082934609250399, + "loss": 0.3976, + "step": 18680 + }, + { + "epoch": 29.81, + "grad_norm": 1.0817621946334839, + "learning_rate": 0.0008076555023923445, + "loss": 0.4045, + "step": 18690 + }, + { + "epoch": 29.82, + "grad_norm": 1.333869457244873, + "learning_rate": 0.0008070175438596491, + "loss": 0.4219, + "step": 18700 + }, + { + "epoch": 29.84, + "grad_norm": 1.135141134262085, + "learning_rate": 0.0008063795853269538, + "loss": 0.4283, + "step": 18710 + }, + { + "epoch": 29.86, + "grad_norm": 1.497247576713562, + "learning_rate": 0.0008057416267942584, + "loss": 0.4653, + "step": 18720 + }, + { + "epoch": 29.87, + "grad_norm": 1.3332675695419312, + "learning_rate": 0.000805103668261563, + "loss": 0.4842, + "step": 18730 + }, + { + "epoch": 29.89, + "grad_norm": 1.2101079225540161, + "learning_rate": 0.0008044657097288677, + "loss": 0.4352, + "step": 18740 + }, + { + "epoch": 29.9, + "grad_norm": 1.161740779876709, + "learning_rate": 0.0008038277511961722, + "loss": 0.4775, + "step": 18750 + }, + { + "epoch": 29.92, + "grad_norm": 2.115443229675293, + "learning_rate": 0.0008031897926634769, + "loss": 0.4047, + "step": 18760 + }, + { + "epoch": 29.94, + "grad_norm": 1.6414830684661865, + "learning_rate": 0.0008025518341307815, + "loss": 0.4002, + "step": 18770 + }, + { + "epoch": 29.95, + "grad_norm": 2.5109338760375977, + "learning_rate": 0.0008019138755980861, + "loss": 0.456, + "step": 18780 + }, + { + "epoch": 29.97, + "grad_norm": 1.0796329975128174, + "learning_rate": 0.0008012759170653907, + "loss": 0.3812, + "step": 18790 + }, + { + "epoch": 29.98, + "grad_norm": 1.581429362297058, + "learning_rate": 0.0008006379585326954, + "loss": 0.4038, + "step": 18800 + }, + { + "epoch": 30.0, + "grad_norm": 1.5293798446655273, + "learning_rate": 0.0008, + "loss": 0.4527, + "step": 18810 + }, + { + "epoch": 30.02, + "grad_norm": 0.3642142117023468, + "learning_rate": 0.0007993620414673046, + "loss": 0.3424, + "step": 18820 + }, + { + "epoch": 30.03, + "grad_norm": 1.181534767150879, + "learning_rate": 0.0007987240829346093, + "loss": 0.3467, + "step": 18830 + }, + { + "epoch": 30.05, + "grad_norm": 1.3809243440628052, + "learning_rate": 0.0007980861244019138, + "loss": 0.3445, + "step": 18840 + }, + { + "epoch": 30.06, + "grad_norm": 0.9193634986877441, + "learning_rate": 0.0007974481658692185, + "loss": 0.418, + "step": 18850 + }, + { + "epoch": 30.08, + "grad_norm": 1.8701001405715942, + "learning_rate": 0.0007968102073365231, + "loss": 0.3377, + "step": 18860 + }, + { + "epoch": 30.1, + "grad_norm": 1.0947514772415161, + "learning_rate": 0.0007961722488038277, + "loss": 0.3047, + "step": 18870 + }, + { + "epoch": 30.11, + "grad_norm": 1.5115679502487183, + "learning_rate": 0.0007955342902711324, + "loss": 0.4687, + "step": 18880 + }, + { + "epoch": 30.13, + "grad_norm": 1.4967734813690186, + "learning_rate": 0.000794896331738437, + "loss": 0.3585, + "step": 18890 + }, + { + "epoch": 30.14, + "grad_norm": 1.0271711349487305, + "learning_rate": 0.0007942583732057416, + "loss": 0.3857, + "step": 18900 + }, + { + "epoch": 30.16, + "grad_norm": 0.6969228982925415, + "learning_rate": 0.0007936204146730462, + "loss": 0.3916, + "step": 18910 + }, + { + "epoch": 30.18, + "grad_norm": 1.0234570503234863, + "learning_rate": 0.0007929824561403509, + "loss": 0.2736, + "step": 18920 + }, + { + "epoch": 30.19, + "grad_norm": 0.3337653577327728, + "learning_rate": 0.0007923444976076554, + "loss": 0.3358, + "step": 18930 + }, + { + "epoch": 30.21, + "grad_norm": 0.8475213646888733, + "learning_rate": 0.0007917065390749601, + "loss": 0.4215, + "step": 18940 + }, + { + "epoch": 30.22, + "grad_norm": 1.190527319908142, + "learning_rate": 0.0007910685805422647, + "loss": 0.3912, + "step": 18950 + }, + { + "epoch": 30.24, + "grad_norm": 2.408203601837158, + "learning_rate": 0.0007904306220095693, + "loss": 0.4475, + "step": 18960 + }, + { + "epoch": 30.26, + "grad_norm": 0.8776838183403015, + "learning_rate": 0.000789792663476874, + "loss": 0.3504, + "step": 18970 + }, + { + "epoch": 30.27, + "grad_norm": 2.3490686416625977, + "learning_rate": 0.0007891547049441786, + "loss": 0.3283, + "step": 18980 + }, + { + "epoch": 30.29, + "grad_norm": 0.9696643948554993, + "learning_rate": 0.0007885167464114833, + "loss": 0.3629, + "step": 18990 + }, + { + "epoch": 30.3, + "grad_norm": 1.1829396486282349, + "learning_rate": 0.0007878787878787878, + "loss": 0.3908, + "step": 19000 + }, + { + "epoch": 30.32, + "grad_norm": 1.0642168521881104, + "learning_rate": 0.0007872408293460925, + "loss": 0.359, + "step": 19010 + }, + { + "epoch": 30.33, + "grad_norm": 1.3545867204666138, + "learning_rate": 0.000786602870813397, + "loss": 0.3656, + "step": 19020 + }, + { + "epoch": 30.35, + "grad_norm": 1.1527637243270874, + "learning_rate": 0.0007859649122807017, + "loss": 0.358, + "step": 19030 + }, + { + "epoch": 30.37, + "grad_norm": 1.6512736082077026, + "learning_rate": 0.0007853269537480065, + "loss": 0.3228, + "step": 19040 + }, + { + "epoch": 30.38, + "grad_norm": 0.9381676912307739, + "learning_rate": 0.0007846889952153111, + "loss": 0.3604, + "step": 19050 + }, + { + "epoch": 30.4, + "grad_norm": 0.786491870880127, + "learning_rate": 0.0007840510366826157, + "loss": 0.4115, + "step": 19060 + }, + { + "epoch": 30.41, + "grad_norm": 1.0724458694458008, + "learning_rate": 0.0007834130781499203, + "loss": 0.431, + "step": 19070 + }, + { + "epoch": 30.43, + "grad_norm": 1.1807036399841309, + "learning_rate": 0.000782775119617225, + "loss": 0.3628, + "step": 19080 + }, + { + "epoch": 30.45, + "grad_norm": 2.059079647064209, + "learning_rate": 0.0007821371610845295, + "loss": 0.4, + "step": 19090 + }, + { + "epoch": 30.46, + "grad_norm": 2.4132237434387207, + "learning_rate": 0.0007814992025518342, + "loss": 0.41, + "step": 19100 + }, + { + "epoch": 30.48, + "grad_norm": 1.3958185911178589, + "learning_rate": 0.0007808612440191388, + "loss": 0.4278, + "step": 19110 + }, + { + "epoch": 30.49, + "grad_norm": 1.9119430780410767, + "learning_rate": 0.0007802232854864434, + "loss": 0.3688, + "step": 19120 + }, + { + "epoch": 30.51, + "grad_norm": 0.8936794996261597, + "learning_rate": 0.0007795853269537481, + "loss": 0.3909, + "step": 19130 + }, + { + "epoch": 30.53, + "grad_norm": 0.8988013863563538, + "learning_rate": 0.0007789473684210527, + "loss": 0.413, + "step": 19140 + }, + { + "epoch": 30.54, + "grad_norm": 1.2821409702301025, + "learning_rate": 0.0007783094098883573, + "loss": 0.3746, + "step": 19150 + }, + { + "epoch": 30.56, + "grad_norm": 1.8209261894226074, + "learning_rate": 0.0007776714513556619, + "loss": 0.4328, + "step": 19160 + }, + { + "epoch": 30.57, + "grad_norm": 0.5443445444107056, + "learning_rate": 0.0007770334928229666, + "loss": 0.3094, + "step": 19170 + }, + { + "epoch": 30.59, + "grad_norm": 1.1508780717849731, + "learning_rate": 0.0007763955342902711, + "loss": 0.3781, + "step": 19180 + }, + { + "epoch": 30.61, + "grad_norm": 1.742360234260559, + "learning_rate": 0.0007757575757575758, + "loss": 0.4211, + "step": 19190 + }, + { + "epoch": 30.62, + "grad_norm": 1.645337462425232, + "learning_rate": 0.0007751196172248805, + "loss": 0.4973, + "step": 19200 + }, + { + "epoch": 30.64, + "grad_norm": 3.0447423458099365, + "learning_rate": 0.000774481658692185, + "loss": 0.4262, + "step": 19210 + }, + { + "epoch": 30.65, + "grad_norm": 1.7042852640151978, + "learning_rate": 0.0007738437001594897, + "loss": 0.3602, + "step": 19220 + }, + { + "epoch": 30.67, + "grad_norm": 1.517238974571228, + "learning_rate": 0.0007732057416267943, + "loss": 0.4683, + "step": 19230 + }, + { + "epoch": 30.69, + "grad_norm": 1.3933392763137817, + "learning_rate": 0.0007725677830940989, + "loss": 0.3464, + "step": 19240 + }, + { + "epoch": 30.7, + "grad_norm": 1.2021284103393555, + "learning_rate": 0.0007719298245614035, + "loss": 0.3205, + "step": 19250 + }, + { + "epoch": 30.72, + "grad_norm": 2.1373493671417236, + "learning_rate": 0.0007712918660287082, + "loss": 0.3901, + "step": 19260 + }, + { + "epoch": 30.73, + "grad_norm": 1.1844630241394043, + "learning_rate": 0.0007706539074960128, + "loss": 0.4874, + "step": 19270 + }, + { + "epoch": 30.75, + "grad_norm": 1.7633776664733887, + "learning_rate": 0.0007700159489633174, + "loss": 0.4525, + "step": 19280 + }, + { + "epoch": 30.77, + "grad_norm": 0.8574751615524292, + "learning_rate": 0.0007693779904306221, + "loss": 0.4477, + "step": 19290 + }, + { + "epoch": 30.78, + "grad_norm": 0.9806014895439148, + "learning_rate": 0.0007687400318979266, + "loss": 0.4231, + "step": 19300 + }, + { + "epoch": 30.8, + "grad_norm": 1.515453577041626, + "learning_rate": 0.0007681020733652313, + "loss": 0.4159, + "step": 19310 + }, + { + "epoch": 30.81, + "grad_norm": 1.195142149925232, + "learning_rate": 0.0007674641148325359, + "loss": 0.463, + "step": 19320 + }, + { + "epoch": 30.83, + "grad_norm": 1.2401401996612549, + "learning_rate": 0.0007668261562998405, + "loss": 0.3719, + "step": 19330 + }, + { + "epoch": 30.85, + "grad_norm": 1.4845614433288574, + "learning_rate": 0.0007661881977671451, + "loss": 0.4013, + "step": 19340 + }, + { + "epoch": 30.86, + "grad_norm": 1.5066015720367432, + "learning_rate": 0.0007655502392344498, + "loss": 0.3917, + "step": 19350 + }, + { + "epoch": 30.88, + "grad_norm": 1.3425683975219727, + "learning_rate": 0.0007649122807017545, + "loss": 0.4719, + "step": 19360 + }, + { + "epoch": 30.89, + "grad_norm": 1.1638840436935425, + "learning_rate": 0.000764274322169059, + "loss": 0.3526, + "step": 19370 + }, + { + "epoch": 30.91, + "grad_norm": 1.2238682508468628, + "learning_rate": 0.0007636363636363637, + "loss": 0.4084, + "step": 19380 + }, + { + "epoch": 30.93, + "grad_norm": 2.78072190284729, + "learning_rate": 0.0007629984051036683, + "loss": 0.4198, + "step": 19390 + }, + { + "epoch": 30.94, + "grad_norm": 1.495713472366333, + "learning_rate": 0.0007623604465709729, + "loss": 0.451, + "step": 19400 + }, + { + "epoch": 30.96, + "grad_norm": 1.5464080572128296, + "learning_rate": 0.0007617224880382775, + "loss": 0.492, + "step": 19410 + }, + { + "epoch": 30.97, + "grad_norm": 1.054413080215454, + "learning_rate": 0.0007610845295055822, + "loss": 0.372, + "step": 19420 + }, + { + "epoch": 30.99, + "grad_norm": 1.212849736213684, + "learning_rate": 0.0007604465709728868, + "loss": 0.4701, + "step": 19430 + }, + { + "epoch": 31.0, + "grad_norm": 1.4419000148773193, + "learning_rate": 0.0007598086124401914, + "loss": 0.4366, + "step": 19440 + }, + { + "epoch": 31.02, + "grad_norm": 1.8938413858413696, + "learning_rate": 0.0007591706539074961, + "loss": 0.3519, + "step": 19450 + }, + { + "epoch": 31.04, + "grad_norm": 0.6526773571968079, + "learning_rate": 0.0007585326953748006, + "loss": 0.3049, + "step": 19460 + }, + { + "epoch": 31.05, + "grad_norm": 1.1578338146209717, + "learning_rate": 0.0007578947368421053, + "loss": 0.32, + "step": 19470 + }, + { + "epoch": 31.07, + "grad_norm": 1.577438235282898, + "learning_rate": 0.0007572567783094099, + "loss": 0.3386, + "step": 19480 + }, + { + "epoch": 31.08, + "grad_norm": 1.2335830926895142, + "learning_rate": 0.0007566188197767145, + "loss": 0.3304, + "step": 19490 + }, + { + "epoch": 31.1, + "grad_norm": 1.2585694789886475, + "learning_rate": 0.0007559808612440191, + "loss": 0.347, + "step": 19500 + }, + { + "epoch": 31.12, + "grad_norm": 0.6757459044456482, + "learning_rate": 0.0007553429027113238, + "loss": 0.3262, + "step": 19510 + }, + { + "epoch": 31.13, + "grad_norm": 1.0831152200698853, + "learning_rate": 0.0007547049441786284, + "loss": 0.3829, + "step": 19520 + }, + { + "epoch": 31.15, + "grad_norm": 0.7511752843856812, + "learning_rate": 0.000754066985645933, + "loss": 0.3401, + "step": 19530 + }, + { + "epoch": 31.16, + "grad_norm": 1.0880722999572754, + "learning_rate": 0.0007534290271132377, + "loss": 0.3095, + "step": 19540 + }, + { + "epoch": 31.18, + "grad_norm": 0.5444307923316956, + "learning_rate": 0.0007527910685805422, + "loss": 0.3768, + "step": 19550 + }, + { + "epoch": 31.2, + "grad_norm": 0.3923839330673218, + "learning_rate": 0.0007521531100478469, + "loss": 0.2829, + "step": 19560 + }, + { + "epoch": 31.21, + "grad_norm": 1.0065523386001587, + "learning_rate": 0.0007515151515151515, + "loss": 0.3505, + "step": 19570 + }, + { + "epoch": 31.23, + "grad_norm": 1.3828843832015991, + "learning_rate": 0.0007508771929824561, + "loss": 0.3315, + "step": 19580 + }, + { + "epoch": 31.24, + "grad_norm": 1.0128448009490967, + "learning_rate": 0.0007502392344497608, + "loss": 0.3646, + "step": 19590 + }, + { + "epoch": 31.26, + "grad_norm": 0.8419422507286072, + "learning_rate": 0.0007496012759170654, + "loss": 0.3779, + "step": 19600 + }, + { + "epoch": 31.28, + "grad_norm": 0.926729679107666, + "learning_rate": 0.00074896331738437, + "loss": 0.3611, + "step": 19610 + }, + { + "epoch": 31.29, + "grad_norm": 0.6536783576011658, + "learning_rate": 0.0007483253588516746, + "loss": 0.4033, + "step": 19620 + }, + { + "epoch": 31.31, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0007476874003189793, + "loss": 0.3712, + "step": 19630 + }, + { + "epoch": 31.32, + "grad_norm": 0.7593963146209717, + "learning_rate": 0.0007470494417862838, + "loss": 0.3676, + "step": 19640 + }, + { + "epoch": 31.34, + "grad_norm": 0.4570366442203522, + "learning_rate": 0.0007464114832535885, + "loss": 0.3733, + "step": 19650 + }, + { + "epoch": 31.36, + "grad_norm": 0.5554494857788086, + "learning_rate": 0.0007457735247208931, + "loss": 0.3223, + "step": 19660 + }, + { + "epoch": 31.37, + "grad_norm": 0.5048463344573975, + "learning_rate": 0.0007451355661881977, + "loss": 0.3381, + "step": 19670 + }, + { + "epoch": 31.39, + "grad_norm": 0.8749020099639893, + "learning_rate": 0.0007444976076555024, + "loss": 0.4279, + "step": 19680 + }, + { + "epoch": 31.4, + "grad_norm": 1.8333324193954468, + "learning_rate": 0.000743859649122807, + "loss": 0.3923, + "step": 19690 + }, + { + "epoch": 31.42, + "grad_norm": 0.5864129662513733, + "learning_rate": 0.0007432216905901116, + "loss": 0.3666, + "step": 19700 + }, + { + "epoch": 31.44, + "grad_norm": 1.0455960035324097, + "learning_rate": 0.0007425837320574162, + "loss": 0.2746, + "step": 19710 + }, + { + "epoch": 31.45, + "grad_norm": 0.851701021194458, + "learning_rate": 0.0007419457735247209, + "loss": 0.4655, + "step": 19720 + }, + { + "epoch": 31.47, + "grad_norm": 1.4521914720535278, + "learning_rate": 0.0007413078149920254, + "loss": 0.3904, + "step": 19730 + }, + { + "epoch": 31.48, + "grad_norm": 0.7903003096580505, + "learning_rate": 0.0007406698564593301, + "loss": 0.3511, + "step": 19740 + }, + { + "epoch": 31.5, + "grad_norm": 0.5965768694877625, + "learning_rate": 0.0007400318979266348, + "loss": 0.348, + "step": 19750 + }, + { + "epoch": 31.52, + "grad_norm": 1.1456114053726196, + "learning_rate": 0.0007393939393939393, + "loss": 0.3759, + "step": 19760 + }, + { + "epoch": 31.53, + "grad_norm": 1.1218417882919312, + "learning_rate": 0.000738755980861244, + "loss": 0.3504, + "step": 19770 + }, + { + "epoch": 31.55, + "grad_norm": 1.06869375705719, + "learning_rate": 0.0007381180223285486, + "loss": 0.3465, + "step": 19780 + }, + { + "epoch": 31.56, + "grad_norm": 1.4235601425170898, + "learning_rate": 0.0007374800637958534, + "loss": 0.4782, + "step": 19790 + }, + { + "epoch": 31.58, + "grad_norm": 1.7712465524673462, + "learning_rate": 0.0007368421052631579, + "loss": 0.4207, + "step": 19800 + }, + { + "epoch": 31.59, + "grad_norm": 2.5546319484710693, + "learning_rate": 0.0007362041467304626, + "loss": 0.3945, + "step": 19810 + }, + { + "epoch": 31.61, + "grad_norm": 1.4989862442016602, + "learning_rate": 0.0007355661881977672, + "loss": 0.3541, + "step": 19820 + }, + { + "epoch": 31.63, + "grad_norm": 1.1824603080749512, + "learning_rate": 0.0007349282296650718, + "loss": 0.3834, + "step": 19830 + }, + { + "epoch": 31.64, + "grad_norm": 1.468544602394104, + "learning_rate": 0.0007342902711323765, + "loss": 0.4389, + "step": 19840 + }, + { + "epoch": 31.66, + "grad_norm": 1.5891380310058594, + "learning_rate": 0.0007336523125996811, + "loss": 0.3104, + "step": 19850 + }, + { + "epoch": 31.67, + "grad_norm": 1.6829235553741455, + "learning_rate": 0.0007330143540669857, + "loss": 0.4124, + "step": 19860 + }, + { + "epoch": 31.69, + "grad_norm": 0.8262020349502563, + "learning_rate": 0.0007323763955342903, + "loss": 0.4355, + "step": 19870 + }, + { + "epoch": 31.71, + "grad_norm": 1.0964784622192383, + "learning_rate": 0.000731738437001595, + "loss": 0.4051, + "step": 19880 + }, + { + "epoch": 31.72, + "grad_norm": 1.241181492805481, + "learning_rate": 0.0007311004784688995, + "loss": 0.3317, + "step": 19890 + }, + { + "epoch": 31.74, + "grad_norm": 0.7173839807510376, + "learning_rate": 0.0007304625199362042, + "loss": 0.303, + "step": 19900 + }, + { + "epoch": 31.75, + "grad_norm": 1.3341323137283325, + "learning_rate": 0.0007298245614035089, + "loss": 0.4809, + "step": 19910 + }, + { + "epoch": 31.77, + "grad_norm": 0.7707849144935608, + "learning_rate": 0.0007291866028708134, + "loss": 0.3865, + "step": 19920 + }, + { + "epoch": 31.79, + "grad_norm": 1.0697121620178223, + "learning_rate": 0.0007285486443381181, + "loss": 0.3991, + "step": 19930 + }, + { + "epoch": 31.8, + "grad_norm": 0.8612807989120483, + "learning_rate": 0.0007279106858054227, + "loss": 0.3753, + "step": 19940 + }, + { + "epoch": 31.82, + "grad_norm": 1.1503595113754272, + "learning_rate": 0.0007272727272727273, + "loss": 0.4917, + "step": 19950 + }, + { + "epoch": 31.83, + "grad_norm": 2.088966131210327, + "learning_rate": 0.0007266347687400319, + "loss": 0.4367, + "step": 19960 + }, + { + "epoch": 31.85, + "grad_norm": 1.2572288513183594, + "learning_rate": 0.0007259968102073366, + "loss": 0.4804, + "step": 19970 + }, + { + "epoch": 31.87, + "grad_norm": 1.781175136566162, + "learning_rate": 0.0007253588516746412, + "loss": 0.4508, + "step": 19980 + }, + { + "epoch": 31.88, + "grad_norm": 1.0523390769958496, + "learning_rate": 0.0007247208931419458, + "loss": 0.364, + "step": 19990 + }, + { + "epoch": 31.9, + "grad_norm": 1.5974029302597046, + "learning_rate": 0.0007240829346092505, + "loss": 0.3279, + "step": 20000 + }, + { + "epoch": 31.91, + "grad_norm": 1.0655145645141602, + "learning_rate": 0.000723444976076555, + "loss": 0.3507, + "step": 20010 + }, + { + "epoch": 31.93, + "grad_norm": 1.4828819036483765, + "learning_rate": 0.0007228070175438597, + "loss": 0.3621, + "step": 20020 + }, + { + "epoch": 31.95, + "grad_norm": 0.6305584907531738, + "learning_rate": 0.0007221690590111643, + "loss": 0.47, + "step": 20030 + }, + { + "epoch": 31.96, + "grad_norm": 0.873736560344696, + "learning_rate": 0.0007215311004784689, + "loss": 0.3372, + "step": 20040 + }, + { + "epoch": 31.98, + "grad_norm": 0.7609186768531799, + "learning_rate": 0.0007208931419457735, + "loss": 0.4067, + "step": 20050 + }, + { + "epoch": 31.99, + "grad_norm": 0.638640284538269, + "learning_rate": 0.0007202551834130782, + "loss": 0.4417, + "step": 20060 + }, + { + "epoch": 32.01, + "grad_norm": 1.3461627960205078, + "learning_rate": 0.0007196172248803828, + "loss": 0.3463, + "step": 20070 + }, + { + "epoch": 32.03, + "grad_norm": 0.9440305233001709, + "learning_rate": 0.0007189792663476874, + "loss": 0.3378, + "step": 20080 + }, + { + "epoch": 32.04, + "grad_norm": 1.406764268875122, + "learning_rate": 0.0007183413078149921, + "loss": 0.3547, + "step": 20090 + }, + { + "epoch": 32.06, + "grad_norm": 1.0603829622268677, + "learning_rate": 0.0007177033492822966, + "loss": 0.303, + "step": 20100 + }, + { + "epoch": 32.07, + "grad_norm": 0.5735631585121155, + "learning_rate": 0.0007170653907496013, + "loss": 0.321, + "step": 20110 + }, + { + "epoch": 32.09, + "grad_norm": 0.5425032377243042, + "learning_rate": 0.0007164274322169059, + "loss": 0.3486, + "step": 20120 + }, + { + "epoch": 32.11, + "grad_norm": 0.4281626045703888, + "learning_rate": 0.0007157894736842105, + "loss": 0.4408, + "step": 20130 + }, + { + "epoch": 32.12, + "grad_norm": 0.6306964755058289, + "learning_rate": 0.0007151515151515152, + "loss": 0.3664, + "step": 20140 + }, + { + "epoch": 32.14, + "grad_norm": 1.3869534730911255, + "learning_rate": 0.0007145135566188198, + "loss": 0.3345, + "step": 20150 + }, + { + "epoch": 32.15, + "grad_norm": 1.281069040298462, + "learning_rate": 0.0007138755980861245, + "loss": 0.3758, + "step": 20160 + }, + { + "epoch": 32.17, + "grad_norm": 0.7174175381660461, + "learning_rate": 0.000713237639553429, + "loss": 0.4612, + "step": 20170 + }, + { + "epoch": 32.19, + "grad_norm": 0.7497925758361816, + "learning_rate": 0.0007125996810207337, + "loss": 0.3671, + "step": 20180 + }, + { + "epoch": 32.2, + "grad_norm": 0.6942813992500305, + "learning_rate": 0.0007119617224880383, + "loss": 0.3147, + "step": 20190 + }, + { + "epoch": 32.22, + "grad_norm": 1.1213644742965698, + "learning_rate": 0.0007113237639553429, + "loss": 0.3739, + "step": 20200 + }, + { + "epoch": 32.23, + "grad_norm": 0.7664075493812561, + "learning_rate": 0.0007106858054226475, + "loss": 0.3105, + "step": 20210 + }, + { + "epoch": 32.25, + "grad_norm": 0.9661602973937988, + "learning_rate": 0.0007100478468899522, + "loss": 0.3266, + "step": 20220 + }, + { + "epoch": 32.26, + "grad_norm": 1.2888504266738892, + "learning_rate": 0.0007094098883572568, + "loss": 0.3405, + "step": 20230 + }, + { + "epoch": 32.28, + "grad_norm": 0.829325258731842, + "learning_rate": 0.0007087719298245614, + "loss": 0.3778, + "step": 20240 + }, + { + "epoch": 32.3, + "grad_norm": 0.8283563256263733, + "learning_rate": 0.0007081339712918661, + "loss": 0.389, + "step": 20250 + }, + { + "epoch": 32.31, + "grad_norm": 2.8405203819274902, + "learning_rate": 0.0007074960127591706, + "loss": 0.4051, + "step": 20260 + }, + { + "epoch": 32.33, + "grad_norm": 0.9580861330032349, + "learning_rate": 0.0007068580542264753, + "loss": 0.3376, + "step": 20270 + }, + { + "epoch": 32.34, + "grad_norm": 1.7252624034881592, + "learning_rate": 0.0007062200956937799, + "loss": 0.3626, + "step": 20280 + }, + { + "epoch": 32.36, + "grad_norm": 0.8010210990905762, + "learning_rate": 0.0007055821371610845, + "loss": 0.3524, + "step": 20290 + }, + { + "epoch": 32.38, + "grad_norm": 0.7796013951301575, + "learning_rate": 0.0007049441786283892, + "loss": 0.3778, + "step": 20300 + }, + { + "epoch": 32.39, + "grad_norm": 0.8661381602287292, + "learning_rate": 0.0007043062200956938, + "loss": 0.4093, + "step": 20310 + }, + { + "epoch": 32.41, + "grad_norm": 0.987169623374939, + "learning_rate": 0.0007036682615629984, + "loss": 0.2886, + "step": 20320 + }, + { + "epoch": 32.42, + "grad_norm": 1.090738296508789, + "learning_rate": 0.000703030303030303, + "loss": 0.3848, + "step": 20330 + }, + { + "epoch": 32.44, + "grad_norm": 2.1070291996002197, + "learning_rate": 0.0007023923444976077, + "loss": 0.3214, + "step": 20340 + }, + { + "epoch": 32.46, + "grad_norm": 1.216748833656311, + "learning_rate": 0.0007017543859649122, + "loss": 0.3446, + "step": 20350 + }, + { + "epoch": 32.47, + "grad_norm": 0.8944370150566101, + "learning_rate": 0.0007011164274322169, + "loss": 0.3595, + "step": 20360 + }, + { + "epoch": 32.49, + "grad_norm": 0.7445225119590759, + "learning_rate": 0.0007004784688995215, + "loss": 0.3376, + "step": 20370 + }, + { + "epoch": 32.5, + "grad_norm": 0.584930419921875, + "learning_rate": 0.0006998405103668261, + "loss": 0.361, + "step": 20380 + }, + { + "epoch": 32.52, + "grad_norm": 1.1852445602416992, + "learning_rate": 0.0006992025518341308, + "loss": 0.423, + "step": 20390 + }, + { + "epoch": 32.54, + "grad_norm": 0.8096782565116882, + "learning_rate": 0.0006985645933014354, + "loss": 0.3028, + "step": 20400 + }, + { + "epoch": 32.55, + "grad_norm": 0.7330004572868347, + "learning_rate": 0.00069792663476874, + "loss": 0.3499, + "step": 20410 + }, + { + "epoch": 32.57, + "grad_norm": 0.556844174861908, + "learning_rate": 0.0006972886762360446, + "loss": 0.4042, + "step": 20420 + }, + { + "epoch": 32.58, + "grad_norm": 1.2681604623794556, + "learning_rate": 0.0006966507177033493, + "loss": 0.2892, + "step": 20430 + }, + { + "epoch": 32.6, + "grad_norm": 0.985937237739563, + "learning_rate": 0.0006960127591706538, + "loss": 0.367, + "step": 20440 + }, + { + "epoch": 32.62, + "grad_norm": 0.685664176940918, + "learning_rate": 0.0006953748006379585, + "loss": 0.4311, + "step": 20450 + }, + { + "epoch": 32.63, + "grad_norm": 0.580774188041687, + "learning_rate": 0.0006947368421052632, + "loss": 0.3973, + "step": 20460 + }, + { + "epoch": 32.65, + "grad_norm": 0.5588364601135254, + "learning_rate": 0.0006940988835725677, + "loss": 0.3902, + "step": 20470 + }, + { + "epoch": 32.66, + "grad_norm": 2.530954599380493, + "learning_rate": 0.0006934609250398724, + "loss": 0.4405, + "step": 20480 + }, + { + "epoch": 32.68, + "grad_norm": 0.9018158316612244, + "learning_rate": 0.000692822966507177, + "loss": 0.4219, + "step": 20490 + }, + { + "epoch": 32.7, + "grad_norm": 1.1370121240615845, + "learning_rate": 0.0006921850079744816, + "loss": 0.3901, + "step": 20500 + }, + { + "epoch": 32.71, + "grad_norm": 0.9494215250015259, + "learning_rate": 0.0006915470494417862, + "loss": 0.3377, + "step": 20510 + }, + { + "epoch": 32.73, + "grad_norm": 0.7825329899787903, + "learning_rate": 0.0006909090909090909, + "loss": 0.3479, + "step": 20520 + }, + { + "epoch": 32.74, + "grad_norm": 1.0042078495025635, + "learning_rate": 0.0006902711323763954, + "loss": 0.3888, + "step": 20530 + }, + { + "epoch": 32.76, + "grad_norm": 1.4132115840911865, + "learning_rate": 0.0006896331738437002, + "loss": 0.3898, + "step": 20540 + }, + { + "epoch": 32.78, + "grad_norm": 0.9790666103363037, + "learning_rate": 0.0006889952153110049, + "loss": 0.4135, + "step": 20550 + }, + { + "epoch": 32.79, + "grad_norm": 1.599612832069397, + "learning_rate": 0.0006883572567783095, + "loss": 0.4875, + "step": 20560 + }, + { + "epoch": 32.81, + "grad_norm": 0.934172511100769, + "learning_rate": 0.0006877192982456141, + "loss": 0.4245, + "step": 20570 + }, + { + "epoch": 32.82, + "grad_norm": 1.2698485851287842, + "learning_rate": 0.0006870813397129187, + "loss": 0.3811, + "step": 20580 + }, + { + "epoch": 32.84, + "grad_norm": 1.3154641389846802, + "learning_rate": 0.0006864433811802234, + "loss": 0.3682, + "step": 20590 + }, + { + "epoch": 32.85, + "grad_norm": 0.9714843034744263, + "learning_rate": 0.0006858054226475279, + "loss": 0.3105, + "step": 20600 + }, + { + "epoch": 32.87, + "grad_norm": 0.6481144428253174, + "learning_rate": 0.0006851674641148326, + "loss": 0.3489, + "step": 20610 + }, + { + "epoch": 32.89, + "grad_norm": 1.0251431465148926, + "learning_rate": 0.0006845295055821373, + "loss": 0.382, + "step": 20620 + }, + { + "epoch": 32.9, + "grad_norm": 1.046749234199524, + "learning_rate": 0.0006838915470494418, + "loss": 0.4344, + "step": 20630 + }, + { + "epoch": 32.92, + "grad_norm": 1.45224928855896, + "learning_rate": 0.0006832535885167465, + "loss": 0.3381, + "step": 20640 + }, + { + "epoch": 32.93, + "grad_norm": 1.0642521381378174, + "learning_rate": 0.0006826156299840511, + "loss": 0.4037, + "step": 20650 + }, + { + "epoch": 32.95, + "grad_norm": 0.7638436555862427, + "learning_rate": 0.0006819776714513557, + "loss": 0.4348, + "step": 20660 + }, + { + "epoch": 32.97, + "grad_norm": 0.9343836307525635, + "learning_rate": 0.0006813397129186603, + "loss": 0.4039, + "step": 20670 + }, + { + "epoch": 32.98, + "grad_norm": 0.6389197707176208, + "learning_rate": 0.000680701754385965, + "loss": 0.3571, + "step": 20680 + }, + { + "epoch": 33.0, + "grad_norm": 1.0805469751358032, + "learning_rate": 0.0006800637958532696, + "loss": 0.3274, + "step": 20690 + }, + { + "epoch": 33.01, + "grad_norm": 1.199524998664856, + "learning_rate": 0.0006794258373205742, + "loss": 0.3143, + "step": 20700 + }, + { + "epoch": 33.03, + "grad_norm": 1.2967311143875122, + "learning_rate": 0.0006787878787878789, + "loss": 0.3555, + "step": 20710 + }, + { + "epoch": 33.05, + "grad_norm": 1.0752925872802734, + "learning_rate": 0.0006781499202551834, + "loss": 0.3797, + "step": 20720 + }, + { + "epoch": 33.06, + "grad_norm": 0.7720149159431458, + "learning_rate": 0.0006775119617224881, + "loss": 0.2873, + "step": 20730 + }, + { + "epoch": 33.08, + "grad_norm": 0.6133707761764526, + "learning_rate": 0.0006768740031897927, + "loss": 0.3092, + "step": 20740 + }, + { + "epoch": 33.09, + "grad_norm": 0.8874982595443726, + "learning_rate": 0.0006762360446570973, + "loss": 0.3187, + "step": 20750 + }, + { + "epoch": 33.11, + "grad_norm": 1.3732993602752686, + "learning_rate": 0.0006755980861244019, + "loss": 0.2782, + "step": 20760 + }, + { + "epoch": 33.13, + "grad_norm": 0.9452306032180786, + "learning_rate": 0.0006749601275917066, + "loss": 0.2779, + "step": 20770 + }, + { + "epoch": 33.14, + "grad_norm": 1.7680912017822266, + "learning_rate": 0.0006743221690590112, + "loss": 0.2884, + "step": 20780 + }, + { + "epoch": 33.16, + "grad_norm": 1.6482670307159424, + "learning_rate": 0.0006736842105263158, + "loss": 0.359, + "step": 20790 + }, + { + "epoch": 33.17, + "grad_norm": 0.7076551914215088, + "learning_rate": 0.0006730462519936205, + "loss": 0.3134, + "step": 20800 + }, + { + "epoch": 33.19, + "grad_norm": 0.630064845085144, + "learning_rate": 0.000672408293460925, + "loss": 0.4372, + "step": 20810 + }, + { + "epoch": 33.21, + "grad_norm": 1.7952457666397095, + "learning_rate": 0.0006717703349282297, + "loss": 0.3711, + "step": 20820 + }, + { + "epoch": 33.22, + "grad_norm": 2.3427815437316895, + "learning_rate": 0.0006711323763955343, + "loss": 0.3385, + "step": 20830 + }, + { + "epoch": 33.24, + "grad_norm": 0.6796151995658875, + "learning_rate": 0.0006704944178628389, + "loss": 0.2952, + "step": 20840 + }, + { + "epoch": 33.25, + "grad_norm": 1.292067527770996, + "learning_rate": 0.0006698564593301436, + "loss": 0.3058, + "step": 20850 + }, + { + "epoch": 33.27, + "grad_norm": 0.8857368230819702, + "learning_rate": 0.0006692185007974482, + "loss": 0.3184, + "step": 20860 + }, + { + "epoch": 33.29, + "grad_norm": 0.8489099740982056, + "learning_rate": 0.0006685805422647528, + "loss": 0.3405, + "step": 20870 + }, + { + "epoch": 33.3, + "grad_norm": 0.4213086664676666, + "learning_rate": 0.0006679425837320574, + "loss": 0.2919, + "step": 20880 + }, + { + "epoch": 33.32, + "grad_norm": 0.3164719343185425, + "learning_rate": 0.0006673046251993621, + "loss": 0.3406, + "step": 20890 + }, + { + "epoch": 33.33, + "grad_norm": 1.2104874849319458, + "learning_rate": 0.0006666666666666666, + "loss": 0.3277, + "step": 20900 + }, + { + "epoch": 33.35, + "grad_norm": 1.2871508598327637, + "learning_rate": 0.0006660287081339713, + "loss": 0.4363, + "step": 20910 + }, + { + "epoch": 33.37, + "grad_norm": 1.1551439762115479, + "learning_rate": 0.0006653907496012759, + "loss": 0.4146, + "step": 20920 + }, + { + "epoch": 33.38, + "grad_norm": 0.4967116713523865, + "learning_rate": 0.0006647527910685805, + "loss": 0.3614, + "step": 20930 + }, + { + "epoch": 33.4, + "grad_norm": 1.4939340353012085, + "learning_rate": 0.0006641148325358852, + "loss": 0.2574, + "step": 20940 + }, + { + "epoch": 33.41, + "grad_norm": 2.036379337310791, + "learning_rate": 0.0006634768740031898, + "loss": 0.2869, + "step": 20950 + }, + { + "epoch": 33.43, + "grad_norm": 0.6936982870101929, + "learning_rate": 0.0006628389154704945, + "loss": 0.3495, + "step": 20960 + }, + { + "epoch": 33.44, + "grad_norm": 1.4173444509506226, + "learning_rate": 0.000662200956937799, + "loss": 0.4171, + "step": 20970 + }, + { + "epoch": 33.46, + "grad_norm": 0.9318954348564148, + "learning_rate": 0.0006615629984051037, + "loss": 0.368, + "step": 20980 + }, + { + "epoch": 33.48, + "grad_norm": 0.6373530030250549, + "learning_rate": 0.0006609250398724083, + "loss": 0.3407, + "step": 20990 + }, + { + "epoch": 33.49, + "grad_norm": 0.5435881614685059, + "learning_rate": 0.0006602870813397129, + "loss": 0.3582, + "step": 21000 + }, + { + "epoch": 33.51, + "grad_norm": 0.5529409050941467, + "learning_rate": 0.0006596491228070176, + "loss": 0.397, + "step": 21010 + }, + { + "epoch": 33.52, + "grad_norm": 0.6477614641189575, + "learning_rate": 0.0006590111642743222, + "loss": 0.4706, + "step": 21020 + }, + { + "epoch": 33.54, + "grad_norm": 0.6400772929191589, + "learning_rate": 0.0006583732057416268, + "loss": 0.3936, + "step": 21030 + }, + { + "epoch": 33.56, + "grad_norm": 0.7568894624710083, + "learning_rate": 0.0006577352472089314, + "loss": 0.2936, + "step": 21040 + }, + { + "epoch": 33.57, + "grad_norm": 0.6574100852012634, + "learning_rate": 0.0006570972886762361, + "loss": 0.3454, + "step": 21050 + }, + { + "epoch": 33.59, + "grad_norm": 0.6999270915985107, + "learning_rate": 0.0006564593301435406, + "loss": 0.3597, + "step": 21060 + }, + { + "epoch": 33.6, + "grad_norm": 1.4358758926391602, + "learning_rate": 0.0006558213716108453, + "loss": 0.4673, + "step": 21070 + }, + { + "epoch": 33.62, + "grad_norm": 0.8735805749893188, + "learning_rate": 0.0006551834130781499, + "loss": 0.3329, + "step": 21080 + }, + { + "epoch": 33.64, + "grad_norm": 0.42257770895957947, + "learning_rate": 0.0006545454545454545, + "loss": 0.3934, + "step": 21090 + }, + { + "epoch": 33.65, + "grad_norm": 0.5228465795516968, + "learning_rate": 0.0006539074960127592, + "loss": 0.3563, + "step": 21100 + }, + { + "epoch": 33.67, + "grad_norm": 0.7257753014564514, + "learning_rate": 0.0006532695374800638, + "loss": 0.3194, + "step": 21110 + }, + { + "epoch": 33.68, + "grad_norm": 1.101475477218628, + "learning_rate": 0.0006526315789473684, + "loss": 0.3633, + "step": 21120 + }, + { + "epoch": 33.7, + "grad_norm": 1.2462613582611084, + "learning_rate": 0.000651993620414673, + "loss": 0.3845, + "step": 21130 + }, + { + "epoch": 33.72, + "grad_norm": 0.8615121841430664, + "learning_rate": 0.0006513556618819777, + "loss": 0.3183, + "step": 21140 + }, + { + "epoch": 33.73, + "grad_norm": 1.6341915130615234, + "learning_rate": 0.0006507177033492822, + "loss": 0.3739, + "step": 21150 + }, + { + "epoch": 33.75, + "grad_norm": 0.7129934430122375, + "learning_rate": 0.0006500797448165869, + "loss": 0.3184, + "step": 21160 + }, + { + "epoch": 33.76, + "grad_norm": 1.0505317449569702, + "learning_rate": 0.0006494417862838916, + "loss": 0.3377, + "step": 21170 + }, + { + "epoch": 33.78, + "grad_norm": 0.6486239433288574, + "learning_rate": 0.0006488038277511961, + "loss": 0.4064, + "step": 21180 + }, + { + "epoch": 33.8, + "grad_norm": 0.805962324142456, + "learning_rate": 0.0006481658692185008, + "loss": 0.4032, + "step": 21190 + }, + { + "epoch": 33.81, + "grad_norm": 0.8866637349128723, + "learning_rate": 0.0006475279106858054, + "loss": 0.4194, + "step": 21200 + }, + { + "epoch": 33.83, + "grad_norm": 2.0624029636383057, + "learning_rate": 0.00064688995215311, + "loss": 0.3906, + "step": 21210 + }, + { + "epoch": 33.84, + "grad_norm": 0.9357002377510071, + "learning_rate": 0.0006462519936204146, + "loss": 0.4074, + "step": 21220 + }, + { + "epoch": 33.86, + "grad_norm": 0.7102904915809631, + "learning_rate": 0.0006456140350877193, + "loss": 0.365, + "step": 21230 + }, + { + "epoch": 33.88, + "grad_norm": 1.7485020160675049, + "learning_rate": 0.0006449760765550238, + "loss": 0.3714, + "step": 21240 + }, + { + "epoch": 33.89, + "grad_norm": 1.0567692518234253, + "learning_rate": 0.0006443381180223285, + "loss": 0.3404, + "step": 21250 + }, + { + "epoch": 33.91, + "grad_norm": 1.1951782703399658, + "learning_rate": 0.0006437001594896332, + "loss": 0.3729, + "step": 21260 + }, + { + "epoch": 33.92, + "grad_norm": 1.056022047996521, + "learning_rate": 0.0006430622009569377, + "loss": 0.4126, + "step": 21270 + }, + { + "epoch": 33.94, + "grad_norm": 0.45082104206085205, + "learning_rate": 0.0006424242424242425, + "loss": 0.3527, + "step": 21280 + }, + { + "epoch": 33.96, + "grad_norm": 0.9164630174636841, + "learning_rate": 0.0006417862838915471, + "loss": 0.3731, + "step": 21290 + }, + { + "epoch": 33.97, + "grad_norm": 1.3435860872268677, + "learning_rate": 0.0006411483253588518, + "loss": 0.3889, + "step": 21300 + }, + { + "epoch": 33.99, + "grad_norm": 1.0820218324661255, + "learning_rate": 0.0006405103668261563, + "loss": 0.3823, + "step": 21310 + }, + { + "epoch": 34.0, + "grad_norm": 0.43368542194366455, + "learning_rate": 0.000639872408293461, + "loss": 0.2854, + "step": 21320 + }, + { + "epoch": 34.02, + "grad_norm": 1.282607913017273, + "learning_rate": 0.0006392344497607657, + "loss": 0.3346, + "step": 21330 + }, + { + "epoch": 34.04, + "grad_norm": 0.48912495374679565, + "learning_rate": 0.0006385964912280702, + "loss": 0.3253, + "step": 21340 + }, + { + "epoch": 34.05, + "grad_norm": 0.5396437644958496, + "learning_rate": 0.0006379585326953749, + "loss": 0.3848, + "step": 21350 + }, + { + "epoch": 34.07, + "grad_norm": 0.43667685985565186, + "learning_rate": 0.0006373205741626795, + "loss": 0.389, + "step": 21360 + }, + { + "epoch": 34.08, + "grad_norm": 0.7458956837654114, + "learning_rate": 0.0006366826156299841, + "loss": 0.3669, + "step": 21370 + }, + { + "epoch": 34.1, + "grad_norm": 0.5539690256118774, + "learning_rate": 0.0006360446570972887, + "loss": 0.3428, + "step": 21380 + }, + { + "epoch": 34.11, + "grad_norm": 0.5653215646743774, + "learning_rate": 0.0006354066985645934, + "loss": 0.3716, + "step": 21390 + }, + { + "epoch": 34.13, + "grad_norm": 0.7448431849479675, + "learning_rate": 0.0006347687400318979, + "loss": 0.3799, + "step": 21400 + }, + { + "epoch": 34.15, + "grad_norm": 1.6342990398406982, + "learning_rate": 0.0006341307814992026, + "loss": 0.281, + "step": 21410 + }, + { + "epoch": 34.16, + "grad_norm": 1.0939606428146362, + "learning_rate": 0.0006334928229665073, + "loss": 0.2995, + "step": 21420 + }, + { + "epoch": 34.18, + "grad_norm": 0.4550718665122986, + "learning_rate": 0.0006328548644338118, + "loss": 0.2839, + "step": 21430 + }, + { + "epoch": 34.19, + "grad_norm": 0.7015230655670166, + "learning_rate": 0.0006322169059011165, + "loss": 0.3624, + "step": 21440 + }, + { + "epoch": 34.21, + "grad_norm": 0.9311388731002808, + "learning_rate": 0.0006315789473684211, + "loss": 0.3116, + "step": 21450 + }, + { + "epoch": 34.23, + "grad_norm": 0.519597053527832, + "learning_rate": 0.0006309409888357257, + "loss": 0.3464, + "step": 21460 + }, + { + "epoch": 34.24, + "grad_norm": 0.687154233455658, + "learning_rate": 0.0006303030303030303, + "loss": 0.2955, + "step": 21470 + }, + { + "epoch": 34.26, + "grad_norm": 0.6777644753456116, + "learning_rate": 0.000629665071770335, + "loss": 0.3579, + "step": 21480 + }, + { + "epoch": 34.27, + "grad_norm": 1.1561830043792725, + "learning_rate": 0.0006290271132376396, + "loss": 0.324, + "step": 21490 + }, + { + "epoch": 34.29, + "grad_norm": 0.6058475375175476, + "learning_rate": 0.0006283891547049442, + "loss": 0.4582, + "step": 21500 + }, + { + "epoch": 34.31, + "grad_norm": 1.3952281475067139, + "learning_rate": 0.0006277511961722489, + "loss": 0.2972, + "step": 21510 + }, + { + "epoch": 34.32, + "grad_norm": 0.9021815061569214, + "learning_rate": 0.0006271132376395534, + "loss": 0.3736, + "step": 21520 + }, + { + "epoch": 34.34, + "grad_norm": 0.5777958631515503, + "learning_rate": 0.0006264752791068581, + "loss": 0.3418, + "step": 21530 + }, + { + "epoch": 34.35, + "grad_norm": 0.5624024271965027, + "learning_rate": 0.0006258373205741627, + "loss": 0.3897, + "step": 21540 + }, + { + "epoch": 34.37, + "grad_norm": 1.0554344654083252, + "learning_rate": 0.0006251993620414673, + "loss": 0.3636, + "step": 21550 + }, + { + "epoch": 34.39, + "grad_norm": 0.39624953269958496, + "learning_rate": 0.000624561403508772, + "loss": 0.3199, + "step": 21560 + }, + { + "epoch": 34.4, + "grad_norm": 0.8201066255569458, + "learning_rate": 0.0006239234449760766, + "loss": 0.4237, + "step": 21570 + }, + { + "epoch": 34.42, + "grad_norm": 0.7447034120559692, + "learning_rate": 0.0006232854864433812, + "loss": 0.3045, + "step": 21580 + }, + { + "epoch": 34.43, + "grad_norm": 0.37216076254844666, + "learning_rate": 0.0006226475279106858, + "loss": 0.287, + "step": 21590 + }, + { + "epoch": 34.45, + "grad_norm": 1.3851195573806763, + "learning_rate": 0.0006220095693779905, + "loss": 0.3752, + "step": 21600 + }, + { + "epoch": 34.47, + "grad_norm": 0.5135475397109985, + "learning_rate": 0.000621371610845295, + "loss": 0.2575, + "step": 21610 + }, + { + "epoch": 34.48, + "grad_norm": 1.3252980709075928, + "learning_rate": 0.0006207336523125997, + "loss": 0.3204, + "step": 21620 + }, + { + "epoch": 34.5, + "grad_norm": 1.036947250366211, + "learning_rate": 0.0006200956937799043, + "loss": 0.3715, + "step": 21630 + }, + { + "epoch": 34.51, + "grad_norm": 0.9725881218910217, + "learning_rate": 0.0006194577352472089, + "loss": 0.2895, + "step": 21640 + }, + { + "epoch": 34.53, + "grad_norm": 0.8383840322494507, + "learning_rate": 0.0006188197767145136, + "loss": 0.2823, + "step": 21650 + }, + { + "epoch": 34.55, + "grad_norm": 0.5011244416236877, + "learning_rate": 0.0006181818181818182, + "loss": 0.2946, + "step": 21660 + }, + { + "epoch": 34.56, + "grad_norm": 0.5851901769638062, + "learning_rate": 0.0006175438596491228, + "loss": 0.3679, + "step": 21670 + }, + { + "epoch": 34.58, + "grad_norm": 1.2106326818466187, + "learning_rate": 0.0006169059011164274, + "loss": 0.3548, + "step": 21680 + }, + { + "epoch": 34.59, + "grad_norm": 0.7996150255203247, + "learning_rate": 0.0006162679425837321, + "loss": 0.3279, + "step": 21690 + }, + { + "epoch": 34.61, + "grad_norm": 0.9852333664894104, + "learning_rate": 0.0006156299840510366, + "loss": 0.3803, + "step": 21700 + }, + { + "epoch": 34.63, + "grad_norm": 1.8588385581970215, + "learning_rate": 0.0006149920255183413, + "loss": 0.3173, + "step": 21710 + }, + { + "epoch": 34.64, + "grad_norm": 1.403646469116211, + "learning_rate": 0.000614354066985646, + "loss": 0.383, + "step": 21720 + }, + { + "epoch": 34.66, + "grad_norm": 0.7591367363929749, + "learning_rate": 0.0006137161084529505, + "loss": 0.3134, + "step": 21730 + }, + { + "epoch": 34.67, + "grad_norm": 0.8111428022384644, + "learning_rate": 0.0006130781499202552, + "loss": 0.4017, + "step": 21740 + }, + { + "epoch": 34.69, + "grad_norm": 0.7398600578308105, + "learning_rate": 0.0006124401913875598, + "loss": 0.3378, + "step": 21750 + }, + { + "epoch": 34.7, + "grad_norm": 2.6811933517456055, + "learning_rate": 0.0006118022328548645, + "loss": 0.4099, + "step": 21760 + }, + { + "epoch": 34.72, + "grad_norm": 0.5849010944366455, + "learning_rate": 0.000611164274322169, + "loss": 0.3977, + "step": 21770 + }, + { + "epoch": 34.74, + "grad_norm": 1.6892285346984863, + "learning_rate": 0.0006105263157894737, + "loss": 0.3775, + "step": 21780 + }, + { + "epoch": 34.75, + "grad_norm": 0.6772777438163757, + "learning_rate": 0.0006098883572567783, + "loss": 0.3051, + "step": 21790 + }, + { + "epoch": 34.77, + "grad_norm": 0.7815658450126648, + "learning_rate": 0.0006092503987240829, + "loss": 0.3252, + "step": 21800 + }, + { + "epoch": 34.78, + "grad_norm": 0.7828931212425232, + "learning_rate": 0.0006086124401913876, + "loss": 0.3276, + "step": 21810 + }, + { + "epoch": 34.8, + "grad_norm": 0.6614720821380615, + "learning_rate": 0.0006079744816586922, + "loss": 0.3114, + "step": 21820 + }, + { + "epoch": 34.82, + "grad_norm": 0.6951574087142944, + "learning_rate": 0.0006073365231259968, + "loss": 0.4708, + "step": 21830 + }, + { + "epoch": 34.83, + "grad_norm": 0.5724729895591736, + "learning_rate": 0.0006066985645933014, + "loss": 0.4992, + "step": 21840 + }, + { + "epoch": 34.85, + "grad_norm": 0.5912214517593384, + "learning_rate": 0.0006060606060606061, + "loss": 0.3571, + "step": 21850 + }, + { + "epoch": 34.86, + "grad_norm": 1.9406144618988037, + "learning_rate": 0.0006054226475279106, + "loss": 0.4081, + "step": 21860 + }, + { + "epoch": 34.88, + "grad_norm": 0.6928081512451172, + "learning_rate": 0.0006047846889952153, + "loss": 0.3651, + "step": 21870 + }, + { + "epoch": 34.9, + "grad_norm": 1.4750044345855713, + "learning_rate": 0.00060414673046252, + "loss": 0.3445, + "step": 21880 + }, + { + "epoch": 34.91, + "grad_norm": 1.0808309316635132, + "learning_rate": 0.0006035087719298245, + "loss": 0.3738, + "step": 21890 + }, + { + "epoch": 34.93, + "grad_norm": 0.8171405792236328, + "learning_rate": 0.0006028708133971292, + "loss": 0.3313, + "step": 21900 + }, + { + "epoch": 34.94, + "grad_norm": 0.9406991004943848, + "learning_rate": 0.0006022328548644338, + "loss": 0.3488, + "step": 21910 + }, + { + "epoch": 34.96, + "grad_norm": 0.7322232127189636, + "learning_rate": 0.0006015948963317384, + "loss": 0.3948, + "step": 21920 + }, + { + "epoch": 34.98, + "grad_norm": 1.1117455959320068, + "learning_rate": 0.000600956937799043, + "loss": 0.385, + "step": 21930 + }, + { + "epoch": 34.99, + "grad_norm": 1.0977877378463745, + "learning_rate": 0.0006003189792663477, + "loss": 0.419, + "step": 21940 + }, + { + "epoch": 35.01, + "grad_norm": 0.726335346698761, + "learning_rate": 0.0005996810207336522, + "loss": 0.3169, + "step": 21950 + }, + { + "epoch": 35.02, + "grad_norm": 0.49332767724990845, + "learning_rate": 0.0005990430622009569, + "loss": 0.2902, + "step": 21960 + }, + { + "epoch": 35.04, + "grad_norm": 0.9816588759422302, + "learning_rate": 0.0005984051036682616, + "loss": 0.3322, + "step": 21970 + }, + { + "epoch": 35.06, + "grad_norm": 0.8066359162330627, + "learning_rate": 0.0005977671451355661, + "loss": 0.3058, + "step": 21980 + }, + { + "epoch": 35.07, + "grad_norm": 0.38948720693588257, + "learning_rate": 0.0005971291866028708, + "loss": 0.2839, + "step": 21990 + }, + { + "epoch": 35.09, + "grad_norm": 0.2944769561290741, + "learning_rate": 0.0005964912280701754, + "loss": 0.3656, + "step": 22000 + }, + { + "epoch": 35.1, + "grad_norm": 0.3112677335739136, + "learning_rate": 0.00059585326953748, + "loss": 0.2358, + "step": 22010 + }, + { + "epoch": 35.12, + "grad_norm": 2.4940788745880127, + "learning_rate": 0.0005952153110047846, + "loss": 0.3809, + "step": 22020 + }, + { + "epoch": 35.14, + "grad_norm": 0.9833939671516418, + "learning_rate": 0.0005945773524720894, + "loss": 0.3437, + "step": 22030 + }, + { + "epoch": 35.15, + "grad_norm": 1.0946290493011475, + "learning_rate": 0.000593939393939394, + "loss": 0.3872, + "step": 22040 + }, + { + "epoch": 35.17, + "grad_norm": 1.2367923259735107, + "learning_rate": 0.0005933014354066986, + "loss": 0.3041, + "step": 22050 + }, + { + "epoch": 35.18, + "grad_norm": 1.032891035079956, + "learning_rate": 0.0005926634768740033, + "loss": 0.2409, + "step": 22060 + }, + { + "epoch": 35.2, + "grad_norm": 0.7659148573875427, + "learning_rate": 0.0005920255183413078, + "loss": 0.3928, + "step": 22070 + }, + { + "epoch": 35.22, + "grad_norm": 0.337522953748703, + "learning_rate": 0.0005913875598086125, + "loss": 0.3563, + "step": 22080 + }, + { + "epoch": 35.23, + "grad_norm": 0.6713753342628479, + "learning_rate": 0.0005907496012759171, + "loss": 0.3728, + "step": 22090 + }, + { + "epoch": 35.25, + "grad_norm": 1.1470608711242676, + "learning_rate": 0.0005901116427432218, + "loss": 0.3121, + "step": 22100 + }, + { + "epoch": 35.26, + "grad_norm": 0.5234013199806213, + "learning_rate": 0.0005894736842105263, + "loss": 0.42, + "step": 22110 + }, + { + "epoch": 35.28, + "grad_norm": 0.6255330443382263, + "learning_rate": 0.000588835725677831, + "loss": 0.2818, + "step": 22120 + }, + { + "epoch": 35.3, + "grad_norm": 1.1830130815505981, + "learning_rate": 0.0005881977671451357, + "loss": 0.3217, + "step": 22130 + }, + { + "epoch": 35.31, + "grad_norm": 0.47124946117401123, + "learning_rate": 0.0005875598086124402, + "loss": 0.3054, + "step": 22140 + }, + { + "epoch": 35.33, + "grad_norm": 0.5270739793777466, + "learning_rate": 0.0005869218500797449, + "loss": 0.3684, + "step": 22150 + }, + { + "epoch": 35.34, + "grad_norm": 1.9852588176727295, + "learning_rate": 0.0005862838915470495, + "loss": 0.3598, + "step": 22160 + }, + { + "epoch": 35.36, + "grad_norm": 1.0637511014938354, + "learning_rate": 0.0005856459330143541, + "loss": 0.3553, + "step": 22170 + }, + { + "epoch": 35.37, + "grad_norm": 0.7305306792259216, + "learning_rate": 0.0005850079744816587, + "loss": 0.3361, + "step": 22180 + }, + { + "epoch": 35.39, + "grad_norm": 1.0449053049087524, + "learning_rate": 0.0005843700159489634, + "loss": 0.45, + "step": 22190 + }, + { + "epoch": 35.41, + "grad_norm": 0.3895207643508911, + "learning_rate": 0.000583732057416268, + "loss": 0.38, + "step": 22200 + }, + { + "epoch": 35.42, + "grad_norm": 0.8981882333755493, + "learning_rate": 0.0005830940988835726, + "loss": 0.376, + "step": 22210 + }, + { + "epoch": 35.44, + "grad_norm": 1.1853015422821045, + "learning_rate": 0.0005824561403508773, + "loss": 0.4054, + "step": 22220 + }, + { + "epoch": 35.45, + "grad_norm": 0.6197064518928528, + "learning_rate": 0.0005818181818181818, + "loss": 0.3198, + "step": 22230 + }, + { + "epoch": 35.47, + "grad_norm": 0.5569806694984436, + "learning_rate": 0.0005811802232854865, + "loss": 0.4118, + "step": 22240 + }, + { + "epoch": 35.49, + "grad_norm": 0.48562178015708923, + "learning_rate": 0.0005805422647527911, + "loss": 0.2063, + "step": 22250 + }, + { + "epoch": 35.5, + "grad_norm": 0.5743929743766785, + "learning_rate": 0.0005799043062200957, + "loss": 0.2263, + "step": 22260 + }, + { + "epoch": 35.52, + "grad_norm": 0.5665689706802368, + "learning_rate": 0.0005792663476874004, + "loss": 0.3017, + "step": 22270 + }, + { + "epoch": 35.53, + "grad_norm": 0.7719668745994568, + "learning_rate": 0.000578628389154705, + "loss": 0.3591, + "step": 22280 + }, + { + "epoch": 35.55, + "grad_norm": 1.785213828086853, + "learning_rate": 0.0005779904306220096, + "loss": 0.3357, + "step": 22290 + }, + { + "epoch": 35.57, + "grad_norm": 0.3386642336845398, + "learning_rate": 0.0005773524720893142, + "loss": 0.3164, + "step": 22300 + }, + { + "epoch": 35.58, + "grad_norm": 0.8696405291557312, + "learning_rate": 0.0005767145135566189, + "loss": 0.3237, + "step": 22310 + }, + { + "epoch": 35.6, + "grad_norm": 0.32794955372810364, + "learning_rate": 0.0005760765550239234, + "loss": 0.2756, + "step": 22320 + }, + { + "epoch": 35.61, + "grad_norm": 0.3796286880970001, + "learning_rate": 0.0005754385964912281, + "loss": 0.3981, + "step": 22330 + }, + { + "epoch": 35.63, + "grad_norm": 0.31685948371887207, + "learning_rate": 0.0005748006379585327, + "loss": 0.3877, + "step": 22340 + }, + { + "epoch": 35.65, + "grad_norm": 0.3694205582141876, + "learning_rate": 0.0005741626794258373, + "loss": 0.2914, + "step": 22350 + }, + { + "epoch": 35.66, + "grad_norm": 0.6097325086593628, + "learning_rate": 0.000573524720893142, + "loss": 0.2863, + "step": 22360 + }, + { + "epoch": 35.68, + "grad_norm": 0.7454453110694885, + "learning_rate": 0.0005728867623604466, + "loss": 0.3531, + "step": 22370 + }, + { + "epoch": 35.69, + "grad_norm": 0.4996640086174011, + "learning_rate": 0.0005722488038277512, + "loss": 0.3063, + "step": 22380 + }, + { + "epoch": 35.71, + "grad_norm": 0.4868077337741852, + "learning_rate": 0.0005716108452950558, + "loss": 0.3087, + "step": 22390 + }, + { + "epoch": 35.73, + "grad_norm": 0.3814201056957245, + "learning_rate": 0.0005709728867623605, + "loss": 0.3242, + "step": 22400 + }, + { + "epoch": 35.74, + "grad_norm": 0.5458118915557861, + "learning_rate": 0.000570334928229665, + "loss": 0.2917, + "step": 22410 + }, + { + "epoch": 35.76, + "grad_norm": 0.7367342114448547, + "learning_rate": 0.0005696969696969697, + "loss": 0.3678, + "step": 22420 + }, + { + "epoch": 35.77, + "grad_norm": 0.876809298992157, + "learning_rate": 0.0005690590111642744, + "loss": 0.2996, + "step": 22430 + }, + { + "epoch": 35.79, + "grad_norm": 0.392926424741745, + "learning_rate": 0.0005684210526315789, + "loss": 0.2191, + "step": 22440 + }, + { + "epoch": 35.81, + "grad_norm": 0.5339792966842651, + "learning_rate": 0.0005677830940988836, + "loss": 0.3324, + "step": 22450 + }, + { + "epoch": 35.82, + "grad_norm": 0.31976428627967834, + "learning_rate": 0.0005671451355661882, + "loss": 0.3723, + "step": 22460 + }, + { + "epoch": 35.84, + "grad_norm": 1.3592702150344849, + "learning_rate": 0.0005665071770334928, + "loss": 0.3606, + "step": 22470 + }, + { + "epoch": 35.85, + "grad_norm": 0.4233976900577545, + "learning_rate": 0.0005658692185007974, + "loss": 0.329, + "step": 22480 + }, + { + "epoch": 35.87, + "grad_norm": 0.6980434656143188, + "learning_rate": 0.0005652312599681021, + "loss": 0.3132, + "step": 22490 + }, + { + "epoch": 35.89, + "grad_norm": 0.766575813293457, + "learning_rate": 0.0005645933014354066, + "loss": 0.3447, + "step": 22500 + }, + { + "epoch": 35.9, + "grad_norm": 0.6142354011535645, + "learning_rate": 0.0005639553429027113, + "loss": 0.2992, + "step": 22510 + }, + { + "epoch": 35.92, + "grad_norm": 0.41867053508758545, + "learning_rate": 0.000563317384370016, + "loss": 0.3276, + "step": 22520 + }, + { + "epoch": 35.93, + "grad_norm": 0.5943330526351929, + "learning_rate": 0.0005626794258373205, + "loss": 0.4112, + "step": 22530 + }, + { + "epoch": 35.95, + "grad_norm": 1.2840982675552368, + "learning_rate": 0.0005620414673046252, + "loss": 0.406, + "step": 22540 + }, + { + "epoch": 35.96, + "grad_norm": 0.5472711324691772, + "learning_rate": 0.0005614035087719298, + "loss": 0.3947, + "step": 22550 + }, + { + "epoch": 35.98, + "grad_norm": 0.49946820735931396, + "learning_rate": 0.0005607655502392345, + "loss": 0.3561, + "step": 22560 + }, + { + "epoch": 36.0, + "grad_norm": 0.5711825489997864, + "learning_rate": 0.000560127591706539, + "loss": 0.4207, + "step": 22570 + }, + { + "epoch": 36.01, + "grad_norm": 2.452195882797241, + "learning_rate": 0.0005594896331738437, + "loss": 0.3378, + "step": 22580 + }, + { + "epoch": 36.03, + "grad_norm": 0.39312276244163513, + "learning_rate": 0.0005588516746411484, + "loss": 0.3136, + "step": 22590 + }, + { + "epoch": 36.04, + "grad_norm": 0.7896597981452942, + "learning_rate": 0.0005582137161084529, + "loss": 0.3815, + "step": 22600 + }, + { + "epoch": 36.06, + "grad_norm": 0.5603874921798706, + "learning_rate": 0.0005575757575757576, + "loss": 0.3509, + "step": 22610 + }, + { + "epoch": 36.08, + "grad_norm": 0.3025873899459839, + "learning_rate": 0.0005569377990430622, + "loss": 0.2575, + "step": 22620 + }, + { + "epoch": 36.09, + "grad_norm": 0.2621009349822998, + "learning_rate": 0.0005562998405103668, + "loss": 0.2562, + "step": 22630 + }, + { + "epoch": 36.11, + "grad_norm": 0.6688899397850037, + "learning_rate": 0.0005556618819776714, + "loss": 0.3194, + "step": 22640 + }, + { + "epoch": 36.12, + "grad_norm": 2.3156378269195557, + "learning_rate": 0.0005550239234449761, + "loss": 0.3564, + "step": 22650 + }, + { + "epoch": 36.14, + "grad_norm": 0.35387060046195984, + "learning_rate": 0.0005543859649122806, + "loss": 0.3054, + "step": 22660 + }, + { + "epoch": 36.16, + "grad_norm": 0.3707694709300995, + "learning_rate": 0.0005537480063795853, + "loss": 0.3391, + "step": 22670 + }, + { + "epoch": 36.17, + "grad_norm": 0.675459086894989, + "learning_rate": 0.00055311004784689, + "loss": 0.3202, + "step": 22680 + }, + { + "epoch": 36.19, + "grad_norm": 0.46194231510162354, + "learning_rate": 0.0005524720893141945, + "loss": 0.2755, + "step": 22690 + }, + { + "epoch": 36.2, + "grad_norm": 0.4732086956501007, + "learning_rate": 0.0005518341307814992, + "loss": 0.2891, + "step": 22700 + }, + { + "epoch": 36.22, + "grad_norm": 0.5394445061683655, + "learning_rate": 0.0005511961722488038, + "loss": 0.2845, + "step": 22710 + }, + { + "epoch": 36.24, + "grad_norm": 0.7429685592651367, + "learning_rate": 0.0005505582137161084, + "loss": 0.3268, + "step": 22720 + }, + { + "epoch": 36.25, + "grad_norm": 0.4031120240688324, + "learning_rate": 0.000549920255183413, + "loss": 0.3197, + "step": 22730 + }, + { + "epoch": 36.27, + "grad_norm": 1.3633867502212524, + "learning_rate": 0.0005492822966507177, + "loss": 0.2912, + "step": 22740 + }, + { + "epoch": 36.28, + "grad_norm": 0.246135875582695, + "learning_rate": 0.0005486443381180223, + "loss": 0.3485, + "step": 22750 + }, + { + "epoch": 36.3, + "grad_norm": 0.7717587351799011, + "learning_rate": 0.0005480063795853269, + "loss": 0.2938, + "step": 22760 + }, + { + "epoch": 36.32, + "grad_norm": 0.5031578540802002, + "learning_rate": 0.0005473684210526317, + "loss": 0.3693, + "step": 22770 + }, + { + "epoch": 36.33, + "grad_norm": 0.46057426929473877, + "learning_rate": 0.0005467304625199362, + "loss": 0.2782, + "step": 22780 + }, + { + "epoch": 36.35, + "grad_norm": 0.33407339453697205, + "learning_rate": 0.0005460925039872409, + "loss": 0.3133, + "step": 22790 + }, + { + "epoch": 36.36, + "grad_norm": 0.7417854070663452, + "learning_rate": 0.0005454545454545455, + "loss": 0.3441, + "step": 22800 + }, + { + "epoch": 36.38, + "grad_norm": 0.3010425567626953, + "learning_rate": 0.0005448165869218501, + "loss": 0.314, + "step": 22810 + }, + { + "epoch": 36.4, + "grad_norm": 0.5968150496482849, + "learning_rate": 0.0005441786283891547, + "loss": 0.3526, + "step": 22820 + }, + { + "epoch": 36.41, + "grad_norm": 0.8175147771835327, + "learning_rate": 0.0005435406698564594, + "loss": 0.3451, + "step": 22830 + }, + { + "epoch": 36.43, + "grad_norm": 1.3906422853469849, + "learning_rate": 0.000542902711323764, + "loss": 0.2813, + "step": 22840 + }, + { + "epoch": 36.44, + "grad_norm": 0.47024595737457275, + "learning_rate": 0.0005422647527910686, + "loss": 0.3559, + "step": 22850 + }, + { + "epoch": 36.46, + "grad_norm": 0.3460497558116913, + "learning_rate": 0.0005416267942583733, + "loss": 0.2783, + "step": 22860 + }, + { + "epoch": 36.48, + "grad_norm": 0.5971447825431824, + "learning_rate": 0.0005409888357256778, + "loss": 0.3274, + "step": 22870 + }, + { + "epoch": 36.49, + "grad_norm": 0.9573736190795898, + "learning_rate": 0.0005403508771929825, + "loss": 0.2442, + "step": 22880 + }, + { + "epoch": 36.51, + "grad_norm": 0.4627261757850647, + "learning_rate": 0.0005397129186602871, + "loss": 0.2677, + "step": 22890 + }, + { + "epoch": 36.52, + "grad_norm": 0.45995354652404785, + "learning_rate": 0.0005390749601275918, + "loss": 0.3892, + "step": 22900 + }, + { + "epoch": 36.54, + "grad_norm": 0.2959776818752289, + "learning_rate": 0.0005384370015948964, + "loss": 0.3216, + "step": 22910 + }, + { + "epoch": 36.56, + "grad_norm": 0.4786494970321655, + "learning_rate": 0.000537799043062201, + "loss": 0.3105, + "step": 22920 + }, + { + "epoch": 36.57, + "grad_norm": 0.462162584066391, + "learning_rate": 0.0005371610845295057, + "loss": 0.4404, + "step": 22930 + }, + { + "epoch": 36.59, + "grad_norm": 0.37563401460647583, + "learning_rate": 0.0005365231259968102, + "loss": 0.2949, + "step": 22940 + }, + { + "epoch": 36.6, + "grad_norm": 0.4217167794704437, + "learning_rate": 0.0005358851674641149, + "loss": 0.425, + "step": 22950 + }, + { + "epoch": 36.62, + "grad_norm": 0.5127308964729309, + "learning_rate": 0.0005352472089314195, + "loss": 0.3215, + "step": 22960 + }, + { + "epoch": 36.63, + "grad_norm": 1.0700709819793701, + "learning_rate": 0.0005346092503987241, + "loss": 0.3379, + "step": 22970 + }, + { + "epoch": 36.65, + "grad_norm": 0.6836196184158325, + "learning_rate": 0.0005339712918660288, + "loss": 0.3811, + "step": 22980 + }, + { + "epoch": 36.67, + "grad_norm": 0.2946398854255676, + "learning_rate": 0.0005333333333333334, + "loss": 0.2362, + "step": 22990 + }, + { + "epoch": 36.68, + "grad_norm": 0.38813692331314087, + "learning_rate": 0.000532695374800638, + "loss": 0.3188, + "step": 23000 + }, + { + "epoch": 36.7, + "grad_norm": 0.483698308467865, + "learning_rate": 0.0005320574162679426, + "loss": 0.4241, + "step": 23010 + }, + { + "epoch": 36.71, + "grad_norm": 0.5879315733909607, + "learning_rate": 0.0005314194577352473, + "loss": 0.3343, + "step": 23020 + }, + { + "epoch": 36.73, + "grad_norm": 0.3913237154483795, + "learning_rate": 0.0005307814992025518, + "loss": 0.3438, + "step": 23030 + }, + { + "epoch": 36.75, + "grad_norm": 0.9392869472503662, + "learning_rate": 0.0005301435406698565, + "loss": 0.3663, + "step": 23040 + }, + { + "epoch": 36.76, + "grad_norm": 0.4291793704032898, + "learning_rate": 0.0005295055821371611, + "loss": 0.3078, + "step": 23050 + }, + { + "epoch": 36.78, + "grad_norm": 0.6778882741928101, + "learning_rate": 0.0005288676236044657, + "loss": 0.3618, + "step": 23060 + }, + { + "epoch": 36.79, + "grad_norm": 0.9089276194572449, + "learning_rate": 0.0005282296650717704, + "loss": 0.3424, + "step": 23070 + }, + { + "epoch": 36.81, + "grad_norm": 0.6602213978767395, + "learning_rate": 0.000527591706539075, + "loss": 0.3148, + "step": 23080 + }, + { + "epoch": 36.83, + "grad_norm": 0.4564104378223419, + "learning_rate": 0.0005269537480063796, + "loss": 0.4301, + "step": 23090 + }, + { + "epoch": 36.84, + "grad_norm": 0.23501376807689667, + "learning_rate": 0.0005263157894736842, + "loss": 0.3264, + "step": 23100 + }, + { + "epoch": 36.86, + "grad_norm": 1.654263973236084, + "learning_rate": 0.0005256778309409889, + "loss": 0.3627, + "step": 23110 + }, + { + "epoch": 36.87, + "grad_norm": 0.8504493236541748, + "learning_rate": 0.0005250398724082934, + "loss": 0.3381, + "step": 23120 + }, + { + "epoch": 36.89, + "grad_norm": 0.7040032744407654, + "learning_rate": 0.0005244019138755981, + "loss": 0.4432, + "step": 23130 + }, + { + "epoch": 36.91, + "grad_norm": 0.5224348902702332, + "learning_rate": 0.0005237639553429028, + "loss": 0.3652, + "step": 23140 + }, + { + "epoch": 36.92, + "grad_norm": 0.5879861116409302, + "learning_rate": 0.0005231259968102073, + "loss": 0.3216, + "step": 23150 + }, + { + "epoch": 36.94, + "grad_norm": 0.3892087936401367, + "learning_rate": 0.000522488038277512, + "loss": 0.3308, + "step": 23160 + }, + { + "epoch": 36.95, + "grad_norm": 0.2915053069591522, + "learning_rate": 0.0005218500797448166, + "loss": 0.3044, + "step": 23170 + }, + { + "epoch": 36.97, + "grad_norm": 0.515186607837677, + "learning_rate": 0.0005212121212121212, + "loss": 0.4347, + "step": 23180 + }, + { + "epoch": 36.99, + "grad_norm": 0.4125446677207947, + "learning_rate": 0.0005205741626794258, + "loss": 0.3828, + "step": 23190 + }, + { + "epoch": 37.0, + "grad_norm": 0.4284899830818176, + "learning_rate": 0.0005199362041467305, + "loss": 0.4189, + "step": 23200 + }, + { + "epoch": 37.02, + "grad_norm": 1.1735564470291138, + "learning_rate": 0.000519298245614035, + "loss": 0.3337, + "step": 23210 + }, + { + "epoch": 37.03, + "grad_norm": 1.21298348903656, + "learning_rate": 0.0005186602870813397, + "loss": 0.3879, + "step": 23220 + }, + { + "epoch": 37.05, + "grad_norm": 3.3211417198181152, + "learning_rate": 0.0005180223285486444, + "loss": 0.3724, + "step": 23230 + }, + { + "epoch": 37.07, + "grad_norm": 0.5634852647781372, + "learning_rate": 0.0005173843700159489, + "loss": 0.2916, + "step": 23240 + }, + { + "epoch": 37.08, + "grad_norm": 0.40934479236602783, + "learning_rate": 0.0005167464114832536, + "loss": 0.3133, + "step": 23250 + }, + { + "epoch": 37.1, + "grad_norm": 0.6190032958984375, + "learning_rate": 0.0005161084529505582, + "loss": 0.2639, + "step": 23260 + }, + { + "epoch": 37.11, + "grad_norm": 0.38555908203125, + "learning_rate": 0.0005154704944178628, + "loss": 0.3493, + "step": 23270 + }, + { + "epoch": 37.13, + "grad_norm": 0.2890884280204773, + "learning_rate": 0.0005148325358851674, + "loss": 0.3042, + "step": 23280 + }, + { + "epoch": 37.15, + "grad_norm": 0.3978734016418457, + "learning_rate": 0.0005141945773524721, + "loss": 0.276, + "step": 23290 + }, + { + "epoch": 37.16, + "grad_norm": 0.6064948439598083, + "learning_rate": 0.0005135566188197768, + "loss": 0.3809, + "step": 23300 + }, + { + "epoch": 37.18, + "grad_norm": 0.6788705587387085, + "learning_rate": 0.0005129186602870813, + "loss": 0.2914, + "step": 23310 + }, + { + "epoch": 37.19, + "grad_norm": 0.4636113941669464, + "learning_rate": 0.000512280701754386, + "loss": 0.3822, + "step": 23320 + }, + { + "epoch": 37.21, + "grad_norm": 0.6636508107185364, + "learning_rate": 0.0005116427432216905, + "loss": 0.3767, + "step": 23330 + }, + { + "epoch": 37.22, + "grad_norm": 0.435531347990036, + "learning_rate": 0.0005110047846889952, + "loss": 0.4045, + "step": 23340 + }, + { + "epoch": 37.24, + "grad_norm": 0.5816912651062012, + "learning_rate": 0.0005103668261562998, + "loss": 0.3083, + "step": 23350 + }, + { + "epoch": 37.26, + "grad_norm": 0.8348118662834167, + "learning_rate": 0.0005097288676236045, + "loss": 0.2738, + "step": 23360 + }, + { + "epoch": 37.27, + "grad_norm": 0.5250842571258545, + "learning_rate": 0.000509090909090909, + "loss": 0.3266, + "step": 23370 + }, + { + "epoch": 37.29, + "grad_norm": 0.3116588592529297, + "learning_rate": 0.0005084529505582137, + "loss": 0.2874, + "step": 23380 + }, + { + "epoch": 37.3, + "grad_norm": 0.5619212985038757, + "learning_rate": 0.0005078149920255184, + "loss": 0.2712, + "step": 23390 + }, + { + "epoch": 37.32, + "grad_norm": 0.34848636388778687, + "learning_rate": 0.0005071770334928229, + "loss": 0.4232, + "step": 23400 + }, + { + "epoch": 37.34, + "grad_norm": 0.38688668608665466, + "learning_rate": 0.0005065390749601276, + "loss": 0.2947, + "step": 23410 + }, + { + "epoch": 37.35, + "grad_norm": 1.7782784700393677, + "learning_rate": 0.0005059011164274322, + "loss": 0.3313, + "step": 23420 + }, + { + "epoch": 37.37, + "grad_norm": 0.2640959620475769, + "learning_rate": 0.0005052631578947368, + "loss": 0.2622, + "step": 23430 + }, + { + "epoch": 37.38, + "grad_norm": 0.2727811932563782, + "learning_rate": 0.0005046251993620414, + "loss": 0.3136, + "step": 23440 + }, + { + "epoch": 37.4, + "grad_norm": 0.5404552817344666, + "learning_rate": 0.0005039872408293461, + "loss": 0.3078, + "step": 23450 + }, + { + "epoch": 37.42, + "grad_norm": 0.38602226972579956, + "learning_rate": 0.0005033492822966507, + "loss": 0.2807, + "step": 23460 + }, + { + "epoch": 37.43, + "grad_norm": 0.30310848355293274, + "learning_rate": 0.0005027113237639553, + "loss": 0.3203, + "step": 23470 + }, + { + "epoch": 37.45, + "grad_norm": 0.41210854053497314, + "learning_rate": 0.00050207336523126, + "loss": 0.3025, + "step": 23480 + }, + { + "epoch": 37.46, + "grad_norm": 0.7113584876060486, + "learning_rate": 0.0005014354066985645, + "loss": 0.3876, + "step": 23490 + }, + { + "epoch": 37.48, + "grad_norm": 0.8924645185470581, + "learning_rate": 0.0005007974481658692, + "loss": 0.2642, + "step": 23500 + }, + { + "epoch": 37.5, + "grad_norm": 0.5637812614440918, + "learning_rate": 0.0005001594896331738, + "loss": 0.3732, + "step": 23510 + }, + { + "epoch": 37.51, + "grad_norm": 0.34932073950767517, + "learning_rate": 0.0004995215311004785, + "loss": 0.2753, + "step": 23520 + }, + { + "epoch": 37.53, + "grad_norm": 0.39498457312583923, + "learning_rate": 0.0004988835725677831, + "loss": 0.2949, + "step": 23530 + }, + { + "epoch": 37.54, + "grad_norm": 0.4476890563964844, + "learning_rate": 0.0004982456140350878, + "loss": 0.318, + "step": 23540 + }, + { + "epoch": 37.56, + "grad_norm": 0.3034002482891083, + "learning_rate": 0.0004976076555023923, + "loss": 0.2778, + "step": 23550 + }, + { + "epoch": 37.58, + "grad_norm": 0.7696762084960938, + "learning_rate": 0.000496969696969697, + "loss": 0.3525, + "step": 23560 + }, + { + "epoch": 37.59, + "grad_norm": 0.6639572978019714, + "learning_rate": 0.0004963317384370016, + "loss": 0.2929, + "step": 23570 + }, + { + "epoch": 37.61, + "grad_norm": 0.8098918199539185, + "learning_rate": 0.0004956937799043062, + "loss": 0.3162, + "step": 23580 + }, + { + "epoch": 37.62, + "grad_norm": 0.7061499357223511, + "learning_rate": 0.0004950558213716109, + "loss": 0.4247, + "step": 23590 + }, + { + "epoch": 37.64, + "grad_norm": 0.7736586928367615, + "learning_rate": 0.0004944178628389155, + "loss": 0.3253, + "step": 23600 + }, + { + "epoch": 37.66, + "grad_norm": 0.22601386904716492, + "learning_rate": 0.0004937799043062201, + "loss": 0.322, + "step": 23610 + }, + { + "epoch": 37.67, + "grad_norm": 0.34596702456474304, + "learning_rate": 0.0004931419457735247, + "loss": 0.381, + "step": 23620 + }, + { + "epoch": 37.69, + "grad_norm": 0.399099737405777, + "learning_rate": 0.0004925039872408294, + "loss": 0.367, + "step": 23630 + }, + { + "epoch": 37.7, + "grad_norm": 0.4223106801509857, + "learning_rate": 0.0004918660287081339, + "loss": 0.3502, + "step": 23640 + }, + { + "epoch": 37.72, + "grad_norm": 0.36701181530952454, + "learning_rate": 0.0004912280701754386, + "loss": 0.3696, + "step": 23650 + }, + { + "epoch": 37.74, + "grad_norm": 1.0397878885269165, + "learning_rate": 0.0004905901116427433, + "loss": 0.2922, + "step": 23660 + }, + { + "epoch": 37.75, + "grad_norm": 0.6061972975730896, + "learning_rate": 0.0004899521531100478, + "loss": 0.3406, + "step": 23670 + }, + { + "epoch": 37.77, + "grad_norm": 0.46018704771995544, + "learning_rate": 0.0004893141945773525, + "loss": 0.3847, + "step": 23680 + }, + { + "epoch": 37.78, + "grad_norm": 0.4098079204559326, + "learning_rate": 0.0004886762360446571, + "loss": 0.2944, + "step": 23690 + }, + { + "epoch": 37.8, + "grad_norm": 0.33187025785446167, + "learning_rate": 0.0004880382775119617, + "loss": 0.3696, + "step": 23700 + }, + { + "epoch": 37.81, + "grad_norm": 1.5086455345153809, + "learning_rate": 0.00048740031897926637, + "loss": 0.3415, + "step": 23710 + }, + { + "epoch": 37.83, + "grad_norm": 0.1812012493610382, + "learning_rate": 0.000486762360446571, + "loss": 0.3317, + "step": 23720 + }, + { + "epoch": 37.85, + "grad_norm": 0.4595651924610138, + "learning_rate": 0.00048612440191387566, + "loss": 0.3449, + "step": 23730 + }, + { + "epoch": 37.86, + "grad_norm": 0.7050609588623047, + "learning_rate": 0.0004854864433811803, + "loss": 0.3273, + "step": 23740 + }, + { + "epoch": 37.88, + "grad_norm": 0.4877799451351166, + "learning_rate": 0.0004848484848484849, + "loss": 0.2679, + "step": 23750 + }, + { + "epoch": 37.89, + "grad_norm": 0.4837338328361511, + "learning_rate": 0.0004842105263157895, + "loss": 0.3203, + "step": 23760 + }, + { + "epoch": 37.91, + "grad_norm": 0.5711174607276917, + "learning_rate": 0.0004835725677830941, + "loss": 0.3088, + "step": 23770 + }, + { + "epoch": 37.93, + "grad_norm": 0.7363555431365967, + "learning_rate": 0.00048293460925039874, + "loss": 0.3494, + "step": 23780 + }, + { + "epoch": 37.94, + "grad_norm": 0.4688860774040222, + "learning_rate": 0.00048229665071770336, + "loss": 0.406, + "step": 23790 + }, + { + "epoch": 37.96, + "grad_norm": 0.22278854250907898, + "learning_rate": 0.000481658692185008, + "loss": 0.2775, + "step": 23800 + }, + { + "epoch": 37.97, + "grad_norm": 0.5794351100921631, + "learning_rate": 0.00048102073365231265, + "loss": 0.3613, + "step": 23810 + }, + { + "epoch": 37.99, + "grad_norm": 0.7034667730331421, + "learning_rate": 0.00048038277511961726, + "loss": 0.3417, + "step": 23820 + }, + { + "epoch": 38.01, + "grad_norm": 0.5369040966033936, + "learning_rate": 0.0004797448165869219, + "loss": 0.2919, + "step": 23830 + }, + { + "epoch": 38.02, + "grad_norm": 0.4583072066307068, + "learning_rate": 0.0004791068580542265, + "loss": 0.2711, + "step": 23840 + }, + { + "epoch": 38.04, + "grad_norm": 0.32047978043556213, + "learning_rate": 0.0004784688995215311, + "loss": 0.3133, + "step": 23850 + }, + { + "epoch": 38.05, + "grad_norm": 0.4489063024520874, + "learning_rate": 0.00047783094098883573, + "loss": 0.3027, + "step": 23860 + }, + { + "epoch": 38.07, + "grad_norm": 0.29304754734039307, + "learning_rate": 0.00047719298245614035, + "loss": 0.2937, + "step": 23870 + }, + { + "epoch": 38.09, + "grad_norm": 0.5141634345054626, + "learning_rate": 0.00047655502392344496, + "loss": 0.2787, + "step": 23880 + }, + { + "epoch": 38.1, + "grad_norm": 0.6913502216339111, + "learning_rate": 0.00047591706539074964, + "loss": 0.3666, + "step": 23890 + }, + { + "epoch": 38.12, + "grad_norm": 0.49919384717941284, + "learning_rate": 0.00047527910685805425, + "loss": 0.3182, + "step": 23900 + }, + { + "epoch": 38.13, + "grad_norm": 0.27605143189430237, + "learning_rate": 0.00047464114832535887, + "loss": 0.2354, + "step": 23910 + }, + { + "epoch": 38.15, + "grad_norm": 1.246079921722412, + "learning_rate": 0.0004740031897926635, + "loss": 0.3429, + "step": 23920 + }, + { + "epoch": 38.17, + "grad_norm": 0.18399390578269958, + "learning_rate": 0.0004733652312599681, + "loss": 0.2916, + "step": 23930 + }, + { + "epoch": 38.18, + "grad_norm": 0.3015744388103485, + "learning_rate": 0.0004727272727272727, + "loss": 0.3216, + "step": 23940 + }, + { + "epoch": 38.2, + "grad_norm": 0.5281094312667847, + "learning_rate": 0.00047208931419457734, + "loss": 0.4193, + "step": 23950 + }, + { + "epoch": 38.21, + "grad_norm": 0.6574485301971436, + "learning_rate": 0.00047145135566188195, + "loss": 0.318, + "step": 23960 + }, + { + "epoch": 38.23, + "grad_norm": 0.5636985898017883, + "learning_rate": 0.0004708133971291866, + "loss": 0.3493, + "step": 23970 + }, + { + "epoch": 38.25, + "grad_norm": 0.3899206817150116, + "learning_rate": 0.00047017543859649124, + "loss": 0.313, + "step": 23980 + }, + { + "epoch": 38.26, + "grad_norm": 0.465703547000885, + "learning_rate": 0.00046953748006379586, + "loss": 0.3227, + "step": 23990 + }, + { + "epoch": 38.28, + "grad_norm": 0.9873224496841431, + "learning_rate": 0.0004688995215311005, + "loss": 0.3093, + "step": 24000 + }, + { + "epoch": 38.29, + "grad_norm": 0.545748233795166, + "learning_rate": 0.0004682615629984051, + "loss": 0.3427, + "step": 24010 + }, + { + "epoch": 38.31, + "grad_norm": 1.6173831224441528, + "learning_rate": 0.0004676236044657097, + "loss": 0.3487, + "step": 24020 + }, + { + "epoch": 38.33, + "grad_norm": 0.43845269083976746, + "learning_rate": 0.0004669856459330143, + "loss": 0.3368, + "step": 24030 + }, + { + "epoch": 38.34, + "grad_norm": 0.6073929071426392, + "learning_rate": 0.00046634768740031894, + "loss": 0.4022, + "step": 24040 + }, + { + "epoch": 38.36, + "grad_norm": 0.3742305636405945, + "learning_rate": 0.0004657097288676236, + "loss": 0.2748, + "step": 24050 + }, + { + "epoch": 38.37, + "grad_norm": 0.2694351375102997, + "learning_rate": 0.00046507177033492823, + "loss": 0.3874, + "step": 24060 + }, + { + "epoch": 38.39, + "grad_norm": 0.48228031396865845, + "learning_rate": 0.00046443381180223285, + "loss": 0.2613, + "step": 24070 + }, + { + "epoch": 38.41, + "grad_norm": 0.7061280608177185, + "learning_rate": 0.0004637958532695375, + "loss": 0.3152, + "step": 24080 + }, + { + "epoch": 38.42, + "grad_norm": 0.5890529751777649, + "learning_rate": 0.00046315789473684214, + "loss": 0.2593, + "step": 24090 + }, + { + "epoch": 38.44, + "grad_norm": 0.5934563875198364, + "learning_rate": 0.00046251993620414675, + "loss": 0.3712, + "step": 24100 + }, + { + "epoch": 38.45, + "grad_norm": 0.30438482761383057, + "learning_rate": 0.00046188197767145137, + "loss": 0.3082, + "step": 24110 + }, + { + "epoch": 38.47, + "grad_norm": 0.1404085010290146, + "learning_rate": 0.000461244019138756, + "loss": 0.2458, + "step": 24120 + }, + { + "epoch": 38.48, + "grad_norm": 0.45408958196640015, + "learning_rate": 0.00046060606060606066, + "loss": 0.302, + "step": 24130 + }, + { + "epoch": 38.5, + "grad_norm": 0.4974878430366516, + "learning_rate": 0.0004599681020733653, + "loss": 0.3416, + "step": 24140 + }, + { + "epoch": 38.52, + "grad_norm": 0.2546900510787964, + "learning_rate": 0.0004593301435406699, + "loss": 0.2762, + "step": 24150 + }, + { + "epoch": 38.53, + "grad_norm": 0.5472551584243774, + "learning_rate": 0.0004586921850079745, + "loss": 0.3176, + "step": 24160 + }, + { + "epoch": 38.55, + "grad_norm": 0.7795162200927734, + "learning_rate": 0.0004580542264752791, + "loss": 0.3699, + "step": 24170 + }, + { + "epoch": 38.56, + "grad_norm": 0.4223695397377014, + "learning_rate": 0.00045741626794258374, + "loss": 0.3011, + "step": 24180 + }, + { + "epoch": 38.58, + "grad_norm": 2.152009963989258, + "learning_rate": 0.00045677830940988836, + "loss": 0.36, + "step": 24190 + }, + { + "epoch": 38.6, + "grad_norm": 0.477445513010025, + "learning_rate": 0.000456140350877193, + "loss": 0.279, + "step": 24200 + }, + { + "epoch": 38.61, + "grad_norm": 0.546576738357544, + "learning_rate": 0.00045550239234449765, + "loss": 0.3519, + "step": 24210 + }, + { + "epoch": 38.63, + "grad_norm": 0.3089749217033386, + "learning_rate": 0.00045486443381180226, + "loss": 0.3306, + "step": 24220 + }, + { + "epoch": 38.64, + "grad_norm": 0.5986670851707458, + "learning_rate": 0.0004542264752791069, + "loss": 0.2692, + "step": 24230 + }, + { + "epoch": 38.66, + "grad_norm": 1.1655359268188477, + "learning_rate": 0.0004535885167464115, + "loss": 0.3717, + "step": 24240 + }, + { + "epoch": 38.68, + "grad_norm": 0.349162757396698, + "learning_rate": 0.0004529505582137161, + "loss": 0.4474, + "step": 24250 + }, + { + "epoch": 38.69, + "grad_norm": 0.3474232256412506, + "learning_rate": 0.00045231259968102073, + "loss": 0.3711, + "step": 24260 + }, + { + "epoch": 38.71, + "grad_norm": 0.38125041127204895, + "learning_rate": 0.00045167464114832535, + "loss": 0.3417, + "step": 24270 + }, + { + "epoch": 38.72, + "grad_norm": 0.49059632420539856, + "learning_rate": 0.00045103668261562996, + "loss": 0.3728, + "step": 24280 + }, + { + "epoch": 38.74, + "grad_norm": 0.34616127610206604, + "learning_rate": 0.00045039872408293464, + "loss": 0.3007, + "step": 24290 + }, + { + "epoch": 38.76, + "grad_norm": 0.6310774087905884, + "learning_rate": 0.00044976076555023925, + "loss": 0.2896, + "step": 24300 + }, + { + "epoch": 38.77, + "grad_norm": 1.5255939960479736, + "learning_rate": 0.00044912280701754387, + "loss": 0.3429, + "step": 24310 + }, + { + "epoch": 38.79, + "grad_norm": 0.38608258962631226, + "learning_rate": 0.0004484848484848485, + "loss": 0.3438, + "step": 24320 + }, + { + "epoch": 38.8, + "grad_norm": 1.0546627044677734, + "learning_rate": 0.0004478468899521531, + "loss": 0.3801, + "step": 24330 + }, + { + "epoch": 38.82, + "grad_norm": 0.3056943118572235, + "learning_rate": 0.0004472089314194577, + "loss": 0.2759, + "step": 24340 + }, + { + "epoch": 38.84, + "grad_norm": 0.7335503101348877, + "learning_rate": 0.00044657097288676234, + "loss": 0.3331, + "step": 24350 + }, + { + "epoch": 38.85, + "grad_norm": 0.36230140924453735, + "learning_rate": 0.00044593301435406695, + "loss": 0.2871, + "step": 24360 + }, + { + "epoch": 38.87, + "grad_norm": 0.3868005573749542, + "learning_rate": 0.0004452950558213716, + "loss": 0.3049, + "step": 24370 + }, + { + "epoch": 38.88, + "grad_norm": 0.4695385992527008, + "learning_rate": 0.00044465709728867624, + "loss": 0.3754, + "step": 24380 + }, + { + "epoch": 38.9, + "grad_norm": 0.2892504036426544, + "learning_rate": 0.00044401913875598086, + "loss": 0.3438, + "step": 24390 + }, + { + "epoch": 38.92, + "grad_norm": 0.7235500812530518, + "learning_rate": 0.0004433811802232855, + "loss": 0.3345, + "step": 24400 + }, + { + "epoch": 38.93, + "grad_norm": 0.48276352882385254, + "learning_rate": 0.0004427432216905901, + "loss": 0.3486, + "step": 24410 + }, + { + "epoch": 38.95, + "grad_norm": 0.384084016084671, + "learning_rate": 0.0004421052631578947, + "loss": 0.3657, + "step": 24420 + }, + { + "epoch": 38.96, + "grad_norm": 0.31639254093170166, + "learning_rate": 0.0004414673046251993, + "loss": 0.3392, + "step": 24430 + }, + { + "epoch": 38.98, + "grad_norm": 0.3250158131122589, + "learning_rate": 0.00044082934609250394, + "loss": 0.3391, + "step": 24440 + }, + { + "epoch": 39.0, + "grad_norm": 0.37524476647377014, + "learning_rate": 0.00044019138755980867, + "loss": 0.3313, + "step": 24450 + }, + { + "epoch": 39.01, + "grad_norm": 0.5987895131111145, + "learning_rate": 0.0004395534290271133, + "loss": 0.2781, + "step": 24460 + }, + { + "epoch": 39.03, + "grad_norm": 0.18153107166290283, + "learning_rate": 0.0004389154704944179, + "loss": 0.2496, + "step": 24470 + }, + { + "epoch": 39.04, + "grad_norm": 0.34211575984954834, + "learning_rate": 0.0004382775119617225, + "loss": 0.272, + "step": 24480 + }, + { + "epoch": 39.06, + "grad_norm": 0.391075074672699, + "learning_rate": 0.00043763955342902714, + "loss": 0.3307, + "step": 24490 + }, + { + "epoch": 39.07, + "grad_norm": 0.2632424831390381, + "learning_rate": 0.00043700159489633175, + "loss": 0.2906, + "step": 24500 + }, + { + "epoch": 39.09, + "grad_norm": 0.5995433926582336, + "learning_rate": 0.00043636363636363637, + "loss": 0.293, + "step": 24510 + }, + { + "epoch": 39.11, + "grad_norm": 0.6448796987533569, + "learning_rate": 0.00043572567783094104, + "loss": 0.251, + "step": 24520 + }, + { + "epoch": 39.12, + "grad_norm": 0.5249642729759216, + "learning_rate": 0.00043508771929824566, + "loss": 0.3272, + "step": 24530 + }, + { + "epoch": 39.14, + "grad_norm": 0.8031821250915527, + "learning_rate": 0.0004344497607655503, + "loss": 0.3417, + "step": 24540 + }, + { + "epoch": 39.15, + "grad_norm": 0.9898377656936646, + "learning_rate": 0.0004338118022328549, + "loss": 0.3799, + "step": 24550 + }, + { + "epoch": 39.17, + "grad_norm": 0.301408976316452, + "learning_rate": 0.0004331738437001595, + "loss": 0.2618, + "step": 24560 + }, + { + "epoch": 39.19, + "grad_norm": 0.3909609317779541, + "learning_rate": 0.0004325358851674641, + "loss": 0.3, + "step": 24570 + }, + { + "epoch": 39.2, + "grad_norm": 0.3314201533794403, + "learning_rate": 0.00043189792663476874, + "loss": 0.3461, + "step": 24580 + }, + { + "epoch": 39.22, + "grad_norm": 0.8803900480270386, + "learning_rate": 0.00043125996810207336, + "loss": 0.3438, + "step": 24590 + }, + { + "epoch": 39.23, + "grad_norm": 0.3051396906375885, + "learning_rate": 0.00043062200956937803, + "loss": 0.3489, + "step": 24600 + }, + { + "epoch": 39.25, + "grad_norm": 0.5020725131034851, + "learning_rate": 0.00042998405103668265, + "loss": 0.316, + "step": 24610 + }, + { + "epoch": 39.27, + "grad_norm": 0.7016777396202087, + "learning_rate": 0.00042934609250398726, + "loss": 0.3582, + "step": 24620 + }, + { + "epoch": 39.28, + "grad_norm": 0.21689297258853912, + "learning_rate": 0.0004287081339712919, + "loss": 0.2597, + "step": 24630 + }, + { + "epoch": 39.3, + "grad_norm": 0.6638566851615906, + "learning_rate": 0.0004280701754385965, + "loss": 0.3905, + "step": 24640 + }, + { + "epoch": 39.31, + "grad_norm": 0.24087496101856232, + "learning_rate": 0.0004274322169059011, + "loss": 0.3349, + "step": 24650 + }, + { + "epoch": 39.33, + "grad_norm": 0.14746366441249847, + "learning_rate": 0.00042679425837320573, + "loss": 0.2942, + "step": 24660 + }, + { + "epoch": 39.35, + "grad_norm": 0.3620028495788574, + "learning_rate": 0.00042615629984051035, + "loss": 0.2394, + "step": 24670 + }, + { + "epoch": 39.36, + "grad_norm": 0.5359326004981995, + "learning_rate": 0.000425518341307815, + "loss": 0.3661, + "step": 24680 + }, + { + "epoch": 39.38, + "grad_norm": 0.26914021372795105, + "learning_rate": 0.00042488038277511964, + "loss": 0.2544, + "step": 24690 + }, + { + "epoch": 39.39, + "grad_norm": 0.22984707355499268, + "learning_rate": 0.00042424242424242425, + "loss": 0.2545, + "step": 24700 + }, + { + "epoch": 39.41, + "grad_norm": 0.2788347601890564, + "learning_rate": 0.00042360446570972887, + "loss": 0.3354, + "step": 24710 + }, + { + "epoch": 39.43, + "grad_norm": 0.24124827980995178, + "learning_rate": 0.0004229665071770335, + "loss": 0.2551, + "step": 24720 + }, + { + "epoch": 39.44, + "grad_norm": 0.547863781452179, + "learning_rate": 0.0004223285486443381, + "loss": 0.2597, + "step": 24730 + }, + { + "epoch": 39.46, + "grad_norm": 0.25198522210121155, + "learning_rate": 0.0004216905901116427, + "loss": 0.3179, + "step": 24740 + }, + { + "epoch": 39.47, + "grad_norm": 0.3968208134174347, + "learning_rate": 0.00042105263157894734, + "loss": 0.2876, + "step": 24750 + }, + { + "epoch": 39.49, + "grad_norm": 0.33785438537597656, + "learning_rate": 0.000420414673046252, + "loss": 0.3097, + "step": 24760 + }, + { + "epoch": 39.51, + "grad_norm": 0.5009357333183289, + "learning_rate": 0.0004197767145135566, + "loss": 0.3202, + "step": 24770 + }, + { + "epoch": 39.52, + "grad_norm": 0.4793984889984131, + "learning_rate": 0.00041913875598086124, + "loss": 0.3518, + "step": 24780 + }, + { + "epoch": 39.54, + "grad_norm": 0.19300325214862823, + "learning_rate": 0.00041850079744816586, + "loss": 0.3672, + "step": 24790 + }, + { + "epoch": 39.55, + "grad_norm": 0.5630788803100586, + "learning_rate": 0.0004178628389154705, + "loss": 0.3217, + "step": 24800 + }, + { + "epoch": 39.57, + "grad_norm": 0.28488433361053467, + "learning_rate": 0.0004172248803827751, + "loss": 0.3063, + "step": 24810 + }, + { + "epoch": 39.59, + "grad_norm": 0.25450441241264343, + "learning_rate": 0.0004165869218500797, + "loss": 0.3638, + "step": 24820 + }, + { + "epoch": 39.6, + "grad_norm": 0.4360348880290985, + "learning_rate": 0.0004159489633173844, + "loss": 0.3122, + "step": 24830 + }, + { + "epoch": 39.62, + "grad_norm": 0.5293656587600708, + "learning_rate": 0.00041531100478468905, + "loss": 0.3944, + "step": 24840 + }, + { + "epoch": 39.63, + "grad_norm": 0.46485990285873413, + "learning_rate": 0.00041467304625199367, + "loss": 0.2323, + "step": 24850 + }, + { + "epoch": 39.65, + "grad_norm": 0.501832127571106, + "learning_rate": 0.0004140350877192983, + "loss": 0.3502, + "step": 24860 + }, + { + "epoch": 39.67, + "grad_norm": 0.4300176799297333, + "learning_rate": 0.0004133971291866029, + "loss": 0.2736, + "step": 24870 + }, + { + "epoch": 39.68, + "grad_norm": 0.253682941198349, + "learning_rate": 0.0004127591706539075, + "loss": 0.3306, + "step": 24880 + }, + { + "epoch": 39.7, + "grad_norm": 0.18599876761436462, + "learning_rate": 0.00041212121212121214, + "loss": 0.3534, + "step": 24890 + }, + { + "epoch": 39.71, + "grad_norm": 0.21810634434223175, + "learning_rate": 0.00041148325358851675, + "loss": 0.2772, + "step": 24900 + }, + { + "epoch": 39.73, + "grad_norm": 0.3228086233139038, + "learning_rate": 0.00041084529505582137, + "loss": 0.2433, + "step": 24910 + }, + { + "epoch": 39.74, + "grad_norm": 0.30225640535354614, + "learning_rate": 0.00041020733652312604, + "loss": 0.2261, + "step": 24920 + }, + { + "epoch": 39.76, + "grad_norm": 0.19185695052146912, + "learning_rate": 0.00040956937799043066, + "loss": 0.3096, + "step": 24930 + }, + { + "epoch": 39.78, + "grad_norm": 0.40327930450439453, + "learning_rate": 0.0004089314194577353, + "loss": 0.3472, + "step": 24940 + }, + { + "epoch": 39.79, + "grad_norm": 0.4578391909599304, + "learning_rate": 0.0004082934609250399, + "loss": 0.3177, + "step": 24950 + }, + { + "epoch": 39.81, + "grad_norm": 0.24900272488594055, + "learning_rate": 0.0004076555023923445, + "loss": 0.3526, + "step": 24960 + }, + { + "epoch": 39.82, + "grad_norm": 0.8984745144844055, + "learning_rate": 0.0004070175438596491, + "loss": 0.3307, + "step": 24970 + }, + { + "epoch": 39.84, + "grad_norm": 0.2043074071407318, + "learning_rate": 0.00040637958532695374, + "loss": 0.3346, + "step": 24980 + }, + { + "epoch": 39.86, + "grad_norm": 0.293965220451355, + "learning_rate": 0.00040574162679425836, + "loss": 0.3562, + "step": 24990 + }, + { + "epoch": 39.87, + "grad_norm": 0.1676713526248932, + "learning_rate": 0.00040510366826156303, + "loss": 0.2771, + "step": 25000 + }, + { + "epoch": 39.89, + "grad_norm": 0.7040833830833435, + "learning_rate": 0.00040446570972886765, + "loss": 0.3782, + "step": 25010 + }, + { + "epoch": 39.9, + "grad_norm": 1.6222413778305054, + "learning_rate": 0.00040382775119617226, + "loss": 0.2836, + "step": 25020 + }, + { + "epoch": 39.92, + "grad_norm": 0.3965054750442505, + "learning_rate": 0.0004031897926634769, + "loss": 0.4055, + "step": 25030 + }, + { + "epoch": 39.94, + "grad_norm": 0.5142346024513245, + "learning_rate": 0.0004025518341307815, + "loss": 0.34, + "step": 25040 + }, + { + "epoch": 39.95, + "grad_norm": 0.4719744622707367, + "learning_rate": 0.0004019138755980861, + "loss": 0.3643, + "step": 25050 + }, + { + "epoch": 39.97, + "grad_norm": 0.29006433486938477, + "learning_rate": 0.00040127591706539073, + "loss": 0.3195, + "step": 25060 + }, + { + "epoch": 39.98, + "grad_norm": 0.40275096893310547, + "learning_rate": 0.00040063795853269535, + "loss": 0.3247, + "step": 25070 + }, + { + "epoch": 40.0, + "grad_norm": 0.19441524147987366, + "learning_rate": 0.0004, + "loss": 0.2905, + "step": 25080 + }, + { + "epoch": 40.02, + "grad_norm": 0.30110710859298706, + "learning_rate": 0.00039936204146730464, + "loss": 0.2916, + "step": 25090 + }, + { + "epoch": 40.03, + "grad_norm": 0.36237674951553345, + "learning_rate": 0.00039872408293460925, + "loss": 0.3144, + "step": 25100 + }, + { + "epoch": 40.05, + "grad_norm": 0.4144202172756195, + "learning_rate": 0.00039808612440191387, + "loss": 0.2537, + "step": 25110 + }, + { + "epoch": 40.06, + "grad_norm": 0.5469448566436768, + "learning_rate": 0.0003974481658692185, + "loss": 0.2778, + "step": 25120 + }, + { + "epoch": 40.08, + "grad_norm": 0.6350633502006531, + "learning_rate": 0.0003968102073365231, + "loss": 0.3139, + "step": 25130 + }, + { + "epoch": 40.1, + "grad_norm": 0.6425772905349731, + "learning_rate": 0.0003961722488038277, + "loss": 0.3137, + "step": 25140 + }, + { + "epoch": 40.11, + "grad_norm": 0.5132192373275757, + "learning_rate": 0.00039553429027113234, + "loss": 0.3182, + "step": 25150 + }, + { + "epoch": 40.13, + "grad_norm": 0.3655058443546295, + "learning_rate": 0.000394896331738437, + "loss": 0.3213, + "step": 25160 + }, + { + "epoch": 40.14, + "grad_norm": 0.3207656145095825, + "learning_rate": 0.0003942583732057416, + "loss": 0.2839, + "step": 25170 + }, + { + "epoch": 40.16, + "grad_norm": 0.4457024037837982, + "learning_rate": 0.00039362041467304624, + "loss": 0.3271, + "step": 25180 + }, + { + "epoch": 40.18, + "grad_norm": 0.457660049200058, + "learning_rate": 0.00039298245614035086, + "loss": 0.3049, + "step": 25190 + }, + { + "epoch": 40.19, + "grad_norm": 0.44609880447387695, + "learning_rate": 0.00039234449760765553, + "loss": 0.3159, + "step": 25200 + }, + { + "epoch": 40.21, + "grad_norm": 0.14960619807243347, + "learning_rate": 0.00039170653907496015, + "loss": 0.2678, + "step": 25210 + }, + { + "epoch": 40.22, + "grad_norm": 0.20554865896701813, + "learning_rate": 0.00039106858054226476, + "loss": 0.2969, + "step": 25220 + }, + { + "epoch": 40.24, + "grad_norm": 0.25997835397720337, + "learning_rate": 0.0003904306220095694, + "loss": 0.2159, + "step": 25230 + }, + { + "epoch": 40.26, + "grad_norm": 0.18251359462738037, + "learning_rate": 0.00038979266347687405, + "loss": 0.3482, + "step": 25240 + }, + { + "epoch": 40.27, + "grad_norm": 0.3024716377258301, + "learning_rate": 0.00038915470494417867, + "loss": 0.3027, + "step": 25250 + }, + { + "epoch": 40.29, + "grad_norm": 0.38427066802978516, + "learning_rate": 0.0003885167464114833, + "loss": 0.297, + "step": 25260 + }, + { + "epoch": 40.3, + "grad_norm": 0.4605743885040283, + "learning_rate": 0.0003878787878787879, + "loss": 0.2966, + "step": 25270 + }, + { + "epoch": 40.32, + "grad_norm": 0.3320145010948181, + "learning_rate": 0.0003872408293460925, + "loss": 0.2449, + "step": 25280 + }, + { + "epoch": 40.33, + "grad_norm": 0.23880721628665924, + "learning_rate": 0.00038660287081339714, + "loss": 0.2779, + "step": 25290 + }, + { + "epoch": 40.35, + "grad_norm": 0.442751407623291, + "learning_rate": 0.00038596491228070175, + "loss": 0.2729, + "step": 25300 + }, + { + "epoch": 40.37, + "grad_norm": 0.2670186758041382, + "learning_rate": 0.0003853269537480064, + "loss": 0.3296, + "step": 25310 + }, + { + "epoch": 40.38, + "grad_norm": 0.2149314135313034, + "learning_rate": 0.00038468899521531104, + "loss": 0.3094, + "step": 25320 + }, + { + "epoch": 40.4, + "grad_norm": 0.15769945085048676, + "learning_rate": 0.00038405103668261566, + "loss": 0.2962, + "step": 25330 + }, + { + "epoch": 40.41, + "grad_norm": 0.30012694001197815, + "learning_rate": 0.0003834130781499203, + "loss": 0.2864, + "step": 25340 + }, + { + "epoch": 40.43, + "grad_norm": 0.6400253772735596, + "learning_rate": 0.0003827751196172249, + "loss": 0.4076, + "step": 25350 + }, + { + "epoch": 40.45, + "grad_norm": 0.5464116334915161, + "learning_rate": 0.0003821371610845295, + "loss": 0.3281, + "step": 25360 + }, + { + "epoch": 40.46, + "grad_norm": 0.463392972946167, + "learning_rate": 0.0003814992025518341, + "loss": 0.3192, + "step": 25370 + }, + { + "epoch": 40.48, + "grad_norm": 0.1991080492734909, + "learning_rate": 0.00038086124401913874, + "loss": 0.2582, + "step": 25380 + }, + { + "epoch": 40.49, + "grad_norm": 0.5955290198326111, + "learning_rate": 0.0003802232854864434, + "loss": 0.4031, + "step": 25390 + }, + { + "epoch": 40.51, + "grad_norm": 0.22706195712089539, + "learning_rate": 0.00037958532695374803, + "loss": 0.2928, + "step": 25400 + }, + { + "epoch": 40.53, + "grad_norm": 0.4163839817047119, + "learning_rate": 0.00037894736842105265, + "loss": 0.2956, + "step": 25410 + }, + { + "epoch": 40.54, + "grad_norm": 0.2746015787124634, + "learning_rate": 0.00037830940988835726, + "loss": 0.2378, + "step": 25420 + }, + { + "epoch": 40.56, + "grad_norm": 0.23401568830013275, + "learning_rate": 0.0003776714513556619, + "loss": 0.3618, + "step": 25430 + }, + { + "epoch": 40.57, + "grad_norm": 1.4698227643966675, + "learning_rate": 0.0003770334928229665, + "loss": 0.3472, + "step": 25440 + }, + { + "epoch": 40.59, + "grad_norm": 0.29799923300743103, + "learning_rate": 0.0003763955342902711, + "loss": 0.321, + "step": 25450 + }, + { + "epoch": 40.61, + "grad_norm": 0.27735623717308044, + "learning_rate": 0.00037575757575757573, + "loss": 0.346, + "step": 25460 + }, + { + "epoch": 40.62, + "grad_norm": 0.34145793318748474, + "learning_rate": 0.0003751196172248804, + "loss": 0.3426, + "step": 25470 + }, + { + "epoch": 40.64, + "grad_norm": 0.24481597542762756, + "learning_rate": 0.000374481658692185, + "loss": 0.2791, + "step": 25480 + }, + { + "epoch": 40.65, + "grad_norm": 0.5041400194168091, + "learning_rate": 0.00037384370015948964, + "loss": 0.2817, + "step": 25490 + }, + { + "epoch": 40.67, + "grad_norm": 0.3849920332431793, + "learning_rate": 0.00037320574162679425, + "loss": 0.3522, + "step": 25500 + }, + { + "epoch": 40.69, + "grad_norm": 0.4459153413772583, + "learning_rate": 0.00037256778309409887, + "loss": 0.3244, + "step": 25510 + }, + { + "epoch": 40.7, + "grad_norm": 0.441022127866745, + "learning_rate": 0.0003719298245614035, + "loss": 0.2834, + "step": 25520 + }, + { + "epoch": 40.72, + "grad_norm": 0.16988414525985718, + "learning_rate": 0.0003712918660287081, + "loss": 0.2639, + "step": 25530 + }, + { + "epoch": 40.73, + "grad_norm": 0.3544873893260956, + "learning_rate": 0.0003706539074960127, + "loss": 0.3157, + "step": 25540 + }, + { + "epoch": 40.75, + "grad_norm": 0.6139649152755737, + "learning_rate": 0.0003700159489633174, + "loss": 0.3729, + "step": 25550 + }, + { + "epoch": 40.77, + "grad_norm": 0.22452673316001892, + "learning_rate": 0.000369377990430622, + "loss": 0.2209, + "step": 25560 + }, + { + "epoch": 40.78, + "grad_norm": 0.458019495010376, + "learning_rate": 0.0003687400318979267, + "loss": 0.3738, + "step": 25570 + }, + { + "epoch": 40.8, + "grad_norm": 0.24333609640598297, + "learning_rate": 0.0003681020733652313, + "loss": 0.3406, + "step": 25580 + }, + { + "epoch": 40.81, + "grad_norm": 0.135534405708313, + "learning_rate": 0.0003674641148325359, + "loss": 0.2845, + "step": 25590 + }, + { + "epoch": 40.83, + "grad_norm": 1.0264251232147217, + "learning_rate": 0.00036682615629984053, + "loss": 0.3014, + "step": 25600 + }, + { + "epoch": 40.85, + "grad_norm": 0.5027388334274292, + "learning_rate": 0.00036618819776714515, + "loss": 0.2648, + "step": 25610 + }, + { + "epoch": 40.86, + "grad_norm": 0.37629154324531555, + "learning_rate": 0.00036555023923444976, + "loss": 0.3263, + "step": 25620 + }, + { + "epoch": 40.88, + "grad_norm": 0.16155029833316803, + "learning_rate": 0.00036491228070175443, + "loss": 0.2677, + "step": 25630 + }, + { + "epoch": 40.89, + "grad_norm": 0.5950889587402344, + "learning_rate": 0.00036427432216905905, + "loss": 0.2737, + "step": 25640 + }, + { + "epoch": 40.91, + "grad_norm": 1.288246750831604, + "learning_rate": 0.00036363636363636367, + "loss": 0.3458, + "step": 25650 + }, + { + "epoch": 40.93, + "grad_norm": 0.21823683381080627, + "learning_rate": 0.0003629984051036683, + "loss": 0.2528, + "step": 25660 + }, + { + "epoch": 40.94, + "grad_norm": 0.2102632224559784, + "learning_rate": 0.0003623604465709729, + "loss": 0.3346, + "step": 25670 + }, + { + "epoch": 40.96, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0003617224880382775, + "loss": 0.3758, + "step": 25680 + }, + { + "epoch": 40.97, + "grad_norm": 0.20464596152305603, + "learning_rate": 0.00036108452950558214, + "loss": 0.3724, + "step": 25690 + }, + { + "epoch": 40.99, + "grad_norm": 0.38693875074386597, + "learning_rate": 0.00036044657097288675, + "loss": 0.4987, + "step": 25700 + }, + { + "epoch": 41.0, + "grad_norm": 1.1584486961364746, + "learning_rate": 0.0003598086124401914, + "loss": 0.3108, + "step": 25710 + }, + { + "epoch": 41.02, + "grad_norm": 0.23398354649543762, + "learning_rate": 0.00035917065390749604, + "loss": 0.321, + "step": 25720 + }, + { + "epoch": 41.04, + "grad_norm": 0.265209823846817, + "learning_rate": 0.00035853269537480066, + "loss": 0.379, + "step": 25730 + }, + { + "epoch": 41.05, + "grad_norm": 0.5159454941749573, + "learning_rate": 0.0003578947368421053, + "loss": 0.2849, + "step": 25740 + }, + { + "epoch": 41.07, + "grad_norm": 0.3185652792453766, + "learning_rate": 0.0003572567783094099, + "loss": 0.2885, + "step": 25750 + }, + { + "epoch": 41.08, + "grad_norm": 0.6398610472679138, + "learning_rate": 0.0003566188197767145, + "loss": 0.3583, + "step": 25760 + }, + { + "epoch": 41.1, + "grad_norm": 0.5768219232559204, + "learning_rate": 0.0003559808612440191, + "loss": 0.3427, + "step": 25770 + }, + { + "epoch": 41.12, + "grad_norm": 0.5042071342468262, + "learning_rate": 0.00035534290271132374, + "loss": 0.3047, + "step": 25780 + }, + { + "epoch": 41.13, + "grad_norm": 0.20871587097644806, + "learning_rate": 0.0003547049441786284, + "loss": 0.2634, + "step": 25790 + }, + { + "epoch": 41.15, + "grad_norm": 0.20863570272922516, + "learning_rate": 0.00035406698564593303, + "loss": 0.3444, + "step": 25800 + }, + { + "epoch": 41.16, + "grad_norm": 0.43497905135154724, + "learning_rate": 0.00035342902711323765, + "loss": 0.3717, + "step": 25810 + }, + { + "epoch": 41.18, + "grad_norm": 0.5420474410057068, + "learning_rate": 0.00035279106858054226, + "loss": 0.2936, + "step": 25820 + }, + { + "epoch": 41.2, + "grad_norm": 0.16857664287090302, + "learning_rate": 0.0003521531100478469, + "loss": 0.2317, + "step": 25830 + }, + { + "epoch": 41.21, + "grad_norm": 0.176952064037323, + "learning_rate": 0.0003515151515151515, + "loss": 0.2895, + "step": 25840 + }, + { + "epoch": 41.23, + "grad_norm": 0.3629634380340576, + "learning_rate": 0.0003508771929824561, + "loss": 0.3161, + "step": 25850 + }, + { + "epoch": 41.24, + "grad_norm": 0.3649951219558716, + "learning_rate": 0.00035023923444976073, + "loss": 0.3293, + "step": 25860 + }, + { + "epoch": 41.26, + "grad_norm": 0.2517475187778473, + "learning_rate": 0.0003496012759170654, + "loss": 0.3179, + "step": 25870 + }, + { + "epoch": 41.28, + "grad_norm": 0.18728438019752502, + "learning_rate": 0.00034896331738437, + "loss": 0.278, + "step": 25880 + }, + { + "epoch": 41.29, + "grad_norm": 0.3795156180858612, + "learning_rate": 0.00034832535885167464, + "loss": 0.3076, + "step": 25890 + }, + { + "epoch": 41.31, + "grad_norm": 0.6630691289901733, + "learning_rate": 0.00034768740031897925, + "loss": 0.3059, + "step": 25900 + }, + { + "epoch": 41.32, + "grad_norm": 0.5528631210327148, + "learning_rate": 0.00034704944178628387, + "loss": 0.287, + "step": 25910 + }, + { + "epoch": 41.34, + "grad_norm": 0.43808212876319885, + "learning_rate": 0.0003464114832535885, + "loss": 0.2863, + "step": 25920 + }, + { + "epoch": 41.36, + "grad_norm": 0.18791545927524567, + "learning_rate": 0.0003457735247208931, + "loss": 0.3391, + "step": 25930 + }, + { + "epoch": 41.37, + "grad_norm": 0.3744913935661316, + "learning_rate": 0.0003451355661881977, + "loss": 0.3083, + "step": 25940 + }, + { + "epoch": 41.39, + "grad_norm": 0.48115044832229614, + "learning_rate": 0.00034449760765550245, + "loss": 0.3296, + "step": 25950 + }, + { + "epoch": 41.4, + "grad_norm": 0.38108351826667786, + "learning_rate": 0.00034385964912280706, + "loss": 0.2663, + "step": 25960 + }, + { + "epoch": 41.42, + "grad_norm": 0.3938140869140625, + "learning_rate": 0.0003432216905901117, + "loss": 0.3353, + "step": 25970 + }, + { + "epoch": 41.44, + "grad_norm": 0.2402111142873764, + "learning_rate": 0.0003425837320574163, + "loss": 0.3786, + "step": 25980 + }, + { + "epoch": 41.45, + "grad_norm": 0.39668262004852295, + "learning_rate": 0.0003419457735247209, + "loss": 0.2367, + "step": 25990 + }, + { + "epoch": 41.47, + "grad_norm": 0.3418915569782257, + "learning_rate": 0.00034130781499202553, + "loss": 0.2675, + "step": 26000 + }, + { + "epoch": 41.48, + "grad_norm": 0.5036392211914062, + "learning_rate": 0.00034066985645933015, + "loss": 0.4014, + "step": 26010 + }, + { + "epoch": 41.5, + "grad_norm": 0.4944436550140381, + "learning_rate": 0.0003400318979266348, + "loss": 0.3061, + "step": 26020 + }, + { + "epoch": 41.52, + "grad_norm": 0.36498111486434937, + "learning_rate": 0.00033939393939393943, + "loss": 0.2843, + "step": 26030 + }, + { + "epoch": 41.53, + "grad_norm": 0.4892807900905609, + "learning_rate": 0.00033875598086124405, + "loss": 0.349, + "step": 26040 + }, + { + "epoch": 41.55, + "grad_norm": 0.23948755860328674, + "learning_rate": 0.00033811802232854867, + "loss": 0.2737, + "step": 26050 + }, + { + "epoch": 41.56, + "grad_norm": 0.22319771349430084, + "learning_rate": 0.0003374800637958533, + "loss": 0.2962, + "step": 26060 + }, + { + "epoch": 41.58, + "grad_norm": 0.3389337956905365, + "learning_rate": 0.0003368421052631579, + "loss": 0.3462, + "step": 26070 + }, + { + "epoch": 41.59, + "grad_norm": 0.15749427676200867, + "learning_rate": 0.0003362041467304625, + "loss": 0.3707, + "step": 26080 + }, + { + "epoch": 41.61, + "grad_norm": 0.5987353324890137, + "learning_rate": 0.00033556618819776714, + "loss": 0.3666, + "step": 26090 + }, + { + "epoch": 41.63, + "grad_norm": 0.21494194865226746, + "learning_rate": 0.0003349282296650718, + "loss": 0.296, + "step": 26100 + }, + { + "epoch": 41.64, + "grad_norm": 0.4202018976211548, + "learning_rate": 0.0003342902711323764, + "loss": 0.3, + "step": 26110 + }, + { + "epoch": 41.66, + "grad_norm": 0.33832699060440063, + "learning_rate": 0.00033365231259968104, + "loss": 0.3593, + "step": 26120 + }, + { + "epoch": 41.67, + "grad_norm": 0.18312333524227142, + "learning_rate": 0.00033301435406698566, + "loss": 0.2593, + "step": 26130 + }, + { + "epoch": 41.69, + "grad_norm": 0.6346192359924316, + "learning_rate": 0.0003323763955342903, + "loss": 0.3646, + "step": 26140 + }, + { + "epoch": 41.71, + "grad_norm": 0.2041671872138977, + "learning_rate": 0.0003317384370015949, + "loss": 0.2913, + "step": 26150 + }, + { + "epoch": 41.72, + "grad_norm": 0.23247523605823517, + "learning_rate": 0.0003311004784688995, + "loss": 0.2864, + "step": 26160 + }, + { + "epoch": 41.74, + "grad_norm": 0.6074626445770264, + "learning_rate": 0.0003304625199362041, + "loss": 0.2761, + "step": 26170 + }, + { + "epoch": 41.75, + "grad_norm": 0.2906535267829895, + "learning_rate": 0.0003298245614035088, + "loss": 0.2929, + "step": 26180 + }, + { + "epoch": 41.77, + "grad_norm": 0.36293816566467285, + "learning_rate": 0.0003291866028708134, + "loss": 0.2971, + "step": 26190 + }, + { + "epoch": 41.79, + "grad_norm": 0.3410266041755676, + "learning_rate": 0.00032854864433811803, + "loss": 0.2934, + "step": 26200 + }, + { + "epoch": 41.8, + "grad_norm": 0.5327407717704773, + "learning_rate": 0.00032791068580542265, + "loss": 0.3566, + "step": 26210 + }, + { + "epoch": 41.82, + "grad_norm": 0.4243089556694031, + "learning_rate": 0.00032727272727272726, + "loss": 0.2588, + "step": 26220 + }, + { + "epoch": 41.83, + "grad_norm": 0.3032602369785309, + "learning_rate": 0.0003266347687400319, + "loss": 0.2629, + "step": 26230 + }, + { + "epoch": 41.85, + "grad_norm": 0.4830479621887207, + "learning_rate": 0.0003259968102073365, + "loss": 0.3939, + "step": 26240 + }, + { + "epoch": 41.87, + "grad_norm": 0.11178059130907059, + "learning_rate": 0.0003253588516746411, + "loss": 0.2336, + "step": 26250 + }, + { + "epoch": 41.88, + "grad_norm": 0.11337348073720932, + "learning_rate": 0.0003247208931419458, + "loss": 0.2311, + "step": 26260 + }, + { + "epoch": 41.9, + "grad_norm": 0.42159444093704224, + "learning_rate": 0.0003240829346092504, + "loss": 0.3213, + "step": 26270 + }, + { + "epoch": 41.91, + "grad_norm": 0.36887168884277344, + "learning_rate": 0.000323444976076555, + "loss": 0.2928, + "step": 26280 + }, + { + "epoch": 41.93, + "grad_norm": 0.4706740081310272, + "learning_rate": 0.00032280701754385964, + "loss": 0.2848, + "step": 26290 + }, + { + "epoch": 41.95, + "grad_norm": 0.5931901335716248, + "learning_rate": 0.00032216905901116425, + "loss": 0.3273, + "step": 26300 + }, + { + "epoch": 41.96, + "grad_norm": 0.21357150375843048, + "learning_rate": 0.00032153110047846887, + "loss": 0.2859, + "step": 26310 + }, + { + "epoch": 41.98, + "grad_norm": 0.48659244179725647, + "learning_rate": 0.00032089314194577354, + "loss": 0.3588, + "step": 26320 + }, + { + "epoch": 41.99, + "grad_norm": 0.28712713718414307, + "learning_rate": 0.00032025518341307816, + "loss": 0.3016, + "step": 26330 + }, + { + "epoch": 42.01, + "grad_norm": 0.5351189970970154, + "learning_rate": 0.00031961722488038283, + "loss": 0.3306, + "step": 26340 + }, + { + "epoch": 42.03, + "grad_norm": 0.22305412590503693, + "learning_rate": 0.00031897926634768745, + "loss": 0.283, + "step": 26350 + }, + { + "epoch": 42.04, + "grad_norm": 0.3026597797870636, + "learning_rate": 0.00031834130781499206, + "loss": 0.2912, + "step": 26360 + }, + { + "epoch": 42.06, + "grad_norm": 0.3411235809326172, + "learning_rate": 0.0003177033492822967, + "loss": 0.3273, + "step": 26370 + }, + { + "epoch": 42.07, + "grad_norm": 0.31902214884757996, + "learning_rate": 0.0003170653907496013, + "loss": 0.2511, + "step": 26380 + }, + { + "epoch": 42.09, + "grad_norm": 0.2367999106645584, + "learning_rate": 0.0003164274322169059, + "loss": 0.2427, + "step": 26390 + }, + { + "epoch": 42.11, + "grad_norm": 0.24773749709129333, + "learning_rate": 0.00031578947368421053, + "loss": 0.2991, + "step": 26400 + }, + { + "epoch": 42.12, + "grad_norm": 0.33940422534942627, + "learning_rate": 0.00031515151515151515, + "loss": 0.2688, + "step": 26410 + }, + { + "epoch": 42.14, + "grad_norm": 0.4297594130039215, + "learning_rate": 0.0003145135566188198, + "loss": 0.2882, + "step": 26420 + }, + { + "epoch": 42.15, + "grad_norm": 0.38739773631095886, + "learning_rate": 0.00031387559808612443, + "loss": 0.299, + "step": 26430 + }, + { + "epoch": 42.17, + "grad_norm": 0.19908225536346436, + "learning_rate": 0.00031323763955342905, + "loss": 0.2284, + "step": 26440 + }, + { + "epoch": 42.19, + "grad_norm": 0.2024683952331543, + "learning_rate": 0.00031259968102073367, + "loss": 0.27, + "step": 26450 + }, + { + "epoch": 42.2, + "grad_norm": 0.27837881445884705, + "learning_rate": 0.0003119617224880383, + "loss": 0.3353, + "step": 26460 + }, + { + "epoch": 42.22, + "grad_norm": 0.25491103529930115, + "learning_rate": 0.0003113237639553429, + "loss": 0.3187, + "step": 26470 + }, + { + "epoch": 42.23, + "grad_norm": 0.430846244096756, + "learning_rate": 0.0003106858054226475, + "loss": 0.2994, + "step": 26480 + }, + { + "epoch": 42.25, + "grad_norm": 0.3018259108066559, + "learning_rate": 0.00031004784688995214, + "loss": 0.3171, + "step": 26490 + }, + { + "epoch": 42.26, + "grad_norm": 0.29348355531692505, + "learning_rate": 0.0003094098883572568, + "loss": 0.3, + "step": 26500 + }, + { + "epoch": 42.28, + "grad_norm": 0.3258605897426605, + "learning_rate": 0.0003087719298245614, + "loss": 0.2795, + "step": 26510 + }, + { + "epoch": 42.3, + "grad_norm": 0.17465408146381378, + "learning_rate": 0.00030813397129186604, + "loss": 0.3106, + "step": 26520 + }, + { + "epoch": 42.31, + "grad_norm": 0.2361348271369934, + "learning_rate": 0.00030749601275917066, + "loss": 0.2802, + "step": 26530 + }, + { + "epoch": 42.33, + "grad_norm": 0.18255957961082458, + "learning_rate": 0.0003068580542264753, + "loss": 0.2396, + "step": 26540 + }, + { + "epoch": 42.34, + "grad_norm": 0.5694864988327026, + "learning_rate": 0.0003062200956937799, + "loss": 0.3536, + "step": 26550 + }, + { + "epoch": 42.36, + "grad_norm": 0.37303659319877625, + "learning_rate": 0.0003055821371610845, + "loss": 0.263, + "step": 26560 + }, + { + "epoch": 42.38, + "grad_norm": 0.3398790657520294, + "learning_rate": 0.0003049441786283891, + "loss": 0.2204, + "step": 26570 + }, + { + "epoch": 42.39, + "grad_norm": 0.28415796160697937, + "learning_rate": 0.0003043062200956938, + "loss": 0.3556, + "step": 26580 + }, + { + "epoch": 42.41, + "grad_norm": 0.4093596637248993, + "learning_rate": 0.0003036682615629984, + "loss": 0.2996, + "step": 26590 + }, + { + "epoch": 42.42, + "grad_norm": 0.25546014308929443, + "learning_rate": 0.00030303030303030303, + "loss": 0.3492, + "step": 26600 + }, + { + "epoch": 42.44, + "grad_norm": 0.7774071097373962, + "learning_rate": 0.00030239234449760765, + "loss": 0.3175, + "step": 26610 + }, + { + "epoch": 42.46, + "grad_norm": 0.7066117525100708, + "learning_rate": 0.00030175438596491226, + "loss": 0.3957, + "step": 26620 + }, + { + "epoch": 42.47, + "grad_norm": 0.42754918336868286, + "learning_rate": 0.0003011164274322169, + "loss": 0.2983, + "step": 26630 + }, + { + "epoch": 42.49, + "grad_norm": 0.5412092208862305, + "learning_rate": 0.0003004784688995215, + "loss": 0.3556, + "step": 26640 + }, + { + "epoch": 42.5, + "grad_norm": 0.19610168039798737, + "learning_rate": 0.0002998405103668261, + "loss": 0.309, + "step": 26650 + }, + { + "epoch": 42.52, + "grad_norm": 0.4178897738456726, + "learning_rate": 0.0002992025518341308, + "loss": 0.2874, + "step": 26660 + }, + { + "epoch": 42.54, + "grad_norm": 0.24159128963947296, + "learning_rate": 0.0002985645933014354, + "loss": 0.3137, + "step": 26670 + }, + { + "epoch": 42.55, + "grad_norm": 0.3273567259311676, + "learning_rate": 0.00029792663476874, + "loss": 0.3238, + "step": 26680 + }, + { + "epoch": 42.57, + "grad_norm": 0.48245471715927124, + "learning_rate": 0.0002972886762360447, + "loss": 0.42, + "step": 26690 + }, + { + "epoch": 42.58, + "grad_norm": 0.7114046216011047, + "learning_rate": 0.0002966507177033493, + "loss": 0.3498, + "step": 26700 + }, + { + "epoch": 42.6, + "grad_norm": 0.6506601572036743, + "learning_rate": 0.0002960127591706539, + "loss": 0.3406, + "step": 26710 + }, + { + "epoch": 42.62, + "grad_norm": 0.5458781719207764, + "learning_rate": 0.00029537480063795854, + "loss": 0.2737, + "step": 26720 + }, + { + "epoch": 42.63, + "grad_norm": 0.19456742703914642, + "learning_rate": 0.00029473684210526316, + "loss": 0.31, + "step": 26730 + }, + { + "epoch": 42.65, + "grad_norm": 0.178878054022789, + "learning_rate": 0.00029409888357256783, + "loss": 0.3827, + "step": 26740 + }, + { + "epoch": 42.66, + "grad_norm": 0.40357646346092224, + "learning_rate": 0.00029346092503987245, + "loss": 0.2755, + "step": 26750 + }, + { + "epoch": 42.68, + "grad_norm": 0.5037977695465088, + "learning_rate": 0.00029282296650717706, + "loss": 0.3228, + "step": 26760 + }, + { + "epoch": 42.7, + "grad_norm": 0.20705698430538177, + "learning_rate": 0.0002921850079744817, + "loss": 0.2309, + "step": 26770 + }, + { + "epoch": 42.71, + "grad_norm": 0.22491195797920227, + "learning_rate": 0.0002915470494417863, + "loss": 0.2569, + "step": 26780 + }, + { + "epoch": 42.73, + "grad_norm": 0.270967036485672, + "learning_rate": 0.0002909090909090909, + "loss": 0.2846, + "step": 26790 + }, + { + "epoch": 42.74, + "grad_norm": 0.1675962209701538, + "learning_rate": 0.00029027113237639553, + "loss": 0.3479, + "step": 26800 + }, + { + "epoch": 42.76, + "grad_norm": 0.24002137780189514, + "learning_rate": 0.0002896331738437002, + "loss": 0.3825, + "step": 26810 + }, + { + "epoch": 42.78, + "grad_norm": 0.7108230590820312, + "learning_rate": 0.0002889952153110048, + "loss": 0.4057, + "step": 26820 + }, + { + "epoch": 42.79, + "grad_norm": 0.5931742787361145, + "learning_rate": 0.00028835725677830943, + "loss": 0.3256, + "step": 26830 + }, + { + "epoch": 42.81, + "grad_norm": 0.4527370035648346, + "learning_rate": 0.00028771929824561405, + "loss": 0.2943, + "step": 26840 + }, + { + "epoch": 42.82, + "grad_norm": 0.6159200072288513, + "learning_rate": 0.00028708133971291867, + "loss": 0.3117, + "step": 26850 + }, + { + "epoch": 42.84, + "grad_norm": 0.1614978313446045, + "learning_rate": 0.0002864433811802233, + "loss": 0.2834, + "step": 26860 + }, + { + "epoch": 42.85, + "grad_norm": 0.37030118703842163, + "learning_rate": 0.0002858054226475279, + "loss": 0.3029, + "step": 26870 + }, + { + "epoch": 42.87, + "grad_norm": 0.13131965696811676, + "learning_rate": 0.0002851674641148325, + "loss": 0.27, + "step": 26880 + }, + { + "epoch": 42.89, + "grad_norm": 0.42525768280029297, + "learning_rate": 0.0002845295055821372, + "loss": 0.3307, + "step": 26890 + }, + { + "epoch": 42.9, + "grad_norm": 0.17870941758155823, + "learning_rate": 0.0002838915470494418, + "loss": 0.2564, + "step": 26900 + }, + { + "epoch": 42.92, + "grad_norm": 0.7622866630554199, + "learning_rate": 0.0002832535885167464, + "loss": 0.3327, + "step": 26910 + }, + { + "epoch": 42.93, + "grad_norm": 0.5731341242790222, + "learning_rate": 0.00028261562998405104, + "loss": 0.3194, + "step": 26920 + }, + { + "epoch": 42.95, + "grad_norm": 0.3763886094093323, + "learning_rate": 0.00028197767145135566, + "loss": 0.2775, + "step": 26930 + }, + { + "epoch": 42.97, + "grad_norm": 0.33604711294174194, + "learning_rate": 0.0002813397129186603, + "loss": 0.2825, + "step": 26940 + }, + { + "epoch": 42.98, + "grad_norm": 0.2752174437046051, + "learning_rate": 0.0002807017543859649, + "loss": 0.3769, + "step": 26950 + }, + { + "epoch": 43.0, + "grad_norm": 0.4602324962615967, + "learning_rate": 0.0002800637958532695, + "loss": 0.3297, + "step": 26960 + }, + { + "epoch": 43.01, + "grad_norm": 0.263231486082077, + "learning_rate": 0.0002794258373205742, + "loss": 0.226, + "step": 26970 + }, + { + "epoch": 43.03, + "grad_norm": 0.085409976541996, + "learning_rate": 0.0002787878787878788, + "loss": 0.2887, + "step": 26980 + }, + { + "epoch": 43.05, + "grad_norm": 0.3499665856361389, + "learning_rate": 0.0002781499202551834, + "loss": 0.3344, + "step": 26990 + }, + { + "epoch": 43.06, + "grad_norm": 0.6164402365684509, + "learning_rate": 0.00027751196172248803, + "loss": 0.3555, + "step": 27000 + }, + { + "epoch": 43.08, + "grad_norm": 0.22411352396011353, + "learning_rate": 0.00027687400318979265, + "loss": 0.3044, + "step": 27010 + }, + { + "epoch": 43.09, + "grad_norm": 0.5322696566581726, + "learning_rate": 0.00027623604465709726, + "loss": 0.2515, + "step": 27020 + }, + { + "epoch": 43.11, + "grad_norm": 0.382097989320755, + "learning_rate": 0.0002755980861244019, + "loss": 0.399, + "step": 27030 + }, + { + "epoch": 43.13, + "grad_norm": 0.13839659094810486, + "learning_rate": 0.0002749601275917065, + "loss": 0.3297, + "step": 27040 + }, + { + "epoch": 43.14, + "grad_norm": 0.22009891271591187, + "learning_rate": 0.00027432216905901117, + "loss": 0.277, + "step": 27050 + }, + { + "epoch": 43.16, + "grad_norm": 0.41159576177597046, + "learning_rate": 0.00027368421052631584, + "loss": 0.3107, + "step": 27060 + }, + { + "epoch": 43.17, + "grad_norm": 0.21699748933315277, + "learning_rate": 0.00027304625199362046, + "loss": 0.2765, + "step": 27070 + }, + { + "epoch": 43.19, + "grad_norm": 0.21291545033454895, + "learning_rate": 0.0002724082934609251, + "loss": 0.2479, + "step": 27080 + }, + { + "epoch": 43.21, + "grad_norm": 0.20848800241947174, + "learning_rate": 0.0002717703349282297, + "loss": 0.316, + "step": 27090 + }, + { + "epoch": 43.22, + "grad_norm": 0.41950148344039917, + "learning_rate": 0.0002711323763955343, + "loss": 0.3588, + "step": 27100 + }, + { + "epoch": 43.24, + "grad_norm": 0.33547741174697876, + "learning_rate": 0.0002704944178628389, + "loss": 0.2836, + "step": 27110 + }, + { + "epoch": 43.25, + "grad_norm": 1.4663803577423096, + "learning_rate": 0.00026985645933014354, + "loss": 0.3852, + "step": 27120 + }, + { + "epoch": 43.27, + "grad_norm": 0.2404787391424179, + "learning_rate": 0.0002692185007974482, + "loss": 0.2629, + "step": 27130 + }, + { + "epoch": 43.29, + "grad_norm": 0.08930987864732742, + "learning_rate": 0.00026858054226475283, + "loss": 0.1914, + "step": 27140 + }, + { + "epoch": 43.3, + "grad_norm": 0.3522126376628876, + "learning_rate": 0.00026794258373205745, + "loss": 0.3222, + "step": 27150 + }, + { + "epoch": 43.32, + "grad_norm": 2.05954909324646, + "learning_rate": 0.00026730462519936206, + "loss": 0.3058, + "step": 27160 + }, + { + "epoch": 43.33, + "grad_norm": 0.36962321400642395, + "learning_rate": 0.0002666666666666667, + "loss": 0.2829, + "step": 27170 + }, + { + "epoch": 43.35, + "grad_norm": 0.18911263346672058, + "learning_rate": 0.0002660287081339713, + "loss": 0.3731, + "step": 27180 + }, + { + "epoch": 43.37, + "grad_norm": 0.19024628400802612, + "learning_rate": 0.0002653907496012759, + "loss": 0.2726, + "step": 27190 + }, + { + "epoch": 43.38, + "grad_norm": 0.20783045887947083, + "learning_rate": 0.00026475279106858053, + "loss": 0.2787, + "step": 27200 + }, + { + "epoch": 43.4, + "grad_norm": 1.8203842639923096, + "learning_rate": 0.0002641148325358852, + "loss": 0.2755, + "step": 27210 + }, + { + "epoch": 43.41, + "grad_norm": 0.41969624161720276, + "learning_rate": 0.0002634768740031898, + "loss": 0.3315, + "step": 27220 + }, + { + "epoch": 43.43, + "grad_norm": 0.17119190096855164, + "learning_rate": 0.00026283891547049443, + "loss": 0.2718, + "step": 27230 + }, + { + "epoch": 43.44, + "grad_norm": 0.24514427781105042, + "learning_rate": 0.00026220095693779905, + "loss": 0.2831, + "step": 27240 + }, + { + "epoch": 43.46, + "grad_norm": 0.24649424850940704, + "learning_rate": 0.00026156299840510367, + "loss": 0.2647, + "step": 27250 + }, + { + "epoch": 43.48, + "grad_norm": 0.3236254155635834, + "learning_rate": 0.0002609250398724083, + "loss": 0.2733, + "step": 27260 + }, + { + "epoch": 43.49, + "grad_norm": 0.4180354177951813, + "learning_rate": 0.0002602870813397129, + "loss": 0.3352, + "step": 27270 + }, + { + "epoch": 43.51, + "grad_norm": 0.4652386009693146, + "learning_rate": 0.0002596491228070175, + "loss": 0.301, + "step": 27280 + }, + { + "epoch": 43.52, + "grad_norm": 0.30387723445892334, + "learning_rate": 0.0002590111642743222, + "loss": 0.2286, + "step": 27290 + }, + { + "epoch": 43.54, + "grad_norm": 0.13368535041809082, + "learning_rate": 0.0002583732057416268, + "loss": 0.2803, + "step": 27300 + }, + { + "epoch": 43.56, + "grad_norm": 0.10856983065605164, + "learning_rate": 0.0002577352472089314, + "loss": 0.3184, + "step": 27310 + }, + { + "epoch": 43.57, + "grad_norm": 0.3115447759628296, + "learning_rate": 0.00025709728867623604, + "loss": 0.3274, + "step": 27320 + }, + { + "epoch": 43.59, + "grad_norm": 0.3176775276660919, + "learning_rate": 0.00025645933014354066, + "loss": 0.3476, + "step": 27330 + }, + { + "epoch": 43.6, + "grad_norm": 0.17715303599834442, + "learning_rate": 0.0002558213716108453, + "loss": 0.2684, + "step": 27340 + }, + { + "epoch": 43.62, + "grad_norm": 0.39675870537757874, + "learning_rate": 0.0002551834130781499, + "loss": 0.317, + "step": 27350 + }, + { + "epoch": 43.64, + "grad_norm": 0.29539576172828674, + "learning_rate": 0.0002545454545454545, + "loss": 0.3124, + "step": 27360 + }, + { + "epoch": 43.65, + "grad_norm": 0.1516566276550293, + "learning_rate": 0.0002539074960127592, + "loss": 0.3537, + "step": 27370 + }, + { + "epoch": 43.67, + "grad_norm": 0.3762792944908142, + "learning_rate": 0.0002532695374800638, + "loss": 0.3026, + "step": 27380 + }, + { + "epoch": 43.68, + "grad_norm": 0.15428495407104492, + "learning_rate": 0.0002526315789473684, + "loss": 0.2849, + "step": 27390 + }, + { + "epoch": 43.7, + "grad_norm": 0.22668874263763428, + "learning_rate": 0.00025199362041467303, + "loss": 0.2174, + "step": 27400 + }, + { + "epoch": 43.72, + "grad_norm": 0.12878923118114471, + "learning_rate": 0.00025135566188197765, + "loss": 0.2367, + "step": 27410 + }, + { + "epoch": 43.73, + "grad_norm": 0.1742442101240158, + "learning_rate": 0.00025071770334928226, + "loss": 0.3225, + "step": 27420 + }, + { + "epoch": 43.75, + "grad_norm": 0.2178335189819336, + "learning_rate": 0.0002500797448165869, + "loss": 0.3674, + "step": 27430 + }, + { + "epoch": 43.76, + "grad_norm": 0.07598412036895752, + "learning_rate": 0.00024944178628389155, + "loss": 0.3825, + "step": 27440 + }, + { + "epoch": 43.78, + "grad_norm": 0.3597804605960846, + "learning_rate": 0.00024880382775119617, + "loss": 0.2679, + "step": 27450 + }, + { + "epoch": 43.8, + "grad_norm": 0.5584509968757629, + "learning_rate": 0.0002481658692185008, + "loss": 0.3459, + "step": 27460 + }, + { + "epoch": 43.81, + "grad_norm": 0.19587256014347076, + "learning_rate": 0.00024752791068580546, + "loss": 0.3732, + "step": 27470 + }, + { + "epoch": 43.83, + "grad_norm": 0.4442209303379059, + "learning_rate": 0.0002468899521531101, + "loss": 0.3049, + "step": 27480 + }, + { + "epoch": 43.84, + "grad_norm": 0.259143590927124, + "learning_rate": 0.0002462519936204147, + "loss": 0.2928, + "step": 27490 + }, + { + "epoch": 43.86, + "grad_norm": 0.19528359174728394, + "learning_rate": 0.0002456140350877193, + "loss": 0.2799, + "step": 27500 + }, + { + "epoch": 43.88, + "grad_norm": 0.47608378529548645, + "learning_rate": 0.0002449760765550239, + "loss": 0.3079, + "step": 27510 + }, + { + "epoch": 43.89, + "grad_norm": 0.2542645335197449, + "learning_rate": 0.00024433811802232854, + "loss": 0.2834, + "step": 27520 + }, + { + "epoch": 43.91, + "grad_norm": 0.37310686707496643, + "learning_rate": 0.00024370015948963318, + "loss": 0.3171, + "step": 27530 + }, + { + "epoch": 43.92, + "grad_norm": 0.6291790008544922, + "learning_rate": 0.00024306220095693783, + "loss": 0.3439, + "step": 27540 + }, + { + "epoch": 43.94, + "grad_norm": 0.5721063613891602, + "learning_rate": 0.00024242424242424245, + "loss": 0.3039, + "step": 27550 + }, + { + "epoch": 43.96, + "grad_norm": 0.1536693423986435, + "learning_rate": 0.00024178628389154706, + "loss": 0.2981, + "step": 27560 + }, + { + "epoch": 43.97, + "grad_norm": 0.3179001212120056, + "learning_rate": 0.00024114832535885168, + "loss": 0.2213, + "step": 27570 + }, + { + "epoch": 43.99, + "grad_norm": 0.19436044991016388, + "learning_rate": 0.00024051036682615632, + "loss": 0.2756, + "step": 27580 + }, + { + "epoch": 44.0, + "grad_norm": 0.21824301779270172, + "learning_rate": 0.00023987240829346094, + "loss": 0.2627, + "step": 27590 + }, + { + "epoch": 44.02, + "grad_norm": 0.38110193610191345, + "learning_rate": 0.00023923444976076556, + "loss": 0.248, + "step": 27600 + }, + { + "epoch": 44.04, + "grad_norm": 0.2160405069589615, + "learning_rate": 0.00023859649122807017, + "loss": 0.2404, + "step": 27610 + }, + { + "epoch": 44.05, + "grad_norm": 0.3136873245239258, + "learning_rate": 0.00023795853269537482, + "loss": 0.3034, + "step": 27620 + }, + { + "epoch": 44.07, + "grad_norm": 0.21699780225753784, + "learning_rate": 0.00023732057416267943, + "loss": 0.3371, + "step": 27630 + }, + { + "epoch": 44.08, + "grad_norm": 0.3122328221797943, + "learning_rate": 0.00023668261562998405, + "loss": 0.2569, + "step": 27640 + }, + { + "epoch": 44.1, + "grad_norm": 0.45483753085136414, + "learning_rate": 0.00023604465709728867, + "loss": 0.2873, + "step": 27650 + }, + { + "epoch": 44.11, + "grad_norm": 0.39906224608421326, + "learning_rate": 0.0002354066985645933, + "loss": 0.3227, + "step": 27660 + }, + { + "epoch": 44.13, + "grad_norm": 0.24932830035686493, + "learning_rate": 0.00023476874003189793, + "loss": 0.2343, + "step": 27670 + }, + { + "epoch": 44.15, + "grad_norm": 0.09502261132001877, + "learning_rate": 0.00023413078149920255, + "loss": 0.3065, + "step": 27680 + }, + { + "epoch": 44.16, + "grad_norm": 0.3910047709941864, + "learning_rate": 0.00023349282296650716, + "loss": 0.3902, + "step": 27690 + }, + { + "epoch": 44.18, + "grad_norm": 0.2578485310077667, + "learning_rate": 0.0002328548644338118, + "loss": 0.3062, + "step": 27700 + }, + { + "epoch": 44.19, + "grad_norm": 0.40186047554016113, + "learning_rate": 0.00023221690590111642, + "loss": 0.3129, + "step": 27710 + }, + { + "epoch": 44.21, + "grad_norm": 0.8674927353858948, + "learning_rate": 0.00023157894736842107, + "loss": 0.3522, + "step": 27720 + }, + { + "epoch": 44.23, + "grad_norm": 0.1684367060661316, + "learning_rate": 0.00023094098883572568, + "loss": 0.2683, + "step": 27730 + }, + { + "epoch": 44.24, + "grad_norm": 0.34888872504234314, + "learning_rate": 0.00023030303030303033, + "loss": 0.2477, + "step": 27740 + }, + { + "epoch": 44.26, + "grad_norm": 0.5431171655654907, + "learning_rate": 0.00022966507177033495, + "loss": 0.35, + "step": 27750 + }, + { + "epoch": 44.27, + "grad_norm": 0.3396085202693939, + "learning_rate": 0.00022902711323763956, + "loss": 0.3115, + "step": 27760 + }, + { + "epoch": 44.29, + "grad_norm": 0.327421635389328, + "learning_rate": 0.00022838915470494418, + "loss": 0.3153, + "step": 27770 + }, + { + "epoch": 44.31, + "grad_norm": 0.34646356105804443, + "learning_rate": 0.00022775119617224882, + "loss": 0.3603, + "step": 27780 + }, + { + "epoch": 44.32, + "grad_norm": 0.3496292233467102, + "learning_rate": 0.00022711323763955344, + "loss": 0.2877, + "step": 27790 + }, + { + "epoch": 44.34, + "grad_norm": 0.19173116981983185, + "learning_rate": 0.00022647527910685806, + "loss": 0.2755, + "step": 27800 + }, + { + "epoch": 44.35, + "grad_norm": 0.6964245438575745, + "learning_rate": 0.00022583732057416267, + "loss": 0.3568, + "step": 27810 + }, + { + "epoch": 44.37, + "grad_norm": 0.283237099647522, + "learning_rate": 0.00022519936204146732, + "loss": 0.3004, + "step": 27820 + }, + { + "epoch": 44.39, + "grad_norm": 0.3077571988105774, + "learning_rate": 0.00022456140350877193, + "loss": 0.3139, + "step": 27830 + }, + { + "epoch": 44.4, + "grad_norm": 0.44178569316864014, + "learning_rate": 0.00022392344497607655, + "loss": 0.2253, + "step": 27840 + }, + { + "epoch": 44.42, + "grad_norm": 0.23611438274383545, + "learning_rate": 0.00022328548644338117, + "loss": 0.3357, + "step": 27850 + }, + { + "epoch": 44.43, + "grad_norm": 0.402852326631546, + "learning_rate": 0.0002226475279106858, + "loss": 0.3024, + "step": 27860 + }, + { + "epoch": 44.45, + "grad_norm": 0.5001922249794006, + "learning_rate": 0.00022200956937799043, + "loss": 0.2424, + "step": 27870 + }, + { + "epoch": 44.47, + "grad_norm": 0.5164135098457336, + "learning_rate": 0.00022137161084529505, + "loss": 0.2952, + "step": 27880 + }, + { + "epoch": 44.48, + "grad_norm": 0.35648113489151, + "learning_rate": 0.00022073365231259966, + "loss": 0.2745, + "step": 27890 + }, + { + "epoch": 44.5, + "grad_norm": 0.6341779232025146, + "learning_rate": 0.00022009569377990433, + "loss": 0.3428, + "step": 27900 + }, + { + "epoch": 44.51, + "grad_norm": 0.5282499194145203, + "learning_rate": 0.00021945773524720895, + "loss": 0.3385, + "step": 27910 + }, + { + "epoch": 44.53, + "grad_norm": 0.34089717268943787, + "learning_rate": 0.00021881977671451357, + "loss": 0.3049, + "step": 27920 + }, + { + "epoch": 44.55, + "grad_norm": 0.44440943002700806, + "learning_rate": 0.00021818181818181818, + "loss": 0.3613, + "step": 27930 + }, + { + "epoch": 44.56, + "grad_norm": 0.3817773461341858, + "learning_rate": 0.00021754385964912283, + "loss": 0.3126, + "step": 27940 + }, + { + "epoch": 44.58, + "grad_norm": 0.24557062983512878, + "learning_rate": 0.00021690590111642745, + "loss": 0.2949, + "step": 27950 + }, + { + "epoch": 44.59, + "grad_norm": 0.7320693135261536, + "learning_rate": 0.00021626794258373206, + "loss": 0.3664, + "step": 27960 + }, + { + "epoch": 44.61, + "grad_norm": 0.4003210663795471, + "learning_rate": 0.00021562998405103668, + "loss": 0.3629, + "step": 27970 + }, + { + "epoch": 44.63, + "grad_norm": 0.30994275212287903, + "learning_rate": 0.00021499202551834132, + "loss": 0.2988, + "step": 27980 + }, + { + "epoch": 44.64, + "grad_norm": 0.2852626442909241, + "learning_rate": 0.00021435406698564594, + "loss": 0.2767, + "step": 27990 + }, + { + "epoch": 44.66, + "grad_norm": 0.2598101794719696, + "learning_rate": 0.00021371610845295056, + "loss": 0.3016, + "step": 28000 + }, + { + "epoch": 44.67, + "grad_norm": 0.1722613275051117, + "learning_rate": 0.00021307814992025517, + "loss": 0.1633, + "step": 28010 + }, + { + "epoch": 44.69, + "grad_norm": 0.2119804471731186, + "learning_rate": 0.00021244019138755982, + "loss": 0.2715, + "step": 28020 + }, + { + "epoch": 44.7, + "grad_norm": 0.27806442975997925, + "learning_rate": 0.00021180223285486443, + "loss": 0.2886, + "step": 28030 + }, + { + "epoch": 44.72, + "grad_norm": 0.47258105874061584, + "learning_rate": 0.00021116427432216905, + "loss": 0.2836, + "step": 28040 + }, + { + "epoch": 44.74, + "grad_norm": 0.295608252286911, + "learning_rate": 0.00021052631578947367, + "loss": 0.306, + "step": 28050 + }, + { + "epoch": 44.75, + "grad_norm": 0.2584683895111084, + "learning_rate": 0.0002098883572567783, + "loss": 0.3159, + "step": 28060 + }, + { + "epoch": 44.77, + "grad_norm": 0.41258344054222107, + "learning_rate": 0.00020925039872408293, + "loss": 0.3437, + "step": 28070 + }, + { + "epoch": 44.78, + "grad_norm": 0.13248884677886963, + "learning_rate": 0.00020861244019138755, + "loss": 0.3474, + "step": 28080 + }, + { + "epoch": 44.8, + "grad_norm": 0.2799845337867737, + "learning_rate": 0.0002079744816586922, + "loss": 0.3041, + "step": 28090 + }, + { + "epoch": 44.82, + "grad_norm": 0.34866270422935486, + "learning_rate": 0.00020733652312599683, + "loss": 0.2709, + "step": 28100 + }, + { + "epoch": 44.83, + "grad_norm": 0.22995953261852264, + "learning_rate": 0.00020669856459330145, + "loss": 0.3103, + "step": 28110 + }, + { + "epoch": 44.85, + "grad_norm": 0.6735871434211731, + "learning_rate": 0.00020606060606060607, + "loss": 0.3272, + "step": 28120 + }, + { + "epoch": 44.86, + "grad_norm": 0.8896424770355225, + "learning_rate": 0.00020542264752791068, + "loss": 0.2944, + "step": 28130 + }, + { + "epoch": 44.88, + "grad_norm": 0.12463853508234024, + "learning_rate": 0.00020478468899521533, + "loss": 0.3212, + "step": 28140 + }, + { + "epoch": 44.9, + "grad_norm": 0.29286473989486694, + "learning_rate": 0.00020414673046251995, + "loss": 0.2537, + "step": 28150 + }, + { + "epoch": 44.91, + "grad_norm": 0.4333782494068146, + "learning_rate": 0.00020350877192982456, + "loss": 0.2964, + "step": 28160 + }, + { + "epoch": 44.93, + "grad_norm": 0.25017327070236206, + "learning_rate": 0.00020287081339712918, + "loss": 0.3095, + "step": 28170 + }, + { + "epoch": 44.94, + "grad_norm": 0.18606650829315186, + "learning_rate": 0.00020223285486443382, + "loss": 0.2461, + "step": 28180 + }, + { + "epoch": 44.96, + "grad_norm": 0.14284665882587433, + "learning_rate": 0.00020159489633173844, + "loss": 0.2787, + "step": 28190 + }, + { + "epoch": 44.98, + "grad_norm": 0.6224771738052368, + "learning_rate": 0.00020095693779904306, + "loss": 0.3598, + "step": 28200 + }, + { + "epoch": 44.99, + "grad_norm": 0.32806506752967834, + "learning_rate": 0.00020031897926634767, + "loss": 0.2615, + "step": 28210 + }, + { + "epoch": 45.01, + "grad_norm": 0.45343583822250366, + "learning_rate": 0.00019968102073365232, + "loss": 0.3322, + "step": 28220 + }, + { + "epoch": 45.02, + "grad_norm": 0.18727990984916687, + "learning_rate": 0.00019904306220095693, + "loss": 0.2696, + "step": 28230 + }, + { + "epoch": 45.04, + "grad_norm": 0.28035393357276917, + "learning_rate": 0.00019840510366826155, + "loss": 0.256, + "step": 28240 + }, + { + "epoch": 45.06, + "grad_norm": 0.37490570545196533, + "learning_rate": 0.00019776714513556617, + "loss": 0.3105, + "step": 28250 + }, + { + "epoch": 45.07, + "grad_norm": 0.27727392315864563, + "learning_rate": 0.0001971291866028708, + "loss": 0.2573, + "step": 28260 + }, + { + "epoch": 45.09, + "grad_norm": 0.2856091856956482, + "learning_rate": 0.00019649122807017543, + "loss": 0.297, + "step": 28270 + }, + { + "epoch": 45.1, + "grad_norm": 0.3423827886581421, + "learning_rate": 0.00019585326953748007, + "loss": 0.3219, + "step": 28280 + }, + { + "epoch": 45.12, + "grad_norm": 0.2217862457036972, + "learning_rate": 0.0001952153110047847, + "loss": 0.2413, + "step": 28290 + }, + { + "epoch": 45.14, + "grad_norm": 0.49296557903289795, + "learning_rate": 0.00019457735247208933, + "loss": 0.2936, + "step": 28300 + }, + { + "epoch": 45.15, + "grad_norm": 0.28188827633857727, + "learning_rate": 0.00019393939393939395, + "loss": 0.2937, + "step": 28310 + }, + { + "epoch": 45.17, + "grad_norm": 0.6118289232254028, + "learning_rate": 0.00019330143540669857, + "loss": 0.3172, + "step": 28320 + }, + { + "epoch": 45.18, + "grad_norm": 0.38920632004737854, + "learning_rate": 0.0001926634768740032, + "loss": 0.3467, + "step": 28330 + }, + { + "epoch": 45.2, + "grad_norm": 0.2669709324836731, + "learning_rate": 0.00019202551834130783, + "loss": 0.2941, + "step": 28340 + }, + { + "epoch": 45.22, + "grad_norm": 0.17795272171497345, + "learning_rate": 0.00019138755980861245, + "loss": 0.3487, + "step": 28350 + }, + { + "epoch": 45.23, + "grad_norm": 0.3200840651988983, + "learning_rate": 0.00019074960127591706, + "loss": 0.3224, + "step": 28360 + }, + { + "epoch": 45.25, + "grad_norm": 0.3185681700706482, + "learning_rate": 0.0001901116427432217, + "loss": 0.2754, + "step": 28370 + }, + { + "epoch": 45.26, + "grad_norm": 0.35010969638824463, + "learning_rate": 0.00018947368421052632, + "loss": 0.2843, + "step": 28380 + }, + { + "epoch": 45.28, + "grad_norm": 0.19338567554950714, + "learning_rate": 0.00018883572567783094, + "loss": 0.3434, + "step": 28390 + }, + { + "epoch": 45.3, + "grad_norm": 0.13185134530067444, + "learning_rate": 0.00018819776714513556, + "loss": 0.2991, + "step": 28400 + }, + { + "epoch": 45.31, + "grad_norm": 0.2024078220129013, + "learning_rate": 0.0001875598086124402, + "loss": 0.3248, + "step": 28410 + }, + { + "epoch": 45.33, + "grad_norm": 0.22243604063987732, + "learning_rate": 0.00018692185007974482, + "loss": 0.2409, + "step": 28420 + }, + { + "epoch": 45.34, + "grad_norm": 0.5372808575630188, + "learning_rate": 0.00018628389154704943, + "loss": 0.2738, + "step": 28430 + }, + { + "epoch": 45.36, + "grad_norm": 0.17532573640346527, + "learning_rate": 0.00018564593301435405, + "loss": 0.2954, + "step": 28440 + }, + { + "epoch": 45.37, + "grad_norm": 0.2568674087524414, + "learning_rate": 0.0001850079744816587, + "loss": 0.2461, + "step": 28450 + }, + { + "epoch": 45.39, + "grad_norm": 0.36683690547943115, + "learning_rate": 0.00018437001594896334, + "loss": 0.301, + "step": 28460 + }, + { + "epoch": 45.41, + "grad_norm": 0.32988253235816956, + "learning_rate": 0.00018373205741626796, + "loss": 0.2522, + "step": 28470 + }, + { + "epoch": 45.42, + "grad_norm": 0.28334781527519226, + "learning_rate": 0.00018309409888357257, + "loss": 0.2795, + "step": 28480 + }, + { + "epoch": 45.44, + "grad_norm": 0.26257234811782837, + "learning_rate": 0.00018245614035087722, + "loss": 0.3357, + "step": 28490 + }, + { + "epoch": 45.45, + "grad_norm": 0.376924067735672, + "learning_rate": 0.00018181818181818183, + "loss": 0.3157, + "step": 28500 + }, + { + "epoch": 45.47, + "grad_norm": 0.06856755167245865, + "learning_rate": 0.00018118022328548645, + "loss": 0.3126, + "step": 28510 + }, + { + "epoch": 45.49, + "grad_norm": 0.3555695414543152, + "learning_rate": 0.00018054226475279107, + "loss": 0.2759, + "step": 28520 + }, + { + "epoch": 45.5, + "grad_norm": 0.44711726903915405, + "learning_rate": 0.0001799043062200957, + "loss": 0.3947, + "step": 28530 + }, + { + "epoch": 45.52, + "grad_norm": 0.5563350319862366, + "learning_rate": 0.00017926634768740033, + "loss": 0.3077, + "step": 28540 + }, + { + "epoch": 45.53, + "grad_norm": 0.22353103756904602, + "learning_rate": 0.00017862838915470495, + "loss": 0.319, + "step": 28550 + }, + { + "epoch": 45.55, + "grad_norm": 0.23482950031757355, + "learning_rate": 0.00017799043062200956, + "loss": 0.2164, + "step": 28560 + }, + { + "epoch": 45.57, + "grad_norm": 0.3976686894893646, + "learning_rate": 0.0001773524720893142, + "loss": 0.2903, + "step": 28570 + }, + { + "epoch": 45.58, + "grad_norm": 0.31743720173835754, + "learning_rate": 0.00017671451355661882, + "loss": 0.3855, + "step": 28580 + }, + { + "epoch": 45.6, + "grad_norm": 0.2157888561487198, + "learning_rate": 0.00017607655502392344, + "loss": 0.2198, + "step": 28590 + }, + { + "epoch": 45.61, + "grad_norm": 0.42237186431884766, + "learning_rate": 0.00017543859649122806, + "loss": 0.3859, + "step": 28600 + }, + { + "epoch": 45.63, + "grad_norm": 0.1533055305480957, + "learning_rate": 0.0001748006379585327, + "loss": 0.2198, + "step": 28610 + }, + { + "epoch": 45.65, + "grad_norm": 0.16389824450016022, + "learning_rate": 0.00017416267942583732, + "loss": 0.2789, + "step": 28620 + }, + { + "epoch": 45.66, + "grad_norm": 0.4902271032333374, + "learning_rate": 0.00017352472089314193, + "loss": 0.3184, + "step": 28630 + }, + { + "epoch": 45.68, + "grad_norm": 0.31961241364479065, + "learning_rate": 0.00017288676236044655, + "loss": 0.2653, + "step": 28640 + }, + { + "epoch": 45.69, + "grad_norm": 1.2578412294387817, + "learning_rate": 0.00017224880382775122, + "loss": 0.2537, + "step": 28650 + }, + { + "epoch": 45.71, + "grad_norm": 0.19706355035305023, + "learning_rate": 0.00017161084529505584, + "loss": 0.2666, + "step": 28660 + }, + { + "epoch": 45.73, + "grad_norm": 0.17647922039031982, + "learning_rate": 0.00017097288676236046, + "loss": 0.2909, + "step": 28670 + }, + { + "epoch": 45.74, + "grad_norm": 0.20171548426151276, + "learning_rate": 0.00017033492822966507, + "loss": 0.3303, + "step": 28680 + }, + { + "epoch": 45.76, + "grad_norm": 0.1995372623205185, + "learning_rate": 0.00016969696969696972, + "loss": 0.2621, + "step": 28690 + }, + { + "epoch": 45.77, + "grad_norm": 0.23527149856090546, + "learning_rate": 0.00016905901116427433, + "loss": 0.3213, + "step": 28700 + }, + { + "epoch": 45.79, + "grad_norm": 0.2143118530511856, + "learning_rate": 0.00016842105263157895, + "loss": 0.2584, + "step": 28710 + }, + { + "epoch": 45.81, + "grad_norm": 0.05645094811916351, + "learning_rate": 0.00016778309409888357, + "loss": 0.2731, + "step": 28720 + }, + { + "epoch": 45.82, + "grad_norm": 0.6314740777015686, + "learning_rate": 0.0001671451355661882, + "loss": 0.3243, + "step": 28730 + }, + { + "epoch": 45.84, + "grad_norm": 0.15495331585407257, + "learning_rate": 0.00016650717703349283, + "loss": 0.2936, + "step": 28740 + }, + { + "epoch": 45.85, + "grad_norm": 0.47223085165023804, + "learning_rate": 0.00016586921850079745, + "loss": 0.2497, + "step": 28750 + }, + { + "epoch": 45.87, + "grad_norm": 0.3611065447330475, + "learning_rate": 0.00016523125996810206, + "loss": 0.356, + "step": 28760 + }, + { + "epoch": 45.89, + "grad_norm": 0.38601604104042053, + "learning_rate": 0.0001645933014354067, + "loss": 0.2902, + "step": 28770 + }, + { + "epoch": 45.9, + "grad_norm": 0.36279168725013733, + "learning_rate": 0.00016395534290271132, + "loss": 0.2713, + "step": 28780 + }, + { + "epoch": 45.92, + "grad_norm": 0.15732410550117493, + "learning_rate": 0.00016331738437001594, + "loss": 0.323, + "step": 28790 + }, + { + "epoch": 45.93, + "grad_norm": 0.21142350137233734, + "learning_rate": 0.00016267942583732056, + "loss": 0.2383, + "step": 28800 + }, + { + "epoch": 45.95, + "grad_norm": 0.17822466790676117, + "learning_rate": 0.0001620414673046252, + "loss": 0.2362, + "step": 28810 + }, + { + "epoch": 45.96, + "grad_norm": 0.21047089993953705, + "learning_rate": 0.00016140350877192982, + "loss": 0.3138, + "step": 28820 + }, + { + "epoch": 45.98, + "grad_norm": 0.4122728109359741, + "learning_rate": 0.00016076555023923443, + "loss": 0.2932, + "step": 28830 + }, + { + "epoch": 46.0, + "grad_norm": 0.27697211503982544, + "learning_rate": 0.00016012759170653908, + "loss": 0.3276, + "step": 28840 + }, + { + "epoch": 46.01, + "grad_norm": 0.32727357745170593, + "learning_rate": 0.00015948963317384372, + "loss": 0.2403, + "step": 28850 + }, + { + "epoch": 46.03, + "grad_norm": 0.26314985752105713, + "learning_rate": 0.00015885167464114834, + "loss": 0.2484, + "step": 28860 + }, + { + "epoch": 46.04, + "grad_norm": 0.12502922117710114, + "learning_rate": 0.00015821371610845296, + "loss": 0.2411, + "step": 28870 + }, + { + "epoch": 46.06, + "grad_norm": 0.35499653220176697, + "learning_rate": 0.00015757575757575757, + "loss": 0.2731, + "step": 28880 + }, + { + "epoch": 46.08, + "grad_norm": 0.4001838266849518, + "learning_rate": 0.00015693779904306222, + "loss": 0.256, + "step": 28890 + }, + { + "epoch": 46.09, + "grad_norm": 0.49199333786964417, + "learning_rate": 0.00015629984051036683, + "loss": 0.3412, + "step": 28900 + }, + { + "epoch": 46.11, + "grad_norm": 0.22476720809936523, + "learning_rate": 0.00015566188197767145, + "loss": 0.2704, + "step": 28910 + }, + { + "epoch": 46.12, + "grad_norm": 0.42547646164894104, + "learning_rate": 0.00015502392344497607, + "loss": 0.3282, + "step": 28920 + }, + { + "epoch": 46.14, + "grad_norm": 0.14458052814006805, + "learning_rate": 0.0001543859649122807, + "loss": 0.3036, + "step": 28930 + }, + { + "epoch": 46.16, + "grad_norm": 0.23600299656391144, + "learning_rate": 0.00015374800637958533, + "loss": 0.3372, + "step": 28940 + }, + { + "epoch": 46.17, + "grad_norm": 0.31214261054992676, + "learning_rate": 0.00015311004784688995, + "loss": 0.2513, + "step": 28950 + }, + { + "epoch": 46.19, + "grad_norm": 0.6175329685211182, + "learning_rate": 0.00015247208931419456, + "loss": 0.297, + "step": 28960 + }, + { + "epoch": 46.2, + "grad_norm": 0.3160916864871979, + "learning_rate": 0.0001518341307814992, + "loss": 0.3279, + "step": 28970 + }, + { + "epoch": 46.22, + "grad_norm": 0.37880146503448486, + "learning_rate": 0.00015119617224880382, + "loss": 0.253, + "step": 28980 + }, + { + "epoch": 46.24, + "grad_norm": 0.16760538518428802, + "learning_rate": 0.00015055821371610844, + "loss": 0.2691, + "step": 28990 + }, + { + "epoch": 46.25, + "grad_norm": 0.34026291966438293, + "learning_rate": 0.00014992025518341306, + "loss": 0.4, + "step": 29000 + }, + { + "epoch": 46.27, + "grad_norm": 0.22685553133487701, + "learning_rate": 0.0001492822966507177, + "loss": 0.2779, + "step": 29010 + }, + { + "epoch": 46.28, + "grad_norm": 0.3551049530506134, + "learning_rate": 0.00014864433811802235, + "loss": 0.3183, + "step": 29020 + }, + { + "epoch": 46.3, + "grad_norm": 0.5112924575805664, + "learning_rate": 0.00014800637958532696, + "loss": 0.3357, + "step": 29030 + }, + { + "epoch": 46.32, + "grad_norm": 0.4620679020881653, + "learning_rate": 0.00014736842105263158, + "loss": 0.2737, + "step": 29040 + }, + { + "epoch": 46.33, + "grad_norm": 0.25304239988327026, + "learning_rate": 0.00014673046251993622, + "loss": 0.2402, + "step": 29050 + }, + { + "epoch": 46.35, + "grad_norm": 0.20634669065475464, + "learning_rate": 0.00014609250398724084, + "loss": 0.2407, + "step": 29060 + }, + { + "epoch": 46.36, + "grad_norm": 0.4903095066547394, + "learning_rate": 0.00014545454545454546, + "loss": 0.3264, + "step": 29070 + }, + { + "epoch": 46.38, + "grad_norm": 0.09498465806245804, + "learning_rate": 0.0001448165869218501, + "loss": 0.2885, + "step": 29080 + }, + { + "epoch": 46.4, + "grad_norm": 0.292100191116333, + "learning_rate": 0.00014417862838915472, + "loss": 0.3582, + "step": 29090 + }, + { + "epoch": 46.41, + "grad_norm": 0.23083628714084625, + "learning_rate": 0.00014354066985645933, + "loss": 0.2363, + "step": 29100 + }, + { + "epoch": 46.43, + "grad_norm": 0.3492584228515625, + "learning_rate": 0.00014290271132376395, + "loss": 0.3224, + "step": 29110 + }, + { + "epoch": 46.44, + "grad_norm": 0.5817916393280029, + "learning_rate": 0.0001422647527910686, + "loss": 0.3166, + "step": 29120 + }, + { + "epoch": 46.46, + "grad_norm": 0.3647211194038391, + "learning_rate": 0.0001416267942583732, + "loss": 0.2748, + "step": 29130 + }, + { + "epoch": 46.48, + "grad_norm": 0.46294817328453064, + "learning_rate": 0.00014098883572567783, + "loss": 0.3511, + "step": 29140 + }, + { + "epoch": 46.49, + "grad_norm": 0.09461899846792221, + "learning_rate": 0.00014035087719298245, + "loss": 0.2238, + "step": 29150 + }, + { + "epoch": 46.51, + "grad_norm": 0.3371366262435913, + "learning_rate": 0.0001397129186602871, + "loss": 0.314, + "step": 29160 + }, + { + "epoch": 46.52, + "grad_norm": 0.4762924313545227, + "learning_rate": 0.0001390749601275917, + "loss": 0.2712, + "step": 29170 + }, + { + "epoch": 46.54, + "grad_norm": 0.22956405580043793, + "learning_rate": 0.00013843700159489632, + "loss": 0.3876, + "step": 29180 + }, + { + "epoch": 46.56, + "grad_norm": 0.11843441426753998, + "learning_rate": 0.00013779904306220094, + "loss": 0.2649, + "step": 29190 + }, + { + "epoch": 46.57, + "grad_norm": 0.38579946756362915, + "learning_rate": 0.00013716108452950558, + "loss": 0.2696, + "step": 29200 + }, + { + "epoch": 46.59, + "grad_norm": 0.33288395404815674, + "learning_rate": 0.00013652312599681023, + "loss": 0.3079, + "step": 29210 + }, + { + "epoch": 46.6, + "grad_norm": 0.2519354224205017, + "learning_rate": 0.00013588516746411485, + "loss": 0.3065, + "step": 29220 + }, + { + "epoch": 46.62, + "grad_norm": 0.1389375776052475, + "learning_rate": 0.00013524720893141946, + "loss": 0.2599, + "step": 29230 + }, + { + "epoch": 46.63, + "grad_norm": 0.33025360107421875, + "learning_rate": 0.0001346092503987241, + "loss": 0.3191, + "step": 29240 + }, + { + "epoch": 46.65, + "grad_norm": 0.1264023780822754, + "learning_rate": 0.00013397129186602872, + "loss": 0.2695, + "step": 29250 + }, + { + "epoch": 46.67, + "grad_norm": 0.4093076288700104, + "learning_rate": 0.00013333333333333334, + "loss": 0.2471, + "step": 29260 + }, + { + "epoch": 46.68, + "grad_norm": 0.722159743309021, + "learning_rate": 0.00013269537480063796, + "loss": 0.28, + "step": 29270 + }, + { + "epoch": 46.7, + "grad_norm": 0.4530355632305145, + "learning_rate": 0.0001320574162679426, + "loss": 0.3053, + "step": 29280 + }, + { + "epoch": 46.71, + "grad_norm": 0.10739301890134811, + "learning_rate": 0.00013141945773524722, + "loss": 0.2983, + "step": 29290 + }, + { + "epoch": 46.73, + "grad_norm": 0.1723739355802536, + "learning_rate": 0.00013078149920255183, + "loss": 0.2239, + "step": 29300 + }, + { + "epoch": 46.75, + "grad_norm": 0.2270219624042511, + "learning_rate": 0.00013014354066985645, + "loss": 0.3093, + "step": 29310 + }, + { + "epoch": 46.76, + "grad_norm": 0.6445925831794739, + "learning_rate": 0.0001295055821371611, + "loss": 0.3212, + "step": 29320 + }, + { + "epoch": 46.78, + "grad_norm": 0.22848792374134064, + "learning_rate": 0.0001288676236044657, + "loss": 0.3183, + "step": 29330 + }, + { + "epoch": 46.79, + "grad_norm": 0.3686947822570801, + "learning_rate": 0.00012822966507177033, + "loss": 0.2628, + "step": 29340 + }, + { + "epoch": 46.81, + "grad_norm": 0.27950429916381836, + "learning_rate": 0.00012759170653907495, + "loss": 0.312, + "step": 29350 + }, + { + "epoch": 46.83, + "grad_norm": 0.13954879343509674, + "learning_rate": 0.0001269537480063796, + "loss": 0.315, + "step": 29360 + }, + { + "epoch": 46.84, + "grad_norm": 0.314480185508728, + "learning_rate": 0.0001263157894736842, + "loss": 0.3344, + "step": 29370 + }, + { + "epoch": 46.86, + "grad_norm": 0.3248406946659088, + "learning_rate": 0.00012567783094098882, + "loss": 0.3106, + "step": 29380 + }, + { + "epoch": 46.87, + "grad_norm": 0.3097328543663025, + "learning_rate": 0.00012503987240829344, + "loss": 0.2933, + "step": 29390 + }, + { + "epoch": 46.89, + "grad_norm": 0.4338608384132385, + "learning_rate": 0.00012440191387559808, + "loss": 0.2809, + "step": 29400 + }, + { + "epoch": 46.91, + "grad_norm": 0.35394251346588135, + "learning_rate": 0.00012376395534290273, + "loss": 0.3051, + "step": 29410 + }, + { + "epoch": 46.92, + "grad_norm": 0.07790148258209229, + "learning_rate": 0.00012312599681020735, + "loss": 0.3595, + "step": 29420 + }, + { + "epoch": 46.94, + "grad_norm": 0.2738390564918518, + "learning_rate": 0.00012248803827751196, + "loss": 0.2533, + "step": 29430 + }, + { + "epoch": 46.95, + "grad_norm": 0.19870556890964508, + "learning_rate": 0.00012185007974481659, + "loss": 0.2967, + "step": 29440 + }, + { + "epoch": 46.97, + "grad_norm": 0.15914097428321838, + "learning_rate": 0.00012121212121212122, + "loss": 0.3222, + "step": 29450 + }, + { + "epoch": 46.99, + "grad_norm": 0.22630850970745087, + "learning_rate": 0.00012057416267942584, + "loss": 0.2994, + "step": 29460 + }, + { + "epoch": 47.0, + "grad_norm": 0.32556214928627014, + "learning_rate": 0.00011993620414673047, + "loss": 0.2816, + "step": 29470 + }, + { + "epoch": 47.02, + "grad_norm": 0.274972528219223, + "learning_rate": 0.00011929824561403509, + "loss": 0.2624, + "step": 29480 + }, + { + "epoch": 47.03, + "grad_norm": 0.3284093737602234, + "learning_rate": 0.00011866028708133972, + "loss": 0.2596, + "step": 29490 + }, + { + "epoch": 47.05, + "grad_norm": 0.2033546268939972, + "learning_rate": 0.00011802232854864433, + "loss": 0.2444, + "step": 29500 + }, + { + "epoch": 47.07, + "grad_norm": 0.3881695866584778, + "learning_rate": 0.00011738437001594896, + "loss": 0.2642, + "step": 29510 + }, + { + "epoch": 47.08, + "grad_norm": 0.3856006860733032, + "learning_rate": 0.00011674641148325358, + "loss": 0.2641, + "step": 29520 + }, + { + "epoch": 47.1, + "grad_norm": 0.3555915355682373, + "learning_rate": 0.00011610845295055821, + "loss": 0.2547, + "step": 29530 + }, + { + "epoch": 47.11, + "grad_norm": 0.39494889974594116, + "learning_rate": 0.00011547049441786284, + "loss": 0.3229, + "step": 29540 + }, + { + "epoch": 47.13, + "grad_norm": 0.39036959409713745, + "learning_rate": 0.00011483253588516747, + "loss": 0.2395, + "step": 29550 + }, + { + "epoch": 47.15, + "grad_norm": 0.14146322011947632, + "learning_rate": 0.00011419457735247209, + "loss": 0.1988, + "step": 29560 + }, + { + "epoch": 47.16, + "grad_norm": 0.22183720767498016, + "learning_rate": 0.00011355661881977672, + "loss": 0.3234, + "step": 29570 + }, + { + "epoch": 47.18, + "grad_norm": 0.19865743815898895, + "learning_rate": 0.00011291866028708134, + "loss": 0.2811, + "step": 29580 + }, + { + "epoch": 47.19, + "grad_norm": 0.457445353269577, + "learning_rate": 0.00011228070175438597, + "loss": 0.2627, + "step": 29590 + }, + { + "epoch": 47.21, + "grad_norm": 0.28596189618110657, + "learning_rate": 0.00011164274322169058, + "loss": 0.2277, + "step": 29600 + }, + { + "epoch": 47.22, + "grad_norm": 0.4201318025588989, + "learning_rate": 0.00011100478468899521, + "loss": 0.3182, + "step": 29610 + }, + { + "epoch": 47.24, + "grad_norm": 0.31965920329093933, + "learning_rate": 0.00011036682615629983, + "loss": 0.2838, + "step": 29620 + }, + { + "epoch": 47.26, + "grad_norm": 0.10794230550527573, + "learning_rate": 0.00010972886762360448, + "loss": 0.2655, + "step": 29630 + }, + { + "epoch": 47.27, + "grad_norm": 0.13269487023353577, + "learning_rate": 0.00010909090909090909, + "loss": 0.3302, + "step": 29640 + }, + { + "epoch": 47.29, + "grad_norm": 0.5231210589408875, + "learning_rate": 0.00010845295055821372, + "loss": 0.3293, + "step": 29650 + }, + { + "epoch": 47.3, + "grad_norm": 0.158706933259964, + "learning_rate": 0.00010781499202551834, + "loss": 0.2234, + "step": 29660 + }, + { + "epoch": 47.32, + "grad_norm": 0.2540994882583618, + "learning_rate": 0.00010717703349282297, + "loss": 0.3066, + "step": 29670 + }, + { + "epoch": 47.34, + "grad_norm": 0.32114022970199585, + "learning_rate": 0.00010653907496012759, + "loss": 0.26, + "step": 29680 + }, + { + "epoch": 47.35, + "grad_norm": 0.14222322404384613, + "learning_rate": 0.00010590111642743222, + "loss": 0.1941, + "step": 29690 + }, + { + "epoch": 47.37, + "grad_norm": 0.33291783928871155, + "learning_rate": 0.00010526315789473683, + "loss": 0.3337, + "step": 29700 + }, + { + "epoch": 47.38, + "grad_norm": 0.21735547482967377, + "learning_rate": 0.00010462519936204146, + "loss": 0.2755, + "step": 29710 + }, + { + "epoch": 47.4, + "grad_norm": 0.37341004610061646, + "learning_rate": 0.0001039872408293461, + "loss": 0.2765, + "step": 29720 + }, + { + "epoch": 47.42, + "grad_norm": 0.13885751366615295, + "learning_rate": 0.00010334928229665073, + "loss": 0.3081, + "step": 29730 + }, + { + "epoch": 47.43, + "grad_norm": 0.6437707543373108, + "learning_rate": 0.00010271132376395534, + "loss": 0.322, + "step": 29740 + }, + { + "epoch": 47.45, + "grad_norm": 0.13305498659610748, + "learning_rate": 0.00010207336523125997, + "loss": 0.2947, + "step": 29750 + }, + { + "epoch": 47.46, + "grad_norm": 0.6117695569992065, + "learning_rate": 0.00010143540669856459, + "loss": 0.3218, + "step": 29760 + }, + { + "epoch": 47.48, + "grad_norm": 0.34142374992370605, + "learning_rate": 0.00010079744816586922, + "loss": 0.3295, + "step": 29770 + }, + { + "epoch": 47.5, + "grad_norm": 0.37447261810302734, + "learning_rate": 0.00010015948963317384, + "loss": 0.2824, + "step": 29780 + }, + { + "epoch": 47.51, + "grad_norm": 0.14651019871234894, + "learning_rate": 9.952153110047847e-05, + "loss": 0.2739, + "step": 29790 + }, + { + "epoch": 47.53, + "grad_norm": 0.14142945408821106, + "learning_rate": 9.888357256778308e-05, + "loss": 0.217, + "step": 29800 + }, + { + "epoch": 47.54, + "grad_norm": 0.3807011544704437, + "learning_rate": 9.824561403508771e-05, + "loss": 0.3054, + "step": 29810 + }, + { + "epoch": 47.56, + "grad_norm": 0.2842819392681122, + "learning_rate": 9.760765550239235e-05, + "loss": 0.3078, + "step": 29820 + }, + { + "epoch": 47.58, + "grad_norm": 0.7402997016906738, + "learning_rate": 9.696969696969698e-05, + "loss": 0.3118, + "step": 29830 + }, + { + "epoch": 47.59, + "grad_norm": 0.13281618058681488, + "learning_rate": 9.63317384370016e-05, + "loss": 0.3055, + "step": 29840 + }, + { + "epoch": 47.61, + "grad_norm": 0.3724515736103058, + "learning_rate": 9.569377990430622e-05, + "loss": 0.3274, + "step": 29850 + }, + { + "epoch": 47.62, + "grad_norm": 0.33854445815086365, + "learning_rate": 9.505582137161085e-05, + "loss": 0.2533, + "step": 29860 + }, + { + "epoch": 47.64, + "grad_norm": 0.42690280079841614, + "learning_rate": 9.441786283891547e-05, + "loss": 0.3262, + "step": 29870 + }, + { + "epoch": 47.66, + "grad_norm": 0.6151228547096252, + "learning_rate": 9.37799043062201e-05, + "loss": 0.2889, + "step": 29880 + }, + { + "epoch": 47.67, + "grad_norm": 0.26469776034355164, + "learning_rate": 9.314194577352472e-05, + "loss": 0.3036, + "step": 29890 + }, + { + "epoch": 47.69, + "grad_norm": 0.2703404426574707, + "learning_rate": 9.250398724082935e-05, + "loss": 0.2673, + "step": 29900 + }, + { + "epoch": 47.7, + "grad_norm": 0.3791040778160095, + "learning_rate": 9.186602870813398e-05, + "loss": 0.3244, + "step": 29910 + }, + { + "epoch": 47.72, + "grad_norm": 0.29400941729545593, + "learning_rate": 9.122807017543861e-05, + "loss": 0.2538, + "step": 29920 + }, + { + "epoch": 47.74, + "grad_norm": 0.4795028567314148, + "learning_rate": 9.059011164274323e-05, + "loss": 0.2482, + "step": 29930 + }, + { + "epoch": 47.75, + "grad_norm": 0.36813196539878845, + "learning_rate": 8.995215311004786e-05, + "loss": 0.2548, + "step": 29940 + }, + { + "epoch": 47.77, + "grad_norm": 0.22788019478321075, + "learning_rate": 8.931419457735247e-05, + "loss": 0.2961, + "step": 29950 + }, + { + "epoch": 47.78, + "grad_norm": 0.24274033308029175, + "learning_rate": 8.86762360446571e-05, + "loss": 0.2933, + "step": 29960 + }, + { + "epoch": 47.8, + "grad_norm": 0.4556421637535095, + "learning_rate": 8.803827751196172e-05, + "loss": 0.3562, + "step": 29970 + }, + { + "epoch": 47.81, + "grad_norm": 0.5702171921730042, + "learning_rate": 8.740031897926635e-05, + "loss": 0.3005, + "step": 29980 + }, + { + "epoch": 47.83, + "grad_norm": 0.32142460346221924, + "learning_rate": 8.676236044657097e-05, + "loss": 0.2663, + "step": 29990 + }, + { + "epoch": 47.85, + "grad_norm": 0.17863740026950836, + "learning_rate": 8.612440191387561e-05, + "loss": 0.2509, + "step": 30000 + }, + { + "epoch": 47.86, + "grad_norm": 0.09491372853517532, + "learning_rate": 8.548644338118023e-05, + "loss": 0.3576, + "step": 30010 + }, + { + "epoch": 47.88, + "grad_norm": 0.4455479085445404, + "learning_rate": 8.484848484848486e-05, + "loss": 0.4137, + "step": 30020 + }, + { + "epoch": 47.89, + "grad_norm": 0.33983567357063293, + "learning_rate": 8.421052631578948e-05, + "loss": 0.2838, + "step": 30030 + }, + { + "epoch": 47.91, + "grad_norm": 0.4801020622253418, + "learning_rate": 8.35725677830941e-05, + "loss": 0.2983, + "step": 30040 + }, + { + "epoch": 47.93, + "grad_norm": 0.33874234557151794, + "learning_rate": 8.293460925039872e-05, + "loss": 0.2689, + "step": 30050 + }, + { + "epoch": 47.94, + "grad_norm": 0.269828736782074, + "learning_rate": 8.229665071770335e-05, + "loss": 0.3258, + "step": 30060 + }, + { + "epoch": 47.96, + "grad_norm": 0.0987486019730568, + "learning_rate": 8.165869218500797e-05, + "loss": 0.2461, + "step": 30070 + }, + { + "epoch": 47.97, + "grad_norm": 0.3457973897457123, + "learning_rate": 8.10207336523126e-05, + "loss": 0.2783, + "step": 30080 + }, + { + "epoch": 47.99, + "grad_norm": 0.10124126076698303, + "learning_rate": 8.038277511961722e-05, + "loss": 0.2794, + "step": 30090 + }, + { + "epoch": 48.01, + "grad_norm": 0.40085652470588684, + "learning_rate": 7.974481658692186e-05, + "loss": 0.326, + "step": 30100 + }, + { + "epoch": 48.02, + "grad_norm": 0.184198796749115, + "learning_rate": 7.910685805422648e-05, + "loss": 0.2469, + "step": 30110 + }, + { + "epoch": 48.04, + "grad_norm": 0.2005092054605484, + "learning_rate": 7.846889952153111e-05, + "loss": 0.3044, + "step": 30120 + }, + { + "epoch": 48.05, + "grad_norm": 0.35767000913619995, + "learning_rate": 7.783094098883573e-05, + "loss": 0.2981, + "step": 30130 + }, + { + "epoch": 48.07, + "grad_norm": 0.38873291015625, + "learning_rate": 7.719298245614036e-05, + "loss": 0.3039, + "step": 30140 + }, + { + "epoch": 48.09, + "grad_norm": 0.22854940593242645, + "learning_rate": 7.655502392344497e-05, + "loss": 0.2068, + "step": 30150 + }, + { + "epoch": 48.1, + "grad_norm": 0.1659734845161438, + "learning_rate": 7.59170653907496e-05, + "loss": 0.277, + "step": 30160 + }, + { + "epoch": 48.12, + "grad_norm": 0.1869482696056366, + "learning_rate": 7.527910685805422e-05, + "loss": 0.2194, + "step": 30170 + }, + { + "epoch": 48.13, + "grad_norm": 0.08279826492071152, + "learning_rate": 7.464114832535885e-05, + "loss": 0.2824, + "step": 30180 + }, + { + "epoch": 48.15, + "grad_norm": 0.4725863039493561, + "learning_rate": 7.400318979266348e-05, + "loss": 0.2503, + "step": 30190 + }, + { + "epoch": 48.17, + "grad_norm": 0.172104611992836, + "learning_rate": 7.336523125996811e-05, + "loss": 0.2658, + "step": 30200 + }, + { + "epoch": 48.18, + "grad_norm": 0.21676242351531982, + "learning_rate": 7.272727272727273e-05, + "loss": 0.2658, + "step": 30210 + }, + { + "epoch": 48.2, + "grad_norm": 0.3602610230445862, + "learning_rate": 7.208931419457736e-05, + "loss": 0.2367, + "step": 30220 + }, + { + "epoch": 48.21, + "grad_norm": 0.3500073552131653, + "learning_rate": 7.145135566188198e-05, + "loss": 0.3014, + "step": 30230 + }, + { + "epoch": 48.23, + "grad_norm": 0.3083650469779968, + "learning_rate": 7.08133971291866e-05, + "loss": 0.3093, + "step": 30240 + }, + { + "epoch": 48.25, + "grad_norm": 0.20540174841880798, + "learning_rate": 7.017543859649122e-05, + "loss": 0.2626, + "step": 30250 + }, + { + "epoch": 48.26, + "grad_norm": 0.26233381032943726, + "learning_rate": 6.953748006379585e-05, + "loss": 0.3743, + "step": 30260 + }, + { + "epoch": 48.28, + "grad_norm": 0.6622065901756287, + "learning_rate": 6.889952153110047e-05, + "loss": 0.3261, + "step": 30270 + }, + { + "epoch": 48.29, + "grad_norm": 0.50579833984375, + "learning_rate": 6.826156299840511e-05, + "loss": 0.2812, + "step": 30280 + }, + { + "epoch": 48.31, + "grad_norm": 0.22522664070129395, + "learning_rate": 6.762360446570973e-05, + "loss": 0.247, + "step": 30290 + }, + { + "epoch": 48.33, + "grad_norm": 0.334440141916275, + "learning_rate": 6.698564593301436e-05, + "loss": 0.2827, + "step": 30300 + }, + { + "epoch": 48.34, + "grad_norm": 0.10422962158918381, + "learning_rate": 6.634768740031898e-05, + "loss": 0.2475, + "step": 30310 + }, + { + "epoch": 48.36, + "grad_norm": 0.409278005361557, + "learning_rate": 6.570972886762361e-05, + "loss": 0.3053, + "step": 30320 + }, + { + "epoch": 48.37, + "grad_norm": 0.15748478472232819, + "learning_rate": 6.507177033492823e-05, + "loss": 0.3292, + "step": 30330 + }, + { + "epoch": 48.39, + "grad_norm": 0.1966976523399353, + "learning_rate": 6.443381180223286e-05, + "loss": 0.2462, + "step": 30340 + }, + { + "epoch": 48.41, + "grad_norm": 0.34300366044044495, + "learning_rate": 6.379585326953747e-05, + "loss": 0.3215, + "step": 30350 + }, + { + "epoch": 48.42, + "grad_norm": 0.15784505009651184, + "learning_rate": 6.31578947368421e-05, + "loss": 0.2191, + "step": 30360 + }, + { + "epoch": 48.44, + "grad_norm": 0.1942838877439499, + "learning_rate": 6.251993620414672e-05, + "loss": 0.2964, + "step": 30370 + }, + { + "epoch": 48.45, + "grad_norm": 0.23638346791267395, + "learning_rate": 6.188197767145136e-05, + "loss": 0.2913, + "step": 30380 + }, + { + "epoch": 48.47, + "grad_norm": 0.18222945928573608, + "learning_rate": 6.124401913875598e-05, + "loss": 0.2863, + "step": 30390 + }, + { + "epoch": 48.48, + "grad_norm": 0.13442523777484894, + "learning_rate": 6.060606060606061e-05, + "loss": 0.3137, + "step": 30400 + }, + { + "epoch": 48.5, + "grad_norm": 0.09403583407402039, + "learning_rate": 5.9968102073365235e-05, + "loss": 0.2212, + "step": 30410 + }, + { + "epoch": 48.52, + "grad_norm": 0.21507516503334045, + "learning_rate": 5.933014354066986e-05, + "loss": 0.2613, + "step": 30420 + }, + { + "epoch": 48.53, + "grad_norm": 0.41693365573883057, + "learning_rate": 5.869218500797448e-05, + "loss": 0.2824, + "step": 30430 + }, + { + "epoch": 48.55, + "grad_norm": 0.23327617347240448, + "learning_rate": 5.8054226475279106e-05, + "loss": 0.3104, + "step": 30440 + }, + { + "epoch": 48.56, + "grad_norm": 0.6092672348022461, + "learning_rate": 5.7416267942583736e-05, + "loss": 0.309, + "step": 30450 + }, + { + "epoch": 48.58, + "grad_norm": 0.14301355183124542, + "learning_rate": 5.677830940988836e-05, + "loss": 0.2445, + "step": 30460 + }, + { + "epoch": 48.6, + "grad_norm": 0.42832037806510925, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.2872, + "step": 30470 + }, + { + "epoch": 48.61, + "grad_norm": 0.25466400384902954, + "learning_rate": 5.550239234449761e-05, + "loss": 0.2659, + "step": 30480 + }, + { + "epoch": 48.63, + "grad_norm": 0.2657581865787506, + "learning_rate": 5.486443381180224e-05, + "loss": 0.2374, + "step": 30490 + }, + { + "epoch": 48.64, + "grad_norm": 0.40479007363319397, + "learning_rate": 5.422647527910686e-05, + "loss": 0.3537, + "step": 30500 + }, + { + "epoch": 48.66, + "grad_norm": 0.28331390023231506, + "learning_rate": 5.3588516746411485e-05, + "loss": 0.3156, + "step": 30510 + }, + { + "epoch": 48.68, + "grad_norm": 0.27074429392814636, + "learning_rate": 5.295055821371611e-05, + "loss": 0.2869, + "step": 30520 + }, + { + "epoch": 48.69, + "grad_norm": 0.21443207561969757, + "learning_rate": 5.231259968102073e-05, + "loss": 0.2715, + "step": 30530 + }, + { + "epoch": 48.71, + "grad_norm": 0.28873592615127563, + "learning_rate": 5.167464114832536e-05, + "loss": 0.283, + "step": 30540 + }, + { + "epoch": 48.72, + "grad_norm": 0.2248823344707489, + "learning_rate": 5.1036682615629986e-05, + "loss": 0.2609, + "step": 30550 + }, + { + "epoch": 48.74, + "grad_norm": 0.16412192583084106, + "learning_rate": 5.039872408293461e-05, + "loss": 0.2673, + "step": 30560 + }, + { + "epoch": 48.76, + "grad_norm": 0.37860092520713806, + "learning_rate": 4.9760765550239234e-05, + "loss": 0.2795, + "step": 30570 + }, + { + "epoch": 48.77, + "grad_norm": 0.5846998691558838, + "learning_rate": 4.912280701754386e-05, + "loss": 0.3345, + "step": 30580 + }, + { + "epoch": 48.79, + "grad_norm": 0.4207826554775238, + "learning_rate": 4.848484848484849e-05, + "loss": 0.2574, + "step": 30590 + }, + { + "epoch": 48.8, + "grad_norm": 0.2351989895105362, + "learning_rate": 4.784688995215311e-05, + "loss": 0.2994, + "step": 30600 + }, + { + "epoch": 48.82, + "grad_norm": 0.29773497581481934, + "learning_rate": 4.7208931419457735e-05, + "loss": 0.3491, + "step": 30610 + }, + { + "epoch": 48.84, + "grad_norm": 0.3682696521282196, + "learning_rate": 4.657097288676236e-05, + "loss": 0.2976, + "step": 30620 + }, + { + "epoch": 48.85, + "grad_norm": 0.33122923970222473, + "learning_rate": 4.593301435406699e-05, + "loss": 0.314, + "step": 30630 + }, + { + "epoch": 48.87, + "grad_norm": 0.3438310921192169, + "learning_rate": 4.529505582137161e-05, + "loss": 0.3169, + "step": 30640 + }, + { + "epoch": 48.88, + "grad_norm": 0.32344672083854675, + "learning_rate": 4.4657097288676236e-05, + "loss": 0.2184, + "step": 30650 + }, + { + "epoch": 48.9, + "grad_norm": 0.4275621771812439, + "learning_rate": 4.401913875598086e-05, + "loss": 0.3592, + "step": 30660 + }, + { + "epoch": 48.92, + "grad_norm": 0.514369785785675, + "learning_rate": 4.3381180223285484e-05, + "loss": 0.2393, + "step": 30670 + }, + { + "epoch": 48.93, + "grad_norm": 0.23344865441322327, + "learning_rate": 4.2743221690590114e-05, + "loss": 0.3637, + "step": 30680 + }, + { + "epoch": 48.95, + "grad_norm": 0.2496626079082489, + "learning_rate": 4.210526315789474e-05, + "loss": 0.3157, + "step": 30690 + }, + { + "epoch": 48.96, + "grad_norm": 0.15069235861301422, + "learning_rate": 4.146730462519936e-05, + "loss": 0.2731, + "step": 30700 + }, + { + "epoch": 48.98, + "grad_norm": 0.5047960877418518, + "learning_rate": 4.0829346092503985e-05, + "loss": 0.2811, + "step": 30710 + }, + { + "epoch": 49.0, + "grad_norm": 0.34830254316329956, + "learning_rate": 4.019138755980861e-05, + "loss": 0.2925, + "step": 30720 + }, + { + "epoch": 49.01, + "grad_norm": 0.4893124997615814, + "learning_rate": 3.955342902711324e-05, + "loss": 0.308, + "step": 30730 + }, + { + "epoch": 49.03, + "grad_norm": 0.3630107045173645, + "learning_rate": 3.891547049441786e-05, + "loss": 0.2671, + "step": 30740 + }, + { + "epoch": 49.04, + "grad_norm": 0.16974857449531555, + "learning_rate": 3.8277511961722486e-05, + "loss": 0.3295, + "step": 30750 + }, + { + "epoch": 49.06, + "grad_norm": 0.34105682373046875, + "learning_rate": 3.763955342902711e-05, + "loss": 0.2895, + "step": 30760 + }, + { + "epoch": 49.07, + "grad_norm": 0.47773271799087524, + "learning_rate": 3.700159489633174e-05, + "loss": 0.2591, + "step": 30770 + }, + { + "epoch": 49.09, + "grad_norm": 0.3436296582221985, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.2138, + "step": 30780 + }, + { + "epoch": 49.11, + "grad_norm": 0.1262790709733963, + "learning_rate": 3.572567783094099e-05, + "loss": 0.2472, + "step": 30790 + }, + { + "epoch": 49.12, + "grad_norm": 0.2755976915359497, + "learning_rate": 3.508771929824561e-05, + "loss": 0.2532, + "step": 30800 + }, + { + "epoch": 49.14, + "grad_norm": 0.16442789137363434, + "learning_rate": 3.4449760765550235e-05, + "loss": 0.2813, + "step": 30810 + }, + { + "epoch": 49.15, + "grad_norm": 0.29541754722595215, + "learning_rate": 3.3811802232854866e-05, + "loss": 0.3036, + "step": 30820 + }, + { + "epoch": 49.17, + "grad_norm": 0.07406118512153625, + "learning_rate": 3.317384370015949e-05, + "loss": 0.2552, + "step": 30830 + }, + { + "epoch": 49.19, + "grad_norm": 0.413967102766037, + "learning_rate": 3.253588516746411e-05, + "loss": 0.3118, + "step": 30840 + }, + { + "epoch": 49.2, + "grad_norm": 0.567054808139801, + "learning_rate": 3.1897926634768736e-05, + "loss": 0.259, + "step": 30850 + }, + { + "epoch": 49.22, + "grad_norm": 0.19133225083351135, + "learning_rate": 3.125996810207336e-05, + "loss": 0.216, + "step": 30860 + }, + { + "epoch": 49.23, + "grad_norm": 0.35869938135147095, + "learning_rate": 3.062200956937799e-05, + "loss": 0.3244, + "step": 30870 + }, + { + "epoch": 49.25, + "grad_norm": 0.3546787202358246, + "learning_rate": 2.9984051036682618e-05, + "loss": 0.3345, + "step": 30880 + }, + { + "epoch": 49.27, + "grad_norm": 0.3473091721534729, + "learning_rate": 2.934609250398724e-05, + "loss": 0.2133, + "step": 30890 + }, + { + "epoch": 49.28, + "grad_norm": 0.4771929979324341, + "learning_rate": 2.8708133971291868e-05, + "loss": 0.3053, + "step": 30900 + }, + { + "epoch": 49.3, + "grad_norm": 0.3776096701622009, + "learning_rate": 2.8070175438596492e-05, + "loss": 0.2765, + "step": 30910 + }, + { + "epoch": 49.31, + "grad_norm": 0.2937834560871124, + "learning_rate": 2.743221690590112e-05, + "loss": 0.2581, + "step": 30920 + }, + { + "epoch": 49.33, + "grad_norm": 0.2534268796443939, + "learning_rate": 2.6794258373205743e-05, + "loss": 0.2985, + "step": 30930 + }, + { + "epoch": 49.35, + "grad_norm": 0.18742942810058594, + "learning_rate": 2.6156299840510366e-05, + "loss": 0.2159, + "step": 30940 + }, + { + "epoch": 49.36, + "grad_norm": 0.3918183147907257, + "learning_rate": 2.5518341307814993e-05, + "loss": 0.3054, + "step": 30950 + }, + { + "epoch": 49.38, + "grad_norm": 0.33097043633461, + "learning_rate": 2.4880382775119617e-05, + "loss": 0.3201, + "step": 30960 + }, + { + "epoch": 49.39, + "grad_norm": 0.37174108624458313, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.2708, + "step": 30970 + }, + { + "epoch": 49.41, + "grad_norm": 0.27249741554260254, + "learning_rate": 2.3604465709728868e-05, + "loss": 0.2347, + "step": 30980 + }, + { + "epoch": 49.43, + "grad_norm": 0.7410305738449097, + "learning_rate": 2.2966507177033495e-05, + "loss": 0.3456, + "step": 30990 + }, + { + "epoch": 49.44, + "grad_norm": 0.4471137225627899, + "learning_rate": 2.2328548644338118e-05, + "loss": 0.2472, + "step": 31000 + } + ], + "logging_steps": 10, + "max_steps": 31350, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "total_flos": 8.371975248433152e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}