{ "best_metric": null, "best_model_checkpoint": null, "epoch": 494.7368421052632, "eval_steps": 500, "global_step": 23500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21052631578947367, "grad_norm": 0.6672950983047485, "learning_rate": 0.0001999999106418929, "loss": 2.6441, "step": 10 }, { "epoch": 0.42105263157894735, "grad_norm": 0.7060385346412659, "learning_rate": 0.00019999964256773125, "loss": 2.4667, "step": 20 }, { "epoch": 0.631578947368421, "grad_norm": 1.0146973133087158, "learning_rate": 0.00019999919577799412, "loss": 2.1191, "step": 30 }, { "epoch": 0.8421052631578947, "grad_norm": 0.9969562292098999, "learning_rate": 0.00019999857027348008, "loss": 1.7304, "step": 40 }, { "epoch": 1.0526315789473684, "grad_norm": 1.6712026596069336, "learning_rate": 0.0001999977660553069, "loss": 1.5827, "step": 50 }, { "epoch": 1.263157894736842, "grad_norm": 1.2836483716964722, "learning_rate": 0.00019999678312491195, "loss": 1.3176, "step": 60 }, { "epoch": 1.4736842105263157, "grad_norm": 1.425754189491272, "learning_rate": 0.00019999562148405184, "loss": 1.1686, "step": 70 }, { "epoch": 1.6842105263157894, "grad_norm": 1.4164149761199951, "learning_rate": 0.00019999428113480258, "loss": 0.9951, "step": 80 }, { "epoch": 1.8947368421052633, "grad_norm": 1.2195743322372437, "learning_rate": 0.00019999276207955963, "loss": 0.858, "step": 90 }, { "epoch": 2.1052631578947367, "grad_norm": 1.5156681537628174, "learning_rate": 0.0001999910643210378, "loss": 0.8601, "step": 100 }, { "epoch": 2.3157894736842106, "grad_norm": 1.2401676177978516, "learning_rate": 0.00019998918786227123, "loss": 0.8665, "step": 110 }, { "epoch": 2.526315789473684, "grad_norm": 1.550382137298584, "learning_rate": 0.0001999871327066135, "loss": 0.7616, "step": 120 }, { "epoch": 2.736842105263158, "grad_norm": 1.254746437072754, "learning_rate": 0.00019998489885773743, "loss": 0.7674, "step": 130 }, { "epoch": 
2.9473684210526314, "grad_norm": 1.6343107223510742, "learning_rate": 0.00019998248631963533, "loss": 0.7868, "step": 140 }, { "epoch": 3.1578947368421053, "grad_norm": 1.5750573873519897, "learning_rate": 0.0001999798950966188, "loss": 0.7374, "step": 150 }, { "epoch": 3.3684210526315788, "grad_norm": 1.6087182760238647, "learning_rate": 0.0001999771251933187, "loss": 0.7236, "step": 160 }, { "epoch": 3.5789473684210527, "grad_norm": 1.918428659439087, "learning_rate": 0.0001999741766146854, "loss": 0.711, "step": 170 }, { "epoch": 3.7894736842105265, "grad_norm": 1.64993155002594, "learning_rate": 0.00019997137013085418, "loss": 0.6914, "step": 180 }, { "epoch": 4.0, "grad_norm": 2.8061554431915283, "learning_rate": 0.00019996808208386932, "loss": 0.6858, "step": 190 }, { "epoch": 4.2105263157894735, "grad_norm": 1.626227617263794, "learning_rate": 0.00019996461537771275, "loss": 0.6806, "step": 200 }, { "epoch": 4.421052631578947, "grad_norm": 1.505091905593872, "learning_rate": 0.00019996097001857995, "loss": 0.6467, "step": 210 }, { "epoch": 4.631578947368421, "grad_norm": 2.134430170059204, "learning_rate": 0.00019995714601298584, "loss": 0.5943, "step": 220 }, { "epoch": 4.842105263157895, "grad_norm": 1.6179273128509521, "learning_rate": 0.00019995314336776452, "loss": 0.6368, "step": 230 }, { "epoch": 5.052631578947368, "grad_norm": 1.7538886070251465, "learning_rate": 0.00019994896209006932, "loss": 0.5949, "step": 240 }, { "epoch": 5.2631578947368425, "grad_norm": 1.7176129817962646, "learning_rate": 0.00019994460218737293, "loss": 0.5982, "step": 250 }, { "epoch": 5.473684210526316, "grad_norm": 1.5106130838394165, "learning_rate": 0.00019994006366746723, "loss": 0.6008, "step": 260 }, { "epoch": 5.684210526315789, "grad_norm": 1.4125791788101196, "learning_rate": 0.00019993534653846317, "loss": 0.5772, "step": 270 }, { "epoch": 5.894736842105263, "grad_norm": 1.6951205730438232, "learning_rate": 0.0001999304508087911, "loss": 0.6063, "step": 280 }, { 
"epoch": 6.105263157894737, "grad_norm": 2.1900129318237305, "learning_rate": 0.0001999253764872005, "loss": 0.5803, "step": 290 }, { "epoch": 6.315789473684211, "grad_norm": 1.5468244552612305, "learning_rate": 0.00019992012358276, "loss": 0.5515, "step": 300 }, { "epoch": 6.526315789473684, "grad_norm": 1.7162660360336304, "learning_rate": 0.00019991469210485732, "loss": 0.5634, "step": 310 }, { "epoch": 6.7368421052631575, "grad_norm": 1.6360913515090942, "learning_rate": 0.00019990908206319948, "loss": 0.5589, "step": 320 }, { "epoch": 6.947368421052632, "grad_norm": 2.227215051651001, "learning_rate": 0.0001999032934678125, "loss": 0.5383, "step": 330 }, { "epoch": 7.157894736842105, "grad_norm": 2.0045015811920166, "learning_rate": 0.0001998973263290415, "loss": 0.5386, "step": 340 }, { "epoch": 7.368421052631579, "grad_norm": 1.3294235467910767, "learning_rate": 0.0001998911806575508, "loss": 0.5038, "step": 350 }, { "epoch": 7.578947368421053, "grad_norm": 1.959319829940796, "learning_rate": 0.00019988485646432364, "loss": 0.5292, "step": 360 }, { "epoch": 7.7894736842105265, "grad_norm": 2.2458395957946777, "learning_rate": 0.00019987835376066243, "loss": 0.4982, "step": 370 }, { "epoch": 8.0, "grad_norm": 2.295234203338623, "learning_rate": 0.00019987167255818854, "loss": 0.5044, "step": 380 }, { "epoch": 8.210526315789474, "grad_norm": 2.3475089073181152, "learning_rate": 0.00019986481286884234, "loss": 0.476, "step": 390 }, { "epoch": 8.421052631578947, "grad_norm": 1.692319393157959, "learning_rate": 0.00019985777470488326, "loss": 0.5153, "step": 400 }, { "epoch": 8.631578947368421, "grad_norm": 1.9400404691696167, "learning_rate": 0.00019985055807888958, "loss": 0.5034, "step": 410 }, { "epoch": 8.842105263157894, "grad_norm": 2.033897876739502, "learning_rate": 0.00019984316300375863, "loss": 0.505, "step": 420 }, { "epoch": 9.052631578947368, "grad_norm": 1.9000393152236938, "learning_rate": 0.00019983558949270656, "loss": 0.5238, "step": 430 }, { 
"epoch": 9.263157894736842, "grad_norm": 2.177555561065674, "learning_rate": 0.0001998278375592685, "loss": 0.4655, "step": 440 }, { "epoch": 9.473684210526315, "grad_norm": 1.841349720954895, "learning_rate": 0.00019981990721729842, "loss": 0.4829, "step": 450 }, { "epoch": 9.68421052631579, "grad_norm": 2.5707108974456787, "learning_rate": 0.0001998117984809691, "loss": 0.4808, "step": 460 }, { "epoch": 9.894736842105264, "grad_norm": 1.9689738750457764, "learning_rate": 0.00019980351136477214, "loss": 0.476, "step": 470 }, { "epoch": 10.105263157894736, "grad_norm": 2.5027239322662354, "learning_rate": 0.000199795045883518, "loss": 0.4365, "step": 480 }, { "epoch": 10.31578947368421, "grad_norm": 2.014554500579834, "learning_rate": 0.0001997864020523359, "loss": 0.431, "step": 490 }, { "epoch": 10.526315789473685, "grad_norm": 1.5790047645568848, "learning_rate": 0.0001997775798866737, "loss": 0.4584, "step": 500 }, { "epoch": 10.736842105263158, "grad_norm": 1.7680073976516724, "learning_rate": 0.00019976857940229807, "loss": 0.4736, "step": 510 }, { "epoch": 10.947368421052632, "grad_norm": 2.524589776992798, "learning_rate": 0.00019975940061529434, "loss": 0.5041, "step": 520 }, { "epoch": 11.157894736842104, "grad_norm": 2.058872699737549, "learning_rate": 0.00019975004354206647, "loss": 0.4339, "step": 530 }, { "epoch": 11.368421052631579, "grad_norm": 2.1387035846710205, "learning_rate": 0.00019974050819933709, "loss": 0.4611, "step": 540 }, { "epoch": 11.578947368421053, "grad_norm": 2.866647720336914, "learning_rate": 0.00019973079460414744, "loss": 0.4361, "step": 550 }, { "epoch": 11.789473684210526, "grad_norm": 2.1138830184936523, "learning_rate": 0.0001997209027738572, "loss": 0.4475, "step": 560 }, { "epoch": 12.0, "grad_norm": 2.3789708614349365, "learning_rate": 0.00019971083272614474, "loss": 0.4515, "step": 570 }, { "epoch": 12.210526315789474, "grad_norm": 1.8684824705123901, "learning_rate": 0.00019970058447900684, "loss": 0.3823, "step": 580 
}, { "epoch": 12.421052631578947, "grad_norm": 2.0576510429382324, "learning_rate": 0.00019969015805075877, "loss": 0.4218, "step": 590 }, { "epoch": 12.631578947368421, "grad_norm": 2.553516387939453, "learning_rate": 0.00019967955346003428, "loss": 0.4429, "step": 600 }, { "epoch": 12.842105263157894, "grad_norm": 2.1283926963806152, "learning_rate": 0.00019966877072578548, "loss": 0.4622, "step": 610 }, { "epoch": 13.052631578947368, "grad_norm": 2.276144504547119, "learning_rate": 0.00019965780986728286, "loss": 0.4225, "step": 620 }, { "epoch": 13.263157894736842, "grad_norm": 1.8524142503738403, "learning_rate": 0.00019964667090411524, "loss": 0.4001, "step": 630 }, { "epoch": 13.473684210526315, "grad_norm": 1.7885457277297974, "learning_rate": 0.0001996353538561898, "loss": 0.3619, "step": 640 }, { "epoch": 13.68421052631579, "grad_norm": 2.346691370010376, "learning_rate": 0.0001996238587437319, "loss": 0.4234, "step": 650 }, { "epoch": 13.894736842105264, "grad_norm": 2.195350170135498, "learning_rate": 0.00019961218558728515, "loss": 0.4107, "step": 660 }, { "epoch": 14.105263157894736, "grad_norm": 1.7246651649475098, "learning_rate": 0.00019960033440771143, "loss": 0.4447, "step": 670 }, { "epoch": 14.31578947368421, "grad_norm": 1.8804007768630981, "learning_rate": 0.00019958830522619065, "loss": 0.3807, "step": 680 }, { "epoch": 14.526315789473685, "grad_norm": 1.7152245044708252, "learning_rate": 0.000199576098064221, "loss": 0.3791, "step": 690 }, { "epoch": 14.736842105263158, "grad_norm": 1.8070579767227173, "learning_rate": 0.00019956371294361857, "loss": 0.3976, "step": 700 }, { "epoch": 14.947368421052632, "grad_norm": 1.8141262531280518, "learning_rate": 0.00019955114988651765, "loss": 0.4205, "step": 710 }, { "epoch": 15.157894736842104, "grad_norm": 1.9717251062393188, "learning_rate": 0.00019953840891537037, "loss": 0.3989, "step": 720 }, { "epoch": 15.368421052631579, "grad_norm": 2.3383822441101074, "learning_rate": 0.000199525490052947, 
"loss": 0.3747, "step": 730 }, { "epoch": 15.578947368421053, "grad_norm": 2.188035726547241, "learning_rate": 0.0001995123933223356, "loss": 0.3734, "step": 740 }, { "epoch": 15.789473684210526, "grad_norm": 2.967191457748413, "learning_rate": 0.00019949911874694217, "loss": 0.3857, "step": 750 }, { "epoch": 16.0, "grad_norm": 2.172882080078125, "learning_rate": 0.0001994856663504905, "loss": 0.417, "step": 760 }, { "epoch": 16.210526315789473, "grad_norm": 2.480196952819824, "learning_rate": 0.00019947203615702224, "loss": 0.3707, "step": 770 }, { "epoch": 16.42105263157895, "grad_norm": 1.7600008249282837, "learning_rate": 0.0001994582281908967, "loss": 0.3799, "step": 780 }, { "epoch": 16.63157894736842, "grad_norm": 1.875954031944275, "learning_rate": 0.00019944424247679102, "loss": 0.365, "step": 790 }, { "epoch": 16.842105263157894, "grad_norm": 1.9729390144348145, "learning_rate": 0.0001994300790396999, "loss": 0.3698, "step": 800 }, { "epoch": 17.05263157894737, "grad_norm": 1.9465049505233765, "learning_rate": 0.0001994157379049357, "loss": 0.3945, "step": 810 }, { "epoch": 17.263157894736842, "grad_norm": 2.3426170349121094, "learning_rate": 0.00019940121909812838, "loss": 0.3627, "step": 820 }, { "epoch": 17.473684210526315, "grad_norm": 1.4503591060638428, "learning_rate": 0.00019938652264522538, "loss": 0.3709, "step": 830 }, { "epoch": 17.68421052631579, "grad_norm": 2.0927889347076416, "learning_rate": 0.00019937164857249164, "loss": 0.358, "step": 840 }, { "epoch": 17.894736842105264, "grad_norm": 2.4150617122650146, "learning_rate": 0.00019935659690650955, "loss": 0.3579, "step": 850 }, { "epoch": 18.105263157894736, "grad_norm": 1.8455313444137573, "learning_rate": 0.00019934136767417888, "loss": 0.3693, "step": 860 }, { "epoch": 18.31578947368421, "grad_norm": 2.0851595401763916, "learning_rate": 0.00019932596090271672, "loss": 0.3305, "step": 870 }, { "epoch": 18.526315789473685, "grad_norm": 2.065988779067993, "learning_rate": 
0.00019931037661965749, "loss": 0.362, "step": 880 }, { "epoch": 18.736842105263158, "grad_norm": 2.07012939453125, "learning_rate": 0.0001992946148528528, "loss": 0.3819, "step": 890 }, { "epoch": 18.94736842105263, "grad_norm": 1.9075250625610352, "learning_rate": 0.00019927867563047152, "loss": 0.3557, "step": 900 }, { "epoch": 19.157894736842106, "grad_norm": 1.9468849897384644, "learning_rate": 0.0001992625589809996, "loss": 0.32, "step": 910 }, { "epoch": 19.36842105263158, "grad_norm": 2.025853395462036, "learning_rate": 0.00019924626493324015, "loss": 0.3372, "step": 920 }, { "epoch": 19.57894736842105, "grad_norm": 1.625706434249878, "learning_rate": 0.00019922979351631322, "loss": 0.3378, "step": 930 }, { "epoch": 19.789473684210527, "grad_norm": 1.993627667427063, "learning_rate": 0.0001992131447596559, "loss": 0.3488, "step": 940 }, { "epoch": 20.0, "grad_norm": 1.914183497428894, "learning_rate": 0.00019919631869302226, "loss": 0.3836, "step": 950 }, { "epoch": 20.210526315789473, "grad_norm": 1.5059126615524292, "learning_rate": 0.0001991793153464832, "loss": 0.331, "step": 960 }, { "epoch": 20.42105263157895, "grad_norm": 1.9410295486450195, "learning_rate": 0.00019916213475042644, "loss": 0.3474, "step": 970 }, { "epoch": 20.63157894736842, "grad_norm": 2.240558624267578, "learning_rate": 0.00019914477693555647, "loss": 0.3364, "step": 980 }, { "epoch": 20.842105263157894, "grad_norm": 1.5858815908432007, "learning_rate": 0.00019912724193289457, "loss": 0.3417, "step": 990 }, { "epoch": 21.05263157894737, "grad_norm": 2.031370162963867, "learning_rate": 0.00019910952977377863, "loss": 0.3281, "step": 1000 }, { "epoch": 21.263157894736842, "grad_norm": 1.5948817729949951, "learning_rate": 0.0001990916404898631, "loss": 0.3147, "step": 1010 }, { "epoch": 21.473684210526315, "grad_norm": 1.7134989500045776, "learning_rate": 0.00019907357411311907, "loss": 0.3325, "step": 1020 }, { "epoch": 21.68421052631579, "grad_norm": 1.8072121143341064, 
"learning_rate": 0.0001990553306758341, "loss": 0.3453, "step": 1030 }, { "epoch": 21.894736842105264, "grad_norm": 2.8281702995300293, "learning_rate": 0.00019903691021061213, "loss": 0.3514, "step": 1040 }, { "epoch": 22.105263157894736, "grad_norm": 2.1076014041900635, "learning_rate": 0.00019901831275037353, "loss": 0.3241, "step": 1050 }, { "epoch": 22.31578947368421, "grad_norm": 2.392726421356201, "learning_rate": 0.00019899953832835498, "loss": 0.3078, "step": 1060 }, { "epoch": 22.526315789473685, "grad_norm": 2.0120134353637695, "learning_rate": 0.00019898058697810945, "loss": 0.3438, "step": 1070 }, { "epoch": 22.736842105263158, "grad_norm": 1.7498539686203003, "learning_rate": 0.000198961458733506, "loss": 0.3152, "step": 1080 }, { "epoch": 22.94736842105263, "grad_norm": 1.975158452987671, "learning_rate": 0.00019894215362872996, "loss": 0.3443, "step": 1090 }, { "epoch": 23.157894736842106, "grad_norm": 1.8120285272598267, "learning_rate": 0.0001989226716982827, "loss": 0.3078, "step": 1100 }, { "epoch": 23.36842105263158, "grad_norm": 2.1680185794830322, "learning_rate": 0.0001989030129769815, "loss": 0.2995, "step": 1110 }, { "epoch": 23.57894736842105, "grad_norm": 1.811830997467041, "learning_rate": 0.00019888317749995978, "loss": 0.3232, "step": 1120 }, { "epoch": 23.789473684210527, "grad_norm": 1.6087640523910522, "learning_rate": 0.00019886316530266673, "loss": 0.3307, "step": 1130 }, { "epoch": 24.0, "grad_norm": 2.173571825027466, "learning_rate": 0.00019884297642086736, "loss": 0.3286, "step": 1140 }, { "epoch": 24.210526315789473, "grad_norm": 1.9915390014648438, "learning_rate": 0.0001988226108906425, "loss": 0.287, "step": 1150 }, { "epoch": 24.42105263157895, "grad_norm": 1.5517760515213013, "learning_rate": 0.0001988020687483886, "loss": 0.3169, "step": 1160 }, { "epoch": 24.63157894736842, "grad_norm": 1.8992213010787964, "learning_rate": 0.0001987813500308179, "loss": 0.3393, "step": 1170 }, { "epoch": 24.842105263157894, 
"grad_norm": 2.124034881591797, "learning_rate": 0.00019876045477495804, "loss": 0.3214, "step": 1180 }, { "epoch": 25.05263157894737, "grad_norm": 1.5663875341415405, "learning_rate": 0.00019873938301815224, "loss": 0.3076, "step": 1190 }, { "epoch": 25.263157894736842, "grad_norm": 2.0292165279388428, "learning_rate": 0.00019871813479805915, "loss": 0.3133, "step": 1200 }, { "epoch": 25.473684210526315, "grad_norm": 1.7555932998657227, "learning_rate": 0.00019869671015265277, "loss": 0.3019, "step": 1210 }, { "epoch": 25.68421052631579, "grad_norm": 2.3077430725097656, "learning_rate": 0.00019867510912022245, "loss": 0.3369, "step": 1220 }, { "epoch": 25.894736842105264, "grad_norm": 2.0065627098083496, "learning_rate": 0.0001986533317393727, "loss": 0.3225, "step": 1230 }, { "epoch": 26.105263157894736, "grad_norm": 1.531724214553833, "learning_rate": 0.00019863137804902324, "loss": 0.2914, "step": 1240 }, { "epoch": 26.31578947368421, "grad_norm": 1.7669227123260498, "learning_rate": 0.00019860924808840893, "loss": 0.2988, "step": 1250 }, { "epoch": 26.526315789473685, "grad_norm": 1.8059635162353516, "learning_rate": 0.0001985869418970795, "loss": 0.2805, "step": 1260 }, { "epoch": 26.736842105263158, "grad_norm": 1.8464925289154053, "learning_rate": 0.00019856445951489982, "loss": 0.297, "step": 1270 }, { "epoch": 26.94736842105263, "grad_norm": 2.014112710952759, "learning_rate": 0.00019854180098204948, "loss": 0.308, "step": 1280 }, { "epoch": 27.157894736842106, "grad_norm": 1.5433177947998047, "learning_rate": 0.000198518966339023, "loss": 0.3105, "step": 1290 }, { "epoch": 27.36842105263158, "grad_norm": 1.5101069211959839, "learning_rate": 0.00019849595562662956, "loss": 0.2919, "step": 1300 }, { "epoch": 27.57894736842105, "grad_norm": 1.640114188194275, "learning_rate": 0.00019847276888599306, "loss": 0.278, "step": 1310 }, { "epoch": 27.789473684210527, "grad_norm": 1.9816792011260986, "learning_rate": 0.00019844940615855198, "loss": 0.3087, "step": 
1320 }, { "epoch": 28.0, "grad_norm": 2.4635629653930664, "learning_rate": 0.0001984258674860592, "loss": 0.3174, "step": 1330 }, { "epoch": 28.210526315789473, "grad_norm": 1.7017345428466797, "learning_rate": 0.00019840215291058228, "loss": 0.2855, "step": 1340 }, { "epoch": 28.42105263157895, "grad_norm": 1.8059803247451782, "learning_rate": 0.00019837826247450294, "loss": 0.2753, "step": 1350 }, { "epoch": 28.63157894736842, "grad_norm": 1.9220434427261353, "learning_rate": 0.00019835419622051727, "loss": 0.313, "step": 1360 }, { "epoch": 28.842105263157894, "grad_norm": 1.790998101234436, "learning_rate": 0.00019832995419163555, "loss": 0.3051, "step": 1370 }, { "epoch": 29.05263157894737, "grad_norm": 1.4004263877868652, "learning_rate": 0.0001983055364311823, "loss": 0.2949, "step": 1380 }, { "epoch": 29.263157894736842, "grad_norm": 1.7643471956253052, "learning_rate": 0.00019828094298279589, "loss": 0.2933, "step": 1390 }, { "epoch": 29.473684210526315, "grad_norm": 1.7527259588241577, "learning_rate": 0.0001982561738904289, "loss": 0.2849, "step": 1400 }, { "epoch": 29.68421052631579, "grad_norm": 2.0102221965789795, "learning_rate": 0.00019823122919834766, "loss": 0.2916, "step": 1410 }, { "epoch": 29.894736842105264, "grad_norm": 1.9086006879806519, "learning_rate": 0.0001982061089511324, "loss": 0.2833, "step": 1420 }, { "epoch": 30.105263157894736, "grad_norm": 2.0103325843811035, "learning_rate": 0.00019818081319367709, "loss": 0.2753, "step": 1430 }, { "epoch": 30.31578947368421, "grad_norm": 1.5917645692825317, "learning_rate": 0.0001981553419711893, "loss": 0.2705, "step": 1440 }, { "epoch": 30.526315789473685, "grad_norm": 1.8320797681808472, "learning_rate": 0.00019812969532919032, "loss": 0.295, "step": 1450 }, { "epoch": 30.736842105263158, "grad_norm": 2.35292911529541, "learning_rate": 0.00019810387331351478, "loss": 0.2913, "step": 1460 }, { "epoch": 30.94736842105263, "grad_norm": 1.739706039428711, "learning_rate": 0.00019807787597031084, 
"loss": 0.2926, "step": 1470 }, { "epoch": 31.157894736842106, "grad_norm": 1.9969830513000488, "learning_rate": 0.00019805170334603997, "loss": 0.2991, "step": 1480 }, { "epoch": 31.36842105263158, "grad_norm": 1.7005611658096313, "learning_rate": 0.00019802535548747686, "loss": 0.2941, "step": 1490 }, { "epoch": 31.57894736842105, "grad_norm": 1.9884356260299683, "learning_rate": 0.00019799883244170946, "loss": 0.2743, "step": 1500 }, { "epoch": 31.789473684210527, "grad_norm": 2.2900278568267822, "learning_rate": 0.00019797213425613869, "loss": 0.299, "step": 1510 }, { "epoch": 32.0, "grad_norm": 2.4475691318511963, "learning_rate": 0.00019794526097847862, "loss": 0.2805, "step": 1520 }, { "epoch": 32.21052631578947, "grad_norm": 1.836012601852417, "learning_rate": 0.00019791821265675603, "loss": 0.2549, "step": 1530 }, { "epoch": 32.421052631578945, "grad_norm": 2.088721752166748, "learning_rate": 0.0001978909893393108, "loss": 0.3096, "step": 1540 }, { "epoch": 32.63157894736842, "grad_norm": 1.7685314416885376, "learning_rate": 0.0001978635910747953, "loss": 0.2725, "step": 1550 }, { "epoch": 32.8421052631579, "grad_norm": 1.8119862079620361, "learning_rate": 0.00019783601791217474, "loss": 0.2915, "step": 1560 }, { "epoch": 33.05263157894737, "grad_norm": 1.7417136430740356, "learning_rate": 0.0001978082699007268, "loss": 0.2695, "step": 1570 }, { "epoch": 33.26315789473684, "grad_norm": 1.4141062498092651, "learning_rate": 0.0001977803470900417, "loss": 0.2647, "step": 1580 }, { "epoch": 33.473684210526315, "grad_norm": 2.370269775390625, "learning_rate": 0.000197752249530022, "loss": 0.2753, "step": 1590 }, { "epoch": 33.68421052631579, "grad_norm": 1.886096715927124, "learning_rate": 0.00019772397727088262, "loss": 0.283, "step": 1600 }, { "epoch": 33.89473684210526, "grad_norm": 1.6945075988769531, "learning_rate": 0.00019769553036315065, "loss": 0.2774, "step": 1610 }, { "epoch": 34.10526315789474, "grad_norm": 1.800110101699829, "learning_rate": 
0.00019766690885766533, "loss": 0.2646, "step": 1620 }, { "epoch": 34.31578947368421, "grad_norm": 1.670495867729187, "learning_rate": 0.00019763811280557793, "loss": 0.2409, "step": 1630 }, { "epoch": 34.526315789473685, "grad_norm": 2.0900466442108154, "learning_rate": 0.0001976091422583517, "loss": 0.2994, "step": 1640 }, { "epoch": 34.73684210526316, "grad_norm": 2.043393611907959, "learning_rate": 0.00019757999726776167, "loss": 0.2862, "step": 1650 }, { "epoch": 34.94736842105263, "grad_norm": 2.35016131401062, "learning_rate": 0.00019755067788589467, "loss": 0.2816, "step": 1660 }, { "epoch": 35.1578947368421, "grad_norm": 1.4299836158752441, "learning_rate": 0.00019752118416514915, "loss": 0.255, "step": 1670 }, { "epoch": 35.36842105263158, "grad_norm": 1.9748225212097168, "learning_rate": 0.00019749151615823526, "loss": 0.2635, "step": 1680 }, { "epoch": 35.578947368421055, "grad_norm": 1.6960207223892212, "learning_rate": 0.00019746167391817448, "loss": 0.2682, "step": 1690 }, { "epoch": 35.78947368421053, "grad_norm": 2.131016969680786, "learning_rate": 0.00019743165749829973, "loss": 0.2812, "step": 1700 }, { "epoch": 36.0, "grad_norm": 1.7239797115325928, "learning_rate": 0.00019740146695225525, "loss": 0.2858, "step": 1710 }, { "epoch": 36.21052631578947, "grad_norm": 1.5549391508102417, "learning_rate": 0.0001973711023339964, "loss": 0.2619, "step": 1720 }, { "epoch": 36.421052631578945, "grad_norm": 1.949060320854187, "learning_rate": 0.00019734056369778973, "loss": 0.2489, "step": 1730 }, { "epoch": 36.63157894736842, "grad_norm": 1.5633726119995117, "learning_rate": 0.00019730985109821266, "loss": 0.2791, "step": 1740 }, { "epoch": 36.8421052631579, "grad_norm": 1.7164374589920044, "learning_rate": 0.00019727896459015366, "loss": 0.2764, "step": 1750 }, { "epoch": 37.05263157894737, "grad_norm": 1.5717780590057373, "learning_rate": 0.00019724790422881187, "loss": 0.2574, "step": 1760 }, { "epoch": 37.26315789473684, "grad_norm": 
1.7981038093566895, "learning_rate": 0.00019721667006969727, "loss": 0.2558, "step": 1770 }, { "epoch": 37.473684210526315, "grad_norm": 1.596404790878296, "learning_rate": 0.00019718526216863026, "loss": 0.2742, "step": 1780 }, { "epoch": 37.68421052631579, "grad_norm": 1.3674874305725098, "learning_rate": 0.00019715368058174194, "loss": 0.2583, "step": 1790 }, { "epoch": 37.89473684210526, "grad_norm": 1.6358920335769653, "learning_rate": 0.00019712192536547364, "loss": 0.2728, "step": 1800 }, { "epoch": 38.10526315789474, "grad_norm": 1.784155011177063, "learning_rate": 0.0001970899965765772, "loss": 0.2706, "step": 1810 }, { "epoch": 38.31578947368421, "grad_norm": 1.6264429092407227, "learning_rate": 0.00019705789427211444, "loss": 0.2566, "step": 1820 }, { "epoch": 38.526315789473685, "grad_norm": 1.6494293212890625, "learning_rate": 0.00019702561850945744, "loss": 0.2691, "step": 1830 }, { "epoch": 38.73684210526316, "grad_norm": 2.1020774841308594, "learning_rate": 0.00019699316934628818, "loss": 0.2508, "step": 1840 }, { "epoch": 38.94736842105263, "grad_norm": 1.4856878519058228, "learning_rate": 0.0001969605468405986, "loss": 0.261, "step": 1850 }, { "epoch": 39.1578947368421, "grad_norm": 2.1468899250030518, "learning_rate": 0.00019692775105069042, "loss": 0.261, "step": 1860 }, { "epoch": 39.36842105263158, "grad_norm": 1.5866405963897705, "learning_rate": 0.00019689478203517498, "loss": 0.2684, "step": 1870 }, { "epoch": 39.578947368421055, "grad_norm": 1.5345929861068726, "learning_rate": 0.0001968616398529733, "loss": 0.2641, "step": 1880 }, { "epoch": 39.78947368421053, "grad_norm": 1.664137601852417, "learning_rate": 0.0001968283245633159, "loss": 0.2619, "step": 1890 }, { "epoch": 40.0, "grad_norm": 1.8158512115478516, "learning_rate": 0.00019679483622574246, "loss": 0.2493, "step": 1900 }, { "epoch": 40.21052631578947, "grad_norm": 1.8335703611373901, "learning_rate": 0.00019676117490010215, "loss": 0.2377, "step": 1910 }, { "epoch": 
40.421052631578945, "grad_norm": 1.6368952989578247, "learning_rate": 0.00019672734064655326, "loss": 0.238, "step": 1920 }, { "epoch": 40.63157894736842, "grad_norm": 1.600355863571167, "learning_rate": 0.00019669333352556297, "loss": 0.2727, "step": 1930 }, { "epoch": 40.8421052631579, "grad_norm": 1.7188122272491455, "learning_rate": 0.0001966591535979076, "loss": 0.2563, "step": 1940 }, { "epoch": 41.05263157894737, "grad_norm": 1.2512385845184326, "learning_rate": 0.0001966248009246722, "loss": 0.2662, "step": 1950 }, { "epoch": 41.26315789473684, "grad_norm": 1.8242894411087036, "learning_rate": 0.0001965902755672506, "loss": 0.2451, "step": 1960 }, { "epoch": 41.473684210526315, "grad_norm": 1.5848056077957153, "learning_rate": 0.00019655557758734517, "loss": 0.2509, "step": 1970 }, { "epoch": 41.68421052631579, "grad_norm": 2.00626277923584, "learning_rate": 0.00019652070704696687, "loss": 0.2662, "step": 1980 }, { "epoch": 41.89473684210526, "grad_norm": 1.7578226327896118, "learning_rate": 0.00019648566400843492, "loss": 0.2553, "step": 1990 }, { "epoch": 42.10526315789474, "grad_norm": 1.1532166004180908, "learning_rate": 0.00019645044853437704, "loss": 0.2525, "step": 2000 }, { "epoch": 42.31578947368421, "grad_norm": 1.355414628982544, "learning_rate": 0.00019641506068772887, "loss": 0.2409, "step": 2010 }, { "epoch": 42.526315789473685, "grad_norm": 1.8452808856964111, "learning_rate": 0.00019637950053173433, "loss": 0.2716, "step": 2020 }, { "epoch": 42.73684210526316, "grad_norm": 2.008028745651245, "learning_rate": 0.0001963437681299451, "loss": 0.2482, "step": 2030 }, { "epoch": 42.94736842105263, "grad_norm": 1.8328056335449219, "learning_rate": 0.00019630786354622086, "loss": 0.26, "step": 2040 }, { "epoch": 43.1578947368421, "grad_norm": 1.332192301750183, "learning_rate": 0.00019627178684472884, "loss": 0.2586, "step": 2050 }, { "epoch": 43.36842105263158, "grad_norm": 1.4395008087158203, "learning_rate": 0.00019623553808994403, "loss": 
0.2461, "step": 2060 }, { "epoch": 43.578947368421055, "grad_norm": 1.8931459188461304, "learning_rate": 0.00019619911734664877, "loss": 0.2476, "step": 2070 }, { "epoch": 43.78947368421053, "grad_norm": 1.6473420858383179, "learning_rate": 0.00019616252467993283, "loss": 0.2497, "step": 2080 }, { "epoch": 44.0, "grad_norm": 1.7745323181152344, "learning_rate": 0.0001961257601551933, "loss": 0.255, "step": 2090 }, { "epoch": 44.21052631578947, "grad_norm": 1.6052919626235962, "learning_rate": 0.00019608882383813435, "loss": 0.2411, "step": 2100 }, { "epoch": 44.421052631578945, "grad_norm": 1.1463373899459839, "learning_rate": 0.00019605171579476708, "loss": 0.2393, "step": 2110 }, { "epoch": 44.63157894736842, "grad_norm": 1.6653571128845215, "learning_rate": 0.00019601443609140967, "loss": 0.2608, "step": 2120 }, { "epoch": 44.8421052631579, "grad_norm": 2.1662609577178955, "learning_rate": 0.0001959769847946869, "loss": 0.252, "step": 2130 }, { "epoch": 45.05263157894737, "grad_norm": 1.2638843059539795, "learning_rate": 0.0001959393619715304, "loss": 0.2529, "step": 2140 }, { "epoch": 45.26315789473684, "grad_norm": 1.7441433668136597, "learning_rate": 0.00019590156768917822, "loss": 0.2544, "step": 2150 }, { "epoch": 45.473684210526315, "grad_norm": 1.5743330717086792, "learning_rate": 0.0001958636020151749, "loss": 0.2504, "step": 2160 }, { "epoch": 45.68421052631579, "grad_norm": 1.2976667881011963, "learning_rate": 0.0001958254650173712, "loss": 0.2479, "step": 2170 }, { "epoch": 45.89473684210526, "grad_norm": 1.9019654989242554, "learning_rate": 0.00019578715676392414, "loss": 0.2612, "step": 2180 }, { "epoch": 46.10526315789474, "grad_norm": 1.24739670753479, "learning_rate": 0.0001957486773232968, "loss": 0.2225, "step": 2190 }, { "epoch": 46.31578947368421, "grad_norm": 1.4010628461837769, "learning_rate": 0.00019571002676425816, "loss": 0.2322, "step": 2200 }, { "epoch": 46.526315789473685, "grad_norm": 1.3741133213043213, "learning_rate": 
0.00019567120515588308, "loss": 0.2537, "step": 2210 }, { "epoch": 46.73684210526316, "grad_norm": 1.5921913385391235, "learning_rate": 0.00019563221256755197, "loss": 0.2566, "step": 2220 }, { "epoch": 46.94736842105263, "grad_norm": 1.7186962366104126, "learning_rate": 0.000195593049068951, "loss": 0.2549, "step": 2230 }, { "epoch": 47.1578947368421, "grad_norm": 1.5977070331573486, "learning_rate": 0.00019555371473007168, "loss": 0.2303, "step": 2240 }, { "epoch": 47.36842105263158, "grad_norm": 1.6327919960021973, "learning_rate": 0.00019551420962121078, "loss": 0.2463, "step": 2250 }, { "epoch": 47.578947368421055, "grad_norm": 1.2250909805297852, "learning_rate": 0.00019547453381297042, "loss": 0.247, "step": 2260 }, { "epoch": 47.78947368421053, "grad_norm": 1.8708374500274658, "learning_rate": 0.00019543468737625766, "loss": 0.2478, "step": 2270 }, { "epoch": 48.0, "grad_norm": 1.1603832244873047, "learning_rate": 0.00019539467038228456, "loss": 0.2447, "step": 2280 }, { "epoch": 48.21052631578947, "grad_norm": 1.687369704246521, "learning_rate": 0.00019535448290256796, "loss": 0.2228, "step": 2290 }, { "epoch": 48.421052631578945, "grad_norm": 1.7475471496582031, "learning_rate": 0.00019531412500892943, "loss": 0.2424, "step": 2300 }, { "epoch": 48.63157894736842, "grad_norm": 1.5487775802612305, "learning_rate": 0.00019527359677349505, "loss": 0.2447, "step": 2310 }, { "epoch": 48.8421052631579, "grad_norm": 1.3614391088485718, "learning_rate": 0.00019523289826869534, "loss": 0.2548, "step": 2320 }, { "epoch": 49.05263157894737, "grad_norm": 1.19809091091156, "learning_rate": 0.00019519202956726512, "loss": 0.2374, "step": 2330 }, { "epoch": 49.26315789473684, "grad_norm": 1.5943597555160522, "learning_rate": 0.00019515099074224343, "loss": 0.2411, "step": 2340 }, { "epoch": 49.473684210526315, "grad_norm": 1.6664204597473145, "learning_rate": 0.0001951097818669733, "loss": 0.2429, "step": 2350 }, { "epoch": 49.68421052631579, "grad_norm": 
1.5013232231140137, "learning_rate": 0.00019506840301510158, "loss": 0.2406, "step": 2360 }, { "epoch": 49.89473684210526, "grad_norm": 1.2071912288665771, "learning_rate": 0.0001950268542605791, "loss": 0.251, "step": 2370 }, { "epoch": 50.10526315789474, "grad_norm": 1.1436349153518677, "learning_rate": 0.00019498513567766016, "loss": 0.2405, "step": 2380 }, { "epoch": 50.31578947368421, "grad_norm": 1.7050838470458984, "learning_rate": 0.00019494324734090266, "loss": 0.229, "step": 2390 }, { "epoch": 50.526315789473685, "grad_norm": 1.4147546291351318, "learning_rate": 0.00019490118932516786, "loss": 0.2456, "step": 2400 }, { "epoch": 50.73684210526316, "grad_norm": 1.4983457326889038, "learning_rate": 0.00019485896170562018, "loss": 0.2424, "step": 2410 }, { "epoch": 50.94736842105263, "grad_norm": 1.6988271474838257, "learning_rate": 0.00019481656455772734, "loss": 0.2476, "step": 2420 }, { "epoch": 51.1578947368421, "grad_norm": 1.4890905618667603, "learning_rate": 0.00019477399795725983, "loss": 0.2414, "step": 2430 }, { "epoch": 51.36842105263158, "grad_norm": 1.7915074825286865, "learning_rate": 0.00019473126198029105, "loss": 0.2407, "step": 2440 }, { "epoch": 51.578947368421055, "grad_norm": 1.7061022520065308, "learning_rate": 0.0001946883567031972, "loss": 0.2362, "step": 2450 }, { "epoch": 51.78947368421053, "grad_norm": 2.6335580348968506, "learning_rate": 0.00019464528220265693, "loss": 0.2481, "step": 2460 }, { "epoch": 52.0, "grad_norm": 1.3824809789657593, "learning_rate": 0.00019460203855565134, "loss": 0.2303, "step": 2470 }, { "epoch": 52.21052631578947, "grad_norm": 1.4830249547958374, "learning_rate": 0.00019455862583946386, "loss": 0.216, "step": 2480 }, { "epoch": 52.421052631578945, "grad_norm": 1.3665930032730103, "learning_rate": 0.00019451504413168003, "loss": 0.2372, "step": 2490 }, { "epoch": 52.63157894736842, "grad_norm": 1.4474061727523804, "learning_rate": 0.00019447129351018744, "loss": 0.2359, "step": 2500 }, { "epoch": 
52.8421052631579, "grad_norm": 1.353022813796997, "learning_rate": 0.00019442737405317556, "loss": 0.2523, "step": 2510 }, { "epoch": 53.05263157894737, "grad_norm": 1.4885058403015137, "learning_rate": 0.00019438328583913558, "loss": 0.2276, "step": 2520 }, { "epoch": 53.26315789473684, "grad_norm": 1.1576405763626099, "learning_rate": 0.00019433902894686026, "loss": 0.2277, "step": 2530 }, { "epoch": 53.473684210526315, "grad_norm": 1.4939807653427124, "learning_rate": 0.0001942946034554439, "loss": 0.2312, "step": 2540 }, { "epoch": 53.68421052631579, "grad_norm": 1.3100343942642212, "learning_rate": 0.00019425000944428198, "loss": 0.242, "step": 2550 }, { "epoch": 53.89473684210526, "grad_norm": 1.3382755517959595, "learning_rate": 0.00019420524699307126, "loss": 0.2554, "step": 2560 }, { "epoch": 54.10526315789474, "grad_norm": 1.488303542137146, "learning_rate": 0.0001941603161818095, "loss": 0.2217, "step": 2570 }, { "epoch": 54.31578947368421, "grad_norm": 1.3764780759811401, "learning_rate": 0.00019411521709079534, "loss": 0.2352, "step": 2580 }, { "epoch": 54.526315789473685, "grad_norm": 1.777108073234558, "learning_rate": 0.0001940699498006282, "loss": 0.2366, "step": 2590 }, { "epoch": 54.73684210526316, "grad_norm": 1.7822355031967163, "learning_rate": 0.00019402451439220803, "loss": 0.2348, "step": 2600 }, { "epoch": 54.94736842105263, "grad_norm": 1.7189970016479492, "learning_rate": 0.00019397891094673529, "loss": 0.2439, "step": 2610 }, { "epoch": 55.1578947368421, "grad_norm": 1.3337494134902954, "learning_rate": 0.00019393313954571074, "loss": 0.2216, "step": 2620 }, { "epoch": 55.36842105263158, "grad_norm": 1.533355951309204, "learning_rate": 0.00019388720027093523, "loss": 0.2194, "step": 2630 }, { "epoch": 55.578947368421055, "grad_norm": 1.2764497995376587, "learning_rate": 0.00019384109320450977, "loss": 0.2517, "step": 2640 }, { "epoch": 55.78947368421053, "grad_norm": 1.2663401365280151, "learning_rate": 0.00019379481842883518, "loss": 
0.2331, "step": 2650 }, { "epoch": 56.0, "grad_norm": 1.443335771560669, "learning_rate": 0.00019374837602661188, "loss": 0.2451, "step": 2660 }, { "epoch": 56.21052631578947, "grad_norm": 1.2762813568115234, "learning_rate": 0.00019370176608084008, "loss": 0.2138, "step": 2670 }, { "epoch": 56.421052631578945, "grad_norm": 1.145351767539978, "learning_rate": 0.00019365498867481923, "loss": 0.2269, "step": 2680 }, { "epoch": 56.63157894736842, "grad_norm": 1.6088831424713135, "learning_rate": 0.00019360804389214822, "loss": 0.2383, "step": 2690 }, { "epoch": 56.8421052631579, "grad_norm": 1.5034024715423584, "learning_rate": 0.0001935609318167249, "loss": 0.2433, "step": 2700 }, { "epoch": 57.05263157894737, "grad_norm": 1.2446277141571045, "learning_rate": 0.00019351365253274626, "loss": 0.2329, "step": 2710 }, { "epoch": 57.26315789473684, "grad_norm": 1.3016337156295776, "learning_rate": 0.00019346620612470803, "loss": 0.2334, "step": 2720 }, { "epoch": 57.473684210526315, "grad_norm": 1.444482684135437, "learning_rate": 0.0001934185926774046, "loss": 0.2056, "step": 2730 }, { "epoch": 57.68421052631579, "grad_norm": 1.6010749340057373, "learning_rate": 0.00019337081227592897, "loss": 0.2506, "step": 2740 }, { "epoch": 57.89473684210526, "grad_norm": 2.1765823364257812, "learning_rate": 0.0001933228650056724, "loss": 0.2421, "step": 2750 }, { "epoch": 58.10526315789474, "grad_norm": 1.2616775035858154, "learning_rate": 0.0001932747509523245, "loss": 0.2308, "step": 2760 }, { "epoch": 58.31578947368421, "grad_norm": 1.6567051410675049, "learning_rate": 0.00019322647020187286, "loss": 0.2325, "step": 2770 }, { "epoch": 58.526315789473685, "grad_norm": 1.4858084917068481, "learning_rate": 0.000193178022840603, "loss": 0.2286, "step": 2780 }, { "epoch": 58.73684210526316, "grad_norm": 1.2015066146850586, "learning_rate": 0.00019312940895509822, "loss": 0.2314, "step": 2790 }, { "epoch": 58.94736842105263, "grad_norm": 1.4122145175933838, "learning_rate": 
0.00019308062863223943, "loss": 0.235, "step": 2800 }, { "epoch": 59.1578947368421, "grad_norm": 1.3632580041885376, "learning_rate": 0.00019303168195920493, "loss": 0.2159, "step": 2810 }, { "epoch": 59.36842105263158, "grad_norm": 1.4167340993881226, "learning_rate": 0.00019298256902347042, "loss": 0.2385, "step": 2820 }, { "epoch": 59.578947368421055, "grad_norm": 1.299408197402954, "learning_rate": 0.00019293328991280863, "loss": 0.2438, "step": 2830 }, { "epoch": 59.78947368421053, "grad_norm": 1.329671025276184, "learning_rate": 0.00019288384471528936, "loss": 0.2228, "step": 2840 }, { "epoch": 60.0, "grad_norm": 1.7981964349746704, "learning_rate": 0.00019283423351927918, "loss": 0.2182, "step": 2850 }, { "epoch": 60.21052631578947, "grad_norm": 1.8409744501113892, "learning_rate": 0.00019278445641344135, "loss": 0.2194, "step": 2860 }, { "epoch": 60.421052631578945, "grad_norm": 1.713848352432251, "learning_rate": 0.0001927345134867356, "loss": 0.2307, "step": 2870 }, { "epoch": 60.63157894736842, "grad_norm": 1.1938320398330688, "learning_rate": 0.00019268440482841804, "loss": 0.2272, "step": 2880 }, { "epoch": 60.8421052631579, "grad_norm": 1.3815594911575317, "learning_rate": 0.000192634130528041, "loss": 0.2379, "step": 2890 }, { "epoch": 61.05263157894737, "grad_norm": 1.0334292650222778, "learning_rate": 0.00019258369067545278, "loss": 0.2196, "step": 2900 }, { "epoch": 61.26315789473684, "grad_norm": 1.033290147781372, "learning_rate": 0.0001925330853607976, "loss": 0.2215, "step": 2910 }, { "epoch": 61.473684210526315, "grad_norm": 1.375057339668274, "learning_rate": 0.00019248231467451534, "loss": 0.2373, "step": 2920 }, { "epoch": 61.68421052631579, "grad_norm": 1.3225274085998535, "learning_rate": 0.00019243137870734146, "loss": 0.2256, "step": 2930 }, { "epoch": 61.89473684210526, "grad_norm": 1.3929612636566162, "learning_rate": 0.0001923802775503068, "loss": 0.233, "step": 2940 }, { "epoch": 62.10526315789474, "grad_norm": 1.6330879926681519, 
"learning_rate": 0.00019232901129473734, "loss": 0.2181, "step": 2950 }, { "epoch": 62.31578947368421, "grad_norm": 1.1357747316360474, "learning_rate": 0.0001922775800322543, "loss": 0.2196, "step": 2960 }, { "epoch": 62.526315789473685, "grad_norm": 1.415575623512268, "learning_rate": 0.00019222598385477366, "loss": 0.2299, "step": 2970 }, { "epoch": 62.73684210526316, "grad_norm": 1.2679227590560913, "learning_rate": 0.00019217422285450607, "loss": 0.2175, "step": 2980 }, { "epoch": 62.94736842105263, "grad_norm": 1.4680553674697876, "learning_rate": 0.00019212229712395695, "loss": 0.2357, "step": 2990 }, { "epoch": 63.1578947368421, "grad_norm": 1.255954623222351, "learning_rate": 0.00019207020675592593, "loss": 0.2195, "step": 3000 }, { "epoch": 63.36842105263158, "grad_norm": 1.352297306060791, "learning_rate": 0.00019201795184350693, "loss": 0.2203, "step": 3010 }, { "epoch": 63.578947368421055, "grad_norm": 1.590889573097229, "learning_rate": 0.000191965532480088, "loss": 0.2299, "step": 3020 }, { "epoch": 63.78947368421053, "grad_norm": 1.265857458114624, "learning_rate": 0.00019191294875935103, "loss": 0.2343, "step": 3030 }, { "epoch": 64.0, "grad_norm": 1.2470191717147827, "learning_rate": 0.00019186020077527162, "loss": 0.2238, "step": 3040 }, { "epoch": 64.21052631578948, "grad_norm": 1.4418771266937256, "learning_rate": 0.000191807288622119, "loss": 0.2242, "step": 3050 }, { "epoch": 64.42105263157895, "grad_norm": 1.957590103149414, "learning_rate": 0.00019175421239445576, "loss": 0.2332, "step": 3060 }, { "epoch": 64.63157894736842, "grad_norm": 1.2326531410217285, "learning_rate": 0.00019170097218713773, "loss": 0.2238, "step": 3070 }, { "epoch": 64.84210526315789, "grad_norm": 1.2268998622894287, "learning_rate": 0.0001916475680953138, "loss": 0.2154, "step": 3080 }, { "epoch": 65.05263157894737, "grad_norm": 1.0560153722763062, "learning_rate": 0.0001915940002144257, "loss": 0.2253, "step": 3090 }, { "epoch": 65.26315789473684, "grad_norm": 
1.13203763961792, "learning_rate": 0.00019154026864020798, "loss": 0.2109, "step": 3100 }, { "epoch": 65.47368421052632, "grad_norm": 1.2309409379959106, "learning_rate": 0.00019148637346868763, "loss": 0.2148, "step": 3110 }, { "epoch": 65.6842105263158, "grad_norm": 1.1464983224868774, "learning_rate": 0.00019143231479618405, "loss": 0.224, "step": 3120 }, { "epoch": 65.89473684210526, "grad_norm": 0.9732810258865356, "learning_rate": 0.00019137809271930893, "loss": 0.2306, "step": 3130 }, { "epoch": 66.10526315789474, "grad_norm": 1.1428371667861938, "learning_rate": 0.00019132370733496582, "loss": 0.2163, "step": 3140 }, { "epoch": 66.3157894736842, "grad_norm": 1.4518449306488037, "learning_rate": 0.0001912691587403503, "loss": 0.2374, "step": 3150 }, { "epoch": 66.52631578947368, "grad_norm": 1.1604626178741455, "learning_rate": 0.0001912144470329495, "loss": 0.2398, "step": 3160 }, { "epoch": 66.73684210526316, "grad_norm": 1.2528883218765259, "learning_rate": 0.0001911595723105421, "loss": 0.2231, "step": 3170 }, { "epoch": 66.94736842105263, "grad_norm": 1.35756516456604, "learning_rate": 0.00019110453467119815, "loss": 0.2136, "step": 3180 }, { "epoch": 67.15789473684211, "grad_norm": 1.3628764152526855, "learning_rate": 0.00019104933421327887, "loss": 0.2084, "step": 3190 }, { "epoch": 67.36842105263158, "grad_norm": 0.9904689192771912, "learning_rate": 0.0001909939710354364, "loss": 0.2278, "step": 3200 }, { "epoch": 67.57894736842105, "grad_norm": 1.4833720922470093, "learning_rate": 0.00019093844523661367, "loss": 0.2396, "step": 3210 }, { "epoch": 67.78947368421052, "grad_norm": 2.6204235553741455, "learning_rate": 0.00019088275691604435, "loss": 0.2349, "step": 3220 }, { "epoch": 68.0, "grad_norm": 1.6207849979400635, "learning_rate": 0.00019082690617325246, "loss": 0.2195, "step": 3230 }, { "epoch": 68.21052631578948, "grad_norm": 1.9536974430084229, "learning_rate": 0.00019077089310805238, "loss": 0.2093, "step": 3240 }, { "epoch": 
68.42105263157895, "grad_norm": 1.4202009439468384, "learning_rate": 0.00019071471782054853, "loss": 0.2127, "step": 3250 }, { "epoch": 68.63157894736842, "grad_norm": 1.3091250658035278, "learning_rate": 0.00019065838041113517, "loss": 0.223, "step": 3260 }, { "epoch": 68.84210526315789, "grad_norm": 1.2577513456344604, "learning_rate": 0.0001906018809804965, "loss": 0.2396, "step": 3270 }, { "epoch": 69.05263157894737, "grad_norm": 0.9996993541717529, "learning_rate": 0.0001905452196296061, "loss": 0.2333, "step": 3280 }, { "epoch": 69.26315789473684, "grad_norm": 1.3234609365463257, "learning_rate": 0.000190488396459727, "loss": 0.2289, "step": 3290 }, { "epoch": 69.47368421052632, "grad_norm": 1.3428514003753662, "learning_rate": 0.00019043141157241143, "loss": 0.2127, "step": 3300 }, { "epoch": 69.6842105263158, "grad_norm": 1.731479287147522, "learning_rate": 0.0001903742650695006, "loss": 0.2224, "step": 3310 }, { "epoch": 69.89473684210526, "grad_norm": 0.9988399147987366, "learning_rate": 0.0001903169570531246, "loss": 0.216, "step": 3320 }, { "epoch": 70.10526315789474, "grad_norm": 0.9177957773208618, "learning_rate": 0.00019025948762570216, "loss": 0.2024, "step": 3330 }, { "epoch": 70.3157894736842, "grad_norm": 1.2759298086166382, "learning_rate": 0.00019020185688994046, "loss": 0.2181, "step": 3340 }, { "epoch": 70.52631578947368, "grad_norm": 0.9687845706939697, "learning_rate": 0.0001901440649488349, "loss": 0.2183, "step": 3350 }, { "epoch": 70.73684210526316, "grad_norm": 1.1401742696762085, "learning_rate": 0.00019008611190566918, "loss": 0.2339, "step": 3360 }, { "epoch": 70.94736842105263, "grad_norm": 1.353368878364563, "learning_rate": 0.00019002799786401462, "loss": 0.2372, "step": 3370 }, { "epoch": 71.15789473684211, "grad_norm": 1.0063115358352661, "learning_rate": 0.00018996972292773057, "loss": 0.2327, "step": 3380 }, { "epoch": 71.36842105263158, "grad_norm": 1.4881367683410645, "learning_rate": 0.00018991128720096377, "loss": 0.212, 
"step": 3390 }, { "epoch": 71.57894736842105, "grad_norm": 1.4579118490219116, "learning_rate": 0.00018985269078814827, "loss": 0.2224, "step": 3400 }, { "epoch": 71.78947368421052, "grad_norm": 0.9688888192176819, "learning_rate": 0.00018979393379400542, "loss": 0.2216, "step": 3410 }, { "epoch": 72.0, "grad_norm": 1.7144838571548462, "learning_rate": 0.0001897350163235435, "loss": 0.2312, "step": 3420 }, { "epoch": 72.21052631578948, "grad_norm": 0.9466618299484253, "learning_rate": 0.00018967593848205754, "loss": 0.21, "step": 3430 }, { "epoch": 72.42105263157895, "grad_norm": 1.3977335691452026, "learning_rate": 0.00018961670037512924, "loss": 0.2285, "step": 3440 }, { "epoch": 72.63157894736842, "grad_norm": 1.2872693538665771, "learning_rate": 0.0001895573021086267, "loss": 0.2152, "step": 3450 }, { "epoch": 72.84210526315789, "grad_norm": 0.9000837802886963, "learning_rate": 0.00018949774378870427, "loss": 0.2184, "step": 3460 }, { "epoch": 73.05263157894737, "grad_norm": 1.1097438335418701, "learning_rate": 0.0001894380255218023, "loss": 0.2178, "step": 3470 }, { "epoch": 73.26315789473684, "grad_norm": 1.0013376474380493, "learning_rate": 0.0001893781474146471, "loss": 0.2052, "step": 3480 }, { "epoch": 73.47368421052632, "grad_norm": 0.9624537229537964, "learning_rate": 0.0001893181095742504, "loss": 0.2118, "step": 3490 }, { "epoch": 73.6842105263158, "grad_norm": 0.8211550712585449, "learning_rate": 0.00018925791210790968, "loss": 0.2262, "step": 3500 }, { "epoch": 73.89473684210526, "grad_norm": 0.9911038279533386, "learning_rate": 0.00018919755512320752, "loss": 0.2336, "step": 3510 }, { "epoch": 74.10526315789474, "grad_norm": 1.0063726902008057, "learning_rate": 0.00018913703872801166, "loss": 0.2104, "step": 3520 }, { "epoch": 74.3157894736842, "grad_norm": 1.1858199834823608, "learning_rate": 0.00018907636303047468, "loss": 0.2155, "step": 3530 }, { "epoch": 74.52631578947368, "grad_norm": 0.9891308546066284, "learning_rate": 
0.00018901552813903395, "loss": 0.2059, "step": 3540 }, { "epoch": 74.73684210526316, "grad_norm": 1.5848833322525024, "learning_rate": 0.0001889545341624112, "loss": 0.2288, "step": 3550 }, { "epoch": 74.94736842105263, "grad_norm": 0.734965980052948, "learning_rate": 0.00018889338120961266, "loss": 0.2202, "step": 3560 }, { "epoch": 75.15789473684211, "grad_norm": 0.9111061692237854, "learning_rate": 0.0001888320693899285, "loss": 0.2135, "step": 3570 }, { "epoch": 75.36842105263158, "grad_norm": 0.923260509967804, "learning_rate": 0.00018877059881293288, "loss": 0.2089, "step": 3580 }, { "epoch": 75.57894736842105, "grad_norm": 1.0902652740478516, "learning_rate": 0.00018870896958848372, "loss": 0.226, "step": 3590 }, { "epoch": 75.78947368421052, "grad_norm": 1.3361833095550537, "learning_rate": 0.00018864718182672242, "loss": 0.2156, "step": 3600 }, { "epoch": 76.0, "grad_norm": 0.9090056419372559, "learning_rate": 0.00018858523563807373, "loss": 0.2344, "step": 3610 }, { "epoch": 76.21052631578948, "grad_norm": 1.0681171417236328, "learning_rate": 0.00018852313113324552, "loss": 0.2057, "step": 3620 }, { "epoch": 76.42105263157895, "grad_norm": 1.2302827835083008, "learning_rate": 0.00018846086842322864, "loss": 0.2168, "step": 3630 }, { "epoch": 76.63157894736842, "grad_norm": 1.1854231357574463, "learning_rate": 0.00018839844761929663, "loss": 0.2093, "step": 3640 }, { "epoch": 76.84210526315789, "grad_norm": 1.017804741859436, "learning_rate": 0.0001883358688330056, "loss": 0.2088, "step": 3650 }, { "epoch": 77.05263157894737, "grad_norm": 0.9746867418289185, "learning_rate": 0.00018827313217619399, "loss": 0.2297, "step": 3660 }, { "epoch": 77.26315789473684, "grad_norm": 1.3612266778945923, "learning_rate": 0.00018821023776098233, "loss": 0.2159, "step": 3670 }, { "epoch": 77.47368421052632, "grad_norm": 1.2489289045333862, "learning_rate": 0.0001881471856997732, "loss": 0.2214, "step": 3680 }, { "epoch": 77.6842105263158, "grad_norm": 
0.9268271923065186, "learning_rate": 0.00018808397610525085, "loss": 0.2199, "step": 3690 }, { "epoch": 77.89473684210526, "grad_norm": 1.429926872253418, "learning_rate": 0.00018802060909038103, "loss": 0.2153, "step": 3700 }, { "epoch": 78.10526315789474, "grad_norm": 1.1016651391983032, "learning_rate": 0.0001879570847684109, "loss": 0.2119, "step": 3710 }, { "epoch": 78.3157894736842, "grad_norm": 0.8588700294494629, "learning_rate": 0.00018789340325286872, "loss": 0.2107, "step": 3720 }, { "epoch": 78.52631578947368, "grad_norm": 1.3524593114852905, "learning_rate": 0.00018782956465756366, "loss": 0.2245, "step": 3730 }, { "epoch": 78.73684210526316, "grad_norm": 1.5214595794677734, "learning_rate": 0.0001877655690965857, "loss": 0.2131, "step": 3740 }, { "epoch": 78.94736842105263, "grad_norm": 1.0766842365264893, "learning_rate": 0.00018770141668430522, "loss": 0.2288, "step": 3750 }, { "epoch": 79.15789473684211, "grad_norm": 1.0418442487716675, "learning_rate": 0.00018763710753537301, "loss": 0.22, "step": 3760 }, { "epoch": 79.36842105263158, "grad_norm": 1.1370205879211426, "learning_rate": 0.00018757264176471998, "loss": 0.2124, "step": 3770 }, { "epoch": 79.57894736842105, "grad_norm": 0.9709566235542297, "learning_rate": 0.00018750801948755685, "loss": 0.2032, "step": 3780 }, { "epoch": 79.78947368421052, "grad_norm": 1.1694402694702148, "learning_rate": 0.00018744324081937415, "loss": 0.2244, "step": 3790 }, { "epoch": 80.0, "grad_norm": 1.1555752754211426, "learning_rate": 0.00018737830587594185, "loss": 0.2204, "step": 3800 }, { "epoch": 80.21052631578948, "grad_norm": 0.89194655418396, "learning_rate": 0.0001873132147733092, "loss": 0.2024, "step": 3810 }, { "epoch": 80.42105263157895, "grad_norm": 3.491398811340332, "learning_rate": 0.00018724796762780463, "loss": 0.2363, "step": 3820 }, { "epoch": 80.63157894736842, "grad_norm": 0.9344122409820557, "learning_rate": 0.00018718256455603526, "loss": 0.2222, "step": 3830 }, { "epoch": 
80.84210526315789, "grad_norm": 1.2333403825759888, "learning_rate": 0.0001871170056748871, "loss": 0.2085, "step": 3840 }, { "epoch": 81.05263157894737, "grad_norm": 0.8288215398788452, "learning_rate": 0.0001870512911015244, "loss": 0.2197, "step": 3850 }, { "epoch": 81.26315789473684, "grad_norm": 1.2045210599899292, "learning_rate": 0.00018698542095338982, "loss": 0.2155, "step": 3860 }, { "epoch": 81.47368421052632, "grad_norm": 1.1142665147781372, "learning_rate": 0.00018691939534820398, "loss": 0.2156, "step": 3870 }, { "epoch": 81.6842105263158, "grad_norm": 0.994482159614563, "learning_rate": 0.0001868532144039653, "loss": 0.2075, "step": 3880 }, { "epoch": 81.89473684210526, "grad_norm": 1.2781574726104736, "learning_rate": 0.00018678687823894992, "loss": 0.2203, "step": 3890 }, { "epoch": 82.10526315789474, "grad_norm": 1.054432988166809, "learning_rate": 0.0001867203869717113, "loss": 0.2248, "step": 3900 }, { "epoch": 82.3157894736842, "grad_norm": 1.141908049583435, "learning_rate": 0.00018665374072108012, "loss": 0.2061, "step": 3910 }, { "epoch": 82.52631578947368, "grad_norm": 1.210333228111267, "learning_rate": 0.000186586939606164, "loss": 0.2082, "step": 3920 }, { "epoch": 82.73684210526316, "grad_norm": 1.2293156385421753, "learning_rate": 0.00018651998374634744, "loss": 0.2153, "step": 3930 }, { "epoch": 82.94736842105263, "grad_norm": 0.8742419481277466, "learning_rate": 0.0001864528732612913, "loss": 0.2146, "step": 3940 }, { "epoch": 83.15789473684211, "grad_norm": 0.9708311557769775, "learning_rate": 0.00018638560827093304, "loss": 0.2171, "step": 3950 }, { "epoch": 83.36842105263158, "grad_norm": 1.1058762073516846, "learning_rate": 0.000186318188895486, "loss": 0.2209, "step": 3960 }, { "epoch": 83.57894736842105, "grad_norm": 1.1592589616775513, "learning_rate": 0.00018625061525543956, "loss": 0.2162, "step": 3970 }, { "epoch": 83.78947368421052, "grad_norm": 1.1639076471328735, "learning_rate": 0.00018618288747155882, "loss": 0.2217, 
"step": 3980 }, { "epoch": 84.0, "grad_norm": 0.9892878532409668, "learning_rate": 0.00018611500566488421, "loss": 0.2165, "step": 3990 }, { "epoch": 84.21052631578948, "grad_norm": 1.1064993143081665, "learning_rate": 0.00018604696995673162, "loss": 0.2023, "step": 4000 }, { "epoch": 84.42105263157895, "grad_norm": 1.0864388942718506, "learning_rate": 0.00018597878046869187, "loss": 0.2118, "step": 4010 }, { "epoch": 84.63157894736842, "grad_norm": 1.1595569849014282, "learning_rate": 0.0001859104373226306, "loss": 0.215, "step": 4020 }, { "epoch": 84.84210526315789, "grad_norm": 0.945059061050415, "learning_rate": 0.00018584194064068813, "loss": 0.2203, "step": 4030 }, { "epoch": 85.05263157894737, "grad_norm": 0.8729904890060425, "learning_rate": 0.0001857732905452791, "loss": 0.2077, "step": 4040 }, { "epoch": 85.26315789473684, "grad_norm": 1.315699815750122, "learning_rate": 0.0001857044871590924, "loss": 0.2098, "step": 4050 }, { "epoch": 85.47368421052632, "grad_norm": 0.8844873309135437, "learning_rate": 0.0001856355306050908, "loss": 0.2146, "step": 4060 }, { "epoch": 85.6842105263158, "grad_norm": 1.0418226718902588, "learning_rate": 0.00018556642100651087, "loss": 0.2182, "step": 4070 }, { "epoch": 85.89473684210526, "grad_norm": 0.7228378653526306, "learning_rate": 0.00018549715848686267, "loss": 0.2144, "step": 4080 }, { "epoch": 86.10526315789474, "grad_norm": 0.8830803632736206, "learning_rate": 0.0001854277431699295, "loss": 0.1954, "step": 4090 }, { "epoch": 86.3157894736842, "grad_norm": 0.7646278738975525, "learning_rate": 0.0001853581751797679, "loss": 0.1989, "step": 4100 }, { "epoch": 86.52631578947368, "grad_norm": 1.2258400917053223, "learning_rate": 0.00018528845464070703, "loss": 0.2189, "step": 4110 }, { "epoch": 86.73684210526316, "grad_norm": 0.8631286025047302, "learning_rate": 0.00018521858167734885, "loss": 0.2215, "step": 4120 }, { "epoch": 86.94736842105263, "grad_norm": 1.0356979370117188, "learning_rate": 0.00018514855641456768, 
"loss": 0.2294, "step": 4130 }, { "epoch": 87.15789473684211, "grad_norm": 1.2814905643463135, "learning_rate": 0.00018507837897751, "loss": 0.2127, "step": 4140 }, { "epoch": 87.36842105263158, "grad_norm": 1.0343648195266724, "learning_rate": 0.00018500804949159427, "loss": 0.2026, "step": 4150 }, { "epoch": 87.57894736842105, "grad_norm": 1.256469488143921, "learning_rate": 0.00018493756808251073, "loss": 0.2157, "step": 4160 }, { "epoch": 87.78947368421052, "grad_norm": 1.116888165473938, "learning_rate": 0.000184866934876221, "loss": 0.2184, "step": 4170 }, { "epoch": 88.0, "grad_norm": 1.3331102132797241, "learning_rate": 0.00018479614999895814, "loss": 0.2189, "step": 4180 }, { "epoch": 88.21052631578948, "grad_norm": 1.0215224027633667, "learning_rate": 0.00018472521357722622, "loss": 0.2124, "step": 4190 }, { "epoch": 88.42105263157895, "grad_norm": 1.1717097759246826, "learning_rate": 0.00018465412573780003, "loss": 0.2028, "step": 4200 }, { "epoch": 88.63157894736842, "grad_norm": 1.110374093055725, "learning_rate": 0.00018458288660772515, "loss": 0.2056, "step": 4210 }, { "epoch": 88.84210526315789, "grad_norm": 1.1572178602218628, "learning_rate": 0.00018451149631431744, "loss": 0.2213, "step": 4220 }, { "epoch": 89.05263157894737, "grad_norm": 0.8311715722084045, "learning_rate": 0.00018443995498516294, "loss": 0.2155, "step": 4230 }, { "epoch": 89.26315789473684, "grad_norm": 0.6976020932197571, "learning_rate": 0.00018436826274811753, "loss": 0.193, "step": 4240 }, { "epoch": 89.47368421052632, "grad_norm": 1.1248303651809692, "learning_rate": 0.00018429641973130697, "loss": 0.1934, "step": 4250 }, { "epoch": 89.6842105263158, "grad_norm": 1.1168967485427856, "learning_rate": 0.00018422442606312633, "loss": 0.2176, "step": 4260 }, { "epoch": 89.89473684210526, "grad_norm": 0.9903426766395569, "learning_rate": 0.00018415228187223997, "loss": 0.2214, "step": 4270 }, { "epoch": 90.10526315789474, "grad_norm": 0.7373839020729065, "learning_rate": 
0.00018407998728758122, "loss": 0.2047, "step": 4280 }, { "epoch": 90.3157894736842, "grad_norm": 0.8506777882575989, "learning_rate": 0.00018400754243835227, "loss": 0.2025, "step": 4290 }, { "epoch": 90.52631578947368, "grad_norm": 1.0671459436416626, "learning_rate": 0.0001839349474540238, "loss": 0.2118, "step": 4300 }, { "epoch": 90.73684210526316, "grad_norm": 0.9546430706977844, "learning_rate": 0.00018386220246433484, "loss": 0.2131, "step": 4310 }, { "epoch": 90.94736842105263, "grad_norm": 0.998571515083313, "learning_rate": 0.00018378930759929246, "loss": 0.2234, "step": 4320 }, { "epoch": 91.15789473684211, "grad_norm": 1.444380521774292, "learning_rate": 0.00018371626298917156, "loss": 0.2124, "step": 4330 }, { "epoch": 91.36842105263158, "grad_norm": 0.929400622844696, "learning_rate": 0.0001836430687645148, "loss": 0.2091, "step": 4340 }, { "epoch": 91.57894736842105, "grad_norm": 0.7143007516860962, "learning_rate": 0.00018356972505613204, "loss": 0.1907, "step": 4350 }, { "epoch": 91.78947368421052, "grad_norm": 1.2031328678131104, "learning_rate": 0.0001834962319951004, "loss": 0.2276, "step": 4360 }, { "epoch": 92.0, "grad_norm": 1.0419529676437378, "learning_rate": 0.00018342258971276395, "loss": 0.2175, "step": 4370 }, { "epoch": 92.21052631578948, "grad_norm": 0.9380586743354797, "learning_rate": 0.00018334879834073332, "loss": 0.1941, "step": 4380 }, { "epoch": 92.42105263157895, "grad_norm": 1.299815058708191, "learning_rate": 0.0001832748580108857, "loss": 0.2108, "step": 4390 }, { "epoch": 92.63157894736842, "grad_norm": 0.9901102185249329, "learning_rate": 0.00018320076885536445, "loss": 0.2002, "step": 4400 }, { "epoch": 92.84210526315789, "grad_norm": 1.0252271890640259, "learning_rate": 0.00018312653100657883, "loss": 0.2323, "step": 4410 }, { "epoch": 93.05263157894737, "grad_norm": 0.9015369415283203, "learning_rate": 0.00018305214459720398, "loss": 0.2066, "step": 4420 }, { "epoch": 93.26315789473684, "grad_norm": 
0.7637150287628174, "learning_rate": 0.00018297760976018052, "loss": 0.209, "step": 4430 }, { "epoch": 93.47368421052632, "grad_norm": 0.784934937953949, "learning_rate": 0.00018290292662871417, "loss": 0.2161, "step": 4440 }, { "epoch": 93.6842105263158, "grad_norm": 0.7980606555938721, "learning_rate": 0.0001828280953362759, "loss": 0.2133, "step": 4450 }, { "epoch": 93.89473684210526, "grad_norm": 1.3009899854660034, "learning_rate": 0.0001827531160166013, "loss": 0.2143, "step": 4460 }, { "epoch": 94.10526315789474, "grad_norm": 0.9536803364753723, "learning_rate": 0.0001826779888036906, "loss": 0.21, "step": 4470 }, { "epoch": 94.3157894736842, "grad_norm": 1.1637252569198608, "learning_rate": 0.0001826027138318083, "loss": 0.2102, "step": 4480 }, { "epoch": 94.52631578947368, "grad_norm": 1.1899417638778687, "learning_rate": 0.00018252729123548295, "loss": 0.2033, "step": 4490 }, { "epoch": 94.73684210526316, "grad_norm": 0.8081888556480408, "learning_rate": 0.00018245172114950703, "loss": 0.2071, "step": 4500 }, { "epoch": 94.94736842105263, "grad_norm": 0.962952196598053, "learning_rate": 0.0001823760037089365, "loss": 0.2121, "step": 4510 }, { "epoch": 95.15789473684211, "grad_norm": 0.9023415446281433, "learning_rate": 0.0001823001390490907, "loss": 0.2044, "step": 4520 }, { "epoch": 95.36842105263158, "grad_norm": 0.9460718035697937, "learning_rate": 0.00018222412730555207, "loss": 0.2085, "step": 4530 }, { "epoch": 95.57894736842105, "grad_norm": 0.9123182892799377, "learning_rate": 0.00018214796861416594, "loss": 0.2036, "step": 4540 }, { "epoch": 95.78947368421052, "grad_norm": 0.9567075967788696, "learning_rate": 0.00018207166311104024, "loss": 0.2142, "step": 4550 }, { "epoch": 96.0, "grad_norm": 1.246997594833374, "learning_rate": 0.00018199521093254523, "loss": 0.2081, "step": 4560 }, { "epoch": 96.21052631578948, "grad_norm": 0.8137680292129517, "learning_rate": 0.0001819186122153134, "loss": 0.2071, "step": 4570 }, { "epoch": 96.42105263157895, 
"grad_norm": 0.901633620262146, "learning_rate": 0.0001818418670962391, "loss": 0.2161, "step": 4580 }, { "epoch": 96.63157894736842, "grad_norm": 0.9326063990592957, "learning_rate": 0.00018176497571247824, "loss": 0.2102, "step": 4590 }, { "epoch": 96.84210526315789, "grad_norm": 1.112606406211853, "learning_rate": 0.0001816879382014482, "loss": 0.1983, "step": 4600 }, { "epoch": 97.05263157894737, "grad_norm": 0.6728799939155579, "learning_rate": 0.00018161075470082754, "loss": 0.2174, "step": 4610 }, { "epoch": 97.26315789473684, "grad_norm": 1.1379280090332031, "learning_rate": 0.00018153342534855566, "loss": 0.2151, "step": 4620 }, { "epoch": 97.47368421052632, "grad_norm": 1.089913010597229, "learning_rate": 0.00018145595028283267, "loss": 0.2061, "step": 4630 }, { "epoch": 97.6842105263158, "grad_norm": 0.8472727537155151, "learning_rate": 0.00018137832964211905, "loss": 0.1973, "step": 4640 }, { "epoch": 97.89473684210526, "grad_norm": 1.0479404926300049, "learning_rate": 0.0001813005635651355, "loss": 0.2134, "step": 4650 }, { "epoch": 98.10526315789474, "grad_norm": 0.8933761715888977, "learning_rate": 0.00018122265219086258, "loss": 0.2074, "step": 4660 }, { "epoch": 98.3157894736842, "grad_norm": 0.8513138294219971, "learning_rate": 0.00018114459565854056, "loss": 0.1951, "step": 4670 }, { "epoch": 98.52631578947368, "grad_norm": 1.1558623313903809, "learning_rate": 0.00018106639410766912, "loss": 0.2212, "step": 4680 }, { "epoch": 98.73684210526316, "grad_norm": 1.7782244682312012, "learning_rate": 0.00018098804767800711, "loss": 0.2202, "step": 4690 }, { "epoch": 98.94736842105263, "grad_norm": 0.9603464603424072, "learning_rate": 0.00018090955650957232, "loss": 0.1953, "step": 4700 }, { "epoch": 99.15789473684211, "grad_norm": 0.7804552316665649, "learning_rate": 0.0001808309207426412, "loss": 0.1969, "step": 4710 }, { "epoch": 99.36842105263158, "grad_norm": 1.0236091613769531, "learning_rate": 0.00018075214051774857, "loss": 0.2108, "step": 4720 
}, { "epoch": 99.57894736842105, "grad_norm": 0.8507013320922852, "learning_rate": 0.00018067321597568746, "loss": 0.2114, "step": 4730 }, { "epoch": 99.78947368421052, "grad_norm": 1.1381756067276, "learning_rate": 0.0001805941472575089, "loss": 0.2161, "step": 4740 }, { "epoch": 100.0, "grad_norm": 1.0529285669326782, "learning_rate": 0.00018051493450452148, "loss": 0.2094, "step": 4750 }, { "epoch": 100.21052631578948, "grad_norm": 0.7255027294158936, "learning_rate": 0.0001804355778582912, "loss": 0.1996, "step": 4760 }, { "epoch": 100.42105263157895, "grad_norm": 0.8361448049545288, "learning_rate": 0.00018035607746064126, "loss": 0.1991, "step": 4770 }, { "epoch": 100.63157894736842, "grad_norm": 1.0485684871673584, "learning_rate": 0.00018027643345365176, "loss": 0.2255, "step": 4780 }, { "epoch": 100.84210526315789, "grad_norm": 0.8241237998008728, "learning_rate": 0.00018019664597965947, "loss": 0.2072, "step": 4790 }, { "epoch": 101.05263157894737, "grad_norm": 0.5731729865074158, "learning_rate": 0.00018011671518125758, "loss": 0.208, "step": 4800 }, { "epoch": 101.26315789473684, "grad_norm": 1.0061657428741455, "learning_rate": 0.00018003664120129533, "loss": 0.1995, "step": 4810 }, { "epoch": 101.47368421052632, "grad_norm": 1.0121266841888428, "learning_rate": 0.00017995642418287792, "loss": 0.2157, "step": 4820 }, { "epoch": 101.6842105263158, "grad_norm": 0.7068321108818054, "learning_rate": 0.00017987606426936615, "loss": 0.2062, "step": 4830 }, { "epoch": 101.89473684210526, "grad_norm": 1.0422128438949585, "learning_rate": 0.00017979556160437627, "loss": 0.2076, "step": 4840 }, { "epoch": 102.10526315789474, "grad_norm": 0.9450851082801819, "learning_rate": 0.00017971491633177956, "loss": 0.2104, "step": 4850 }, { "epoch": 102.3157894736842, "grad_norm": 0.9264403581619263, "learning_rate": 0.0001796341285957022, "loss": 0.1923, "step": 4860 }, { "epoch": 102.52631578947368, "grad_norm": 0.8550981283187866, "learning_rate": 
0.00017955319854052494, "loss": 0.223, "step": 4870 }, { "epoch": 102.73684210526316, "grad_norm": 0.9619629979133606, "learning_rate": 0.00017947212631088297, "loss": 0.1947, "step": 4880 }, { "epoch": 102.94736842105263, "grad_norm": 0.872445285320282, "learning_rate": 0.00017939091205166548, "loss": 0.2202, "step": 4890 }, { "epoch": 103.15789473684211, "grad_norm": 0.8909777402877808, "learning_rate": 0.00017930955590801553, "loss": 0.2104, "step": 4900 }, { "epoch": 103.36842105263158, "grad_norm": 1.1058716773986816, "learning_rate": 0.00017922805802532974, "loss": 0.1991, "step": 4910 }, { "epoch": 103.57894736842105, "grad_norm": 0.9872148633003235, "learning_rate": 0.000179146418549258, "loss": 0.1916, "step": 4920 }, { "epoch": 103.78947368421052, "grad_norm": 0.8365228176116943, "learning_rate": 0.00017906463762570337, "loss": 0.223, "step": 4930 }, { "epoch": 104.0, "grad_norm": 0.7262590527534485, "learning_rate": 0.00017898271540082154, "loss": 0.2089, "step": 4940 }, { "epoch": 104.21052631578948, "grad_norm": 0.753351628780365, "learning_rate": 0.00017890065202102085, "loss": 0.1939, "step": 4950 }, { "epoch": 104.42105263157895, "grad_norm": 0.7613105773925781, "learning_rate": 0.00017881844763296186, "loss": 0.2005, "step": 4960 }, { "epoch": 104.63157894736842, "grad_norm": 0.79398512840271, "learning_rate": 0.00017873610238355715, "loss": 0.2098, "step": 4970 }, { "epoch": 104.84210526315789, "grad_norm": 0.7749453186988831, "learning_rate": 0.00017865361641997103, "loss": 0.2094, "step": 4980 }, { "epoch": 105.05263157894737, "grad_norm": 0.8748012185096741, "learning_rate": 0.0001785709898896193, "loss": 0.2133, "step": 4990 }, { "epoch": 105.26315789473684, "grad_norm": 0.9628874659538269, "learning_rate": 0.0001784882229401689, "loss": 0.1999, "step": 5000 }, { "epoch": 105.47368421052632, "grad_norm": 1.3108863830566406, "learning_rate": 0.00017840531571953786, "loss": 0.2095, "step": 5010 }, { "epoch": 105.6842105263158, "grad_norm": 
0.9370902180671692, "learning_rate": 0.0001783222683758948, "loss": 0.2298, "step": 5020 }, { "epoch": 105.89473684210526, "grad_norm": 0.8537701964378357, "learning_rate": 0.0001782390810576588, "loss": 0.1997, "step": 5030 }, { "epoch": 106.10526315789474, "grad_norm": 0.572172224521637, "learning_rate": 0.0001781557539134991, "loss": 0.194, "step": 5040 }, { "epoch": 106.3157894736842, "grad_norm": 0.7815021276473999, "learning_rate": 0.00017807228709233478, "loss": 0.2138, "step": 5050 }, { "epoch": 106.52631578947368, "grad_norm": 0.8081856966018677, "learning_rate": 0.00017798868074333463, "loss": 0.1877, "step": 5060 }, { "epoch": 106.73684210526316, "grad_norm": 0.891789436340332, "learning_rate": 0.00017790493501591668, "loss": 0.2164, "step": 5070 }, { "epoch": 106.94736842105263, "grad_norm": 0.8877257108688354, "learning_rate": 0.0001778210500597482, "loss": 0.2139, "step": 5080 }, { "epoch": 107.15789473684211, "grad_norm": 0.8057231903076172, "learning_rate": 0.00017773702602474515, "loss": 0.1985, "step": 5090 }, { "epoch": 107.36842105263158, "grad_norm": 0.9601947069168091, "learning_rate": 0.00017765286306107214, "loss": 0.2082, "step": 5100 }, { "epoch": 107.57894736842105, "grad_norm": 1.2363498210906982, "learning_rate": 0.000177568561319142, "loss": 0.1961, "step": 5110 }, { "epoch": 107.78947368421052, "grad_norm": 0.8430647850036621, "learning_rate": 0.00017748412094961566, "loss": 0.2112, "step": 5120 }, { "epoch": 108.0, "grad_norm": 1.2295119762420654, "learning_rate": 0.00017739954210340173, "loss": 0.2104, "step": 5130 }, { "epoch": 108.21052631578948, "grad_norm": 0.6926615238189697, "learning_rate": 0.0001773148249316563, "loss": 0.2053, "step": 5140 }, { "epoch": 108.42105263157895, "grad_norm": 0.6129204034805298, "learning_rate": 0.0001772299695857827, "loss": 0.1957, "step": 5150 }, { "epoch": 108.63157894736842, "grad_norm": 0.7398890256881714, "learning_rate": 0.00017714497621743123, "loss": 0.2114, "step": 5160 }, { "epoch": 
108.84210526315789, "grad_norm": 0.9888297915458679, "learning_rate": 0.00017705984497849874, "loss": 0.2087, "step": 5170 }, { "epoch": 109.05263157894737, "grad_norm": 0.8859359622001648, "learning_rate": 0.00017697457602112863, "loss": 0.2095, "step": 5180 }, { "epoch": 109.26315789473684, "grad_norm": 0.80406254529953, "learning_rate": 0.00017688916949771036, "loss": 0.2149, "step": 5190 }, { "epoch": 109.47368421052632, "grad_norm": 0.9358986020088196, "learning_rate": 0.0001768036255608792, "loss": 0.193, "step": 5200 }, { "epoch": 109.6842105263158, "grad_norm": 0.9183883666992188, "learning_rate": 0.000176717944363516, "loss": 0.2081, "step": 5210 }, { "epoch": 109.89473684210526, "grad_norm": 0.8253007531166077, "learning_rate": 0.00017663212605874704, "loss": 0.2275, "step": 5220 }, { "epoch": 110.10526315789474, "grad_norm": 0.8964976668357849, "learning_rate": 0.00017654617079994347, "loss": 0.2045, "step": 5230 }, { "epoch": 110.3157894736842, "grad_norm": 0.922153115272522, "learning_rate": 0.0001764600787407213, "loss": 0.205, "step": 5240 }, { "epoch": 110.52631578947368, "grad_norm": 0.8380544185638428, "learning_rate": 0.00017637385003494102, "loss": 0.1918, "step": 5250 }, { "epoch": 110.73684210526316, "grad_norm": 0.7515467405319214, "learning_rate": 0.00017628748483670728, "loss": 0.21, "step": 5260 }, { "epoch": 110.94736842105263, "grad_norm": 0.9965086579322815, "learning_rate": 0.00017620098330036873, "loss": 0.2108, "step": 5270 }, { "epoch": 111.15789473684211, "grad_norm": 0.7745735049247742, "learning_rate": 0.00017611434558051757, "loss": 0.21, "step": 5280 }, { "epoch": 111.36842105263158, "grad_norm": 0.8300684690475464, "learning_rate": 0.00017602757183198952, "loss": 0.1987, "step": 5290 }, { "epoch": 111.57894736842105, "grad_norm": 0.9019736051559448, "learning_rate": 0.00017594066220986333, "loss": 0.1983, "step": 5300 }, { "epoch": 111.78947368421052, "grad_norm": 0.7582821846008301, "learning_rate": 0.00017585361686946055, 
"loss": 0.2161, "step": 5310 }, { "epoch": 112.0, "grad_norm": 0.7558669447898865, "learning_rate": 0.00017576643596634538, "loss": 0.208, "step": 5320 }, { "epoch": 112.21052631578948, "grad_norm": 0.9202759265899658, "learning_rate": 0.00017567911965632414, "loss": 0.2082, "step": 5330 }, { "epoch": 112.42105263157895, "grad_norm": 0.8538267016410828, "learning_rate": 0.0001755916680954453, "loss": 0.2044, "step": 5340 }, { "epoch": 112.63157894736842, "grad_norm": 0.6622471213340759, "learning_rate": 0.00017550408143999894, "loss": 0.1959, "step": 5350 }, { "epoch": 112.84210526315789, "grad_norm": 1.5098806619644165, "learning_rate": 0.00017541635984651667, "loss": 0.2011, "step": 5360 }, { "epoch": 113.05263157894737, "grad_norm": 1.0928170680999756, "learning_rate": 0.00017532850347177118, "loss": 0.2145, "step": 5370 }, { "epoch": 113.26315789473684, "grad_norm": 0.7540849447250366, "learning_rate": 0.00017524051247277603, "loss": 0.1989, "step": 5380 }, { "epoch": 113.47368421052632, "grad_norm": 0.7334789633750916, "learning_rate": 0.00017515238700678538, "loss": 0.1966, "step": 5390 }, { "epoch": 113.6842105263158, "grad_norm": 0.948559045791626, "learning_rate": 0.0001750641272312938, "loss": 0.2164, "step": 5400 }, { "epoch": 113.89473684210526, "grad_norm": 1.1310551166534424, "learning_rate": 0.00017497573330403578, "loss": 0.2174, "step": 5410 }, { "epoch": 114.10526315789474, "grad_norm": 0.6442914605140686, "learning_rate": 0.00017488720538298558, "loss": 0.1954, "step": 5420 }, { "epoch": 114.3157894736842, "grad_norm": 0.6499135494232178, "learning_rate": 0.000174798543626357, "loss": 0.2052, "step": 5430 }, { "epoch": 114.52631578947368, "grad_norm": 0.9780040979385376, "learning_rate": 0.00017470974819260292, "loss": 0.1946, "step": 5440 }, { "epoch": 114.73684210526316, "grad_norm": 0.8085854053497314, "learning_rate": 0.00017462081924041523, "loss": 0.2252, "step": 5450 }, { "epoch": 114.94736842105263, "grad_norm": 1.080230712890625, 
"learning_rate": 0.00017453175692872436, "loss": 0.2177, "step": 5460 }, { "epoch": 115.15789473684211, "grad_norm": 0.6058759093284607, "learning_rate": 0.00017444256141669907, "loss": 0.1986, "step": 5470 }, { "epoch": 115.36842105263158, "grad_norm": 0.9204846024513245, "learning_rate": 0.00017435323286374627, "loss": 0.1958, "step": 5480 }, { "epoch": 115.57894736842105, "grad_norm": 0.781603991985321, "learning_rate": 0.00017426377142951052, "loss": 0.2055, "step": 5490 }, { "epoch": 115.78947368421052, "grad_norm": 0.9358359575271606, "learning_rate": 0.00017417417727387394, "loss": 0.2253, "step": 5500 }, { "epoch": 116.0, "grad_norm": 0.8498882055282593, "learning_rate": 0.00017408445055695578, "loss": 0.1943, "step": 5510 }, { "epoch": 116.21052631578948, "grad_norm": 0.8480699062347412, "learning_rate": 0.00017399459143911225, "loss": 0.2052, "step": 5520 }, { "epoch": 116.42105263157895, "grad_norm": 1.2214350700378418, "learning_rate": 0.00017390460008093618, "loss": 0.2145, "step": 5530 }, { "epoch": 116.63157894736842, "grad_norm": 0.7576544880867004, "learning_rate": 0.00017381447664325666, "loss": 0.2106, "step": 5540 }, { "epoch": 116.84210526315789, "grad_norm": 0.7696592211723328, "learning_rate": 0.00017372422128713891, "loss": 0.1956, "step": 5550 }, { "epoch": 117.05263157894737, "grad_norm": 0.787528395652771, "learning_rate": 0.00017364287880967868, "loss": 0.2027, "step": 5560 }, { "epoch": 117.26315789473684, "grad_norm": 0.7447987198829651, "learning_rate": 0.00017355237325310713, "loss": 0.1893, "step": 5570 }, { "epoch": 117.47368421052632, "grad_norm": 0.8453590273857117, "learning_rate": 0.00017346173624651856, "loss": 0.2099, "step": 5580 }, { "epoch": 117.6842105263158, "grad_norm": 0.7510943412780762, "learning_rate": 0.0001733709679518961, "loss": 0.2025, "step": 5590 }, { "epoch": 117.89473684210526, "grad_norm": 0.8954883813858032, "learning_rate": 0.00017328006853145739, "loss": 0.2046, "step": 5600 }, { "epoch": 
118.10526315789474, "grad_norm": 0.9460748434066772, "learning_rate": 0.00017318903814765436, "loss": 0.215, "step": 5610 }, { "epoch": 118.3157894736842, "grad_norm": 0.9654355645179749, "learning_rate": 0.00017309787696317315, "loss": 0.2006, "step": 5620 }, { "epoch": 118.52631578947368, "grad_norm": 0.7358613014221191, "learning_rate": 0.00017300658514093353, "loss": 0.204, "step": 5630 }, { "epoch": 118.73684210526316, "grad_norm": 0.6271918416023254, "learning_rate": 0.00017291516284408882, "loss": 0.2058, "step": 5640 }, { "epoch": 118.94736842105263, "grad_norm": 0.7335479259490967, "learning_rate": 0.00017282361023602546, "loss": 0.2023, "step": 5650 }, { "epoch": 119.15789473684211, "grad_norm": 0.9404072761535645, "learning_rate": 0.0001727319274803628, "loss": 0.1997, "step": 5660 }, { "epoch": 119.36842105263158, "grad_norm": 0.7514150738716125, "learning_rate": 0.00017264011474095282, "loss": 0.1994, "step": 5670 }, { "epoch": 119.57894736842105, "grad_norm": 0.9548558592796326, "learning_rate": 0.00017254817218187977, "loss": 0.2048, "step": 5680 }, { "epoch": 119.78947368421052, "grad_norm": 1.1437008380889893, "learning_rate": 0.00017245609996745985, "loss": 0.2062, "step": 5690 }, { "epoch": 120.0, "grad_norm": 0.9660143256187439, "learning_rate": 0.00017236389826224115, "loss": 0.2125, "step": 5700 }, { "epoch": 120.21052631578948, "grad_norm": 0.6402197480201721, "learning_rate": 0.000172271567231003, "loss": 0.1845, "step": 5710 }, { "epoch": 120.42105263157895, "grad_norm": 0.9553530216217041, "learning_rate": 0.00017217910703875588, "loss": 0.2018, "step": 5720 }, { "epoch": 120.63157894736842, "grad_norm": 1.160772681236267, "learning_rate": 0.00017208651785074122, "loss": 0.2186, "step": 5730 }, { "epoch": 120.84210526315789, "grad_norm": 0.9803076982498169, "learning_rate": 0.00017199379983243087, "loss": 0.2133, "step": 5740 }, { "epoch": 121.05263157894737, "grad_norm": 0.7129189968109131, "learning_rate": 0.00017190095314952697, "loss": 
0.2165, "step": 5750 }, { "epoch": 121.26315789473684, "grad_norm": 0.8149412870407104, "learning_rate": 0.0001718079779679616, "loss": 0.2117, "step": 5760 }, { "epoch": 121.47368421052632, "grad_norm": 0.7495967149734497, "learning_rate": 0.0001717148744538965, "loss": 0.1908, "step": 5770 }, { "epoch": 121.6842105263158, "grad_norm": 1.0567039251327515, "learning_rate": 0.00017162164277372273, "loss": 0.2077, "step": 5780 }, { "epoch": 121.89473684210526, "grad_norm": 1.0251121520996094, "learning_rate": 0.0001715282830940604, "loss": 0.207, "step": 5790 }, { "epoch": 122.10526315789474, "grad_norm": 0.8159797191619873, "learning_rate": 0.00017143479558175844, "loss": 0.2052, "step": 5800 }, { "epoch": 122.3157894736842, "grad_norm": 1.1662620306015015, "learning_rate": 0.00017134118040389415, "loss": 0.2021, "step": 5810 }, { "epoch": 122.52631578947368, "grad_norm": 0.9236595630645752, "learning_rate": 0.00017124743772777308, "loss": 0.1977, "step": 5820 }, { "epoch": 122.73684210526316, "grad_norm": 0.8414731621742249, "learning_rate": 0.00017115356772092857, "loss": 0.2003, "step": 5830 }, { "epoch": 122.94736842105263, "grad_norm": 0.7867684364318848, "learning_rate": 0.0001710595705511215, "loss": 0.2051, "step": 5840 }, { "epoch": 123.15789473684211, "grad_norm": 0.6422597169876099, "learning_rate": 0.00017096544638634008, "loss": 0.1987, "step": 5850 }, { "epoch": 123.36842105263158, "grad_norm": 0.8452603220939636, "learning_rate": 0.00017087119539479947, "loss": 0.2116, "step": 5860 }, { "epoch": 123.57894736842105, "grad_norm": 0.7509437203407288, "learning_rate": 0.0001707768177449415, "loss": 0.2036, "step": 5870 }, { "epoch": 123.78947368421052, "grad_norm": 0.964089572429657, "learning_rate": 0.00017068231360543425, "loss": 0.2029, "step": 5880 }, { "epoch": 124.0, "grad_norm": 1.1077382564544678, "learning_rate": 0.00017058768314517203, "loss": 0.2193, "step": 5890 }, { "epoch": 124.21052631578948, "grad_norm": 1.1068215370178223, 
"learning_rate": 0.00017049292653327478, "loss": 0.1957, "step": 5900 }, { "epoch": 124.42105263157895, "grad_norm": 0.6014849543571472, "learning_rate": 0.0001703980439390879, "loss": 0.1914, "step": 5910 }, { "epoch": 124.63157894736842, "grad_norm": 0.895735502243042, "learning_rate": 0.000170303035532182, "loss": 0.2051, "step": 5920 }, { "epoch": 124.84210526315789, "grad_norm": 0.758565366268158, "learning_rate": 0.00017020790148235252, "loss": 0.2218, "step": 5930 }, { "epoch": 125.05263157894737, "grad_norm": 1.467057228088379, "learning_rate": 0.00017011264195961937, "loss": 0.2013, "step": 5940 }, { "epoch": 125.26315789473684, "grad_norm": 0.9302055239677429, "learning_rate": 0.00017001725713422684, "loss": 0.2045, "step": 5950 }, { "epoch": 125.47368421052632, "grad_norm": 0.963485598564148, "learning_rate": 0.00016992174717664305, "loss": 0.1982, "step": 5960 }, { "epoch": 125.6842105263158, "grad_norm": 0.7401504516601562, "learning_rate": 0.00016982611225755978, "loss": 0.2028, "step": 5970 }, { "epoch": 125.89473684210526, "grad_norm": 0.938339352607727, "learning_rate": 0.00016973035254789213, "loss": 0.2072, "step": 5980 }, { "epoch": 126.10526315789474, "grad_norm": 0.8344370722770691, "learning_rate": 0.00016963446821877825, "loss": 0.2026, "step": 5990 }, { "epoch": 126.3157894736842, "grad_norm": 0.695802628993988, "learning_rate": 0.00016953845944157894, "loss": 0.1893, "step": 6000 }, { "epoch": 126.52631578947368, "grad_norm": 0.7851212620735168, "learning_rate": 0.00016944232638787748, "loss": 0.1912, "step": 6010 }, { "epoch": 126.73684210526316, "grad_norm": 0.762319803237915, "learning_rate": 0.00016934606922947923, "loss": 0.2068, "step": 6020 }, { "epoch": 126.94736842105263, "grad_norm": 0.9079723358154297, "learning_rate": 0.0001692496881384113, "loss": 0.2096, "step": 6030 }, { "epoch": 127.15789473684211, "grad_norm": 0.7185937166213989, "learning_rate": 0.00016915318328692243, "loss": 0.2039, "step": 6040 }, { "epoch": 
127.36842105263158, "grad_norm": 0.9591910243034363, "learning_rate": 0.0001690565548474823, "loss": 0.2109, "step": 6050 }, { "epoch": 127.57894736842105, "grad_norm": 0.8179908394813538, "learning_rate": 0.0001689598029927817, "loss": 0.1946, "step": 6060 }, { "epoch": 127.78947368421052, "grad_norm": 0.9175289273262024, "learning_rate": 0.00016886292789573183, "loss": 0.2043, "step": 6070 }, { "epoch": 128.0, "grad_norm": 0.7145066857337952, "learning_rate": 0.0001687659297294642, "loss": 0.2177, "step": 6080 }, { "epoch": 128.21052631578948, "grad_norm": 0.6293530464172363, "learning_rate": 0.0001686688086673303, "loss": 0.2011, "step": 6090 }, { "epoch": 128.42105263157896, "grad_norm": 0.8945363759994507, "learning_rate": 0.0001685715648829012, "loss": 0.2044, "step": 6100 }, { "epoch": 128.6315789473684, "grad_norm": 0.9259580373764038, "learning_rate": 0.00016847419854996724, "loss": 0.2128, "step": 6110 }, { "epoch": 128.8421052631579, "grad_norm": 1.0482829809188843, "learning_rate": 0.00016837670984253794, "loss": 0.2029, "step": 6120 }, { "epoch": 129.05263157894737, "grad_norm": 0.7181470990180969, "learning_rate": 0.0001682790989348414, "loss": 0.2024, "step": 6130 }, { "epoch": 129.26315789473685, "grad_norm": 0.9605810642242432, "learning_rate": 0.00016818136600132416, "loss": 0.2086, "step": 6140 }, { "epoch": 129.47368421052633, "grad_norm": 0.651897668838501, "learning_rate": 0.00016808351121665071, "loss": 0.1877, "step": 6150 }, { "epoch": 129.68421052631578, "grad_norm": 0.7484719753265381, "learning_rate": 0.00016798553475570356, "loss": 0.2019, "step": 6160 }, { "epoch": 129.89473684210526, "grad_norm": 0.7095634937286377, "learning_rate": 0.0001678874367935824, "loss": 0.2328, "step": 6170 }, { "epoch": 130.10526315789474, "grad_norm": 1.094728946685791, "learning_rate": 0.0001677892175056043, "loss": 0.2077, "step": 6180 }, { "epoch": 130.31578947368422, "grad_norm": 0.7821156978607178, "learning_rate": 0.00016769087706730302, "loss": 
0.2012, "step": 6190 }, { "epoch": 130.52631578947367, "grad_norm": 0.6422997117042542, "learning_rate": 0.00016759241565442884, "loss": 0.2012, "step": 6200 }, { "epoch": 130.73684210526315, "grad_norm": 0.7488658428192139, "learning_rate": 0.00016749383344294834, "loss": 0.199, "step": 6210 }, { "epoch": 130.94736842105263, "grad_norm": 0.7575473785400391, "learning_rate": 0.00016739513060904382, "loss": 0.1988, "step": 6220 }, { "epoch": 131.1578947368421, "grad_norm": 0.6518003344535828, "learning_rate": 0.0001672963073291133, "loss": 0.1945, "step": 6230 }, { "epoch": 131.3684210526316, "grad_norm": 0.6180778741836548, "learning_rate": 0.00016719736377977, "loss": 0.1934, "step": 6240 }, { "epoch": 131.57894736842104, "grad_norm": 0.5813746452331543, "learning_rate": 0.00016709830013784212, "loss": 0.2006, "step": 6250 }, { "epoch": 131.78947368421052, "grad_norm": 0.880317211151123, "learning_rate": 0.00016699911658037237, "loss": 0.2136, "step": 6260 }, { "epoch": 132.0, "grad_norm": 0.804076075553894, "learning_rate": 0.00016689981328461793, "loss": 0.2092, "step": 6270 }, { "epoch": 132.21052631578948, "grad_norm": 0.7459381818771362, "learning_rate": 0.00016680039042804982, "loss": 0.194, "step": 6280 }, { "epoch": 132.42105263157896, "grad_norm": 0.6983547806739807, "learning_rate": 0.00016670084818835287, "loss": 0.1926, "step": 6290 }, { "epoch": 132.6315789473684, "grad_norm": 0.7421784400939941, "learning_rate": 0.00016660118674342517, "loss": 0.1958, "step": 6300 }, { "epoch": 132.8421052631579, "grad_norm": 1.2000913619995117, "learning_rate": 0.0001665014062713779, "loss": 0.2105, "step": 6310 }, { "epoch": 133.05263157894737, "grad_norm": 0.6287597417831421, "learning_rate": 0.0001664015069505349, "loss": 0.2012, "step": 6320 }, { "epoch": 133.26315789473685, "grad_norm": 0.8468930125236511, "learning_rate": 0.0001663014889594325, "loss": 0.203, "step": 6330 }, { "epoch": 133.47368421052633, "grad_norm": 0.9103506207466125, "learning_rate": 
0.00016620135247681902, "loss": 0.1935, "step": 6340 }, { "epoch": 133.68421052631578, "grad_norm": 0.619668185710907, "learning_rate": 0.00016610109768165464, "loss": 0.2187, "step": 6350 }, { "epoch": 133.89473684210526, "grad_norm": 0.816078782081604, "learning_rate": 0.00016600072475311096, "loss": 0.1962, "step": 6360 }, { "epoch": 134.10526315789474, "grad_norm": 0.721704363822937, "learning_rate": 0.00016590023387057055, "loss": 0.2024, "step": 6370 }, { "epoch": 134.31578947368422, "grad_norm": 0.913533091545105, "learning_rate": 0.00016579962521362708, "loss": 0.1888, "step": 6380 }, { "epoch": 134.52631578947367, "grad_norm": 0.8451491594314575, "learning_rate": 0.00016569889896208436, "loss": 0.2001, "step": 6390 }, { "epoch": 134.73684210526315, "grad_norm": 0.7206440567970276, "learning_rate": 0.00016559805529595668, "loss": 0.2135, "step": 6400 }, { "epoch": 134.94736842105263, "grad_norm": 0.5913563966751099, "learning_rate": 0.00016549709439546794, "loss": 0.2041, "step": 6410 }, { "epoch": 135.1578947368421, "grad_norm": 1.0242068767547607, "learning_rate": 0.00016539601644105167, "loss": 0.2064, "step": 6420 }, { "epoch": 135.3684210526316, "grad_norm": 0.7297083735466003, "learning_rate": 0.00016529482161335054, "loss": 0.1989, "step": 6430 }, { "epoch": 135.57894736842104, "grad_norm": 0.9045604467391968, "learning_rate": 0.00016519351009321612, "loss": 0.1974, "step": 6440 }, { "epoch": 135.78947368421052, "grad_norm": 0.6610522866249084, "learning_rate": 0.00016509208206170857, "loss": 0.1924, "step": 6450 }, { "epoch": 136.0, "grad_norm": 0.9059966802597046, "learning_rate": 0.00016499053770009618, "loss": 0.2102, "step": 6460 }, { "epoch": 136.21052631578948, "grad_norm": 0.7158136367797852, "learning_rate": 0.0001648888771898552, "loss": 0.1978, "step": 6470 }, { "epoch": 136.42105263157896, "grad_norm": 0.8900578618049622, "learning_rate": 0.00016478710071266944, "loss": 0.2019, "step": 6480 }, { "epoch": 136.6315789473684, "grad_norm": 
0.7282655239105225, "learning_rate": 0.00016468520845042996, "loss": 0.2081, "step": 6490 }, { "epoch": 136.8421052631579, "grad_norm": 0.6675572395324707, "learning_rate": 0.0001645832005852348, "loss": 0.1943, "step": 6500 }, { "epoch": 137.05263157894737, "grad_norm": 0.6044570803642273, "learning_rate": 0.0001644810772993885, "loss": 0.1966, "step": 6510 }, { "epoch": 137.26315789473685, "grad_norm": 0.7982662320137024, "learning_rate": 0.00016437883877540194, "loss": 0.204, "step": 6520 }, { "epoch": 137.47368421052633, "grad_norm": 0.6893635392189026, "learning_rate": 0.00016427648519599196, "loss": 0.1854, "step": 6530 }, { "epoch": 137.68421052631578, "grad_norm": 0.7114787101745605, "learning_rate": 0.000164174016744081, "loss": 0.2067, "step": 6540 }, { "epoch": 137.89473684210526, "grad_norm": 0.8275425434112549, "learning_rate": 0.00016407143360279682, "loss": 0.2046, "step": 6550 }, { "epoch": 138.10526315789474, "grad_norm": 0.6906023025512695, "learning_rate": 0.00016396873595547206, "loss": 0.2047, "step": 6560 }, { "epoch": 138.31578947368422, "grad_norm": 0.7150043845176697, "learning_rate": 0.00016386592398564412, "loss": 0.1906, "step": 6570 }, { "epoch": 138.52631578947367, "grad_norm": 0.6663026213645935, "learning_rate": 0.00016376299787705464, "loss": 0.2021, "step": 6580 }, { "epoch": 138.73684210526315, "grad_norm": 0.9189106822013855, "learning_rate": 0.00016365995781364925, "loss": 0.2195, "step": 6590 }, { "epoch": 138.94736842105263, "grad_norm": 0.7078124284744263, "learning_rate": 0.0001635568039795773, "loss": 0.1993, "step": 6600 }, { "epoch": 139.1578947368421, "grad_norm": 0.9718939065933228, "learning_rate": 0.00016345353655919137, "loss": 0.1884, "step": 6610 }, { "epoch": 139.3684210526316, "grad_norm": 1.3638027906417847, "learning_rate": 0.0001633501557370471, "loss": 0.1878, "step": 6620 }, { "epoch": 139.57894736842104, "grad_norm": 0.8305347561836243, "learning_rate": 0.00016324666169790283, "loss": 0.1939, "step": 6630 
}, { "epoch": 139.78947368421052, "grad_norm": 1.2636075019836426, "learning_rate": 0.0001631430546267191, "loss": 0.2179, "step": 6640 }, { "epoch": 140.0, "grad_norm": 1.4744796752929688, "learning_rate": 0.0001630393347086586, "loss": 0.2319, "step": 6650 }, { "epoch": 140.21052631578948, "grad_norm": 0.9998826384544373, "learning_rate": 0.0001629355021290856, "loss": 0.1919, "step": 6660 }, { "epoch": 140.42105263157896, "grad_norm": 0.6231628060340881, "learning_rate": 0.0001628315570735658, "loss": 0.1977, "step": 6670 }, { "epoch": 140.6315789473684, "grad_norm": 0.6121691465377808, "learning_rate": 0.00016272749972786587, "loss": 0.1989, "step": 6680 }, { "epoch": 140.8421052631579, "grad_norm": 0.7017862200737, "learning_rate": 0.00016262333027795313, "loss": 0.2033, "step": 6690 }, { "epoch": 141.05263157894737, "grad_norm": 0.6834351420402527, "learning_rate": 0.0001625190489099953, "loss": 0.2034, "step": 6700 }, { "epoch": 141.26315789473685, "grad_norm": 0.8252644538879395, "learning_rate": 0.00016241465581036009, "loss": 0.1984, "step": 6710 }, { "epoch": 141.47368421052633, "grad_norm": 0.830450713634491, "learning_rate": 0.00016231015116561487, "loss": 0.2015, "step": 6720 }, { "epoch": 141.68421052631578, "grad_norm": 0.644212007522583, "learning_rate": 0.0001622055351625264, "loss": 0.1998, "step": 6730 }, { "epoch": 141.89473684210526, "grad_norm": 0.8591092824935913, "learning_rate": 0.00016210080798806042, "loss": 0.2094, "step": 6740 }, { "epoch": 142.10526315789474, "grad_norm": 0.7406374216079712, "learning_rate": 0.00016199596982938142, "loss": 0.2068, "step": 6750 }, { "epoch": 142.31578947368422, "grad_norm": 0.7162677645683289, "learning_rate": 0.00016189102087385218, "loss": 0.2031, "step": 6760 }, { "epoch": 142.52631578947367, "grad_norm": 0.7859647870063782, "learning_rate": 0.00016178596130903344, "loss": 0.1974, "step": 6770 }, { "epoch": 142.73684210526315, "grad_norm": 0.6114941239356995, "learning_rate": 0.00016168079132268374, 
"loss": 0.1965, "step": 6780 }, { "epoch": 142.94736842105263, "grad_norm": 0.6999493837356567, "learning_rate": 0.00016157551110275887, "loss": 0.2135, "step": 6790 }, { "epoch": 143.1578947368421, "grad_norm": 0.6540025472640991, "learning_rate": 0.00016147012083741168, "loss": 0.1862, "step": 6800 }, { "epoch": 143.3684210526316, "grad_norm": 0.7767214775085449, "learning_rate": 0.0001613646207149916, "loss": 0.2028, "step": 6810 }, { "epoch": 143.57894736842104, "grad_norm": 0.8844462633132935, "learning_rate": 0.00016125901092404457, "loss": 0.2026, "step": 6820 }, { "epoch": 143.78947368421052, "grad_norm": 0.6872785687446594, "learning_rate": 0.00016115329165331227, "loss": 0.206, "step": 6830 }, { "epoch": 144.0, "grad_norm": 1.2136003971099854, "learning_rate": 0.0001610474630917323, "loss": 0.2072, "step": 6840 }, { "epoch": 144.21052631578948, "grad_norm": 0.7713127136230469, "learning_rate": 0.00016094152542843733, "loss": 0.1923, "step": 6850 }, { "epoch": 144.42105263157896, "grad_norm": 0.7771678566932678, "learning_rate": 0.0001608354788527553, "loss": 0.2069, "step": 6860 }, { "epoch": 144.6315789473684, "grad_norm": 0.7056291699409485, "learning_rate": 0.0001607293235542085, "loss": 0.1953, "step": 6870 }, { "epoch": 144.8421052631579, "grad_norm": 0.9615933299064636, "learning_rate": 0.00016062305972251373, "loss": 0.2048, "step": 6880 }, { "epoch": 145.05263157894737, "grad_norm": 0.7184662222862244, "learning_rate": 0.00016051668754758167, "loss": 0.2147, "step": 6890 }, { "epoch": 145.26315789473685, "grad_norm": 0.7650150656700134, "learning_rate": 0.00016041020721951666, "loss": 0.1993, "step": 6900 }, { "epoch": 145.47368421052633, "grad_norm": 0.6379842162132263, "learning_rate": 0.00016030361892861622, "loss": 0.2053, "step": 6910 }, { "epoch": 145.68421052631578, "grad_norm": 0.7708711624145508, "learning_rate": 0.00016019692286537107, "loss": 0.1985, "step": 6920 }, { "epoch": 145.89473684210526, "grad_norm": 0.7167929410934448, 
"learning_rate": 0.00016009011922046425, "loss": 0.1991, "step": 6930 }, { "epoch": 146.10526315789474, "grad_norm": 0.7796617746353149, "learning_rate": 0.00015998320818477125, "loss": 0.2039, "step": 6940 }, { "epoch": 146.31578947368422, "grad_norm": 0.6694478988647461, "learning_rate": 0.00015987618994935937, "loss": 0.1863, "step": 6950 }, { "epoch": 146.52631578947367, "grad_norm": 0.7410951256752014, "learning_rate": 0.0001597690647054876, "loss": 0.2019, "step": 6960 }, { "epoch": 146.73684210526315, "grad_norm": 0.6894869804382324, "learning_rate": 0.0001596618326446061, "loss": 0.187, "step": 6970 }, { "epoch": 146.94736842105263, "grad_norm": 0.6591702699661255, "learning_rate": 0.00015955449395835597, "loss": 0.208, "step": 6980 }, { "epoch": 147.1578947368421, "grad_norm": 0.5875535607337952, "learning_rate": 0.00015944704883856883, "loss": 0.2059, "step": 6990 }, { "epoch": 147.3684210526316, "grad_norm": 0.7606227993965149, "learning_rate": 0.00015933949747726653, "loss": 0.2009, "step": 7000 }, { "epoch": 147.57894736842104, "grad_norm": 0.8052495718002319, "learning_rate": 0.00015923184006666076, "loss": 0.2118, "step": 7010 }, { "epoch": 147.78947368421052, "grad_norm": 0.6055042743682861, "learning_rate": 0.00015912407679915283, "loss": 0.1936, "step": 7020 }, { "epoch": 148.0, "grad_norm": 0.6587730050086975, "learning_rate": 0.00015901620786733312, "loss": 0.1942, "step": 7030 }, { "epoch": 148.21052631578948, "grad_norm": 0.6731119751930237, "learning_rate": 0.00015890823346398095, "loss": 0.1803, "step": 7040 }, { "epoch": 148.42105263157896, "grad_norm": 0.7457992434501648, "learning_rate": 0.00015880015378206408, "loss": 0.2079, "step": 7050 }, { "epoch": 148.6315789473684, "grad_norm": 0.7178442478179932, "learning_rate": 0.00015869196901473838, "loss": 0.1932, "step": 7060 }, { "epoch": 148.8421052631579, "grad_norm": 1.3482714891433716, "learning_rate": 0.00015858367935534754, "loss": 0.2095, "step": 7070 }, { "epoch": 
149.05263157894737, "grad_norm": 0.669923722743988, "learning_rate": 0.00015847528499742287, "loss": 0.2085, "step": 7080 }, { "epoch": 149.26315789473685, "grad_norm": 0.6748956441879272, "learning_rate": 0.00015836678613468256, "loss": 0.1856, "step": 7090 }, { "epoch": 149.47368421052633, "grad_norm": 0.7660018801689148, "learning_rate": 0.0001582581829610317, "loss": 0.1958, "step": 7100 }, { "epoch": 149.68421052631578, "grad_norm": 0.8634538650512695, "learning_rate": 0.00015814947567056178, "loss": 0.2109, "step": 7110 }, { "epoch": 149.89473684210526, "grad_norm": 0.713979959487915, "learning_rate": 0.0001580406644575503, "loss": 0.2067, "step": 7120 }, { "epoch": 150.10526315789474, "grad_norm": 0.5765550136566162, "learning_rate": 0.00015793174951646063, "loss": 0.1914, "step": 7130 }, { "epoch": 150.31578947368422, "grad_norm": 0.8116117119789124, "learning_rate": 0.00015782273104194137, "loss": 0.189, "step": 7140 }, { "epoch": 150.52631578947367, "grad_norm": 0.7519946694374084, "learning_rate": 0.00015771360922882624, "loss": 0.202, "step": 7150 }, { "epoch": 150.73684210526315, "grad_norm": 0.7752880454063416, "learning_rate": 0.0001576043842721336, "loss": 0.2098, "step": 7160 }, { "epoch": 150.94736842105263, "grad_norm": 0.726283073425293, "learning_rate": 0.0001574950563670661, "loss": 0.1998, "step": 7170 }, { "epoch": 151.1578947368421, "grad_norm": 0.7491242289543152, "learning_rate": 0.00015738562570901055, "loss": 0.2044, "step": 7180 }, { "epoch": 151.3684210526316, "grad_norm": 0.9558908343315125, "learning_rate": 0.00015727609249353722, "loss": 0.2053, "step": 7190 }, { "epoch": 151.57894736842104, "grad_norm": 0.6602309346199036, "learning_rate": 0.00015716645691639966, "loss": 0.1879, "step": 7200 }, { "epoch": 151.78947368421052, "grad_norm": 0.6388182640075684, "learning_rate": 0.00015705671917353456, "loss": 0.2012, "step": 7210 }, { "epoch": 152.0, "grad_norm": 0.653289258480072, "learning_rate": 0.00015694687946106093, "loss": 
0.1997, "step": 7220 }, { "epoch": 152.21052631578948, "grad_norm": 0.693202555179596, "learning_rate": 0.00015683693797528022, "loss": 0.1867, "step": 7230 }, { "epoch": 152.42105263157896, "grad_norm": 0.8159785866737366, "learning_rate": 0.00015672689491267567, "loss": 0.1981, "step": 7240 }, { "epoch": 152.6315789473684, "grad_norm": 0.604954183101654, "learning_rate": 0.00015661675046991206, "loss": 0.2078, "step": 7250 }, { "epoch": 152.8421052631579, "grad_norm": 0.6773616075515747, "learning_rate": 0.0001565065048438354, "loss": 0.2006, "step": 7260 }, { "epoch": 153.05263157894737, "grad_norm": 0.6346316933631897, "learning_rate": 0.0001563961582314725, "loss": 0.1864, "step": 7270 }, { "epoch": 153.26315789473685, "grad_norm": 0.6382675170898438, "learning_rate": 0.00015628571083003062, "loss": 0.2017, "step": 7280 }, { "epoch": 153.47368421052633, "grad_norm": 0.7643656134605408, "learning_rate": 0.00015617516283689722, "loss": 0.1972, "step": 7290 }, { "epoch": 153.68421052631578, "grad_norm": 0.7513641715049744, "learning_rate": 0.0001560645144496394, "loss": 0.2053, "step": 7300 }, { "epoch": 153.89473684210526, "grad_norm": 0.5776196122169495, "learning_rate": 0.00015595376586600388, "loss": 0.1991, "step": 7310 }, { "epoch": 154.10526315789474, "grad_norm": 0.743189811706543, "learning_rate": 0.00015584291728391625, "loss": 0.1923, "step": 7320 }, { "epoch": 154.31578947368422, "grad_norm": 0.6481560468673706, "learning_rate": 0.00015573196890148093, "loss": 0.201, "step": 7330 }, { "epoch": 154.52631578947367, "grad_norm": 0.5811365842819214, "learning_rate": 0.00015562092091698067, "loss": 0.1926, "step": 7340 }, { "epoch": 154.73684210526315, "grad_norm": 0.8171225190162659, "learning_rate": 0.00015550977352887622, "loss": 0.2118, "step": 7350 }, { "epoch": 154.94736842105263, "grad_norm": 0.6557311415672302, "learning_rate": 0.00015539852693580603, "loss": 0.1983, "step": 7360 }, { "epoch": 155.1578947368421, "grad_norm": 0.6950130462646484, 
"learning_rate": 0.00015528718133658571, "loss": 0.1867, "step": 7370 }, { "epoch": 155.3684210526316, "grad_norm": 0.525143027305603, "learning_rate": 0.00015517573693020798, "loss": 0.1907, "step": 7380 }, { "epoch": 155.57894736842104, "grad_norm": 0.6554726958274841, "learning_rate": 0.00015506419391584202, "loss": 0.2006, "step": 7390 }, { "epoch": 155.78947368421052, "grad_norm": 0.8342046141624451, "learning_rate": 0.00015495255249283328, "loss": 0.2063, "step": 7400 }, { "epoch": 156.0, "grad_norm": 0.7520895004272461, "learning_rate": 0.00015484081286070312, "loss": 0.1961, "step": 7410 }, { "epoch": 156.21052631578948, "grad_norm": 0.7809695601463318, "learning_rate": 0.00015472897521914836, "loss": 0.2021, "step": 7420 }, { "epoch": 156.42105263157896, "grad_norm": 0.6753067374229431, "learning_rate": 0.00015461703976804095, "loss": 0.1963, "step": 7430 }, { "epoch": 156.6315789473684, "grad_norm": 0.9035070538520813, "learning_rate": 0.0001545050067074278, "loss": 0.2009, "step": 7440 }, { "epoch": 156.8421052631579, "grad_norm": 0.6189687252044678, "learning_rate": 0.00015439287623753007, "loss": 0.1901, "step": 7450 }, { "epoch": 157.05263157894737, "grad_norm": 0.6014378666877747, "learning_rate": 0.00015428064855874308, "loss": 0.206, "step": 7460 }, { "epoch": 157.26315789473685, "grad_norm": 0.73002028465271, "learning_rate": 0.00015416832387163596, "loss": 0.2043, "step": 7470 }, { "epoch": 157.47368421052633, "grad_norm": 0.6451747417449951, "learning_rate": 0.0001540559023769511, "loss": 0.1884, "step": 7480 }, { "epoch": 157.68421052631578, "grad_norm": 0.7301135659217834, "learning_rate": 0.00015394338427560396, "loss": 0.1977, "step": 7490 }, { "epoch": 157.89473684210526, "grad_norm": 0.6060409545898438, "learning_rate": 0.0001538307697686826, "loss": 0.2155, "step": 7500 }, { "epoch": 158.10526315789474, "grad_norm": 0.571553647518158, "learning_rate": 0.00015371805905744736, "loss": 0.1992, "step": 7510 }, { "epoch": 158.31578947368422, 
"grad_norm": 0.5945022106170654, "learning_rate": 0.00015360525234333066, "loss": 0.1823, "step": 7520 }, { "epoch": 158.52631578947367, "grad_norm": 0.7339075803756714, "learning_rate": 0.00015349234982793634, "loss": 0.2059, "step": 7530 }, { "epoch": 158.73684210526315, "grad_norm": 0.633240282535553, "learning_rate": 0.00015337935171303948, "loss": 0.1908, "step": 7540 }, { "epoch": 158.94736842105263, "grad_norm": 0.6861400008201599, "learning_rate": 0.00015326625820058612, "loss": 0.2089, "step": 7550 }, { "epoch": 159.1578947368421, "grad_norm": 0.46820777654647827, "learning_rate": 0.00015315306949269255, "loss": 0.1958, "step": 7560 }, { "epoch": 159.3684210526316, "grad_norm": 0.7215929627418518, "learning_rate": 0.00015303978579164545, "loss": 0.1922, "step": 7570 }, { "epoch": 159.57894736842104, "grad_norm": 0.5514410138130188, "learning_rate": 0.00015292640729990117, "loss": 0.1902, "step": 7580 }, { "epoch": 159.78947368421052, "grad_norm": 0.807755172252655, "learning_rate": 0.00015281293422008543, "loss": 0.2058, "step": 7590 }, { "epoch": 160.0, "grad_norm": 0.8933700323104858, "learning_rate": 0.00015269936675499306, "loss": 0.2119, "step": 7600 }, { "epoch": 160.21052631578948, "grad_norm": 0.7081916332244873, "learning_rate": 0.00015258570510758745, "loss": 0.195, "step": 7610 }, { "epoch": 160.42105263157896, "grad_norm": 0.6982260942459106, "learning_rate": 0.00015247194948100047, "loss": 0.1978, "step": 7620 }, { "epoch": 160.6315789473684, "grad_norm": 0.7810565233230591, "learning_rate": 0.00015235810007853179, "loss": 0.1982, "step": 7630 }, { "epoch": 160.8421052631579, "grad_norm": 0.679720938205719, "learning_rate": 0.00015224415710364883, "loss": 0.2144, "step": 7640 }, { "epoch": 161.05263157894737, "grad_norm": 0.6578332185745239, "learning_rate": 0.00015213012075998615, "loss": 0.1944, "step": 7650 }, { "epoch": 161.26315789473685, "grad_norm": 0.7118348479270935, "learning_rate": 0.00015201599125134517, "loss": 0.2015, "step": 
7660 }, { "epoch": 161.47368421052633, "grad_norm": 0.5404331684112549, "learning_rate": 0.00015190176878169384, "loss": 0.1995, "step": 7670 }, { "epoch": 161.68421052631578, "grad_norm": 0.685939371585846, "learning_rate": 0.0001517874535551662, "loss": 0.2089, "step": 7680 }, { "epoch": 161.89473684210526, "grad_norm": 0.6666051745414734, "learning_rate": 0.0001516730457760621, "loss": 0.1867, "step": 7690 }, { "epoch": 162.10526315789474, "grad_norm": 0.7303677797317505, "learning_rate": 0.0001515585456488468, "loss": 0.1986, "step": 7700 }, { "epoch": 162.31578947368422, "grad_norm": 0.7837722301483154, "learning_rate": 0.00015144395337815064, "loss": 0.1879, "step": 7710 }, { "epoch": 162.52631578947367, "grad_norm": 0.6972552537918091, "learning_rate": 0.00015132926916876856, "loss": 0.187, "step": 7720 }, { "epoch": 162.73684210526315, "grad_norm": 0.7570910453796387, "learning_rate": 0.0001512144932256598, "loss": 0.216, "step": 7730 }, { "epoch": 162.94736842105263, "grad_norm": 0.5790467858314514, "learning_rate": 0.0001510996257539476, "loss": 0.2098, "step": 7740 }, { "epoch": 163.1578947368421, "grad_norm": 0.6272661089897156, "learning_rate": 0.0001509846669589188, "loss": 0.1912, "step": 7750 }, { "epoch": 163.3684210526316, "grad_norm": 0.9367342591285706, "learning_rate": 0.0001508696170460233, "loss": 0.1912, "step": 7760 }, { "epoch": 163.57894736842104, "grad_norm": 0.5576528310775757, "learning_rate": 0.00015075447622087408, "loss": 0.194, "step": 7770 }, { "epoch": 163.78947368421052, "grad_norm": 0.7530799508094788, "learning_rate": 0.0001506392446892464, "loss": 0.2107, "step": 7780 }, { "epoch": 164.0, "grad_norm": 0.7183924913406372, "learning_rate": 0.00015052392265707767, "loss": 0.1968, "step": 7790 }, { "epoch": 164.21052631578948, "grad_norm": 0.5782009959220886, "learning_rate": 0.0001504085103304671, "loss": 0.1872, "step": 7800 }, { "epoch": 164.42105263157896, "grad_norm": 0.7883465886116028, "learning_rate": 
0.0001502930079156752, "loss": 0.1902, "step": 7810 }, { "epoch": 164.6315789473684, "grad_norm": 0.6550580263137817, "learning_rate": 0.00015017741561912352, "loss": 0.198, "step": 7820 }, { "epoch": 164.8421052631579, "grad_norm": 0.6090496182441711, "learning_rate": 0.00015006173364739427, "loss": 0.1965, "step": 7830 }, { "epoch": 165.05263157894737, "grad_norm": 0.6356409192085266, "learning_rate": 0.00014994596220722987, "loss": 0.2125, "step": 7840 }, { "epoch": 165.26315789473685, "grad_norm": 0.5657672882080078, "learning_rate": 0.00014983010150553262, "loss": 0.2024, "step": 7850 }, { "epoch": 165.47368421052633, "grad_norm": 0.7943452596664429, "learning_rate": 0.00014971415174936444, "loss": 0.1978, "step": 7860 }, { "epoch": 165.68421052631578, "grad_norm": 0.5848918557167053, "learning_rate": 0.00014959811314594628, "loss": 0.1951, "step": 7870 }, { "epoch": 165.89473684210526, "grad_norm": 0.6190322041511536, "learning_rate": 0.000149481985902658, "loss": 0.1859, "step": 7880 }, { "epoch": 166.10526315789474, "grad_norm": 0.6574214100837708, "learning_rate": 0.00014936577022703777, "loss": 0.2012, "step": 7890 }, { "epoch": 166.31578947368422, "grad_norm": 0.6191821694374084, "learning_rate": 0.00014924946632678186, "loss": 0.1964, "step": 7900 }, { "epoch": 166.52631578947367, "grad_norm": 0.5656545162200928, "learning_rate": 0.0001491330744097442, "loss": 0.1917, "step": 7910 }, { "epoch": 166.73684210526315, "grad_norm": 0.6235769987106323, "learning_rate": 0.00014901659468393602, "loss": 0.1846, "step": 7920 }, { "epoch": 166.94736842105263, "grad_norm": 0.753472626209259, "learning_rate": 0.00014890002735752547, "loss": 0.2266, "step": 7930 }, { "epoch": 167.1578947368421, "grad_norm": 0.6449782252311707, "learning_rate": 0.00014878337263883728, "loss": 0.1922, "step": 7940 }, { "epoch": 167.3684210526316, "grad_norm": 0.7990699410438538, "learning_rate": 0.00014866663073635232, "loss": 0.1932, "step": 7950 }, { "epoch": 167.57894736842104, 
"grad_norm": 0.7350342273712158, "learning_rate": 0.00014854980185870733, "loss": 0.2028, "step": 7960 }, { "epoch": 167.78947368421052, "grad_norm": 0.5282038450241089, "learning_rate": 0.00014843288621469442, "loss": 0.2125, "step": 7970 }, { "epoch": 168.0, "grad_norm": 0.6253044605255127, "learning_rate": 0.00014831588401326083, "loss": 0.1925, "step": 7980 }, { "epoch": 168.21052631578948, "grad_norm": 0.8048043847084045, "learning_rate": 0.00014819879546350842, "loss": 0.2032, "step": 7990 }, { "epoch": 168.42105263157896, "grad_norm": 0.6136367917060852, "learning_rate": 0.00014808162077469347, "loss": 0.1854, "step": 8000 }, { "epoch": 168.6315789473684, "grad_norm": 0.6089524626731873, "learning_rate": 0.00014796436015622618, "loss": 0.1951, "step": 8010 }, { "epoch": 168.8421052631579, "grad_norm": 0.6197385787963867, "learning_rate": 0.00014784701381767018, "loss": 0.2084, "step": 8020 }, { "epoch": 169.05263157894737, "grad_norm": 0.5597293376922607, "learning_rate": 0.00014772958196874246, "loss": 0.1921, "step": 8030 }, { "epoch": 169.26315789473685, "grad_norm": 0.5773986577987671, "learning_rate": 0.00014761206481931282, "loss": 0.208, "step": 8040 }, { "epoch": 169.47368421052633, "grad_norm": 0.6238757967948914, "learning_rate": 0.00014749446257940335, "loss": 0.1902, "step": 8050 }, { "epoch": 169.68421052631578, "grad_norm": 0.6683410406112671, "learning_rate": 0.00014737677545918843, "loss": 0.187, "step": 8060 }, { "epoch": 169.89473684210526, "grad_norm": 0.8527116179466248, "learning_rate": 0.000147259003668994, "loss": 0.2193, "step": 8070 }, { "epoch": 170.10526315789474, "grad_norm": 0.6407440900802612, "learning_rate": 0.00014714114741929728, "loss": 0.1941, "step": 8080 }, { "epoch": 170.31578947368422, "grad_norm": 0.6082753539085388, "learning_rate": 0.00014702320692072657, "loss": 0.2025, "step": 8090 }, { "epoch": 170.52631578947367, "grad_norm": 0.7939004302024841, "learning_rate": 0.00014690518238406064, "loss": 0.1986, "step": 
8100 }, { "epoch": 170.73684210526315, "grad_norm": 0.5582263469696045, "learning_rate": 0.00014678707402022845, "loss": 0.1946, "step": 8110 }, { "epoch": 170.94736842105263, "grad_norm": 0.5746979117393494, "learning_rate": 0.00014666888204030885, "loss": 0.2034, "step": 8120 }, { "epoch": 171.1578947368421, "grad_norm": 0.825444757938385, "learning_rate": 0.00014655060665553005, "loss": 0.1941, "step": 8130 }, { "epoch": 171.3684210526316, "grad_norm": 0.9377308487892151, "learning_rate": 0.0001464322480772693, "loss": 0.2003, "step": 8140 }, { "epoch": 171.57894736842104, "grad_norm": 0.6175430417060852, "learning_rate": 0.0001463138065170526, "loss": 0.1862, "step": 8150 }, { "epoch": 171.78947368421052, "grad_norm": 0.6299969553947449, "learning_rate": 0.00014619528218655424, "loss": 0.2055, "step": 8160 }, { "epoch": 172.0, "grad_norm": 0.8263299465179443, "learning_rate": 0.00014607667529759635, "loss": 0.2038, "step": 8170 }, { "epoch": 172.21052631578948, "grad_norm": 0.522394061088562, "learning_rate": 0.00014595798606214882, "loss": 0.1864, "step": 8180 }, { "epoch": 172.42105263157896, "grad_norm": 0.8208839893341064, "learning_rate": 0.00014583921469232838, "loss": 0.2064, "step": 8190 }, { "epoch": 172.6315789473684, "grad_norm": 0.6659351587295532, "learning_rate": 0.00014572036140039885, "loss": 0.2048, "step": 8200 }, { "epoch": 172.8421052631579, "grad_norm": 0.7370153665542603, "learning_rate": 0.00014560142639877025, "loss": 0.1909, "step": 8210 }, { "epoch": 173.05263157894737, "grad_norm": 0.6763059496879578, "learning_rate": 0.0001454824098999988, "loss": 0.1938, "step": 8220 }, { "epoch": 173.26315789473685, "grad_norm": 0.8497671484947205, "learning_rate": 0.0001453633121167862, "loss": 0.1956, "step": 8230 }, { "epoch": 173.47368421052633, "grad_norm": 0.599168598651886, "learning_rate": 0.00014524413326197952, "loss": 0.1873, "step": 8240 }, { "epoch": 173.68421052631578, "grad_norm": 0.6262931823730469, "learning_rate": 
0.00014512487354857075, "loss": 0.2077, "step": 8250 }, { "epoch": 173.89473684210526, "grad_norm": 0.6336191296577454, "learning_rate": 0.00014500553318969628, "loss": 0.2007, "step": 8260 }, { "epoch": 174.10526315789474, "grad_norm": 0.6542587876319885, "learning_rate": 0.00014488611239863667, "loss": 0.1922, "step": 8270 }, { "epoch": 174.31578947368422, "grad_norm": 0.5030838251113892, "learning_rate": 0.00014476661138881629, "loss": 0.1903, "step": 8280 }, { "epoch": 174.52631578947367, "grad_norm": 0.5871732831001282, "learning_rate": 0.00014464703037380278, "loss": 0.1861, "step": 8290 }, { "epoch": 174.73684210526315, "grad_norm": 0.9264737963676453, "learning_rate": 0.00014452736956730683, "loss": 0.2027, "step": 8300 }, { "epoch": 174.94736842105263, "grad_norm": 0.5170317888259888, "learning_rate": 0.0001444076291831817, "loss": 0.2027, "step": 8310 }, { "epoch": 175.1578947368421, "grad_norm": 0.5175766348838806, "learning_rate": 0.00014428780943542285, "loss": 0.2065, "step": 8320 }, { "epoch": 175.3684210526316, "grad_norm": 0.7319501042366028, "learning_rate": 0.0001441679105381676, "loss": 0.1838, "step": 8330 }, { "epoch": 175.57894736842104, "grad_norm": 0.7036318182945251, "learning_rate": 0.00014404793270569475, "loss": 0.1948, "step": 8340 }, { "epoch": 175.78947368421052, "grad_norm": 0.7479885220527649, "learning_rate": 0.0001439278761524241, "loss": 0.2003, "step": 8350 }, { "epoch": 176.0, "grad_norm": 0.7207411527633667, "learning_rate": 0.0001438077410929162, "loss": 0.2014, "step": 8360 }, { "epoch": 176.21052631578948, "grad_norm": 0.7896419167518616, "learning_rate": 0.00014368752774187186, "loss": 0.1973, "step": 8370 }, { "epoch": 176.42105263157896, "grad_norm": 0.7755358815193176, "learning_rate": 0.00014356723631413188, "loss": 0.1972, "step": 8380 }, { "epoch": 176.6315789473684, "grad_norm": 0.855574905872345, "learning_rate": 0.00014344686702467648, "loss": 0.2078, "step": 8390 }, { "epoch": 176.8421052631579, "grad_norm": 
0.753102719783783, "learning_rate": 0.00014332642008862514, "loss": 0.1881, "step": 8400 }, { "epoch": 177.05263157894737, "grad_norm": 0.5475196838378906, "learning_rate": 0.00014320589572123607, "loss": 0.1862, "step": 8410 }, { "epoch": 177.26315789473685, "grad_norm": 0.7424550652503967, "learning_rate": 0.0001430852941379058, "loss": 0.1923, "step": 8420 }, { "epoch": 177.47368421052633, "grad_norm": 0.5902268290519714, "learning_rate": 0.000142964615554169, "loss": 0.1925, "step": 8430 }, { "epoch": 177.68421052631578, "grad_norm": 0.6847654581069946, "learning_rate": 0.0001428438601856978, "loss": 0.2016, "step": 8440 }, { "epoch": 177.89473684210526, "grad_norm": 0.81497722864151, "learning_rate": 0.00014272302824830166, "loss": 0.1913, "step": 8450 }, { "epoch": 178.10526315789474, "grad_norm": 0.6223244071006775, "learning_rate": 0.00014260211995792679, "loss": 0.1937, "step": 8460 }, { "epoch": 178.31578947368422, "grad_norm": 0.8048047423362732, "learning_rate": 0.00014248113553065597, "loss": 0.1931, "step": 8470 }, { "epoch": 178.52631578947367, "grad_norm": 0.606454074382782, "learning_rate": 0.00014236007518270797, "loss": 0.2105, "step": 8480 }, { "epoch": 178.73684210526315, "grad_norm": 0.6434724926948547, "learning_rate": 0.00014223893913043725, "loss": 0.1907, "step": 8490 }, { "epoch": 178.94736842105263, "grad_norm": 0.6487394571304321, "learning_rate": 0.00014211772759033359, "loss": 0.1984, "step": 8500 }, { "epoch": 179.1578947368421, "grad_norm": 0.7166820168495178, "learning_rate": 0.00014199644077902165, "loss": 0.2023, "step": 8510 }, { "epoch": 179.3684210526316, "grad_norm": 0.684614360332489, "learning_rate": 0.00014187507891326063, "loss": 0.1902, "step": 8520 }, { "epoch": 179.57894736842104, "grad_norm": 0.5545143485069275, "learning_rate": 0.00014175364220994388, "loss": 0.1909, "step": 8530 }, { "epoch": 179.78947368421052, "grad_norm": 0.7007152438163757, "learning_rate": 0.00014163213088609847, "loss": 0.2038, "step": 8540 }, 
{ "epoch": 180.0, "grad_norm": 0.8921439051628113, "learning_rate": 0.00014151054515888482, "loss": 0.2096, "step": 8550 }, { "epoch": 180.21052631578948, "grad_norm": 0.5631104707717896, "learning_rate": 0.00014138888524559636, "loss": 0.1777, "step": 8560 }, { "epoch": 180.42105263157896, "grad_norm": 0.6226398348808289, "learning_rate": 0.0001412671513636591, "loss": 0.2028, "step": 8570 }, { "epoch": 180.6315789473684, "grad_norm": 0.5310755968093872, "learning_rate": 0.00014114534373063113, "loss": 0.2058, "step": 8580 }, { "epoch": 180.8421052631579, "grad_norm": 0.7497243285179138, "learning_rate": 0.00014102346256420257, "loss": 0.197, "step": 8590 }, { "epoch": 181.05263157894737, "grad_norm": 0.6481735706329346, "learning_rate": 0.00014090150808219475, "loss": 0.2011, "step": 8600 }, { "epoch": 181.26315789473685, "grad_norm": 0.4959108829498291, "learning_rate": 0.0001407794805025601, "loss": 0.1873, "step": 8610 }, { "epoch": 181.47368421052633, "grad_norm": 0.6530917882919312, "learning_rate": 0.00014065738004338175, "loss": 0.1928, "step": 8620 }, { "epoch": 181.68421052631578, "grad_norm": 0.819771945476532, "learning_rate": 0.00014053520692287297, "loss": 0.1971, "step": 8630 }, { "epoch": 181.89473684210526, "grad_norm": 0.8146491050720215, "learning_rate": 0.00014041296135937692, "loss": 0.2056, "step": 8640 }, { "epoch": 182.10526315789474, "grad_norm": 0.6754285097122192, "learning_rate": 0.00014029064357136628, "loss": 0.1985, "step": 8650 }, { "epoch": 182.31578947368422, "grad_norm": 0.651917576789856, "learning_rate": 0.00014016825377744275, "loss": 0.1852, "step": 8660 }, { "epoch": 182.52631578947367, "grad_norm": 0.5566385984420776, "learning_rate": 0.0001400457921963368, "loss": 0.1838, "step": 8670 }, { "epoch": 182.73684210526315, "grad_norm": 0.6756837368011475, "learning_rate": 0.00013992325904690697, "loss": 0.2143, "step": 8680 }, { "epoch": 182.94736842105263, "grad_norm": 0.5130738019943237, "learning_rate": 
0.00013980065454814004, "loss": 0.1997, "step": 8690 }, { "epoch": 183.1578947368421, "grad_norm": 0.69854336977005, "learning_rate": 0.00013967797891915003, "loss": 0.1889, "step": 8700 }, { "epoch": 183.3684210526316, "grad_norm": 0.8691510558128357, "learning_rate": 0.00013955523237917824, "loss": 0.2069, "step": 8710 }, { "epoch": 183.57894736842104, "grad_norm": 0.5549122095108032, "learning_rate": 0.00013943241514759262, "loss": 0.1862, "step": 8720 }, { "epoch": 183.78947368421052, "grad_norm": 0.6922882199287415, "learning_rate": 0.00013930952744388743, "loss": 0.2016, "step": 8730 }, { "epoch": 184.0, "grad_norm": 1.2639508247375488, "learning_rate": 0.000139186569487683, "loss": 0.2071, "step": 8740 }, { "epoch": 184.21052631578948, "grad_norm": 0.5515004396438599, "learning_rate": 0.00013906354149872504, "loss": 0.1907, "step": 8750 }, { "epoch": 184.42105263157896, "grad_norm": 0.571441113948822, "learning_rate": 0.00013894044369688462, "loss": 0.1918, "step": 8760 }, { "epoch": 184.6315789473684, "grad_norm": 0.7092531323432922, "learning_rate": 0.00013881727630215738, "loss": 0.1966, "step": 8770 }, { "epoch": 184.8421052631579, "grad_norm": 0.8415676951408386, "learning_rate": 0.00013869403953466346, "loss": 0.2004, "step": 8780 }, { "epoch": 185.05263157894737, "grad_norm": 1.0705996751785278, "learning_rate": 0.00013857073361464697, "loss": 0.209, "step": 8790 }, { "epoch": 185.26315789473685, "grad_norm": 0.530075192451477, "learning_rate": 0.00013844735876247558, "loss": 0.1937, "step": 8800 }, { "epoch": 185.47368421052633, "grad_norm": 0.5449802875518799, "learning_rate": 0.00013832391519864008, "loss": 0.1881, "step": 8810 }, { "epoch": 185.68421052631578, "grad_norm": 0.7269825339317322, "learning_rate": 0.00013820040314375422, "loss": 0.1986, "step": 8820 }, { "epoch": 185.89473684210526, "grad_norm": 1.3125274181365967, "learning_rate": 0.00013807682281855404, "loss": 0.203, "step": 8830 }, { "epoch": 186.10526315789474, "grad_norm": 
0.5790199041366577, "learning_rate": 0.00013795317444389763, "loss": 0.1958, "step": 8840 }, { "epoch": 186.31578947368422, "grad_norm": 0.637947142124176, "learning_rate": 0.00013782945824076465, "loss": 0.2031, "step": 8850 }, { "epoch": 186.52631578947367, "grad_norm": 0.561091959476471, "learning_rate": 0.00013770567443025606, "loss": 0.1828, "step": 8860 }, { "epoch": 186.73684210526315, "grad_norm": 0.48705366253852844, "learning_rate": 0.00013758182323359355, "loss": 0.1902, "step": 8870 }, { "epoch": 186.94736842105263, "grad_norm": 0.4845261573791504, "learning_rate": 0.0001374579048721193, "loss": 0.2079, "step": 8880 }, { "epoch": 187.1578947368421, "grad_norm": 0.6765108108520508, "learning_rate": 0.00013733391956729555, "loss": 0.1987, "step": 8890 }, { "epoch": 187.3684210526316, "grad_norm": 0.6771337389945984, "learning_rate": 0.00013720986754070413, "loss": 0.2101, "step": 8900 }, { "epoch": 187.57894736842104, "grad_norm": 0.8644657731056213, "learning_rate": 0.00013708574901404613, "loss": 0.1934, "step": 8910 }, { "epoch": 187.78947368421052, "grad_norm": 0.733113169670105, "learning_rate": 0.00013696156420914146, "loss": 0.2098, "step": 8920 }, { "epoch": 188.0, "grad_norm": 0.7218254208564758, "learning_rate": 0.0001368373133479285, "loss": 0.1852, "step": 8930 }, { "epoch": 188.21052631578948, "grad_norm": 0.7735620141029358, "learning_rate": 0.0001367129966524637, "loss": 0.1996, "step": 8940 }, { "epoch": 188.42105263157896, "grad_norm": 0.6765386462211609, "learning_rate": 0.00013658861434492117, "loss": 0.1981, "step": 8950 }, { "epoch": 188.6315789473684, "grad_norm": 0.7172536849975586, "learning_rate": 0.00013646416664759222, "loss": 0.1918, "step": 8960 }, { "epoch": 188.8421052631579, "grad_norm": 0.6445688605308533, "learning_rate": 0.00013633965378288509, "loss": 0.2076, "step": 8970 }, { "epoch": 189.05263157894737, "grad_norm": 0.6303783655166626, "learning_rate": 0.00013621507597332447, "loss": 0.1827, "step": 8980 }, { "epoch": 
189.26315789473685, "grad_norm": 0.6926207542419434, "learning_rate": 0.00013609043344155108, "loss": 0.1913, "step": 8990 }, { "epoch": 189.47368421052633, "grad_norm": 0.4824252426624298, "learning_rate": 0.00013596572641032132, "loss": 0.1955, "step": 9000 }, { "epoch": 189.68421052631578, "grad_norm": 0.5803983807563782, "learning_rate": 0.00013584095510250693, "loss": 0.1982, "step": 9010 }, { "epoch": 189.89473684210526, "grad_norm": 0.6127357482910156, "learning_rate": 0.0001357161197410944, "loss": 0.2044, "step": 9020 }, { "epoch": 190.10526315789474, "grad_norm": 0.5252726674079895, "learning_rate": 0.00013559122054918483, "loss": 0.1955, "step": 9030 }, { "epoch": 190.31578947368422, "grad_norm": 0.6263585090637207, "learning_rate": 0.00013546625774999327, "loss": 0.1971, "step": 9040 }, { "epoch": 190.52631578947367, "grad_norm": 0.5231578946113586, "learning_rate": 0.00013534123156684852, "loss": 0.2063, "step": 9050 }, { "epoch": 190.73684210526315, "grad_norm": 0.6962978839874268, "learning_rate": 0.00013521614222319268, "loss": 0.1995, "step": 9060 }, { "epoch": 190.94736842105263, "grad_norm": 0.5984175801277161, "learning_rate": 0.00013509098994258064, "loss": 0.1859, "step": 9070 }, { "epoch": 191.1578947368421, "grad_norm": 0.534826397895813, "learning_rate": 0.00013496577494867985, "loss": 0.1959, "step": 9080 }, { "epoch": 191.3684210526316, "grad_norm": 0.579361617565155, "learning_rate": 0.00013484049746526977, "loss": 0.1814, "step": 9090 }, { "epoch": 191.57894736842104, "grad_norm": 0.7044987082481384, "learning_rate": 0.0001347151577162416, "loss": 0.1996, "step": 9100 }, { "epoch": 191.78947368421052, "grad_norm": 0.6362590193748474, "learning_rate": 0.00013458975592559781, "loss": 0.1943, "step": 9110 }, { "epoch": 192.0, "grad_norm": 0.7269014120101929, "learning_rate": 0.0001344642923174517, "loss": 0.2107, "step": 9120 }, { "epoch": 192.21052631578948, "grad_norm": 0.6897448897361755, "learning_rate": 0.00013433876711602713, "loss": 
0.184, "step": 9130 }, { "epoch": 192.42105263157896, "grad_norm": 0.6716914176940918, "learning_rate": 0.00013421318054565793, "loss": 0.1982, "step": 9140 }, { "epoch": 192.6315789473684, "grad_norm": 0.6334236264228821, "learning_rate": 0.00013408753283078768, "loss": 0.2077, "step": 9150 }, { "epoch": 192.8421052631579, "grad_norm": 0.6236383318901062, "learning_rate": 0.00013396182419596925, "loss": 0.1933, "step": 9160 }, { "epoch": 193.05263157894737, "grad_norm": 0.7885491251945496, "learning_rate": 0.00013383605486586432, "loss": 0.1919, "step": 9170 }, { "epoch": 193.26315789473685, "grad_norm": 0.6639363765716553, "learning_rate": 0.0001337102250652431, "loss": 0.1943, "step": 9180 }, { "epoch": 193.47368421052633, "grad_norm": 0.8824483752250671, "learning_rate": 0.00013358433501898381, "loss": 0.2155, "step": 9190 }, { "epoch": 193.68421052631578, "grad_norm": 0.6293418407440186, "learning_rate": 0.0001334583849520724, "loss": 0.1853, "step": 9200 }, { "epoch": 193.89473684210526, "grad_norm": 0.5062781572341919, "learning_rate": 0.00013333237508960207, "loss": 0.197, "step": 9210 }, { "epoch": 194.10526315789474, "grad_norm": 0.5617866516113281, "learning_rate": 0.00013320630565677287, "loss": 0.1886, "step": 9220 }, { "epoch": 194.31578947368422, "grad_norm": 0.8578426241874695, "learning_rate": 0.0001330801768788913, "loss": 0.2, "step": 9230 }, { "epoch": 194.52631578947367, "grad_norm": 0.6378150582313538, "learning_rate": 0.00013295398898136994, "loss": 0.193, "step": 9240 }, { "epoch": 194.73684210526315, "grad_norm": 0.6234794855117798, "learning_rate": 0.00013282774218972707, "loss": 0.189, "step": 9250 }, { "epoch": 194.94736842105263, "grad_norm": 0.548239529132843, "learning_rate": 0.0001327014367295861, "loss": 0.2063, "step": 9260 }, { "epoch": 195.1578947368421, "grad_norm": 0.6770426034927368, "learning_rate": 0.00013257507282667542, "loss": 0.2092, "step": 9270 }, { "epoch": 195.3684210526316, "grad_norm": 0.7459677457809448, 
"learning_rate": 0.00013244865070682785, "loss": 0.1917, "step": 9280 }, { "epoch": 195.57894736842104, "grad_norm": 0.6438161134719849, "learning_rate": 0.0001323221705959801, "loss": 0.1967, "step": 9290 }, { "epoch": 195.78947368421052, "grad_norm": 0.6017579436302185, "learning_rate": 0.00013219563272017271, "loss": 0.197, "step": 9300 }, { "epoch": 196.0, "grad_norm": 0.6799076199531555, "learning_rate": 0.00013206903730554937, "loss": 0.1958, "step": 9310 }, { "epoch": 196.21052631578948, "grad_norm": 0.5263960361480713, "learning_rate": 0.00013194238457835665, "loss": 0.1785, "step": 9320 }, { "epoch": 196.42105263157896, "grad_norm": 0.5635556578636169, "learning_rate": 0.00013181567476494346, "loss": 0.1882, "step": 9330 }, { "epoch": 196.6315789473684, "grad_norm": 0.6033175587654114, "learning_rate": 0.00013168890809176075, "loss": 0.1971, "step": 9340 }, { "epoch": 196.8421052631579, "grad_norm": 0.916451632976532, "learning_rate": 0.00013156208478536124, "loss": 0.2021, "step": 9350 }, { "epoch": 197.05263157894737, "grad_norm": 0.6587386131286621, "learning_rate": 0.0001314352050723986, "loss": 0.2214, "step": 9360 }, { "epoch": 197.26315789473685, "grad_norm": 0.6655704975128174, "learning_rate": 0.00013130826917962755, "loss": 0.1986, "step": 9370 }, { "epoch": 197.47368421052633, "grad_norm": 0.6699230074882507, "learning_rate": 0.0001311812773339031, "loss": 0.1883, "step": 9380 }, { "epoch": 197.68421052631578, "grad_norm": 0.515881359577179, "learning_rate": 0.0001310542297621802, "loss": 0.1909, "step": 9390 }, { "epoch": 197.89473684210526, "grad_norm": 0.4768686890602112, "learning_rate": 0.00013092712669151356, "loss": 0.1918, "step": 9400 }, { "epoch": 198.10526315789474, "grad_norm": 0.6306703090667725, "learning_rate": 0.00013079996834905688, "loss": 0.1894, "step": 9410 }, { "epoch": 198.31578947368422, "grad_norm": 0.5824154019355774, "learning_rate": 0.0001306727549620628, "loss": 0.1845, "step": 9420 }, { "epoch": 198.52631578947367, 
"grad_norm": 0.6079233288764954, "learning_rate": 0.00013054548675788224, "loss": 0.2015, "step": 9430 }, { "epoch": 198.73684210526315, "grad_norm": 0.607089638710022, "learning_rate": 0.00013041816396396416, "loss": 0.1983, "step": 9440 }, { "epoch": 198.94736842105263, "grad_norm": 0.642997682094574, "learning_rate": 0.00013029078680785498, "loss": 0.2135, "step": 9450 }, { "epoch": 199.1578947368421, "grad_norm": 0.769798219203949, "learning_rate": 0.00013016335551719837, "loss": 0.2018, "step": 9460 }, { "epoch": 199.3684210526316, "grad_norm": 0.72061687707901, "learning_rate": 0.00013003587031973465, "loss": 0.1863, "step": 9470 }, { "epoch": 199.57894736842104, "grad_norm": 0.5181578993797302, "learning_rate": 0.00012990833144330062, "loss": 0.204, "step": 9480 }, { "epoch": 199.78947368421052, "grad_norm": 0.6537469625473022, "learning_rate": 0.00012978073911582886, "loss": 0.1968, "step": 9490 }, { "epoch": 200.0, "grad_norm": 0.5669968128204346, "learning_rate": 0.00012965309356534764, "loss": 0.1947, "step": 9500 }, { "epoch": 200.21052631578948, "grad_norm": 0.5587459206581116, "learning_rate": 0.00012952539501998012, "loss": 0.1982, "step": 9510 }, { "epoch": 200.42105263157896, "grad_norm": 0.6033953428268433, "learning_rate": 0.00012939764370794446, "loss": 0.2012, "step": 9520 }, { "epoch": 200.6315789473684, "grad_norm": 0.6542585492134094, "learning_rate": 0.00012926983985755283, "loss": 0.194, "step": 9530 }, { "epoch": 200.8421052631579, "grad_norm": 0.4944283664226532, "learning_rate": 0.0001291419836972115, "loss": 0.1766, "step": 9540 }, { "epoch": 201.05263157894737, "grad_norm": 0.5921803116798401, "learning_rate": 0.0001290140754554202, "loss": 0.2082, "step": 9550 }, { "epoch": 201.26315789473685, "grad_norm": 0.631533682346344, "learning_rate": 0.0001288861153607716, "loss": 0.1876, "step": 9560 }, { "epoch": 201.47368421052633, "grad_norm": 0.6319711208343506, "learning_rate": 0.00012875810364195123, "loss": 0.1849, "step": 9570 }, { 
"epoch": 201.68421052631578, "grad_norm": 0.8693262934684753, "learning_rate": 0.0001286300405277367, "loss": 0.195, "step": 9580 }, { "epoch": 201.89473684210526, "grad_norm": 0.6572778820991516, "learning_rate": 0.0001285019262469976, "loss": 0.2099, "step": 9590 }, { "epoch": 202.10526315789474, "grad_norm": 0.7793500423431396, "learning_rate": 0.000128373761028695, "loss": 0.1986, "step": 9600 }, { "epoch": 202.31578947368422, "grad_norm": 0.6409270167350769, "learning_rate": 0.0001282455451018808, "loss": 0.1927, "step": 9610 }, { "epoch": 202.52631578947367, "grad_norm": 0.6425662040710449, "learning_rate": 0.0001281172786956977, "loss": 0.1952, "step": 9620 }, { "epoch": 202.73684210526315, "grad_norm": 0.5852437019348145, "learning_rate": 0.00012798896203937855, "loss": 0.1831, "step": 9630 }, { "epoch": 202.94736842105263, "grad_norm": 0.768488347530365, "learning_rate": 0.00012786059536224611, "loss": 0.1994, "step": 9640 }, { "epoch": 203.1578947368421, "grad_norm": 0.7588623762130737, "learning_rate": 0.00012774502277463864, "loss": 0.1998, "step": 9650 }, { "epoch": 203.3684210526316, "grad_norm": 0.5900463461875916, "learning_rate": 0.00012761656169006457, "loss": 0.2037, "step": 9660 }, { "epoch": 203.57894736842104, "grad_norm": 0.5838971734046936, "learning_rate": 0.00012748805125021694, "loss": 0.1923, "step": 9670 }, { "epoch": 203.78947368421052, "grad_norm": 0.6288439035415649, "learning_rate": 0.00012735949168476477, "loss": 0.2109, "step": 9680 }, { "epoch": 204.0, "grad_norm": 0.6366193890571594, "learning_rate": 0.00012723088322346478, "loss": 0.193, "step": 9690 }, { "epoch": 204.21052631578948, "grad_norm": 0.49313342571258545, "learning_rate": 0.00012710222609616125, "loss": 0.1762, "step": 9700 }, { "epoch": 204.42105263157896, "grad_norm": 0.6465559601783752, "learning_rate": 0.0001269735205327852, "loss": 0.1943, "step": 9710 }, { "epoch": 204.6315789473684, "grad_norm": 0.6326197385787964, "learning_rate": 0.00012684476676335445, 
"loss": 0.1925, "step": 9720 }, { "epoch": 204.8421052631579, "grad_norm": 0.8847769498825073, "learning_rate": 0.00012671596501797282, "loss": 0.2049, "step": 9730 }, { "epoch": 205.05263157894737, "grad_norm": 0.5743268728256226, "learning_rate": 0.00012658711552682988, "loss": 0.1966, "step": 9740 }, { "epoch": 205.26315789473685, "grad_norm": 0.5662633180618286, "learning_rate": 0.00012645821852020066, "loss": 0.1855, "step": 9750 }, { "epoch": 205.47368421052633, "grad_norm": 0.6402904987335205, "learning_rate": 0.0001263292742284449, "loss": 0.1904, "step": 9760 }, { "epoch": 205.68421052631578, "grad_norm": 0.5689384341239929, "learning_rate": 0.000126200282882007, "loss": 0.2074, "step": 9770 }, { "epoch": 205.89473684210526, "grad_norm": 0.7823508977890015, "learning_rate": 0.00012607124471141542, "loss": 0.1976, "step": 9780 }, { "epoch": 206.10526315789474, "grad_norm": 0.4707532227039337, "learning_rate": 0.0001259421599472823, "loss": 0.199, "step": 9790 }, { "epoch": 206.31578947368422, "grad_norm": 0.6291844248771667, "learning_rate": 0.000125813028820303, "loss": 0.185, "step": 9800 }, { "epoch": 206.52631578947367, "grad_norm": 0.6552050709724426, "learning_rate": 0.00012568385156125586, "loss": 0.1928, "step": 9810 }, { "epoch": 206.73684210526315, "grad_norm": 0.5243370532989502, "learning_rate": 0.0001255546284010015, "loss": 0.1958, "step": 9820 }, { "epoch": 206.94736842105263, "grad_norm": 0.5976846218109131, "learning_rate": 0.0001254253595704827, "loss": 0.1948, "step": 9830 }, { "epoch": 207.1578947368421, "grad_norm": 0.473967969417572, "learning_rate": 0.00012529604530072384, "loss": 0.2067, "step": 9840 }, { "epoch": 207.3684210526316, "grad_norm": 0.7041087746620178, "learning_rate": 0.00012516668582283045, "loss": 0.1843, "step": 9850 }, { "epoch": 207.57894736842104, "grad_norm": 0.5606855154037476, "learning_rate": 0.00012503728136798893, "loss": 0.1965, "step": 9860 }, { "epoch": 207.78947368421052, "grad_norm": 0.7192473411560059, 
"learning_rate": 0.00012490783216746601, "loss": 0.1948, "step": 9870 }, { "epoch": 208.0, "grad_norm": 0.673022449016571, "learning_rate": 0.00012477833845260836, "loss": 0.2078, "step": 9880 }, { "epoch": 208.21052631578948, "grad_norm": 0.7071841359138489, "learning_rate": 0.0001246488004548423, "loss": 0.1937, "step": 9890 }, { "epoch": 208.42105263157896, "grad_norm": 0.518195390701294, "learning_rate": 0.0001245192184056732, "loss": 0.1941, "step": 9900 }, { "epoch": 208.6315789473684, "grad_norm": 0.6964612603187561, "learning_rate": 0.0001243895925366852, "loss": 0.1954, "step": 9910 }, { "epoch": 208.8421052631579, "grad_norm": 0.6792444586753845, "learning_rate": 0.00012425992307954075, "loss": 0.2075, "step": 9920 }, { "epoch": 209.05263157894737, "grad_norm": 0.4383447468280792, "learning_rate": 0.0001241302102659802, "loss": 0.1745, "step": 9930 }, { "epoch": 209.26315789473685, "grad_norm": 0.6733793616294861, "learning_rate": 0.00012400045432782138, "loss": 0.1972, "step": 9940 }, { "epoch": 209.47368421052633, "grad_norm": 0.5500863194465637, "learning_rate": 0.00012387065549695917, "loss": 0.1809, "step": 9950 }, { "epoch": 209.68421052631578, "grad_norm": 0.8505430221557617, "learning_rate": 0.00012374081400536518, "loss": 0.2014, "step": 9960 }, { "epoch": 209.89473684210526, "grad_norm": 0.5805114507675171, "learning_rate": 0.00012361093008508714, "loss": 0.202, "step": 9970 }, { "epoch": 210.10526315789474, "grad_norm": 0.5001348853111267, "learning_rate": 0.0001234810039682487, "loss": 0.1882, "step": 9980 }, { "epoch": 210.31578947368422, "grad_norm": 0.6055691242218018, "learning_rate": 0.00012335103588704895, "loss": 0.201, "step": 9990 }, { "epoch": 210.52631578947367, "grad_norm": 0.6668257713317871, "learning_rate": 0.00012322102607376182, "loss": 0.1899, "step": 10000 }, { "epoch": 210.73684210526315, "grad_norm": 0.8748318552970886, "learning_rate": 0.00012309097476073598, "loss": 0.2055, "step": 10010 }, { "epoch": 210.94736842105263, 
"grad_norm": 0.6310805678367615, "learning_rate": 0.00012296088218039427, "loss": 0.1911, "step": 10020 }, { "epoch": 211.1578947368421, "grad_norm": 0.5111770033836365, "learning_rate": 0.00012283074856523314, "loss": 0.1892, "step": 10030 }, { "epoch": 211.3684210526316, "grad_norm": 0.8765570521354675, "learning_rate": 0.00012270057414782252, "loss": 0.1764, "step": 10040 }, { "epoch": 211.57894736842104, "grad_norm": 0.5945147275924683, "learning_rate": 0.00012257035916080514, "loss": 0.1962, "step": 10050 }, { "epoch": 211.78947368421052, "grad_norm": 0.7749220728874207, "learning_rate": 0.00012244010383689636, "loss": 0.204, "step": 10060 }, { "epoch": 212.0, "grad_norm": 0.741985023021698, "learning_rate": 0.00012230980840888348, "loss": 0.2083, "step": 10070 }, { "epoch": 212.21052631578948, "grad_norm": 0.5785123705863953, "learning_rate": 0.00012217947310962565, "loss": 0.2033, "step": 10080 }, { "epoch": 212.42105263157896, "grad_norm": 0.5759528875350952, "learning_rate": 0.00012204909817205314, "loss": 0.1963, "step": 10090 }, { "epoch": 212.6315789473684, "grad_norm": 0.8312242031097412, "learning_rate": 0.00012191868382916709, "loss": 0.1874, "step": 10100 }, { "epoch": 212.8421052631579, "grad_norm": 0.5142462849617004, "learning_rate": 0.00012178823031403911, "loss": 0.2005, "step": 10110 }, { "epoch": 213.05263157894737, "grad_norm": 0.49217525124549866, "learning_rate": 0.00012165773785981076, "loss": 0.1855, "step": 10120 }, { "epoch": 213.26315789473685, "grad_norm": 0.4872840940952301, "learning_rate": 0.00012152720669969317, "loss": 0.1837, "step": 10130 }, { "epoch": 213.47368421052633, "grad_norm": 0.6701936721801758, "learning_rate": 0.0001213966370669668, "loss": 0.2015, "step": 10140 }, { "epoch": 213.68421052631578, "grad_norm": 0.6903809905052185, "learning_rate": 0.00012126602919498065, "loss": 0.1927, "step": 10150 }, { "epoch": 213.89473684210526, "grad_norm": 0.4880557656288147, "learning_rate": 0.00012113538331715224, "loss": 
0.2053, "step": 10160 }, { "epoch": 214.10526315789474, "grad_norm": 0.5100629329681396, "learning_rate": 0.00012100469966696686, "loss": 0.1871, "step": 10170 }, { "epoch": 214.31578947368422, "grad_norm": 0.630207896232605, "learning_rate": 0.00012087397847797743, "loss": 0.2026, "step": 10180 }, { "epoch": 214.52631578947367, "grad_norm": 0.5134378671646118, "learning_rate": 0.00012074321998380391, "loss": 0.1936, "step": 10190 }, { "epoch": 214.73684210526315, "grad_norm": 0.5366536378860474, "learning_rate": 0.00012061242441813294, "loss": 0.195, "step": 10200 }, { "epoch": 214.94736842105263, "grad_norm": 0.5395473837852478, "learning_rate": 0.00012048159201471739, "loss": 0.1924, "step": 10210 }, { "epoch": 215.1578947368421, "grad_norm": 0.9155479669570923, "learning_rate": 0.00012035072300737596, "loss": 0.207, "step": 10220 }, { "epoch": 215.3684210526316, "grad_norm": 0.6539714336395264, "learning_rate": 0.00012021981762999279, "loss": 0.1854, "step": 10230 }, { "epoch": 215.57894736842104, "grad_norm": 0.8252039551734924, "learning_rate": 0.00012008887611651704, "loss": 0.1946, "step": 10240 }, { "epoch": 215.78947368421052, "grad_norm": 0.6097246408462524, "learning_rate": 0.00011995789870096241, "loss": 0.1948, "step": 10250 }, { "epoch": 216.0, "grad_norm": 0.5688139796257019, "learning_rate": 0.0001198268856174068, "loss": 0.1929, "step": 10260 }, { "epoch": 216.21052631578948, "grad_norm": 0.7054363489151001, "learning_rate": 0.0001196958370999918, "loss": 0.2023, "step": 10270 }, { "epoch": 216.42105263157896, "grad_norm": 0.4730806350708008, "learning_rate": 0.00011956475338292237, "loss": 0.1812, "step": 10280 }, { "epoch": 216.6315789473684, "grad_norm": 0.7729452252388, "learning_rate": 0.00011943363470046636, "loss": 0.2017, "step": 10290 }, { "epoch": 216.8421052631579, "grad_norm": 0.6470244526863098, "learning_rate": 0.00011930248128695414, "loss": 0.1964, "step": 10300 }, { "epoch": 217.05263157894737, "grad_norm": 0.5927222371101379, 
"learning_rate": 0.00011917129337677809, "loss": 0.1905, "step": 10310 }, { "epoch": 217.26315789473685, "grad_norm": 0.6557316184043884, "learning_rate": 0.00011904007120439232, "loss": 0.1878, "step": 10320 }, { "epoch": 217.47368421052633, "grad_norm": 0.5663248896598816, "learning_rate": 0.0001189088150043121, "loss": 0.1854, "step": 10330 }, { "epoch": 217.68421052631578, "grad_norm": 0.5649807453155518, "learning_rate": 0.00011877752501111354, "loss": 0.2107, "step": 10340 }, { "epoch": 217.89473684210526, "grad_norm": 0.570824384689331, "learning_rate": 0.00011864620145943315, "loss": 0.1906, "step": 10350 }, { "epoch": 218.10526315789474, "grad_norm": 0.5065656304359436, "learning_rate": 0.0001185148445839674, "loss": 0.1995, "step": 10360 }, { "epoch": 218.31578947368422, "grad_norm": 0.7696800231933594, "learning_rate": 0.00011838345461947235, "loss": 0.1957, "step": 10370 }, { "epoch": 218.52631578947367, "grad_norm": 0.4969731569290161, "learning_rate": 0.00011825203180076319, "loss": 0.1994, "step": 10380 }, { "epoch": 218.73684210526315, "grad_norm": 0.8026688694953918, "learning_rate": 0.00011812057636271374, "loss": 0.2052, "step": 10390 }, { "epoch": 218.94736842105263, "grad_norm": 0.609519362449646, "learning_rate": 0.00011798908854025623, "loss": 0.1834, "step": 10400 }, { "epoch": 219.1578947368421, "grad_norm": 0.5675802826881409, "learning_rate": 0.00011785756856838071, "loss": 0.1994, "step": 10410 }, { "epoch": 219.3684210526316, "grad_norm": 0.7236883640289307, "learning_rate": 0.00011772601668213468, "loss": 0.2032, "step": 10420 }, { "epoch": 219.57894736842104, "grad_norm": 0.513433575630188, "learning_rate": 0.0001175944331166227, "loss": 0.1878, "step": 10430 }, { "epoch": 219.78947368421052, "grad_norm": 0.6006221771240234, "learning_rate": 0.00011746281810700592, "loss": 0.1938, "step": 10440 }, { "epoch": 220.0, "grad_norm": 0.5742442607879639, "learning_rate": 0.00011733117188850178, "loss": 0.1967, "step": 10450 }, { "epoch": 
220.21052631578948, "grad_norm": 0.6315630078315735, "learning_rate": 0.00011719949469638329, "loss": 0.1978, "step": 10460 }, { "epoch": 220.42105263157896, "grad_norm": 0.5011500120162964, "learning_rate": 0.00011706778676597905, "loss": 0.1868, "step": 10470 }, { "epoch": 220.6315789473684, "grad_norm": 0.6937258243560791, "learning_rate": 0.00011693604833267242, "loss": 0.2015, "step": 10480 }, { "epoch": 220.8421052631579, "grad_norm": 0.6345796585083008, "learning_rate": 0.00011680427963190139, "loss": 0.1986, "step": 10490 }, { "epoch": 221.05263157894737, "grad_norm": 0.5916712880134583, "learning_rate": 0.00011667248089915799, "loss": 0.1911, "step": 10500 }, { "epoch": 221.26315789473685, "grad_norm": 0.5585072040557861, "learning_rate": 0.00011654065236998786, "loss": 0.1885, "step": 10510 }, { "epoch": 221.47368421052633, "grad_norm": 0.7705309987068176, "learning_rate": 0.00011640879427999003, "loss": 0.1897, "step": 10520 }, { "epoch": 221.68421052631578, "grad_norm": 0.5290719866752625, "learning_rate": 0.00011627690686481627, "loss": 0.1949, "step": 10530 }, { "epoch": 221.89473684210526, "grad_norm": 0.8180526494979858, "learning_rate": 0.00011614499036017075, "loss": 0.2041, "step": 10540 }, { "epoch": 222.10526315789474, "grad_norm": 0.5164480209350586, "learning_rate": 0.00011601304500180967, "loss": 0.1852, "step": 10550 }, { "epoch": 222.31578947368422, "grad_norm": 0.5821452140808105, "learning_rate": 0.00011588107102554078, "loss": 0.1907, "step": 10560 }, { "epoch": 222.52631578947367, "grad_norm": 0.6250927448272705, "learning_rate": 0.000115749068667223, "loss": 0.1949, "step": 10570 }, { "epoch": 222.73684210526315, "grad_norm": 0.7137615084648132, "learning_rate": 0.0001156170381627659, "loss": 0.1976, "step": 10580 }, { "epoch": 222.94736842105263, "grad_norm": 0.6757993102073669, "learning_rate": 0.00011548497974812942, "loss": 0.1982, "step": 10590 }, { "epoch": 223.1578947368421, "grad_norm": 0.49481892585754395, "learning_rate": 
0.00011535289365932333, "loss": 0.1832, "step": 10600 }, { "epoch": 223.3684210526316, "grad_norm": 0.6089574694633484, "learning_rate": 0.00011522078013240696, "loss": 0.1833, "step": 10610 }, { "epoch": 223.57894736842104, "grad_norm": 0.6293704509735107, "learning_rate": 0.00011508863940348855, "loss": 0.1994, "step": 10620 }, { "epoch": 223.78947368421052, "grad_norm": 0.5520006418228149, "learning_rate": 0.000114956471708725, "loss": 0.2023, "step": 10630 }, { "epoch": 224.0, "grad_norm": 0.566952645778656, "learning_rate": 0.00011482427728432144, "loss": 0.1956, "step": 10640 }, { "epoch": 224.21052631578948, "grad_norm": 0.7031182050704956, "learning_rate": 0.00011469205636653075, "loss": 0.1958, "step": 10650 }, { "epoch": 224.42105263157896, "grad_norm": 0.5254794955253601, "learning_rate": 0.00011455980919165308, "loss": 0.1892, "step": 10660 }, { "epoch": 224.6315789473684, "grad_norm": 0.539675235748291, "learning_rate": 0.00011442753599603566, "loss": 0.179, "step": 10670 }, { "epoch": 224.8421052631579, "grad_norm": 0.5456839203834534, "learning_rate": 0.0001142952370160721, "loss": 0.1983, "step": 10680 }, { "epoch": 225.05263157894737, "grad_norm": 0.5140777826309204, "learning_rate": 0.0001141629124882021, "loss": 0.2006, "step": 10690 }, { "epoch": 225.26315789473685, "grad_norm": 0.48138687014579773, "learning_rate": 0.00011403056264891112, "loss": 0.1995, "step": 10700 }, { "epoch": 225.47368421052633, "grad_norm": 0.6670958995819092, "learning_rate": 0.00011389818773472971, "loss": 0.1954, "step": 10710 }, { "epoch": 225.68421052631578, "grad_norm": 0.7487067580223083, "learning_rate": 0.00011376578798223332, "loss": 0.1916, "step": 10720 }, { "epoch": 225.89473684210526, "grad_norm": 0.6026431918144226, "learning_rate": 0.0001136333636280418, "loss": 0.1847, "step": 10730 }, { "epoch": 226.10526315789474, "grad_norm": 0.6704311966896057, "learning_rate": 0.00011350091490881893, "loss": 0.1861, "step": 10740 }, { "epoch": 226.31578947368422, 
"grad_norm": 0.8869838714599609, "learning_rate": 0.00011336844206127209, "loss": 0.2022, "step": 10750 }, { "epoch": 226.52631578947367, "grad_norm": 0.7338431477546692, "learning_rate": 0.00011323594532215165, "loss": 0.1899, "step": 10760 }, { "epoch": 226.73684210526315, "grad_norm": 0.5879060626029968, "learning_rate": 0.00011310342492825081, "loss": 0.1827, "step": 10770 }, { "epoch": 226.94736842105263, "grad_norm": 0.5951240062713623, "learning_rate": 0.00011297088111640499, "loss": 0.1964, "step": 10780 }, { "epoch": 227.1578947368421, "grad_norm": 0.4901759922504425, "learning_rate": 0.00011283831412349153, "loss": 0.1882, "step": 10790 }, { "epoch": 227.3684210526316, "grad_norm": 0.6785090565681458, "learning_rate": 0.00011270572418642909, "loss": 0.1897, "step": 10800 }, { "epoch": 227.57894736842104, "grad_norm": 0.6172801852226257, "learning_rate": 0.00011257311154217736, "loss": 0.2049, "step": 10810 }, { "epoch": 227.78947368421052, "grad_norm": 0.6416763663291931, "learning_rate": 0.0001124404764277367, "loss": 0.1885, "step": 10820 }, { "epoch": 228.0, "grad_norm": 0.7574992775917053, "learning_rate": 0.00011230781908014752, "loss": 0.2072, "step": 10830 }, { "epoch": 228.21052631578948, "grad_norm": 0.7357071042060852, "learning_rate": 0.00011217513973649003, "loss": 0.2017, "step": 10840 }, { "epoch": 228.42105263157896, "grad_norm": 0.45522359013557434, "learning_rate": 0.0001120424386338837, "loss": 0.1815, "step": 10850 }, { "epoch": 228.6315789473684, "grad_norm": 0.5420458316802979, "learning_rate": 0.00011190971600948699, "loss": 0.1809, "step": 10860 }, { "epoch": 228.8421052631579, "grad_norm": 0.7300458550453186, "learning_rate": 0.00011177697210049667, "loss": 0.2028, "step": 10870 }, { "epoch": 229.05263157894737, "grad_norm": 0.5525165796279907, "learning_rate": 0.00011164420714414768, "loss": 0.2037, "step": 10880 }, { "epoch": 229.26315789473685, "grad_norm": 0.5621128082275391, "learning_rate": 0.00011151142137771246, "loss": 
0.1981, "step": 10890 }, { "epoch": 229.47368421052633, "grad_norm": 0.6248337626457214, "learning_rate": 0.0001113786150385008, "loss": 0.1974, "step": 10900 }, { "epoch": 229.68421052631578, "grad_norm": 0.6925014853477478, "learning_rate": 0.00011124578836385908, "loss": 0.1905, "step": 10910 }, { "epoch": 229.89473684210526, "grad_norm": 0.4596860110759735, "learning_rate": 0.00011111294159117016, "loss": 0.1923, "step": 10920 }, { "epoch": 230.10526315789474, "grad_norm": 0.5915650129318237, "learning_rate": 0.00011098007495785272, "loss": 0.1887, "step": 10930 }, { "epoch": 230.31578947368422, "grad_norm": 0.47109338641166687, "learning_rate": 0.00011084718870136102, "loss": 0.1875, "step": 10940 }, { "epoch": 230.52631578947367, "grad_norm": 0.7140452861785889, "learning_rate": 0.0001107142830591843, "loss": 0.1857, "step": 10950 }, { "epoch": 230.73684210526315, "grad_norm": 0.6751444935798645, "learning_rate": 0.0001105813582688465, "loss": 0.1947, "step": 10960 }, { "epoch": 230.94736842105263, "grad_norm": 0.5488141775131226, "learning_rate": 0.0001104484145679058, "loss": 0.1902, "step": 10970 }, { "epoch": 231.1578947368421, "grad_norm": 0.5548514127731323, "learning_rate": 0.00011031545219395413, "loss": 0.1883, "step": 10980 }, { "epoch": 231.3684210526316, "grad_norm": 0.6914103031158447, "learning_rate": 0.00011018247138461684, "loss": 0.1943, "step": 10990 }, { "epoch": 231.57894736842104, "grad_norm": 0.7115802764892578, "learning_rate": 0.00011004947237755217, "loss": 0.198, "step": 11000 }, { "epoch": 231.78947368421052, "grad_norm": 0.5460284352302551, "learning_rate": 0.0001099164554104509, "loss": 0.1859, "step": 11010 }, { "epoch": 232.0, "grad_norm": 0.7350656390190125, "learning_rate": 0.00010978342072103593, "loss": 0.206, "step": 11020 }, { "epoch": 232.21052631578948, "grad_norm": 0.4807220995426178, "learning_rate": 0.00010965036854706183, "loss": 0.1907, "step": 11030 }, { "epoch": 232.42105263157896, "grad_norm": 0.6180817484855652, 
"learning_rate": 0.00010951729912631443, "loss": 0.1993, "step": 11040 }, { "epoch": 232.6315789473684, "grad_norm": 0.5458692908287048, "learning_rate": 0.0001093842126966103, "loss": 0.185, "step": 11050 }, { "epoch": 232.8421052631579, "grad_norm": 0.6068777441978455, "learning_rate": 0.00010925110949579653, "loss": 0.1931, "step": 11060 }, { "epoch": 233.05263157894737, "grad_norm": 0.604051411151886, "learning_rate": 0.00010911798976175008, "loss": 0.1988, "step": 11070 }, { "epoch": 233.26315789473685, "grad_norm": 0.621242344379425, "learning_rate": 0.00010898485373237748, "loss": 0.1924, "step": 11080 }, { "epoch": 233.47368421052633, "grad_norm": 0.5285283923149109, "learning_rate": 0.00010885170164561449, "loss": 0.1833, "step": 11090 }, { "epoch": 233.68421052631578, "grad_norm": 0.512630045413971, "learning_rate": 0.0001087185337394254, "loss": 0.1929, "step": 11100 }, { "epoch": 233.89473684210526, "grad_norm": 0.602725088596344, "learning_rate": 0.0001085853502518029, "loss": 0.1954, "step": 11110 }, { "epoch": 234.10526315789474, "grad_norm": 0.41769760847091675, "learning_rate": 0.00010845215142076742, "loss": 0.191, "step": 11120 }, { "epoch": 234.31578947368422, "grad_norm": 0.5544017553329468, "learning_rate": 0.0001083189374843669, "loss": 0.1834, "step": 11130 }, { "epoch": 234.52631578947367, "grad_norm": 0.5937119126319885, "learning_rate": 0.00010818570868067623, "loss": 0.1933, "step": 11140 }, { "epoch": 234.73684210526315, "grad_norm": 0.6727398633956909, "learning_rate": 0.00010805246524779693, "loss": 0.1862, "step": 11150 }, { "epoch": 234.94736842105263, "grad_norm": 0.6366226673126221, "learning_rate": 0.00010791920742385657, "loss": 0.2097, "step": 11160 }, { "epoch": 235.1578947368421, "grad_norm": 0.5684067010879517, "learning_rate": 0.00010778593544700852, "loss": 0.2046, "step": 11170 }, { "epoch": 235.3684210526316, "grad_norm": 0.8542050123214722, "learning_rate": 0.0001076526495554314, "loss": 0.1991, "step": 11180 }, { 
"epoch": 235.57894736842104, "grad_norm": 0.6223448514938354, "learning_rate": 0.00010751934998732867, "loss": 0.186, "step": 11190 }, { "epoch": 235.78947368421052, "grad_norm": 0.47746938467025757, "learning_rate": 0.00010738603698092831, "loss": 0.1916, "step": 11200 }, { "epoch": 236.0, "grad_norm": 0.6482349038124084, "learning_rate": 0.00010725271077448232, "loss": 0.1981, "step": 11210 }, { "epoch": 236.21052631578948, "grad_norm": 0.5127678513526917, "learning_rate": 0.00010711937160626617, "loss": 0.1871, "step": 11220 }, { "epoch": 236.42105263157896, "grad_norm": 0.7345888614654541, "learning_rate": 0.00010698601971457862, "loss": 0.199, "step": 11230 }, { "epoch": 236.6315789473684, "grad_norm": 0.4777671992778778, "learning_rate": 0.00010685265533774109, "loss": 0.1944, "step": 11240 }, { "epoch": 236.8421052631579, "grad_norm": 0.5794364809989929, "learning_rate": 0.0001067192787140974, "loss": 0.1884, "step": 11250 }, { "epoch": 237.05263157894737, "grad_norm": 0.6226224303245544, "learning_rate": 0.00010658589008201314, "loss": 0.189, "step": 11260 }, { "epoch": 237.26315789473685, "grad_norm": 0.5198787450790405, "learning_rate": 0.00010645248967987544, "loss": 0.1817, "step": 11270 }, { "epoch": 237.47368421052633, "grad_norm": 0.5877044200897217, "learning_rate": 0.0001063190777460925, "loss": 0.1982, "step": 11280 }, { "epoch": 237.68421052631578, "grad_norm": 0.5517503619194031, "learning_rate": 0.00010618565451909302, "loss": 0.1969, "step": 11290 }, { "epoch": 237.89473684210526, "grad_norm": 0.5407507419586182, "learning_rate": 0.00010605222023732596, "loss": 0.2072, "step": 11300 }, { "epoch": 238.10526315789474, "grad_norm": 0.583928644657135, "learning_rate": 0.00010591877513926, "loss": 0.1818, "step": 11310 }, { "epoch": 238.31578947368422, "grad_norm": 0.620707631111145, "learning_rate": 0.00010578531946338319, "loss": 0.1861, "step": 11320 }, { "epoch": 238.52631578947367, "grad_norm": 0.83612459897995, "learning_rate": 
0.00010565185344820247, "loss": 0.1924, "step": 11330 }, { "epoch": 238.73684210526315, "grad_norm": 0.6024158596992493, "learning_rate": 0.00010551837733224321, "loss": 0.1889, "step": 11340 }, { "epoch": 238.94736842105263, "grad_norm": 0.5878002047538757, "learning_rate": 0.00010538489135404893, "loss": 0.199, "step": 11350 }, { "epoch": 239.1578947368421, "grad_norm": 0.6979864239692688, "learning_rate": 0.00010525139575218063, "loss": 0.1959, "step": 11360 }, { "epoch": 239.3684210526316, "grad_norm": 0.6338476538658142, "learning_rate": 0.00010511789076521668, "loss": 0.1866, "step": 11370 }, { "epoch": 239.57894736842104, "grad_norm": 0.7045919299125671, "learning_rate": 0.0001049843766317521, "loss": 0.1935, "step": 11380 }, { "epoch": 239.78947368421052, "grad_norm": 0.5786710977554321, "learning_rate": 0.00010485085359039828, "loss": 0.1875, "step": 11390 }, { "epoch": 240.0, "grad_norm": 0.6468011736869812, "learning_rate": 0.0001047173218797826, "loss": 0.1982, "step": 11400 }, { "epoch": 240.21052631578948, "grad_norm": 0.4997261166572571, "learning_rate": 0.00010458378173854783, "loss": 0.186, "step": 11410 }, { "epoch": 240.42105263157896, "grad_norm": 0.44703060388565063, "learning_rate": 0.00010445023340535185, "loss": 0.1809, "step": 11420 }, { "epoch": 240.6315789473684, "grad_norm": 0.8224093317985535, "learning_rate": 0.00010431667711886721, "loss": 0.1949, "step": 11430 }, { "epoch": 240.8421052631579, "grad_norm": 0.6010963320732117, "learning_rate": 0.00010418311311778066, "loss": 0.1917, "step": 11440 }, { "epoch": 241.05263157894737, "grad_norm": 0.5227510929107666, "learning_rate": 0.0001040495416407927, "loss": 0.2126, "step": 11450 }, { "epoch": 241.26315789473685, "grad_norm": 0.4908906817436218, "learning_rate": 0.00010391596292661722, "loss": 0.1938, "step": 11460 }, { "epoch": 241.47368421052633, "grad_norm": 0.6381858587265015, "learning_rate": 0.00010378237721398106, "loss": 0.1838, "step": 11470 }, { "epoch": 241.68421052631578, 
"grad_norm": 0.6832618713378906, "learning_rate": 0.00010364878474162354, "loss": 0.1942, "step": 11480 }, { "epoch": 241.89473684210526, "grad_norm": 0.7349768280982971, "learning_rate": 0.00010351518574829602, "loss": 0.2055, "step": 11490 }, { "epoch": 242.10526315789474, "grad_norm": 0.7650448083877563, "learning_rate": 0.00010338158047276165, "loss": 0.1882, "step": 11500 }, { "epoch": 242.31578947368422, "grad_norm": 0.5663439631462097, "learning_rate": 0.00010324796915379466, "loss": 0.1913, "step": 11510 }, { "epoch": 242.52631578947367, "grad_norm": 0.5075819492340088, "learning_rate": 0.00010311435203018018, "loss": 0.1845, "step": 11520 }, { "epoch": 242.73684210526315, "grad_norm": 0.5255244970321655, "learning_rate": 0.00010298072934071363, "loss": 0.1835, "step": 11530 }, { "epoch": 242.94736842105263, "grad_norm": 0.5974047780036926, "learning_rate": 0.00010284710132420045, "loss": 0.195, "step": 11540 }, { "epoch": 243.1578947368421, "grad_norm": 0.719183623790741, "learning_rate": 0.00010271346821945558, "loss": 0.2077, "step": 11550 }, { "epoch": 243.3684210526316, "grad_norm": 0.5450050234794617, "learning_rate": 0.00010257983026530302, "loss": 0.1757, "step": 11560 }, { "epoch": 243.57894736842104, "grad_norm": 0.8635089993476868, "learning_rate": 0.0001024461877005755, "loss": 0.2051, "step": 11570 }, { "epoch": 243.78947368421052, "grad_norm": 0.6234176158905029, "learning_rate": 0.0001023125407641139, "loss": 0.2019, "step": 11580 }, { "epoch": 244.0, "grad_norm": 0.5475245714187622, "learning_rate": 0.00010217888969476699, "loss": 0.1883, "step": 11590 }, { "epoch": 244.21052631578948, "grad_norm": 0.5728968381881714, "learning_rate": 0.00010204523473139094, "loss": 0.1979, "step": 11600 }, { "epoch": 244.42105263157896, "grad_norm": 0.6688836812973022, "learning_rate": 0.00010191157611284876, "loss": 0.1858, "step": 11610 }, { "epoch": 244.6315789473684, "grad_norm": 0.5823827385902405, "learning_rate": 0.00010177791407801017, "loss": 
0.1918, "step": 11620 }, { "epoch": 244.8421052631579, "grad_norm": 0.46818673610687256, "learning_rate": 0.0001016442488657508, "loss": 0.1982, "step": 11630 }, { "epoch": 245.05263157894737, "grad_norm": 0.6950215101242065, "learning_rate": 0.00010151058071495211, "loss": 0.1994, "step": 11640 }, { "epoch": 245.26315789473685, "grad_norm": 0.6790728569030762, "learning_rate": 0.00010137690986450079, "loss": 0.1833, "step": 11650 }, { "epoch": 245.47368421052633, "grad_norm": 0.5021740198135376, "learning_rate": 0.00010124323655328826, "loss": 0.1758, "step": 11660 }, { "epoch": 245.68421052631578, "grad_norm": 0.6168434619903564, "learning_rate": 0.00010110956102021043, "loss": 0.2001, "step": 11670 }, { "epoch": 245.89473684210526, "grad_norm": 0.6883941888809204, "learning_rate": 0.00010097588350416715, "loss": 0.2136, "step": 11680 }, { "epoch": 246.10526315789474, "grad_norm": 0.703221321105957, "learning_rate": 0.00010084220424406183, "loss": 0.1941, "step": 11690 }, { "epoch": 246.31578947368422, "grad_norm": 0.5424191951751709, "learning_rate": 0.00010070852347880095, "loss": 0.1827, "step": 11700 }, { "epoch": 246.52631578947367, "grad_norm": 0.6174150705337524, "learning_rate": 0.00010057484144729375, "loss": 0.1899, "step": 11710 }, { "epoch": 246.73684210526315, "grad_norm": 0.6344186663627625, "learning_rate": 0.00010044115838845167, "loss": 0.1847, "step": 11720 }, { "epoch": 246.94736842105263, "grad_norm": 0.5950223207473755, "learning_rate": 0.000100307474541188, "loss": 0.1944, "step": 11730 }, { "epoch": 247.1578947368421, "grad_norm": 0.6062918305397034, "learning_rate": 0.0001001737901444175, "loss": 0.1882, "step": 11740 }, { "epoch": 247.3684210526316, "grad_norm": 0.6734747290611267, "learning_rate": 0.00010004010543705583, "loss": 0.1873, "step": 11750 }, { "epoch": 247.57894736842104, "grad_norm": 0.5567269325256348, "learning_rate": 9.990642065801922e-05, "loss": 0.1905, "step": 11760 }, { "epoch": 247.78947368421052, "grad_norm": 
0.638371467590332, "learning_rate": 9.977273604622408e-05, "loss": 0.1911, "step": 11770 }, { "epoch": 248.0, "grad_norm": 0.7093550562858582, "learning_rate": 9.963905184058648e-05, "loss": 0.2124, "step": 11780 }, { "epoch": 248.21052631578948, "grad_norm": 0.5440819263458252, "learning_rate": 9.950536828002174e-05, "loss": 0.1972, "step": 11790 }, { "epoch": 248.42105263157896, "grad_norm": 0.4559662640094757, "learning_rate": 9.937168560344412e-05, "loss": 0.1837, "step": 11800 }, { "epoch": 248.6315789473684, "grad_norm": 0.6494418978691101, "learning_rate": 9.923800404976619e-05, "loss": 0.2035, "step": 11810 }, { "epoch": 248.8421052631579, "grad_norm": 0.6026273369789124, "learning_rate": 9.910432385789855e-05, "loss": 0.1968, "step": 11820 }, { "epoch": 249.05263157894737, "grad_norm": 0.5863096714019775, "learning_rate": 9.897064526674944e-05, "loss": 0.198, "step": 11830 }, { "epoch": 249.26315789473685, "grad_norm": 0.6923006176948547, "learning_rate": 9.883696851522412e-05, "loss": 0.1906, "step": 11840 }, { "epoch": 249.47368421052633, "grad_norm": 0.6653925180435181, "learning_rate": 9.870329384222465e-05, "loss": 0.1976, "step": 11850 }, { "epoch": 249.68421052631578, "grad_norm": 0.6321898698806763, "learning_rate": 9.85696214866493e-05, "loss": 0.1967, "step": 11860 }, { "epoch": 249.89473684210526, "grad_norm": 0.5583874583244324, "learning_rate": 9.843595168739233e-05, "loss": 0.1899, "step": 11870 }, { "epoch": 250.10526315789474, "grad_norm": 0.5806682109832764, "learning_rate": 9.830228468334329e-05, "loss": 0.1894, "step": 11880 }, { "epoch": 250.31578947368422, "grad_norm": 0.519559383392334, "learning_rate": 9.816862071338675e-05, "loss": 0.1802, "step": 11890 }, { "epoch": 250.52631578947367, "grad_norm": 0.6656150817871094, "learning_rate": 9.803496001640198e-05, "loss": 0.189, "step": 11900 }, { "epoch": 250.73684210526315, "grad_norm": 1.0183082818984985, "learning_rate": 9.790130283126226e-05, "loss": 0.2056, "step": 11910 }, { 
"epoch": 250.94736842105263, "grad_norm": 0.5636556148529053, "learning_rate": 9.776764939683463e-05, "loss": 0.1879, "step": 11920 }, { "epoch": 251.1578947368421, "grad_norm": 0.5543136596679688, "learning_rate": 9.763399995197955e-05, "loss": 0.2136, "step": 11930 }, { "epoch": 251.3684210526316, "grad_norm": 0.48448771238327026, "learning_rate": 9.750035473555016e-05, "loss": 0.1811, "step": 11940 }, { "epoch": 251.57894736842104, "grad_norm": 0.7867603302001953, "learning_rate": 9.736671398639217e-05, "loss": 0.1843, "step": 11950 }, { "epoch": 251.78947368421052, "grad_norm": 0.6914984583854675, "learning_rate": 9.723307794334322e-05, "loss": 0.1912, "step": 11960 }, { "epoch": 252.0, "grad_norm": null, "learning_rate": 9.711280972571503e-05, "loss": 0.2048, "step": 11970 }, { "epoch": 252.21052631578948, "grad_norm": 0.4894615411758423, "learning_rate": 9.697918328224071e-05, "loss": 0.188, "step": 11980 }, { "epoch": 252.42105263157896, "grad_norm": 0.7946804761886597, "learning_rate": 9.684556223745563e-05, "loss": 0.1935, "step": 11990 }, { "epoch": 252.6315789473684, "grad_norm": 0.5945575833320618, "learning_rate": 9.671194683016235e-05, "loss": 0.1974, "step": 12000 }, { "epoch": 252.8421052631579, "grad_norm": 0.5779606103897095, "learning_rate": 9.65783372991532e-05, "loss": 0.1951, "step": 12010 }, { "epoch": 253.05263157894737, "grad_norm": 0.5105938911437988, "learning_rate": 9.644473388321008e-05, "loss": 0.1904, "step": 12020 }, { "epoch": 253.26315789473685, "grad_norm": 0.6342964172363281, "learning_rate": 9.631113682110396e-05, "loss": 0.1927, "step": 12030 }, { "epoch": 253.47368421052633, "grad_norm": 0.6746343970298767, "learning_rate": 9.61775463515945e-05, "loss": 0.1941, "step": 12040 }, { "epoch": 253.68421052631578, "grad_norm": 0.6832113862037659, "learning_rate": 9.604396271342943e-05, "loss": 0.1879, "step": 12050 }, { "epoch": 253.89473684210526, "grad_norm": 0.4744756817817688, "learning_rate": 9.59103861453445e-05, "loss": 
0.1896, "step": 12060 }, { "epoch": 254.10526315789474, "grad_norm": 0.6902132034301758, "learning_rate": 9.577681688606262e-05, "loss": 0.1884, "step": 12070 }, { "epoch": 254.31578947368422, "grad_norm": 0.5863637924194336, "learning_rate": 9.564325517429369e-05, "loss": 0.2072, "step": 12080 }, { "epoch": 254.52631578947367, "grad_norm": 0.5211676359176636, "learning_rate": 9.550970124873417e-05, "loss": 0.1781, "step": 12090 }, { "epoch": 254.73684210526315, "grad_norm": 0.5536790490150452, "learning_rate": 9.537615534806662e-05, "loss": 0.1973, "step": 12100 }, { "epoch": 254.94736842105263, "grad_norm": 0.5267168283462524, "learning_rate": 9.524261771095919e-05, "loss": 0.1871, "step": 12110 }, { "epoch": 255.1578947368421, "grad_norm": 0.6351366639137268, "learning_rate": 9.510908857606522e-05, "loss": 0.1895, "step": 12120 }, { "epoch": 255.3684210526316, "grad_norm": 0.5926997661590576, "learning_rate": 9.497556818202306e-05, "loss": 0.181, "step": 12130 }, { "epoch": 255.57894736842104, "grad_norm": 0.7168594002723694, "learning_rate": 9.48420567674552e-05, "loss": 0.1904, "step": 12140 }, { "epoch": 255.78947368421052, "grad_norm": 0.6090984344482422, "learning_rate": 9.470855457096824e-05, "loss": 0.1957, "step": 12150 }, { "epoch": 256.0, "grad_norm": 0.5657641887664795, "learning_rate": 9.457506183115217e-05, "loss": 0.2069, "step": 12160 }, { "epoch": 256.2105263157895, "grad_norm": 0.5447742342948914, "learning_rate": 9.444157878658028e-05, "loss": 0.1941, "step": 12170 }, { "epoch": 256.42105263157896, "grad_norm": 0.6158611178398132, "learning_rate": 9.430810567580836e-05, "loss": 0.194, "step": 12180 }, { "epoch": 256.63157894736844, "grad_norm": 0.5748459100723267, "learning_rate": 9.417464273737444e-05, "loss": 0.1948, "step": 12190 }, { "epoch": 256.8421052631579, "grad_norm": 0.5924521088600159, "learning_rate": 9.404119020979853e-05, "loss": 0.1962, "step": 12200 }, { "epoch": 257.05263157894734, "grad_norm": 0.6660053730010986, 
"learning_rate": 9.390774833158186e-05, "loss": 0.1842, "step": 12210 }, { "epoch": 257.2631578947368, "grad_norm": 0.5104465484619141, "learning_rate": 9.377431734120673e-05, "loss": 0.1955, "step": 12220 }, { "epoch": 257.4736842105263, "grad_norm": 0.4942176342010498, "learning_rate": 9.364089747713599e-05, "loss": 0.1897, "step": 12230 }, { "epoch": 257.6842105263158, "grad_norm": 0.49282151460647583, "learning_rate": 9.350748897781254e-05, "loss": 0.1835, "step": 12240 }, { "epoch": 257.89473684210526, "grad_norm": 0.5877977013587952, "learning_rate": 9.337409208165898e-05, "loss": 0.2036, "step": 12250 }, { "epoch": 258.10526315789474, "grad_norm": 0.4874722361564636, "learning_rate": 9.32407070270772e-05, "loss": 0.1842, "step": 12260 }, { "epoch": 258.3157894736842, "grad_norm": 0.6626269221305847, "learning_rate": 9.310733405244795e-05, "loss": 0.187, "step": 12270 }, { "epoch": 258.5263157894737, "grad_norm": 0.5759614109992981, "learning_rate": 9.297397339613035e-05, "loss": 0.2007, "step": 12280 }, { "epoch": 258.7368421052632, "grad_norm": 0.5541868805885315, "learning_rate": 9.284062529646146e-05, "loss": 0.1879, "step": 12290 }, { "epoch": 258.94736842105266, "grad_norm": 0.5124220252037048, "learning_rate": 9.270728999175605e-05, "loss": 0.1843, "step": 12300 }, { "epoch": 259.1578947368421, "grad_norm": 0.39318570494651794, "learning_rate": 9.257396772030589e-05, "loss": 0.1904, "step": 12310 }, { "epoch": 259.36842105263156, "grad_norm": 0.5387923121452332, "learning_rate": 9.244065872037946e-05, "loss": 0.1837, "step": 12320 }, { "epoch": 259.57894736842104, "grad_norm": 0.5390989780426025, "learning_rate": 9.230736323022157e-05, "loss": 0.1773, "step": 12330 }, { "epoch": 259.7894736842105, "grad_norm": 0.7210453748703003, "learning_rate": 9.217408148805292e-05, "loss": 0.2003, "step": 12340 }, { "epoch": 260.0, "grad_norm": 0.5788223743438721, "learning_rate": 9.204081373206958e-05, "loss": 0.1979, "step": 12350 }, { "epoch": 260.2105263157895, 
"grad_norm": 0.6655392050743103, "learning_rate": 9.190756020044257e-05, "loss": 0.1754, "step": 12360 }, { "epoch": 260.42105263157896, "grad_norm": 0.6042269468307495, "learning_rate": 9.177432113131766e-05, "loss": 0.192, "step": 12370 }, { "epoch": 260.63157894736844, "grad_norm": 0.8559859395027161, "learning_rate": 9.164109676281458e-05, "loss": 0.2043, "step": 12380 }, { "epoch": 260.8421052631579, "grad_norm": 0.5184524059295654, "learning_rate": 9.150788733302691e-05, "loss": 0.1964, "step": 12390 }, { "epoch": 261.05263157894734, "grad_norm": 0.6655091643333435, "learning_rate": 9.137469308002154e-05, "loss": 0.211, "step": 12400 }, { "epoch": 261.2631578947368, "grad_norm": 0.5456916689872742, "learning_rate": 9.124151424183817e-05, "loss": 0.1753, "step": 12410 }, { "epoch": 261.4736842105263, "grad_norm": 0.6656398177146912, "learning_rate": 9.110835105648898e-05, "loss": 0.1864, "step": 12420 }, { "epoch": 261.6842105263158, "grad_norm": 0.5527902245521545, "learning_rate": 9.097520376195811e-05, "loss": 0.202, "step": 12430 }, { "epoch": 261.89473684210526, "grad_norm": 0.5181113481521606, "learning_rate": 9.084207259620144e-05, "loss": 0.1909, "step": 12440 }, { "epoch": 262.10526315789474, "grad_norm": 0.4919548034667969, "learning_rate": 9.070895779714597e-05, "loss": 0.1996, "step": 12450 }, { "epoch": 262.3157894736842, "grad_norm": 0.5647410154342651, "learning_rate": 9.057585960268931e-05, "loss": 0.1774, "step": 12460 }, { "epoch": 262.5263157894737, "grad_norm": 0.5741182565689087, "learning_rate": 9.044277825069967e-05, "loss": 0.1846, "step": 12470 }, { "epoch": 262.7368421052632, "grad_norm": 0.7225587964057922, "learning_rate": 9.030971397901491e-05, "loss": 0.1999, "step": 12480 }, { "epoch": 262.94736842105266, "grad_norm": 0.4701984226703644, "learning_rate": 9.017666702544245e-05, "loss": 0.1977, "step": 12490 }, { "epoch": 263.1578947368421, "grad_norm": 0.5436278581619263, "learning_rate": 9.00436376277588e-05, "loss": 0.1948, 
"step": 12500 }, { "epoch": 263.36842105263156, "grad_norm": 0.5927190780639648, "learning_rate": 8.991062602370907e-05, "loss": 0.2018, "step": 12510 }, { "epoch": 263.57894736842104, "grad_norm": 0.5915952324867249, "learning_rate": 8.977763245100656e-05, "loss": 0.1815, "step": 12520 }, { "epoch": 263.7894736842105, "grad_norm": 0.6053709387779236, "learning_rate": 8.964465714733229e-05, "loss": 0.2057, "step": 12530 }, { "epoch": 264.0, "grad_norm": 0.5991526246070862, "learning_rate": 8.951170035033478e-05, "loss": 0.1847, "step": 12540 }, { "epoch": 264.2105263157895, "grad_norm": 0.43415001034736633, "learning_rate": 8.937876229762933e-05, "loss": 0.1909, "step": 12550 }, { "epoch": 264.42105263157896, "grad_norm": 0.571382462978363, "learning_rate": 8.92458432267978e-05, "loss": 0.1816, "step": 12560 }, { "epoch": 264.63157894736844, "grad_norm": 0.5127904415130615, "learning_rate": 8.911294337538813e-05, "loss": 0.1895, "step": 12570 }, { "epoch": 264.8421052631579, "grad_norm": 0.5136194825172424, "learning_rate": 8.898006298091392e-05, "loss": 0.2023, "step": 12580 }, { "epoch": 265.05263157894734, "grad_norm": 0.5507082343101501, "learning_rate": 8.884720228085397e-05, "loss": 0.1958, "step": 12590 }, { "epoch": 265.2631578947368, "grad_norm": 0.441049188375473, "learning_rate": 8.871436151265184e-05, "loss": 0.1869, "step": 12600 }, { "epoch": 265.4736842105263, "grad_norm": 0.5187159776687622, "learning_rate": 8.85815409137156e-05, "loss": 0.184, "step": 12610 }, { "epoch": 265.6842105263158, "grad_norm": 0.6891997456550598, "learning_rate": 8.844874072141715e-05, "loss": 0.1951, "step": 12620 }, { "epoch": 265.89473684210526, "grad_norm": 0.6691162586212158, "learning_rate": 8.831596117309195e-05, "loss": 0.1935, "step": 12630 }, { "epoch": 266.10526315789474, "grad_norm": 0.8119604587554932, "learning_rate": 8.818320250603866e-05, "loss": 0.1909, "step": 12640 }, { "epoch": 266.3157894736842, "grad_norm": 0.5154120326042175, "learning_rate": 
8.80504649575185e-05, "loss": 0.1858, "step": 12650 }, { "epoch": 266.5263157894737, "grad_norm": 0.5835145711898804, "learning_rate": 8.7917748764755e-05, "loss": 0.1811, "step": 12660 }, { "epoch": 266.7368421052632, "grad_norm": 0.601320207118988, "learning_rate": 8.778505416493343e-05, "loss": 0.1929, "step": 12670 }, { "epoch": 266.94736842105266, "grad_norm": 0.6263984441757202, "learning_rate": 8.765238139520067e-05, "loss": 0.2011, "step": 12680 }, { "epoch": 267.1578947368421, "grad_norm": 0.6324906945228577, "learning_rate": 8.751973069266444e-05, "loss": 0.1813, "step": 12690 }, { "epoch": 267.36842105263156, "grad_norm": 0.5795466303825378, "learning_rate": 8.7387102294393e-05, "loss": 0.1942, "step": 12700 }, { "epoch": 267.57894736842104, "grad_norm": 0.4995231330394745, "learning_rate": 8.725449643741487e-05, "loss": 0.1838, "step": 12710 }, { "epoch": 267.7894736842105, "grad_norm": 0.6501296162605286, "learning_rate": 8.712191335871822e-05, "loss": 0.1951, "step": 12720 }, { "epoch": 268.0, "grad_norm": 0.7934373617172241, "learning_rate": 8.698935329525043e-05, "loss": 0.1987, "step": 12730 }, { "epoch": 268.2105263157895, "grad_norm": 0.5606156587600708, "learning_rate": 8.685681648391791e-05, "loss": 0.1904, "step": 12740 }, { "epoch": 268.42105263157896, "grad_norm": 0.5056310296058655, "learning_rate": 8.672430316158541e-05, "loss": 0.1855, "step": 12750 }, { "epoch": 268.63157894736844, "grad_norm": 0.48892274498939514, "learning_rate": 8.659181356507571e-05, "loss": 0.1878, "step": 12760 }, { "epoch": 268.8421052631579, "grad_norm": 0.4638197422027588, "learning_rate": 8.645934793116917e-05, "loss": 0.1937, "step": 12770 }, { "epoch": 269.05263157894734, "grad_norm": 0.3581981956958771, "learning_rate": 8.632690649660342e-05, "loss": 0.2066, "step": 12780 }, { "epoch": 269.2631578947368, "grad_norm": 0.4948786795139313, "learning_rate": 8.619448949807274e-05, "loss": 0.1829, "step": 12790 }, { "epoch": 269.4736842105263, "grad_norm": 
0.6615347862243652, "learning_rate": 8.606209717222777e-05, "loss": 0.1841, "step": 12800 }, { "epoch": 269.6842105263158, "grad_norm": 0.5024154782295227, "learning_rate": 8.59297297556751e-05, "loss": 0.1941, "step": 12810 }, { "epoch": 269.89473684210526, "grad_norm": 0.5818947553634644, "learning_rate": 8.579738748497675e-05, "loss": 0.1993, "step": 12820 }, { "epoch": 270.10526315789474, "grad_norm": 0.44670799374580383, "learning_rate": 8.566507059664981e-05, "loss": 0.1799, "step": 12830 }, { "epoch": 270.3157894736842, "grad_norm": 0.5685772895812988, "learning_rate": 8.553277932716599e-05, "loss": 0.2018, "step": 12840 }, { "epoch": 270.5263157894737, "grad_norm": 0.7774202823638916, "learning_rate": 8.540051391295125e-05, "loss": 0.175, "step": 12850 }, { "epoch": 270.7368421052632, "grad_norm": 0.6112064719200134, "learning_rate": 8.52682745903854e-05, "loss": 0.1916, "step": 12860 }, { "epoch": 270.94736842105266, "grad_norm": 0.5548771619796753, "learning_rate": 8.513606159580142e-05, "loss": 0.195, "step": 12870 }, { "epoch": 271.1578947368421, "grad_norm": 0.5652728080749512, "learning_rate": 8.500387516548549e-05, "loss": 0.185, "step": 12880 }, { "epoch": 271.36842105263156, "grad_norm": 0.5679665803909302, "learning_rate": 8.487171553567616e-05, "loss": 0.1895, "step": 12890 }, { "epoch": 271.57894736842104, "grad_norm": 0.6408627033233643, "learning_rate": 8.473958294256406e-05, "loss": 0.1915, "step": 12900 }, { "epoch": 271.7894736842105, "grad_norm": 0.6951084733009338, "learning_rate": 8.460747762229164e-05, "loss": 0.202, "step": 12910 }, { "epoch": 272.0, "grad_norm": 0.6909904479980469, "learning_rate": 8.447539981095246e-05, "loss": 0.1847, "step": 12920 }, { "epoch": 272.2105263157895, "grad_norm": 0.47111183404922485, "learning_rate": 8.434334974459104e-05, "loss": 0.1627, "step": 12930 }, { "epoch": 272.42105263157896, "grad_norm": 0.5050517320632935, "learning_rate": 8.421132765920219e-05, "loss": 0.1828, "step": 12940 }, { "epoch": 
272.63157894736844, "grad_norm": 0.8324809670448303, "learning_rate": 8.407933379073086e-05, "loss": 0.2063, "step": 12950 }, { "epoch": 272.8421052631579, "grad_norm": 0.5706649422645569, "learning_rate": 8.394736837507148e-05, "loss": 0.1949, "step": 12960 }, { "epoch": 273.05263157894734, "grad_norm": 0.5625925064086914, "learning_rate": 8.381543164806756e-05, "loss": 0.2075, "step": 12970 }, { "epoch": 273.2631578947368, "grad_norm": 0.4872249662876129, "learning_rate": 8.368352384551153e-05, "loss": 0.1857, "step": 12980 }, { "epoch": 273.4736842105263, "grad_norm": 0.5305653810501099, "learning_rate": 8.3551645203144e-05, "loss": 0.1987, "step": 12990 }, { "epoch": 273.6842105263158, "grad_norm": 0.46276259422302246, "learning_rate": 8.341979595665346e-05, "loss": 0.1812, "step": 13000 }, { "epoch": 273.89473684210526, "grad_norm": 0.6188380122184753, "learning_rate": 8.328797634167586e-05, "loss": 0.1948, "step": 13010 }, { "epoch": 274.10526315789474, "grad_norm": 0.4250720143318176, "learning_rate": 8.315618659379429e-05, "loss": 0.1867, "step": 13020 }, { "epoch": 274.3157894736842, "grad_norm": 0.48751601576805115, "learning_rate": 8.302442694853838e-05, "loss": 0.1806, "step": 13030 }, { "epoch": 274.5263157894737, "grad_norm": 0.45683354139328003, "learning_rate": 8.289269764138393e-05, "loss": 0.1804, "step": 13040 }, { "epoch": 274.7368421052632, "grad_norm": 0.6656365990638733, "learning_rate": 8.276099890775266e-05, "loss": 0.2056, "step": 13050 }, { "epoch": 274.94736842105266, "grad_norm": 0.4965307414531708, "learning_rate": 8.262933098301152e-05, "loss": 0.1938, "step": 13060 }, { "epoch": 275.1578947368421, "grad_norm": 0.5132878422737122, "learning_rate": 8.249769410247239e-05, "loss": 0.192, "step": 13070 }, { "epoch": 275.36842105263156, "grad_norm": 0.5989632606506348, "learning_rate": 8.23660885013918e-05, "loss": 0.1897, "step": 13080 }, { "epoch": 275.57894736842104, "grad_norm": 0.6420390009880066, "learning_rate": 
8.223451441497026e-05, "loss": 0.1839, "step": 13090 }, { "epoch": 275.7894736842105, "grad_norm": 0.6834862232208252, "learning_rate": 8.2102972078352e-05, "loss": 0.205, "step": 13100 }, { "epoch": 276.0, "grad_norm": 0.5762868523597717, "learning_rate": 8.197146172662447e-05, "loss": 0.195, "step": 13110 }, { "epoch": 276.2105263157895, "grad_norm": 0.4554460346698761, "learning_rate": 8.183998359481806e-05, "loss": 0.1782, "step": 13120 }, { "epoch": 276.42105263157896, "grad_norm": 0.720638632774353, "learning_rate": 8.170853791790547e-05, "loss": 0.1919, "step": 13130 }, { "epoch": 276.63157894736844, "grad_norm": 0.5189310908317566, "learning_rate": 8.15771249308014e-05, "loss": 0.1967, "step": 13140 }, { "epoch": 276.8421052631579, "grad_norm": 0.8014587759971619, "learning_rate": 8.144574486836228e-05, "loss": 0.1936, "step": 13150 }, { "epoch": 277.05263157894734, "grad_norm": 0.5890443921089172, "learning_rate": 8.131439796538546e-05, "loss": 0.1888, "step": 13160 }, { "epoch": 277.2631578947368, "grad_norm": 0.5942811369895935, "learning_rate": 8.118308445660923e-05, "loss": 0.1883, "step": 13170 }, { "epoch": 277.4736842105263, "grad_norm": 0.5413142442703247, "learning_rate": 8.105180457671204e-05, "loss": 0.1807, "step": 13180 }, { "epoch": 277.6842105263158, "grad_norm": 0.5064405798912048, "learning_rate": 8.092055856031244e-05, "loss": 0.1935, "step": 13190 }, { "epoch": 277.89473684210526, "grad_norm": 0.6160650253295898, "learning_rate": 8.078934664196825e-05, "loss": 0.1909, "step": 13200 }, { "epoch": 278.10526315789474, "grad_norm": 0.7421651482582092, "learning_rate": 8.065816905617647e-05, "loss": 0.1884, "step": 13210 }, { "epoch": 278.3157894736842, "grad_norm": 0.5469620227813721, "learning_rate": 8.052702603737272e-05, "loss": 0.2007, "step": 13220 }, { "epoch": 278.5263157894737, "grad_norm": 0.6061626076698303, "learning_rate": 8.039591781993086e-05, "loss": 0.1947, "step": 13230 }, { "epoch": 278.7368421052632, "grad_norm": 
0.5570159554481506, "learning_rate": 8.026484463816245e-05, "loss": 0.1858, "step": 13240 }, { "epoch": 278.94736842105266, "grad_norm": 0.593967080116272, "learning_rate": 8.013380672631664e-05, "loss": 0.2003, "step": 13250 }, { "epoch": 279.1578947368421, "grad_norm": 0.6793128848075867, "learning_rate": 8.000280431857933e-05, "loss": 0.2016, "step": 13260 }, { "epoch": 279.36842105263156, "grad_norm": 0.7546000480651855, "learning_rate": 7.98718376490731e-05, "loss": 0.1938, "step": 13270 }, { "epoch": 279.57894736842104, "grad_norm": 0.5035498142242432, "learning_rate": 7.97409069518566e-05, "loss": 0.1867, "step": 13280 }, { "epoch": 279.7894736842105, "grad_norm": 0.6848862767219543, "learning_rate": 7.961001246092427e-05, "loss": 0.1838, "step": 13290 }, { "epoch": 280.0, "grad_norm": 0.607519805431366, "learning_rate": 7.947915441020575e-05, "loss": 0.1892, "step": 13300 }, { "epoch": 280.2105263157895, "grad_norm": 0.6071075797080994, "learning_rate": 7.934833303356556e-05, "loss": 0.1899, "step": 13310 }, { "epoch": 280.42105263157896, "grad_norm": 0.5092148780822754, "learning_rate": 7.921754856480279e-05, "loss": 0.1826, "step": 13320 }, { "epoch": 280.63157894736844, "grad_norm": 0.5188595652580261, "learning_rate": 7.908680123765043e-05, "loss": 0.1899, "step": 13330 }, { "epoch": 280.8421052631579, "grad_norm": 0.6495827436447144, "learning_rate": 7.895609128577514e-05, "loss": 0.1933, "step": 13340 }, { "epoch": 281.05263157894734, "grad_norm": 0.5069420337677002, "learning_rate": 7.882541894277689e-05, "loss": 0.1933, "step": 13350 }, { "epoch": 281.2631578947368, "grad_norm": 0.6902370452880859, "learning_rate": 7.869478444218828e-05, "loss": 0.189, "step": 13360 }, { "epoch": 281.4736842105263, "grad_norm": 0.6552804112434387, "learning_rate": 7.856418801747435e-05, "loss": 0.1977, "step": 13370 }, { "epoch": 281.6842105263158, "grad_norm": 0.4557342529296875, "learning_rate": 7.843362990203205e-05, "loss": 0.1981, "step": 13380 }, { "epoch": 
281.89473684210526, "grad_norm": 0.44700777530670166, "learning_rate": 7.830311032918994e-05, "loss": 0.1859, "step": 13390 }, { "epoch": 282.10526315789474, "grad_norm": 0.46295836567878723, "learning_rate": 7.817262953220769e-05, "loss": 0.1845, "step": 13400 }, { "epoch": 282.3157894736842, "grad_norm": 0.4293200373649597, "learning_rate": 7.804218774427558e-05, "loss": 0.1884, "step": 13410 }, { "epoch": 282.5263157894737, "grad_norm": 0.5236184000968933, "learning_rate": 7.791178519851427e-05, "loss": 0.1806, "step": 13420 }, { "epoch": 282.7368421052632, "grad_norm": 0.6687787175178528, "learning_rate": 7.778142212797428e-05, "loss": 0.1915, "step": 13430 }, { "epoch": 282.94736842105266, "grad_norm": 0.5880486369132996, "learning_rate": 7.765109876563547e-05, "loss": 0.1934, "step": 13440 }, { "epoch": 283.1578947368421, "grad_norm": 0.4489704966545105, "learning_rate": 7.752081534440689e-05, "loss": 0.182, "step": 13450 }, { "epoch": 283.36842105263156, "grad_norm": 0.6451957821846008, "learning_rate": 7.739057209712612e-05, "loss": 0.1884, "step": 13460 }, { "epoch": 283.57894736842104, "grad_norm": 0.7118418216705322, "learning_rate": 7.726036925655897e-05, "loss": 0.2025, "step": 13470 }, { "epoch": 283.7894736842105, "grad_norm": 0.49697622656822205, "learning_rate": 7.713020705539898e-05, "loss": 0.1944, "step": 13480 }, { "epoch": 284.0, "grad_norm": 0.6399528384208679, "learning_rate": 7.700008572626718e-05, "loss": 0.1996, "step": 13490 }, { "epoch": 284.2105263157895, "grad_norm": 0.4944116771221161, "learning_rate": 7.687000550171143e-05, "loss": 0.1754, "step": 13500 }, { "epoch": 284.42105263157896, "grad_norm": 0.44895878434181213, "learning_rate": 7.67399666142062e-05, "loss": 0.1752, "step": 13510 }, { "epoch": 284.63157894736844, "grad_norm": 0.5446719527244568, "learning_rate": 7.660996929615206e-05, "loss": 0.1885, "step": 13520 }, { "epoch": 284.8421052631579, "grad_norm": 0.5553343296051025, "learning_rate": 7.648001377987533e-05, 
"loss": 0.1954, "step": 13530 }, { "epoch": 285.05263157894734, "grad_norm": 0.5816590785980225, "learning_rate": 7.635010029762756e-05, "loss": 0.2043, "step": 13540 }, { "epoch": 285.2631578947368, "grad_norm": 0.6026872992515564, "learning_rate": 7.622022908158518e-05, "loss": 0.1834, "step": 13550 }, { "epoch": 285.4736842105263, "grad_norm": 0.4454641044139862, "learning_rate": 7.609040036384915e-05, "loss": 0.1829, "step": 13560 }, { "epoch": 285.6842105263158, "grad_norm": 0.4715535044670105, "learning_rate": 7.596061437644444e-05, "loss": 0.1975, "step": 13570 }, { "epoch": 285.89473684210526, "grad_norm": 0.5461314916610718, "learning_rate": 7.583087135131961e-05, "loss": 0.1976, "step": 13580 }, { "epoch": 286.10526315789474, "grad_norm": 0.6173868179321289, "learning_rate": 7.570117152034655e-05, "loss": 0.1919, "step": 13590 }, { "epoch": 286.3157894736842, "grad_norm": 0.5752949118614197, "learning_rate": 7.557151511531986e-05, "loss": 0.1893, "step": 13600 }, { "epoch": 286.5263157894737, "grad_norm": 0.5035438537597656, "learning_rate": 7.544190236795655e-05, "loss": 0.1973, "step": 13610 }, { "epoch": 286.7368421052632, "grad_norm": 0.5442306399345398, "learning_rate": 7.531233350989558e-05, "loss": 0.2008, "step": 13620 }, { "epoch": 286.94736842105266, "grad_norm": 0.6274810433387756, "learning_rate": 7.518280877269755e-05, "loss": 0.1792, "step": 13630 }, { "epoch": 287.1578947368421, "grad_norm": 0.5484516620635986, "learning_rate": 7.50533283878442e-05, "loss": 0.1898, "step": 13640 }, { "epoch": 287.36842105263156, "grad_norm": 0.5255657434463501, "learning_rate": 7.492389258673787e-05, "loss": 0.1921, "step": 13650 }, { "epoch": 287.57894736842104, "grad_norm": 0.4797198176383972, "learning_rate": 7.479450160070145e-05, "loss": 0.1794, "step": 13660 }, { "epoch": 287.7894736842105, "grad_norm": 0.6554931402206421, "learning_rate": 7.466515566097753e-05, "loss": 0.1912, "step": 13670 }, { "epoch": 288.0, "grad_norm": 0.5043107271194458, 
"learning_rate": 7.453585499872826e-05, "loss": 0.1884, "step": 13680 }, { "epoch": 288.2105263157895, "grad_norm": 0.6006696224212646, "learning_rate": 7.440659984503495e-05, "loss": 0.1793, "step": 13690 }, { "epoch": 288.42105263157896, "grad_norm": 0.5427776575088501, "learning_rate": 7.427739043089753e-05, "loss": 0.1773, "step": 13700 }, { "epoch": 288.63157894736844, "grad_norm": 0.5292754769325256, "learning_rate": 7.41482269872341e-05, "loss": 0.2064, "step": 13710 }, { "epoch": 288.8421052631579, "grad_norm": 0.485466867685318, "learning_rate": 7.401910974488069e-05, "loss": 0.1936, "step": 13720 }, { "epoch": 289.05263157894734, "grad_norm": 0.5587201714515686, "learning_rate": 7.389003893459081e-05, "loss": 0.1942, "step": 13730 }, { "epoch": 289.2631578947368, "grad_norm": 0.5549042820930481, "learning_rate": 7.376101478703485e-05, "loss": 0.1901, "step": 13740 }, { "epoch": 289.4736842105263, "grad_norm": 0.48474541306495667, "learning_rate": 7.363203753279992e-05, "loss": 0.1956, "step": 13750 }, { "epoch": 289.6842105263158, "grad_norm": 0.44179201126098633, "learning_rate": 7.35031074023893e-05, "loss": 0.1898, "step": 13760 }, { "epoch": 289.89473684210526, "grad_norm": 0.7074770927429199, "learning_rate": 7.337422462622203e-05, "loss": 0.1908, "step": 13770 }, { "epoch": 290.10526315789474, "grad_norm": 0.6862475872039795, "learning_rate": 7.324538943463251e-05, "loss": 0.1944, "step": 13780 }, { "epoch": 290.3157894736842, "grad_norm": 0.5927271842956543, "learning_rate": 7.31166020578701e-05, "loss": 0.1913, "step": 13790 }, { "epoch": 290.5263157894737, "grad_norm": 0.5208861231803894, "learning_rate": 7.298786272609878e-05, "loss": 0.1887, "step": 13800 }, { "epoch": 290.7368421052632, "grad_norm": 0.6315051317214966, "learning_rate": 7.285917166939658e-05, "loss": 0.183, "step": 13810 }, { "epoch": 290.94736842105266, "grad_norm": 0.6303961277008057, "learning_rate": 7.273052911775524e-05, "loss": 0.1909, "step": 13820 }, { "epoch": 
291.1578947368421, "grad_norm": 0.5073893666267395, "learning_rate": 7.260193530107994e-05, "loss": 0.192, "step": 13830 }, { "epoch": 291.36842105263156, "grad_norm": 0.501746654510498, "learning_rate": 7.247339044918867e-05, "loss": 0.1808, "step": 13840 }, { "epoch": 291.57894736842104, "grad_norm": 0.6303046941757202, "learning_rate": 7.234489479181185e-05, "loss": 0.1782, "step": 13850 }, { "epoch": 291.7894736842105, "grad_norm": 0.6213247179985046, "learning_rate": 7.221644855859213e-05, "loss": 0.1964, "step": 13860 }, { "epoch": 292.0, "grad_norm": 0.5928416848182678, "learning_rate": 7.208805197908372e-05, "loss": 0.1919, "step": 13870 }, { "epoch": 292.2105263157895, "grad_norm": 0.5291090607643127, "learning_rate": 7.195970528275213e-05, "loss": 0.1875, "step": 13880 }, { "epoch": 292.42105263157896, "grad_norm": 0.5044816732406616, "learning_rate": 7.18314086989737e-05, "loss": 0.1867, "step": 13890 }, { "epoch": 292.63157894736844, "grad_norm": 0.6854403018951416, "learning_rate": 7.170316245703528e-05, "loss": 0.1819, "step": 13900 }, { "epoch": 292.8421052631579, "grad_norm": 0.4865442216396332, "learning_rate": 7.157496678613367e-05, "loss": 0.1896, "step": 13910 }, { "epoch": 293.05263157894734, "grad_norm": 0.6360872387886047, "learning_rate": 7.144682191537527e-05, "loss": 0.1981, "step": 13920 }, { "epoch": 293.2631578947368, "grad_norm": 0.45435404777526855, "learning_rate": 7.131872807377581e-05, "loss": 0.1923, "step": 13930 }, { "epoch": 293.4736842105263, "grad_norm": 0.6130965352058411, "learning_rate": 7.119068549025976e-05, "loss": 0.1824, "step": 13940 }, { "epoch": 293.6842105263158, "grad_norm": 0.6408197283744812, "learning_rate": 7.106269439365993e-05, "loss": 0.1987, "step": 13950 }, { "epoch": 293.89473684210526, "grad_norm": 0.5750312209129333, "learning_rate": 7.093475501271716e-05, "loss": 0.1844, "step": 13960 }, { "epoch": 294.10526315789474, "grad_norm": 0.4592019021511078, "learning_rate": 7.08068675760799e-05, "loss": 
0.1886, "step": 13970 }, { "epoch": 294.3157894736842, "grad_norm": 0.5203351378440857, "learning_rate": 7.067903231230374e-05, "loss": 0.2025, "step": 13980 }, { "epoch": 294.5263157894737, "grad_norm": 0.9253612160682678, "learning_rate": 7.055124944985096e-05, "loss": 0.1964, "step": 13990 }, { "epoch": 294.7368421052632, "grad_norm": 0.5490900874137878, "learning_rate": 7.042351921709037e-05, "loss": 0.1786, "step": 14000 }, { "epoch": 294.94736842105266, "grad_norm": 0.5033005475997925, "learning_rate": 7.029584184229653e-05, "loss": 0.193, "step": 14010 }, { "epoch": 295.1578947368421, "grad_norm": 0.5323711037635803, "learning_rate": 7.016821755364957e-05, "loss": 0.1834, "step": 14020 }, { "epoch": 295.36842105263156, "grad_norm": 0.5181049108505249, "learning_rate": 7.00406465792349e-05, "loss": 0.1997, "step": 14030 }, { "epoch": 295.57894736842104, "grad_norm": 0.5805120468139648, "learning_rate": 6.991312914704242e-05, "loss": 0.187, "step": 14040 }, { "epoch": 295.7894736842105, "grad_norm": 0.521169900894165, "learning_rate": 6.978566548496657e-05, "loss": 0.1825, "step": 14050 }, { "epoch": 296.0, "grad_norm": 0.47887876629829407, "learning_rate": 6.965825582080545e-05, "loss": 0.2017, "step": 14060 }, { "epoch": 296.2105263157895, "grad_norm": 0.49226275086402893, "learning_rate": 6.953090038226092e-05, "loss": 0.1998, "step": 14070 }, { "epoch": 296.42105263157896, "grad_norm": 0.5467112064361572, "learning_rate": 6.940359939693772e-05, "loss": 0.1936, "step": 14080 }, { "epoch": 296.63157894736844, "grad_norm": 0.5136857628822327, "learning_rate": 6.927635309234335e-05, "loss": 0.1757, "step": 14090 }, { "epoch": 296.8421052631579, "grad_norm": 0.47038963437080383, "learning_rate": 6.916187835818779e-05, "loss": 0.1933, "step": 14100 }, { "epoch": 297.05263157894734, "grad_norm": 0.5792372822761536, "learning_rate": 6.903473657341111e-05, "loss": 0.1895, "step": 14110 }, { "epoch": 297.2631578947368, "grad_norm": 0.6727595925331116, 
"learning_rate": 6.890765012858093e-05, "loss": 0.1902, "step": 14120 }, { "epoch": 297.4736842105263, "grad_norm": 0.5258743166923523, "learning_rate": 6.878061925082137e-05, "loss": 0.1787, "step": 14130 }, { "epoch": 297.6842105263158, "grad_norm": 0.5945499539375305, "learning_rate": 6.86536441671572e-05, "loss": 0.2014, "step": 14140 }, { "epoch": 297.89473684210526, "grad_norm": 0.49428433179855347, "learning_rate": 6.852672510451346e-05, "loss": 0.2045, "step": 14150 }, { "epoch": 298.10526315789474, "grad_norm": 0.5705098509788513, "learning_rate": 6.839986228971512e-05, "loss": 0.1776, "step": 14160 }, { "epoch": 298.3157894736842, "grad_norm": 0.5622084140777588, "learning_rate": 6.827305594948658e-05, "loss": 0.1858, "step": 14170 }, { "epoch": 298.5263157894737, "grad_norm": 0.4996286630630493, "learning_rate": 6.814630631045136e-05, "loss": 0.1949, "step": 14180 }, { "epoch": 298.7368421052632, "grad_norm": 0.7660753130912781, "learning_rate": 6.801961359913156e-05, "loss": 0.1768, "step": 14190 }, { "epoch": 298.94736842105266, "grad_norm": 0.9577729105949402, "learning_rate": 6.789297804194766e-05, "loss": 0.2199, "step": 14200 }, { "epoch": 299.1578947368421, "grad_norm": 0.5109399557113647, "learning_rate": 6.776639986521792e-05, "loss": 0.1809, "step": 14210 }, { "epoch": 299.36842105263156, "grad_norm": 0.6572682857513428, "learning_rate": 6.7639879295158e-05, "loss": 0.1806, "step": 14220 }, { "epoch": 299.57894736842104, "grad_norm": 0.45336204767227173, "learning_rate": 6.751341655788077e-05, "loss": 0.1925, "step": 14230 }, { "epoch": 299.7894736842105, "grad_norm": 0.5275882482528687, "learning_rate": 6.73870118793956e-05, "loss": 0.1968, "step": 14240 }, { "epoch": 300.0, "grad_norm": 0.628964364528656, "learning_rate": 6.726066548560817e-05, "loss": 0.1994, "step": 14250 }, { "epoch": 300.2105263157895, "grad_norm": 0.6299924254417419, "learning_rate": 6.71343776023199e-05, "loss": 0.1782, "step": 14260 }, { "epoch": 300.42105263157896, 
"grad_norm": 0.698186457157135, "learning_rate": 6.700814845522779e-05, "loss": 0.1941, "step": 14270 }, { "epoch": 300.63157894736844, "grad_norm": 1.0937559604644775, "learning_rate": 6.688197826992375e-05, "loss": 0.2051, "step": 14280 }, { "epoch": 300.8421052631579, "grad_norm": 0.5907070636749268, "learning_rate": 6.675586727189436e-05, "loss": 0.1948, "step": 14290 }, { "epoch": 301.05263157894734, "grad_norm": 0.44216281175613403, "learning_rate": 6.662981568652049e-05, "loss": 0.1878, "step": 14300 }, { "epoch": 301.2631578947368, "grad_norm": 0.6885428428649902, "learning_rate": 6.650382373907672e-05, "loss": 0.1857, "step": 14310 }, { "epoch": 301.4736842105263, "grad_norm": 0.5614356398582458, "learning_rate": 6.637789165473101e-05, "loss": 0.1982, "step": 14320 }, { "epoch": 301.6842105263158, "grad_norm": 0.5463254451751709, "learning_rate": 6.625201965854453e-05, "loss": 0.1884, "step": 14330 }, { "epoch": 301.89473684210526, "grad_norm": 0.6541967988014221, "learning_rate": 6.612620797547087e-05, "loss": 0.1901, "step": 14340 }, { "epoch": 302.10526315789474, "grad_norm": 0.5899919271469116, "learning_rate": 6.600045683035597e-05, "loss": 0.1976, "step": 14350 }, { "epoch": 302.3157894736842, "grad_norm": 0.5349833369255066, "learning_rate": 6.587476644793742e-05, "loss": 0.1729, "step": 14360 }, { "epoch": 302.5263157894737, "grad_norm": 0.8576452732086182, "learning_rate": 6.574913705284443e-05, "loss": 0.2013, "step": 14370 }, { "epoch": 302.7368421052632, "grad_norm": 0.5418473482131958, "learning_rate": 6.562356886959704e-05, "loss": 0.199, "step": 14380 }, { "epoch": 302.94736842105266, "grad_norm": 0.6117028594017029, "learning_rate": 6.54980621226059e-05, "loss": 0.1878, "step": 14390 }, { "epoch": 303.1578947368421, "grad_norm": 0.6464748978614807, "learning_rate": 6.537261703617202e-05, "loss": 0.1913, "step": 14400 }, { "epoch": 303.36842105263156, "grad_norm": 0.4876056909561157, "learning_rate": 6.524723383448607e-05, "loss": 0.1849, 
"step": 14410 }, { "epoch": 303.57894736842104, "grad_norm": 0.5367958545684814, "learning_rate": 6.512191274162816e-05, "loss": 0.1915, "step": 14420 }, { "epoch": 303.7894736842105, "grad_norm": 0.5246376991271973, "learning_rate": 6.499665398156733e-05, "loss": 0.1854, "step": 14430 }, { "epoch": 304.0, "grad_norm": 0.49785304069519043, "learning_rate": 6.487145777816143e-05, "loss": 0.1839, "step": 14440 }, { "epoch": 304.2105263157895, "grad_norm": 0.5250376462936401, "learning_rate": 6.474632435515627e-05, "loss": 0.1878, "step": 14450 }, { "epoch": 304.42105263157896, "grad_norm": 0.48043861985206604, "learning_rate": 6.462125393618561e-05, "loss": 0.1883, "step": 14460 }, { "epoch": 304.63157894736844, "grad_norm": 0.47180649638175964, "learning_rate": 6.449624674477054e-05, "loss": 0.196, "step": 14470 }, { "epoch": 304.8421052631579, "grad_norm": 0.5821836590766907, "learning_rate": 6.437130300431924e-05, "loss": 0.1811, "step": 14480 }, { "epoch": 305.05263157894734, "grad_norm": 0.49136173725128174, "learning_rate": 6.424642293812636e-05, "loss": 0.193, "step": 14490 }, { "epoch": 305.2631578947368, "grad_norm": 0.45949774980545044, "learning_rate": 6.412160676937288e-05, "loss": 0.1872, "step": 14500 }, { "epoch": 305.4736842105263, "grad_norm": 0.7283862829208374, "learning_rate": 6.399685472112552e-05, "loss": 0.1949, "step": 14510 }, { "epoch": 305.6842105263158, "grad_norm": 0.49841785430908203, "learning_rate": 6.387216701633638e-05, "loss": 0.1821, "step": 14520 }, { "epoch": 305.89473684210526, "grad_norm": 0.47515934705734253, "learning_rate": 6.374754387784262e-05, "loss": 0.196, "step": 14530 }, { "epoch": 306.10526315789474, "grad_norm": 0.42606112360954285, "learning_rate": 6.362298552836605e-05, "loss": 0.1819, "step": 14540 }, { "epoch": 306.3157894736842, "grad_norm": 0.5149482488632202, "learning_rate": 6.34984921905126e-05, "loss": 0.1825, "step": 14550 }, { "epoch": 306.5263157894737, "grad_norm": 0.5274513363838196, "learning_rate": 
6.3374064086772e-05, "loss": 0.1868, "step": 14560 }, { "epoch": 306.7368421052632, "grad_norm": 0.5605398416519165, "learning_rate": 6.324970143951753e-05, "loss": 0.1859, "step": 14570 }, { "epoch": 306.94736842105266, "grad_norm": 0.6485209465026855, "learning_rate": 6.312540447100534e-05, "loss": 0.1988, "step": 14580 }, { "epoch": 307.1578947368421, "grad_norm": 0.4830923080444336, "learning_rate": 6.300117340337433e-05, "loss": 0.1932, "step": 14590 }, { "epoch": 307.36842105263156, "grad_norm": 0.6907581090927124, "learning_rate": 6.287700845864549e-05, "loss": 0.1904, "step": 14600 }, { "epoch": 307.57894736842104, "grad_norm": 0.5328361988067627, "learning_rate": 6.275290985872177e-05, "loss": 0.1906, "step": 14610 }, { "epoch": 307.7894736842105, "grad_norm": 0.5460513234138489, "learning_rate": 6.262887782538746e-05, "loss": 0.1901, "step": 14620 }, { "epoch": 308.0, "grad_norm": 0.5381531119346619, "learning_rate": 6.250491258030791e-05, "loss": 0.1927, "step": 14630 }, { "epoch": 308.2105263157895, "grad_norm": 0.6306193470954895, "learning_rate": 6.23810143450291e-05, "loss": 0.2011, "step": 14640 }, { "epoch": 308.42105263157896, "grad_norm": 0.5382252931594849, "learning_rate": 6.225718334097733e-05, "loss": 0.1901, "step": 14650 }, { "epoch": 308.63157894736844, "grad_norm": 0.5241566300392151, "learning_rate": 6.213341978945859e-05, "loss": 0.171, "step": 14660 }, { "epoch": 308.8421052631579, "grad_norm": 0.8374577760696411, "learning_rate": 6.200972391165852e-05, "loss": 0.2042, "step": 14670 }, { "epoch": 309.05263157894734, "grad_norm": 0.516899824142456, "learning_rate": 6.188609592864163e-05, "loss": 0.1779, "step": 14680 }, { "epoch": 309.2631578947368, "grad_norm": 0.5281073451042175, "learning_rate": 6.176253606135119e-05, "loss": 0.1813, "step": 14690 }, { "epoch": 309.4736842105263, "grad_norm": 0.5887854099273682, "learning_rate": 6.163904453060869e-05, "loss": 0.1897, "step": 14700 }, { "epoch": 309.6842105263158, "grad_norm": 
0.5451268553733826, "learning_rate": 6.15156215571136e-05, "loss": 0.1968, "step": 14710 }, { "epoch": 309.89473684210526, "grad_norm": 0.671812891960144, "learning_rate": 6.139226736144273e-05, "loss": 0.2004, "step": 14720 }, { "epoch": 310.10526315789474, "grad_norm": 0.5144766569137573, "learning_rate": 6.126898216405e-05, "loss": 0.1946, "step": 14730 }, { "epoch": 310.3157894736842, "grad_norm": 0.5516157150268555, "learning_rate": 6.114576618526611e-05, "loss": 0.1764, "step": 14740 }, { "epoch": 310.5263157894737, "grad_norm": 0.5606300234794617, "learning_rate": 6.102261964529796e-05, "loss": 0.1935, "step": 14750 }, { "epoch": 310.7368421052632, "grad_norm": 0.646920382976532, "learning_rate": 6.08995427642284e-05, "loss": 0.1944, "step": 14760 }, { "epoch": 310.94736842105266, "grad_norm": 0.529812753200531, "learning_rate": 6.077653576201572e-05, "loss": 0.1858, "step": 14770 }, { "epoch": 311.1578947368421, "grad_norm": 0.42989394068717957, "learning_rate": 6.065359885849345e-05, "loss": 0.1761, "step": 14780 }, { "epoch": 311.36842105263156, "grad_norm": 0.5556666851043701, "learning_rate": 6.053073227336975e-05, "loss": 0.1884, "step": 14790 }, { "epoch": 311.57894736842104, "grad_norm": 0.48674774169921875, "learning_rate": 6.040793622622707e-05, "loss": 0.2071, "step": 14800 }, { "epoch": 311.7894736842105, "grad_norm": 0.48857688903808594, "learning_rate": 6.0285210936521955e-05, "loss": 0.1836, "step": 14810 }, { "epoch": 312.0, "grad_norm": 0.737316906452179, "learning_rate": 6.016255662358432e-05, "loss": 0.1895, "step": 14820 }, { "epoch": 312.2105263157895, "grad_norm": 0.49834001064300537, "learning_rate": 6.003997350661732e-05, "loss": 0.1996, "step": 14830 }, { "epoch": 312.42105263157896, "grad_norm": 0.5362703204154968, "learning_rate": 5.991746180469691e-05, "loss": 0.1813, "step": 14840 }, { "epoch": 312.63157894736844, "grad_norm": 0.6358118057250977, "learning_rate": 5.979502173677134e-05, "loss": 0.1995, "step": 14850 }, { "epoch": 
312.8421052631579, "grad_norm": 0.5340322256088257, "learning_rate": 5.9672653521660826e-05, "loss": 0.1778, "step": 14860 }, { "epoch": 313.05263157894734, "grad_norm": 0.44907963275909424, "learning_rate": 5.955035737805725e-05, "loss": 0.1904, "step": 14870 }, { "epoch": 313.2631578947368, "grad_norm": 0.5180246233940125, "learning_rate": 5.9428133524523646e-05, "loss": 0.1911, "step": 14880 }, { "epoch": 313.4736842105263, "grad_norm": 0.525974452495575, "learning_rate": 5.930598217949386e-05, "loss": 0.1759, "step": 14890 }, { "epoch": 313.6842105263158, "grad_norm": 0.5962675213813782, "learning_rate": 5.91839035612721e-05, "loss": 0.1956, "step": 14900 }, { "epoch": 313.89473684210526, "grad_norm": 0.5130165815353394, "learning_rate": 5.9061897888032747e-05, "loss": 0.1916, "step": 14910 }, { "epoch": 314.10526315789474, "grad_norm": 0.5901978611946106, "learning_rate": 5.893996537781966e-05, "loss": 0.1812, "step": 14920 }, { "epoch": 314.3157894736842, "grad_norm": 0.48012682795524597, "learning_rate": 5.8818106248546004e-05, "loss": 0.1936, "step": 14930 }, { "epoch": 314.5263157894737, "grad_norm": 0.5353314876556396, "learning_rate": 5.8696320717993784e-05, "loss": 0.1792, "step": 14940 }, { "epoch": 314.7368421052632, "grad_norm": 0.6114587187767029, "learning_rate": 5.857460900381355e-05, "loss": 0.1873, "step": 14950 }, { "epoch": 314.94736842105266, "grad_norm": 0.654788076877594, "learning_rate": 5.845297132352385e-05, "loss": 0.201, "step": 14960 }, { "epoch": 315.1578947368421, "grad_norm": 0.6699498295783997, "learning_rate": 5.833140789451086e-05, "loss": 0.1781, "step": 14970 }, { "epoch": 315.36842105263156, "grad_norm": 0.573408305644989, "learning_rate": 5.8209918934028275e-05, "loss": 0.1975, "step": 14980 }, { "epoch": 315.57894736842104, "grad_norm": 0.6000211238861084, "learning_rate": 5.808850465919649e-05, "loss": 0.1784, "step": 14990 }, { "epoch": 315.7894736842105, "grad_norm": 0.6924741864204407, "learning_rate": 
5.7967165287002464e-05, "loss": 0.1823, "step": 15000 }, { "epoch": 316.0, "grad_norm": 0.8868710398674011, "learning_rate": 5.7845901034299424e-05, "loss": 0.2024, "step": 15010 }, { "epoch": 316.2105263157895, "grad_norm": 0.49164116382598877, "learning_rate": 5.772471211780619e-05, "loss": 0.1755, "step": 15020 }, { "epoch": 316.42105263157896, "grad_norm": 0.6795914173126221, "learning_rate": 5.760359875410702e-05, "loss": 0.1937, "step": 15030 }, { "epoch": 316.63157894736844, "grad_norm": 0.6305373311042786, "learning_rate": 5.748256115965109e-05, "loss": 0.1841, "step": 15040 }, { "epoch": 316.8421052631579, "grad_norm": 0.6585614085197449, "learning_rate": 5.73615995507523e-05, "loss": 0.2104, "step": 15050 }, { "epoch": 317.05263157894734, "grad_norm": 0.6081206798553467, "learning_rate": 5.724071414358858e-05, "loss": 0.1857, "step": 15060 }, { "epoch": 317.2631578947368, "grad_norm": 0.4927370846271515, "learning_rate": 5.711990515420176e-05, "loss": 0.1822, "step": 15070 }, { "epoch": 317.4736842105263, "grad_norm": 0.5095883011817932, "learning_rate": 5.699917279849714e-05, "loss": 0.18, "step": 15080 }, { "epoch": 317.6842105263158, "grad_norm": 0.7175976037979126, "learning_rate": 5.6878517292242936e-05, "loss": 0.1883, "step": 15090 }, { "epoch": 317.89473684210526, "grad_norm": 0.5586292147636414, "learning_rate": 5.675793885107019e-05, "loss": 0.1906, "step": 15100 }, { "epoch": 318.10526315789474, "grad_norm": 0.8061725497245789, "learning_rate": 5.663743769047206e-05, "loss": 0.1899, "step": 15110 }, { "epoch": 318.3157894736842, "grad_norm": 0.6274729371070862, "learning_rate": 5.651701402580371e-05, "loss": 0.1747, "step": 15120 }, { "epoch": 318.5263157894737, "grad_norm": 0.4897122383117676, "learning_rate": 5.639666807228175e-05, "loss": 0.1936, "step": 15130 }, { "epoch": 318.7368421052632, "grad_norm": 0.5890952348709106, "learning_rate": 5.627640004498385e-05, "loss": 0.1928, "step": 15140 }, { "epoch": 318.94736842105266, "grad_norm": 
0.5626781582832336, "learning_rate": 5.6156210158848544e-05, "loss": 0.1967, "step": 15150 }, { "epoch": 319.1578947368421, "grad_norm": 0.5838659405708313, "learning_rate": 5.603609862867463e-05, "loss": 0.1872, "step": 15160 }, { "epoch": 319.36842105263156, "grad_norm": 0.4655553698539734, "learning_rate": 5.591606566912082e-05, "loss": 0.1894, "step": 15170 }, { "epoch": 319.57894736842104, "grad_norm": 0.5683347582817078, "learning_rate": 5.5796111494705584e-05, "loss": 0.1836, "step": 15180 }, { "epoch": 319.7894736842105, "grad_norm": 0.5712106823921204, "learning_rate": 5.567623631980644e-05, "loss": 0.1925, "step": 15190 }, { "epoch": 320.0, "grad_norm": 0.5311657190322876, "learning_rate": 5.55564403586597e-05, "loss": 0.1979, "step": 15200 }, { "epoch": 320.2105263157895, "grad_norm": 0.4528977572917938, "learning_rate": 5.543672382536023e-05, "loss": 0.1969, "step": 15210 }, { "epoch": 320.42105263157896, "grad_norm": 0.5928493738174438, "learning_rate": 5.5317086933860907e-05, "loss": 0.1889, "step": 15220 }, { "epoch": 320.63157894736844, "grad_norm": 0.4887818396091461, "learning_rate": 5.519752989797224e-05, "loss": 0.1806, "step": 15230 }, { "epoch": 320.8421052631579, "grad_norm": 0.5781086683273315, "learning_rate": 5.507805293136198e-05, "loss": 0.1813, "step": 15240 }, { "epoch": 321.05263157894734, "grad_norm": 0.44095057249069214, "learning_rate": 5.495865624755492e-05, "loss": 0.1859, "step": 15250 }, { "epoch": 321.2631578947368, "grad_norm": 0.6125297546386719, "learning_rate": 5.4839340059932255e-05, "loss": 0.1851, "step": 15260 }, { "epoch": 321.4736842105263, "grad_norm": 0.7112295627593994, "learning_rate": 5.472010458173132e-05, "loss": 0.1968, "step": 15270 }, { "epoch": 321.6842105263158, "grad_norm": 0.5584365129470825, "learning_rate": 5.4600950026045326e-05, "loss": 0.1845, "step": 15280 }, { "epoch": 321.89473684210526, "grad_norm": 0.5321446061134338, "learning_rate": 5.448187660582276e-05, "loss": 0.1951, "step": 15290 }, { 
"epoch": 322.10526315789474, "grad_norm": 0.5082312226295471, "learning_rate": 5.436288453386709e-05, "loss": 0.183, "step": 15300 }, { "epoch": 322.3157894736842, "grad_norm": 0.47308409214019775, "learning_rate": 5.424397402283644e-05, "loss": 0.1886, "step": 15310 }, { "epoch": 322.5263157894737, "grad_norm": 0.6271807551383972, "learning_rate": 5.4125145285243194e-05, "loss": 0.1929, "step": 15320 }, { "epoch": 322.7368421052632, "grad_norm": 0.677312433719635, "learning_rate": 5.400639853345364e-05, "loss": 0.1929, "step": 15330 }, { "epoch": 322.94736842105266, "grad_norm": 0.3949933648109436, "learning_rate": 5.388773397968736e-05, "loss": 0.1827, "step": 15340 }, { "epoch": 323.1578947368421, "grad_norm": 0.5227069854736328, "learning_rate": 5.376915183601725e-05, "loss": 0.1821, "step": 15350 }, { "epoch": 323.36842105263156, "grad_norm": 0.4687119722366333, "learning_rate": 5.36506523143688e-05, "loss": 0.1883, "step": 15360 }, { "epoch": 323.57894736842104, "grad_norm": 0.6543534398078918, "learning_rate": 5.353223562651986e-05, "loss": 0.1898, "step": 15370 }, { "epoch": 323.7894736842105, "grad_norm": 0.6527659893035889, "learning_rate": 5.341390198410019e-05, "loss": 0.1915, "step": 15380 }, { "epoch": 324.0, "grad_norm": 0.5707231760025024, "learning_rate": 5.329565159859131e-05, "loss": 0.1917, "step": 15390 }, { "epoch": 324.2105263157895, "grad_norm": 0.6318780183792114, "learning_rate": 5.317748468132577e-05, "loss": 0.2033, "step": 15400 }, { "epoch": 324.42105263157896, "grad_norm": 0.5352064967155457, "learning_rate": 5.305940144348698e-05, "loss": 0.1865, "step": 15410 }, { "epoch": 324.63157894736844, "grad_norm": 0.4935546815395355, "learning_rate": 5.2941402096108905e-05, "loss": 0.178, "step": 15420 }, { "epoch": 324.8421052631579, "grad_norm": 0.5082517266273499, "learning_rate": 5.282348685007543e-05, "loss": 0.1892, "step": 15430 }, { "epoch": 325.05263157894734, "grad_norm": 0.5046442747116089, "learning_rate": 5.2705655916120325e-05, 
"loss": 0.1926, "step": 15440 }, { "epoch": 325.2631578947368, "grad_norm": 0.43995267152786255, "learning_rate": 5.258790950482646e-05, "loss": 0.1827, "step": 15450 }, { "epoch": 325.4736842105263, "grad_norm": 0.5442992448806763, "learning_rate": 5.247024782662586e-05, "loss": 0.1884, "step": 15460 }, { "epoch": 325.6842105263158, "grad_norm": 0.7053566575050354, "learning_rate": 5.2352671091798997e-05, "loss": 0.1953, "step": 15470 }, { "epoch": 325.89473684210526, "grad_norm": 0.5656514167785645, "learning_rate": 5.223517951047449e-05, "loss": 0.1848, "step": 15480 }, { "epoch": 326.10526315789474, "grad_norm": 0.5459962487220764, "learning_rate": 5.2117773292628935e-05, "loss": 0.2066, "step": 15490 }, { "epoch": 326.3157894736842, "grad_norm": 0.5724794268608093, "learning_rate": 5.200045264808624e-05, "loss": 0.1913, "step": 15500 }, { "epoch": 326.5263157894737, "grad_norm": 0.5558455586433411, "learning_rate": 5.188321778651739e-05, "loss": 0.185, "step": 15510 }, { "epoch": 326.7368421052632, "grad_norm": 0.6177617311477661, "learning_rate": 5.176606891744017e-05, "loss": 0.1826, "step": 15520 }, { "epoch": 326.94736842105266, "grad_norm": 0.5286545157432556, "learning_rate": 5.164900625021856e-05, "loss": 0.1786, "step": 15530 }, { "epoch": 327.1578947368421, "grad_norm": 0.6205462217330933, "learning_rate": 5.153202999406251e-05, "loss": 0.1972, "step": 15540 }, { "epoch": 327.36842105263156, "grad_norm": 0.5253833532333374, "learning_rate": 5.141514035802755e-05, "loss": 0.1931, "step": 15550 }, { "epoch": 327.57894736842104, "grad_norm": 0.5076433420181274, "learning_rate": 5.129833755101442e-05, "loss": 0.1911, "step": 15560 }, { "epoch": 327.7894736842105, "grad_norm": 0.466351717710495, "learning_rate": 5.118162178176873e-05, "loss": 0.1876, "step": 15570 }, { "epoch": 328.0, "grad_norm": 0.636641800403595, "learning_rate": 5.106499325888041e-05, "loss": 0.1804, "step": 15580 }, { "epoch": 328.2105263157895, "grad_norm": 0.6042010188102722, 
"learning_rate": 5.094845219078361e-05, "loss": 0.1795, "step": 15590 }, { "epoch": 328.42105263157896, "grad_norm": 0.8110350966453552, "learning_rate": 5.083199878575609e-05, "loss": 0.196, "step": 15600 }, { "epoch": 328.63157894736844, "grad_norm": 0.5079600811004639, "learning_rate": 5.071563325191889e-05, "loss": 0.1775, "step": 15610 }, { "epoch": 328.8421052631579, "grad_norm": 0.6384349465370178, "learning_rate": 5.0599355797236205e-05, "loss": 0.2021, "step": 15620 }, { "epoch": 329.05263157894734, "grad_norm": 0.46881303191185, "learning_rate": 5.0483166629514654e-05, "loss": 0.1925, "step": 15630 }, { "epoch": 329.2631578947368, "grad_norm": 0.7932872176170349, "learning_rate": 5.03670659564031e-05, "loss": 0.1959, "step": 15640 }, { "epoch": 329.4736842105263, "grad_norm": 0.5118970274925232, "learning_rate": 5.025105398539227e-05, "loss": 0.1719, "step": 15650 }, { "epoch": 329.6842105263158, "grad_norm": 0.5387006402015686, "learning_rate": 5.0135130923814386e-05, "loss": 0.2077, "step": 15660 }, { "epoch": 329.89473684210526, "grad_norm": 0.6646759510040283, "learning_rate": 5.001929697884273e-05, "loss": 0.1943, "step": 15670 }, { "epoch": 330.10526315789474, "grad_norm": 0.7001065611839294, "learning_rate": 4.9903552357491404e-05, "loss": 0.1868, "step": 15680 }, { "epoch": 330.3157894736842, "grad_norm": 0.5651974081993103, "learning_rate": 4.978789726661472e-05, "loss": 0.1791, "step": 15690 }, { "epoch": 330.5263157894737, "grad_norm": 0.6005178093910217, "learning_rate": 4.9672331912907174e-05, "loss": 0.1871, "step": 15700 }, { "epoch": 330.7368421052632, "grad_norm": 0.6612521409988403, "learning_rate": 4.9556856502902745e-05, "loss": 0.1931, "step": 15710 }, { "epoch": 330.94736842105266, "grad_norm": 0.5340935587882996, "learning_rate": 4.944147124297468e-05, "loss": 0.19, "step": 15720 }, { "epoch": 331.1578947368421, "grad_norm": 0.6876013278961182, "learning_rate": 4.9326176339335225e-05, "loss": 0.1803, "step": 15730 }, { "epoch": 
331.36842105263156, "grad_norm": 0.7157713770866394, "learning_rate": 4.921097199803503e-05, "loss": 0.1884, "step": 15740 }, { "epoch": 331.57894736842104, "grad_norm": 0.5469095706939697, "learning_rate": 4.909585842496287e-05, "loss": 0.1763, "step": 15750 }, { "epoch": 331.7894736842105, "grad_norm": 0.5198671817779541, "learning_rate": 4.8980835825845475e-05, "loss": 0.1859, "step": 15760 }, { "epoch": 332.0, "grad_norm": 0.6175537705421448, "learning_rate": 4.886590440624682e-05, "loss": 0.2021, "step": 15770 }, { "epoch": 332.2105263157895, "grad_norm": 0.7941794991493225, "learning_rate": 4.875106437156795e-05, "loss": 0.1942, "step": 15780 }, { "epoch": 332.42105263157896, "grad_norm": 0.5476126074790955, "learning_rate": 4.863631592704673e-05, "loss": 0.181, "step": 15790 }, { "epoch": 332.63157894736844, "grad_norm": 0.6549970507621765, "learning_rate": 4.852165927775713e-05, "loss": 0.1829, "step": 15800 }, { "epoch": 332.8421052631579, "grad_norm": 0.48638707399368286, "learning_rate": 4.840709462860925e-05, "loss": 0.1839, "step": 15810 }, { "epoch": 333.05263157894734, "grad_norm": 0.42391976714134216, "learning_rate": 4.8292622184348636e-05, "loss": 0.1905, "step": 15820 }, { "epoch": 333.2631578947368, "grad_norm": 0.5468048453330994, "learning_rate": 4.8178242149556176e-05, "loss": 0.1918, "step": 15830 }, { "epoch": 333.4736842105263, "grad_norm": 0.8398631811141968, "learning_rate": 4.806395472864749e-05, "loss": 0.179, "step": 15840 }, { "epoch": 333.6842105263158, "grad_norm": 0.6367696523666382, "learning_rate": 4.79497601258727e-05, "loss": 0.1901, "step": 15850 }, { "epoch": 333.89473684210526, "grad_norm": 0.5233663320541382, "learning_rate": 4.783565854531615e-05, "loss": 0.1903, "step": 15860 }, { "epoch": 334.10526315789474, "grad_norm": 0.5648738741874695, "learning_rate": 4.7721650190895826e-05, "loss": 0.1979, "step": 15870 }, { "epoch": 334.3157894736842, "grad_norm": 0.5198102593421936, "learning_rate": 4.760773526636315e-05, 
"loss": 0.1795, "step": 15880 }, { "epoch": 334.5263157894737, "grad_norm": 0.6149677038192749, "learning_rate": 4.7493913975302526e-05, "loss": 0.1966, "step": 15890 }, { "epoch": 334.7368421052632, "grad_norm": 0.49036896228790283, "learning_rate": 4.738018652113113e-05, "loss": 0.1873, "step": 15900 }, { "epoch": 334.94736842105266, "grad_norm": 0.5529518127441406, "learning_rate": 4.7266553107098274e-05, "loss": 0.192, "step": 15910 }, { "epoch": 335.1578947368421, "grad_norm": 0.6180853843688965, "learning_rate": 4.715301393628534e-05, "loss": 0.185, "step": 15920 }, { "epoch": 335.36842105263156, "grad_norm": 0.5150652527809143, "learning_rate": 4.703956921160528e-05, "loss": 0.1899, "step": 15930 }, { "epoch": 335.57894736842104, "grad_norm": 0.7054027318954468, "learning_rate": 4.6926219135802173e-05, "loss": 0.1837, "step": 15940 }, { "epoch": 335.7894736842105, "grad_norm": 0.5216421484947205, "learning_rate": 4.6812963911450934e-05, "loss": 0.1855, "step": 15950 }, { "epoch": 336.0, "grad_norm": 0.586047887802124, "learning_rate": 4.669980374095709e-05, "loss": 0.1972, "step": 15960 }, { "epoch": 336.2105263157895, "grad_norm": 0.6370906233787537, "learning_rate": 4.65867388265562e-05, "loss": 0.1911, "step": 15970 }, { "epoch": 336.42105263157896, "grad_norm": 0.5958567261695862, "learning_rate": 4.647376937031356e-05, "loss": 0.1889, "step": 15980 }, { "epoch": 336.63157894736844, "grad_norm": 0.45742881298065186, "learning_rate": 4.63608955741239e-05, "loss": 0.1784, "step": 15990 }, { "epoch": 336.8421052631579, "grad_norm": 0.7190865278244019, "learning_rate": 4.6248117639711044e-05, "loss": 0.1832, "step": 16000 }, { "epoch": 337.05263157894734, "grad_norm": 0.4909994900226593, "learning_rate": 4.613543576862743e-05, "loss": 0.2005, "step": 16010 }, { "epoch": 337.2631578947368, "grad_norm": 0.5645287036895752, "learning_rate": 4.6022850162253795e-05, "loss": 0.1745, "step": 16020 }, { "epoch": 337.4736842105263, "grad_norm": 0.5330631136894226, 
"learning_rate": 4.591036102179893e-05, "loss": 0.1906, "step": 16030 }, { "epoch": 337.6842105263158, "grad_norm": 0.6603984832763672, "learning_rate": 4.579796854829911e-05, "loss": 0.2005, "step": 16040 }, { "epoch": 337.89473684210526, "grad_norm": 0.6532229781150818, "learning_rate": 4.568567294261797e-05, "loss": 0.1868, "step": 16050 }, { "epoch": 338.10526315789474, "grad_norm": 0.6483712792396545, "learning_rate": 4.55734744054459e-05, "loss": 0.1927, "step": 16060 }, { "epoch": 338.3157894736842, "grad_norm": 0.592397153377533, "learning_rate": 4.546137313729996e-05, "loss": 0.1991, "step": 16070 }, { "epoch": 338.5263157894737, "grad_norm": 0.5688551664352417, "learning_rate": 4.534936933852324e-05, "loss": 0.1859, "step": 16080 }, { "epoch": 338.7368421052632, "grad_norm": 0.710352897644043, "learning_rate": 4.523746320928465e-05, "loss": 0.1932, "step": 16090 }, { "epoch": 338.94736842105266, "grad_norm": 0.6369743943214417, "learning_rate": 4.5125654949578674e-05, "loss": 0.1927, "step": 16100 }, { "epoch": 339.1578947368421, "grad_norm": 0.5039718747138977, "learning_rate": 4.5013944759224755e-05, "loss": 0.196, "step": 16110 }, { "epoch": 339.36842105263156, "grad_norm": 0.5557557940483093, "learning_rate": 4.490233283786709e-05, "loss": 0.171, "step": 16120 }, { "epoch": 339.57894736842104, "grad_norm": 0.5374524593353271, "learning_rate": 4.479081938497435e-05, "loss": 0.2013, "step": 16130 }, { "epoch": 339.7894736842105, "grad_norm": 0.48493510484695435, "learning_rate": 4.4679404599839116e-05, "loss": 0.1812, "step": 16140 }, { "epoch": 340.0, "grad_norm": 0.6007246971130371, "learning_rate": 4.456808868157762e-05, "loss": 0.1925, "step": 16150 }, { "epoch": 340.2105263157895, "grad_norm": 0.49306368827819824, "learning_rate": 4.445687182912953e-05, "loss": 0.1747, "step": 16160 }, { "epoch": 340.42105263157896, "grad_norm": 0.5362128615379333, "learning_rate": 4.434575424125741e-05, "loss": 0.1861, "step": 16170 }, { "epoch": 
340.63157894736844, "grad_norm": 0.6607531309127808, "learning_rate": 4.4234736116546364e-05, "loss": 0.1833, "step": 16180 }, { "epoch": 340.8421052631579, "grad_norm": 0.4933454692363739, "learning_rate": 4.4123817653403756e-05, "loss": 0.1889, "step": 16190 }, { "epoch": 341.05263157894734, "grad_norm": 0.6923052668571472, "learning_rate": 4.401299905005893e-05, "loss": 0.1935, "step": 16200 }, { "epoch": 341.2631578947368, "grad_norm": 0.4283411502838135, "learning_rate": 4.390228050456267e-05, "loss": 0.1871, "step": 16210 }, { "epoch": 341.4736842105263, "grad_norm": 0.5722445249557495, "learning_rate": 4.379166221478697e-05, "loss": 0.1831, "step": 16220 }, { "epoch": 341.6842105263158, "grad_norm": 0.5370818972587585, "learning_rate": 4.368114437842461e-05, "loss": 0.1974, "step": 16230 }, { "epoch": 341.89473684210526, "grad_norm": 0.7643104195594788, "learning_rate": 4.357072719298895e-05, "loss": 0.1863, "step": 16240 }, { "epoch": 342.10526315789474, "grad_norm": 0.6327834129333496, "learning_rate": 4.3460410855813374e-05, "loss": 0.1774, "step": 16250 }, { "epoch": 342.3157894736842, "grad_norm": 0.5486928820610046, "learning_rate": 4.3350195564051013e-05, "loss": 0.1784, "step": 16260 }, { "epoch": 342.5263157894737, "grad_norm": 0.4978579580783844, "learning_rate": 4.3240081514674526e-05, "loss": 0.1834, "step": 16270 }, { "epoch": 342.7368421052632, "grad_norm": 0.6283280253410339, "learning_rate": 4.3130068904475586e-05, "loss": 0.1831, "step": 16280 }, { "epoch": 342.94736842105266, "grad_norm": 0.7025750875473022, "learning_rate": 4.302015793006451e-05, "loss": 0.2037, "step": 16290 }, { "epoch": 343.1578947368421, "grad_norm": 0.5586254000663757, "learning_rate": 4.2910348787870094e-05, "loss": 0.2055, "step": 16300 }, { "epoch": 343.36842105263156, "grad_norm": 0.5136498808860779, "learning_rate": 4.280064167413904e-05, "loss": 0.1858, "step": 16310 }, { "epoch": 343.57894736842104, "grad_norm": 0.6225966811180115, "learning_rate": 
4.2691036784935756e-05, "loss": 0.1814, "step": 16320 }, { "epoch": 343.7894736842105, "grad_norm": 0.542299747467041, "learning_rate": 4.258153431614193e-05, "loss": 0.1869, "step": 16330 }, { "epoch": 344.0, "grad_norm": 0.7111690044403076, "learning_rate": 4.247213446345626e-05, "loss": 0.1881, "step": 16340 }, { "epoch": 344.2105263157895, "grad_norm": 0.4649767577648163, "learning_rate": 4.236283742239401e-05, "loss": 0.1838, "step": 16350 }, { "epoch": 344.42105263157896, "grad_norm": 0.5829393863677979, "learning_rate": 4.225364338828668e-05, "loss": 0.1988, "step": 16360 }, { "epoch": 344.63157894736844, "grad_norm": 0.6762527227401733, "learning_rate": 4.214455255628178e-05, "loss": 0.1812, "step": 16370 }, { "epoch": 344.8421052631579, "grad_norm": 0.5468589663505554, "learning_rate": 4.2035565121342246e-05, "loss": 0.1892, "step": 16380 }, { "epoch": 345.05263157894734, "grad_norm": 0.4818148612976074, "learning_rate": 4.1926681278246374e-05, "loss": 0.1773, "step": 16390 }, { "epoch": 345.2631578947368, "grad_norm": 0.46287909150123596, "learning_rate": 4.181790122158716e-05, "loss": 0.1774, "step": 16400 }, { "epoch": 345.4736842105263, "grad_norm": 0.7505784630775452, "learning_rate": 4.170922514577228e-05, "loss": 0.2055, "step": 16410 }, { "epoch": 345.6842105263158, "grad_norm": 0.47206437587738037, "learning_rate": 4.160065324502348e-05, "loss": 0.1876, "step": 16420 }, { "epoch": 345.89473684210526, "grad_norm": 0.4681949317455292, "learning_rate": 4.14921857133763e-05, "loss": 0.1824, "step": 16430 }, { "epoch": 346.10526315789474, "grad_norm": 0.6797969937324524, "learning_rate": 4.1383822744679866e-05, "loss": 0.1885, "step": 16440 }, { "epoch": 346.3157894736842, "grad_norm": 0.5295494794845581, "learning_rate": 4.127556453259637e-05, "loss": 0.1743, "step": 16450 }, { "epoch": 346.5263157894737, "grad_norm": 0.6828837394714355, "learning_rate": 4.116741127060073e-05, "loss": 0.1808, "step": 16460 }, { "epoch": 346.7368421052632, "grad_norm": 
0.6107656955718994, "learning_rate": 4.105936315198043e-05, "loss": 0.1928, "step": 16470 }, { "epoch": 346.94736842105266, "grad_norm": 0.5245572924613953, "learning_rate": 4.095142036983497e-05, "loss": 0.2069, "step": 16480 }, { "epoch": 347.1578947368421, "grad_norm": 0.49881359934806824, "learning_rate": 4.0843583117075576e-05, "loss": 0.18, "step": 16490 }, { "epoch": 347.36842105263156, "grad_norm": 0.8077996969223022, "learning_rate": 4.073585158642488e-05, "loss": 0.19, "step": 16500 }, { "epoch": 347.57894736842104, "grad_norm": 0.5267335176467896, "learning_rate": 4.062822597041663e-05, "loss": 0.1931, "step": 16510 }, { "epoch": 347.7894736842105, "grad_norm": 0.672138512134552, "learning_rate": 4.052070646139529e-05, "loss": 0.184, "step": 16520 }, { "epoch": 348.0, "grad_norm": 0.5576406717300415, "learning_rate": 4.0413293251515574e-05, "loss": 0.1909, "step": 16530 }, { "epoch": 348.2105263157895, "grad_norm": 0.6533356308937073, "learning_rate": 4.030598653274238e-05, "loss": 0.1859, "step": 16540 }, { "epoch": 348.42105263157896, "grad_norm": 0.6966098546981812, "learning_rate": 4.019878649685018e-05, "loss": 0.179, "step": 16550 }, { "epoch": 348.63157894736844, "grad_norm": 0.5067752003669739, "learning_rate": 4.009169333542283e-05, "loss": 0.1943, "step": 16560 }, { "epoch": 348.8421052631579, "grad_norm": 0.7860790491104126, "learning_rate": 3.998470723985312e-05, "loss": 0.1957, "step": 16570 }, { "epoch": 349.05263157894734, "grad_norm": 0.6456900835037231, "learning_rate": 3.987782840134263e-05, "loss": 0.1946, "step": 16580 }, { "epoch": 349.2631578947368, "grad_norm": 0.4159180819988251, "learning_rate": 3.977105701090115e-05, "loss": 0.1864, "step": 16590 }, { "epoch": 349.4736842105263, "grad_norm": 0.5213875770568848, "learning_rate": 3.96643932593464e-05, "loss": 0.1913, "step": 16600 }, { "epoch": 349.6842105263158, "grad_norm": 0.6271671652793884, "learning_rate": 3.9557837337303906e-05, "loss": 0.1888, "step": 16610 }, { "epoch": 
349.89473684210526, "grad_norm": 0.7119611501693726, "learning_rate": 3.945138943520628e-05, "loss": 0.1824, "step": 16620 }, { "epoch": 350.10526315789474, "grad_norm": 0.45116692781448364, "learning_rate": 3.934504974329326e-05, "loss": 0.1866, "step": 16630 }, { "epoch": 350.3157894736842, "grad_norm": 0.4684094488620758, "learning_rate": 3.9238818451611056e-05, "loss": 0.1807, "step": 16640 }, { "epoch": 350.5263157894737, "grad_norm": 0.5230817794799805, "learning_rate": 3.913269575001228e-05, "loss": 0.1866, "step": 16650 }, { "epoch": 350.7368421052632, "grad_norm": 0.7680791616439819, "learning_rate": 3.9026681828155366e-05, "loss": 0.1959, "step": 16660 }, { "epoch": 350.94736842105266, "grad_norm": 0.47278884053230286, "learning_rate": 3.892077687550435e-05, "loss": 0.1774, "step": 16670 }, { "epoch": 351.1578947368421, "grad_norm": 0.6973139643669128, "learning_rate": 3.8814981081328615e-05, "loss": 0.1845, "step": 16680 }, { "epoch": 351.36842105263156, "grad_norm": 0.5785807967185974, "learning_rate": 3.8709294634702376e-05, "loss": 0.2071, "step": 16690 }, { "epoch": 351.57894736842104, "grad_norm": 0.5238916277885437, "learning_rate": 3.8603717724504404e-05, "loss": 0.1744, "step": 16700 }, { "epoch": 351.7894736842105, "grad_norm": 0.7316076159477234, "learning_rate": 3.8498250539417835e-05, "loss": 0.1834, "step": 16710 }, { "epoch": 352.0, "grad_norm": 0.5799602270126343, "learning_rate": 3.8392893267929597e-05, "loss": 0.1788, "step": 16720 }, { "epoch": 352.2105263157895, "grad_norm": 0.6016362905502319, "learning_rate": 3.8287646098330166e-05, "loss": 0.1821, "step": 16730 }, { "epoch": 352.42105263157896, "grad_norm": 0.5646636486053467, "learning_rate": 3.818250921871338e-05, "loss": 0.1875, "step": 16740 }, { "epoch": 352.63157894736844, "grad_norm": 0.5801871418952942, "learning_rate": 3.807748281697583e-05, "loss": 0.1926, "step": 16750 }, { "epoch": 352.8421052631579, "grad_norm": 0.4862273037433624, "learning_rate": 
3.797256708081678e-05, "loss": 0.1938, "step": 16760 }, { "epoch": 353.05263157894734, "grad_norm": 0.5285743474960327, "learning_rate": 3.786776219773759e-05, "loss": 0.1796, "step": 16770 }, { "epoch": 353.2631578947368, "grad_norm": 0.5707828998565674, "learning_rate": 3.776306835504166e-05, "loss": 0.187, "step": 16780 }, { "epoch": 353.4736842105263, "grad_norm": 0.5975477695465088, "learning_rate": 3.7658485739833824e-05, "loss": 0.1862, "step": 16790 }, { "epoch": 353.6842105263158, "grad_norm": 0.6158720850944519, "learning_rate": 3.7554014539020134e-05, "loss": 0.1942, "step": 16800 }, { "epoch": 353.89473684210526, "grad_norm": 0.5053333044052124, "learning_rate": 3.7449654939307635e-05, "loss": 0.1784, "step": 16810 }, { "epoch": 354.10526315789474, "grad_norm": 0.5054594874382019, "learning_rate": 3.7345407127203826e-05, "loss": 0.1938, "step": 16820 }, { "epoch": 354.3157894736842, "grad_norm": 0.5698423385620117, "learning_rate": 3.724127128901644e-05, "loss": 0.183, "step": 16830 }, { "epoch": 354.5263157894737, "grad_norm": 0.5378226637840271, "learning_rate": 3.713724761085308e-05, "loss": 0.182, "step": 16840 }, { "epoch": 354.7368421052632, "grad_norm": 0.6348969340324402, "learning_rate": 3.703333627862099e-05, "loss": 0.1928, "step": 16850 }, { "epoch": 354.94736842105266, "grad_norm": 0.6424091458320618, "learning_rate": 3.692953747802649e-05, "loss": 0.1792, "step": 16860 }, { "epoch": 355.1578947368421, "grad_norm": 0.5656790733337402, "learning_rate": 3.683621492536592e-05, "loss": 0.1836, "step": 16870 }, { "epoch": 355.36842105263156, "grad_norm": 0.5092337727546692, "learning_rate": 3.6732630445783543e-05, "loss": 0.1819, "step": 16880 }, { "epoch": 355.57894736842104, "grad_norm": 0.5355526804924011, "learning_rate": 3.662915903524888e-05, "loss": 0.2005, "step": 16890 }, { "epoch": 355.7894736842105, "grad_norm": 0.5522583723068237, "learning_rate": 3.6525800878682084e-05, "loss": 0.176, "step": 16900 }, { "epoch": 356.0, "grad_norm": 
0.5521705150604248, "learning_rate": 3.642255616080101e-05, "loss": 0.2027, "step": 16910 }, { "epoch": 356.2105263157895, "grad_norm": 0.6535980701446533, "learning_rate": 3.631942506612064e-05, "loss": 0.176, "step": 16920 }, { "epoch": 356.42105263157896, "grad_norm": 0.7658635973930359, "learning_rate": 3.6216407778953033e-05, "loss": 0.1811, "step": 16930 }, { "epoch": 356.63157894736844, "grad_norm": 0.6077464818954468, "learning_rate": 3.61135044834067e-05, "loss": 0.1914, "step": 16940 }, { "epoch": 356.8421052631579, "grad_norm": 0.5396264791488647, "learning_rate": 3.601071536338661e-05, "loss": 0.1869, "step": 16950 }, { "epoch": 357.05263157894734, "grad_norm": 0.5491655468940735, "learning_rate": 3.590804060259354e-05, "loss": 0.189, "step": 16960 }, { "epoch": 357.2631578947368, "grad_norm": 0.562882125377655, "learning_rate": 3.5805480384523895e-05, "loss": 0.1792, "step": 16970 }, { "epoch": 357.4736842105263, "grad_norm": 0.6570281982421875, "learning_rate": 3.570303489246949e-05, "loss": 0.1885, "step": 16980 }, { "epoch": 357.6842105263158, "grad_norm": 0.5322051048278809, "learning_rate": 3.5600704309516997e-05, "loss": 0.1805, "step": 16990 }, { "epoch": 357.89473684210526, "grad_norm": 0.6352676153182983, "learning_rate": 3.549848881854772e-05, "loss": 0.1897, "step": 17000 }, { "epoch": 358.10526315789474, "grad_norm": 0.6159738302230835, "learning_rate": 3.539638860223738e-05, "loss": 0.1955, "step": 17010 }, { "epoch": 358.3157894736842, "grad_norm": 0.5578885078430176, "learning_rate": 3.52944038430556e-05, "loss": 0.1817, "step": 17020 }, { "epoch": 358.5263157894737, "grad_norm": 0.5949670672416687, "learning_rate": 3.519253472326562e-05, "loss": 0.195, "step": 17030 }, { "epoch": 358.7368421052632, "grad_norm": 0.4901580512523651, "learning_rate": 3.509078142492418e-05, "loss": 0.187, "step": 17040 }, { "epoch": 358.94736842105266, "grad_norm": 0.5938405990600586, "learning_rate": 3.498914412988083e-05, "loss": 0.1811, "step": 17050 }, 
{ "epoch": 359.1578947368421, "grad_norm": 0.5262094736099243, "learning_rate": 3.488762301977796e-05, "loss": 0.1899, "step": 17060 }, { "epoch": 359.36842105263156, "grad_norm": 0.41206714510917664, "learning_rate": 3.47862182760502e-05, "loss": 0.1825, "step": 17070 }, { "epoch": 359.57894736842104, "grad_norm": 0.4352954626083374, "learning_rate": 3.468493007992433e-05, "loss": 0.1758, "step": 17080 }, { "epoch": 359.7894736842105, "grad_norm": 0.6213468909263611, "learning_rate": 3.458375861241874e-05, "loss": 0.1799, "step": 17090 }, { "epoch": 360.0, "grad_norm": 0.7452707290649414, "learning_rate": 3.448270405434323e-05, "loss": 0.1959, "step": 17100 }, { "epoch": 360.2105263157895, "grad_norm": 0.7887313365936279, "learning_rate": 3.438176658629873e-05, "loss": 0.1806, "step": 17110 }, { "epoch": 360.42105263157896, "grad_norm": 0.6283901333808899, "learning_rate": 3.428094638867684e-05, "loss": 0.1865, "step": 17120 }, { "epoch": 360.63157894736844, "grad_norm": 0.6054136157035828, "learning_rate": 3.418024364165959e-05, "loss": 0.1943, "step": 17130 }, { "epoch": 360.8421052631579, "grad_norm": 0.5418136119842529, "learning_rate": 3.4079658525219106e-05, "loss": 0.1799, "step": 17140 }, { "epoch": 361.05263157894734, "grad_norm": 0.4626767039299011, "learning_rate": 3.397919121911734e-05, "loss": 0.1906, "step": 17150 }, { "epoch": 361.2631578947368, "grad_norm": 0.4851849377155304, "learning_rate": 3.38788419029056e-05, "loss": 0.1909, "step": 17160 }, { "epoch": 361.4736842105263, "grad_norm": 0.4757574796676636, "learning_rate": 3.377861075592442e-05, "loss": 0.1731, "step": 17170 }, { "epoch": 361.6842105263158, "grad_norm": 0.6023973822593689, "learning_rate": 3.367849795730314e-05, "loss": 0.1866, "step": 17180 }, { "epoch": 361.89473684210526, "grad_norm": 0.49852490425109863, "learning_rate": 3.357850368595955e-05, "loss": 0.2079, "step": 17190 }, { "epoch": 362.10526315789474, "grad_norm": 0.569017231464386, "learning_rate": 
3.3478628120599573e-05, "loss": 0.1862, "step": 17200 }, { "epoch": 362.3157894736842, "grad_norm": 0.5501569509506226, "learning_rate": 3.337887143971711e-05, "loss": 0.1872, "step": 17210 }, { "epoch": 362.5263157894737, "grad_norm": 0.7906637787818909, "learning_rate": 3.3279233821593494e-05, "loss": 0.1925, "step": 17220 }, { "epoch": 362.7368421052632, "grad_norm": 0.6459226608276367, "learning_rate": 3.3179715444297286e-05, "loss": 0.1832, "step": 17230 }, { "epoch": 362.94736842105266, "grad_norm": 0.6349130272865295, "learning_rate": 3.308031648568396e-05, "loss": 0.202, "step": 17240 }, { "epoch": 363.1578947368421, "grad_norm": 0.4548916816711426, "learning_rate": 3.298103712339562e-05, "loss": 0.1833, "step": 17250 }, { "epoch": 363.36842105263156, "grad_norm": 0.53446364402771, "learning_rate": 3.288187753486056e-05, "loss": 0.1857, "step": 17260 }, { "epoch": 363.57894736842104, "grad_norm": 0.8272064328193665, "learning_rate": 3.2782837897293e-05, "loss": 0.1796, "step": 17270 }, { "epoch": 363.7894736842105, "grad_norm": 0.538015604019165, "learning_rate": 3.268391838769286e-05, "loss": 0.1971, "step": 17280 }, { "epoch": 364.0, "grad_norm": 0.6953075528144836, "learning_rate": 3.258511918284538e-05, "loss": 0.1908, "step": 17290 }, { "epoch": 364.2105263157895, "grad_norm": 0.64228755235672, "learning_rate": 3.248644045932074e-05, "loss": 0.1865, "step": 17300 }, { "epoch": 364.42105263157896, "grad_norm": 0.4800601899623871, "learning_rate": 3.2387882393473766e-05, "loss": 0.1773, "step": 17310 }, { "epoch": 364.63157894736844, "grad_norm": 0.8892537951469421, "learning_rate": 3.228944516144379e-05, "loss": 0.1917, "step": 17320 }, { "epoch": 364.8421052631579, "grad_norm": 0.6665611863136292, "learning_rate": 3.219112893915405e-05, "loss": 0.1877, "step": 17330 }, { "epoch": 365.05263157894734, "grad_norm": 0.5820897221565247, "learning_rate": 3.209293390231155e-05, "loss": 0.1975, "step": 17340 }, { "epoch": 365.2631578947368, "grad_norm": 
0.6017282009124756, "learning_rate": 3.199486022640681e-05, "loss": 0.1879, "step": 17350 }, { "epoch": 365.4736842105263, "grad_norm": 0.5204026103019714, "learning_rate": 3.189690808671336e-05, "loss": 0.1886, "step": 17360 }, { "epoch": 365.6842105263158, "grad_norm": 0.5471468567848206, "learning_rate": 3.1799077658287534e-05, "loss": 0.1852, "step": 17370 }, { "epoch": 365.89473684210526, "grad_norm": 0.5290389060974121, "learning_rate": 3.170136911596822e-05, "loss": 0.1863, "step": 17380 }, { "epoch": 366.10526315789474, "grad_norm": 0.5705314874649048, "learning_rate": 3.160378263437639e-05, "loss": 0.1946, "step": 17390 }, { "epoch": 366.3157894736842, "grad_norm": 0.5175087451934814, "learning_rate": 3.150631838791489e-05, "loss": 0.1771, "step": 17400 }, { "epoch": 366.5263157894737, "grad_norm": 0.5701705813407898, "learning_rate": 3.1408976550768156e-05, "loss": 0.1878, "step": 17410 }, { "epoch": 366.7368421052632, "grad_norm": 0.7232944965362549, "learning_rate": 3.131175729690187e-05, "loss": 0.1968, "step": 17420 }, { "epoch": 366.94736842105266, "grad_norm": 0.5719231963157654, "learning_rate": 3.1214660800062567e-05, "loss": 0.1725, "step": 17430 }, { "epoch": 367.1578947368421, "grad_norm": 0.542506217956543, "learning_rate": 3.111768723377741e-05, "loss": 0.1909, "step": 17440 }, { "epoch": 367.36842105263156, "grad_norm": 0.5347987413406372, "learning_rate": 3.1020836771353926e-05, "loss": 0.1861, "step": 17450 }, { "epoch": 367.57894736842104, "grad_norm": 0.5752246975898743, "learning_rate": 3.092410958587958e-05, "loss": 0.1868, "step": 17460 }, { "epoch": 367.7894736842105, "grad_norm": 0.5393999814987183, "learning_rate": 3.082750585022153e-05, "loss": 0.1806, "step": 17470 }, { "epoch": 368.0, "grad_norm": 0.5871136784553528, "learning_rate": 3.073102573702629e-05, "loss": 0.193, "step": 17480 }, { "epoch": 368.2105263157895, "grad_norm": 0.6469846963882446, "learning_rate": 3.063466941871952e-05, "loss": 0.1846, "step": 17490 }, { 
"epoch": 368.42105263157896, "grad_norm": 0.5563372373580933, "learning_rate": 3.0538437067505565e-05, "loss": 0.1901, "step": 17500 }, { "epoch": 368.63157894736844, "grad_norm": 0.5169476270675659, "learning_rate": 3.0442328855367197e-05, "loss": 0.1779, "step": 17510 }, { "epoch": 368.8421052631579, "grad_norm": 0.5597511529922485, "learning_rate": 3.0346344954065408e-05, "loss": 0.1934, "step": 17520 }, { "epoch": 369.05263157894734, "grad_norm": 0.5994619727134705, "learning_rate": 3.0250485535139028e-05, "loss": 0.182, "step": 17530 }, { "epoch": 369.2631578947368, "grad_norm": 0.5678781270980835, "learning_rate": 3.0154750769904317e-05, "loss": 0.1866, "step": 17540 }, { "epoch": 369.4736842105263, "grad_norm": 0.5461990237236023, "learning_rate": 3.005914082945488e-05, "loss": 0.1935, "step": 17550 }, { "epoch": 369.6842105263158, "grad_norm": 0.5570061802864075, "learning_rate": 2.996365588466117e-05, "loss": 0.1912, "step": 17560 }, { "epoch": 369.89473684210526, "grad_norm": 0.5612841844558716, "learning_rate": 2.9868296106170236e-05, "loss": 0.1772, "step": 17570 }, { "epoch": 370.10526315789474, "grad_norm": 0.5573035478591919, "learning_rate": 2.9773061664405455e-05, "loss": 0.1834, "step": 17580 }, { "epoch": 370.3157894736842, "grad_norm": 0.5268656611442566, "learning_rate": 2.9677952729566284e-05, "loss": 0.1801, "step": 17590 }, { "epoch": 370.5263157894737, "grad_norm": 0.4846976697444916, "learning_rate": 2.958296947162775e-05, "loss": 0.1878, "step": 17600 }, { "epoch": 370.7368421052632, "grad_norm": 0.625234067440033, "learning_rate": 2.9488112060340333e-05, "loss": 0.1863, "step": 17610 }, { "epoch": 370.94736842105266, "grad_norm": 0.7202187180519104, "learning_rate": 2.9393380665229666e-05, "loss": 0.1998, "step": 17620 }, { "epoch": 371.1578947368421, "grad_norm": 0.8498916029930115, "learning_rate": 2.9298775455596027e-05, "loss": 0.1814, "step": 17630 }, { "epoch": 371.36842105263156, "grad_norm": 0.5455896854400635, "learning_rate": 
2.920429660051436e-05, "loss": 0.1823, "step": 17640 }, { "epoch": 371.57894736842104, "grad_norm": 0.6018717288970947, "learning_rate": 2.910994426883361e-05, "loss": 0.1923, "step": 17650 }, { "epoch": 371.7894736842105, "grad_norm": 0.8865271806716919, "learning_rate": 2.9015718629176758e-05, "loss": 0.1908, "step": 17660 }, { "epoch": 372.0, "grad_norm": 0.7926931381225586, "learning_rate": 2.8921619849940286e-05, "loss": 0.1847, "step": 17670 }, { "epoch": 372.2105263157895, "grad_norm": 0.531242847442627, "learning_rate": 2.8827648099293925e-05, "loss": 0.18, "step": 17680 }, { "epoch": 372.42105263157896, "grad_norm": 0.7653034925460815, "learning_rate": 2.8733803545180492e-05, "loss": 0.1898, "step": 17690 }, { "epoch": 372.63157894736844, "grad_norm": 0.5573826432228088, "learning_rate": 2.86400863553154e-05, "loss": 0.1743, "step": 17700 }, { "epoch": 372.8421052631579, "grad_norm": 0.5950312614440918, "learning_rate": 2.854649669718642e-05, "loss": 0.2007, "step": 17710 }, { "epoch": 373.05263157894734, "grad_norm": 0.48360878229141235, "learning_rate": 2.845303473805352e-05, "loss": 0.1765, "step": 17720 }, { "epoch": 373.2631578947368, "grad_norm": 0.6277357935905457, "learning_rate": 2.8359700644948327e-05, "loss": 0.1944, "step": 17730 }, { "epoch": 373.4736842105263, "grad_norm": 0.6063629984855652, "learning_rate": 2.8266494584673987e-05, "loss": 0.1862, "step": 17740 }, { "epoch": 373.6842105263158, "grad_norm": 0.5055272579193115, "learning_rate": 2.817341672380489e-05, "loss": 0.1818, "step": 17750 }, { "epoch": 373.89473684210526, "grad_norm": 0.5888772010803223, "learning_rate": 2.8080467228686203e-05, "loss": 0.1894, "step": 17760 }, { "epoch": 374.10526315789474, "grad_norm": 0.650392472743988, "learning_rate": 2.798764626543382e-05, "loss": 0.1881, "step": 17770 }, { "epoch": 374.3157894736842, "grad_norm": 0.8182873725891113, "learning_rate": 2.7894953999933783e-05, "loss": 0.1782, "step": 17780 }, { "epoch": 374.5263157894737, 
"grad_norm": 0.5956078171730042, "learning_rate": 2.7802390597842264e-05, "loss": 0.2024, "step": 17790 }, { "epoch": 374.7368421052632, "grad_norm": 0.5271019339561462, "learning_rate": 2.7709956224585033e-05, "loss": 0.1798, "step": 17800 }, { "epoch": 374.94736842105266, "grad_norm": 0.6090193390846252, "learning_rate": 2.7617651045357307e-05, "loss": 0.1864, "step": 17810 }, { "epoch": 375.1578947368421, "grad_norm": 0.5264062285423279, "learning_rate": 2.7525475225123377e-05, "loss": 0.1925, "step": 17820 }, { "epoch": 375.36842105263156, "grad_norm": 0.8413964509963989, "learning_rate": 2.7433428928616444e-05, "loss": 0.1828, "step": 17830 }, { "epoch": 375.57894736842104, "grad_norm": 0.5953909158706665, "learning_rate": 2.7341512320338125e-05, "loss": 0.1952, "step": 17840 }, { "epoch": 375.7894736842105, "grad_norm": 0.6618128418922424, "learning_rate": 2.7249725564558294e-05, "loss": 0.1924, "step": 17850 }, { "epoch": 376.0, "grad_norm": 0.5150593519210815, "learning_rate": 2.7158068825314798e-05, "loss": 0.1819, "step": 17860 }, { "epoch": 376.2105263157895, "grad_norm": 0.6715971827507019, "learning_rate": 2.7066542266413042e-05, "loss": 0.1881, "step": 17870 }, { "epoch": 376.42105263157896, "grad_norm": 0.6898446679115295, "learning_rate": 2.6975146051425892e-05, "loss": 0.1784, "step": 17880 }, { "epoch": 376.63157894736844, "grad_norm": 0.549727737903595, "learning_rate": 2.6883880343693146e-05, "loss": 0.1982, "step": 17890 }, { "epoch": 376.8421052631579, "grad_norm": 0.5464901328086853, "learning_rate": 2.6792745306321464e-05, "loss": 0.2045, "step": 17900 }, { "epoch": 377.05263157894734, "grad_norm": 0.6344653367996216, "learning_rate": 2.670174110218393e-05, "loss": 0.1812, "step": 17910 }, { "epoch": 377.2631578947368, "grad_norm": 0.543652355670929, "learning_rate": 2.6610867893919768e-05, "loss": 0.1899, "step": 17920 }, { "epoch": 377.4736842105263, "grad_norm": 0.6504817605018616, "learning_rate": 2.6520125843934184e-05, "loss": 0.1917, 
"step": 17930 }, { "epoch": 377.6842105263158, "grad_norm": 0.511242687702179, "learning_rate": 2.6429515114397928e-05, "loss": 0.1858, "step": 17940 }, { "epoch": 377.89473684210526, "grad_norm": 0.4847296178340912, "learning_rate": 2.633903586724703e-05, "loss": 0.1846, "step": 17950 }, { "epoch": 378.10526315789474, "grad_norm": 0.5976632237434387, "learning_rate": 2.624868826418262e-05, "loss": 0.1861, "step": 17960 }, { "epoch": 378.3157894736842, "grad_norm": 0.5360187292098999, "learning_rate": 2.6158472466670502e-05, "loss": 0.1818, "step": 17970 }, { "epoch": 378.5263157894737, "grad_norm": 0.5664785504341125, "learning_rate": 2.6068388635940888e-05, "loss": 0.1807, "step": 17980 }, { "epoch": 378.7368421052632, "grad_norm": 0.6407875418663025, "learning_rate": 2.597843693298826e-05, "loss": 0.1823, "step": 17990 }, { "epoch": 378.94736842105266, "grad_norm": 0.6425795555114746, "learning_rate": 2.5888617518570834e-05, "loss": 0.1914, "step": 18000 }, { "epoch": 379.1578947368421, "grad_norm": 0.7084444165229797, "learning_rate": 2.5798930553210533e-05, "loss": 0.1944, "step": 18010 }, { "epoch": 379.36842105263156, "grad_norm": 0.6060276627540588, "learning_rate": 2.5709376197192437e-05, "loss": 0.1833, "step": 18020 }, { "epoch": 379.57894736842104, "grad_norm": 0.55902498960495, "learning_rate": 2.5619954610564767e-05, "loss": 0.1859, "step": 18030 }, { "epoch": 379.7894736842105, "grad_norm": 0.4850768744945526, "learning_rate": 2.5530665953138356e-05, "loss": 0.1866, "step": 18040 }, { "epoch": 380.0, "grad_norm": 0.6732696890830994, "learning_rate": 2.54415103844865e-05, "loss": 0.1842, "step": 18050 }, { "epoch": 380.2105263157895, "grad_norm": 0.48501572012901306, "learning_rate": 2.53524880639447e-05, "loss": 0.173, "step": 18060 }, { "epoch": 380.42105263157896, "grad_norm": 0.4836499094963074, "learning_rate": 2.526359915061025e-05, "loss": 0.1913, "step": 18070 }, { "epoch": 380.63157894736844, "grad_norm": 0.8080970048904419, "learning_rate": 
2.5174843803342062e-05, "loss": 0.1905, "step": 18080 }, { "epoch": 380.8421052631579, "grad_norm": 0.530227780342102, "learning_rate": 2.5086222180760298e-05, "loss": 0.187, "step": 18090 }, { "epoch": 381.05263157894734, "grad_norm": 0.45946136116981506, "learning_rate": 2.499773444124621e-05, "loss": 0.1861, "step": 18100 }, { "epoch": 381.2631578947368, "grad_norm": 0.5182830095291138, "learning_rate": 2.4909380742941703e-05, "loss": 0.1856, "step": 18110 }, { "epoch": 381.4736842105263, "grad_norm": 0.8414605259895325, "learning_rate": 2.482116124374918e-05, "loss": 0.1873, "step": 18120 }, { "epoch": 381.6842105263158, "grad_norm": 0.6385777592658997, "learning_rate": 2.473307610133121e-05, "loss": 0.194, "step": 18130 }, { "epoch": 381.89473684210526, "grad_norm": 0.6704742312431335, "learning_rate": 2.464512547311021e-05, "loss": 0.1955, "step": 18140 }, { "epoch": 382.10526315789474, "grad_norm": 0.5813697576522827, "learning_rate": 2.45573095162682e-05, "loss": 0.1752, "step": 18150 }, { "epoch": 382.3157894736842, "grad_norm": 0.6703978180885315, "learning_rate": 2.4469628387746523e-05, "loss": 0.1811, "step": 18160 }, { "epoch": 382.5263157894737, "grad_norm": 0.5987458825111389, "learning_rate": 2.438208224424561e-05, "loss": 0.1813, "step": 18170 }, { "epoch": 382.7368421052632, "grad_norm": 0.678972601890564, "learning_rate": 2.42946712422246e-05, "loss": 0.1964, "step": 18180 }, { "epoch": 382.94736842105266, "grad_norm": 0.5534781813621521, "learning_rate": 2.420739553790109e-05, "loss": 0.1738, "step": 18190 }, { "epoch": 383.1578947368421, "grad_norm": 0.4981260299682617, "learning_rate": 2.412025528725097e-05, "loss": 0.1803, "step": 18200 }, { "epoch": 383.36842105263156, "grad_norm": 0.5293849110603333, "learning_rate": 2.4033250646007976e-05, "loss": 0.193, "step": 18210 }, { "epoch": 383.57894736842104, "grad_norm": 0.4817885160446167, "learning_rate": 2.3946381769663484e-05, "loss": 0.1799, "step": 18220 }, { "epoch": 383.7894736842105, 
"grad_norm": 0.4696321189403534, "learning_rate": 2.3859648813466274e-05, "loss": 0.1982, "step": 18230 }, { "epoch": 384.0, "grad_norm": 0.761279821395874, "learning_rate": 2.377305193242224e-05, "loss": 0.1971, "step": 18240 }, { "epoch": 384.2105263157895, "grad_norm": 0.5053116679191589, "learning_rate": 2.3686591281294034e-05, "loss": 0.1784, "step": 18250 }, { "epoch": 384.42105263157896, "grad_norm": 0.6713317036628723, "learning_rate": 2.3600267014600796e-05, "loss": 0.1904, "step": 18260 }, { "epoch": 384.63157894736844, "grad_norm": 0.49430567026138306, "learning_rate": 2.3514079286618085e-05, "loss": 0.178, "step": 18270 }, { "epoch": 384.8421052631579, "grad_norm": 0.5539741516113281, "learning_rate": 2.3428028251377278e-05, "loss": 0.1921, "step": 18280 }, { "epoch": 385.05263157894734, "grad_norm": 0.5688350200653076, "learning_rate": 2.3342114062665533e-05, "loss": 0.1851, "step": 18290 }, { "epoch": 385.2631578947368, "grad_norm": 0.46466580033302307, "learning_rate": 2.3256336874025463e-05, "loss": 0.1808, "step": 18300 }, { "epoch": 385.4736842105263, "grad_norm": 0.5379757285118103, "learning_rate": 2.3170696838754814e-05, "loss": 0.1842, "step": 18310 }, { "epoch": 385.6842105263158, "grad_norm": 0.5040892362594604, "learning_rate": 2.308519410990618e-05, "loss": 0.1789, "step": 18320 }, { "epoch": 385.89473684210526, "grad_norm": 0.48588046431541443, "learning_rate": 2.2999828840286807e-05, "loss": 0.191, "step": 18330 }, { "epoch": 386.10526315789474, "grad_norm": 0.7209267020225525, "learning_rate": 2.291460118245832e-05, "loss": 0.1982, "step": 18340 }, { "epoch": 386.3157894736842, "grad_norm": 0.5708928108215332, "learning_rate": 2.282951128873628e-05, "loss": 0.1913, "step": 18350 }, { "epoch": 386.5263157894737, "grad_norm": 0.6246204376220703, "learning_rate": 2.2744559311190185e-05, "loss": 0.1827, "step": 18360 }, { "epoch": 386.7368421052632, "grad_norm": 0.6227989792823792, "learning_rate": 2.2659745401643005e-05, "loss": 0.175, 
"step": 18370 }, { "epoch": 386.94736842105266, "grad_norm": 0.5730044841766357, "learning_rate": 2.2575069711670928e-05, "loss": 0.179, "step": 18380 }, { "epoch": 387.1578947368421, "grad_norm": 0.6064084768295288, "learning_rate": 2.2490532392603103e-05, "loss": 0.1846, "step": 18390 }, { "epoch": 387.36842105263156, "grad_norm": 0.5600056052207947, "learning_rate": 2.2406133595521495e-05, "loss": 0.1718, "step": 18400 }, { "epoch": 387.57894736842104, "grad_norm": 0.6896880269050598, "learning_rate": 2.23218734712604e-05, "loss": 0.1965, "step": 18410 }, { "epoch": 387.7894736842105, "grad_norm": 0.4734235107898712, "learning_rate": 2.2237752170406333e-05, "loss": 0.1891, "step": 18420 }, { "epoch": 388.0, "grad_norm": 1.0163463354110718, "learning_rate": 2.2153769843297667e-05, "loss": 0.1877, "step": 18430 }, { "epoch": 388.2105263157895, "grad_norm": 0.6150470972061157, "learning_rate": 2.2069926640024486e-05, "loss": 0.1841, "step": 18440 }, { "epoch": 388.42105263157896, "grad_norm": 0.4762638211250305, "learning_rate": 2.1986222710428163e-05, "loss": 0.1835, "step": 18450 }, { "epoch": 388.63157894736844, "grad_norm": 0.5731767416000366, "learning_rate": 2.190265820410117e-05, "loss": 0.2089, "step": 18460 }, { "epoch": 388.8421052631579, "grad_norm": 0.600663959980011, "learning_rate": 2.1819233270386852e-05, "loss": 0.1831, "step": 18470 }, { "epoch": 389.05263157894734, "grad_norm": 0.6098585724830627, "learning_rate": 2.1735948058379118e-05, "loss": 0.1861, "step": 18480 }, { "epoch": 389.2631578947368, "grad_norm": 0.6013163328170776, "learning_rate": 2.1652802716922126e-05, "loss": 0.1851, "step": 18490 }, { "epoch": 389.4736842105263, "grad_norm": 0.5845993757247925, "learning_rate": 2.1569797394610048e-05, "loss": 0.1787, "step": 18500 }, { "epoch": 389.6842105263158, "grad_norm": 0.635697066783905, "learning_rate": 2.1486932239786916e-05, "loss": 0.1819, "step": 18510 }, { "epoch": 389.89473684210526, "grad_norm": 0.49835336208343506, 
"learning_rate": 2.1404207400546173e-05, "loss": 0.1815, "step": 18520 }, { "epoch": 390.10526315789474, "grad_norm": 0.4586002230644226, "learning_rate": 2.1321623024730474e-05, "loss": 0.1867, "step": 18530 }, { "epoch": 390.3157894736842, "grad_norm": 0.6160049438476562, "learning_rate": 2.1239179259931563e-05, "loss": 0.1922, "step": 18540 }, { "epoch": 390.5263157894737, "grad_norm": 0.5551902651786804, "learning_rate": 2.1156876253489766e-05, "loss": 0.1907, "step": 18550 }, { "epoch": 390.7368421052632, "grad_norm": 0.7020893692970276, "learning_rate": 2.10747141524939e-05, "loss": 0.1816, "step": 18560 }, { "epoch": 390.94736842105266, "grad_norm": 0.5538049936294556, "learning_rate": 2.0992693103781e-05, "loss": 0.183, "step": 18570 }, { "epoch": 391.1578947368421, "grad_norm": 0.5073581337928772, "learning_rate": 2.0910813253935936e-05, "loss": 0.1808, "step": 18580 }, { "epoch": 391.36842105263156, "grad_norm": 0.6061038374900818, "learning_rate": 2.082907474929131e-05, "loss": 0.1868, "step": 18590 }, { "epoch": 391.57894736842104, "grad_norm": 0.541469156742096, "learning_rate": 2.0747477735927044e-05, "loss": 0.1778, "step": 18600 }, { "epoch": 391.7894736842105, "grad_norm": 0.6137720346450806, "learning_rate": 2.066602235967029e-05, "loss": 0.1857, "step": 18610 }, { "epoch": 392.0, "grad_norm": 0.7164705991744995, "learning_rate": 2.0584708766094963e-05, "loss": 0.1914, "step": 18620 }, { "epoch": 392.2105263157895, "grad_norm": 0.6752933859825134, "learning_rate": 2.050353710052164e-05, "loss": 0.1878, "step": 18630 }, { "epoch": 392.42105263157896, "grad_norm": 0.5490244030952454, "learning_rate": 2.0422507508017286e-05, "loss": 0.1797, "step": 18640 }, { "epoch": 392.63157894736844, "grad_norm": 0.639519989490509, "learning_rate": 2.0341620133394902e-05, "loss": 0.1836, "step": 18650 }, { "epoch": 392.8421052631579, "grad_norm": 0.6504760980606079, "learning_rate": 2.0260875121213297e-05, "loss": 0.1853, "step": 18660 }, { "epoch": 
393.05263157894734, "grad_norm": 0.7008463740348816, "learning_rate": 2.0180272615776985e-05, "loss": 0.1989, "step": 18670 }, { "epoch": 393.2631578947368, "grad_norm": 0.5902712941169739, "learning_rate": 2.009981276113565e-05, "loss": 0.1822, "step": 18680 }, { "epoch": 393.4736842105263, "grad_norm": 0.5219281911849976, "learning_rate": 2.0019495701084102e-05, "loss": 0.1871, "step": 18690 }, { "epoch": 393.6842105263158, "grad_norm": 0.6408507823944092, "learning_rate": 1.9939321579161994e-05, "loss": 0.1794, "step": 18700 }, { "epoch": 393.89473684210526, "grad_norm": 0.5509892106056213, "learning_rate": 1.985929053865342e-05, "loss": 0.1868, "step": 18710 }, { "epoch": 394.10526315789474, "grad_norm": 0.5968159437179565, "learning_rate": 1.977940272258688e-05, "loss": 0.1922, "step": 18720 }, { "epoch": 394.3157894736842, "grad_norm": 0.6544321179389954, "learning_rate": 1.969965827373481e-05, "loss": 0.1899, "step": 18730 }, { "epoch": 394.5263157894737, "grad_norm": 0.9849665760993958, "learning_rate": 1.9620057334613516e-05, "loss": 0.1883, "step": 18740 }, { "epoch": 394.7368421052632, "grad_norm": 0.5996710658073425, "learning_rate": 1.954060004748276e-05, "loss": 0.1782, "step": 18750 }, { "epoch": 394.94736842105266, "grad_norm": 0.7064588069915771, "learning_rate": 1.9461286554345592e-05, "loss": 0.1827, "step": 18760 }, { "epoch": 395.1578947368421, "grad_norm": 0.531457781791687, "learning_rate": 1.9382116996948074e-05, "loss": 0.1759, "step": 18770 }, { "epoch": 395.36842105263156, "grad_norm": 0.5805523991584778, "learning_rate": 1.930309151677907e-05, "loss": 0.1811, "step": 18780 }, { "epoch": 395.57894736842104, "grad_norm": 0.6076486706733704, "learning_rate": 1.922421025506992e-05, "loss": 0.1919, "step": 18790 }, { "epoch": 395.7894736842105, "grad_norm": 0.46915534138679504, "learning_rate": 1.9145473352794197e-05, "loss": 0.1876, "step": 18800 }, { "epoch": 396.0, "grad_norm": 0.6902257800102234, "learning_rate": 1.9066880950667565e-05, 
"loss": 0.1893, "step": 18810 }, { "epoch": 396.2105263157895, "grad_norm": 0.5422173738479614, "learning_rate": 1.8988433189147325e-05, "loss": 0.1844, "step": 18820 }, { "epoch": 396.42105263157896, "grad_norm": 0.42955440282821655, "learning_rate": 1.891013020843242e-05, "loss": 0.1751, "step": 18830 }, { "epoch": 396.63157894736844, "grad_norm": 0.5527623891830444, "learning_rate": 1.8831972148462906e-05, "loss": 0.1857, "step": 18840 }, { "epoch": 396.8421052631579, "grad_norm": 0.5967190861701965, "learning_rate": 1.8753959148919964e-05, "loss": 0.1952, "step": 18850 }, { "epoch": 397.05263157894734, "grad_norm": 0.4433288276195526, "learning_rate": 1.8676091349225444e-05, "loss": 0.1828, "step": 18860 }, { "epoch": 397.2631578947368, "grad_norm": 0.6135048270225525, "learning_rate": 1.8598368888541706e-05, "loss": 0.1861, "step": 18870 }, { "epoch": 397.4736842105263, "grad_norm": 0.55300372838974, "learning_rate": 1.852079190577145e-05, "loss": 0.1795, "step": 18880 }, { "epoch": 397.6842105263158, "grad_norm": 0.5510004162788391, "learning_rate": 1.8443360539557285e-05, "loss": 0.1923, "step": 18890 }, { "epoch": 397.89473684210526, "grad_norm": 0.5126235485076904, "learning_rate": 1.8366074928281607e-05, "loss": 0.1815, "step": 18900 }, { "epoch": 398.10526315789474, "grad_norm": 0.5166956186294556, "learning_rate": 1.8288935210066373e-05, "loss": 0.1909, "step": 18910 }, { "epoch": 398.3157894736842, "grad_norm": 0.5784156322479248, "learning_rate": 1.8211941522772736e-05, "loss": 0.1917, "step": 18920 }, { "epoch": 398.5263157894737, "grad_norm": 0.4633670449256897, "learning_rate": 1.8135094004000884e-05, "loss": 0.1796, "step": 18930 }, { "epoch": 398.7368421052632, "grad_norm": 0.5912382006645203, "learning_rate": 1.805839279108984e-05, "loss": 0.1799, "step": 18940 }, { "epoch": 398.94736842105266, "grad_norm": 0.8275567889213562, "learning_rate": 1.798183802111706e-05, "loss": 0.1967, "step": 18950 }, { "epoch": 399.1578947368421, "grad_norm": 
0.5267394781112671, "learning_rate": 1.7905429830898378e-05, "loss": 0.1839, "step": 18960 }, { "epoch": 399.36842105263156, "grad_norm": 0.5726824998855591, "learning_rate": 1.782916835698758e-05, "loss": 0.1909, "step": 18970 }, { "epoch": 399.57894736842104, "grad_norm": 0.7753190398216248, "learning_rate": 1.7753053735676317e-05, "loss": 0.2027, "step": 18980 }, { "epoch": 399.7894736842105, "grad_norm": 0.637991726398468, "learning_rate": 1.7677086102993744e-05, "loss": 0.1696, "step": 18990 }, { "epoch": 400.0, "grad_norm": 0.7141334414482117, "learning_rate": 1.7601265594706316e-05, "loss": 0.1854, "step": 19000 }, { "epoch": 400.2105263157895, "grad_norm": 0.8016759753227234, "learning_rate": 1.752559234631762e-05, "loss": 0.187, "step": 19010 }, { "epoch": 400.42105263157896, "grad_norm": 0.5627360939979553, "learning_rate": 1.7450066493067997e-05, "loss": 0.1826, "step": 19020 }, { "epoch": 400.63157894736844, "grad_norm": 0.6535181403160095, "learning_rate": 1.7374688169934385e-05, "loss": 0.1826, "step": 19030 }, { "epoch": 400.8421052631579, "grad_norm": 0.6111413836479187, "learning_rate": 1.7299457511630057e-05, "loss": 0.1889, "step": 19040 }, { "epoch": 401.05263157894734, "grad_norm": 0.5383073687553406, "learning_rate": 1.722437465260445e-05, "loss": 0.1904, "step": 19050 }, { "epoch": 401.2631578947368, "grad_norm": 0.8181249499320984, "learning_rate": 1.7149439727042736e-05, "loss": 0.1775, "step": 19060 }, { "epoch": 401.4736842105263, "grad_norm": 0.6523582339286804, "learning_rate": 1.7074652868865814e-05, "loss": 0.1846, "step": 19070 }, { "epoch": 401.6842105263158, "grad_norm": 0.8838443160057068, "learning_rate": 1.7000014211729964e-05, "loss": 0.1955, "step": 19080 }, { "epoch": 401.89473684210526, "grad_norm": 0.614313006401062, "learning_rate": 1.692552388902653e-05, "loss": 0.1855, "step": 19090 }, { "epoch": 402.10526315789474, "grad_norm": 0.5090711116790771, "learning_rate": 1.6851182033881795e-05, "loss": 0.1891, "step": 19100 }, 
{ "epoch": 402.3157894736842, "grad_norm": 0.5899013876914978, "learning_rate": 1.677698877915669e-05, "loss": 0.1798, "step": 19110 }, { "epoch": 402.5263157894737, "grad_norm": 0.550887405872345, "learning_rate": 1.6702944257446627e-05, "loss": 0.1776, "step": 19120 }, { "epoch": 402.7368421052632, "grad_norm": 0.5447081923484802, "learning_rate": 1.6629048601081167e-05, "loss": 0.1982, "step": 19130 }, { "epoch": 402.94736842105266, "grad_norm": 0.5017291903495789, "learning_rate": 1.655530194212379e-05, "loss": 0.181, "step": 19140 }, { "epoch": 403.1578947368421, "grad_norm": 0.6135120391845703, "learning_rate": 1.648170441237179e-05, "loss": 0.1986, "step": 19150 }, { "epoch": 403.36842105263156, "grad_norm": 0.5479750633239746, "learning_rate": 1.640825614335586e-05, "loss": 0.182, "step": 19160 }, { "epoch": 403.57894736842104, "grad_norm": 0.486905962228775, "learning_rate": 1.6334957266339933e-05, "loss": 0.1747, "step": 19170 }, { "epoch": 403.7894736842105, "grad_norm": 0.5777673125267029, "learning_rate": 1.6261807912321037e-05, "loss": 0.1829, "step": 19180 }, { "epoch": 404.0, "grad_norm": 0.6578062772750854, "learning_rate": 1.6188808212028916e-05, "loss": 0.1882, "step": 19190 }, { "epoch": 404.2105263157895, "grad_norm": 0.7375777959823608, "learning_rate": 1.611595829592587e-05, "loss": 0.1956, "step": 19200 }, { "epoch": 404.42105263157896, "grad_norm": 0.5458284020423889, "learning_rate": 1.6043258294206487e-05, "loss": 0.1829, "step": 19210 }, { "epoch": 404.63157894736844, "grad_norm": 0.4725562334060669, "learning_rate": 1.5970708336797503e-05, "loss": 0.1706, "step": 19220 }, { "epoch": 404.8421052631579, "grad_norm": 0.6534218192100525, "learning_rate": 1.589830855335742e-05, "loss": 0.1896, "step": 19230 }, { "epoch": 405.05263157894734, "grad_norm": 0.44919002056121826, "learning_rate": 1.582605907327638e-05, "loss": 0.1904, "step": 19240 }, { "epoch": 405.2631578947368, "grad_norm": 0.5909680128097534, "learning_rate": 
1.5753960025675963e-05, "loss": 0.1879, "step": 19250 }, { "epoch": 405.4736842105263, "grad_norm": 0.6542550921440125, "learning_rate": 1.5682011539408826e-05, "loss": 0.1772, "step": 19260 }, { "epoch": 405.6842105263158, "grad_norm": 0.5599654316902161, "learning_rate": 1.561021374305859e-05, "loss": 0.1792, "step": 19270 }, { "epoch": 405.89473684210526, "grad_norm": 0.6703615188598633, "learning_rate": 1.553856676493953e-05, "loss": 0.2035, "step": 19280 }, { "epoch": 406.10526315789474, "grad_norm": 0.6065393090248108, "learning_rate": 1.5467070733096466e-05, "loss": 0.1823, "step": 19290 }, { "epoch": 406.3157894736842, "grad_norm": 0.6260120272636414, "learning_rate": 1.5395725775304348e-05, "loss": 0.1947, "step": 19300 }, { "epoch": 406.5263157894737, "grad_norm": 0.5651423335075378, "learning_rate": 1.5324532019068195e-05, "loss": 0.1923, "step": 19310 }, { "epoch": 406.7368421052632, "grad_norm": 0.5632277131080627, "learning_rate": 1.5253489591622837e-05, "loss": 0.1795, "step": 19320 }, { "epoch": 406.94736842105266, "grad_norm": 0.5379475951194763, "learning_rate": 1.5182598619932576e-05, "loss": 0.1904, "step": 19330 }, { "epoch": 407.1578947368421, "grad_norm": 0.5060853362083435, "learning_rate": 1.511185923069105e-05, "loss": 0.1758, "step": 19340 }, { "epoch": 407.36842105263156, "grad_norm": 0.5877968668937683, "learning_rate": 1.5041271550321079e-05, "loss": 0.1813, "step": 19350 }, { "epoch": 407.57894736842104, "grad_norm": 0.7655476927757263, "learning_rate": 1.497083570497424e-05, "loss": 0.1915, "step": 19360 }, { "epoch": 407.7894736842105, "grad_norm": 0.5808361172676086, "learning_rate": 1.4900551820530828e-05, "loss": 0.1809, "step": 19370 }, { "epoch": 408.0, "grad_norm": 0.5417425036430359, "learning_rate": 1.4830420022599523e-05, "loss": 0.191, "step": 19380 }, { "epoch": 408.2105263157895, "grad_norm": 0.564896821975708, "learning_rate": 1.4760440436517253e-05, "loss": 0.1742, "step": 19390 }, { "epoch": 408.42105263157896, 
"grad_norm": 0.42806559801101685, "learning_rate": 1.4690613187348867e-05, "loss": 0.1943, "step": 19400 }, { "epoch": 408.63157894736844, "grad_norm": 0.5757007002830505, "learning_rate": 1.4620938399886963e-05, "loss": 0.1872, "step": 19410 }, { "epoch": 408.8421052631579, "grad_norm": 0.5994120240211487, "learning_rate": 1.4551416198651701e-05, "loss": 0.1847, "step": 19420 }, { "epoch": 409.05263157894734, "grad_norm": 0.6407523155212402, "learning_rate": 1.448204670789054e-05, "loss": 0.1928, "step": 19430 }, { "epoch": 409.2631578947368, "grad_norm": 0.5221366882324219, "learning_rate": 1.4412830051578009e-05, "loss": 0.1834, "step": 19440 }, { "epoch": 409.4736842105263, "grad_norm": 0.5237503051757812, "learning_rate": 1.4343766353415444e-05, "loss": 0.1758, "step": 19450 }, { "epoch": 409.6842105263158, "grad_norm": 0.6575828790664673, "learning_rate": 1.4274855736830938e-05, "loss": 0.1866, "step": 19460 }, { "epoch": 409.89473684210526, "grad_norm": 0.47361230850219727, "learning_rate": 1.4206098324978912e-05, "loss": 0.1845, "step": 19470 }, { "epoch": 410.10526315789474, "grad_norm": 0.5174840688705444, "learning_rate": 1.4137494240739979e-05, "loss": 0.1986, "step": 19480 }, { "epoch": 410.3157894736842, "grad_norm": 0.48306819796562195, "learning_rate": 1.4069043606720811e-05, "loss": 0.1861, "step": 19490 }, { "epoch": 410.5263157894737, "grad_norm": 0.4556773602962494, "learning_rate": 1.4000746545253774e-05, "loss": 0.1815, "step": 19500 }, { "epoch": 410.7368421052632, "grad_norm": 0.5436505079269409, "learning_rate": 1.3932603178396752e-05, "loss": 0.1896, "step": 19510 }, { "epoch": 410.94736842105266, "grad_norm": 0.6052523851394653, "learning_rate": 1.3864613627933042e-05, "loss": 0.181, "step": 19520 }, { "epoch": 411.1578947368421, "grad_norm": 0.5917760729789734, "learning_rate": 1.3796778015370959e-05, "loss": 0.1892, "step": 19530 }, { "epoch": 411.36842105263156, "grad_norm": 0.5070799589157104, "learning_rate": 1.372909646194377e-05, 
"loss": 0.1841, "step": 19540 }, { "epoch": 411.57894736842104, "grad_norm": 0.5299696922302246, "learning_rate": 1.366156908860936e-05, "loss": 0.1858, "step": 19550 }, { "epoch": 411.7894736842105, "grad_norm": 0.5153812766075134, "learning_rate": 1.359419601605012e-05, "loss": 0.1798, "step": 19560 }, { "epoch": 412.0, "grad_norm": 0.7871899604797363, "learning_rate": 1.3526977364672644e-05, "loss": 0.1864, "step": 19570 }, { "epoch": 412.2105263157895, "grad_norm": 0.6095196008682251, "learning_rate": 1.3459913254607537e-05, "loss": 0.1817, "step": 19580 }, { "epoch": 412.42105263157896, "grad_norm": 0.5391228795051575, "learning_rate": 1.3393003805709281e-05, "loss": 0.182, "step": 19590 }, { "epoch": 412.63157894736844, "grad_norm": 0.9291274547576904, "learning_rate": 1.332624913755588e-05, "loss": 0.1953, "step": 19600 }, { "epoch": 412.8421052631579, "grad_norm": 0.5795119404792786, "learning_rate": 1.3259649369448768e-05, "loss": 0.1871, "step": 19610 }, { "epoch": 413.05263157894734, "grad_norm": 0.7376877069473267, "learning_rate": 1.3193204620412481e-05, "loss": 0.199, "step": 19620 }, { "epoch": 413.2631578947368, "grad_norm": 0.6380026936531067, "learning_rate": 1.312691500919463e-05, "loss": 0.1857, "step": 19630 }, { "epoch": 413.4736842105263, "grad_norm": 0.5857008099555969, "learning_rate": 1.3060780654265447e-05, "loss": 0.2078, "step": 19640 }, { "epoch": 413.6842105263158, "grad_norm": 0.6055216789245605, "learning_rate": 1.299480167381778e-05, "loss": 0.1767, "step": 19650 }, { "epoch": 413.89473684210526, "grad_norm": 0.714047372341156, "learning_rate": 1.2928978185766727e-05, "loss": 0.1731, "step": 19660 }, { "epoch": 414.10526315789474, "grad_norm": 0.767358124256134, "learning_rate": 1.2863310307749577e-05, "loss": 0.2019, "step": 19670 }, { "epoch": 414.3157894736842, "grad_norm": 0.5826098322868347, "learning_rate": 1.2797798157125441e-05, "loss": 0.1889, "step": 19680 }, { "epoch": 414.5263157894737, "grad_norm": 0.46154114603996277, 
"learning_rate": 1.2732441850975185e-05, "loss": 0.1743, "step": 19690 }, { "epoch": 414.7368421052632, "grad_norm": 0.5386494398117065, "learning_rate": 1.2667241506101124e-05, "loss": 0.1811, "step": 19700 }, { "epoch": 414.94736842105266, "grad_norm": 0.590414822101593, "learning_rate": 1.2602197239026814e-05, "loss": 0.1926, "step": 19710 }, { "epoch": 415.1578947368421, "grad_norm": 0.722994327545166, "learning_rate": 1.2537309165996913e-05, "loss": 0.1967, "step": 19720 }, { "epoch": 415.36842105263156, "grad_norm": 0.5225125551223755, "learning_rate": 1.247257740297696e-05, "loss": 0.1861, "step": 19730 }, { "epoch": 415.57894736842104, "grad_norm": 0.4299336075782776, "learning_rate": 1.2408002065653091e-05, "loss": 0.188, "step": 19740 }, { "epoch": 415.7894736842105, "grad_norm": 0.5853255987167358, "learning_rate": 1.234358326943188e-05, "loss": 0.1818, "step": 19750 }, { "epoch": 416.0, "grad_norm": 0.5214622020721436, "learning_rate": 1.2279321129440202e-05, "loss": 0.1748, "step": 19760 }, { "epoch": 416.2105263157895, "grad_norm": 0.6937418580055237, "learning_rate": 1.221521576052489e-05, "loss": 0.1787, "step": 19770 }, { "epoch": 416.42105263157896, "grad_norm": 0.6764062643051147, "learning_rate": 1.2151267277252665e-05, "loss": 0.1894, "step": 19780 }, { "epoch": 416.63157894736844, "grad_norm": 0.5873314738273621, "learning_rate": 1.2087475793909798e-05, "loss": 0.1764, "step": 19790 }, { "epoch": 416.8421052631579, "grad_norm": 0.9404758810997009, "learning_rate": 1.2023841424502048e-05, "loss": 0.209, "step": 19800 }, { "epoch": 417.05263157894734, "grad_norm": 0.6922588348388672, "learning_rate": 1.1960364282754344e-05, "loss": 0.183, "step": 19810 }, { "epoch": 417.2631578947368, "grad_norm": 0.8207189440727234, "learning_rate": 1.1897044482110586e-05, "loss": 0.1839, "step": 19820 }, { "epoch": 417.4736842105263, "grad_norm": 0.5307655930519104, "learning_rate": 1.1833882135733599e-05, "loss": 0.1842, "step": 19830 }, { "epoch": 
417.6842105263158, "grad_norm": 0.6499212980270386, "learning_rate": 1.1770877356504683e-05, "loss": 0.184, "step": 19840 }, { "epoch": 417.89473684210526, "grad_norm": 0.5272055268287659, "learning_rate": 1.1708030257023595e-05, "loss": 0.1798, "step": 19850 }, { "epoch": 418.10526315789474, "grad_norm": 0.5772253274917603, "learning_rate": 1.164534094960833e-05, "loss": 0.169, "step": 19860 }, { "epoch": 418.3157894736842, "grad_norm": 0.6082757711410522, "learning_rate": 1.1582809546294816e-05, "loss": 0.1939, "step": 19870 }, { "epoch": 418.5263157894737, "grad_norm": 0.8696257472038269, "learning_rate": 1.15204361588368e-05, "loss": 0.1822, "step": 19880 }, { "epoch": 418.7368421052632, "grad_norm": 0.7336968183517456, "learning_rate": 1.1458220898705663e-05, "loss": 0.1893, "step": 19890 }, { "epoch": 418.94736842105266, "grad_norm": 0.7387612462043762, "learning_rate": 1.1396163877090148e-05, "loss": 0.1973, "step": 19900 }, { "epoch": 419.1578947368421, "grad_norm": 0.660318911075592, "learning_rate": 1.1334265204896233e-05, "loss": 0.1809, "step": 19910 }, { "epoch": 419.36842105263156, "grad_norm": 0.6014663577079773, "learning_rate": 1.1272524992746846e-05, "loss": 0.192, "step": 19920 }, { "epoch": 419.57894736842104, "grad_norm": 0.6727678775787354, "learning_rate": 1.1210943350981806e-05, "loss": 0.1865, "step": 19930 }, { "epoch": 419.7894736842105, "grad_norm": 0.5072178244590759, "learning_rate": 1.1149520389657463e-05, "loss": 0.176, "step": 19940 }, { "epoch": 420.0, "grad_norm": 0.5579133033752441, "learning_rate": 1.1088256218546611e-05, "loss": 0.1804, "step": 19950 }, { "epoch": 420.2105263157895, "grad_norm": 0.46790534257888794, "learning_rate": 1.1027150947138232e-05, "loss": 0.1986, "step": 19960 }, { "epoch": 420.42105263157896, "grad_norm": 0.6098372936248779, "learning_rate": 1.0966204684637405e-05, "loss": 0.1805, "step": 19970 }, { "epoch": 420.63157894736844, "grad_norm": 0.734483003616333, "learning_rate": 1.0905417539964946e-05, 
"loss": 0.1899, "step": 19980 }, { "epoch": 420.8421052631579, "grad_norm": 0.5049176216125488, "learning_rate": 1.0844789621757335e-05, "loss": 0.173, "step": 19990 }, { "epoch": 421.05263157894734, "grad_norm": 0.5509609580039978, "learning_rate": 1.0784321038366529e-05, "loss": 0.186, "step": 20000 }, { "epoch": 421.2631578947368, "grad_norm": 0.6344820261001587, "learning_rate": 1.0724011897859653e-05, "loss": 0.1726, "step": 20010 }, { "epoch": 421.4736842105263, "grad_norm": 0.5593404769897461, "learning_rate": 1.0663862308018924e-05, "loss": 0.1833, "step": 20020 }, { "epoch": 421.6842105263158, "grad_norm": 0.6860533356666565, "learning_rate": 1.060387237634145e-05, "loss": 0.1867, "step": 20030 }, { "epoch": 421.89473684210526, "grad_norm": 0.5627481937408447, "learning_rate": 1.054404221003894e-05, "loss": 0.1808, "step": 20040 }, { "epoch": 422.10526315789474, "grad_norm": 0.5001289248466492, "learning_rate": 1.0484371916037606e-05, "loss": 0.1808, "step": 20050 }, { "epoch": 422.3157894736842, "grad_norm": 0.5672506093978882, "learning_rate": 1.0424861600977898e-05, "loss": 0.174, "step": 20060 }, { "epoch": 422.5263157894737, "grad_norm": 0.6734273433685303, "learning_rate": 1.0365511371214465e-05, "loss": 0.192, "step": 20070 }, { "epoch": 422.7368421052632, "grad_norm": 0.4520343840122223, "learning_rate": 1.0306321332815761e-05, "loss": 0.1846, "step": 20080 }, { "epoch": 422.94736842105266, "grad_norm": 0.6919784545898438, "learning_rate": 1.0247291591563956e-05, "loss": 0.2028, "step": 20090 }, { "epoch": 423.1578947368421, "grad_norm": 0.7424619197845459, "learning_rate": 1.018842225295481e-05, "loss": 0.187, "step": 20100 }, { "epoch": 423.36842105263156, "grad_norm": 0.6489419937133789, "learning_rate": 1.0129713422197362e-05, "loss": 0.1847, "step": 20110 }, { "epoch": 423.57894736842104, "grad_norm": 0.6978447437286377, "learning_rate": 1.0071165204213794e-05, "loss": 0.1783, "step": 20120 }, { "epoch": 423.7894736842105, "grad_norm": 
0.6544562578201294, "learning_rate": 1.0012777703639275e-05, "loss": 0.1866, "step": 20130 }, { "epoch": 424.0, "grad_norm": 1.0255416631698608, "learning_rate": 9.954551024821767e-06, "loss": 0.1943, "step": 20140 }, { "epoch": 424.2105263157895, "grad_norm": 0.44083118438720703, "learning_rate": 9.896485271821755e-06, "loss": 0.1805, "step": 20150 }, { "epoch": 424.42105263157896, "grad_norm": 0.4890379011631012, "learning_rate": 9.838580548412135e-06, "loss": 0.1836, "step": 20160 }, { "epoch": 424.63157894736844, "grad_norm": 0.5661543607711792, "learning_rate": 9.780836958078087e-06, "loss": 0.189, "step": 20170 }, { "epoch": 424.8421052631579, "grad_norm": 0.555959939956665, "learning_rate": 9.72325460401674e-06, "loss": 0.1852, "step": 20180 }, { "epoch": 425.05263157894734, "grad_norm": 0.6740918755531311, "learning_rate": 9.665833589137085e-06, "loss": 0.1896, "step": 20190 }, { "epoch": 425.2631578947368, "grad_norm": 0.5326783061027527, "learning_rate": 9.608574016059823e-06, "loss": 0.1747, "step": 20200 }, { "epoch": 425.4736842105263, "grad_norm": 0.6140498518943787, "learning_rate": 9.551475987117065e-06, "loss": 0.1852, "step": 20210 }, { "epoch": 425.6842105263158, "grad_norm": 0.7099624276161194, "learning_rate": 9.49453960435226e-06, "loss": 0.1775, "step": 20220 }, { "epoch": 425.89473684210526, "grad_norm": 0.6613169312477112, "learning_rate": 9.437764969519935e-06, "loss": 0.1855, "step": 20230 }, { "epoch": 426.10526315789474, "grad_norm": 0.5651838779449463, "learning_rate": 9.381152184085595e-06, "loss": 0.1926, "step": 20240 }, { "epoch": 426.3157894736842, "grad_norm": 0.5982670783996582, "learning_rate": 9.32470134922544e-06, "loss": 0.1869, "step": 20250 }, { "epoch": 426.5263157894737, "grad_norm": 0.5540538430213928, "learning_rate": 9.26841256582629e-06, "loss": 0.1862, "step": 20260 }, { "epoch": 426.7368421052632, "grad_norm": 0.7269121408462524, "learning_rate": 9.212285934485332e-06, "loss": 0.1826, "step": 20270 }, { "epoch": 
426.94736842105266, "grad_norm": 0.9267326593399048, "learning_rate": 9.15632155550994e-06, "loss": 0.1919, "step": 20280 }, { "epoch": 427.1578947368421, "grad_norm": 0.600455641746521, "learning_rate": 9.10051952891754e-06, "loss": 0.1857, "step": 20290 }, { "epoch": 427.36842105263156, "grad_norm": 0.5266342759132385, "learning_rate": 9.044879954435381e-06, "loss": 0.1725, "step": 20300 }, { "epoch": 427.57894736842104, "grad_norm": 0.8109986186027527, "learning_rate": 8.989402931500434e-06, "loss": 0.1943, "step": 20310 }, { "epoch": 427.7894736842105, "grad_norm": 0.5752395391464233, "learning_rate": 8.934088559259135e-06, "loss": 0.1944, "step": 20320 }, { "epoch": 428.0, "grad_norm": 0.6188866496086121, "learning_rate": 8.878936936567195e-06, "loss": 0.1817, "step": 20330 }, { "epoch": 428.2105263157895, "grad_norm": 0.6044108271598816, "learning_rate": 8.823948161989549e-06, "loss": 0.1799, "step": 20340 }, { "epoch": 428.42105263157896, "grad_norm": 0.5862536430358887, "learning_rate": 8.76912233380005e-06, "loss": 0.1798, "step": 20350 }, { "epoch": 428.63157894736844, "grad_norm": 0.6204822063446045, "learning_rate": 8.714459549981302e-06, "loss": 0.1862, "step": 20360 }, { "epoch": 428.8421052631579, "grad_norm": 0.5744066834449768, "learning_rate": 8.65995990822459e-06, "loss": 0.1869, "step": 20370 }, { "epoch": 429.05263157894734, "grad_norm": 0.6363000273704529, "learning_rate": 8.60562350592964e-06, "loss": 0.1959, "step": 20380 }, { "epoch": 429.2631578947368, "grad_norm": 0.5891885757446289, "learning_rate": 8.551450440204379e-06, "loss": 0.1804, "step": 20390 }, { "epoch": 429.4736842105263, "grad_norm": 0.5133945345878601, "learning_rate": 8.497440807864853e-06, "loss": 0.1825, "step": 20400 }, { "epoch": 429.6842105263158, "grad_norm": 0.578368067741394, "learning_rate": 8.443594705435054e-06, "loss": 0.1823, "step": 20410 }, { "epoch": 429.89473684210526, "grad_norm": 0.6351949572563171, "learning_rate": 8.389912229146702e-06, "loss": 0.1895, 
"step": 20420 }, { "epoch": 430.10526315789474, "grad_norm": 0.731504499912262, "learning_rate": 8.336393474939042e-06, "loss": 0.1819, "step": 20430 }, { "epoch": 430.3157894736842, "grad_norm": 0.5696659088134766, "learning_rate": 8.28303853845882e-06, "loss": 0.1879, "step": 20440 }, { "epoch": 430.5263157894737, "grad_norm": 0.5984363555908203, "learning_rate": 8.22984751505993e-06, "loss": 0.1897, "step": 20450 }, { "epoch": 430.7368421052632, "grad_norm": 0.7230535745620728, "learning_rate": 8.17682049980334e-06, "loss": 0.1815, "step": 20460 }, { "epoch": 430.94736842105266, "grad_norm": 0.4991123378276825, "learning_rate": 8.123957587456966e-06, "loss": 0.1735, "step": 20470 }, { "epoch": 431.1578947368421, "grad_norm": 0.5288248062133789, "learning_rate": 8.07125887249537e-06, "loss": 0.1841, "step": 20480 }, { "epoch": 431.36842105263156, "grad_norm": 0.6192090511322021, "learning_rate": 8.018724449099724e-06, "loss": 0.1746, "step": 20490 }, { "epoch": 431.57894736842104, "grad_norm": 0.7012197971343994, "learning_rate": 7.966354411157529e-06, "loss": 0.1866, "step": 20500 }, { "epoch": 431.7894736842105, "grad_norm": 0.7939274311065674, "learning_rate": 7.914148852262582e-06, "loss": 0.1971, "step": 20510 }, { "epoch": 432.0, "grad_norm": 0.5685390830039978, "learning_rate": 7.862107865714641e-06, "loss": 0.1882, "step": 20520 }, { "epoch": 432.2105263157895, "grad_norm": 0.6198951601982117, "learning_rate": 7.810231544519386e-06, "loss": 0.1767, "step": 20530 }, { "epoch": 432.42105263157896, "grad_norm": 0.4967138469219208, "learning_rate": 7.758519981388257e-06, "loss": 0.1871, "step": 20540 }, { "epoch": 432.63157894736844, "grad_norm": 0.6240216493606567, "learning_rate": 7.70697326873816e-06, "loss": 0.1821, "step": 20550 }, { "epoch": 432.8421052631579, "grad_norm": 0.43565139174461365, "learning_rate": 7.65559149869144e-06, "loss": 0.1816, "step": 20560 }, { "epoch": 433.05263157894734, "grad_norm": 0.5389068722724915, "learning_rate": 
7.604374763075639e-06, "loss": 0.1962, "step": 20570 }, { "epoch": 433.2631578947368, "grad_norm": 0.4656810462474823, "learning_rate": 7.553323153423409e-06, "loss": 0.1835, "step": 20580 }, { "epoch": 433.4736842105263, "grad_norm": 0.5822871327400208, "learning_rate": 7.502436760972198e-06, "loss": 0.1879, "step": 20590 }, { "epoch": 433.6842105263158, "grad_norm": 0.6489611864089966, "learning_rate": 7.451715676664284e-06, "loss": 0.1833, "step": 20600 }, { "epoch": 433.89473684210526, "grad_norm": 0.60507732629776, "learning_rate": 7.401159991146445e-06, "loss": 0.1779, "step": 20610 }, { "epoch": 434.10526315789474, "grad_norm": 0.5765190720558167, "learning_rate": 7.3507697947699075e-06, "loss": 0.1823, "step": 20620 }, { "epoch": 434.3157894736842, "grad_norm": 0.5958349108695984, "learning_rate": 7.30054517759009e-06, "loss": 0.1959, "step": 20630 }, { "epoch": 434.5263157894737, "grad_norm": 0.5677564144134521, "learning_rate": 7.250486229366582e-06, "loss": 0.1747, "step": 20640 }, { "epoch": 434.7368421052632, "grad_norm": 0.4779718816280365, "learning_rate": 7.2005930395627975e-06, "loss": 0.1864, "step": 20650 }, { "epoch": 434.94736842105266, "grad_norm": 0.6373128294944763, "learning_rate": 7.1508656973459655e-06, "loss": 0.1881, "step": 20660 }, { "epoch": 435.1578947368421, "grad_norm": 0.5015693306922913, "learning_rate": 7.101304291586897e-06, "loss": 0.1923, "step": 20670 }, { "epoch": 435.36842105263156, "grad_norm": 0.9822379946708679, "learning_rate": 7.051908910859884e-06, "loss": 0.19, "step": 20680 }, { "epoch": 435.57894736842104, "grad_norm": 0.5332909822463989, "learning_rate": 7.002679643442478e-06, "loss": 0.1895, "step": 20690 }, { "epoch": 435.7894736842105, "grad_norm": 0.6863967776298523, "learning_rate": 6.953616577315336e-06, "loss": 0.1824, "step": 20700 }, { "epoch": 436.0, "grad_norm": 0.8219735026359558, "learning_rate": 6.904719800162141e-06, "loss": 0.1844, "step": 20710 }, { "epoch": 436.2105263157895, "grad_norm": 
0.43607479333877563, "learning_rate": 6.855989399369345e-06, "loss": 0.1911, "step": 20720 }, { "epoch": 436.42105263157896, "grad_norm": 0.5909510254859924, "learning_rate": 6.807425462026096e-06, "loss": 0.1902, "step": 20730 }, { "epoch": 436.63157894736844, "grad_norm": 0.8456542491912842, "learning_rate": 6.75902807492399e-06, "loss": 0.1841, "step": 20740 }, { "epoch": 436.8421052631579, "grad_norm": 0.558668315410614, "learning_rate": 6.71079732455705e-06, "loss": 0.1807, "step": 20750 }, { "epoch": 437.05263157894734, "grad_norm": 0.581362783908844, "learning_rate": 6.662733297121415e-06, "loss": 0.1739, "step": 20760 }, { "epoch": 437.2631578947368, "grad_norm": 0.5437993407249451, "learning_rate": 6.614836078515285e-06, "loss": 0.1792, "step": 20770 }, { "epoch": 437.4736842105263, "grad_norm": 0.824394166469574, "learning_rate": 6.5671057543387985e-06, "loss": 0.197, "step": 20780 }, { "epoch": 437.6842105263158, "grad_norm": 0.4408915042877197, "learning_rate": 6.519542409893753e-06, "loss": 0.1877, "step": 20790 }, { "epoch": 437.89473684210526, "grad_norm": 0.5344646573066711, "learning_rate": 6.472146130183554e-06, "loss": 0.1764, "step": 20800 }, { "epoch": 438.10526315789474, "grad_norm": 0.5722460150718689, "learning_rate": 6.424916999913055e-06, "loss": 0.1904, "step": 20810 }, { "epoch": 438.3157894736842, "grad_norm": 0.6273982524871826, "learning_rate": 6.377855103488373e-06, "loss": 0.187, "step": 20820 }, { "epoch": 438.5263157894737, "grad_norm": 0.49412593245506287, "learning_rate": 6.330960525016716e-06, "loss": 0.1766, "step": 20830 }, { "epoch": 438.7368421052632, "grad_norm": 0.5377347469329834, "learning_rate": 6.284233348306334e-06, "loss": 0.1842, "step": 20840 }, { "epoch": 438.94736842105266, "grad_norm": 0.6035183668136597, "learning_rate": 6.237673656866238e-06, "loss": 0.1907, "step": 20850 }, { "epoch": 439.1578947368421, "grad_norm": 0.719294548034668, "learning_rate": 6.19128153390619e-06, "loss": 0.1936, "step": 20860 }, { 
"epoch": 439.36842105263156, "grad_norm": 0.5597818493843079, "learning_rate": 6.145057062336379e-06, "loss": 0.1819, "step": 20870 }, { "epoch": 439.57894736842104, "grad_norm": 0.5979058742523193, "learning_rate": 6.099000324767479e-06, "loss": 0.1813, "step": 20880 }, { "epoch": 439.7894736842105, "grad_norm": 0.5603199601173401, "learning_rate": 6.053111403510336e-06, "loss": 0.1794, "step": 20890 }, { "epoch": 440.0, "grad_norm": 1.0742629766464233, "learning_rate": 6.011954925113683e-06, "loss": 0.1906, "step": 20900 }, { "epoch": 440.2105263157895, "grad_norm": 0.6296944618225098, "learning_rate": 5.966385080540993e-06, "loss": 0.1886, "step": 20910 }, { "epoch": 440.42105263157896, "grad_norm": 0.5945459008216858, "learning_rate": 5.920983289285009e-06, "loss": 0.1768, "step": 20920 }, { "epoch": 440.63157894736844, "grad_norm": 0.504224419593811, "learning_rate": 5.8757496324860715e-06, "loss": 0.1877, "step": 20930 }, { "epoch": 440.8421052631579, "grad_norm": 0.6713933944702148, "learning_rate": 5.8306841909840816e-06, "loss": 0.1903, "step": 20940 }, { "epoch": 441.05263157894734, "grad_norm": 0.5135478973388672, "learning_rate": 5.78578704531828e-06, "loss": 0.1901, "step": 20950 }, { "epoch": 441.2631578947368, "grad_norm": 0.6023509502410889, "learning_rate": 5.74105827572714e-06, "loss": 0.1932, "step": 20960 }, { "epoch": 441.4736842105263, "grad_norm": 0.5241458415985107, "learning_rate": 5.696497962148218e-06, "loss": 0.1797, "step": 20970 }, { "epoch": 441.6842105263158, "grad_norm": 0.5880340933799744, "learning_rate": 5.652106184218042e-06, "loss": 0.1931, "step": 20980 }, { "epoch": 441.89473684210526, "grad_norm": 0.7315283417701721, "learning_rate": 5.607883021271898e-06, "loss": 0.1866, "step": 20990 }, { "epoch": 442.10526315789474, "grad_norm": 0.6057814359664917, "learning_rate": 5.5638285523437525e-06, "loss": 0.1749, "step": 21000 }, { "epoch": 442.3157894736842, "grad_norm": 0.7352088689804077, "learning_rate": 5.519942856166105e-06, 
"loss": 0.1888, "step": 21010 }, { "epoch": 442.5263157894737, "grad_norm": 0.7723200917243958, "learning_rate": 5.4762260111697714e-06, "loss": 0.1831, "step": 21020 }, { "epoch": 442.7368421052632, "grad_norm": 0.6795921921730042, "learning_rate": 5.432678095483878e-06, "loss": 0.1801, "step": 21030 }, { "epoch": 442.94736842105266, "grad_norm": 0.607151448726654, "learning_rate": 5.389299186935592e-06, "loss": 0.1814, "step": 21040 }, { "epoch": 443.1578947368421, "grad_norm": 0.6719841361045837, "learning_rate": 5.3460893630500664e-06, "loss": 0.1824, "step": 21050 }, { "epoch": 443.36842105263156, "grad_norm": 0.6265808343887329, "learning_rate": 5.3030487010502615e-06, "loss": 0.1916, "step": 21060 }, { "epoch": 443.57894736842104, "grad_norm": 0.6264698505401611, "learning_rate": 5.260177277856804e-06, "loss": 0.1817, "step": 21070 }, { "epoch": 443.7894736842105, "grad_norm": 0.5315179824829102, "learning_rate": 5.217475170087893e-06, "loss": 0.1863, "step": 21080 }, { "epoch": 444.0, "grad_norm": 0.7230284214019775, "learning_rate": 5.174942454059128e-06, "loss": 0.1912, "step": 21090 }, { "epoch": 444.2105263157895, "grad_norm": 0.6552962064743042, "learning_rate": 5.132579205783339e-06, "loss": 0.182, "step": 21100 }, { "epoch": 444.42105263157896, "grad_norm": 0.6705915331840515, "learning_rate": 5.0903855009705514e-06, "loss": 0.1784, "step": 21110 }, { "epoch": 444.63157894736844, "grad_norm": 0.6432520747184753, "learning_rate": 5.048361415027736e-06, "loss": 0.1842, "step": 21120 }, { "epoch": 444.8421052631579, "grad_norm": 0.5475180745124817, "learning_rate": 5.0065070230587485e-06, "loss": 0.2005, "step": 21130 }, { "epoch": 445.05263157894734, "grad_norm": 0.7086638808250427, "learning_rate": 4.964822399864189e-06, "loss": 0.1755, "step": 21140 }, { "epoch": 445.2631578947368, "grad_norm": 0.4936695098876953, "learning_rate": 4.92330761994122e-06, "loss": 0.1741, "step": 21150 }, { "epoch": 445.4736842105263, "grad_norm": 0.5482287406921387, 
"learning_rate": 4.8819627574835045e-06, "loss": 0.185, "step": 21160 }, { "epoch": 445.6842105263158, "grad_norm": 0.560600996017456, "learning_rate": 4.840787886380993e-06, "loss": 0.1802, "step": 21170 }, { "epoch": 445.89473684210526, "grad_norm": 0.8080334067344666, "learning_rate": 4.799783080219889e-06, "loss": 0.1843, "step": 21180 }, { "epoch": 446.10526315789474, "grad_norm": 0.4889693260192871, "learning_rate": 4.758948412282404e-06, "loss": 0.1859, "step": 21190 }, { "epoch": 446.3157894736842, "grad_norm": 0.4943747818470001, "learning_rate": 4.7182839555467095e-06, "loss": 0.1825, "step": 21200 }, { "epoch": 446.5263157894737, "grad_norm": 0.6152059435844421, "learning_rate": 4.677789782686781e-06, "loss": 0.1936, "step": 21210 }, { "epoch": 446.7368421052632, "grad_norm": 0.6645772457122803, "learning_rate": 4.6374659660722854e-06, "loss": 0.1878, "step": 21220 }, { "epoch": 446.94736842105266, "grad_norm": 0.6091228127479553, "learning_rate": 4.597312577768431e-06, "loss": 0.182, "step": 21230 }, { "epoch": 447.1578947368421, "grad_norm": 0.5711859464645386, "learning_rate": 4.557329689535794e-06, "loss": 0.1766, "step": 21240 }, { "epoch": 447.36842105263156, "grad_norm": 0.5759438872337341, "learning_rate": 4.517517372830315e-06, "loss": 0.1906, "step": 21250 }, { "epoch": 447.57894736842104, "grad_norm": 0.6798800230026245, "learning_rate": 4.477875698803025e-06, "loss": 0.179, "step": 21260 }, { "epoch": 447.7894736842105, "grad_norm": 0.6980090141296387, "learning_rate": 4.438404738300061e-06, "loss": 0.1753, "step": 21270 }, { "epoch": 448.0, "grad_norm": 0.8547975420951843, "learning_rate": 4.399104561862411e-06, "loss": 0.1939, "step": 21280 }, { "epoch": 448.2105263157895, "grad_norm": 0.7332622408866882, "learning_rate": 4.359975239725878e-06, "loss": 0.1973, "step": 21290 }, { "epoch": 448.42105263157896, "grad_norm": 0.6926571130752563, "learning_rate": 4.321016841820879e-06, "loss": 0.1766, "step": 21300 }, { "epoch": 
448.63157894736844, "grad_norm": 0.6712207198143005, "learning_rate": 4.2822294377724e-06, "loss": 0.176, "step": 21310 }, { "epoch": 448.8421052631579, "grad_norm": 0.5506131649017334, "learning_rate": 4.243613096899823e-06, "loss": 0.1944, "step": 21320 }, { "epoch": 449.05263157894734, "grad_norm": 0.4963815212249756, "learning_rate": 4.2090047061760115e-06, "loss": 0.1906, "step": 21330 }, { "epoch": 449.2631578947368, "grad_norm": 0.5332702398300171, "learning_rate": 4.1707135752175e-06, "loss": 0.1775, "step": 21340 }, { "epoch": 449.4736842105263, "grad_norm": 0.5114326477050781, "learning_rate": 4.1325937067318245e-06, "loss": 0.1875, "step": 21350 }, { "epoch": 449.6842105263158, "grad_norm": 0.599938690662384, "learning_rate": 4.094645168845379e-06, "loss": 0.1852, "step": 21360 }, { "epoch": 449.89473684210526, "grad_norm": 0.6404860019683838, "learning_rate": 4.056868029378314e-06, "loss": 0.1804, "step": 21370 }, { "epoch": 450.10526315789474, "grad_norm": 0.6003533601760864, "learning_rate": 4.019262355844533e-06, "loss": 0.1909, "step": 21380 }, { "epoch": 450.3157894736842, "grad_norm": 0.5360564589500427, "learning_rate": 3.981828215451477e-06, "loss": 0.1886, "step": 21390 }, { "epoch": 450.5263157894737, "grad_norm": 0.5857632756233215, "learning_rate": 3.944565675099999e-06, "loss": 0.1885, "step": 21400 }, { "epoch": 450.7368421052632, "grad_norm": 0.4886971712112427, "learning_rate": 3.907474801384326e-06, "loss": 0.171, "step": 21410 }, { "epoch": 450.94736842105266, "grad_norm": 0.6512584090232849, "learning_rate": 3.870555660591846e-06, "loss": 0.1885, "step": 21420 }, { "epoch": 451.1578947368421, "grad_norm": 0.5677419304847717, "learning_rate": 3.833808318703058e-06, "loss": 0.1778, "step": 21430 }, { "epoch": 451.36842105263156, "grad_norm": 0.5320295095443726, "learning_rate": 3.797232841391407e-06, "loss": 0.1895, "step": 21440 }, { "epoch": 451.57894736842104, "grad_norm": 0.47199392318725586, "learning_rate": 3.760829294023227e-06, 
"loss": 0.1821, "step": 21450 }, { "epoch": 451.7894736842105, "grad_norm": 0.626926064491272, "learning_rate": 3.724597741657543e-06, "loss": 0.1892, "step": 21460 }, { "epoch": 452.0, "grad_norm": 0.5806630849838257, "learning_rate": 3.688538249046003e-06, "loss": 0.1848, "step": 21470 }, { "epoch": 452.2105263157895, "grad_norm": 0.46587905287742615, "learning_rate": 3.652650880632802e-06, "loss": 0.1711, "step": 21480 }, { "epoch": 452.42105263157896, "grad_norm": 0.8197913765907288, "learning_rate": 3.616935700554458e-06, "loss": 0.1931, "step": 21490 }, { "epoch": 452.63157894736844, "grad_norm": 0.5315291285514832, "learning_rate": 3.5813927726397913e-06, "loss": 0.1861, "step": 21500 }, { "epoch": 452.8421052631579, "grad_norm": 0.6884347796440125, "learning_rate": 3.546022160409779e-06, "loss": 0.1856, "step": 21510 }, { "epoch": 453.05263157894734, "grad_norm": 0.8013739585876465, "learning_rate": 3.5108239270774446e-06, "loss": 0.1943, "step": 21520 }, { "epoch": 453.2631578947368, "grad_norm": 0.720817506313324, "learning_rate": 3.4757981355477363e-06, "loss": 0.1862, "step": 21530 }, { "epoch": 453.4736842105263, "grad_norm": 0.7499425411224365, "learning_rate": 3.4409448484174157e-06, "loss": 0.1908, "step": 21540 }, { "epoch": 453.6842105263158, "grad_norm": 0.7999539375305176, "learning_rate": 3.4062641279749674e-06, "loss": 0.1806, "step": 21550 }, { "epoch": 453.89473684210526, "grad_norm": 0.490826278924942, "learning_rate": 3.3717560362004574e-06, "loss": 0.1734, "step": 21560 }, { "epoch": 454.10526315789474, "grad_norm": 0.5968190431594849, "learning_rate": 3.3374206347654426e-06, "loss": 0.2056, "step": 21570 }, { "epoch": 454.3157894736842, "grad_norm": 0.5196442604064941, "learning_rate": 3.3032579850328595e-06, "loss": 0.1799, "step": 21580 }, { "epoch": 454.5263157894737, "grad_norm": 0.6417738795280457, "learning_rate": 3.269268148056892e-06, "loss": 0.1891, "step": 21590 }, { "epoch": 454.7368421052632, "grad_norm": 0.7515450119972229, 
"learning_rate": 3.235451184582894e-06, "loss": 0.1913, "step": 21600 }, { "epoch": 454.94736842105266, "grad_norm": 0.5569813847541809, "learning_rate": 3.201807155047254e-06, "loss": 0.1714, "step": 21610 }, { "epoch": 455.1578947368421, "grad_norm": 0.7771729230880737, "learning_rate": 3.168336119577331e-06, "loss": 0.1859, "step": 21620 }, { "epoch": 455.36842105263156, "grad_norm": 0.5104111433029175, "learning_rate": 3.1350381379912753e-06, "loss": 0.1872, "step": 21630 }, { "epoch": 455.57894736842104, "grad_norm": 0.46235164999961853, "learning_rate": 3.1019132697979623e-06, "loss": 0.1865, "step": 21640 }, { "epoch": 455.7894736842105, "grad_norm": 0.5363319516181946, "learning_rate": 3.068961574196938e-06, "loss": 0.1858, "step": 21650 }, { "epoch": 456.0, "grad_norm": 0.567591667175293, "learning_rate": 3.036183110078217e-06, "loss": 0.1714, "step": 21660 }, { "epoch": 456.2105263157895, "grad_norm": 0.7822842001914978, "learning_rate": 3.003577936022195e-06, "loss": 0.1813, "step": 21670 }, { "epoch": 456.42105263157896, "grad_norm": 0.5344955325126648, "learning_rate": 2.9711461102996383e-06, "loss": 0.1761, "step": 21680 }, { "epoch": 456.63157894736844, "grad_norm": 0.721997082233429, "learning_rate": 2.9388876908714834e-06, "loss": 0.1899, "step": 21690 }, { "epoch": 456.8421052631579, "grad_norm": 0.7189823389053345, "learning_rate": 2.906802735388736e-06, "loss": 0.1792, "step": 21700 }, { "epoch": 457.05263157894734, "grad_norm": 0.545745849609375, "learning_rate": 2.8748913011924174e-06, "loss": 0.188, "step": 21710 }, { "epoch": 457.2631578947368, "grad_norm": 0.5995526313781738, "learning_rate": 2.84315344531344e-06, "loss": 0.1845, "step": 21720 }, { "epoch": 457.4736842105263, "grad_norm": 0.5464062094688416, "learning_rate": 2.8115892244724993e-06, "loss": 0.1966, "step": 21730 }, { "epoch": 457.6842105263158, "grad_norm": 0.7825399041175842, "learning_rate": 2.780198695079972e-06, "loss": 0.1815, "step": 21740 }, { "epoch": 
457.89473684210526, "grad_norm": 0.5444468259811401, "learning_rate": 2.7489819132358265e-06, "loss": 0.1789, "step": 21750 }, { "epoch": 458.10526315789474, "grad_norm": 0.5473366379737854, "learning_rate": 2.7179389347295137e-06, "loss": 0.1948, "step": 21760 }, { "epoch": 458.3157894736842, "grad_norm": 0.5430154800415039, "learning_rate": 2.6870698150398664e-06, "loss": 0.1827, "step": 21770 }, { "epoch": 458.5263157894737, "grad_norm": 0.6059640645980835, "learning_rate": 2.6563746093349996e-06, "loss": 0.1981, "step": 21780 }, { "epoch": 458.7368421052632, "grad_norm": 0.594071626663208, "learning_rate": 2.625853372472231e-06, "loss": 0.181, "step": 21790 }, { "epoch": 458.94736842105266, "grad_norm": 0.6014748811721802, "learning_rate": 2.5955061589979734e-06, "loss": 0.1857, "step": 21800 }, { "epoch": 459.1578947368421, "grad_norm": 0.6244844794273376, "learning_rate": 2.565333023147587e-06, "loss": 0.1709, "step": 21810 }, { "epoch": 459.36842105263156, "grad_norm": 0.5881552696228027, "learning_rate": 2.5353340188453923e-06, "loss": 0.181, "step": 21820 }, { "epoch": 459.57894736842104, "grad_norm": 0.6716004610061646, "learning_rate": 2.5055091997044587e-06, "loss": 0.1802, "step": 21830 }, { "epoch": 459.7894736842105, "grad_norm": 0.7322579622268677, "learning_rate": 2.475858619026572e-06, "loss": 0.1886, "step": 21840 }, { "epoch": 460.0, "grad_norm": 0.5972902178764343, "learning_rate": 2.4463823298021103e-06, "loss": 0.1982, "step": 21850 }, { "epoch": 460.2105263157895, "grad_norm": 0.6081510782241821, "learning_rate": 2.417080384710013e-06, "loss": 0.1767, "step": 21860 }, { "epoch": 460.42105263157896, "grad_norm": 0.6108368039131165, "learning_rate": 2.387952836117602e-06, "loss": 0.1859, "step": 21870 }, { "epoch": 460.63157894736844, "grad_norm": 0.5147649049758911, "learning_rate": 2.3589997360805025e-06, "loss": 0.182, "step": 21880 }, { "epoch": 460.8421052631579, "grad_norm": 0.5353897213935852, "learning_rate": 2.330221136342625e-06, 
"loss": 0.1764, "step": 21890 }, { "epoch": 461.05263157894734, "grad_norm": 0.6796343922615051, "learning_rate": 2.3016170883359835e-06, "loss": 0.2011, "step": 21900 }, { "epoch": 461.2631578947368, "grad_norm": 0.6354873776435852, "learning_rate": 2.273187643180652e-06, "loss": 0.1889, "step": 21910 }, { "epoch": 461.4736842105263, "grad_norm": 0.543091356754303, "learning_rate": 2.2449328516846556e-06, "loss": 0.1804, "step": 21920 }, { "epoch": 461.6842105263158, "grad_norm": 0.5700305104255676, "learning_rate": 2.216852764343902e-06, "loss": 0.1827, "step": 21930 }, { "epoch": 461.89473684210526, "grad_norm": 0.569500207901001, "learning_rate": 2.1889474313420477e-06, "loss": 0.1872, "step": 21940 }, { "epoch": 462.10526315789474, "grad_norm": 0.6591985821723938, "learning_rate": 2.1612169025504446e-06, "loss": 0.1858, "step": 21950 }, { "epoch": 462.3157894736842, "grad_norm": 0.5844663381576538, "learning_rate": 2.1336612275280497e-06, "loss": 0.1866, "step": 21960 }, { "epoch": 462.5263157894737, "grad_norm": 0.5001121759414673, "learning_rate": 2.1062804555213255e-06, "loss": 0.1761, "step": 21970 }, { "epoch": 462.7368421052632, "grad_norm": 0.5647687911987305, "learning_rate": 2.079074635464129e-06, "loss": 0.1863, "step": 21980 }, { "epoch": 462.94736842105266, "grad_norm": 0.6637557744979858, "learning_rate": 2.0520438159777e-06, "loss": 0.1864, "step": 21990 }, { "epoch": 463.1578947368421, "grad_norm": 0.9768401384353638, "learning_rate": 2.0251880453704963e-06, "loss": 0.1822, "step": 22000 }, { "epoch": 463.36842105263156, "grad_norm": 0.6367493271827698, "learning_rate": 1.998507371638114e-06, "loss": 0.1967, "step": 22010 }, { "epoch": 463.57894736842104, "grad_norm": 0.6616984605789185, "learning_rate": 1.972001842463245e-06, "loss": 0.1843, "step": 22020 }, { "epoch": 463.7894736842105, "grad_norm": 1.017850399017334, "learning_rate": 1.945671505215574e-06, "loss": 0.1819, "step": 22030 }, { "epoch": 464.0, "grad_norm": 0.6348969340324402, 
"learning_rate": 1.9195164069516936e-06, "loss": 0.1739, "step": 22040 }, { "epoch": 464.2105263157895, "grad_norm": 0.8872740864753723, "learning_rate": 1.8935365944149908e-06, "loss": 0.1858, "step": 22050 }, { "epoch": 464.42105263157896, "grad_norm": 0.7518464922904968, "learning_rate": 1.867732114035614e-06, "loss": 0.1923, "step": 22060 }, { "epoch": 464.63157894736844, "grad_norm": 0.5332704782485962, "learning_rate": 1.8421030119303407e-06, "loss": 0.1698, "step": 22070 }, { "epoch": 464.8421052631579, "grad_norm": 0.6198592185974121, "learning_rate": 1.8166493339025426e-06, "loss": 0.1872, "step": 22080 }, { "epoch": 465.05263157894734, "grad_norm": 0.5435618162155151, "learning_rate": 1.791371125442065e-06, "loss": 0.1871, "step": 22090 }, { "epoch": 465.2631578947368, "grad_norm": 0.5456479787826538, "learning_rate": 1.7662684317251598e-06, "loss": 0.1755, "step": 22100 }, { "epoch": 465.4736842105263, "grad_norm": 0.5278346538543701, "learning_rate": 1.7413412976144294e-06, "loss": 0.1989, "step": 22110 }, { "epoch": 465.6842105263158, "grad_norm": 0.5534194707870483, "learning_rate": 1.7165897676586717e-06, "loss": 0.175, "step": 22120 }, { "epoch": 465.89473684210526, "grad_norm": 0.6580941081047058, "learning_rate": 1.6920138860929246e-06, "loss": 0.1784, "step": 22130 }, { "epoch": 466.10526315789474, "grad_norm": 0.6645970344543457, "learning_rate": 1.6676136968382328e-06, "loss": 0.1935, "step": 22140 }, { "epoch": 466.3157894736842, "grad_norm": 0.5245230197906494, "learning_rate": 1.643389243501725e-06, "loss": 0.1813, "step": 22150 }, { "epoch": 466.5263157894737, "grad_norm": 0.5805260539054871, "learning_rate": 1.619340569376404e-06, "loss": 0.1731, "step": 22160 }, { "epoch": 466.7368421052632, "grad_norm": 0.6286890506744385, "learning_rate": 1.5954677174411681e-06, "loss": 0.1829, "step": 22170 }, { "epoch": 466.94736842105266, "grad_norm": 0.5264356732368469, "learning_rate": 1.5717707303606555e-06, "loss": 0.1884, "step": 22180 }, { 
"epoch": 467.1578947368421, "grad_norm": 0.5247817039489746, "learning_rate": 1.548249650485234e-06, "loss": 0.1781, "step": 22190 }, { "epoch": 467.36842105263156, "grad_norm": 0.8821797966957092, "learning_rate": 1.5249045198508893e-06, "loss": 0.1869, "step": 22200 }, { "epoch": 467.57894736842104, "grad_norm": 0.5761975049972534, "learning_rate": 1.5017353801791589e-06, "loss": 0.1823, "step": 22210 }, { "epoch": 467.7894736842105, "grad_norm": 0.5740618109703064, "learning_rate": 1.4787422728770316e-06, "loss": 0.1841, "step": 22220 }, { "epoch": 468.0, "grad_norm": 0.5789286494255066, "learning_rate": 1.4559252390369483e-06, "loss": 0.1863, "step": 22230 }, { "epoch": 468.2105263157895, "grad_norm": 0.4390312731266022, "learning_rate": 1.433284319436623e-06, "loss": 0.1852, "step": 22240 }, { "epoch": 468.42105263157896, "grad_norm": 0.7425456047058105, "learning_rate": 1.4108195545390557e-06, "loss": 0.1847, "step": 22250 }, { "epoch": 468.63157894736844, "grad_norm": 0.5220855474472046, "learning_rate": 1.388530984492431e-06, "loss": 0.1767, "step": 22260 }, { "epoch": 468.8421052631579, "grad_norm": 0.8341360092163086, "learning_rate": 1.36641864913003e-06, "loss": 0.1947, "step": 22270 }, { "epoch": 469.05263157894734, "grad_norm": 0.5250037312507629, "learning_rate": 1.3444825879701973e-06, "loss": 0.1934, "step": 22280 }, { "epoch": 469.2631578947368, "grad_norm": 0.5445212721824646, "learning_rate": 1.3227228402162061e-06, "loss": 0.1716, "step": 22290 }, { "epoch": 469.4736842105263, "grad_norm": 0.6371374130249023, "learning_rate": 1.301139444756272e-06, "loss": 0.1891, "step": 22300 }, { "epoch": 469.6842105263158, "grad_norm": 0.5438109636306763, "learning_rate": 1.279732440163417e-06, "loss": 0.1868, "step": 22310 }, { "epoch": 469.89473684210526, "grad_norm": 0.9809474349021912, "learning_rate": 1.2585018646954273e-06, "loss": 0.1901, "step": 22320 }, { "epoch": 470.10526315789474, "grad_norm": 0.5445967316627502, "learning_rate": 
1.237447756294785e-06, "loss": 0.1909, "step": 22330 }, { "epoch": 470.3157894736842, "grad_norm": 0.5317168235778809, "learning_rate": 1.216570152588603e-06, "loss": 0.1852, "step": 22340 }, { "epoch": 470.5263157894737, "grad_norm": 0.9151925444602966, "learning_rate": 1.195869090888524e-06, "loss": 0.1809, "step": 22350 }, { "epoch": 470.7368421052632, "grad_norm": 0.6518405675888062, "learning_rate": 1.1753446081907205e-06, "loss": 0.1738, "step": 22360 }, { "epoch": 470.94736842105266, "grad_norm": 0.7550083994865417, "learning_rate": 1.1549967411757734e-06, "loss": 0.1979, "step": 22370 }, { "epoch": 471.1578947368421, "grad_norm": 0.5095014572143555, "learning_rate": 1.134825526208605e-06, "loss": 0.1843, "step": 22380 }, { "epoch": 471.36842105263156, "grad_norm": 0.7665607333183289, "learning_rate": 1.1148309993384454e-06, "loss": 0.1755, "step": 22390 }, { "epoch": 471.57894736842104, "grad_norm": 0.5469737648963928, "learning_rate": 1.0950131962987774e-06, "loss": 0.1837, "step": 22400 }, { "epoch": 471.7894736842105, "grad_norm": 0.6690005660057068, "learning_rate": 1.0753721525072147e-06, "loss": 0.1928, "step": 22410 }, { "epoch": 472.0, "grad_norm": 0.8379321098327637, "learning_rate": 1.0559079030654895e-06, "loss": 0.1947, "step": 22420 }, { "epoch": 472.2105263157895, "grad_norm": 0.5236616134643555, "learning_rate": 1.0366204827593652e-06, "loss": 0.1857, "step": 22430 }, { "epoch": 472.42105263157896, "grad_norm": 0.5310667753219604, "learning_rate": 1.0175099260586018e-06, "loss": 0.1799, "step": 22440 }, { "epoch": 472.63157894736844, "grad_norm": 0.6408274173736572, "learning_rate": 9.985762671168576e-07, "loss": 0.1812, "step": 22450 }, { "epoch": 472.8421052631579, "grad_norm": 0.533226728439331, "learning_rate": 9.798195397716315e-07, "loss": 0.1825, "step": 22460 }, { "epoch": 473.05263157894734, "grad_norm": 0.5859424471855164, "learning_rate": 9.61239777544276e-07, "loss": 0.1839, "step": 22470 }, { "epoch": 473.2631578947368, 
"grad_norm": 0.5590562224388123, "learning_rate": 9.428370136398079e-07, "loss": 0.182, "step": 22480 }, { "epoch": 473.4736842105263, "grad_norm": 0.48322606086730957, "learning_rate": 9.246112809469521e-07, "loss": 0.1899, "step": 22490 }, { "epoch": 473.6842105263158, "grad_norm": 0.7842751741409302, "learning_rate": 9.065626120380643e-07, "loss": 0.1917, "step": 22500 }, { "epoch": 473.89473684210526, "grad_norm": 0.6499484181404114, "learning_rate": 8.886910391690206e-07, "loss": 0.1824, "step": 22510 }, { "epoch": 474.10526315789474, "grad_norm": 0.5859502553939819, "learning_rate": 8.709965942792386e-07, "loss": 0.1774, "step": 22520 }, { "epoch": 474.3157894736842, "grad_norm": 0.552681028842926, "learning_rate": 8.53479308991556e-07, "loss": 0.1796, "step": 22530 }, { "epoch": 474.5263157894737, "grad_norm": 0.6872820854187012, "learning_rate": 8.361392146121972e-07, "loss": 0.1819, "step": 22540 }, { "epoch": 474.7368421052632, "grad_norm": 0.5292307734489441, "learning_rate": 8.189763421307284e-07, "loss": 0.1897, "step": 22550 }, { "epoch": 474.94736842105266, "grad_norm": 0.4911171495914459, "learning_rate": 8.019907222199807e-07, "loss": 0.1706, "step": 22560 }, { "epoch": 475.1578947368421, "grad_norm": 0.6938400268554688, "learning_rate": 7.851823852360163e-07, "loss": 0.1977, "step": 22570 }, { "epoch": 475.36842105263156, "grad_norm": 0.5019783973693848, "learning_rate": 7.685513612180506e-07, "loss": 0.1746, "step": 22580 }, { "epoch": 475.57894736842104, "grad_norm": 0.5230517983436584, "learning_rate": 7.520976798884194e-07, "loss": 0.1827, "step": 22590 }, { "epoch": 475.7894736842105, "grad_norm": 0.5872176885604858, "learning_rate": 7.35821370652523e-07, "loss": 0.1827, "step": 22600 }, { "epoch": 476.0, "grad_norm": 0.861818253993988, "learning_rate": 7.197224625987819e-07, "loss": 0.1873, "step": 22610 }, { "epoch": 476.2105263157895, "grad_norm": 0.5507122278213501, "learning_rate": 7.038009844985149e-07, "loss": 0.1974, "step": 22620 }, 
{ "epoch": 476.42105263157896, "grad_norm": 0.5509169101715088, "learning_rate": 6.880569648060275e-07, "loss": 0.1734, "step": 22630 }, { "epoch": 476.63157894736844, "grad_norm": 0.4984310269355774, "learning_rate": 6.724904316584124e-07, "loss": 0.1843, "step": 22640 }, { "epoch": 476.8421052631579, "grad_norm": 0.6479700207710266, "learning_rate": 6.571014128755937e-07, "loss": 0.1784, "step": 22650 }, { "epoch": 477.05263157894734, "grad_norm": 0.49195757508277893, "learning_rate": 6.418899359602381e-07, "loss": 0.1862, "step": 22660 }, { "epoch": 477.2631578947368, "grad_norm": 0.6385235786437988, "learning_rate": 6.26856028097722e-07, "loss": 0.1802, "step": 22670 }, { "epoch": 477.4736842105263, "grad_norm": 0.599315881729126, "learning_rate": 6.119997161560975e-07, "loss": 0.1821, "step": 22680 }, { "epoch": 477.6842105263158, "grad_norm": 0.5851359963417053, "learning_rate": 5.973210266859708e-07, "loss": 0.1878, "step": 22690 }, { "epoch": 477.89473684210526, "grad_norm": 0.5468791127204895, "learning_rate": 5.828199859205574e-07, "loss": 0.1882, "step": 22700 }, { "epoch": 478.10526315789474, "grad_norm": 0.7265982031822205, "learning_rate": 5.684966197755715e-07, "loss": 0.1906, "step": 22710 }, { "epoch": 478.3157894736842, "grad_norm": 0.5022777915000916, "learning_rate": 5.543509538491809e-07, "loss": 0.1756, "step": 22720 }, { "epoch": 478.5263157894737, "grad_norm": 0.501018762588501, "learning_rate": 5.403830134219856e-07, "loss": 0.1757, "step": 22730 }, { "epoch": 478.7368421052632, "grad_norm": 0.5156716108322144, "learning_rate": 5.265928234569617e-07, "loss": 0.1961, "step": 22740 }, { "epoch": 478.94736842105266, "grad_norm": 0.6293845772743225, "learning_rate": 5.129804085994284e-07, "loss": 0.1811, "step": 22750 }, { "epoch": 479.1578947368421, "grad_norm": 0.6754159927368164, "learning_rate": 4.995457931769477e-07, "loss": 0.1861, "step": 22760 }, { "epoch": 479.36842105263156, "grad_norm": 0.5567587018013, "learning_rate": 
4.862890011993915e-07, "loss": 0.1847, "step": 22770 }, { "epoch": 479.57894736842104, "grad_norm": 0.713141143321991, "learning_rate": 4.732100563587638e-07, "loss": 0.1737, "step": 22780 }, { "epoch": 479.7894736842105, "grad_norm": 0.5813489556312561, "learning_rate": 4.6030898202928943e-07, "loss": 0.1858, "step": 22790 }, { "epoch": 480.0, "grad_norm": 0.5960988998413086, "learning_rate": 4.475858012672474e-07, "loss": 0.1832, "step": 22800 }, { "epoch": 480.2105263157895, "grad_norm": 0.7617952227592468, "learning_rate": 4.350405368110488e-07, "loss": 0.19, "step": 22810 }, { "epoch": 480.42105263157896, "grad_norm": 0.47482919692993164, "learning_rate": 4.226732110811149e-07, "loss": 0.1753, "step": 22820 }, { "epoch": 480.63157894736844, "grad_norm": 0.8491657972335815, "learning_rate": 4.1048384617985435e-07, "loss": 0.1899, "step": 22830 }, { "epoch": 480.8421052631579, "grad_norm": 0.5300630927085876, "learning_rate": 3.984724638916415e-07, "loss": 0.1823, "step": 22840 }, { "epoch": 481.05263157894734, "grad_norm": 0.4608703553676605, "learning_rate": 3.866390856827495e-07, "loss": 0.1805, "step": 22850 }, { "epoch": 481.2631578947368, "grad_norm": 0.6642935276031494, "learning_rate": 3.749837327013728e-07, "loss": 0.1783, "step": 22860 }, { "epoch": 481.4736842105263, "grad_norm": 0.6894769072532654, "learning_rate": 3.635064257774934e-07, "loss": 0.1898, "step": 22870 }, { "epoch": 481.6842105263158, "grad_norm": 0.5321648120880127, "learning_rate": 3.5220718542292583e-07, "loss": 0.1824, "step": 22880 }, { "epoch": 481.89473684210526, "grad_norm": 0.6732971668243408, "learning_rate": 3.410860318312614e-07, "loss": 0.186, "step": 22890 }, { "epoch": 482.10526315789474, "grad_norm": 0.7677638530731201, "learning_rate": 3.301429848777793e-07, "loss": 0.1756, "step": 22900 }, { "epoch": 482.3157894736842, "grad_norm": 0.7483295798301697, "learning_rate": 3.19378064119491e-07, "loss": 0.1943, "step": 22910 }, { "epoch": 482.5263157894737, "grad_norm": 
0.49219393730163574, "learning_rate": 3.087912887950517e-07, "loss": 0.1763, "step": 22920 }, { "epoch": 482.7368421052632, "grad_norm": 0.5193999409675598, "learning_rate": 2.983826778247489e-07, "loss": 0.1858, "step": 22930 }, { "epoch": 482.94736842105266, "grad_norm": 0.6139624118804932, "learning_rate": 2.881522498104472e-07, "loss": 0.1908, "step": 22940 }, { "epoch": 483.1578947368421, "grad_norm": 0.6963362097740173, "learning_rate": 2.781000230356101e-07, "loss": 0.2133, "step": 22950 }, { "epoch": 483.36842105263156, "grad_norm": 0.5296799540519714, "learning_rate": 2.682260154651672e-07, "loss": 0.1714, "step": 22960 }, { "epoch": 483.57894736842104, "grad_norm": 0.5533150434494019, "learning_rate": 2.5853024474556953e-07, "loss": 0.194, "step": 22970 }, { "epoch": 483.7894736842105, "grad_norm": 0.55152827501297, "learning_rate": 2.4901272820475605e-07, "loss": 0.1859, "step": 22980 }, { "epoch": 484.0, "grad_norm": 1.465172290802002, "learning_rate": 2.3967348285205416e-07, "loss": 0.1798, "step": 22990 }, { "epoch": 484.2105263157895, "grad_norm": 0.6252908706665039, "learning_rate": 2.3051252537820145e-07, "loss": 0.1796, "step": 23000 }, { "epoch": 484.42105263157896, "grad_norm": 0.4841025173664093, "learning_rate": 2.2152987215534604e-07, "loss": 0.1842, "step": 23010 }, { "epoch": 484.63157894736844, "grad_norm": 0.5873754024505615, "learning_rate": 2.1272553923691317e-07, "loss": 0.1686, "step": 23020 }, { "epoch": 484.8421052631579, "grad_norm": 0.7624306678771973, "learning_rate": 2.0409954235769414e-07, "loss": 0.1874, "step": 23030 }, { "epoch": 485.05263157894734, "grad_norm": 0.6849631667137146, "learning_rate": 1.9565189693373508e-07, "loss": 0.187, "step": 23040 }, { "epoch": 485.2631578947368, "grad_norm": 0.5415328741073608, "learning_rate": 1.8738261806234837e-07, "loss": 0.1719, "step": 23050 }, { "epoch": 485.4736842105263, "grad_norm": 0.6806001663208008, "learning_rate": 1.7929172052207898e-07, "loss": 0.1908, "step": 23060 }, { 
"epoch": 485.6842105263158, "grad_norm": 0.8779856562614441, "learning_rate": 1.713792187726604e-07, "loss": 0.1895, "step": 23070 }, { "epoch": 485.89473684210526, "grad_norm": 0.6320582032203674, "learning_rate": 1.6364512695503654e-07, "loss": 0.1895, "step": 23080 }, { "epoch": 486.10526315789474, "grad_norm": 0.5849341154098511, "learning_rate": 1.5608945889127314e-07, "loss": 0.1835, "step": 23090 }, { "epoch": 486.3157894736842, "grad_norm": 0.49242666363716125, "learning_rate": 1.4871222808456874e-07, "loss": 0.1755, "step": 23100 }, { "epoch": 486.5263157894737, "grad_norm": 0.6123622059822083, "learning_rate": 1.415134477192437e-07, "loss": 0.1888, "step": 23110 }, { "epoch": 486.7368421052632, "grad_norm": 0.6847497224807739, "learning_rate": 1.3449313066067337e-07, "loss": 0.1761, "step": 23120 }, { "epoch": 486.94736842105266, "grad_norm": 0.6061682105064392, "learning_rate": 1.2765128945531057e-07, "loss": 0.1884, "step": 23130 }, { "epoch": 487.1578947368421, "grad_norm": 0.5884981155395508, "learning_rate": 1.209879363306299e-07, "loss": 0.188, "step": 23140 }, { "epoch": 487.36842105263156, "grad_norm": 0.5741125345230103, "learning_rate": 1.145030831951277e-07, "loss": 0.1862, "step": 23150 }, { "epoch": 487.57894736842104, "grad_norm": 0.6427663564682007, "learning_rate": 1.0819674163828897e-07, "loss": 0.1893, "step": 23160 }, { "epoch": 487.7894736842105, "grad_norm": 0.5162391066551208, "learning_rate": 1.0206892293055382e-07, "loss": 0.1799, "step": 23170 }, { "epoch": 488.0, "grad_norm": 0.8966327905654907, "learning_rate": 9.611963802335089e-08, "loss": 0.1892, "step": 23180 }, { "epoch": 488.2105263157895, "grad_norm": 0.7478238940238953, "learning_rate": 9.034889754900855e-08, "loss": 0.1786, "step": 23190 }, { "epoch": 488.42105263157896, "grad_norm": 0.6039992570877075, "learning_rate": 8.475671182076595e-08, "loss": 0.1848, "step": 23200 }, { "epoch": 488.63157894736844, "grad_norm": 0.6707776784896851, "learning_rate": 
7.934309083278413e-08, "loss": 0.1813, "step": 23210 }, { "epoch": 488.8421052631579, "grad_norm": 0.5292564034461975, "learning_rate": 7.410804426005724e-08, "loss": 0.196, "step": 23220 }, { "epoch": 489.05263157894734, "grad_norm": 0.6198012232780457, "learning_rate": 6.905158145847913e-08, "loss": 0.1719, "step": 23230 }, { "epoch": 489.2631578947368, "grad_norm": 0.5900288224220276, "learning_rate": 6.417371146476559e-08, "loss": 0.172, "step": 23240 }, { "epoch": 489.4736842105263, "grad_norm": 0.5824329257011414, "learning_rate": 5.947444299646554e-08, "loss": 0.1932, "step": 23250 }, { "epoch": 489.6842105263158, "grad_norm": 0.4687194526195526, "learning_rate": 5.495378445192767e-08, "loss": 0.1689, "step": 23260 }, { "epoch": 489.89473684210526, "grad_norm": 0.7158029675483704, "learning_rate": 5.0611743910300436e-08, "loss": 0.1915, "step": 23270 }, { "epoch": 490.10526315789474, "grad_norm": 0.6645452380180359, "learning_rate": 4.644832913152097e-08, "loss": 0.1968, "step": 23280 }, { "epoch": 490.3157894736842, "grad_norm": 0.6376003623008728, "learning_rate": 4.246354755628179e-08, "loss": 0.1806, "step": 23290 }, { "epoch": 490.5263157894737, "grad_norm": 0.8974951505661011, "learning_rate": 3.8657406306030764e-08, "loss": 0.1905, "step": 23300 }, { "epoch": 490.7368421052632, "grad_norm": 0.5921673774719238, "learning_rate": 3.502991218296003e-08, "loss": 0.1861, "step": 23310 }, { "epoch": 490.94736842105266, "grad_norm": 0.892542839050293, "learning_rate": 3.1581071670006015e-08, "loss": 0.1832, "step": 23320 }, { "epoch": 491.1578947368421, "grad_norm": 0.5395411849021912, "learning_rate": 2.8310890930782763e-08, "loss": 0.1806, "step": 23330 }, { "epoch": 491.36842105263156, "grad_norm": 0.6547475457191467, "learning_rate": 2.5219375809637514e-08, "loss": 0.192, "step": 23340 }, { "epoch": 491.57894736842104, "grad_norm": 0.6222972273826599, "learning_rate": 2.230653183162845e-08, "loss": 0.1725, "step": 23350 }, { "epoch": 491.7894736842105, 
"grad_norm": 0.6540040969848633, "learning_rate": 1.9572364202458115e-08, "loss": 0.1761, "step": 23360 }, { "epoch": 492.0, "grad_norm": 0.9256670475006104, "learning_rate": 1.7016877808539998e-08, "loss": 0.1914, "step": 23370 }, { "epoch": 492.2105263157895, "grad_norm": 0.6202569007873535, "learning_rate": 1.4640077216931946e-08, "loss": 0.1839, "step": 23380 }, { "epoch": 492.42105263157896, "grad_norm": 0.6193927526473999, "learning_rate": 1.2441966675380556e-08, "loss": 0.1789, "step": 23390 }, { "epoch": 492.63157894736844, "grad_norm": 0.4895412027835846, "learning_rate": 1.0422550112243468e-08, "loss": 0.1863, "step": 23400 }, { "epoch": 492.8421052631579, "grad_norm": 0.5214998722076416, "learning_rate": 8.581831136555973e-09, "loss": 0.1762, "step": 23410 }, { "epoch": 493.05263157894734, "grad_norm": 0.6743729114532471, "learning_rate": 6.919813037986611e-09, "loss": 0.1918, "step": 23420 }, { "epoch": 493.2631578947368, "grad_norm": 0.5977574586868286, "learning_rate": 5.436498786826061e-09, "loss": 0.1914, "step": 23430 }, { "epoch": 493.4736842105263, "grad_norm": 0.6423200368881226, "learning_rate": 4.1318910339982475e-09, "loss": 0.1975, "step": 23440 }, { "epoch": 493.6842105263158, "grad_norm": 0.4789294898509979, "learning_rate": 3.005992111038136e-09, "loss": 0.1658, "step": 23450 }, { "epoch": 493.89473684210526, "grad_norm": 0.5794366002082825, "learning_rate": 2.058804030125039e-09, "loss": 0.1798, "step": 23460 }, { "epoch": 494.10526315789474, "grad_norm": 0.621591329574585, "learning_rate": 1.3591339329321884e-09, "loss": 0.1856, "step": 23470 }, { "epoch": 494.3157894736842, "grad_norm": 0.5210126638412476, "learning_rate": 7.515008515257549e-10, "loss": 0.1857, "step": 23480 }, { "epoch": 494.5263157894737, "grad_norm": 0.5082560181617737, "learning_rate": 3.225826413100208e-10, "loss": 0.1706, "step": 23490 }, { "epoch": 494.7368421052632, "grad_norm": 0.5357773303985596, "learning_rate": 7.238006881626901e-11, "loss": 0.1852, "step": 
23500 } ], "logging_steps": 10, "max_steps": 23500, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.305535571859456e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }