diff --git "a/llama2_13b_peft/news_commentary_it/trainer_state.json" "b/llama2_13b_peft/news_commentary_it/trainer_state.json" new file mode 100644--- /dev/null +++ "b/llama2_13b_peft/news_commentary_it/trainer_state.json" @@ -0,0 +1,8478 @@ +{ + "best_metric": 0.6415141820907593, + "best_model_checkpoint": "ckpt/llama2_13b_fuze27_no_sys/news_commentary_it_no_sys/checkpoint-6000", + "epoch": 1.6842105263157894, + "eval_steps": 2000, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014035087719298245, + "grad_norm": 0.6813449859619141, + "learning_rate": 2.5e-05, + "loss": 1.3423, + "step": 10 + }, + { + "epoch": 0.002807017543859649, + "grad_norm": 0.9470943212509155, + "learning_rate": 5e-05, + "loss": 1.3855, + "step": 20 + }, + { + "epoch": 0.004210526315789474, + "grad_norm": 0.8929744958877563, + "learning_rate": 4.999999026832157e-05, + "loss": 0.9621, + "step": 30 + }, + { + "epoch": 0.005614035087719298, + "grad_norm": 1.383805274963379, + "learning_rate": 4.9999961073293845e-05, + "loss": 0.8217, + "step": 40 + }, + { + "epoch": 0.007017543859649123, + "grad_norm": 0.7758613228797913, + "learning_rate": 4.9999912414939555e-05, + "loss": 0.7743, + "step": 50 + }, + { + "epoch": 0.008421052631578947, + "grad_norm": 0.38530462980270386, + "learning_rate": 4.9999844293296585e-05, + "loss": 0.7671, + "step": 60 + }, + { + "epoch": 0.009824561403508772, + "grad_norm": 0.9287435412406921, + "learning_rate": 4.999975670841798e-05, + "loss": 0.7657, + "step": 70 + }, + { + "epoch": 0.011228070175438596, + "grad_norm": 0.5709918737411499, + "learning_rate": 4.9999649660371906e-05, + "loss": 0.6544, + "step": 80 + }, + { + "epoch": 0.01263157894736842, + "grad_norm": 0.6181680560112, + "learning_rate": 4.9999523149241714e-05, + "loss": 0.7627, + "step": 90 + }, + { + "epoch": 0.014035087719298246, + "grad_norm": 0.8074678182601929, + "learning_rate": 4.99993771751259e-05, + "loss": 0.7428, + "step": 100 + }, + { + "epoch": 0.015438596491228071, + "grad_norm": 0.7091221809387207, + "learning_rate": 4.999921173813812e-05, + "loss": 0.7024, + "step": 110 + }, + { + "epoch": 0.016842105263157894, + "grad_norm": 2.1647095680236816, + "learning_rate": 4.999902683840715e-05, + "loss": 0.8205, + "step": 120 + }, + { + "epoch": 0.018245614035087718, + "grad_norm": 1.178070068359375, + "learning_rate": 4.9998822476076955e-05, + "loss": 0.7359, + "step": 130 + }, + { + "epoch": 0.019649122807017545, + "grad_norm": 1.0926941633224487, + "learning_rate": 4.999859865130664e-05, + "loss": 0.6837, + "step": 140 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 1.5175189971923828, + "learning_rate": 4.9998355364270445e-05, + "loss": 0.7091, + "step": 150 + }, + { + "epoch": 0.02245614035087719, + "grad_norm": 0.9353613257408142, + "learning_rate": 4.999809261515779e-05, + "loss": 0.7608, + "step": 160 + }, + { + "epoch": 0.023859649122807018, + "grad_norm": 0.4437258839607239, + "learning_rate": 4.9997810404173234e-05, + "loss": 0.7725, + "step": 170 + }, + { + "epoch": 0.02526315789473684, + "grad_norm": 0.4320019781589508, + "learning_rate": 4.999750873153648e-05, + "loss": 0.7884, + "step": 180 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.8100196123123169, + "learning_rate": 4.9997187597482405e-05, + "loss": 0.7266, + "step": 190 + }, + { + "epoch": 0.028070175438596492, + "grad_norm": 1.1367573738098145, + "learning_rate": 4.9996847002261006e-05, + "loss": 0.6825, + "step": 200 + }, + { + "epoch": 0.029473684210526315, + "grad_norm": 0.9733144640922546, + "learning_rate": 4.999648694613746e-05, + "loss": 0.6162, + "step": 210 + }, + { + "epoch": 0.030877192982456142, + "grad_norm": 0.7170027494430542, + "learning_rate": 4.9996107429392083e-05, + "loss": 0.6696, + "step": 220 + }, + { + "epoch": 0.032280701754385965, + "grad_norm": 0.939182698726654, + "learning_rate": 4.9995708452320325e-05, + "loss": 0.7512, + "step": 230 + }, + { + "epoch": 0.03368421052631579, + "grad_norm": 0.7647657990455627, + "learning_rate": 4.999529001523282e-05, + "loss": 0.7137, + "step": 240 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 0.9428808093070984, + "learning_rate": 4.9994852118455335e-05, + "loss": 0.7676, + "step": 250 + }, + { + "epoch": 0.036491228070175435, + "grad_norm": 0.3808974325656891, + "learning_rate": 4.9994394762328786e-05, + "loss": 0.7208, + "step": 260 + }, + { + "epoch": 0.037894736842105266, + "grad_norm": 1.0278472900390625, + "learning_rate": 4.999391794720923e-05, + "loss": 0.7029, + "step": 270 + }, + { + "epoch": 0.03929824561403509, + "grad_norm": 0.8878808617591858, + "learning_rate": 4.9993421673467906e-05, + "loss": 0.6751, + "step": 280 + }, + { + "epoch": 0.04070175438596491, + "grad_norm": 0.5619615316390991, + "learning_rate": 4.9992905941491155e-05, + "loss": 0.7652, + "step": 290 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 1.1087744235992432, + "learning_rate": 4.9992370751680514e-05, + "loss": 0.7609, + "step": 300 + }, + { + "epoch": 0.04350877192982456, + "grad_norm": 0.7816822528839111, + "learning_rate": 4.999181610445263e-05, + "loss": 0.678, + "step": 310 + }, + { + "epoch": 0.04491228070175438, + "grad_norm": 1.0437147617340088, + "learning_rate": 4.9991242000239316e-05, + "loss": 0.7089, + "step": 320 + }, + { + "epoch": 0.04631578947368421, + "grad_norm": 0.7266655564308167, + "learning_rate": 4.9990648439487544e-05, + "loss": 0.7034, + "step": 330 + }, + { + "epoch": 0.047719298245614036, + "grad_norm": 0.8695891499519348, + "learning_rate": 4.999003542265941e-05, + "loss": 0.6789, + "step": 340 + }, + { + "epoch": 0.04912280701754386, + "grad_norm": 1.2530779838562012, + "learning_rate": 4.998940295023218e-05, + "loss": 0.6895, + "step": 350 + }, + { + "epoch": 0.05052631578947368, + "grad_norm": 0.9562914371490479, + "learning_rate": 4.9988751022698244e-05, + "loss": 0.7472, + "step": 360 + }, + { + "epoch": 0.051929824561403506, + "grad_norm": 1.5020138025283813, + "learning_rate": 4.9988079640565155e-05, + "loss": 0.7637, + "step": 370 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 1.3555861711502075, + "learning_rate": 4.998738880435561e-05, + "loss": 0.8042, + "step": 380 + }, + { + "epoch": 0.05473684210526316, + "grad_norm": 1.4689439535140991, + "learning_rate": 4.9986678514607434e-05, + "loss": 0.7878, + "step": 390 + }, + { + "epoch": 0.056140350877192984, + "grad_norm": 1.1399718523025513, + "learning_rate": 4.998594877187362e-05, + "loss": 0.6831, + "step": 400 + }, + { + "epoch": 0.05754385964912281, + "grad_norm": 0.9988260269165039, + "learning_rate": 4.998519957672232e-05, + "loss": 0.7905, + "step": 410 + }, + { + "epoch": 0.05894736842105263, + "grad_norm": 1.3424835205078125, + "learning_rate": 4.998443092973678e-05, + "loss": 0.6195, + "step": 420 + }, + { + "epoch": 0.060350877192982454, + "grad_norm": 1.3029276132583618, + "learning_rate": 4.998364283151542e-05, + "loss": 0.7603, + "step": 430 + }, + { + "epoch": 0.061754385964912284, + "grad_norm": 1.0647430419921875, + "learning_rate": 4.9982835282671816e-05, + "loss": 0.7099, + "step": 440 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 0.4545954763889313, + "learning_rate": 4.998200828383466e-05, + "loss": 0.6307, + "step": 450 + }, + { + "epoch": 0.06456140350877193, + "grad_norm": 0.9822194576263428, + "learning_rate": 4.99811618356478e-05, + "loss": 0.7084, + "step": 460 + }, + { + "epoch": 0.06596491228070175, + "grad_norm": 1.0566892623901367, + "learning_rate": 4.998029593877025e-05, + "loss": 0.6897, + "step": 470 + }, + { + "epoch": 0.06736842105263158, + "grad_norm": 0.9908930063247681, + "learning_rate": 4.9979410593876096e-05, + "loss": 0.7054, + "step": 480 + }, + { + "epoch": 0.0687719298245614, + "grad_norm": 0.5955024361610413, + "learning_rate": 4.997850580165464e-05, + "loss": 0.645, + "step": 490 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 1.570892095565796, + "learning_rate": 4.997758156281029e-05, + "loss": 0.6455, + "step": 500 + }, + { + "epoch": 0.07157894736842105, + "grad_norm": 0.9024527072906494, + "learning_rate": 4.997663787806259e-05, + "loss": 0.6797, + "step": 510 + }, + { + "epoch": 0.07298245614035087, + "grad_norm": 0.6280427575111389, + "learning_rate": 4.997567474814623e-05, + "loss": 0.7582, + "step": 520 + }, + { + "epoch": 0.07438596491228071, + "grad_norm": 1.0706899166107178, + "learning_rate": 4.997469217381105e-05, + "loss": 0.667, + "step": 530 + }, + { + "epoch": 0.07578947368421053, + "grad_norm": 0.8091099262237549, + "learning_rate": 4.997369015582201e-05, + "loss": 0.6878, + "step": 540 + }, + { + "epoch": 0.07719298245614035, + "grad_norm": 0.900131106376648, + "learning_rate": 4.9972668694959216e-05, + "loss": 0.7693, + "step": 550 + }, + { + "epoch": 0.07859649122807018, + "grad_norm": 0.791890025138855, + "learning_rate": 4.9971627792017915e-05, + "loss": 0.561, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 1.3132946491241455, + "learning_rate": 4.997056744780848e-05, + "loss": 0.6739, + "step": 570 + }, + { + "epoch": 0.08140350877192983, + "grad_norm": 0.8182291984558105, + "learning_rate": 4.9969487663156434e-05, + "loss": 0.6561, + "step": 580 + }, + { + "epoch": 0.08280701754385965, + "grad_norm": 1.1820317506790161, + "learning_rate": 4.9968388438902415e-05, + "loss": 0.6056, + "step": 590 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.6508825421333313, + "learning_rate": 4.9967269775902204e-05, + "loss": 0.6962, + "step": 600 + }, + { + "epoch": 0.0856140350877193, + "grad_norm": 0.8885963559150696, + "learning_rate": 4.996613167502674e-05, + "loss": 0.6174, + "step": 610 + }, + { + "epoch": 0.08701754385964912, + "grad_norm": 0.8769521713256836, + "learning_rate": 4.996497413716205e-05, + "loss": 0.6806, + "step": 620 + }, + { + "epoch": 0.08842105263157894, + "grad_norm": 1.1168580055236816, + "learning_rate": 4.996379716320933e-05, + "loss": 0.7618, + "step": 630 + }, + { + "epoch": 0.08982456140350877, + "grad_norm": 0.6629518270492554, + "learning_rate": 4.996260075408489e-05, + "loss": 0.6796, + "step": 640 + }, + { + "epoch": 0.0912280701754386, + "grad_norm": 0.5513269901275635, + "learning_rate": 4.996138491072018e-05, + "loss": 0.6249, + "step": 650 + }, + { + "epoch": 0.09263157894736843, + "grad_norm": 0.8878002166748047, + "learning_rate": 4.996014963406177e-05, + "loss": 0.6905, + "step": 660 + }, + { + "epoch": 0.09403508771929825, + "grad_norm": 1.407973289489746, + "learning_rate": 4.9958894925071364e-05, + "loss": 0.7082, + "step": 670 + }, + { + "epoch": 0.09543859649122807, + "grad_norm": 2.0107500553131104, + "learning_rate": 4.995762078472581e-05, + "loss": 0.6751, + "step": 680 + }, + { + "epoch": 0.0968421052631579, + "grad_norm": 0.7563285827636719, + "learning_rate": 4.995632721401705e-05, + "loss": 0.6223, + "step": 690 + }, + { + "epoch": 0.09824561403508772, + "grad_norm": 0.7729387879371643, + "learning_rate": 4.995501421395219e-05, + "loss": 0.622, + "step": 700 + }, + { + "epoch": 0.09964912280701754, + "grad_norm": 0.9992890954017639, + "learning_rate": 4.995368178555343e-05, + "loss": 0.7565, + "step": 710 + }, + { + "epoch": 0.10105263157894737, + "grad_norm": 1.0641027688980103, + "learning_rate": 4.9952329929858125e-05, + "loss": 0.7486, + "step": 720 + }, + { + "epoch": 0.10245614035087719, + "grad_norm": 0.8268628716468811, + "learning_rate": 4.995095864791873e-05, + "loss": 0.6825, + "step": 730 + }, + { + "epoch": 0.10385964912280701, + "grad_norm": 0.7123477458953857, + "learning_rate": 4.994956794080285e-05, + "loss": 0.7342, + "step": 740 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.0346596240997314, + "learning_rate": 4.994815780959318e-05, + "loss": 0.6289, + "step": 750 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.7578685283660889, + "learning_rate": 4.994672825538757e-05, + "loss": 0.5675, + "step": 760 + }, + { + "epoch": 0.1080701754385965, + "grad_norm": 1.1263622045516968, + "learning_rate": 4.994527927929897e-05, + "loss": 0.7527, + "step": 770 + }, + { + "epoch": 0.10947368421052632, + "grad_norm": 0.8590745329856873, + "learning_rate": 4.9943810882455454e-05, + "loss": 0.6421, + "step": 780 + }, + { + "epoch": 0.11087719298245614, + "grad_norm": 0.7870830297470093, + "learning_rate": 4.994232306600023e-05, + "loss": 0.7016, + "step": 790 + }, + { + "epoch": 0.11228070175438597, + "grad_norm": 0.9499567747116089, + "learning_rate": 4.99408158310916e-05, + "loss": 0.6911, + "step": 800 + }, + { + "epoch": 0.11368421052631579, + "grad_norm": 1.1604363918304443, + "learning_rate": 4.9939289178903016e-05, + "loss": 0.697, + "step": 810 + }, + { + "epoch": 0.11508771929824561, + "grad_norm": 0.7308230400085449, + "learning_rate": 4.993774311062301e-05, + "loss": 0.5691, + "step": 820 + }, + { + "epoch": 0.11649122807017544, + "grad_norm": 1.0032395124435425, + "learning_rate": 4.993617762745526e-05, + "loss": 0.7744, + "step": 830 + }, + { + "epoch": 0.11789473684210526, + "grad_norm": 1.0617241859436035, + "learning_rate": 4.993459273061855e-05, + "loss": 0.7652, + "step": 840 + }, + { + "epoch": 0.11929824561403508, + "grad_norm": 1.207223653793335, + "learning_rate": 4.993298842134677e-05, + "loss": 0.6843, + "step": 850 + }, + { + "epoch": 0.12070175438596491, + "grad_norm": 0.6737737059593201, + "learning_rate": 4.993136470088894e-05, + "loss": 0.7147, + "step": 860 + }, + { + "epoch": 0.12210526315789473, + "grad_norm": 1.3904882669448853, + "learning_rate": 4.992972157050916e-05, + "loss": 0.641, + "step": 870 + }, + { + "epoch": 0.12350877192982457, + "grad_norm": 0.8821682929992676, + "learning_rate": 4.992805903148669e-05, + "loss": 0.6212, + "step": 880 + }, + { + "epoch": 0.12491228070175439, + "grad_norm": 1.214309811592102, + "learning_rate": 4.992637708511586e-05, + "loss": 0.6817, + "step": 890 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 1.2954894304275513, + "learning_rate": 4.9924675732706123e-05, + "loss": 0.7072, + "step": 900 + }, + { + "epoch": 0.12771929824561404, + "grad_norm": 0.8437069058418274, + "learning_rate": 4.992295497558204e-05, + "loss": 0.6221, + "step": 910 + }, + { + "epoch": 0.12912280701754386, + "grad_norm": 0.6401008367538452, + "learning_rate": 4.992121481508328e-05, + "loss": 0.6162, + "step": 920 + }, + { + "epoch": 0.13052631578947368, + "grad_norm": 1.1894147396087646, + "learning_rate": 4.9919455252564624e-05, + "loss": 0.7548, + "step": 930 + }, + { + "epoch": 0.1319298245614035, + "grad_norm": 0.9592342376708984, + "learning_rate": 4.991767628939594e-05, + "loss": 0.6377, + "step": 940 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.6419144868850708, + "learning_rate": 4.991587792696223e-05, + "loss": 0.6971, + "step": 950 + }, + { + "epoch": 0.13473684210526315, + "grad_norm": 1.0908111333847046, + "learning_rate": 4.991406016666356e-05, + "loss": 0.7929, + "step": 960 + }, + { + "epoch": 0.13614035087719298, + "grad_norm": 1.231597661972046, + "learning_rate": 4.9912223009915126e-05, + "loss": 0.7556, + "step": 970 + }, + { + "epoch": 0.1375438596491228, + "grad_norm": 0.7628648281097412, + "learning_rate": 4.991036645814722e-05, + "loss": 0.5883, + "step": 980 + }, + { + "epoch": 0.13894736842105262, + "grad_norm": 0.766953706741333, + "learning_rate": 4.9908490512805236e-05, + "loss": 0.6362, + "step": 990 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 1.070429801940918, + "learning_rate": 4.990659517534966e-05, + "loss": 0.7057, + "step": 1000 + }, + { + "epoch": 0.14175438596491227, + "grad_norm": 0.8499042391777039, + "learning_rate": 4.990468044725606e-05, + "loss": 0.6051, + "step": 1010 + }, + { + "epoch": 0.1431578947368421, + "grad_norm": 1.1411361694335938, + "learning_rate": 4.990274633001514e-05, + "loss": 0.7434, + "step": 1020 + }, + { + "epoch": 0.14456140350877192, + "grad_norm": 1.3025455474853516, + "learning_rate": 4.990079282513266e-05, + "loss": 0.6681, + "step": 1030 + }, + { + "epoch": 0.14596491228070174, + "grad_norm": 0.9307923316955566, + "learning_rate": 4.9898819934129506e-05, + "loss": 0.6655, + "step": 1040 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.6463920474052429, + "learning_rate": 4.989682765854163e-05, + "loss": 0.7529, + "step": 1050 + }, + { + "epoch": 0.14877192982456142, + "grad_norm": 1.216407060623169, + "learning_rate": 4.989481599992009e-05, + "loss": 0.6249, + "step": 1060 + }, + { + "epoch": 0.15017543859649124, + "grad_norm": 0.747074544429779, + "learning_rate": 4.989278495983103e-05, + "loss": 0.6437, + "step": 1070 + }, + { + "epoch": 0.15157894736842106, + "grad_norm": 0.8777433633804321, + "learning_rate": 4.989073453985569e-05, + "loss": 0.6206, + "step": 1080 + }, + { + "epoch": 0.1529824561403509, + "grad_norm": 0.8588824272155762, + "learning_rate": 4.988866474159037e-05, + "loss": 0.6141, + "step": 1090 + }, + { + "epoch": 0.1543859649122807, + "grad_norm": 0.6369594931602478, + "learning_rate": 4.988657556664652e-05, + "loss": 0.6653, + "step": 1100 + }, + { + "epoch": 0.15578947368421053, + "grad_norm": 0.7276690006256104, + "learning_rate": 4.98844670166506e-05, + "loss": 0.6503, + "step": 1110 + }, + { + "epoch": 0.15719298245614036, + "grad_norm": 0.6937339305877686, + "learning_rate": 4.98823390932442e-05, + "loss": 0.6298, + "step": 1120 + }, + { + "epoch": 0.15859649122807018, + "grad_norm": 1.4779495000839233, + "learning_rate": 4.988019179808398e-05, + "loss": 0.6889, + "step": 1130 + }, + { + "epoch": 0.16, + "grad_norm": 1.4205069541931152, + "learning_rate": 4.987802513284169e-05, + "loss": 0.7086, + "step": 1140 + }, + { + "epoch": 0.16140350877192983, + "grad_norm": 1.5097942352294922, + "learning_rate": 4.9875839099204134e-05, + "loss": 0.6727, + "step": 1150 + }, + { + "epoch": 0.16280701754385965, + "grad_norm": 0.8267427086830139, + "learning_rate": 4.987363369887324e-05, + "loss": 0.6993, + "step": 1160 + }, + { + "epoch": 0.16421052631578947, + "grad_norm": 1.1303791999816895, + "learning_rate": 4.987140893356597e-05, + "loss": 0.5671, + "step": 1170 + }, + { + "epoch": 0.1656140350877193, + "grad_norm": 0.9507080316543579, + "learning_rate": 4.986916480501438e-05, + "loss": 0.6929, + "step": 1180 + }, + { + "epoch": 0.16701754385964912, + "grad_norm": 1.0298510789871216, + "learning_rate": 4.986690131496561e-05, + "loss": 0.5368, + "step": 1190 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 1.0742335319519043, + "learning_rate": 4.986461846518186e-05, + "loss": 0.6473, + "step": 1200 + }, + { + "epoch": 0.16982456140350877, + "grad_norm": 1.3724429607391357, + "learning_rate": 4.986231625744041e-05, + "loss": 0.6698, + "step": 1210 + }, + { + "epoch": 0.1712280701754386, + "grad_norm": 0.7210483551025391, + "learning_rate": 4.985999469353359e-05, + "loss": 0.6747, + "step": 1220 + }, + { + "epoch": 0.1726315789473684, + "grad_norm": 0.8128493428230286, + "learning_rate": 4.9857653775268853e-05, + "loss": 0.6509, + "step": 1230 + }, + { + "epoch": 0.17403508771929824, + "grad_norm": 0.9664400815963745, + "learning_rate": 4.985529350446865e-05, + "loss": 0.6895, + "step": 1240 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 1.0563639402389526, + "learning_rate": 4.985291388297055e-05, + "loss": 0.5882, + "step": 1250 + }, + { + "epoch": 0.17684210526315788, + "grad_norm": 0.7978933453559875, + "learning_rate": 4.985051491262716e-05, + "loss": 0.6688, + "step": 1260 + }, + { + "epoch": 0.1782456140350877, + "grad_norm": 1.0037199258804321, + "learning_rate": 4.984809659530617e-05, + "loss": 0.6135, + "step": 1270 + }, + { + "epoch": 0.17964912280701753, + "grad_norm": 1.0351414680480957, + "learning_rate": 4.9845658932890315e-05, + "loss": 0.6849, + "step": 1280 + }, + { + "epoch": 0.18105263157894738, + "grad_norm": 0.9015732407569885, + "learning_rate": 4.9843201927277407e-05, + "loss": 0.6036, + "step": 1290 + }, + { + "epoch": 0.1824561403508772, + "grad_norm": 1.1445683240890503, + "learning_rate": 4.984072558038031e-05, + "loss": 0.7348, + "step": 1300 + }, + { + "epoch": 0.18385964912280703, + "grad_norm": 1.2019379138946533, + "learning_rate": 4.983822989412693e-05, + "loss": 0.7679, + "step": 1310 + }, + { + "epoch": 0.18526315789473685, + "grad_norm": 0.6560442447662354, + "learning_rate": 4.983571487046026e-05, + "loss": 0.7083, + "step": 1320 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.8415977954864502, + "learning_rate": 4.9833180511338314e-05, + "loss": 0.6417, + "step": 1330 + }, + { + "epoch": 0.1880701754385965, + "grad_norm": 0.8725243210792542, + "learning_rate": 4.983062681873421e-05, + "loss": 0.6817, + "step": 1340 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.8865370154380798, + "learning_rate": 4.982805379463605e-05, + "loss": 0.6554, + "step": 1350 + }, + { + "epoch": 0.19087719298245615, + "grad_norm": 0.6979865431785583, + "learning_rate": 4.982546144104704e-05, + "loss": 0.6613, + "step": 1360 + }, + { + "epoch": 0.19228070175438597, + "grad_norm": 0.8604574203491211, + "learning_rate": 4.982284975998541e-05, + "loss": 0.6902, + "step": 1370 + }, + { + "epoch": 0.1936842105263158, + "grad_norm": 0.849172055721283, + "learning_rate": 4.982021875348445e-05, + "loss": 0.81, + "step": 1380 + }, + { + "epoch": 0.19508771929824562, + "grad_norm": 0.9217461347579956, + "learning_rate": 4.9817568423592484e-05, + "loss": 0.6796, + "step": 1390 + }, + { + "epoch": 0.19649122807017544, + "grad_norm": 1.1720378398895264, + "learning_rate": 4.981489877237288e-05, + "loss": 0.6109, + "step": 1400 + }, + { + "epoch": 0.19789473684210526, + "grad_norm": 0.8361873626708984, + "learning_rate": 4.9812209801904064e-05, + "loss": 0.7521, + "step": 1410 + }, + { + "epoch": 0.19929824561403509, + "grad_norm": 0.9124870896339417, + "learning_rate": 4.980950151427948e-05, + "loss": 0.6742, + "step": 1420 + }, + { + "epoch": 0.2007017543859649, + "grad_norm": 1.0720082521438599, + "learning_rate": 4.980677391160763e-05, + "loss": 0.659, + "step": 1430 + }, + { + "epoch": 0.20210526315789473, + "grad_norm": 0.7144408822059631, + "learning_rate": 4.980402699601205e-05, + "loss": 0.6392, + "step": 1440 + }, + { + "epoch": 0.20350877192982456, + "grad_norm": 0.8546087145805359, + "learning_rate": 4.98012607696313e-05, + "loss": 0.6674, + "step": 1450 + }, + { + "epoch": 0.20491228070175438, + "grad_norm": 0.8717739582061768, + "learning_rate": 4.979847523461898e-05, + "loss": 0.6772, + "step": 1460 + }, + { + "epoch": 0.2063157894736842, + "grad_norm": 0.9035875201225281, + "learning_rate": 4.9795670393143735e-05, + "loss": 0.6598, + "step": 1470 + }, + { + "epoch": 0.20771929824561403, + "grad_norm": 1.5168395042419434, + "learning_rate": 4.9792846247389214e-05, + "loss": 0.6784, + "step": 1480 + }, + { + "epoch": 0.20912280701754385, + "grad_norm": 1.3440768718719482, + "learning_rate": 4.979000279955413e-05, + "loss": 0.673, + "step": 1490 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.6575384140014648, + "learning_rate": 4.97871400518522e-05, + "loss": 0.6018, + "step": 1500 + }, + { + "epoch": 0.2119298245614035, + "grad_norm": 0.843136727809906, + "learning_rate": 4.978425800651216e-05, + "loss": 0.673, + "step": 1510 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.9389488101005554, + "learning_rate": 4.978135666577779e-05, + "loss": 0.6455, + "step": 1520 + }, + { + "epoch": 0.21473684210526317, + "grad_norm": 1.0860190391540527, + "learning_rate": 4.977843603190788e-05, + "loss": 0.6945, + "step": 1530 + }, + { + "epoch": 0.216140350877193, + "grad_norm": 0.923224925994873, + "learning_rate": 4.9775496107176245e-05, + "loss": 0.6441, + "step": 1540 + }, + { + "epoch": 0.21754385964912282, + "grad_norm": 0.9440721273422241, + "learning_rate": 4.977253689387172e-05, + "loss": 0.6399, + "step": 1550 + }, + { + "epoch": 0.21894736842105264, + "grad_norm": 1.225602626800537, + "learning_rate": 4.976955839429815e-05, + "loss": 0.7059, + "step": 1560 + }, + { + "epoch": 0.22035087719298246, + "grad_norm": 0.7701632380485535, + "learning_rate": 4.976656061077441e-05, + "loss": 0.7422, + "step": 1570 + }, + { + "epoch": 0.2217543859649123, + "grad_norm": 1.0036752223968506, + "learning_rate": 4.976354354563435e-05, + "loss": 0.7, + "step": 1580 + }, + { + "epoch": 0.2231578947368421, + "grad_norm": 1.2595415115356445, + "learning_rate": 4.976050720122688e-05, + "loss": 0.693, + "step": 1590 + }, + { + "epoch": 0.22456140350877193, + "grad_norm": 0.9705458283424377, + "learning_rate": 4.97574515799159e-05, + "loss": 0.6477, + "step": 1600 + }, + { + "epoch": 0.22596491228070176, + "grad_norm": 0.9339498281478882, + "learning_rate": 4.975437668408031e-05, + "loss": 0.6839, + "step": 1610 + }, + { + "epoch": 0.22736842105263158, + "grad_norm": 2.5165653228759766, + "learning_rate": 4.9751282516114024e-05, + "loss": 0.5796, + "step": 1620 + }, + { + "epoch": 0.2287719298245614, + "grad_norm": 1.2094191312789917, + "learning_rate": 4.9748169078425955e-05, + "loss": 0.6967, + "step": 1630 + }, + { + "epoch": 0.23017543859649123, + "grad_norm": 0.9400249719619751, + "learning_rate": 4.974503637344002e-05, + "loss": 0.5507, + "step": 1640 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.7801631093025208, + "learning_rate": 4.9741884403595135e-05, + "loss": 0.6792, + "step": 1650 + }, + { + "epoch": 0.23298245614035087, + "grad_norm": 0.8041971325874329, + "learning_rate": 4.9738713171345225e-05, + "loss": 0.616, + "step": 1660 + }, + { + "epoch": 0.2343859649122807, + "grad_norm": 0.9792094826698303, + "learning_rate": 4.9735522679159195e-05, + "loss": 0.635, + "step": 1670 + }, + { + "epoch": 0.23578947368421052, + "grad_norm": 0.8937766551971436, + "learning_rate": 4.9732312929520964e-05, + "loss": 0.6902, + "step": 1680 + }, + { + "epoch": 0.23719298245614034, + "grad_norm": 0.6050293445587158, + "learning_rate": 4.972908392492942e-05, + "loss": 0.6899, + "step": 1690 + }, + { + "epoch": 0.23859649122807017, + "grad_norm": 1.114696741104126, + "learning_rate": 4.9725835667898455e-05, + "loss": 0.7013, + "step": 1700 + }, + { + "epoch": 0.24, + "grad_norm": 0.7658337354660034, + "learning_rate": 4.972256816095695e-05, + "loss": 0.6379, + "step": 1710 + }, + { + "epoch": 0.24140350877192981, + "grad_norm": 1.0719423294067383, + "learning_rate": 4.971928140664878e-05, + "loss": 0.6819, + "step": 1720 + }, + { + "epoch": 0.24280701754385964, + "grad_norm": 0.5609824061393738, + "learning_rate": 4.971597540753279e-05, + "loss": 0.6888, + "step": 1730 + }, + { + "epoch": 0.24421052631578946, + "grad_norm": 0.8473712205886841, + "learning_rate": 4.971265016618281e-05, + "loss": 0.6761, + "step": 1740 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 1.023040533065796, + "learning_rate": 4.970930568518765e-05, + "loss": 0.5544, + "step": 1750 + }, + { + "epoch": 0.24701754385964914, + "grad_norm": 1.2763292789459229, + "learning_rate": 4.97059419671511e-05, + "loss": 0.7072, + "step": 1760 + }, + { + "epoch": 0.24842105263157896, + "grad_norm": 0.9501249194145203, + "learning_rate": 4.9702559014691965e-05, + "loss": 0.5992, + "step": 1770 + }, + { + "epoch": 0.24982456140350878, + "grad_norm": 0.714192271232605, + "learning_rate": 4.969915683044395e-05, + "loss": 0.6277, + "step": 1780 + }, + { + "epoch": 0.2512280701754386, + "grad_norm": 0.8613963723182678, + "learning_rate": 4.9695735417055776e-05, + "loss": 0.5501, + "step": 1790 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.7384011149406433, + "learning_rate": 4.969229477719116e-05, + "loss": 0.7619, + "step": 1800 + }, + { + "epoch": 0.2540350877192982, + "grad_norm": 0.8516148328781128, + "learning_rate": 4.9688834913528724e-05, + "loss": 0.6706, + "step": 1810 + }, + { + "epoch": 0.2554385964912281, + "grad_norm": 0.9726106524467468, + "learning_rate": 4.9685355828762115e-05, + "loss": 0.6825, + "step": 1820 + }, + { + "epoch": 0.25684210526315787, + "grad_norm": 0.9834999442100525, + "learning_rate": 4.96818575255999e-05, + "loss": 0.7195, + "step": 1830 + }, + { + "epoch": 0.2582456140350877, + "grad_norm": 0.6964922547340393, + "learning_rate": 4.967834000676564e-05, + "loss": 0.6196, + "step": 1840 + }, + { + "epoch": 0.2596491228070175, + "grad_norm": 1.0819238424301147, + "learning_rate": 4.967480327499785e-05, + "loss": 0.5768, + "step": 1850 + }, + { + "epoch": 0.26105263157894737, + "grad_norm": 0.7200153470039368, + "learning_rate": 4.9671247333049975e-05, + "loss": 0.6484, + "step": 1860 + }, + { + "epoch": 0.2624561403508772, + "grad_norm": 0.6098335385322571, + "learning_rate": 4.966767218369046e-05, + "loss": 0.6132, + "step": 1870 + }, + { + "epoch": 0.263859649122807, + "grad_norm": 1.1508702039718628, + "learning_rate": 4.966407782970267e-05, + "loss": 0.6435, + "step": 1880 + }, + { + "epoch": 0.26526315789473687, + "grad_norm": 0.9164888858795166, + "learning_rate": 4.966046427388494e-05, + "loss": 0.6581, + "step": 1890 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.7689521908760071, + "learning_rate": 4.965683151905054e-05, + "loss": 0.593, + "step": 1900 + }, + { + "epoch": 0.2680701754385965, + "grad_norm": 1.2253938913345337, + "learning_rate": 4.965317956802769e-05, + "loss": 0.7037, + "step": 1910 + }, + { + "epoch": 0.2694736842105263, + "grad_norm": 1.2759559154510498, + "learning_rate": 4.964950842365957e-05, + "loss": 0.7054, + "step": 1920 + }, + { + "epoch": 0.27087719298245616, + "grad_norm": 1.0961602926254272, + "learning_rate": 4.9645818088804284e-05, + "loss": 0.6463, + "step": 1930 + }, + { + "epoch": 0.27228070175438596, + "grad_norm": 1.0374549627304077, + "learning_rate": 4.964210856633489e-05, + "loss": 0.7222, + "step": 1940 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 1.228814721107483, + "learning_rate": 4.963837985913938e-05, + "loss": 0.7659, + "step": 1950 + }, + { + "epoch": 0.2750877192982456, + "grad_norm": 1.131882667541504, + "learning_rate": 4.963463197012067e-05, + "loss": 0.7388, + "step": 1960 + }, + { + "epoch": 0.27649122807017545, + "grad_norm": 0.6964682340621948, + "learning_rate": 4.9630864902196626e-05, + "loss": 0.6961, + "step": 1970 + }, + { + "epoch": 0.27789473684210525, + "grad_norm": 0.6383505463600159, + "learning_rate": 4.962707865830004e-05, + "loss": 0.5755, + "step": 1980 + }, + { + "epoch": 0.2792982456140351, + "grad_norm": 0.9402531981468201, + "learning_rate": 4.9623273241378636e-05, + "loss": 0.6845, + "step": 1990 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.8488597273826599, + "learning_rate": 4.9619448654395055e-05, + "loss": 0.7236, + "step": 2000 + }, + { + "epoch": 0.2807017543859649, + "eval_loss": 0.6709622740745544, + "eval_runtime": 44.146, + "eval_samples_per_second": 33.978, + "eval_steps_per_second": 8.495, + "step": 2000 + }, + { + "epoch": 0.28210526315789475, + "grad_norm": 1.1073328256607056, + "learning_rate": 4.9615604900326875e-05, + "loss": 0.5944, + "step": 2010 + }, + { + "epoch": 0.28350877192982454, + "grad_norm": 1.3910387754440308, + "learning_rate": 4.961174198216658e-05, + "loss": 0.6174, + "step": 2020 + }, + { + "epoch": 0.2849122807017544, + "grad_norm": 0.698826253414154, + "learning_rate": 4.9607859902921595e-05, + "loss": 0.6801, + "step": 2030 + }, + { + "epoch": 0.2863157894736842, + "grad_norm": 1.118665099143982, + "learning_rate": 4.960395866561425e-05, + "loss": 0.6657, + "step": 2040 + }, + { + "epoch": 0.28771929824561404, + "grad_norm": 1.1043261289596558, + "learning_rate": 4.960003827328179e-05, + "loss": 0.6536, + "step": 2050 + }, + { + "epoch": 0.28912280701754384, + "grad_norm": 0.7518707513809204, + "learning_rate": 4.959609872897637e-05, + "loss": 0.6361, + "step": 2060 + }, + { + "epoch": 0.2905263157894737, + "grad_norm": 1.0390689373016357, + "learning_rate": 4.959214003576507e-05, + "loss": 0.6369, + "step": 2070 + }, + { + "epoch": 0.2919298245614035, + "grad_norm": 1.122710108757019, + "learning_rate": 4.958816219672986e-05, + "loss": 0.7563, + "step": 2080 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.9062842726707458, + "learning_rate": 4.9584165214967634e-05, + "loss": 0.6575, + "step": 2090 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 1.4019687175750732, + "learning_rate": 4.9580149093590165e-05, + "loss": 0.6611, + "step": 2100 + }, + { + "epoch": 0.296140350877193, + "grad_norm": 0.9323289394378662, + "learning_rate": 4.957611383572415e-05, + "loss": 0.6456, + "step": 2110 + }, + { + "epoch": 0.29754385964912283, + "grad_norm": 1.0447218418121338, + "learning_rate": 4.9572059444511175e-05, + "loss": 0.7114, + "step": 2120 + }, + { + "epoch": 0.29894736842105263, + "grad_norm": 1.3190436363220215, + "learning_rate": 4.956798592310773e-05, + "loss": 0.6338, + "step": 2130 + }, + { + "epoch": 0.3003508771929825, + "grad_norm": 0.7944990396499634, + "learning_rate": 4.956389327468518e-05, + "loss": 0.6323, + "step": 2140 + }, + { + "epoch": 0.3017543859649123, + "grad_norm": 0.9921332001686096, + "learning_rate": 4.9559781502429784e-05, + "loss": 0.6231, + "step": 2150 + }, + { + "epoch": 0.3031578947368421, + "grad_norm": 1.0437482595443726, + "learning_rate": 4.955565060954272e-05, + "loss": 0.6515, + "step": 2160 + }, + { + "epoch": 0.3045614035087719, + "grad_norm": 0.8929722309112549, + "learning_rate": 4.9551500599240006e-05, + "loss": 0.6023, + "step": 2170 + }, + { + "epoch": 0.3059649122807018, + "grad_norm": 1.1816951036453247, + "learning_rate": 4.954733147475259e-05, + "loss": 0.678, + "step": 2180 + }, + { + "epoch": 0.30736842105263157, + "grad_norm": 1.4489054679870605, + "learning_rate": 4.954314323932627e-05, + "loss": 0.6307, + "step": 2190 + }, + { + "epoch": 0.3087719298245614, + "grad_norm": 0.6073512434959412, + "learning_rate": 4.953893589622172e-05, + "loss": 0.6543, + "step": 2200 + }, + { + "epoch": 0.3101754385964912, + "grad_norm": 0.8957934975624084, + "learning_rate": 4.9534709448714514e-05, + "loss": 0.7493, + "step": 2210 + }, + { + "epoch": 0.31157894736842107, + "grad_norm": 1.1038836240768433, + "learning_rate": 4.9530463900095084e-05, + "loss": 0.6856, + "step": 2220 + }, + { + "epoch": 0.31298245614035086, + "grad_norm": 1.2374224662780762, + "learning_rate": 4.952619925366873e-05, + "loss": 0.5721, + "step": 2230 + }, + { + "epoch": 0.3143859649122807, + "grad_norm": 0.9683862924575806, + "learning_rate": 4.9521915512755635e-05, + "loss": 0.7126, + "step": 2240 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 1.096661925315857, + "learning_rate": 4.951761268069082e-05, + "loss": 0.7522, + "step": 2250 + }, + { + "epoch": 0.31719298245614036, + "grad_norm": 0.9801945686340332, + "learning_rate": 4.95132907608242e-05, + "loss": 0.6825, + "step": 2260 + }, + { + "epoch": 0.31859649122807016, + "grad_norm": 0.8269819021224976, + "learning_rate": 4.950894975652055e-05, + "loss": 0.7592, + "step": 2270 + }, + { + "epoch": 0.32, + "grad_norm": 0.7468457818031311, + "learning_rate": 4.950458967115946e-05, + "loss": 0.5719, + "step": 2280 + }, + { + "epoch": 0.3214035087719298, + "grad_norm": 0.9176953434944153, + "learning_rate": 4.9500210508135436e-05, + "loss": 0.6288, + "step": 2290 + }, + { + "epoch": 0.32280701754385965, + "grad_norm": 0.6870772838592529, + "learning_rate": 4.9495812270857786e-05, + "loss": 0.7081, + "step": 2300 + }, + { + "epoch": 0.32421052631578945, + "grad_norm": 0.8877288103103638, + "learning_rate": 4.94913949627507e-05, + "loss": 0.6371, + "step": 2310 + }, + { + "epoch": 0.3256140350877193, + "grad_norm": 0.9289653897285461, + "learning_rate": 4.9486958587253195e-05, + "loss": 0.6712, + "step": 2320 + }, + { + "epoch": 0.3270175438596491, + "grad_norm": 0.7378761172294617, + "learning_rate": 4.9482503147819156e-05, + "loss": 0.6232, + "step": 2330 + }, + { + "epoch": 0.32842105263157895, + "grad_norm": 0.7357892394065857, + "learning_rate": 4.947802864791727e-05, + "loss": 0.6519, + "step": 2340 + }, + { + "epoch": 0.3298245614035088, + "grad_norm": 1.509859323501587, + "learning_rate": 4.947353509103112e-05, + "loss": 0.7172, + "step": 2350 + }, + { + "epoch": 0.3312280701754386, + "grad_norm": 0.9467512369155884, + "learning_rate": 4.946902248065907e-05, + "loss": 0.6784, + "step": 2360 + }, + { + "epoch": 0.33263157894736844, + "grad_norm": 1.1108275651931763, + "learning_rate": 4.946449082031435e-05, + "loss": 0.612, + "step": 2370 + }, + { + "epoch": 0.33403508771929824, + "grad_norm": 1.0811039209365845, + "learning_rate": 4.9459940113525014e-05, + "loss": 0.7573, + "step": 2380 + }, + { + "epoch": 0.3354385964912281, + "grad_norm": 0.8881508708000183, + "learning_rate": 4.945537036383394e-05, + "loss": 0.7167, + "step": 2390 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 1.144106149673462, + "learning_rate": 4.945078157479884e-05, + "loss": 0.5797, + "step": 2400 + }, + { + "epoch": 0.33824561403508774, + "grad_norm": 1.1335030794143677, + "learning_rate": 4.944617374999224e-05, + "loss": 0.603, + "step": 2410 + }, + { + "epoch": 0.33964912280701753, + "grad_norm": 0.9101009368896484, + "learning_rate": 4.944154689300148e-05, + "loss": 0.6496, + "step": 2420 + }, + { + "epoch": 0.3410526315789474, + "grad_norm": 0.9584961533546448, + "learning_rate": 4.943690100742875e-05, + "loss": 0.6945, + "step": 2430 + }, + { + "epoch": 0.3424561403508772, + "grad_norm": 0.8912618160247803, + "learning_rate": 4.943223609689101e-05, + "loss": 0.6489, + "step": 2440 + }, + { + "epoch": 0.34385964912280703, + "grad_norm": 0.7363690733909607, + "learning_rate": 4.9427552165020066e-05, + "loss": 0.6066, + "step": 2450 + }, + { + "epoch": 0.3452631578947368, + "grad_norm": 1.2380393743515015, + "learning_rate": 4.9422849215462506e-05, + "loss": 0.6208, + "step": 2460 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.4995614290237427, + "learning_rate": 4.9418127251879756e-05, + "loss": 0.7249, + "step": 2470 + }, + { + "epoch": 0.3480701754385965, + "grad_norm": 1.0258910655975342, + "learning_rate": 4.9413386277948006e-05, + "loss": 0.7049, + "step": 2480 + }, + { + "epoch": 0.3494736842105263, + "grad_norm": 0.9672191143035889, + "learning_rate": 4.9408626297358286e-05, + "loss": 0.7138, + "step": 2490 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.9736180901527405, + "learning_rate": 4.940384731381639e-05, + "loss": 0.6047, + "step": 2500 + }, + { + "epoch": 0.35228070175438597, + "grad_norm": 0.7992679476737976, + "learning_rate": 4.9399049331042925e-05, + "loss": 0.6098, + "step": 2510 + }, + { + "epoch": 0.35368421052631577, + "grad_norm": 0.6984518766403198, + "learning_rate": 4.939423235277328e-05, + "loss": 0.6862, + "step": 2520 + }, + { + "epoch": 0.3550877192982456, + "grad_norm": 0.9038867354393005, + "learning_rate": 4.938939638275765e-05, + "loss": 0.7044, + "step": 2530 + }, + { + "epoch": 0.3564912280701754, + "grad_norm": 0.9274188280105591, + "learning_rate": 4.938454142476099e-05, + "loss": 0.6377, + "step": 2540 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 1.2159563302993774, + "learning_rate": 4.9379667482563066e-05, + "loss": 0.7172, + "step": 2550 + }, + { + "epoch": 0.35929824561403506, + "grad_norm": 0.8040406703948975, + "learning_rate": 4.937477455995839e-05, + "loss": 0.5563, + "step": 2560 + }, + { + "epoch": 0.3607017543859649, + "grad_norm": 0.9026057720184326, + "learning_rate": 4.9369862660756286e-05, + "loss": 0.7217, + "step": 2570 + }, + { + "epoch": 0.36210526315789476, + "grad_norm": 0.9877568483352661, + "learning_rate": 4.9364931788780835e-05, + "loss": 0.6424, + "step": 2580 + }, + { + "epoch": 0.36350877192982456, + "grad_norm": 0.8766788244247437, + "learning_rate": 4.9359981947870874e-05, + "loss": 0.6449, + "step": 2590 + }, + { + "epoch": 0.3649122807017544, + "grad_norm": 1.3229867219924927, + "learning_rate": 4.9355013141880045e-05, + "loss": 0.7963, + "step": 2600 + }, + { + "epoch": 0.3663157894736842, + "grad_norm": 1.6900445222854614, + "learning_rate": 4.9350025374676725e-05, + "loss": 0.716, + "step": 2610 + }, + { + "epoch": 0.36771929824561406, + "grad_norm": 1.055550217628479, + "learning_rate": 4.934501865014405e-05, + "loss": 0.5228, + "step": 2620 + }, + { + "epoch": 0.36912280701754385, + "grad_norm": 0.8242397904396057, + "learning_rate": 4.933999297217994e-05, + "loss": 0.6206, + "step": 2630 + }, + { + "epoch": 0.3705263157894737, + "grad_norm": 0.9964637756347656, + "learning_rate": 4.933494834469706e-05, + "loss": 0.6324, + "step": 2640 + }, + { + "epoch": 0.3719298245614035, + "grad_norm": 1.4224967956542969, + "learning_rate": 4.9329884771622817e-05, + "loss": 0.7658, + "step": 2650 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.1131879091262817, + "learning_rate": 4.9324802256899385e-05, + "loss": 0.6711, + "step": 2660 + }, + { + "epoch": 0.37473684210526315, + "grad_norm": 0.7792202830314636, + "learning_rate": 4.931970080448366e-05, + "loss": 0.5751, + "step": 2670 + }, + { + "epoch": 0.376140350877193, + "grad_norm": 0.7242644429206848, + "learning_rate": 4.931458041834731e-05, + "loss": 0.6772, + "step": 2680 + }, + { + "epoch": 0.3775438596491228, + "grad_norm": 0.8322226405143738, + "learning_rate": 4.9309441102476734e-05, + "loss": 0.6141, + "step": 2690 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 1.1265790462493896, + "learning_rate": 4.930428286087306e-05, + "loss": 0.627, + "step": 2700 + }, + { + "epoch": 0.38035087719298244, + "grad_norm": 1.1249980926513672, + "learning_rate": 4.929910569755215e-05, + "loss": 0.6991, + "step": 2710 + }, + { + "epoch": 0.3817543859649123, + "grad_norm": 1.5213415622711182, + "learning_rate": 4.929390961654462e-05, + "loss": 0.6379, + "step": 2720 + }, + { + "epoch": 0.3831578947368421, + "grad_norm": 0.9948049783706665, + "learning_rate": 4.9288694621895776e-05, + "loss": 0.673, + "step": 2730 + }, + { + "epoch": 0.38456140350877194, + "grad_norm": 1.249971866607666, + "learning_rate": 4.928346071766569e-05, + "loss": 0.6562, + "step": 2740 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 1.5983259677886963, + "learning_rate": 4.927820790792912e-05, + "loss": 0.6517, + "step": 2750 + }, + { + "epoch": 0.3873684210526316, + "grad_norm": 1.1207720041275024, + "learning_rate": 4.9272936196775565e-05, + "loss": 0.6506, + "step": 2760 + }, + { + "epoch": 0.3887719298245614, + "grad_norm": 1.2459056377410889, + "learning_rate": 4.926764558830923e-05, + "loss": 0.7087, + "step": 2770 + }, + { + "epoch": 0.39017543859649123, + "grad_norm": 2.858981132507324, + "learning_rate": 4.926233608664904e-05, + "loss": 0.6409, + "step": 2780 + }, + { + "epoch": 0.391578947368421, + "grad_norm": 1.2133064270019531, + "learning_rate": 4.9257007695928624e-05, + "loss": 0.7131, + "step": 2790 + }, + { + "epoch": 0.3929824561403509, + "grad_norm": 1.264398455619812, + "learning_rate": 4.925166042029631e-05, + "loss": 0.7967, + "step": 2800 + }, + { + "epoch": 0.39438596491228073, + "grad_norm": 0.8172046542167664, + "learning_rate": 4.924629426391515e-05, + "loss": 0.6273, + "step": 2810 + }, + { + "epoch": 0.3957894736842105, + "grad_norm": 0.5741508603096008, + "learning_rate": 4.924090923096286e-05, + "loss": 0.6419, + "step": 2820 + }, + { + "epoch": 0.3971929824561404, + "grad_norm": 0.8728544116020203, + "learning_rate": 4.923550532563189e-05, + "loss": 0.6296, + "step": 2830 + }, + { + "epoch": 0.39859649122807017, + "grad_norm": 0.6913738250732422, + "learning_rate": 4.923008255212935e-05, + "loss": 0.6323, + "step": 2840 + }, + { + "epoch": 0.4, + "grad_norm": 1.0395629405975342, + "learning_rate": 4.922464091467707e-05, + "loss": 0.6613, + "step": 2850 + }, + { + "epoch": 0.4014035087719298, + "grad_norm": 0.6149466633796692, + "learning_rate": 4.921918041751155e-05, + "loss": 0.6119, + "step": 2860 + }, + { + "epoch": 0.40280701754385967, + "grad_norm": 0.8594980239868164, + "learning_rate": 4.9213701064883966e-05, + "loss": 0.6575, + "step": 2870 + }, + { + "epoch": 0.40421052631578946, + "grad_norm": 1.0025339126586914, + "learning_rate": 4.9208202861060185e-05, + "loss": 0.7369, + "step": 2880 + }, + { + "epoch": 0.4056140350877193, + "grad_norm": 1.1241748332977295, + "learning_rate": 4.920268581032074e-05, + "loss": 0.6551, + "step": 2890 + }, + { + "epoch": 0.4070175438596491, + "grad_norm": 0.7128563523292542, + "learning_rate": 4.919714991696086e-05, + "loss": 0.6584, + "step": 2900 + }, + { + "epoch": 0.40842105263157896, + "grad_norm": 0.5740714073181152, + "learning_rate": 4.9191595185290414e-05, + "loss": 0.6674, + "step": 2910 + }, + { + "epoch": 0.40982456140350876, + "grad_norm": 0.9508911371231079, + "learning_rate": 4.918602161963396e-05, + "loss": 0.7091, + "step": 2920 + }, + { + "epoch": 0.4112280701754386, + "grad_norm": 1.1646149158477783, + "learning_rate": 4.9180429224330706e-05, + "loss": 0.5862, + "step": 2930 + }, + { + "epoch": 0.4126315789473684, + "grad_norm": 1.2261298894882202, + "learning_rate": 4.917481800373451e-05, + "loss": 0.731, + "step": 2940 + }, + { + "epoch": 0.41403508771929826, + "grad_norm": 0.6014220714569092, + "learning_rate": 4.916918796221393e-05, + "loss": 0.6716, + "step": 2950 + }, + { + "epoch": 0.41543859649122805, + "grad_norm": 1.0764710903167725, + "learning_rate": 4.9163539104152124e-05, + "loss": 0.6427, + "step": 2960 + }, + { + "epoch": 0.4168421052631579, + "grad_norm": 0.7629368901252747, + "learning_rate": 4.9157871433946925e-05, + "loss": 0.6184, + "step": 2970 + }, + { + "epoch": 0.4182456140350877, + "grad_norm": 0.8151566982269287, + "learning_rate": 4.9152184956010813e-05, + "loss": 0.6208, + "step": 2980 + }, + { + "epoch": 0.41964912280701755, + "grad_norm": 1.4884957075119019, + "learning_rate": 4.91464796747709e-05, + "loss": 0.6517, + "step": 2990 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.6417763233184814, + "learning_rate": 4.914075559466895e-05, + "loss": 0.6656, + "step": 3000 + }, + { + "epoch": 0.4224561403508772, + "grad_norm": 0.8164128065109253, + "learning_rate": 4.913501272016135e-05, + "loss": 0.6605, + "step": 3010 + }, + { + "epoch": 0.423859649122807, + "grad_norm": 0.9845851063728333, + "learning_rate": 4.9129251055719125e-05, + "loss": 0.6348, + "step": 3020 + }, + { + "epoch": 0.42526315789473684, + "grad_norm": 0.7174735069274902, + "learning_rate": 4.912347060582793e-05, + "loss": 0.6735, + "step": 3030 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 1.0722357034683228, + "learning_rate": 4.911767137498805e-05, + "loss": 0.604, + "step": 3040 + }, + { + "epoch": 0.4280701754385965, + "grad_norm": 0.8019692897796631, + "learning_rate": 4.911185336771437e-05, + "loss": 0.659, + "step": 3050 + }, + { + "epoch": 0.42947368421052634, + "grad_norm": 0.6950979232788086, + "learning_rate": 4.910601658853642e-05, + "loss": 0.6545, + "step": 3060 + }, + { + "epoch": 0.43087719298245614, + "grad_norm": 1.0000766515731812, + "learning_rate": 4.910016104199833e-05, + "loss": 0.6057, + "step": 3070 + }, + { + "epoch": 0.432280701754386, + "grad_norm": 1.0175904035568237, + "learning_rate": 4.909428673265884e-05, + "loss": 0.5503, + "step": 3080 + }, + { + "epoch": 0.4336842105263158, + "grad_norm": 1.158728003501892, + "learning_rate": 4.90883936650913e-05, + "loss": 0.6534, + "step": 3090 + }, + { + "epoch": 0.43508771929824563, + "grad_norm": 0.9984928369522095, + "learning_rate": 4.908248184388367e-05, + "loss": 0.6696, + "step": 3100 + }, + { + "epoch": 0.43649122807017543, + "grad_norm": 0.8490105867385864, + "learning_rate": 4.90765512736385e-05, + "loss": 0.5936, + "step": 3110 + }, + { + "epoch": 0.4378947368421053, + "grad_norm": 1.14065420627594, + "learning_rate": 4.907060195897296e-05, + "loss": 0.6154, + "step": 3120 + }, + { + "epoch": 0.4392982456140351, + "grad_norm": 1.0342949628829956, + "learning_rate": 4.906463390451878e-05, + "loss": 0.7975, + "step": 3130 + }, + { + "epoch": 0.44070175438596493, + "grad_norm": 1.2673470973968506, + "learning_rate": 4.9058647114922286e-05, + "loss": 0.6742, + "step": 3140 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.7902513146400452, + "learning_rate": 4.9052641594844416e-05, + "loss": 0.6221, + "step": 3150 + }, + { + "epoch": 0.4435087719298246, + "grad_norm": 0.813940167427063, + "learning_rate": 4.9046617348960666e-05, + "loss": 0.7789, + "step": 3160 + }, + { + "epoch": 0.44491228070175437, + "grad_norm": 0.9385407567024231, + "learning_rate": 4.904057438196111e-05, + "loss": 0.6668, + "step": 3170 + }, + { + "epoch": 0.4463157894736842, + "grad_norm": 1.005690574645996, + "learning_rate": 4.903451269855043e-05, + "loss": 0.6732, + "step": 3180 + }, + { + "epoch": 0.447719298245614, + "grad_norm": 1.1185845136642456, + "learning_rate": 4.9028432303447826e-05, + "loss": 0.652, + "step": 3190 + }, + { + "epoch": 0.44912280701754387, + "grad_norm": 1.234397292137146, + "learning_rate": 4.902233320138711e-05, + "loss": 0.7734, + "step": 3200 + }, + { + "epoch": 0.45052631578947366, + "grad_norm": 0.75343918800354, + "learning_rate": 4.901621539711664e-05, + "loss": 0.6524, + "step": 3210 + }, + { + "epoch": 0.4519298245614035, + "grad_norm": 0.7265051603317261, + "learning_rate": 4.901007889539933e-05, + "loss": 0.5631, + "step": 3220 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.8995214700698853, + "learning_rate": 4.900392370101266e-05, + "loss": 0.6949, + "step": 3230 + }, + { + "epoch": 0.45473684210526316, + "grad_norm": 1.1753424406051636, + "learning_rate": 4.899774981874867e-05, + "loss": 0.7445, + "step": 3240 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 0.9139629602432251, + "learning_rate": 4.8991557253413924e-05, + "loss": 0.6329, + "step": 3250 + }, + { + "epoch": 0.4575438596491228, + "grad_norm": 1.145979881286621, + "learning_rate": 4.8985346009829546e-05, + "loss": 0.6808, + "step": 3260 + }, + { + "epoch": 0.4589473684210526, + "grad_norm": 0.5931209921836853, + "learning_rate": 4.8979116092831223e-05, + "loss": 0.6464, + "step": 3270 + }, + { + "epoch": 0.46035087719298246, + "grad_norm": 0.9794625639915466, + "learning_rate": 4.897286750726913e-05, + "loss": 0.6997, + "step": 3280 + }, + { + "epoch": 0.4617543859649123, + "grad_norm": 1.121286153793335, + "learning_rate": 4.8966600258008024e-05, + "loss": 0.642, + "step": 3290 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.8356245160102844, + "learning_rate": 4.896031434992717e-05, + "loss": 0.651, + "step": 3300 + }, + { + "epoch": 0.46456140350877195, + "grad_norm": 0.8175771832466125, + "learning_rate": 4.8954009787920365e-05, + "loss": 0.6899, + "step": 3310 + }, + { + "epoch": 0.46596491228070175, + "grad_norm": 1.083617925643921, + "learning_rate": 4.894768657689592e-05, + "loss": 0.7559, + "step": 3320 + }, + { + "epoch": 0.4673684210526316, + "grad_norm": 1.0625582933425903, + "learning_rate": 4.8941344721776675e-05, + "loss": 0.6473, + "step": 3330 + }, + { + "epoch": 0.4687719298245614, + "grad_norm": 0.9211772680282593, + "learning_rate": 4.893498422749997e-05, + "loss": 0.726, + "step": 3340 + }, + { + "epoch": 0.47017543859649125, + "grad_norm": 0.8347317576408386, + "learning_rate": 4.8928605099017696e-05, + "loss": 0.6, + "step": 3350 + }, + { + "epoch": 0.47157894736842104, + "grad_norm": 1.115190029144287, + "learning_rate": 4.89222073412962e-05, + "loss": 0.6202, + "step": 3360 + }, + { + "epoch": 0.4729824561403509, + "grad_norm": 1.101366400718689, + "learning_rate": 4.8915790959316356e-05, + "loss": 0.6717, + "step": 3370 + }, + { + "epoch": 0.4743859649122807, + "grad_norm": 0.7661691308021545, + "learning_rate": 4.890935595807355e-05, + "loss": 0.6328, + "step": 3380 + }, + { + "epoch": 0.47578947368421054, + "grad_norm": 0.8245850205421448, + "learning_rate": 4.890290234257764e-05, + "loss": 0.7271, + "step": 3390 + }, + { + "epoch": 0.47719298245614034, + "grad_norm": 1.0110929012298584, + "learning_rate": 4.889643011785299e-05, + "loss": 0.582, + "step": 3400 + }, + { + "epoch": 0.4785964912280702, + "grad_norm": 0.7848758697509766, + "learning_rate": 4.888993928893846e-05, + "loss": 0.6851, + "step": 3410 + }, + { + "epoch": 0.48, + "grad_norm": 0.7310847640037537, + "learning_rate": 4.888342986088736e-05, + "loss": 0.583, + "step": 3420 + }, + { + "epoch": 0.48140350877192983, + "grad_norm": 1.3532679080963135, + "learning_rate": 4.887690183876752e-05, + "loss": 0.6261, + "step": 3430 + }, + { + "epoch": 0.48280701754385963, + "grad_norm": 1.0199493169784546, + "learning_rate": 4.887035522766122e-05, + "loss": 0.6563, + "step": 3440 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.7497562766075134, + "learning_rate": 4.886379003266523e-05, + "loss": 0.5178, + "step": 3450 + }, + { + "epoch": 0.4856140350877193, + "grad_norm": 0.8139173984527588, + "learning_rate": 4.885720625889078e-05, + "loss": 0.824, + "step": 3460 + }, + { + "epoch": 0.4870175438596491, + "grad_norm": 0.6662510633468628, + "learning_rate": 4.8850603911463556e-05, + "loss": 0.6821, + "step": 3470 + }, + { + "epoch": 0.4884210526315789, + "grad_norm": 0.9491138458251953, + "learning_rate": 4.8843982995523704e-05, + "loss": 0.6955, + "step": 3480 + }, + { + "epoch": 0.4898245614035088, + "grad_norm": 0.7988129258155823, + "learning_rate": 4.883734351622586e-05, + "loss": 0.6447, + "step": 3490 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 1.0620664358139038, + "learning_rate": 4.8830685478739057e-05, + "loss": 0.6454, + "step": 3500 + }, + { + "epoch": 0.4926315789473684, + "grad_norm": 0.7549204230308533, + "learning_rate": 4.8824008888246834e-05, + "loss": 0.6676, + "step": 3510 + }, + { + "epoch": 0.49403508771929827, + "grad_norm": 0.9870264530181885, + "learning_rate": 4.8817313749947115e-05, + "loss": 0.5715, + "step": 3520 + }, + { + "epoch": 0.49543859649122807, + "grad_norm": 0.7582098245620728, + "learning_rate": 4.881060006905232e-05, + "loss": 0.6479, + "step": 3530 + }, + { + "epoch": 0.4968421052631579, + "grad_norm": 0.6988912224769592, + "learning_rate": 4.880386785078925e-05, + "loss": 0.6208, + "step": 3540 + }, + { + "epoch": 0.4982456140350877, + "grad_norm": 0.7568824291229248, + "learning_rate": 4.87971171003992e-05, + "loss": 0.6503, + "step": 3550 + }, + { + "epoch": 0.49964912280701756, + "grad_norm": 1.2903584241867065, + "learning_rate": 4.879034782313786e-05, + "loss": 0.6525, + "step": 3560 + }, + { + "epoch": 0.5010526315789474, + "grad_norm": 0.7582905888557434, + "learning_rate": 4.878356002427532e-05, + "loss": 0.633, + "step": 3570 + }, + { + "epoch": 0.5024561403508772, + "grad_norm": 0.9976963400840759, + "learning_rate": 4.877675370909612e-05, + "loss": 0.6184, + "step": 3580 + }, + { + "epoch": 0.503859649122807, + "grad_norm": 0.8688436150550842, + "learning_rate": 4.876992888289923e-05, + "loss": 0.64, + "step": 3590 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 1.4120594263076782, + "learning_rate": 4.876308555099799e-05, + "loss": 0.6238, + "step": 3600 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 1.190382719039917, + "learning_rate": 4.875622371872017e-05, + "loss": 0.7433, + "step": 3610 + }, + { + "epoch": 0.5080701754385964, + "grad_norm": 0.8115689754486084, + "learning_rate": 4.874934339140795e-05, + "loss": 0.7031, + "step": 3620 + }, + { + "epoch": 0.5094736842105263, + "grad_norm": 0.8457335233688354, + "learning_rate": 4.8742444574417904e-05, + "loss": 0.5443, + "step": 3630 + }, + { + "epoch": 0.5108771929824562, + "grad_norm": 0.8754384517669678, + "learning_rate": 4.873552727312099e-05, + "loss": 0.6728, + "step": 3640 + }, + { + "epoch": 0.512280701754386, + "grad_norm": 1.2087777853012085, + "learning_rate": 4.872859149290256e-05, + "loss": 0.6321, + "step": 3650 + }, + { + "epoch": 0.5136842105263157, + "grad_norm": 1.0635002851486206, + "learning_rate": 4.872163723916237e-05, + "loss": 0.6301, + "step": 3660 + }, + { + "epoch": 0.5150877192982456, + "grad_norm": 1.1686186790466309, + "learning_rate": 4.871466451731453e-05, + "loss": 0.6991, + "step": 3670 + }, + { + "epoch": 0.5164912280701754, + "grad_norm": 1.1546950340270996, + "learning_rate": 4.870767333278755e-05, + "loss": 0.5503, + "step": 3680 + }, + { + "epoch": 0.5178947368421053, + "grad_norm": 0.8768120408058167, + "learning_rate": 4.87006636910243e-05, + "loss": 0.6342, + "step": 3690 + }, + { + "epoch": 0.519298245614035, + "grad_norm": 0.8353332281112671, + "learning_rate": 4.8693635597482045e-05, + "loss": 0.5933, + "step": 3700 + }, + { + "epoch": 0.5207017543859649, + "grad_norm": 0.8518616557121277, + "learning_rate": 4.868658905763238e-05, + "loss": 0.5878, + "step": 3710 + }, + { + "epoch": 0.5221052631578947, + "grad_norm": 0.8607089519500732, + "learning_rate": 4.8679524076961284e-05, + "loss": 0.5478, + "step": 3720 + }, + { + "epoch": 0.5235087719298246, + "grad_norm": 1.3177140951156616, + "learning_rate": 4.867244066096909e-05, + "loss": 0.6024, + "step": 3730 + }, + { + "epoch": 0.5249122807017544, + "grad_norm": 1.1247279644012451, + "learning_rate": 4.866533881517046e-05, + "loss": 0.6106, + "step": 3740 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.0166698694229126, + "learning_rate": 4.865821854509445e-05, + "loss": 0.602, + "step": 3750 + }, + { + "epoch": 0.527719298245614, + "grad_norm": 0.7537686824798584, + "learning_rate": 4.865107985628442e-05, + "loss": 0.7147, + "step": 3760 + }, + { + "epoch": 0.5291228070175439, + "grad_norm": 1.1428786516189575, + "learning_rate": 4.86439227542981e-05, + "loss": 0.6561, + "step": 3770 + }, + { + "epoch": 0.5305263157894737, + "grad_norm": 1.1645269393920898, + "learning_rate": 4.863674724470751e-05, + "loss": 0.7062, + "step": 3780 + }, + { + "epoch": 0.5319298245614035, + "grad_norm": 1.128609299659729, + "learning_rate": 4.862955333309905e-05, + "loss": 0.7019, + "step": 3790 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.0182465314865112, + "learning_rate": 4.8622341025073425e-05, + "loss": 0.5702, + "step": 3800 + }, + { + "epoch": 0.5347368421052632, + "grad_norm": 0.9816009998321533, + "learning_rate": 4.861511032624567e-05, + "loss": 0.6956, + "step": 3810 + }, + { + "epoch": 0.536140350877193, + "grad_norm": 0.7931702733039856, + "learning_rate": 4.860786124224512e-05, + "loss": 0.6266, + "step": 3820 + }, + { + "epoch": 0.5375438596491228, + "grad_norm": 1.3353627920150757, + "learning_rate": 4.860059377871544e-05, + "loss": 0.6758, + "step": 3830 + }, + { + "epoch": 0.5389473684210526, + "grad_norm": 1.1476149559020996, + "learning_rate": 4.85933079413146e-05, + "loss": 0.6559, + "step": 3840 + }, + { + "epoch": 0.5403508771929825, + "grad_norm": 0.9160752892494202, + "learning_rate": 4.858600373571487e-05, + "loss": 0.6052, + "step": 3850 + }, + { + "epoch": 0.5417543859649123, + "grad_norm": 1.0451756715774536, + "learning_rate": 4.8578681167602834e-05, + "loss": 0.6119, + "step": 3860 + }, + { + "epoch": 0.5431578947368421, + "grad_norm": 0.9673342108726501, + "learning_rate": 4.8571340242679354e-05, + "loss": 0.5872, + "step": 3870 + }, + { + "epoch": 0.5445614035087719, + "grad_norm": 1.24473237991333, + "learning_rate": 4.856398096665959e-05, + "loss": 0.7302, + "step": 3880 + }, + { + "epoch": 0.5459649122807018, + "grad_norm": 0.967494547367096, + "learning_rate": 4.8556603345273e-05, + "loss": 0.6889, + "step": 3890 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.9426731467247009, + "learning_rate": 4.8549207384263305e-05, + "loss": 0.6045, + "step": 3900 + }, + { + "epoch": 0.5487719298245614, + "grad_norm": 1.033600091934204, + "learning_rate": 4.854179308938852e-05, + "loss": 0.743, + "step": 3910 + }, + { + "epoch": 0.5501754385964912, + "grad_norm": 0.9784322381019592, + "learning_rate": 4.8534360466420926e-05, + "loss": 0.6416, + "step": 3920 + }, + { + "epoch": 0.5515789473684211, + "grad_norm": 1.0500706434249878, + "learning_rate": 4.852690952114708e-05, + "loss": 0.5975, + "step": 3930 + }, + { + "epoch": 0.5529824561403509, + "grad_norm": 1.6134823560714722, + "learning_rate": 4.851944025936779e-05, + "loss": 0.7975, + "step": 3940 + }, + { + "epoch": 0.5543859649122806, + "grad_norm": 0.785410463809967, + "learning_rate": 4.851195268689813e-05, + "loss": 0.6836, + "step": 3950 + }, + { + "epoch": 0.5557894736842105, + "grad_norm": 1.15956449508667, + "learning_rate": 4.850444680956745e-05, + "loss": 0.5265, + "step": 3960 + }, + { + "epoch": 0.5571929824561404, + "grad_norm": 1.0284963846206665, + "learning_rate": 4.8496922633219314e-05, + "loss": 0.687, + "step": 3970 + }, + { + "epoch": 0.5585964912280702, + "grad_norm": 0.5753929615020752, + "learning_rate": 4.8489380163711556e-05, + "loss": 0.5644, + "step": 3980 + }, + { + "epoch": 0.56, + "grad_norm": 1.0494047403335571, + "learning_rate": 4.848181940691625e-05, + "loss": 0.6013, + "step": 3990 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 1.090614914894104, + "learning_rate": 4.8474240368719703e-05, + "loss": 0.724, + "step": 4000 + }, + { + "epoch": 0.5614035087719298, + "eval_loss": 0.6521075963973999, + "eval_runtime": 44.1632, + "eval_samples_per_second": 33.965, + "eval_steps_per_second": 8.491, + "step": 4000 + }, + { + "epoch": 0.5628070175438596, + "grad_norm": 1.593772530555725, + "learning_rate": 4.846664305502245e-05, + "loss": 0.6668, + "step": 4010 + }, + { + "epoch": 0.5642105263157895, + "grad_norm": 1.0096566677093506, + "learning_rate": 4.8459027471739284e-05, + "loss": 0.6898, + "step": 4020 + }, + { + "epoch": 0.5656140350877193, + "grad_norm": 1.126257061958313, + "learning_rate": 4.8451393624799165e-05, + "loss": 0.6639, + "step": 4030 + }, + { + "epoch": 0.5670175438596491, + "grad_norm": 1.0839751958847046, + "learning_rate": 4.844374152014532e-05, + "loss": 0.7336, + "step": 4040 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.4993619918823242, + "learning_rate": 4.843607116373518e-05, + "loss": 0.6233, + "step": 4050 + }, + { + "epoch": 0.5698245614035088, + "grad_norm": 1.6385512351989746, + "learning_rate": 4.8428382561540366e-05, + "loss": 0.6178, + "step": 4060 + }, + { + "epoch": 0.5712280701754386, + "grad_norm": 0.9295198321342468, + "learning_rate": 4.8420675719546723e-05, + "loss": 0.6121, + "step": 4070 + }, + { + "epoch": 0.5726315789473684, + "grad_norm": 1.2179811000823975, + "learning_rate": 4.8412950643754305e-05, + "loss": 0.6225, + "step": 4080 + }, + { + "epoch": 0.5740350877192982, + "grad_norm": 1.1477456092834473, + "learning_rate": 4.840520734017734e-05, + "loss": 0.6502, + "step": 4090 + }, + { + "epoch": 0.5754385964912281, + "grad_norm": 0.8792319297790527, + "learning_rate": 4.839744581484425e-05, + "loss": 0.6799, + "step": 4100 + }, + { + "epoch": 0.5768421052631579, + "grad_norm": 1.995977759361267, + "learning_rate": 4.8389666073797646e-05, + "loss": 0.7671, + "step": 4110 + }, + { + "epoch": 0.5782456140350877, + "grad_norm": 0.680174708366394, + "learning_rate": 4.8381868123094335e-05, + "loss": 0.6289, + "step": 4120 + }, + { + "epoch": 0.5796491228070175, + "grad_norm": 0.8312070369720459, + "learning_rate": 4.837405196880529e-05, + "loss": 0.6621, + "step": 4130 + }, + { + "epoch": 0.5810526315789474, + "grad_norm": 0.8448961973190308, + "learning_rate": 4.836621761701564e-05, + "loss": 0.601, + "step": 4140 + }, + { + "epoch": 0.5824561403508772, + "grad_norm": 1.1311395168304443, + "learning_rate": 4.835836507382471e-05, + "loss": 0.6818, + "step": 4150 + }, + { + "epoch": 0.583859649122807, + "grad_norm": 0.8135958313941956, + "learning_rate": 4.835049434534596e-05, + "loss": 0.6688, + "step": 4160 + }, + { + "epoch": 0.5852631578947368, + "grad_norm": 0.9292672276496887, + "learning_rate": 4.8342605437707034e-05, + "loss": 0.7652, + "step": 4170 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.1490682363510132, + "learning_rate": 4.8334698357049715e-05, + "loss": 0.5381, + "step": 4180 + }, + { + "epoch": 0.5880701754385965, + "grad_norm": 1.1863840818405151, + "learning_rate": 4.832677310952993e-05, + "loss": 0.6786, + "step": 4190 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.7175789475440979, + "learning_rate": 4.831882970131777e-05, + "loss": 0.629, + "step": 4200 + }, + { + "epoch": 0.5908771929824561, + "grad_norm": 0.898485541343689, + "learning_rate": 4.831086813859743e-05, + "loss": 0.6021, + "step": 4210 + }, + { + "epoch": 0.592280701754386, + "grad_norm": 1.0772299766540527, + "learning_rate": 4.830288842756728e-05, + "loss": 0.5706, + "step": 4220 + }, + { + "epoch": 0.5936842105263158, + "grad_norm": 0.8830444812774658, + "learning_rate": 4.8294890574439784e-05, + "loss": 0.6716, + "step": 4230 + }, + { + "epoch": 0.5950877192982457, + "grad_norm": 1.12392258644104, + "learning_rate": 4.828687458544155e-05, + "loss": 0.6315, + "step": 4240 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 1.751460075378418, + "learning_rate": 4.82788404668133e-05, + "loss": 0.5633, + "step": 4250 + }, + { + "epoch": 0.5978947368421053, + "grad_norm": 1.024601697921753, + "learning_rate": 4.827078822480987e-05, + "loss": 0.6747, + "step": 4260 + }, + { + "epoch": 0.5992982456140351, + "grad_norm": 0.8278754949569702, + "learning_rate": 4.826271786570021e-05, + "loss": 0.6555, + "step": 4270 + }, + { + "epoch": 0.600701754385965, + "grad_norm": 0.9836990237236023, + "learning_rate": 4.825462939576737e-05, + "loss": 0.5987, + "step": 4280 + }, + { + "epoch": 0.6021052631578947, + "grad_norm": 0.5657834410667419, + "learning_rate": 4.8246522821308495e-05, + "loss": 0.6753, + "step": 4290 + }, + { + "epoch": 0.6035087719298246, + "grad_norm": 1.1341723203659058, + "learning_rate": 4.823839814863484e-05, + "loss": 0.5856, + "step": 4300 + }, + { + "epoch": 0.6049122807017544, + "grad_norm": 1.311997652053833, + "learning_rate": 4.823025538407173e-05, + "loss": 0.6204, + "step": 4310 + }, + { + "epoch": 0.6063157894736843, + "grad_norm": 0.8703358173370361, + "learning_rate": 4.82220945339586e-05, + "loss": 0.5866, + "step": 4320 + }, + { + "epoch": 0.607719298245614, + "grad_norm": 0.8117982149124146, + "learning_rate": 4.8213915604648944e-05, + "loss": 0.7384, + "step": 4330 + }, + { + "epoch": 0.6091228070175438, + "grad_norm": 1.2093411684036255, + "learning_rate": 4.820571860251034e-05, + "loss": 0.7113, + "step": 4340 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.895978569984436, + "learning_rate": 4.819750353392443e-05, + "loss": 0.6544, + "step": 4350 + }, + { + "epoch": 0.6119298245614035, + "grad_norm": 0.8177430629730225, + "learning_rate": 4.818927040528693e-05, + "loss": 0.6317, + "step": 4360 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.8065016865730286, + "learning_rate": 4.818101922300762e-05, + "loss": 0.5756, + "step": 4370 + }, + { + "epoch": 0.6147368421052631, + "grad_norm": 0.9234448075294495, + "learning_rate": 4.8172749993510315e-05, + "loss": 0.687, + "step": 4380 + }, + { + "epoch": 0.616140350877193, + "grad_norm": 1.0152438879013062, + "learning_rate": 4.81644627232329e-05, + "loss": 0.6573, + "step": 4390 + }, + { + "epoch": 0.6175438596491228, + "grad_norm": 0.8767795562744141, + "learning_rate": 4.81561574186273e-05, + "loss": 0.6787, + "step": 4400 + }, + { + "epoch": 0.6189473684210526, + "grad_norm": 0.8680139183998108, + "learning_rate": 4.814783408615948e-05, + "loss": 0.5503, + "step": 4410 + }, + { + "epoch": 0.6203508771929824, + "grad_norm": 0.9502211213111877, + "learning_rate": 4.813949273230944e-05, + "loss": 0.6495, + "step": 4420 + }, + { + "epoch": 0.6217543859649123, + "grad_norm": 0.8180057406425476, + "learning_rate": 4.8131133363571214e-05, + "loss": 0.5845, + "step": 4430 + }, + { + "epoch": 0.6231578947368421, + "grad_norm": 1.3863866329193115, + "learning_rate": 4.8122755986452845e-05, + "loss": 0.6093, + "step": 4440 + }, + { + "epoch": 0.624561403508772, + "grad_norm": 0.7499920129776001, + "learning_rate": 4.8114360607476416e-05, + "loss": 0.6465, + "step": 4450 + }, + { + "epoch": 0.6259649122807017, + "grad_norm": 0.7183496952056885, + "learning_rate": 4.810594723317801e-05, + "loss": 0.6228, + "step": 4460 + }, + { + "epoch": 0.6273684210526316, + "grad_norm": 1.3374441862106323, + "learning_rate": 4.809751587010774e-05, + "loss": 0.657, + "step": 4470 + }, + { + "epoch": 0.6287719298245614, + "grad_norm": 0.8970227837562561, + "learning_rate": 4.80890665248297e-05, + "loss": 0.6068, + "step": 4480 + }, + { + "epoch": 0.6301754385964913, + "grad_norm": 1.075203537940979, + "learning_rate": 4.808059920392201e-05, + "loss": 0.7177, + "step": 4490 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.3227583169937134, + "learning_rate": 4.807211391397674e-05, + "loss": 0.6333, + "step": 4500 + }, + { + "epoch": 0.6329824561403509, + "grad_norm": 0.8684366941452026, + "learning_rate": 4.806361066160001e-05, + "loss": 0.6396, + "step": 4510 + }, + { + "epoch": 0.6343859649122807, + "grad_norm": 0.9884424209594727, + "learning_rate": 4.8055089453411875e-05, + "loss": 0.622, + "step": 4520 + }, + { + "epoch": 0.6357894736842106, + "grad_norm": 1.2879207134246826, + "learning_rate": 4.80465502960464e-05, + "loss": 0.6551, + "step": 4530 + }, + { + "epoch": 0.6371929824561403, + "grad_norm": 0.8450446724891663, + "learning_rate": 4.80379931961516e-05, + "loss": 0.6874, + "step": 4540 + }, + { + "epoch": 0.6385964912280702, + "grad_norm": 1.0679776668548584, + "learning_rate": 4.8029418160389484e-05, + "loss": 0.5982, + "step": 4550 + }, + { + "epoch": 0.64, + "grad_norm": 0.7384183406829834, + "learning_rate": 4.8020825195435994e-05, + "loss": 0.5541, + "step": 4560 + }, + { + "epoch": 0.6414035087719299, + "grad_norm": 0.8015978336334229, + "learning_rate": 4.8012214307981064e-05, + "loss": 0.7297, + "step": 4570 + }, + { + "epoch": 0.6428070175438596, + "grad_norm": 0.7276405692100525, + "learning_rate": 4.800358550472855e-05, + "loss": 0.7694, + "step": 4580 + }, + { + "epoch": 0.6442105263157895, + "grad_norm": 0.7692060470581055, + "learning_rate": 4.799493879239628e-05, + "loss": 0.6194, + "step": 4590 + }, + { + "epoch": 0.6456140350877193, + "grad_norm": 1.2254407405853271, + "learning_rate": 4.7986274177716024e-05, + "loss": 0.6358, + "step": 4600 + }, + { + "epoch": 0.6470175438596492, + "grad_norm": 1.0495854616165161, + "learning_rate": 4.797759166743346e-05, + "loss": 0.6828, + "step": 4610 + }, + { + "epoch": 0.6484210526315789, + "grad_norm": 0.9298211932182312, + "learning_rate": 4.7968891268308246e-05, + "loss": 0.7163, + "step": 4620 + }, + { + "epoch": 0.6498245614035087, + "grad_norm": 0.9762528538703918, + "learning_rate": 4.796017298711391e-05, + "loss": 0.5935, + "step": 4630 + }, + { + "epoch": 0.6512280701754386, + "grad_norm": 1.0231860876083374, + "learning_rate": 4.795143683063797e-05, + "loss": 0.5696, + "step": 4640 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 1.1608182191848755, + "learning_rate": 4.7942682805681797e-05, + "loss": 0.5665, + "step": 4650 + }, + { + "epoch": 0.6540350877192982, + "grad_norm": 0.6527351140975952, + "learning_rate": 4.79339109190607e-05, + "loss": 0.6242, + "step": 4660 + }, + { + "epoch": 0.655438596491228, + "grad_norm": 0.8694155812263489, + "learning_rate": 4.792512117760391e-05, + "loss": 0.6259, + "step": 4670 + }, + { + "epoch": 0.6568421052631579, + "grad_norm": 0.9847631454467773, + "learning_rate": 4.7916313588154514e-05, + "loss": 0.6757, + "step": 4680 + }, + { + "epoch": 0.6582456140350877, + "grad_norm": 0.5999444127082825, + "learning_rate": 4.790748815756954e-05, + "loss": 0.6324, + "step": 4690 + }, + { + "epoch": 0.6596491228070176, + "grad_norm": 1.4817160367965698, + "learning_rate": 4.78986448927199e-05, + "loss": 0.5834, + "step": 4700 + }, + { + "epoch": 0.6610526315789473, + "grad_norm": 1.3592370748519897, + "learning_rate": 4.788978380049036e-05, + "loss": 0.6985, + "step": 4710 + }, + { + "epoch": 0.6624561403508772, + "grad_norm": 0.9479141235351562, + "learning_rate": 4.78809048877796e-05, + "loss": 0.6595, + "step": 4720 + }, + { + "epoch": 0.663859649122807, + "grad_norm": 1.3383686542510986, + "learning_rate": 4.787200816150014e-05, + "loss": 0.7508, + "step": 4730 + }, + { + "epoch": 0.6652631578947369, + "grad_norm": 1.0097548961639404, + "learning_rate": 4.786309362857839e-05, + "loss": 0.6452, + "step": 4740 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.9222456812858582, + "learning_rate": 4.785416129595463e-05, + "loss": 0.6171, + "step": 4750 + }, + { + "epoch": 0.6680701754385965, + "grad_norm": 0.9993833303451538, + "learning_rate": 4.784521117058298e-05, + "loss": 0.654, + "step": 4760 + }, + { + "epoch": 0.6694736842105263, + "grad_norm": 0.6470888257026672, + "learning_rate": 4.7836243259431425e-05, + "loss": 0.6674, + "step": 4770 + }, + { + "epoch": 0.6708771929824562, + "grad_norm": 0.8498440980911255, + "learning_rate": 4.7827257569481776e-05, + "loss": 0.6319, + "step": 4780 + }, + { + "epoch": 0.6722807017543859, + "grad_norm": 0.9220410585403442, + "learning_rate": 4.781825410772972e-05, + "loss": 0.5856, + "step": 4790 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 1.065016746520996, + "learning_rate": 4.780923288118475e-05, + "loss": 0.5919, + "step": 4800 + }, + { + "epoch": 0.6750877192982456, + "grad_norm": 0.7213327288627625, + "learning_rate": 4.78001938968702e-05, + "loss": 0.6192, + "step": 4810 + }, + { + "epoch": 0.6764912280701755, + "grad_norm": 0.7141574025154114, + "learning_rate": 4.779113716182323e-05, + "loss": 0.6628, + "step": 4820 + }, + { + "epoch": 0.6778947368421052, + "grad_norm": 0.7694927453994751, + "learning_rate": 4.778206268309482e-05, + "loss": 0.6451, + "step": 4830 + }, + { + "epoch": 0.6792982456140351, + "grad_norm": 1.2971090078353882, + "learning_rate": 4.777297046774977e-05, + "loss": 0.6823, + "step": 4840 + }, + { + "epoch": 0.6807017543859649, + "grad_norm": 0.7971644401550293, + "learning_rate": 4.7763860522866665e-05, + "loss": 0.6916, + "step": 4850 + }, + { + "epoch": 0.6821052631578948, + "grad_norm": 0.8853887319564819, + "learning_rate": 4.775473285553792e-05, + "loss": 0.5936, + "step": 4860 + }, + { + "epoch": 0.6835087719298245, + "grad_norm": 1.3222453594207764, + "learning_rate": 4.774558747286973e-05, + "loss": 0.7202, + "step": 4870 + }, + { + "epoch": 0.6849122807017544, + "grad_norm": 1.187171220779419, + "learning_rate": 4.77364243819821e-05, + "loss": 0.6405, + "step": 4880 + }, + { + "epoch": 0.6863157894736842, + "grad_norm": 0.8649610280990601, + "learning_rate": 4.7727243590008806e-05, + "loss": 0.6704, + "step": 4890 + }, + { + "epoch": 0.6877192982456141, + "grad_norm": 0.9361883401870728, + "learning_rate": 4.771804510409741e-05, + "loss": 0.6304, + "step": 4900 + }, + { + "epoch": 0.6891228070175439, + "grad_norm": 0.7870001196861267, + "learning_rate": 4.7708828931409236e-05, + "loss": 0.6645, + "step": 4910 + }, + { + "epoch": 0.6905263157894737, + "grad_norm": 1.0028226375579834, + "learning_rate": 4.769959507911941e-05, + "loss": 0.7018, + "step": 4920 + }, + { + "epoch": 0.6919298245614035, + "grad_norm": 0.7500180602073669, + "learning_rate": 4.769034355441678e-05, + "loss": 0.5191, + "step": 4930 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.9766993522644043, + "learning_rate": 4.7681074364503995e-05, + "loss": 0.6723, + "step": 4940 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 1.3899115324020386, + "learning_rate": 4.767178751659743e-05, + "loss": 0.7069, + "step": 4950 + }, + { + "epoch": 0.696140350877193, + "grad_norm": 1.3812363147735596, + "learning_rate": 4.7662483017927215e-05, + "loss": 0.6333, + "step": 4960 + }, + { + "epoch": 0.6975438596491228, + "grad_norm": 0.6967772841453552, + "learning_rate": 4.765316087573722e-05, + "loss": 0.7116, + "step": 4970 + }, + { + "epoch": 0.6989473684210527, + "grad_norm": 1.235410213470459, + "learning_rate": 4.7643821097285044e-05, + "loss": 0.5517, + "step": 4980 + }, + { + "epoch": 0.7003508771929825, + "grad_norm": 1.0389471054077148, + "learning_rate": 4.763446368984205e-05, + "loss": 0.6856, + "step": 4990 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.9552194476127625, + "learning_rate": 4.762508866069327e-05, + "loss": 0.6119, + "step": 5000 + }, + { + "epoch": 0.7031578947368421, + "grad_norm": 0.8866641521453857, + "learning_rate": 4.7615696017137504e-05, + "loss": 0.5645, + "step": 5010 + }, + { + "epoch": 0.7045614035087719, + "grad_norm": 1.0465891361236572, + "learning_rate": 4.760628576648723e-05, + "loss": 0.6506, + "step": 5020 + }, + { + "epoch": 0.7059649122807018, + "grad_norm": 1.104183316230774, + "learning_rate": 4.759685791606868e-05, + "loss": 0.6092, + "step": 5030 + }, + { + "epoch": 0.7073684210526315, + "grad_norm": 0.8748829364776611, + "learning_rate": 4.758741247322174e-05, + "loss": 0.7659, + "step": 5040 + }, + { + "epoch": 0.7087719298245614, + "grad_norm": 0.9573276042938232, + "learning_rate": 4.7577949445300004e-05, + "loss": 0.5774, + "step": 5050 + }, + { + "epoch": 0.7101754385964912, + "grad_norm": 0.9269713759422302, + "learning_rate": 4.756846883967077e-05, + "loss": 0.6234, + "step": 5060 + }, + { + "epoch": 0.7115789473684211, + "grad_norm": 0.6953681111335754, + "learning_rate": 4.755897066371502e-05, + "loss": 0.6456, + "step": 5070 + }, + { + "epoch": 0.7129824561403508, + "grad_norm": 0.6628289818763733, + "learning_rate": 4.754945492482741e-05, + "loss": 0.54, + "step": 5080 + }, + { + "epoch": 0.7143859649122807, + "grad_norm": 0.7972025871276855, + "learning_rate": 4.7539921630416264e-05, + "loss": 0.5695, + "step": 5090 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 1.032006859779358, + "learning_rate": 4.7530370787903576e-05, + "loss": 0.6748, + "step": 5100 + }, + { + "epoch": 0.7171929824561404, + "grad_norm": 0.7866501212120056, + "learning_rate": 4.7520802404725007e-05, + "loss": 0.595, + "step": 5110 + }, + { + "epoch": 0.7185964912280701, + "grad_norm": 1.0693832635879517, + "learning_rate": 4.751121648832987e-05, + "loss": 0.641, + "step": 5120 + }, + { + "epoch": 0.72, + "grad_norm": 1.0331542491912842, + "learning_rate": 4.750161304618114e-05, + "loss": 0.6345, + "step": 5130 + }, + { + "epoch": 0.7214035087719298, + "grad_norm": 0.6064502000808716, + "learning_rate": 4.749199208575541e-05, + "loss": 0.5997, + "step": 5140 + }, + { + "epoch": 0.7228070175438597, + "grad_norm": 1.1691397428512573, + "learning_rate": 4.748235361454293e-05, + "loss": 0.6529, + "step": 5150 + }, + { + "epoch": 0.7242105263157895, + "grad_norm": 0.7956925630569458, + "learning_rate": 4.7472697640047594e-05, + "loss": 0.5668, + "step": 5160 + }, + { + "epoch": 0.7256140350877193, + "grad_norm": 0.9746783971786499, + "learning_rate": 4.7463024169786895e-05, + "loss": 0.6433, + "step": 5170 + }, + { + "epoch": 0.7270175438596491, + "grad_norm": 1.2105709314346313, + "learning_rate": 4.745333321129197e-05, + "loss": 0.6749, + "step": 5180 + }, + { + "epoch": 0.728421052631579, + "grad_norm": 0.7860882878303528, + "learning_rate": 4.744362477210755e-05, + "loss": 0.7041, + "step": 5190 + }, + { + "epoch": 0.7298245614035088, + "grad_norm": 1.1629239320755005, + "learning_rate": 4.7433898859792e-05, + "loss": 0.5598, + "step": 5200 + }, + { + "epoch": 0.7312280701754386, + "grad_norm": 1.1319113969802856, + "learning_rate": 4.742415548191728e-05, + "loss": 0.6433, + "step": 5210 + }, + { + "epoch": 0.7326315789473684, + "grad_norm": 0.8640940189361572, + "learning_rate": 4.741439464606893e-05, + "loss": 0.6715, + "step": 5220 + }, + { + "epoch": 0.7340350877192983, + "grad_norm": 0.7730684280395508, + "learning_rate": 4.740461635984609e-05, + "loss": 0.6391, + "step": 5230 + }, + { + "epoch": 0.7354385964912281, + "grad_norm": 1.2042145729064941, + "learning_rate": 4.739482063086152e-05, + "loss": 0.5834, + "step": 5240 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.9434259533882141, + "learning_rate": 4.73850074667415e-05, + "loss": 0.7835, + "step": 5250 + }, + { + "epoch": 0.7382456140350877, + "grad_norm": 0.8331650495529175, + "learning_rate": 4.737517687512593e-05, + "loss": 0.6128, + "step": 5260 + }, + { + "epoch": 0.7396491228070176, + "grad_norm": 0.8063735365867615, + "learning_rate": 4.7365328863668256e-05, + "loss": 0.655, + "step": 5270 + }, + { + "epoch": 0.7410526315789474, + "grad_norm": 0.6377186179161072, + "learning_rate": 4.735546344003551e-05, + "loss": 0.6506, + "step": 5280 + }, + { + "epoch": 0.7424561403508771, + "grad_norm": 0.8817654252052307, + "learning_rate": 4.734558061190824e-05, + "loss": 0.6984, + "step": 5290 + }, + { + "epoch": 0.743859649122807, + "grad_norm": 1.2554540634155273, + "learning_rate": 4.733568038698057e-05, + "loss": 0.7401, + "step": 5300 + }, + { + "epoch": 0.7452631578947368, + "grad_norm": 1.0858135223388672, + "learning_rate": 4.732576277296017e-05, + "loss": 0.6432, + "step": 5310 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 1.1962653398513794, + "learning_rate": 4.731582777756825e-05, + "loss": 0.6687, + "step": 5320 + }, + { + "epoch": 0.7480701754385964, + "grad_norm": 1.0213031768798828, + "learning_rate": 4.730587540853954e-05, + "loss": 0.6489, + "step": 5330 + }, + { + "epoch": 0.7494736842105263, + "grad_norm": 0.8629382252693176, + "learning_rate": 4.729590567362228e-05, + "loss": 0.6149, + "step": 5340 + }, + { + "epoch": 0.7508771929824561, + "grad_norm": 0.6692180633544922, + "learning_rate": 4.728591858057827e-05, + "loss": 0.6227, + "step": 5350 + }, + { + "epoch": 0.752280701754386, + "grad_norm": 0.9368489980697632, + "learning_rate": 4.727591413718282e-05, + "loss": 0.712, + "step": 5360 + }, + { + "epoch": 0.7536842105263157, + "grad_norm": 1.1019880771636963, + "learning_rate": 4.7265892351224694e-05, + "loss": 0.7172, + "step": 5370 + }, + { + "epoch": 0.7550877192982456, + "grad_norm": 0.8168277144432068, + "learning_rate": 4.725585323050623e-05, + "loss": 0.6812, + "step": 5380 + }, + { + "epoch": 0.7564912280701754, + "grad_norm": 1.0383678674697876, + "learning_rate": 4.72457967828432e-05, + "loss": 0.6266, + "step": 5390 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 1.9418814182281494, + "learning_rate": 4.723572301606492e-05, + "loss": 0.6976, + "step": 5400 + }, + { + "epoch": 0.7592982456140351, + "grad_norm": 1.1380218267440796, + "learning_rate": 4.7225631938014134e-05, + "loss": 0.709, + "step": 5410 + }, + { + "epoch": 0.7607017543859649, + "grad_norm": 0.7876071333885193, + "learning_rate": 4.7215523556547116e-05, + "loss": 0.5956, + "step": 5420 + }, + { + "epoch": 0.7621052631578947, + "grad_norm": 0.9458256363868713, + "learning_rate": 4.720539787953357e-05, + "loss": 0.6943, + "step": 5430 + }, + { + "epoch": 0.7635087719298246, + "grad_norm": 0.6351762413978577, + "learning_rate": 4.71952549148567e-05, + "loss": 0.6322, + "step": 5440 + }, + { + "epoch": 0.7649122807017544, + "grad_norm": 0.8464050889015198, + "learning_rate": 4.7185094670413134e-05, + "loss": 0.6258, + "step": 5450 + }, + { + "epoch": 0.7663157894736842, + "grad_norm": 1.7159314155578613, + "learning_rate": 4.7174917154112984e-05, + "loss": 0.6347, + "step": 5460 + }, + { + "epoch": 0.767719298245614, + "grad_norm": 0.8159227967262268, + "learning_rate": 4.716472237387979e-05, + "loss": 0.6423, + "step": 5470 + }, + { + "epoch": 0.7691228070175439, + "grad_norm": 1.1517149209976196, + "learning_rate": 4.715451033765054e-05, + "loss": 0.6614, + "step": 5480 + }, + { + "epoch": 0.7705263157894737, + "grad_norm": 1.164534091949463, + "learning_rate": 4.714428105337565e-05, + "loss": 0.6326, + "step": 5490 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 1.0906124114990234, + "learning_rate": 4.713403452901898e-05, + "loss": 0.6146, + "step": 5500 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.7224928140640259, + "learning_rate": 4.7123770772557774e-05, + "loss": 0.6061, + "step": 5510 + }, + { + "epoch": 0.7747368421052632, + "grad_norm": 1.1344630718231201, + "learning_rate": 4.711348979198274e-05, + "loss": 0.7423, + "step": 5520 + }, + { + "epoch": 0.776140350877193, + "grad_norm": 1.0616703033447266, + "learning_rate": 4.710319159529798e-05, + "loss": 0.6648, + "step": 5530 + }, + { + "epoch": 0.7775438596491228, + "grad_norm": 0.8563722968101501, + "learning_rate": 4.709287619052098e-05, + "loss": 0.551, + "step": 5540 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 0.7541974186897278, + "learning_rate": 4.708254358568264e-05, + "loss": 0.7394, + "step": 5550 + }, + { + "epoch": 0.7803508771929825, + "grad_norm": 0.9201952815055847, + "learning_rate": 4.7072193788827236e-05, + "loss": 0.57, + "step": 5560 + }, + { + "epoch": 0.7817543859649123, + "grad_norm": 0.8615202307701111, + "learning_rate": 4.706182680801245e-05, + "loss": 0.6293, + "step": 5570 + }, + { + "epoch": 0.783157894736842, + "grad_norm": 0.8534351587295532, + "learning_rate": 4.705144265130934e-05, + "loss": 0.6007, + "step": 5580 + }, + { + "epoch": 0.7845614035087719, + "grad_norm": 0.8691478967666626, + "learning_rate": 4.704104132680231e-05, + "loss": 0.5963, + "step": 5590 + }, + { + "epoch": 0.7859649122807018, + "grad_norm": 1.209688663482666, + "learning_rate": 4.703062284258916e-05, + "loss": 0.7237, + "step": 5600 + }, + { + "epoch": 0.7873684210526316, + "grad_norm": 0.72704017162323, + "learning_rate": 4.702018720678103e-05, + "loss": 0.6452, + "step": 5610 + }, + { + "epoch": 0.7887719298245615, + "grad_norm": 1.3118873834609985, + "learning_rate": 4.7009734427502426e-05, + "loss": 0.6291, + "step": 5620 + }, + { + "epoch": 0.7901754385964912, + "grad_norm": 0.6223419308662415, + "learning_rate": 4.699926451289119e-05, + "loss": 0.5925, + "step": 5630 + }, + { + "epoch": 0.791578947368421, + "grad_norm": 1.0733870267868042, + "learning_rate": 4.698877747109852e-05, + "loss": 0.7342, + "step": 5640 + }, + { + "epoch": 0.7929824561403509, + "grad_norm": 0.7960459589958191, + "learning_rate": 4.697827331028893e-05, + "loss": 0.644, + "step": 5650 + }, + { + "epoch": 0.7943859649122808, + "grad_norm": 0.9189769625663757, + "learning_rate": 4.6967752038640264e-05, + "loss": 0.6567, + "step": 5660 + }, + { + "epoch": 0.7957894736842105, + "grad_norm": 1.1323273181915283, + "learning_rate": 4.695721366434369e-05, + "loss": 0.6873, + "step": 5670 + }, + { + "epoch": 0.7971929824561403, + "grad_norm": 0.8580273389816284, + "learning_rate": 4.694665819560371e-05, + "loss": 0.6733, + "step": 5680 + }, + { + "epoch": 0.7985964912280702, + "grad_norm": 1.3165494203567505, + "learning_rate": 4.693608564063811e-05, + "loss": 0.642, + "step": 5690 + }, + { + "epoch": 0.8, + "grad_norm": 0.7017198801040649, + "learning_rate": 4.692549600767798e-05, + "loss": 0.5438, + "step": 5700 + }, + { + "epoch": 0.8014035087719298, + "grad_norm": 0.8478591442108154, + "learning_rate": 4.6914889304967725e-05, + "loss": 0.6107, + "step": 5710 + }, + { + "epoch": 0.8028070175438596, + "grad_norm": 0.9716276526451111, + "learning_rate": 4.690426554076501e-05, + "loss": 0.5975, + "step": 5720 + }, + { + "epoch": 0.8042105263157895, + "grad_norm": 1.0631777048110962, + "learning_rate": 4.689362472334082e-05, + "loss": 0.6563, + "step": 5730 + }, + { + "epoch": 0.8056140350877193, + "grad_norm": 0.9736322164535522, + "learning_rate": 4.688296686097937e-05, + "loss": 0.6199, + "step": 5740 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 0.9049164652824402, + "learning_rate": 4.6872291961978195e-05, + "loss": 0.5772, + "step": 5750 + }, + { + "epoch": 0.8084210526315789, + "grad_norm": 1.2360827922821045, + "learning_rate": 4.6861600034648064e-05, + "loss": 0.6401, + "step": 5760 + }, + { + "epoch": 0.8098245614035088, + "grad_norm": 1.2036852836608887, + "learning_rate": 4.6850891087313e-05, + "loss": 0.6087, + "step": 5770 + }, + { + "epoch": 0.8112280701754386, + "grad_norm": 1.010108470916748, + "learning_rate": 4.6840165128310296e-05, + "loss": 0.6973, + "step": 5780 + }, + { + "epoch": 0.8126315789473684, + "grad_norm": 1.1753820180892944, + "learning_rate": 4.6829422165990475e-05, + "loss": 0.6509, + "step": 5790 + }, + { + "epoch": 0.8140350877192982, + "grad_norm": 1.0416866540908813, + "learning_rate": 4.6818662208717296e-05, + "loss": 0.6092, + "step": 5800 + }, + { + "epoch": 0.8154385964912281, + "grad_norm": 0.7539423108100891, + "learning_rate": 4.680788526486776e-05, + "loss": 0.5864, + "step": 5810 + }, + { + "epoch": 0.8168421052631579, + "grad_norm": 0.9227228164672852, + "learning_rate": 4.679709134283209e-05, + "loss": 0.5736, + "step": 5820 + }, + { + "epoch": 0.8182456140350877, + "grad_norm": 0.8869969844818115, + "learning_rate": 4.678628045101371e-05, + "loss": 0.5982, + "step": 5830 + }, + { + "epoch": 0.8196491228070175, + "grad_norm": 0.6802515387535095, + "learning_rate": 4.677545259782929e-05, + "loss": 0.6136, + "step": 5840 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.9065477848052979, + "learning_rate": 4.676460779170867e-05, + "loss": 0.6519, + "step": 5850 + }, + { + "epoch": 0.8224561403508772, + "grad_norm": 1.3136307001113892, + "learning_rate": 4.675374604109491e-05, + "loss": 0.6122, + "step": 5860 + }, + { + "epoch": 0.8238596491228071, + "grad_norm": 0.9648601412773132, + "learning_rate": 4.6742867354444256e-05, + "loss": 0.5582, + "step": 5870 + }, + { + "epoch": 0.8252631578947368, + "grad_norm": 1.085227370262146, + "learning_rate": 4.673197174022613e-05, + "loss": 0.6788, + "step": 5880 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.9866172075271606, + "learning_rate": 4.672105920692316e-05, + "loss": 0.647, + "step": 5890 + }, + { + "epoch": 0.8280701754385965, + "grad_norm": 1.5403311252593994, + "learning_rate": 4.6710129763031095e-05, + "loss": 0.6326, + "step": 5900 + }, + { + "epoch": 0.8294736842105264, + "grad_norm": 1.195082187652588, + "learning_rate": 4.669918341705891e-05, + "loss": 0.6205, + "step": 5910 + }, + { + "epoch": 0.8308771929824561, + "grad_norm": 0.9392557740211487, + "learning_rate": 4.66882201775287e-05, + "loss": 0.699, + "step": 5920 + }, + { + "epoch": 0.832280701754386, + "grad_norm": 1.280907392501831, + "learning_rate": 4.667724005297573e-05, + "loss": 0.6147, + "step": 5930 + }, + { + "epoch": 0.8336842105263158, + "grad_norm": 0.6876835823059082, + "learning_rate": 4.66662430519484e-05, + "loss": 0.5737, + "step": 5940 + }, + { + "epoch": 0.8350877192982457, + "grad_norm": 0.7067710161209106, + "learning_rate": 4.665522918300823e-05, + "loss": 0.6072, + "step": 5950 + }, + { + "epoch": 0.8364912280701754, + "grad_norm": 1.0336652994155884, + "learning_rate": 4.6644198454729933e-05, + "loss": 0.6296, + "step": 5960 + }, + { + "epoch": 0.8378947368421052, + "grad_norm": 1.3756647109985352, + "learning_rate": 4.663315087570128e-05, + "loss": 0.6489, + "step": 5970 + }, + { + "epoch": 0.8392982456140351, + "grad_norm": 1.0433988571166992, + "learning_rate": 4.662208645452321e-05, + "loss": 0.6742, + "step": 5980 + }, + { + "epoch": 0.840701754385965, + "grad_norm": 0.6354380249977112, + "learning_rate": 4.661100519980973e-05, + "loss": 0.573, + "step": 5990 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 1.112243890762329, + "learning_rate": 4.6599907120188005e-05, + "loss": 0.6455, + "step": 6000 + }, + { + "epoch": 0.8421052631578947, + "eval_loss": 0.6415141820907593, + "eval_runtime": 44.3465, + "eval_samples_per_second": 33.825, + "eval_steps_per_second": 8.456, + "step": 6000 + }, + { + "epoch": 0.8435087719298245, + "grad_norm": 0.781201958656311, + "learning_rate": 4.658879222429825e-05, + "loss": 0.5362, + "step": 6010 + }, + { + "epoch": 0.8449122807017544, + "grad_norm": 1.069032073020935, + "learning_rate": 4.65776605207938e-05, + "loss": 0.6321, + "step": 6020 + }, + { + "epoch": 0.8463157894736842, + "grad_norm": 1.0449451208114624, + "learning_rate": 4.656651201834106e-05, + "loss": 0.6208, + "step": 6030 + }, + { + "epoch": 0.847719298245614, + "grad_norm": 1.9674957990646362, + "learning_rate": 4.655534672561953e-05, + "loss": 0.6529, + "step": 6040 + }, + { + "epoch": 0.8491228070175438, + "grad_norm": 0.9335805773735046, + "learning_rate": 4.654416465132177e-05, + "loss": 0.6515, + "step": 6050 + }, + { + "epoch": 0.8505263157894737, + "grad_norm": 0.8951327800750732, + "learning_rate": 4.6532965804153416e-05, + "loss": 0.613, + "step": 6060 + }, + { + "epoch": 0.8519298245614035, + "grad_norm": 1.1679803133010864, + "learning_rate": 4.652175019283314e-05, + "loss": 0.6215, + "step": 6070 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 1.6771854162216187, + "learning_rate": 4.6510517826092695e-05, + "loss": 0.7427, + "step": 6080 + }, + { + "epoch": 0.8547368421052631, + "grad_norm": 0.6942294836044312, + "learning_rate": 4.649926871267685e-05, + "loss": 0.58, + "step": 6090 + }, + { + "epoch": 0.856140350877193, + "grad_norm": 1.114723801612854, + "learning_rate": 4.6488002861343425e-05, + "loss": 0.6916, + "step": 6100 + }, + { + "epoch": 0.8575438596491228, + "grad_norm": 0.9489352107048035, + "learning_rate": 4.647672028086328e-05, + "loss": 0.6073, + "step": 6110 + }, + { + "epoch": 0.8589473684210527, + "grad_norm": 0.8159108757972717, + "learning_rate": 4.646542098002029e-05, + "loss": 0.6273, + "step": 6120 + }, + { + "epoch": 0.8603508771929824, + "grad_norm": 1.2675360441207886, + "learning_rate": 4.645410496761135e-05, + "loss": 0.6657, + "step": 6130 + }, + { + "epoch": 0.8617543859649123, + "grad_norm": 0.9706358313560486, + "learning_rate": 4.644277225244635e-05, + "loss": 0.6861, + "step": 6140 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 1.0342049598693848, + "learning_rate": 4.6431422843348216e-05, + "loss": 0.6834, + "step": 6150 + }, + { + "epoch": 0.864561403508772, + "grad_norm": 0.9016236066818237, + "learning_rate": 4.642005674915284e-05, + "loss": 0.6098, + "step": 6160 + }, + { + "epoch": 0.8659649122807017, + "grad_norm": 0.8684419989585876, + "learning_rate": 4.640867397870912e-05, + "loss": 0.6831, + "step": 6170 + }, + { + "epoch": 0.8673684210526316, + "grad_norm": 0.8743478059768677, + "learning_rate": 4.639727454087892e-05, + "loss": 0.5846, + "step": 6180 + }, + { + "epoch": 0.8687719298245614, + "grad_norm": 1.0925372838974, + "learning_rate": 4.638585844453711e-05, + "loss": 0.6436, + "step": 6190 + }, + { + "epoch": 0.8701754385964913, + "grad_norm": 1.0224460363388062, + "learning_rate": 4.6374425698571514e-05, + "loss": 0.7538, + "step": 6200 + }, + { + "epoch": 0.871578947368421, + "grad_norm": 0.8540046215057373, + "learning_rate": 4.63629763118829e-05, + "loss": 0.596, + "step": 6210 + }, + { + "epoch": 0.8729824561403509, + "grad_norm": 0.9685525298118591, + "learning_rate": 4.6351510293385026e-05, + "loss": 0.5844, + "step": 6220 + }, + { + "epoch": 0.8743859649122807, + "grad_norm": 0.9988105893135071, + "learning_rate": 4.634002765200456e-05, + "loss": 0.5785, + "step": 6230 + }, + { + "epoch": 0.8757894736842106, + "grad_norm": 0.7331526279449463, + "learning_rate": 4.632852839668115e-05, + "loss": 0.5728, + "step": 6240 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 1.0520068407058716, + "learning_rate": 4.6317012536367354e-05, + "loss": 0.6317, + "step": 6250 + }, + { + "epoch": 0.8785964912280702, + "grad_norm": 1.117604374885559, + "learning_rate": 4.630548008002866e-05, + "loss": 0.6152, + "step": 6260 + }, + { + "epoch": 0.88, + "grad_norm": 0.7635726928710938, + "learning_rate": 4.629393103664349e-05, + "loss": 0.64, + "step": 6270 + }, + { + "epoch": 0.8814035087719299, + "grad_norm": 1.1754323244094849, + "learning_rate": 4.6282365415203164e-05, + "loss": 0.5923, + "step": 6280 + }, + { + "epoch": 0.8828070175438596, + "grad_norm": 0.6220813989639282, + "learning_rate": 4.627078322471191e-05, + "loss": 0.6745, + "step": 6290 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 0.8440349698066711, + "learning_rate": 4.625918447418687e-05, + "loss": 0.5819, + "step": 6300 + }, + { + "epoch": 0.8856140350877193, + "grad_norm": 1.0416796207427979, + "learning_rate": 4.624756917265807e-05, + "loss": 0.5965, + "step": 6310 + }, + { + "epoch": 0.8870175438596491, + "grad_norm": 1.1395715475082397, + "learning_rate": 4.62359373291684e-05, + "loss": 0.5838, + "step": 6320 + }, + { + "epoch": 0.888421052631579, + "grad_norm": 1.3868945837020874, + "learning_rate": 4.622428895277367e-05, + "loss": 0.7304, + "step": 6330 + }, + { + "epoch": 0.8898245614035087, + "grad_norm": 1.0000405311584473, + "learning_rate": 4.621262405254253e-05, + "loss": 0.5938, + "step": 6340 + }, + { + "epoch": 0.8912280701754386, + "grad_norm": 0.754399836063385, + "learning_rate": 4.620094263755652e-05, + "loss": 0.6276, + "step": 6350 + }, + { + "epoch": 0.8926315789473684, + "grad_norm": 0.9784127473831177, + "learning_rate": 4.618924471691e-05, + "loss": 0.613, + "step": 6360 + }, + { + "epoch": 0.8940350877192983, + "grad_norm": 0.6419925689697266, + "learning_rate": 4.617753029971021e-05, + "loss": 0.599, + "step": 6370 + }, + { + "epoch": 0.895438596491228, + "grad_norm": 1.2562180757522583, + "learning_rate": 4.6165799395077236e-05, + "loss": 0.6358, + "step": 6380 + }, + { + "epoch": 0.8968421052631579, + "grad_norm": 1.1815166473388672, + "learning_rate": 4.615405201214398e-05, + "loss": 0.6747, + "step": 6390 + }, + { + "epoch": 0.8982456140350877, + "grad_norm": 1.5243850946426392, + "learning_rate": 4.614228816005618e-05, + "loss": 0.6082, + "step": 6400 + }, + { + "epoch": 0.8996491228070176, + "grad_norm": 0.894396960735321, + "learning_rate": 4.61305078479724e-05, + "loss": 0.5506, + "step": 6410 + }, + { + "epoch": 0.9010526315789473, + "grad_norm": 0.7782644629478455, + "learning_rate": 4.611871108506403e-05, + "loss": 0.5816, + "step": 6420 + }, + { + "epoch": 0.9024561403508772, + "grad_norm": 1.2209144830703735, + "learning_rate": 4.610689788051523e-05, + "loss": 0.6178, + "step": 6430 + }, + { + "epoch": 0.903859649122807, + "grad_norm": 0.8224475979804993, + "learning_rate": 4.6095068243523e-05, + "loss": 0.644, + "step": 6440 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 1.054763674736023, + "learning_rate": 4.608322218329711e-05, + "loss": 0.5564, + "step": 6450 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.9225585460662842, + "learning_rate": 4.607135970906014e-05, + "loss": 0.6534, + "step": 6460 + }, + { + "epoch": 0.9080701754385965, + "grad_norm": 0.7979352474212646, + "learning_rate": 4.605948083004741e-05, + "loss": 0.6671, + "step": 6470 + }, + { + "epoch": 0.9094736842105263, + "grad_norm": 1.223375916481018, + "learning_rate": 4.6047585555507045e-05, + "loss": 0.6996, + "step": 6480 + }, + { + "epoch": 0.9108771929824562, + "grad_norm": 0.803092360496521, + "learning_rate": 4.603567389469993e-05, + "loss": 0.5937, + "step": 6490 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.9455748796463013, + "learning_rate": 4.60237458568997e-05, + "loss": 0.555, + "step": 6500 + }, + { + "epoch": 0.9136842105263158, + "grad_norm": 1.0315808057785034, + "learning_rate": 4.6011801451392736e-05, + "loss": 0.6721, + "step": 6510 + }, + { + "epoch": 0.9150877192982456, + "grad_norm": 1.0508462190628052, + "learning_rate": 4.5999840687478167e-05, + "loss": 0.5849, + "step": 6520 + }, + { + "epoch": 0.9164912280701755, + "grad_norm": 0.8299797773361206, + "learning_rate": 4.598786357446786e-05, + "loss": 0.6013, + "step": 6530 + }, + { + "epoch": 0.9178947368421052, + "grad_norm": 0.8177257776260376, + "learning_rate": 4.5975870121686406e-05, + "loss": 0.6178, + "step": 6540 + }, + { + "epoch": 0.9192982456140351, + "grad_norm": 0.8297099471092224, + "learning_rate": 4.596386033847111e-05, + "loss": 0.5985, + "step": 6550 + }, + { + "epoch": 0.9207017543859649, + "grad_norm": 1.1290909051895142, + "learning_rate": 4.5951834234172025e-05, + "loss": 0.5878, + "step": 6560 + }, + { + "epoch": 0.9221052631578948, + "grad_norm": 0.7982479929924011, + "learning_rate": 4.593979181815187e-05, + "loss": 0.7004, + "step": 6570 + }, + { + "epoch": 0.9235087719298246, + "grad_norm": 0.9358506202697754, + "learning_rate": 4.5927733099786066e-05, + "loss": 0.615, + "step": 6580 + }, + { + "epoch": 0.9249122807017544, + "grad_norm": 0.8648248910903931, + "learning_rate": 4.591565808846276e-05, + "loss": 0.5789, + "step": 6590 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 1.1170403957366943, + "learning_rate": 4.5903566793582755e-05, + "loss": 0.5827, + "step": 6600 + }, + { + "epoch": 0.927719298245614, + "grad_norm": 1.5100983381271362, + "learning_rate": 4.589145922455954e-05, + "loss": 0.6176, + "step": 6610 + }, + { + "epoch": 0.9291228070175439, + "grad_norm": 1.2393382787704468, + "learning_rate": 4.587933539081927e-05, + "loss": 0.7056, + "step": 6620 + }, + { + "epoch": 0.9305263157894736, + "grad_norm": 0.8480477333068848, + "learning_rate": 4.586719530180075e-05, + "loss": 0.6418, + "step": 6630 + }, + { + "epoch": 0.9319298245614035, + "grad_norm": 1.1262218952178955, + "learning_rate": 4.585503896695549e-05, + "loss": 0.5231, + "step": 6640 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.1744680404663086, + "learning_rate": 4.584286639574758e-05, + "loss": 0.7463, + "step": 6650 + }, + { + "epoch": 0.9347368421052632, + "grad_norm": 1.0590097904205322, + "learning_rate": 4.58306775976538e-05, + "loss": 0.6373, + "step": 6660 + }, + { + "epoch": 0.9361403508771929, + "grad_norm": 1.1323457956314087, + "learning_rate": 4.581847258216355e-05, + "loss": 0.6609, + "step": 6670 + }, + { + "epoch": 0.9375438596491228, + "grad_norm": 1.043113350868225, + "learning_rate": 4.580625135877884e-05, + "loss": 0.6366, + "step": 6680 + }, + { + "epoch": 0.9389473684210526, + "grad_norm": 0.6503088474273682, + "learning_rate": 4.5794013937014326e-05, + "loss": 0.6359, + "step": 6690 + }, + { + "epoch": 0.9403508771929825, + "grad_norm": 1.6966040134429932, + "learning_rate": 4.578176032639724e-05, + "loss": 0.7239, + "step": 6700 + }, + { + "epoch": 0.9417543859649122, + "grad_norm": 0.7537420988082886, + "learning_rate": 4.5769490536467465e-05, + "loss": 0.5848, + "step": 6710 + }, + { + "epoch": 0.9431578947368421, + "grad_norm": 1.1561657190322876, + "learning_rate": 4.5757204576777437e-05, + "loss": 0.6155, + "step": 6720 + }, + { + "epoch": 0.9445614035087719, + "grad_norm": 0.6912992000579834, + "learning_rate": 4.574490245689219e-05, + "loss": 0.5817, + "step": 6730 + }, + { + "epoch": 0.9459649122807018, + "grad_norm": 1.0430986881256104, + "learning_rate": 4.573258418638936e-05, + "loss": 0.4903, + "step": 6740 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.9669928550720215, + "learning_rate": 4.572024977485914e-05, + "loss": 0.6289, + "step": 6750 + }, + { + "epoch": 0.9487719298245614, + "grad_norm": 0.9165218472480774, + "learning_rate": 4.5707899231904286e-05, + "loss": 0.6133, + "step": 6760 + }, + { + "epoch": 0.9501754385964912, + "grad_norm": 0.9709174036979675, + "learning_rate": 4.569553256714012e-05, + "loss": 0.638, + "step": 6770 + }, + { + "epoch": 0.9515789473684211, + "grad_norm": 1.2939212322235107, + "learning_rate": 4.5683149790194526e-05, + "loss": 0.7599, + "step": 6780 + }, + { + "epoch": 0.9529824561403509, + "grad_norm": 1.0535619258880615, + "learning_rate": 4.5670750910707903e-05, + "loss": 0.6906, + "step": 6790 + }, + { + "epoch": 0.9543859649122807, + "grad_norm": 0.9928086400032043, + "learning_rate": 4.565833593833321e-05, + "loss": 0.7387, + "step": 6800 + }, + { + "epoch": 0.9557894736842105, + "grad_norm": 0.7087190747261047, + "learning_rate": 4.5645904882735935e-05, + "loss": 0.5566, + "step": 6810 + }, + { + "epoch": 0.9571929824561404, + "grad_norm": 1.2111977338790894, + "learning_rate": 4.563345775359408e-05, + "loss": 0.5748, + "step": 6820 + }, + { + "epoch": 0.9585964912280702, + "grad_norm": 1.2516281604766846, + "learning_rate": 4.562099456059815e-05, + "loss": 0.6256, + "step": 6830 + }, + { + "epoch": 0.96, + "grad_norm": 0.8081939220428467, + "learning_rate": 4.5608515313451186e-05, + "loss": 0.5826, + "step": 6840 + }, + { + "epoch": 0.9614035087719298, + "grad_norm": 1.453393578529358, + "learning_rate": 4.559602002186869e-05, + "loss": 0.5538, + "step": 6850 + }, + { + "epoch": 0.9628070175438597, + "grad_norm": 1.139618158340454, + "learning_rate": 4.558350869557868e-05, + "loss": 0.6514, + "step": 6860 + }, + { + "epoch": 0.9642105263157895, + "grad_norm": 0.9846227765083313, + "learning_rate": 4.557098134432167e-05, + "loss": 0.7813, + "step": 6870 + }, + { + "epoch": 0.9656140350877193, + "grad_norm": 0.8734840750694275, + "learning_rate": 4.555843797785061e-05, + "loss": 0.5993, + "step": 6880 + }, + { + "epoch": 0.9670175438596491, + "grad_norm": 1.172455072402954, + "learning_rate": 4.554587860593095e-05, + "loss": 0.5594, + "step": 6890 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 0.5644223690032959, + "learning_rate": 4.553330323834059e-05, + "loss": 0.5581, + "step": 6900 + }, + { + "epoch": 0.9698245614035088, + "grad_norm": 0.7265453338623047, + "learning_rate": 4.552071188486989e-05, + "loss": 0.6295, + "step": 6910 + }, + { + "epoch": 0.9712280701754386, + "grad_norm": 0.8341143727302551, + "learning_rate": 4.550810455532164e-05, + "loss": 0.5697, + "step": 6920 + }, + { + "epoch": 0.9726315789473684, + "grad_norm": 0.7036447525024414, + "learning_rate": 4.5495481259511095e-05, + "loss": 0.5933, + "step": 6930 + }, + { + "epoch": 0.9740350877192983, + "grad_norm": 0.972158670425415, + "learning_rate": 4.54828420072659e-05, + "loss": 0.5723, + "step": 6940 + }, + { + "epoch": 0.9754385964912281, + "grad_norm": 1.3979262113571167, + "learning_rate": 4.547018680842616e-05, + "loss": 0.5749, + "step": 6950 + }, + { + "epoch": 0.9768421052631578, + "grad_norm": 1.3824502229690552, + "learning_rate": 4.545751567284439e-05, + "loss": 0.7053, + "step": 6960 + }, + { + "epoch": 0.9782456140350877, + "grad_norm": 1.1198428869247437, + "learning_rate": 4.5444828610385486e-05, + "loss": 0.629, + "step": 6970 + }, + { + "epoch": 0.9796491228070175, + "grad_norm": 0.7075201869010925, + "learning_rate": 4.543212563092677e-05, + "loss": 0.6647, + "step": 6980 + }, + { + "epoch": 0.9810526315789474, + "grad_norm": 1.0392223596572876, + "learning_rate": 4.541940674435794e-05, + "loss": 0.6921, + "step": 6990 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 1.027004599571228, + "learning_rate": 4.5406671960581096e-05, + "loss": 0.6187, + "step": 7000 + }, + { + "epoch": 0.983859649122807, + "grad_norm": 1.2175973653793335, + "learning_rate": 4.53939212895107e-05, + "loss": 0.6471, + "step": 7010 + }, + { + "epoch": 0.9852631578947368, + "grad_norm": 1.0904464721679688, + "learning_rate": 4.538115474107357e-05, + "loss": 0.5916, + "step": 7020 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 1.0575454235076904, + "learning_rate": 4.536837232520893e-05, + "loss": 0.6859, + "step": 7030 + }, + { + "epoch": 0.9880701754385965, + "grad_norm": 0.7100856304168701, + "learning_rate": 4.535557405186831e-05, + "loss": 0.64, + "step": 7040 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.9754576683044434, + "learning_rate": 4.534275993101561e-05, + "loss": 0.5839, + "step": 7050 + }, + { + "epoch": 0.9908771929824561, + "grad_norm": 0.8776949644088745, + "learning_rate": 4.532992997262706e-05, + "loss": 0.577, + "step": 7060 + }, + { + "epoch": 0.992280701754386, + "grad_norm": 1.569716215133667, + "learning_rate": 4.531708418669122e-05, + "loss": 0.7155, + "step": 7070 + }, + { + "epoch": 0.9936842105263158, + "grad_norm": 0.7174299955368042, + "learning_rate": 4.5304222583208983e-05, + "loss": 0.6713, + "step": 7080 + }, + { + "epoch": 0.9950877192982456, + "grad_norm": 1.0695897340774536, + "learning_rate": 4.5291345172193546e-05, + "loss": 0.6528, + "step": 7090 + }, + { + "epoch": 0.9964912280701754, + "grad_norm": 1.0685267448425293, + "learning_rate": 4.5278451963670403e-05, + "loss": 0.5705, + "step": 7100 + }, + { + "epoch": 0.9978947368421053, + "grad_norm": 1.2662453651428223, + "learning_rate": 4.526554296767738e-05, + "loss": 0.6763, + "step": 7110 + }, + { + "epoch": 0.9992982456140351, + "grad_norm": 1.1944515705108643, + "learning_rate": 4.525261819426455e-05, + "loss": 0.5855, + "step": 7120 + }, + { + "epoch": 1.0007017543859649, + "grad_norm": 0.7510038614273071, + "learning_rate": 4.5239677653494305e-05, + "loss": 0.5631, + "step": 7130 + }, + { + "epoch": 1.0021052631578948, + "grad_norm": 0.7383008003234863, + "learning_rate": 4.5226721355441306e-05, + "loss": 0.493, + "step": 7140 + }, + { + "epoch": 1.0035087719298246, + "grad_norm": 1.3719711303710938, + "learning_rate": 4.5213749310192455e-05, + "loss": 0.5049, + "step": 7150 + }, + { + "epoch": 1.0049122807017543, + "grad_norm": 0.7755573987960815, + "learning_rate": 4.520076152784695e-05, + "loss": 0.5204, + "step": 7160 + }, + { + "epoch": 1.0063157894736843, + "grad_norm": 1.0142574310302734, + "learning_rate": 4.518775801851622e-05, + "loss": 0.5395, + "step": 7170 + }, + { + "epoch": 1.007719298245614, + "grad_norm": 1.0278340578079224, + "learning_rate": 4.517473879232395e-05, + "loss": 0.5231, + "step": 7180 + }, + { + "epoch": 1.0091228070175438, + "grad_norm": 1.4633328914642334, + "learning_rate": 4.516170385940603e-05, + "loss": 0.5764, + "step": 7190 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 1.5323199033737183, + "learning_rate": 4.514865322991063e-05, + "loss": 0.5339, + "step": 7200 + }, + { + "epoch": 1.0119298245614035, + "grad_norm": 1.379055380821228, + "learning_rate": 4.51355869139981e-05, + "loss": 0.5684, + "step": 7210 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 0.9581233859062195, + "learning_rate": 4.512250492184101e-05, + "loss": 0.539, + "step": 7220 + }, + { + "epoch": 1.0147368421052632, + "grad_norm": 1.1807743310928345, + "learning_rate": 4.510940726362416e-05, + "loss": 0.5348, + "step": 7230 + }, + { + "epoch": 1.016140350877193, + "grad_norm": 1.2164653539657593, + "learning_rate": 4.50962939495445e-05, + "loss": 0.5795, + "step": 7240 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 0.7895627617835999, + "learning_rate": 4.50831649898112e-05, + "loss": 0.5894, + "step": 7250 + }, + { + "epoch": 1.0189473684210526, + "grad_norm": 1.4003938436508179, + "learning_rate": 4.507002039464562e-05, + "loss": 0.5825, + "step": 7260 + }, + { + "epoch": 1.0203508771929826, + "grad_norm": 0.8824437856674194, + "learning_rate": 4.505686017428127e-05, + "loss": 0.5513, + "step": 7270 + }, + { + "epoch": 1.0217543859649123, + "grad_norm": 1.0241267681121826, + "learning_rate": 4.504368433896382e-05, + "loss": 0.6675, + "step": 7280 + }, + { + "epoch": 1.023157894736842, + "grad_norm": 1.1416174173355103, + "learning_rate": 4.5030492898951134e-05, + "loss": 0.5146, + "step": 7290 + }, + { + "epoch": 1.024561403508772, + "grad_norm": 1.4306304454803467, + "learning_rate": 4.501728586451318e-05, + "loss": 0.6254, + "step": 7300 + }, + { + "epoch": 1.0259649122807017, + "grad_norm": 0.7919867634773254, + "learning_rate": 4.5004063245932097e-05, + "loss": 0.4688, + "step": 7310 + }, + { + "epoch": 1.0273684210526315, + "grad_norm": 1.0270862579345703, + "learning_rate": 4.4990825053502136e-05, + "loss": 0.5227, + "step": 7320 + }, + { + "epoch": 1.0287719298245614, + "grad_norm": 1.332261085510254, + "learning_rate": 4.497757129752969e-05, + "loss": 0.5219, + "step": 7330 + }, + { + "epoch": 1.0301754385964912, + "grad_norm": 1.1045178174972534, + "learning_rate": 4.496430198833327e-05, + "loss": 0.5006, + "step": 7340 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 1.069557547569275, + "learning_rate": 4.495101713624348e-05, + "loss": 0.519, + "step": 7350 + }, + { + "epoch": 1.032982456140351, + "grad_norm": 1.0084444284439087, + "learning_rate": 4.493771675160303e-05, + "loss": 0.6042, + "step": 7360 + }, + { + "epoch": 1.0343859649122806, + "grad_norm": 1.1198923587799072, + "learning_rate": 4.4924400844766734e-05, + "loss": 0.5092, + "step": 7370 + }, + { + "epoch": 1.0357894736842106, + "grad_norm": 0.7310131788253784, + "learning_rate": 4.491106942610147e-05, + "loss": 0.6178, + "step": 7380 + }, + { + "epoch": 1.0371929824561403, + "grad_norm": 1.1642857789993286, + "learning_rate": 4.489772250598622e-05, + "loss": 0.6226, + "step": 7390 + }, + { + "epoch": 1.03859649122807, + "grad_norm": 1.194806456565857, + "learning_rate": 4.488436009481201e-05, + "loss": 0.5821, + "step": 7400 + }, + { + "epoch": 1.04, + "grad_norm": 1.3842540979385376, + "learning_rate": 4.487098220298193e-05, + "loss": 0.5265, + "step": 7410 + }, + { + "epoch": 1.0414035087719298, + "grad_norm": 1.546705722808838, + "learning_rate": 4.485758884091113e-05, + "loss": 0.5617, + "step": 7420 + }, + { + "epoch": 1.0428070175438597, + "grad_norm": 1.5181100368499756, + "learning_rate": 4.4844180019026805e-05, + "loss": 0.5468, + "step": 7430 + }, + { + "epoch": 1.0442105263157895, + "grad_norm": 1.8832321166992188, + "learning_rate": 4.483075574776819e-05, + "loss": 0.5048, + "step": 7440 + }, + { + "epoch": 1.0456140350877192, + "grad_norm": 1.2133930921554565, + "learning_rate": 4.4817316037586524e-05, + "loss": 0.5684, + "step": 7450 + }, + { + "epoch": 1.0470175438596492, + "grad_norm": 1.6424169540405273, + "learning_rate": 4.480386089894509e-05, + "loss": 0.5851, + "step": 7460 + }, + { + "epoch": 1.048421052631579, + "grad_norm": 1.2695761919021606, + "learning_rate": 4.479039034231918e-05, + "loss": 0.5308, + "step": 7470 + }, + { + "epoch": 1.0498245614035087, + "grad_norm": 1.3531373739242554, + "learning_rate": 4.477690437819607e-05, + "loss": 0.5904, + "step": 7480 + }, + { + "epoch": 1.0512280701754386, + "grad_norm": 1.6086102724075317, + "learning_rate": 4.476340301707507e-05, + "loss": 0.4894, + "step": 7490 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7145791053771973, + "learning_rate": 4.4749886269467416e-05, + "loss": 0.4906, + "step": 7500 + }, + { + "epoch": 1.0540350877192983, + "grad_norm": 1.5852210521697998, + "learning_rate": 4.473635414589639e-05, + "loss": 0.5399, + "step": 7510 + }, + { + "epoch": 1.055438596491228, + "grad_norm": 1.3225674629211426, + "learning_rate": 4.47228066568972e-05, + "loss": 0.5168, + "step": 7520 + }, + { + "epoch": 1.0568421052631578, + "grad_norm": 1.3213186264038086, + "learning_rate": 4.470924381301704e-05, + "loss": 0.4888, + "step": 7530 + }, + { + "epoch": 1.0582456140350878, + "grad_norm": 1.4983114004135132, + "learning_rate": 4.469566562481503e-05, + "loss": 0.4909, + "step": 7540 + }, + { + "epoch": 1.0596491228070175, + "grad_norm": 1.3175050020217896, + "learning_rate": 4.4682072102862286e-05, + "loss": 0.5369, + "step": 7550 + }, + { + "epoch": 1.0610526315789475, + "grad_norm": 1.14377760887146, + "learning_rate": 4.466846325774179e-05, + "loss": 0.5046, + "step": 7560 + }, + { + "epoch": 1.0624561403508772, + "grad_norm": 0.6691097021102905, + "learning_rate": 4.4654839100048535e-05, + "loss": 0.5201, + "step": 7570 + }, + { + "epoch": 1.063859649122807, + "grad_norm": 1.4467300176620483, + "learning_rate": 4.464119964038937e-05, + "loss": 0.5238, + "step": 7580 + }, + { + "epoch": 1.065263157894737, + "grad_norm": 0.8880655169487, + "learning_rate": 4.462754488938309e-05, + "loss": 0.5074, + "step": 7590 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 2.043294906616211, + "learning_rate": 4.4613874857660384e-05, + "loss": 0.5297, + "step": 7600 + }, + { + "epoch": 1.0680701754385964, + "grad_norm": 1.054681420326233, + "learning_rate": 4.460018955586384e-05, + "loss": 0.5585, + "step": 7610 + }, + { + "epoch": 1.0694736842105264, + "grad_norm": 1.256369709968567, + "learning_rate": 4.458648899464793e-05, + "loss": 0.4944, + "step": 7620 + }, + { + "epoch": 1.070877192982456, + "grad_norm": 1.0441490411758423, + "learning_rate": 4.457277318467903e-05, + "loss": 0.5736, + "step": 7630 + }, + { + "epoch": 1.072280701754386, + "grad_norm": 0.885286271572113, + "learning_rate": 4.4559042136635345e-05, + "loss": 0.6152, + "step": 7640 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 1.8804951906204224, + "learning_rate": 4.4545295861206975e-05, + "loss": 0.4936, + "step": 7650 + }, + { + "epoch": 1.0750877192982455, + "grad_norm": 1.5045465230941772, + "learning_rate": 4.453153436909587e-05, + "loss": 0.5547, + "step": 7660 + }, + { + "epoch": 1.0764912280701755, + "grad_norm": 1.7368062734603882, + "learning_rate": 4.4517757671015826e-05, + "loss": 0.537, + "step": 7670 + }, + { + "epoch": 1.0778947368421052, + "grad_norm": 1.3677830696105957, + "learning_rate": 4.4503965777692456e-05, + "loss": 0.5131, + "step": 7680 + }, + { + "epoch": 1.079298245614035, + "grad_norm": 1.2926596403121948, + "learning_rate": 4.449015869986325e-05, + "loss": 0.4782, + "step": 7690 + }, + { + "epoch": 1.080701754385965, + "grad_norm": 2.191722869873047, + "learning_rate": 4.447633644827747e-05, + "loss": 0.4962, + "step": 7700 + }, + { + "epoch": 1.0821052631578947, + "grad_norm": 1.8317209482192993, + "learning_rate": 4.446249903369621e-05, + "loss": 0.5025, + "step": 7710 + }, + { + "epoch": 1.0835087719298246, + "grad_norm": 1.2881171703338623, + "learning_rate": 4.444864646689239e-05, + "loss": 0.4816, + "step": 7720 + }, + { + "epoch": 1.0849122807017544, + "grad_norm": 1.1918405294418335, + "learning_rate": 4.443477875865071e-05, + "loss": 0.4762, + "step": 7730 + }, + { + "epoch": 1.0863157894736841, + "grad_norm": 1.1728036403656006, + "learning_rate": 4.4420895919767626e-05, + "loss": 0.4501, + "step": 7740 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.505370855331421, + "learning_rate": 4.440699796105143e-05, + "loss": 0.4855, + "step": 7750 + }, + { + "epoch": 1.0891228070175438, + "grad_norm": 1.6580755710601807, + "learning_rate": 4.439308489332215e-05, + "loss": 0.5558, + "step": 7760 + }, + { + "epoch": 1.0905263157894738, + "grad_norm": 1.496596097946167, + "learning_rate": 4.437915672741158e-05, + "loss": 0.5219, + "step": 7770 + }, + { + "epoch": 1.0919298245614035, + "grad_norm": 1.2828936576843262, + "learning_rate": 4.43652134741633e-05, + "loss": 0.4643, + "step": 7780 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 1.2443900108337402, + "learning_rate": 4.435125514443258e-05, + "loss": 0.562, + "step": 7790 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 1.5212448835372925, + "learning_rate": 4.4337281749086477e-05, + "loss": 0.5022, + "step": 7800 + }, + { + "epoch": 1.096140350877193, + "grad_norm": 2.2898428440093994, + "learning_rate": 4.432329329900375e-05, + "loss": 0.5462, + "step": 7810 + }, + { + "epoch": 1.0975438596491227, + "grad_norm": 1.3173396587371826, + "learning_rate": 4.4309289805074895e-05, + "loss": 0.5103, + "step": 7820 + }, + { + "epoch": 1.0989473684210527, + "grad_norm": 1.3435895442962646, + "learning_rate": 4.42952712782021e-05, + "loss": 0.5588, + "step": 7830 + }, + { + "epoch": 1.1003508771929824, + "grad_norm": 1.1860660314559937, + "learning_rate": 4.428123772929928e-05, + "loss": 0.5107, + "step": 7840 + }, + { + "epoch": 1.1017543859649124, + "grad_norm": 1.2616344690322876, + "learning_rate": 4.426718916929202e-05, + "loss": 0.463, + "step": 7850 + }, + { + "epoch": 1.1031578947368421, + "grad_norm": 0.8766297101974487, + "learning_rate": 4.425312560911762e-05, + "loss": 0.4801, + "step": 7860 + }, + { + "epoch": 1.1045614035087719, + "grad_norm": 1.4563919305801392, + "learning_rate": 4.4239047059725035e-05, + "loss": 0.4755, + "step": 7870 + }, + { + "epoch": 1.1059649122807018, + "grad_norm": 1.3378584384918213, + "learning_rate": 4.422495353207491e-05, + "loss": 0.4449, + "step": 7880 + }, + { + "epoch": 1.1073684210526316, + "grad_norm": 1.259637713432312, + "learning_rate": 4.4210845037139525e-05, + "loss": 0.4613, + "step": 7890 + }, + { + "epoch": 1.1087719298245613, + "grad_norm": 1.8832120895385742, + "learning_rate": 4.419672158590282e-05, + "loss": 0.6132, + "step": 7900 + }, + { + "epoch": 1.1101754385964913, + "grad_norm": 1.7765206098556519, + "learning_rate": 4.4182583189360415e-05, + "loss": 0.5235, + "step": 7910 + }, + { + "epoch": 1.111578947368421, + "grad_norm": 1.5252950191497803, + "learning_rate": 4.416842985851951e-05, + "loss": 0.5066, + "step": 7920 + }, + { + "epoch": 1.112982456140351, + "grad_norm": 1.145727515220642, + "learning_rate": 4.415426160439897e-05, + "loss": 0.5148, + "step": 7930 + }, + { + "epoch": 1.1143859649122807, + "grad_norm": 1.413393259048462, + "learning_rate": 4.414007843802927e-05, + "loss": 0.4731, + "step": 7940 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 1.227738618850708, + "learning_rate": 4.412588037045248e-05, + "loss": 0.4657, + "step": 7950 + }, + { + "epoch": 1.1171929824561404, + "grad_norm": 1.1237843036651611, + "learning_rate": 4.411166741272228e-05, + "loss": 0.5292, + "step": 7960 + }, + { + "epoch": 1.1185964912280701, + "grad_norm": 1.5785701274871826, + "learning_rate": 4.4097439575903964e-05, + "loss": 0.5086, + "step": 7970 + }, + { + "epoch": 1.12, + "grad_norm": 0.8202313780784607, + "learning_rate": 4.408319687107437e-05, + "loss": 0.4074, + "step": 7980 + }, + { + "epoch": 1.1214035087719298, + "grad_norm": 1.6983180046081543, + "learning_rate": 4.406893930932195e-05, + "loss": 0.5302, + "step": 7990 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 1.3259834051132202, + "learning_rate": 4.4054666901746685e-05, + "loss": 0.5533, + "step": 8000 + }, + { + "epoch": 1.1228070175438596, + "eval_loss": 0.6547604203224182, + "eval_runtime": 43.9015, + "eval_samples_per_second": 34.167, + "eval_steps_per_second": 8.542, + "step": 8000 + }, + { + "epoch": 1.1242105263157895, + "grad_norm": 1.5617778301239014, + "learning_rate": 4.404037965946015e-05, + "loss": 0.5533, + "step": 8010 + }, + { + "epoch": 1.1256140350877193, + "grad_norm": 1.3925784826278687, + "learning_rate": 4.402607759358545e-05, + "loss": 0.573, + "step": 8020 + }, + { + "epoch": 1.127017543859649, + "grad_norm": 2.007066011428833, + "learning_rate": 4.401176071525722e-05, + "loss": 0.6002, + "step": 8030 + }, + { + "epoch": 1.128421052631579, + "grad_norm": 1.2479066848754883, + "learning_rate": 4.399742903562166e-05, + "loss": 0.5412, + "step": 8040 + }, + { + "epoch": 1.1298245614035087, + "grad_norm": 2.116882562637329, + "learning_rate": 4.3983082565836454e-05, + "loss": 0.5516, + "step": 8050 + }, + { + "epoch": 1.1312280701754387, + "grad_norm": 1.0991559028625488, + "learning_rate": 4.3968721317070835e-05, + "loss": 0.5142, + "step": 8060 + }, + { + "epoch": 1.1326315789473684, + "grad_norm": 1.5136295557022095, + "learning_rate": 4.395434530050553e-05, + "loss": 0.4974, + "step": 8070 + }, + { + "epoch": 1.1340350877192982, + "grad_norm": 1.6304662227630615, + "learning_rate": 4.393995452733274e-05, + "loss": 0.5921, + "step": 8080 + }, + { + "epoch": 1.1354385964912281, + "grad_norm": 1.1499663591384888, + "learning_rate": 4.392554900875619e-05, + "loss": 0.5516, + "step": 8090 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 1.14556884765625, + "learning_rate": 4.3911128755991085e-05, + "loss": 0.4377, + "step": 8100 + }, + { + "epoch": 1.1382456140350876, + "grad_norm": 2.168900728225708, + "learning_rate": 4.3896693780264054e-05, + "loss": 0.5489, + "step": 8110 + }, + { + "epoch": 1.1396491228070176, + "grad_norm": 1.8360158205032349, + "learning_rate": 4.388224409281324e-05, + "loss": 0.4883, + "step": 8120 + }, + { + "epoch": 1.1410526315789473, + "grad_norm": 1.3180638551712036, + "learning_rate": 4.3867779704888225e-05, + "loss": 0.5316, + "step": 8130 + }, + { + "epoch": 1.1424561403508773, + "grad_norm": 1.194568157196045, + "learning_rate": 4.385330062775001e-05, + "loss": 0.5961, + "step": 8140 + }, + { + "epoch": 1.143859649122807, + "grad_norm": 1.7998569011688232, + "learning_rate": 4.383880687267107e-05, + "loss": 0.5839, + "step": 8150 + }, + { + "epoch": 1.1452631578947368, + "grad_norm": 1.313109040260315, + "learning_rate": 4.3824298450935284e-05, + "loss": 0.4834, + "step": 8160 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 1.082961916923523, + "learning_rate": 4.380977537383796e-05, + "loss": 0.5543, + "step": 8170 + }, + { + "epoch": 1.1480701754385965, + "grad_norm": 1.8646924495697021, + "learning_rate": 4.37952376526858e-05, + "loss": 0.5548, + "step": 8180 + }, + { + "epoch": 1.1494736842105264, + "grad_norm": 0.8285521268844604, + "learning_rate": 4.378068529879693e-05, + "loss": 0.4596, + "step": 8190 + }, + { + "epoch": 1.1508771929824562, + "grad_norm": 1.6364754438400269, + "learning_rate": 4.376611832350085e-05, + "loss": 0.5165, + "step": 8200 + }, + { + "epoch": 1.152280701754386, + "grad_norm": 1.4383785724639893, + "learning_rate": 4.3751536738138454e-05, + "loss": 0.5085, + "step": 8210 + }, + { + "epoch": 1.1536842105263159, + "grad_norm": 1.0523866415023804, + "learning_rate": 4.3736940554062e-05, + "loss": 0.5485, + "step": 8220 + }, + { + "epoch": 1.1550877192982456, + "grad_norm": 2.191441535949707, + "learning_rate": 4.372232978263513e-05, + "loss": 0.5049, + "step": 8230 + }, + { + "epoch": 1.1564912280701753, + "grad_norm": 1.923846960067749, + "learning_rate": 4.3707704435232816e-05, + "loss": 0.4833, + "step": 8240 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.855549693107605, + "learning_rate": 4.36930645232414e-05, + "loss": 0.5616, + "step": 8250 + }, + { + "epoch": 1.159298245614035, + "grad_norm": 1.620718240737915, + "learning_rate": 4.367841005805855e-05, + "loss": 0.5448, + "step": 8260 + }, + { + "epoch": 1.1607017543859648, + "grad_norm": 1.5233041048049927, + "learning_rate": 4.366374105109327e-05, + "loss": 0.4714, + "step": 8270 + }, + { + "epoch": 1.1621052631578948, + "grad_norm": 1.2673170566558838, + "learning_rate": 4.364905751376589e-05, + "loss": 0.5994, + "step": 8280 + }, + { + "epoch": 1.1635087719298245, + "grad_norm": 1.4488414525985718, + "learning_rate": 4.3634359457508046e-05, + "loss": 0.5633, + "step": 8290 + }, + { + "epoch": 1.1649122807017545, + "grad_norm": 2.300537586212158, + "learning_rate": 4.3619646893762675e-05, + "loss": 0.5566, + "step": 8300 + }, + { + "epoch": 1.1663157894736842, + "grad_norm": 0.8948672413825989, + "learning_rate": 4.360491983398402e-05, + "loss": 0.4631, + "step": 8310 + }, + { + "epoch": 1.167719298245614, + "grad_norm": 1.453062653541565, + "learning_rate": 4.3590178289637585e-05, + "loss": 0.4525, + "step": 8320 + }, + { + "epoch": 1.169122807017544, + "grad_norm": 1.201952576637268, + "learning_rate": 4.357542227220019e-05, + "loss": 0.4501, + "step": 8330 + }, + { + "epoch": 1.1705263157894736, + "grad_norm": 1.3226593732833862, + "learning_rate": 4.356065179315988e-05, + "loss": 0.561, + "step": 8340 + }, + { + "epoch": 1.1719298245614036, + "grad_norm": 0.9465067386627197, + "learning_rate": 4.354586686401599e-05, + "loss": 0.513, + "step": 8350 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 1.12758207321167, + "learning_rate": 4.353106749627909e-05, + "loss": 0.4915, + "step": 8360 + }, + { + "epoch": 1.174736842105263, + "grad_norm": 2.4800631999969482, + "learning_rate": 4.3516253701471e-05, + "loss": 0.5193, + "step": 8370 + }, + { + "epoch": 1.176140350877193, + "grad_norm": 0.8092995285987854, + "learning_rate": 4.350142549112476e-05, + "loss": 0.5881, + "step": 8380 + }, + { + "epoch": 1.1775438596491228, + "grad_norm": 1.0919562578201294, + "learning_rate": 4.348658287678465e-05, + "loss": 0.5378, + "step": 8390 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 1.3849503993988037, + "learning_rate": 4.347172587000614e-05, + "loss": 0.5356, + "step": 8400 + }, + { + "epoch": 1.1803508771929825, + "grad_norm": 2.6931231021881104, + "learning_rate": 4.345685448235594e-05, + "loss": 0.5849, + "step": 8410 + }, + { + "epoch": 1.1817543859649122, + "grad_norm": 1.188615083694458, + "learning_rate": 4.3441968725411905e-05, + "loss": 0.5157, + "step": 8420 + }, + { + "epoch": 1.1831578947368422, + "grad_norm": 2.472364902496338, + "learning_rate": 4.342706861076313e-05, + "loss": 0.6508, + "step": 8430 + }, + { + "epoch": 1.184561403508772, + "grad_norm": 1.9964373111724854, + "learning_rate": 4.341215415000987e-05, + "loss": 0.525, + "step": 8440 + }, + { + "epoch": 1.1859649122807017, + "grad_norm": 1.2414706945419312, + "learning_rate": 4.339722535476353e-05, + "loss": 0.6218, + "step": 8450 + }, + { + "epoch": 1.1873684210526316, + "grad_norm": 1.69329035282135, + "learning_rate": 4.3382282236646684e-05, + "loss": 0.5375, + "step": 8460 + }, + { + "epoch": 1.1887719298245614, + "grad_norm": 0.9698866605758667, + "learning_rate": 4.336732480729306e-05, + "loss": 0.5454, + "step": 8470 + }, + { + "epoch": 1.190175438596491, + "grad_norm": 1.450108528137207, + "learning_rate": 4.335235307834755e-05, + "loss": 0.5507, + "step": 8480 + }, + { + "epoch": 1.191578947368421, + "grad_norm": 1.593243956565857, + "learning_rate": 4.333736706146615e-05, + "loss": 0.5172, + "step": 8490 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 0.9896023869514465, + "learning_rate": 4.332236676831598e-05, + "loss": 0.5401, + "step": 8500 + }, + { + "epoch": 1.1943859649122808, + "grad_norm": 1.9976292848587036, + "learning_rate": 4.330735221057529e-05, + "loss": 0.4734, + "step": 8510 + }, + { + "epoch": 1.1957894736842105, + "grad_norm": 0.9883520007133484, + "learning_rate": 4.329232339993342e-05, + "loss": 0.4783, + "step": 8520 + }, + { + "epoch": 1.1971929824561403, + "grad_norm": 1.6024073362350464, + "learning_rate": 4.327728034809082e-05, + "loss": 0.5843, + "step": 8530 + }, + { + "epoch": 1.1985964912280702, + "grad_norm": 1.5246341228485107, + "learning_rate": 4.326222306675902e-05, + "loss": 0.4922, + "step": 8540 + }, + { + "epoch": 1.2, + "grad_norm": 1.8065810203552246, + "learning_rate": 4.324715156766064e-05, + "loss": 0.6196, + "step": 8550 + }, + { + "epoch": 1.20140350877193, + "grad_norm": 1.25635826587677, + "learning_rate": 4.3232065862529334e-05, + "loss": 0.4713, + "step": 8560 + }, + { + "epoch": 1.2028070175438597, + "grad_norm": 1.874711036682129, + "learning_rate": 4.321696596310987e-05, + "loss": 0.5015, + "step": 8570 + }, + { + "epoch": 1.2042105263157894, + "grad_norm": 1.4795438051223755, + "learning_rate": 4.3201851881158004e-05, + "loss": 0.569, + "step": 8580 + }, + { + "epoch": 1.2056140350877194, + "grad_norm": 1.1996725797653198, + "learning_rate": 4.31867236284406e-05, + "loss": 0.5079, + "step": 8590 + }, + { + "epoch": 1.207017543859649, + "grad_norm": 1.1284021139144897, + "learning_rate": 4.31715812167355e-05, + "loss": 0.5132, + "step": 8600 + }, + { + "epoch": 1.208421052631579, + "grad_norm": 1.3568930625915527, + "learning_rate": 4.3156424657831596e-05, + "loss": 0.5907, + "step": 8610 + }, + { + "epoch": 1.2098245614035088, + "grad_norm": 2.9363083839416504, + "learning_rate": 4.3141253963528795e-05, + "loss": 0.6086, + "step": 8620 + }, + { + "epoch": 1.2112280701754385, + "grad_norm": 1.58176589012146, + "learning_rate": 4.3126069145637987e-05, + "loss": 0.4966, + "step": 8630 + }, + { + "epoch": 1.2126315789473685, + "grad_norm": 1.1019052267074585, + "learning_rate": 4.3110870215981095e-05, + "loss": 0.5713, + "step": 8640 + }, + { + "epoch": 1.2140350877192982, + "grad_norm": 1.8327674865722656, + "learning_rate": 4.309565718639098e-05, + "loss": 0.5538, + "step": 8650 + }, + { + "epoch": 1.215438596491228, + "grad_norm": 1.4098116159439087, + "learning_rate": 4.308043006871153e-05, + "loss": 0.5065, + "step": 8660 + }, + { + "epoch": 1.216842105263158, + "grad_norm": 1.7343579530715942, + "learning_rate": 4.306518887479758e-05, + "loss": 0.495, + "step": 8670 + }, + { + "epoch": 1.2182456140350877, + "grad_norm": 1.1002309322357178, + "learning_rate": 4.3049933616514895e-05, + "loss": 0.5217, + "step": 8680 + }, + { + "epoch": 1.2196491228070174, + "grad_norm": 1.6965640783309937, + "learning_rate": 4.303466430574024e-05, + "loss": 0.5196, + "step": 8690 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 2.227039337158203, + "learning_rate": 4.301938095436129e-05, + "loss": 0.4687, + "step": 8700 + }, + { + "epoch": 1.2224561403508771, + "grad_norm": 1.9886293411254883, + "learning_rate": 4.300408357427666e-05, + "loss": 0.6043, + "step": 8710 + }, + { + "epoch": 1.223859649122807, + "grad_norm": 1.9546360969543457, + "learning_rate": 4.298877217739587e-05, + "loss": 0.5359, + "step": 8720 + }, + { + "epoch": 1.2252631578947368, + "grad_norm": 1.1686962842941284, + "learning_rate": 4.29734467756394e-05, + "loss": 0.4502, + "step": 8730 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 1.1630245447158813, + "learning_rate": 4.2958107380938564e-05, + "loss": 0.4823, + "step": 8740 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3308658599853516, + "learning_rate": 4.294275400523564e-05, + "loss": 0.4295, + "step": 8750 + }, + { + "epoch": 1.2294736842105263, + "grad_norm": 1.694773554801941, + "learning_rate": 4.2927386660483726e-05, + "loss": 0.4983, + "step": 8760 + }, + { + "epoch": 1.2308771929824562, + "grad_norm": 0.8299292325973511, + "learning_rate": 4.291200535864684e-05, + "loss": 0.5405, + "step": 8770 + }, + { + "epoch": 1.232280701754386, + "grad_norm": 1.7378555536270142, + "learning_rate": 4.289661011169986e-05, + "loss": 0.5094, + "step": 8780 + }, + { + "epoch": 1.2336842105263157, + "grad_norm": 1.8789598941802979, + "learning_rate": 4.28812009316285e-05, + "loss": 0.5622, + "step": 8790 + }, + { + "epoch": 1.2350877192982457, + "grad_norm": 1.2347322702407837, + "learning_rate": 4.286577783042934e-05, + "loss": 0.4577, + "step": 8800 + }, + { + "epoch": 1.2364912280701754, + "grad_norm": 1.610954761505127, + "learning_rate": 4.285034082010981e-05, + "loss": 0.6015, + "step": 8810 + }, + { + "epoch": 1.2378947368421054, + "grad_norm": 0.8974846005439758, + "learning_rate": 4.2834889912688126e-05, + "loss": 0.5716, + "step": 8820 + }, + { + "epoch": 1.2392982456140351, + "grad_norm": 1.1207072734832764, + "learning_rate": 4.281942512019336e-05, + "loss": 0.5634, + "step": 8830 + }, + { + "epoch": 1.2407017543859649, + "grad_norm": 2.1318647861480713, + "learning_rate": 4.2803946454665376e-05, + "loss": 0.4982, + "step": 8840 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 1.3747590780258179, + "learning_rate": 4.2788453928154855e-05, + "loss": 0.5006, + "step": 8850 + }, + { + "epoch": 1.2435087719298246, + "grad_norm": 0.8502065539360046, + "learning_rate": 4.2772947552723266e-05, + "loss": 0.4901, + "step": 8860 + }, + { + "epoch": 1.2449122807017543, + "grad_norm": 1.888156771659851, + "learning_rate": 4.275742734044283e-05, + "loss": 0.4847, + "step": 8870 + }, + { + "epoch": 1.2463157894736843, + "grad_norm": 2.0071113109588623, + "learning_rate": 4.274189330339658e-05, + "loss": 0.5224, + "step": 8880 + }, + { + "epoch": 1.247719298245614, + "grad_norm": 1.0914371013641357, + "learning_rate": 4.272634545367831e-05, + "loss": 0.4698, + "step": 8890 + }, + { + "epoch": 1.2491228070175437, + "grad_norm": 1.4466750621795654, + "learning_rate": 4.271078380339252e-05, + "loss": 0.5801, + "step": 8900 + }, + { + "epoch": 1.2505263157894737, + "grad_norm": 1.5080820322036743, + "learning_rate": 4.269520836465452e-05, + "loss": 0.6584, + "step": 8910 + }, + { + "epoch": 1.2519298245614034, + "grad_norm": 1.510321855545044, + "learning_rate": 4.2679619149590304e-05, + "loss": 0.5752, + "step": 8920 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 1.527969479560852, + "learning_rate": 4.266401617033662e-05, + "loss": 0.4829, + "step": 8930 + }, + { + "epoch": 1.2547368421052632, + "grad_norm": 1.2593231201171875, + "learning_rate": 4.264839943904091e-05, + "loss": 0.5411, + "step": 8940 + }, + { + "epoch": 1.256140350877193, + "grad_norm": 1.490929365158081, + "learning_rate": 4.2632768967861345e-05, + "loss": 0.5089, + "step": 8950 + }, + { + "epoch": 1.2575438596491229, + "grad_norm": 0.7822336554527283, + "learning_rate": 4.261712476896679e-05, + "loss": 0.6257, + "step": 8960 + }, + { + "epoch": 1.2589473684210526, + "grad_norm": 1.331175446510315, + "learning_rate": 4.2601466854536774e-05, + "loss": 0.5403, + "step": 8970 + }, + { + "epoch": 1.2603508771929826, + "grad_norm": 1.4372813701629639, + "learning_rate": 4.2585795236761526e-05, + "loss": 0.5305, + "step": 8980 + }, + { + "epoch": 1.2617543859649123, + "grad_norm": 1.873630166053772, + "learning_rate": 4.257010992784194e-05, + "loss": 0.5776, + "step": 8990 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.2255460023880005, + "learning_rate": 4.255441093998956e-05, + "loss": 0.5772, + "step": 9000 + }, + { + "epoch": 1.264561403508772, + "grad_norm": 1.3667577505111694, + "learning_rate": 4.253869828542659e-05, + "loss": 0.589, + "step": 9010 + }, + { + "epoch": 1.2659649122807017, + "grad_norm": 1.189122200012207, + "learning_rate": 4.2522971976385876e-05, + "loss": 0.5012, + "step": 9020 + }, + { + "epoch": 1.2673684210526317, + "grad_norm": 2.0150930881500244, + "learning_rate": 4.250723202511089e-05, + "loss": 0.4813, + "step": 9030 + }, + { + "epoch": 1.2687719298245614, + "grad_norm": 1.83956778049469, + "learning_rate": 4.2491478443855704e-05, + "loss": 0.513, + "step": 9040 + }, + { + "epoch": 1.2701754385964912, + "grad_norm": 1.8281301259994507, + "learning_rate": 4.247571124488504e-05, + "loss": 0.6229, + "step": 9050 + }, + { + "epoch": 1.271578947368421, + "grad_norm": 1.5498483180999756, + "learning_rate": 4.2459930440474194e-05, + "loss": 0.5493, + "step": 9060 + }, + { + "epoch": 1.2729824561403509, + "grad_norm": 1.680643081665039, + "learning_rate": 4.2444136042909064e-05, + "loss": 0.4845, + "step": 9070 + }, + { + "epoch": 1.2743859649122806, + "grad_norm": 1.4468814134597778, + "learning_rate": 4.2428328064486134e-05, + "loss": 0.5174, + "step": 9080 + }, + { + "epoch": 1.2757894736842106, + "grad_norm": 1.8637295961380005, + "learning_rate": 4.2412506517512456e-05, + "loss": 0.501, + "step": 9090 + }, + { + "epoch": 1.2771929824561403, + "grad_norm": 1.8078296184539795, + "learning_rate": 4.239667141430564e-05, + "loss": 0.6422, + "step": 9100 + }, + { + "epoch": 1.27859649122807, + "grad_norm": 1.8999830484390259, + "learning_rate": 4.238082276719387e-05, + "loss": 0.5323, + "step": 9110 + }, + { + "epoch": 1.28, + "grad_norm": 0.8832138776779175, + "learning_rate": 4.236496058851585e-05, + "loss": 0.4542, + "step": 9120 + }, + { + "epoch": 1.2814035087719298, + "grad_norm": 1.2980352640151978, + "learning_rate": 4.234908489062083e-05, + "loss": 0.5697, + "step": 9130 + }, + { + "epoch": 1.2828070175438597, + "grad_norm": 1.667039394378662, + "learning_rate": 4.233319568586859e-05, + "loss": 0.5108, + "step": 9140 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 1.6664785146713257, + "learning_rate": 4.231729298662942e-05, + "loss": 0.4472, + "step": 9150 + }, + { + "epoch": 1.2856140350877192, + "grad_norm": 1.4384082555770874, + "learning_rate": 4.230137680528411e-05, + "loss": 0.62, + "step": 9160 + }, + { + "epoch": 1.2870175438596492, + "grad_norm": 1.851901888847351, + "learning_rate": 4.228544715422395e-05, + "loss": 0.5226, + "step": 9170 + }, + { + "epoch": 1.288421052631579, + "grad_norm": 1.372755527496338, + "learning_rate": 4.2269504045850744e-05, + "loss": 0.5492, + "step": 9180 + }, + { + "epoch": 1.2898245614035089, + "grad_norm": 1.7806882858276367, + "learning_rate": 4.225354749257673e-05, + "loss": 0.5359, + "step": 9190 + }, + { + "epoch": 1.2912280701754386, + "grad_norm": 1.4967597723007202, + "learning_rate": 4.2237577506824624e-05, + "loss": 0.535, + "step": 9200 + }, + { + "epoch": 1.2926315789473684, + "grad_norm": 1.220828890800476, + "learning_rate": 4.222159410102761e-05, + "loss": 0.4581, + "step": 9210 + }, + { + "epoch": 1.2940350877192983, + "grad_norm": 1.9541898965835571, + "learning_rate": 4.220559728762933e-05, + "loss": 0.5109, + "step": 9220 + }, + { + "epoch": 1.295438596491228, + "grad_norm": 0.9027903079986572, + "learning_rate": 4.2189587079083846e-05, + "loss": 0.4501, + "step": 9230 + }, + { + "epoch": 1.296842105263158, + "grad_norm": 2.4572014808654785, + "learning_rate": 4.217356348785565e-05, + "loss": 0.5574, + "step": 9240 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 1.5705862045288086, + "learning_rate": 4.215752652641967e-05, + "loss": 0.5558, + "step": 9250 + }, + { + "epoch": 1.2996491228070175, + "grad_norm": 1.5693955421447754, + "learning_rate": 4.21414762072612e-05, + "loss": 0.4734, + "step": 9260 + }, + { + "epoch": 1.3010526315789472, + "grad_norm": 1.4699418544769287, + "learning_rate": 4.2125412542876e-05, + "loss": 0.574, + "step": 9270 + }, + { + "epoch": 1.3024561403508772, + "grad_norm": 1.8956423997879028, + "learning_rate": 4.210933554577016e-05, + "loss": 0.4505, + "step": 9280 + }, + { + "epoch": 1.303859649122807, + "grad_norm": 1.8722734451293945, + "learning_rate": 4.209324522846018e-05, + "loss": 0.5021, + "step": 9290 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 1.3624267578125, + "learning_rate": 4.207714160347292e-05, + "loss": 0.4925, + "step": 9300 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 2.2316009998321533, + "learning_rate": 4.206102468334561e-05, + "loss": 0.6289, + "step": 9310 + }, + { + "epoch": 1.3080701754385964, + "grad_norm": 2.3992788791656494, + "learning_rate": 4.2044894480625825e-05, + "loss": 0.5387, + "step": 9320 + }, + { + "epoch": 1.3094736842105263, + "grad_norm": 1.9581996202468872, + "learning_rate": 4.202875100787147e-05, + "loss": 0.5788, + "step": 9330 + }, + { + "epoch": 1.310877192982456, + "grad_norm": 1.5940258502960205, + "learning_rate": 4.201259427765081e-05, + "loss": 0.5313, + "step": 9340 + }, + { + "epoch": 1.312280701754386, + "grad_norm": 1.2207392454147339, + "learning_rate": 4.1996424302542404e-05, + "loss": 0.5948, + "step": 9350 + }, + { + "epoch": 1.3136842105263158, + "grad_norm": 1.743915319442749, + "learning_rate": 4.198024109513512e-05, + "loss": 0.4913, + "step": 9360 + }, + { + "epoch": 1.3150877192982455, + "grad_norm": 1.9989562034606934, + "learning_rate": 4.196404466802816e-05, + "loss": 0.4895, + "step": 9370 + }, + { + "epoch": 1.3164912280701755, + "grad_norm": 1.8793307542800903, + "learning_rate": 4.194783503383098e-05, + "loss": 0.5537, + "step": 9380 + }, + { + "epoch": 1.3178947368421052, + "grad_norm": 1.9246269464492798, + "learning_rate": 4.193161220516334e-05, + "loss": 0.5641, + "step": 9390 + }, + { + "epoch": 1.3192982456140352, + "grad_norm": 1.5612519979476929, + "learning_rate": 4.191537619465529e-05, + "loss": 0.464, + "step": 9400 + }, + { + "epoch": 1.320701754385965, + "grad_norm": 0.9451802968978882, + "learning_rate": 4.189912701494709e-05, + "loss": 0.4657, + "step": 9410 + }, + { + "epoch": 1.3221052631578947, + "grad_norm": 1.790861964225769, + "learning_rate": 4.1882864678689296e-05, + "loss": 0.5113, + "step": 9420 + }, + { + "epoch": 1.3235087719298246, + "grad_norm": 1.9305384159088135, + "learning_rate": 4.186658919854269e-05, + "loss": 0.5593, + "step": 9430 + }, + { + "epoch": 1.3249122807017544, + "grad_norm": 2.051849603652954, + "learning_rate": 4.1850300587178304e-05, + "loss": 0.4578, + "step": 9440 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 1.7359352111816406, + "learning_rate": 4.183399885727737e-05, + "loss": 0.5637, + "step": 9450 + }, + { + "epoch": 1.327719298245614, + "grad_norm": 1.373119592666626, + "learning_rate": 4.181768402153135e-05, + "loss": 0.5491, + "step": 9460 + }, + { + "epoch": 1.3291228070175438, + "grad_norm": 1.0692249536514282, + "learning_rate": 4.1801356092641886e-05, + "loss": 0.5558, + "step": 9470 + }, + { + "epoch": 1.3305263157894736, + "grad_norm": 1.0997167825698853, + "learning_rate": 4.178501508332085e-05, + "loss": 0.4543, + "step": 9480 + }, + { + "epoch": 1.3319298245614035, + "grad_norm": 1.5097479820251465, + "learning_rate": 4.176866100629027e-05, + "loss": 0.5832, + "step": 9490 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.5708390474319458, + "learning_rate": 4.175229387428235e-05, + "loss": 0.5378, + "step": 9500 + }, + { + "epoch": 1.3347368421052632, + "grad_norm": 1.6936376094818115, + "learning_rate": 4.1735913700039477e-05, + "loss": 0.5046, + "step": 9510 + }, + { + "epoch": 1.336140350877193, + "grad_norm": 0.9409717321395874, + "learning_rate": 4.171952049631416e-05, + "loss": 0.5171, + "step": 9520 + }, + { + "epoch": 1.3375438596491227, + "grad_norm": 1.803077220916748, + "learning_rate": 4.170311427586908e-05, + "loss": 0.5939, + "step": 9530 + }, + { + "epoch": 1.3389473684210527, + "grad_norm": 1.1349605321884155, + "learning_rate": 4.168669505147705e-05, + "loss": 0.5768, + "step": 9540 + }, + { + "epoch": 1.3403508771929824, + "grad_norm": 1.6885027885437012, + "learning_rate": 4.1670262835920996e-05, + "loss": 0.5029, + "step": 9550 + }, + { + "epoch": 1.3417543859649124, + "grad_norm": 1.278064489364624, + "learning_rate": 4.1653817641993936e-05, + "loss": 0.4611, + "step": 9560 + }, + { + "epoch": 1.343157894736842, + "grad_norm": 1.7429572343826294, + "learning_rate": 4.163735948249905e-05, + "loss": 0.5701, + "step": 9570 + }, + { + "epoch": 1.3445614035087718, + "grad_norm": 2.2477900981903076, + "learning_rate": 4.162088837024956e-05, + "loss": 0.5356, + "step": 9580 + }, + { + "epoch": 1.3459649122807018, + "grad_norm": 1.617583990097046, + "learning_rate": 4.16044043180688e-05, + "loss": 0.4985, + "step": 9590 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 1.5791269540786743, + "learning_rate": 4.158790733879017e-05, + "loss": 0.5036, + "step": 9600 + }, + { + "epoch": 1.3487719298245615, + "grad_norm": 1.9323596954345703, + "learning_rate": 4.1571397445257124e-05, + "loss": 0.5212, + "step": 9610 + }, + { + "epoch": 1.3501754385964913, + "grad_norm": 1.3054085969924927, + "learning_rate": 4.155487465032319e-05, + "loss": 0.5225, + "step": 9620 + }, + { + "epoch": 1.351578947368421, + "grad_norm": 1.5751895904541016, + "learning_rate": 4.153833896685193e-05, + "loss": 0.4985, + "step": 9630 + }, + { + "epoch": 1.352982456140351, + "grad_norm": 1.8643230199813843, + "learning_rate": 4.1521790407716936e-05, + "loss": 0.5386, + "step": 9640 + }, + { + "epoch": 1.3543859649122807, + "grad_norm": 1.0118595361709595, + "learning_rate": 4.150522898580183e-05, + "loss": 0.5283, + "step": 9650 + }, + { + "epoch": 1.3557894736842107, + "grad_norm": 1.9065098762512207, + "learning_rate": 4.148865471400024e-05, + "loss": 0.5684, + "step": 9660 + }, + { + "epoch": 1.3571929824561404, + "grad_norm": 2.0933990478515625, + "learning_rate": 4.147206760521582e-05, + "loss": 0.525, + "step": 9670 + }, + { + "epoch": 1.3585964912280701, + "grad_norm": 2.0099165439605713, + "learning_rate": 4.145546767236219e-05, + "loss": 0.5258, + "step": 9680 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 2.2704153060913086, + "learning_rate": 4.143885492836297e-05, + "loss": 0.5159, + "step": 9690 + }, + { + "epoch": 1.3614035087719298, + "grad_norm": 1.3344398736953735, + "learning_rate": 4.1422229386151754e-05, + "loss": 0.5656, + "step": 9700 + }, + { + "epoch": 1.3628070175438596, + "grad_norm": 2.356660842895508, + "learning_rate": 4.140559105867209e-05, + "loss": 0.488, + "step": 9710 + }, + { + "epoch": 1.3642105263157895, + "grad_norm": 1.0358322858810425, + "learning_rate": 4.1388939958877495e-05, + "loss": 0.457, + "step": 9720 + }, + { + "epoch": 1.3656140350877193, + "grad_norm": 1.4958525896072388, + "learning_rate": 4.137227609973141e-05, + "loss": 0.459, + "step": 9730 + }, + { + "epoch": 1.367017543859649, + "grad_norm": 1.9942265748977661, + "learning_rate": 4.135559949420723e-05, + "loss": 0.4794, + "step": 9740 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.7793415784835815, + "learning_rate": 4.133891015528826e-05, + "loss": 0.5903, + "step": 9750 + }, + { + "epoch": 1.3698245614035087, + "grad_norm": 1.069421410560608, + "learning_rate": 4.132220809596772e-05, + "loss": 0.5521, + "step": 9760 + }, + { + "epoch": 1.3712280701754387, + "grad_norm": 0.8958350419998169, + "learning_rate": 4.1305493329248734e-05, + "loss": 0.4667, + "step": 9770 + }, + { + "epoch": 1.3726315789473684, + "grad_norm": 1.314070701599121, + "learning_rate": 4.128876586814433e-05, + "loss": 0.4307, + "step": 9780 + }, + { + "epoch": 1.3740350877192982, + "grad_norm": 1.3073476552963257, + "learning_rate": 4.127202572567741e-05, + "loss": 0.5016, + "step": 9790 + }, + { + "epoch": 1.3754385964912281, + "grad_norm": 1.695670247077942, + "learning_rate": 4.1255272914880735e-05, + "loss": 0.5489, + "step": 9800 + }, + { + "epoch": 1.3768421052631579, + "grad_norm": 1.6946247816085815, + "learning_rate": 4.1238507448796945e-05, + "loss": 0.488, + "step": 9810 + }, + { + "epoch": 1.3782456140350878, + "grad_norm": 1.3960559368133545, + "learning_rate": 4.122172934047855e-05, + "loss": 0.5739, + "step": 9820 + }, + { + "epoch": 1.3796491228070176, + "grad_norm": 1.4782212972640991, + "learning_rate": 4.120493860298786e-05, + "loss": 0.5036, + "step": 9830 + }, + { + "epoch": 1.3810526315789473, + "grad_norm": 1.7010905742645264, + "learning_rate": 4.1188135249397056e-05, + "loss": 0.4737, + "step": 9840 + }, + { + "epoch": 1.3824561403508773, + "grad_norm": 1.8230018615722656, + "learning_rate": 4.117131929278811e-05, + "loss": 0.5341, + "step": 9850 + }, + { + "epoch": 1.383859649122807, + "grad_norm": 1.5947978496551514, + "learning_rate": 4.1154490746252825e-05, + "loss": 0.4567, + "step": 9860 + }, + { + "epoch": 1.385263157894737, + "grad_norm": 2.077136516571045, + "learning_rate": 4.113764962289281e-05, + "loss": 0.5586, + "step": 9870 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 1.0836787223815918, + "learning_rate": 4.112079593581944e-05, + "loss": 0.5065, + "step": 9880 + }, + { + "epoch": 1.3880701754385965, + "grad_norm": 1.8127710819244385, + "learning_rate": 4.110392969815391e-05, + "loss": 0.5335, + "step": 9890 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 1.7939358949661255, + "learning_rate": 4.108705092302715e-05, + "loss": 0.5445, + "step": 9900 + }, + { + "epoch": 1.3908771929824562, + "grad_norm": 1.5963224172592163, + "learning_rate": 4.1070159623579855e-05, + "loss": 0.5146, + "step": 9910 + }, + { + "epoch": 1.392280701754386, + "grad_norm": 1.4976569414138794, + "learning_rate": 4.105325581296251e-05, + "loss": 0.4938, + "step": 9920 + }, + { + "epoch": 1.3936842105263159, + "grad_norm": 1.3835642337799072, + "learning_rate": 4.103633950433528e-05, + "loss": 0.5353, + "step": 9930 + }, + { + "epoch": 1.3950877192982456, + "grad_norm": 1.504701018333435, + "learning_rate": 4.1019410710868115e-05, + "loss": 0.4869, + "step": 9940 + }, + { + "epoch": 1.3964912280701753, + "grad_norm": 1.6695371866226196, + "learning_rate": 4.100246944574064e-05, + "loss": 0.4858, + "step": 9950 + }, + { + "epoch": 1.3978947368421053, + "grad_norm": 1.033554196357727, + "learning_rate": 4.098551572214223e-05, + "loss": 0.5173, + "step": 9960 + }, + { + "epoch": 1.399298245614035, + "grad_norm": 2.1895320415496826, + "learning_rate": 4.0968549553271926e-05, + "loss": 0.5862, + "step": 9970 + }, + { + "epoch": 1.400701754385965, + "grad_norm": 2.323758363723755, + "learning_rate": 4.095157095233848e-05, + "loss": 0.5312, + "step": 9980 + }, + { + "epoch": 1.4021052631578947, + "grad_norm": 1.7012853622436523, + "learning_rate": 4.093457993256031e-05, + "loss": 0.4668, + "step": 9990 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 1.5631529092788696, + "learning_rate": 4.0917576507165514e-05, + "loss": 0.5192, + "step": 10000 + }, + { + "epoch": 1.4035087719298245, + "eval_loss": 0.6501449942588806, + "eval_runtime": 43.9031, + "eval_samples_per_second": 34.166, + "eval_steps_per_second": 8.542, + "step": 10000 + }, + { + "epoch": 1.4049122807017544, + "grad_norm": 1.5768696069717407, + "learning_rate": 4.090056068939183e-05, + "loss": 0.6369, + "step": 10010 + }, + { + "epoch": 1.4063157894736842, + "grad_norm": 2.0955562591552734, + "learning_rate": 4.088353249248667e-05, + "loss": 0.4765, + "step": 10020 + }, + { + "epoch": 1.4077192982456141, + "grad_norm": 1.7173198461532593, + "learning_rate": 4.0866491929707064e-05, + "loss": 0.4858, + "step": 10030 + }, + { + "epoch": 1.4091228070175439, + "grad_norm": 1.091640591621399, + "learning_rate": 4.084943901431966e-05, + "loss": 0.4502, + "step": 10040 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 0.9591197967529297, + "learning_rate": 4.083237375960075e-05, + "loss": 0.5036, + "step": 10050 + }, + { + "epoch": 1.4119298245614036, + "grad_norm": 1.6289422512054443, + "learning_rate": 4.081529617883622e-05, + "loss": 0.5185, + "step": 10060 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 1.0051218271255493, + "learning_rate": 4.079820628532155e-05, + "loss": 0.4701, + "step": 10070 + }, + { + "epoch": 1.4147368421052633, + "grad_norm": 1.5907773971557617, + "learning_rate": 4.0781104092361813e-05, + "loss": 0.5406, + "step": 10080 + }, + { + "epoch": 1.416140350877193, + "grad_norm": 1.4398341178894043, + "learning_rate": 4.0763989613271635e-05, + "loss": 0.4963, + "step": 10090 + }, + { + "epoch": 1.4175438596491228, + "grad_norm": 1.8131810426712036, + "learning_rate": 4.0746862861375245e-05, + "loss": 0.5931, + "step": 10100 + }, + { + "epoch": 1.4189473684210525, + "grad_norm": 1.3968654870986938, + "learning_rate": 4.07297238500064e-05, + "loss": 0.4908, + "step": 10110 + }, + { + "epoch": 1.4203508771929825, + "grad_norm": 1.4002443552017212, + "learning_rate": 4.0712572592508394e-05, + "loss": 0.5732, + "step": 10120 + }, + { + "epoch": 1.4217543859649122, + "grad_norm": 1.8497573137283325, + "learning_rate": 4.069540910223409e-05, + "loss": 0.5323, + "step": 10130 + }, + { + "epoch": 1.4231578947368422, + "grad_norm": 1.6966348886489868, + "learning_rate": 4.067823339254584e-05, + "loss": 0.5727, + "step": 10140 + }, + { + "epoch": 1.424561403508772, + "grad_norm": 1.1128507852554321, + "learning_rate": 4.066104547681553e-05, + "loss": 0.5295, + "step": 10150 + }, + { + "epoch": 1.4259649122807017, + "grad_norm": 1.2667880058288574, + "learning_rate": 4.0643845368424545e-05, + "loss": 0.554, + "step": 10160 + }, + { + "epoch": 1.4273684210526316, + "grad_norm": 2.0188159942626953, + "learning_rate": 4.062663308076374e-05, + "loss": 0.5138, + "step": 10170 + }, + { + "epoch": 1.4287719298245614, + "grad_norm": 1.113797903060913, + "learning_rate": 4.0609408627233494e-05, + "loss": 0.543, + "step": 10180 + }, + { + "epoch": 1.4301754385964913, + "grad_norm": 1.800862193107605, + "learning_rate": 4.059217202124361e-05, + "loss": 0.5094, + "step": 10190 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 1.7962946891784668, + "learning_rate": 4.0574923276213405e-05, + "loss": 0.5468, + "step": 10200 + }, + { + "epoch": 1.4329824561403508, + "grad_norm": 2.258661985397339, + "learning_rate": 4.0557662405571595e-05, + "loss": 0.6082, + "step": 10210 + }, + { + "epoch": 1.4343859649122808, + "grad_norm": 2.073396921157837, + "learning_rate": 4.054038942275637e-05, + "loss": 0.5164, + "step": 10220 + }, + { + "epoch": 1.4357894736842105, + "grad_norm": 1.6468226909637451, + "learning_rate": 4.052310434121533e-05, + "loss": 0.5451, + "step": 10230 + }, + { + "epoch": 1.4371929824561405, + "grad_norm": 2.4547080993652344, + "learning_rate": 4.050580717440552e-05, + "loss": 0.5821, + "step": 10240 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.6606553792953491, + "learning_rate": 4.048849793579337e-05, + "loss": 0.5088, + "step": 10250 + }, + { + "epoch": 1.44, + "grad_norm": 1.6139086484909058, + "learning_rate": 4.04711766388547e-05, + "loss": 0.5441, + "step": 10260 + }, + { + "epoch": 1.4414035087719297, + "grad_norm": 1.6367645263671875, + "learning_rate": 4.0453843297074756e-05, + "loss": 0.494, + "step": 10270 + }, + { + "epoch": 1.4428070175438596, + "grad_norm": 1.6214492321014404, + "learning_rate": 4.043649792394812e-05, + "loss": 0.571, + "step": 10280 + }, + { + "epoch": 1.4442105263157896, + "grad_norm": 1.8566523790359497, + "learning_rate": 4.041914053297878e-05, + "loss": 0.5845, + "step": 10290 + }, + { + "epoch": 1.4456140350877194, + "grad_norm": 1.105668067932129, + "learning_rate": 4.0401771137680046e-05, + "loss": 0.4655, + "step": 10300 + }, + { + "epoch": 1.447017543859649, + "grad_norm": 1.5444446802139282, + "learning_rate": 4.038438975157458e-05, + "loss": 0.4939, + "step": 10310 + }, + { + "epoch": 1.4484210526315788, + "grad_norm": 2.6764674186706543, + "learning_rate": 4.036699638819441e-05, + "loss": 0.6172, + "step": 10320 + }, + { + "epoch": 1.4498245614035088, + "grad_norm": 1.574623942375183, + "learning_rate": 4.0349591061080846e-05, + "loss": 0.4888, + "step": 10330 + }, + { + "epoch": 1.4512280701754385, + "grad_norm": 2.2457685470581055, + "learning_rate": 4.0332173783784536e-05, + "loss": 0.4427, + "step": 10340 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 1.68437659740448, + "learning_rate": 4.031474456986543e-05, + "loss": 0.4867, + "step": 10350 + }, + { + "epoch": 1.4540350877192982, + "grad_norm": 1.4421491622924805, + "learning_rate": 4.0297303432892775e-05, + "loss": 0.4401, + "step": 10360 + }, + { + "epoch": 1.455438596491228, + "grad_norm": 2.1541783809661865, + "learning_rate": 4.027985038644507e-05, + "loss": 0.546, + "step": 10370 + }, + { + "epoch": 1.456842105263158, + "grad_norm": 1.7601039409637451, + "learning_rate": 4.026238544411014e-05, + "loss": 0.5211, + "step": 10380 + }, + { + "epoch": 1.4582456140350877, + "grad_norm": 1.2212331295013428, + "learning_rate": 4.024490861948503e-05, + "loss": 0.4633, + "step": 10390 + }, + { + "epoch": 1.4596491228070176, + "grad_norm": 1.688339114189148, + "learning_rate": 4.022741992617603e-05, + "loss": 0.5898, + "step": 10400 + }, + { + "epoch": 1.4610526315789474, + "grad_norm": 0.9874732494354248, + "learning_rate": 4.020991937779872e-05, + "loss": 0.4944, + "step": 10410 + }, + { + "epoch": 1.4624561403508771, + "grad_norm": 0.9479324817657471, + "learning_rate": 4.019240698797785e-05, + "loss": 0.55, + "step": 10420 + }, + { + "epoch": 1.463859649122807, + "grad_norm": 2.4362101554870605, + "learning_rate": 4.017488277034742e-05, + "loss": 0.5103, + "step": 10430 + }, + { + "epoch": 1.4652631578947368, + "grad_norm": 1.659631371498108, + "learning_rate": 4.015734673855065e-05, + "loss": 0.5073, + "step": 10440 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.9145371913909912, + "learning_rate": 4.013979890623992e-05, + "loss": 0.5588, + "step": 10450 + }, + { + "epoch": 1.4680701754385965, + "grad_norm": 1.3370527029037476, + "learning_rate": 4.0122239287076834e-05, + "loss": 0.5984, + "step": 10460 + }, + { + "epoch": 1.4694736842105263, + "grad_norm": 1.9445977210998535, + "learning_rate": 4.010466789473215e-05, + "loss": 0.5437, + "step": 10470 + }, + { + "epoch": 1.470877192982456, + "grad_norm": 1.197405457496643, + "learning_rate": 4.008708474288581e-05, + "loss": 0.4573, + "step": 10480 + }, + { + "epoch": 1.472280701754386, + "grad_norm": 1.8886786699295044, + "learning_rate": 4.006948984522687e-05, + "loss": 0.5319, + "step": 10490 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.8042242527008057, + "learning_rate": 4.00518832154536e-05, + "loss": 0.4559, + "step": 10500 + }, + { + "epoch": 1.4750877192982457, + "grad_norm": 1.6905839443206787, + "learning_rate": 4.003426486727335e-05, + "loss": 0.4776, + "step": 10510 + }, + { + "epoch": 1.4764912280701754, + "grad_norm": 1.3357295989990234, + "learning_rate": 4.00166348144026e-05, + "loss": 0.5672, + "step": 10520 + }, + { + "epoch": 1.4778947368421052, + "grad_norm": 1.399383783340454, + "learning_rate": 3.9998993070566954e-05, + "loss": 0.5926, + "step": 10530 + }, + { + "epoch": 1.4792982456140351, + "grad_norm": 1.8520910739898682, + "learning_rate": 3.998133964950112e-05, + "loss": 0.4975, + "step": 10540 + }, + { + "epoch": 1.4807017543859649, + "grad_norm": 0.865352988243103, + "learning_rate": 3.9963674564948886e-05, + "loss": 0.5229, + "step": 10550 + }, + { + "epoch": 1.4821052631578948, + "grad_norm": 2.3526158332824707, + "learning_rate": 3.9945997830663126e-05, + "loss": 0.4995, + "step": 10560 + }, + { + "epoch": 1.4835087719298246, + "grad_norm": 2.1046128273010254, + "learning_rate": 3.992830946040579e-05, + "loss": 0.5464, + "step": 10570 + }, + { + "epoch": 1.4849122807017543, + "grad_norm": 2.0516717433929443, + "learning_rate": 3.9910609467947866e-05, + "loss": 0.5691, + "step": 10580 + }, + { + "epoch": 1.4863157894736843, + "grad_norm": 1.386889100074768, + "learning_rate": 3.989289786706942e-05, + "loss": 0.5982, + "step": 10590 + }, + { + "epoch": 1.487719298245614, + "grad_norm": 1.7038406133651733, + "learning_rate": 3.987517467155954e-05, + "loss": 0.5173, + "step": 10600 + }, + { + "epoch": 1.489122807017544, + "grad_norm": 1.3720016479492188, + "learning_rate": 3.985743989521633e-05, + "loss": 0.4406, + "step": 10610 + }, + { + "epoch": 1.4905263157894737, + "grad_norm": 1.5871185064315796, + "learning_rate": 3.9839693551846924e-05, + "loss": 0.5417, + "step": 10620 + }, + { + "epoch": 1.4919298245614034, + "grad_norm": 1.440131664276123, + "learning_rate": 3.982193565526747e-05, + "loss": 0.6226, + "step": 10630 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 1.91805100440979, + "learning_rate": 3.9804166219303086e-05, + "loss": 0.5337, + "step": 10640 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 1.9551035165786743, + "learning_rate": 3.9786385257787886e-05, + "loss": 0.5027, + "step": 10650 + }, + { + "epoch": 1.496140350877193, + "grad_norm": 1.93511962890625, + "learning_rate": 3.9768592784564974e-05, + "loss": 0.5676, + "step": 10660 + }, + { + "epoch": 1.4975438596491228, + "grad_norm": 1.6610795259475708, + "learning_rate": 3.975078881348638e-05, + "loss": 0.4542, + "step": 10670 + }, + { + "epoch": 1.4989473684210526, + "grad_norm": 1.6202747821807861, + "learning_rate": 3.9732973358413115e-05, + "loss": 0.5563, + "step": 10680 + }, + { + "epoch": 1.5003508771929823, + "grad_norm": 1.8837049007415771, + "learning_rate": 3.971514643321513e-05, + "loss": 0.4128, + "step": 10690 + }, + { + "epoch": 1.5017543859649123, + "grad_norm": 1.5047054290771484, + "learning_rate": 3.969730805177129e-05, + "loss": 0.4909, + "step": 10700 + }, + { + "epoch": 1.5031578947368422, + "grad_norm": 1.5055679082870483, + "learning_rate": 3.967945822796938e-05, + "loss": 0.5664, + "step": 10710 + }, + { + "epoch": 1.504561403508772, + "grad_norm": 1.2587159872055054, + "learning_rate": 3.9661596975706104e-05, + "loss": 0.5827, + "step": 10720 + }, + { + "epoch": 1.5059649122807017, + "grad_norm": 1.2444645166397095, + "learning_rate": 3.9643724308887065e-05, + "loss": 0.5105, + "step": 10730 + }, + { + "epoch": 1.5073684210526315, + "grad_norm": 2.214508295059204, + "learning_rate": 3.962584024142675e-05, + "loss": 0.5455, + "step": 10740 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 1.7483428716659546, + "learning_rate": 3.96079447872485e-05, + "loss": 0.5419, + "step": 10750 + }, + { + "epoch": 1.5101754385964914, + "grad_norm": 2.044471263885498, + "learning_rate": 3.9590037960284546e-05, + "loss": 0.5861, + "step": 10760 + }, + { + "epoch": 1.5115789473684211, + "grad_norm": 2.063427209854126, + "learning_rate": 3.9572119774475975e-05, + "loss": 0.5692, + "step": 10770 + }, + { + "epoch": 1.5129824561403509, + "grad_norm": 2.0187671184539795, + "learning_rate": 3.95541902437727e-05, + "loss": 0.5345, + "step": 10780 + }, + { + "epoch": 1.5143859649122806, + "grad_norm": 1.9781345129013062, + "learning_rate": 3.953624938213348e-05, + "loss": 0.5212, + "step": 10790 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 1.9213941097259521, + "learning_rate": 3.95182972035259e-05, + "loss": 0.4838, + "step": 10800 + }, + { + "epoch": 1.5171929824561403, + "grad_norm": 2.619076728820801, + "learning_rate": 3.950033372192633e-05, + "loss": 0.5011, + "step": 10810 + }, + { + "epoch": 1.5185964912280703, + "grad_norm": 1.8988882303237915, + "learning_rate": 3.948235895131997e-05, + "loss": 0.5043, + "step": 10820 + }, + { + "epoch": 1.52, + "grad_norm": 1.4304900169372559, + "learning_rate": 3.946437290570078e-05, + "loss": 0.5062, + "step": 10830 + }, + { + "epoch": 1.5214035087719298, + "grad_norm": 3.060408353805542, + "learning_rate": 3.944637559907152e-05, + "loss": 0.6164, + "step": 10840 + }, + { + "epoch": 1.5228070175438595, + "grad_norm": 0.9590080976486206, + "learning_rate": 3.9428367045443704e-05, + "loss": 0.5159, + "step": 10850 + }, + { + "epoch": 1.5242105263157895, + "grad_norm": 2.1461575031280518, + "learning_rate": 3.941034725883762e-05, + "loss": 0.6505, + "step": 10860 + }, + { + "epoch": 1.5256140350877194, + "grad_norm": 1.578477144241333, + "learning_rate": 3.939231625328229e-05, + "loss": 0.4808, + "step": 10870 + }, + { + "epoch": 1.5270175438596492, + "grad_norm": 1.8510093688964844, + "learning_rate": 3.9374274042815465e-05, + "loss": 0.5194, + "step": 10880 + }, + { + "epoch": 1.528421052631579, + "grad_norm": 1.9132167100906372, + "learning_rate": 3.935622064148361e-05, + "loss": 0.5079, + "step": 10890 + }, + { + "epoch": 1.5298245614035086, + "grad_norm": 1.3352388143539429, + "learning_rate": 3.9338156063341946e-05, + "loss": 0.4808, + "step": 10900 + }, + { + "epoch": 1.5312280701754386, + "grad_norm": 2.102167844772339, + "learning_rate": 3.932008032245434e-05, + "loss": 0.429, + "step": 10910 + }, + { + "epoch": 1.5326315789473686, + "grad_norm": 1.9585574865341187, + "learning_rate": 3.930199343289339e-05, + "loss": 0.489, + "step": 10920 + }, + { + "epoch": 1.5340350877192983, + "grad_norm": 1.905050277709961, + "learning_rate": 3.9283895408740355e-05, + "loss": 0.4881, + "step": 10930 + }, + { + "epoch": 1.535438596491228, + "grad_norm": 1.964416742324829, + "learning_rate": 3.926578626408517e-05, + "loss": 0.5913, + "step": 10940 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 2.3363118171691895, + "learning_rate": 3.924766601302642e-05, + "loss": 0.4719, + "step": 10950 + }, + { + "epoch": 1.5382456140350877, + "grad_norm": 1.7316786050796509, + "learning_rate": 3.9229534669671344e-05, + "loss": 0.5445, + "step": 10960 + }, + { + "epoch": 1.5396491228070175, + "grad_norm": 2.2813808917999268, + "learning_rate": 3.9211392248135815e-05, + "loss": 0.4989, + "step": 10970 + }, + { + "epoch": 1.5410526315789475, + "grad_norm": 0.9021309018135071, + "learning_rate": 3.9193238762544325e-05, + "loss": 0.5321, + "step": 10980 + }, + { + "epoch": 1.5424561403508772, + "grad_norm": 2.614776134490967, + "learning_rate": 3.9175074227029996e-05, + "loss": 0.4765, + "step": 10990 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 2.1491498947143555, + "learning_rate": 3.915689865573454e-05, + "loss": 0.5748, + "step": 11000 + }, + { + "epoch": 1.545263157894737, + "grad_norm": 1.7609467506408691, + "learning_rate": 3.913871206280824e-05, + "loss": 0.7091, + "step": 11010 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 2.5683560371398926, + "learning_rate": 3.912051446241001e-05, + "loss": 0.507, + "step": 11020 + }, + { + "epoch": 1.5480701754385966, + "grad_norm": 1.3458114862442017, + "learning_rate": 3.910230586870729e-05, + "loss": 0.5738, + "step": 11030 + }, + { + "epoch": 1.5494736842105263, + "grad_norm": 1.7107462882995605, + "learning_rate": 3.90840862958761e-05, + "loss": 0.4437, + "step": 11040 + }, + { + "epoch": 1.550877192982456, + "grad_norm": 1.6637877225875854, + "learning_rate": 3.9065855758101e-05, + "loss": 0.4859, + "step": 11050 + }, + { + "epoch": 1.5522807017543858, + "grad_norm": 2.1268763542175293, + "learning_rate": 3.904761426957509e-05, + "loss": 0.5433, + "step": 11060 + }, + { + "epoch": 1.5536842105263158, + "grad_norm": 1.8485718965530396, + "learning_rate": 3.902936184449999e-05, + "loss": 0.5938, + "step": 11070 + }, + { + "epoch": 1.5550877192982457, + "grad_norm": 1.9369820356369019, + "learning_rate": 3.901109849708585e-05, + "loss": 0.4484, + "step": 11080 + }, + { + "epoch": 1.5564912280701755, + "grad_norm": 1.5223256349563599, + "learning_rate": 3.8992824241551295e-05, + "loss": 0.4353, + "step": 11090 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 1.2845451831817627, + "learning_rate": 3.897453909212348e-05, + "loss": 0.4497, + "step": 11100 + }, + { + "epoch": 1.559298245614035, + "grad_norm": 1.2751349210739136, + "learning_rate": 3.895624306303799e-05, + "loss": 0.4648, + "step": 11110 + }, + { + "epoch": 1.560701754385965, + "grad_norm": 1.6384958028793335, + "learning_rate": 3.893793616853894e-05, + "loss": 0.5921, + "step": 11120 + }, + { + "epoch": 1.5621052631578949, + "grad_norm": 1.817355990409851, + "learning_rate": 3.891961842287886e-05, + "loss": 0.4611, + "step": 11130 + }, + { + "epoch": 1.5635087719298246, + "grad_norm": 1.7115503549575806, + "learning_rate": 3.890128984031876e-05, + "loss": 0.4745, + "step": 11140 + }, + { + "epoch": 1.5649122807017544, + "grad_norm": 1.7166131734848022, + "learning_rate": 3.888295043512804e-05, + "loss": 0.5716, + "step": 11150 + }, + { + "epoch": 1.566315789473684, + "grad_norm": 1.8528428077697754, + "learning_rate": 3.886460022158458e-05, + "loss": 0.5193, + "step": 11160 + }, + { + "epoch": 1.567719298245614, + "grad_norm": 1.9985193014144897, + "learning_rate": 3.884623921397463e-05, + "loss": 0.4974, + "step": 11170 + }, + { + "epoch": 1.5691228070175438, + "grad_norm": 1.4072109460830688, + "learning_rate": 3.882786742659289e-05, + "loss": 0.4418, + "step": 11180 + }, + { + "epoch": 1.5705263157894738, + "grad_norm": 1.3553410768508911, + "learning_rate": 3.880948487374241e-05, + "loss": 0.5278, + "step": 11190 + }, + { + "epoch": 1.5719298245614035, + "grad_norm": 1.6441354751586914, + "learning_rate": 3.8791091569734625e-05, + "loss": 0.476, + "step": 11200 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 1.4078179597854614, + "learning_rate": 3.8772687528889385e-05, + "loss": 0.581, + "step": 11210 + }, + { + "epoch": 1.5747368421052632, + "grad_norm": 2.096179723739624, + "learning_rate": 3.875427276553485e-05, + "loss": 0.5076, + "step": 11220 + }, + { + "epoch": 1.576140350877193, + "grad_norm": 1.3418902158737183, + "learning_rate": 3.873584729400753e-05, + "loss": 0.5177, + "step": 11230 + }, + { + "epoch": 1.577543859649123, + "grad_norm": 2.1806328296661377, + "learning_rate": 3.8717411128652304e-05, + "loss": 0.5348, + "step": 11240 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.6312358379364014, + "learning_rate": 3.869896428382236e-05, + "loss": 0.4699, + "step": 11250 + }, + { + "epoch": 1.5803508771929824, + "grad_norm": 2.4073407649993896, + "learning_rate": 3.8680506773879184e-05, + "loss": 0.5403, + "step": 11260 + }, + { + "epoch": 1.5817543859649121, + "grad_norm": 2.2572133541107178, + "learning_rate": 3.8662038613192596e-05, + "loss": 0.4982, + "step": 11270 + }, + { + "epoch": 1.583157894736842, + "grad_norm": 1.9021124839782715, + "learning_rate": 3.8643559816140685e-05, + "loss": 0.5025, + "step": 11280 + }, + { + "epoch": 1.584561403508772, + "grad_norm": 1.5274001359939575, + "learning_rate": 3.862507039710982e-05, + "loss": 0.4716, + "step": 11290 + }, + { + "epoch": 1.5859649122807018, + "grad_norm": 1.0260459184646606, + "learning_rate": 3.860657037049466e-05, + "loss": 0.5378, + "step": 11300 + }, + { + "epoch": 1.5873684210526315, + "grad_norm": 2.2560298442840576, + "learning_rate": 3.85880597506981e-05, + "loss": 0.5017, + "step": 11310 + }, + { + "epoch": 1.5887719298245613, + "grad_norm": 1.3712495565414429, + "learning_rate": 3.856953855213131e-05, + "loss": 0.4612, + "step": 11320 + }, + { + "epoch": 1.5901754385964912, + "grad_norm": 1.3681050539016724, + "learning_rate": 3.855100678921365e-05, + "loss": 0.5077, + "step": 11330 + }, + { + "epoch": 1.5915789473684212, + "grad_norm": 2.0733203887939453, + "learning_rate": 3.8532464476372765e-05, + "loss": 0.5643, + "step": 11340 + }, + { + "epoch": 1.592982456140351, + "grad_norm": 2.131699323654175, + "learning_rate": 3.851391162804445e-05, + "loss": 0.4939, + "step": 11350 + }, + { + "epoch": 1.5943859649122807, + "grad_norm": 1.787874698638916, + "learning_rate": 3.849534825867275e-05, + "loss": 0.5191, + "step": 11360 + }, + { + "epoch": 1.5957894736842104, + "grad_norm": 1.7359322309494019, + "learning_rate": 3.847677438270988e-05, + "loss": 0.5361, + "step": 11370 + }, + { + "epoch": 1.5971929824561404, + "grad_norm": 1.7936285734176636, + "learning_rate": 3.845819001461625e-05, + "loss": 0.5005, + "step": 11380 + }, + { + "epoch": 1.5985964912280701, + "grad_norm": 0.9876174330711365, + "learning_rate": 3.8439595168860406e-05, + "loss": 0.491, + "step": 11390 + }, + { + "epoch": 1.6, + "grad_norm": 1.5902925729751587, + "learning_rate": 3.842098985991909e-05, + "loss": 0.5636, + "step": 11400 + }, + { + "epoch": 1.6014035087719298, + "grad_norm": 2.157257556915283, + "learning_rate": 3.840237410227717e-05, + "loss": 0.5482, + "step": 11410 + }, + { + "epoch": 1.6028070175438596, + "grad_norm": 2.4455907344818115, + "learning_rate": 3.838374791042764e-05, + "loss": 0.4854, + "step": 11420 + }, + { + "epoch": 1.6042105263157893, + "grad_norm": 2.3983774185180664, + "learning_rate": 3.8365111298871645e-05, + "loss": 0.5535, + "step": 11430 + }, + { + "epoch": 1.6056140350877193, + "grad_norm": 1.6250687837600708, + "learning_rate": 3.834646428211841e-05, + "loss": 0.5493, + "step": 11440 + }, + { + "epoch": 1.6070175438596492, + "grad_norm": 0.9640924334526062, + "learning_rate": 3.83278068746853e-05, + "loss": 0.5126, + "step": 11450 + }, + { + "epoch": 1.608421052631579, + "grad_norm": 1.3491884469985962, + "learning_rate": 3.830913909109772e-05, + "loss": 0.5692, + "step": 11460 + }, + { + "epoch": 1.6098245614035087, + "grad_norm": 1.2153112888336182, + "learning_rate": 3.8290460945889186e-05, + "loss": 0.4367, + "step": 11470 + }, + { + "epoch": 1.6112280701754385, + "grad_norm": 1.869314193725586, + "learning_rate": 3.827177245360129e-05, + "loss": 0.5275, + "step": 11480 + }, + { + "epoch": 1.6126315789473684, + "grad_norm": 2.1873159408569336, + "learning_rate": 3.825307362878364e-05, + "loss": 0.5663, + "step": 11490 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 1.1323574781417847, + "learning_rate": 3.823436448599393e-05, + "loss": 0.4986, + "step": 11500 + }, + { + "epoch": 1.6154385964912281, + "grad_norm": 1.424310326576233, + "learning_rate": 3.8215645039797874e-05, + "loss": 0.5401, + "step": 11510 + }, + { + "epoch": 1.6168421052631579, + "grad_norm": 1.8369444608688354, + "learning_rate": 3.8196915304769184e-05, + "loss": 0.534, + "step": 11520 + }, + { + "epoch": 1.6182456140350876, + "grad_norm": 1.404891848564148, + "learning_rate": 3.817817529548962e-05, + "loss": 0.457, + "step": 11530 + }, + { + "epoch": 1.6196491228070176, + "grad_norm": 1.5708239078521729, + "learning_rate": 3.815942502654889e-05, + "loss": 0.5023, + "step": 11540 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 1.4378736019134521, + "learning_rate": 3.8140664512544746e-05, + "loss": 0.4885, + "step": 11550 + }, + { + "epoch": 1.6224561403508773, + "grad_norm": 2.0573270320892334, + "learning_rate": 3.8121893768082896e-05, + "loss": 0.5204, + "step": 11560 + }, + { + "epoch": 1.623859649122807, + "grad_norm": 1.4034109115600586, + "learning_rate": 3.8103112807776986e-05, + "loss": 0.4611, + "step": 11570 + }, + { + "epoch": 1.6252631578947367, + "grad_norm": 1.8642430305480957, + "learning_rate": 3.8084321646248654e-05, + "loss": 0.4999, + "step": 11580 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 2.0132107734680176, + "learning_rate": 3.806552029812747e-05, + "loss": 0.5241, + "step": 11590 + }, + { + "epoch": 1.6280701754385964, + "grad_norm": 1.3874375820159912, + "learning_rate": 3.804670877805091e-05, + "loss": 0.5275, + "step": 11600 + }, + { + "epoch": 1.6294736842105264, + "grad_norm": 1.296034574508667, + "learning_rate": 3.802788710066439e-05, + "loss": 0.4517, + "step": 11610 + }, + { + "epoch": 1.6308771929824561, + "grad_norm": 1.7629979848861694, + "learning_rate": 3.800905528062123e-05, + "loss": 0.4437, + "step": 11620 + }, + { + "epoch": 1.6322807017543859, + "grad_norm": 1.788439393043518, + "learning_rate": 3.7990213332582665e-05, + "loss": 0.5334, + "step": 11630 + }, + { + "epoch": 1.6336842105263156, + "grad_norm": 1.6043287515640259, + "learning_rate": 3.7971361271217775e-05, + "loss": 0.5915, + "step": 11640 + }, + { + "epoch": 1.6350877192982456, + "grad_norm": 1.1127432584762573, + "learning_rate": 3.7952499111203544e-05, + "loss": 0.633, + "step": 11650 + }, + { + "epoch": 1.6364912280701756, + "grad_norm": 1.6562187671661377, + "learning_rate": 3.793362686722483e-05, + "loss": 0.523, + "step": 11660 + }, + { + "epoch": 1.6378947368421053, + "grad_norm": 2.0622971057891846, + "learning_rate": 3.7914744553974284e-05, + "loss": 0.5025, + "step": 11670 + }, + { + "epoch": 1.639298245614035, + "grad_norm": 2.0334134101867676, + "learning_rate": 3.789585218615246e-05, + "loss": 0.5153, + "step": 11680 + }, + { + "epoch": 1.6407017543859648, + "grad_norm": 1.7830958366394043, + "learning_rate": 3.787694977846771e-05, + "loss": 0.5783, + "step": 11690 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 2.2819151878356934, + "learning_rate": 3.78580373456362e-05, + "loss": 0.5333, + "step": 11700 + }, + { + "epoch": 1.6435087719298247, + "grad_norm": 1.7580994367599487, + "learning_rate": 3.783911490238191e-05, + "loss": 0.574, + "step": 11710 + }, + { + "epoch": 1.6449122807017544, + "grad_norm": 1.012489676475525, + "learning_rate": 3.782018246343661e-05, + "loss": 0.5028, + "step": 11720 + }, + { + "epoch": 1.6463157894736842, + "grad_norm": 0.9846917390823364, + "learning_rate": 3.780124004353987e-05, + "loss": 0.5425, + "step": 11730 + }, + { + "epoch": 1.647719298245614, + "grad_norm": 1.3875446319580078, + "learning_rate": 3.778228765743898e-05, + "loss": 0.4961, + "step": 11740 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.912992537021637, + "learning_rate": 3.776332531988903e-05, + "loss": 0.5135, + "step": 11750 + }, + { + "epoch": 1.6505263157894738, + "grad_norm": 1.4657293558120728, + "learning_rate": 3.774435304565288e-05, + "loss": 0.5917, + "step": 11760 + }, + { + "epoch": 1.6519298245614036, + "grad_norm": 1.4134496450424194, + "learning_rate": 3.772537084950106e-05, + "loss": 0.6529, + "step": 11770 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 1.0357835292816162, + "learning_rate": 3.770637874621189e-05, + "loss": 0.4853, + "step": 11780 + }, + { + "epoch": 1.654736842105263, + "grad_norm": 1.1166404485702515, + "learning_rate": 3.7687376750571347e-05, + "loss": 0.5509, + "step": 11790 + }, + { + "epoch": 1.656140350877193, + "grad_norm": 1.819243311882019, + "learning_rate": 3.7668364877373154e-05, + "loss": 0.5083, + "step": 11800 + }, + { + "epoch": 1.6575438596491228, + "grad_norm": 2.0793590545654297, + "learning_rate": 3.764934314141869e-05, + "loss": 0.5239, + "step": 11810 + }, + { + "epoch": 1.6589473684210527, + "grad_norm": 1.4497408866882324, + "learning_rate": 3.763031155751705e-05, + "loss": 0.5295, + "step": 11820 + }, + { + "epoch": 1.6603508771929825, + "grad_norm": 1.8401798009872437, + "learning_rate": 3.7611270140484956e-05, + "loss": 0.3987, + "step": 11830 + }, + { + "epoch": 1.6617543859649122, + "grad_norm": 1.0776817798614502, + "learning_rate": 3.759221890514681e-05, + "loss": 0.5236, + "step": 11840 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 1.6151456832885742, + "learning_rate": 3.757315786633465e-05, + "loss": 0.4783, + "step": 11850 + }, + { + "epoch": 1.664561403508772, + "grad_norm": 2.149061679840088, + "learning_rate": 3.7554087038888155e-05, + "loss": 0.6304, + "step": 11860 + }, + { + "epoch": 1.6659649122807019, + "grad_norm": 1.848923683166504, + "learning_rate": 3.753500643765461e-05, + "loss": 0.4951, + "step": 11870 + }, + { + "epoch": 1.6673684210526316, + "grad_norm": 1.3279706239700317, + "learning_rate": 3.751591607748891e-05, + "loss": 0.5195, + "step": 11880 + }, + { + "epoch": 1.6687719298245614, + "grad_norm": 2.280778646469116, + "learning_rate": 3.749681597325357e-05, + "loss": 0.6116, + "step": 11890 + }, + { + "epoch": 1.670175438596491, + "grad_norm": 1.8069521188735962, + "learning_rate": 3.7477706139818683e-05, + "loss": 0.5038, + "step": 11900 + }, + { + "epoch": 1.671578947368421, + "grad_norm": 1.8922659158706665, + "learning_rate": 3.745858659206188e-05, + "loss": 0.5671, + "step": 11910 + }, + { + "epoch": 1.672982456140351, + "grad_norm": 1.6917041540145874, + "learning_rate": 3.743945734486841e-05, + "loss": 0.5559, + "step": 11920 + }, + { + "epoch": 1.6743859649122808, + "grad_norm": 1.2357120513916016, + "learning_rate": 3.742031841313103e-05, + "loss": 0.5069, + "step": 11930 + }, + { + "epoch": 1.6757894736842105, + "grad_norm": 1.5987924337387085, + "learning_rate": 3.7401169811750066e-05, + "loss": 0.5431, + "step": 11940 + }, + { + "epoch": 1.6771929824561402, + "grad_norm": 1.3475733995437622, + "learning_rate": 3.7382011555633365e-05, + "loss": 0.5636, + "step": 11950 + }, + { + "epoch": 1.6785964912280702, + "grad_norm": 1.4985164403915405, + "learning_rate": 3.736284365969627e-05, + "loss": 0.4871, + "step": 11960 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.2310782670974731, + "learning_rate": 3.7343666138861646e-05, + "loss": 0.6245, + "step": 11970 + }, + { + "epoch": 1.68140350877193, + "grad_norm": 1.7251943349838257, + "learning_rate": 3.7324479008059865e-05, + "loss": 0.5126, + "step": 11980 + }, + { + "epoch": 1.6828070175438596, + "grad_norm": 2.389265775680542, + "learning_rate": 3.7305282282228756e-05, + "loss": 0.5669, + "step": 11990 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 2.130988359451294, + "learning_rate": 3.728607597631363e-05, + "loss": 0.4796, + "step": 12000 + }, + { + "epoch": 1.6842105263157894, + "eval_loss": 0.6500447392463684, + "eval_runtime": 119.5925, + "eval_samples_per_second": 12.543, + "eval_steps_per_second": 3.136, + "step": 12000 + }, + { + "epoch": 1.6842105263157894, + "step": 12000, + "total_flos": 6.933368738955264e+17, + "train_loss": 0.6038338423768679, + "train_runtime": 5861.7175, + "train_samples_per_second": 24.31, + "train_steps_per_second": 6.078 + } + ], + "logging_steps": 10, + "max_steps": 35625, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "total_flos": 6.933368738955264e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}