diff --git "a/llama2_13b_lora_f/trainer_state.json" "b/llama2_13b_lora_f/trainer_state.json" new file mode 100644--- /dev/null +++ "b/llama2_13b_lora_f/trainer_state.json" @@ -0,0 +1,87069 @@ +{ + "best_metric": 0.7689330577850342, + "best_model_checkpoint": "ckpt/llama2_13b_other/fuze_28_no_sys/checkpoint-112000", + "epoch": 1.99040113003419, + "eval_steps": 4000, + "global_step": 124000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001605162201640476, + "grad_norm": 0.43448686599731445, + "learning_rate": 2.5e-05, + "loss": 1.7129, + "step": 10 + }, + { + "epoch": 0.0003210324403280952, + "grad_norm": 0.9125259518623352, + "learning_rate": 5e-05, + "loss": 1.4324, + "step": 20 + }, + { + "epoch": 0.00048154866049214274, + "grad_norm": 0.9287012815475464, + "learning_rate": 4.9999999205073364e-05, + "loss": 1.327, + "step": 30 + }, + { + "epoch": 0.0006420648806561904, + "grad_norm": 0.9862312078475952, + "learning_rate": 4.9999996820293525e-05, + "loss": 1.1329, + "step": 40 + }, + { + "epoch": 0.0008025811008202379, + "grad_norm": 1.334333062171936, + "learning_rate": 4.999999284566061e-05, + "loss": 1.0304, + "step": 50 + }, + { + "epoch": 0.0009630973209842855, + "grad_norm": 0.7722494602203369, + "learning_rate": 4.99999872811749e-05, + "loss": 1.0304, + "step": 60 + }, + { + "epoch": 0.001123613541148333, + "grad_norm": 1.1093409061431885, + "learning_rate": 4.999998012683672e-05, + "loss": 1.102, + "step": 70 + }, + { + "epoch": 0.0012841297613123807, + "grad_norm": 1.1147723197937012, + "learning_rate": 4.999997138264654e-05, + "loss": 0.8901, + "step": 80 + }, + { + "epoch": 0.0014446459814764282, + "grad_norm": 1.0770961046218872, + "learning_rate": 4.999996104860492e-05, + "loss": 0.8823, + "step": 90 + }, + { + "epoch": 0.0016051622016404758, + "grad_norm": 2.0371105670928955, + "learning_rate": 4.9999949124712506e-05, + "loss": 0.9119, + "step": 100 + }, + { + "epoch": 0.0017656784218045233, + "grad_norm": 0.6819159984588623, + "learning_rate": 4.9999935610970074e-05, + "loss": 0.9844, + "step": 110 + }, + { + "epoch": 0.001926194641968571, + "grad_norm": 0.7769989967346191, + "learning_rate": 4.9999920507378465e-05, + "loss": 0.986, + "step": 120 + }, + { + "epoch": 0.0020867108621326186, + "grad_norm": 1.5317758321762085, + "learning_rate": 4.999990381393864e-05, + "loss": 0.9559, + "step": 130 + }, + { + "epoch": 0.002247227082296666, + "grad_norm": 1.2397572994232178, + "learning_rate": 4.999988553065168e-05, + "loss": 0.9003, + "step": 140 + }, + { + "epoch": 0.0024077433024607135, + "grad_norm": 1.1749111413955688, + "learning_rate": 4.999986565751873e-05, + "loss": 1.1234, + "step": 150 + }, + { + "epoch": 0.0025682595226247614, + "grad_norm": 0.5587156414985657, + "learning_rate": 4.9999844194541054e-05, + "loss": 1.0414, + "step": 160 + }, + { + "epoch": 0.002728775742788809, + "grad_norm": 0.45693814754486084, + "learning_rate": 4.999982114172003e-05, + "loss": 0.9258, + "step": 170 + }, + { + "epoch": 0.0028892919629528563, + "grad_norm": 0.8136128187179565, + "learning_rate": 4.9999796499057106e-05, + "loss": 0.6897, + "step": 180 + }, + { + "epoch": 0.003049808183116904, + "grad_norm": 1.7789788246154785, + "learning_rate": 4.9999770266553855e-05, + "loss": 0.808, + "step": 190 + }, + { + "epoch": 0.0032103244032809517, + "grad_norm": 1.9896527528762817, + "learning_rate": 4.999974244421196e-05, + "loss": 0.8459, + "step": 200 + }, + { + "epoch": 0.003370840623444999, + "grad_norm": 1.13595712184906, + "learning_rate": 4.999971303203318e-05, + "loss": 0.9714, + "step": 210 + }, + { + "epoch": 0.0035313568436090466, + "grad_norm": 0.609179675579071, + "learning_rate": 4.9999682030019375e-05, + "loss": 0.7962, + "step": 220 + }, + { + "epoch": 0.0036918730637730945, + "grad_norm": 0.6379231214523315, + "learning_rate": 4.999964943817253e-05, + "loss": 0.8778, + "step": 230 + }, + { + "epoch": 0.003852389283937142, + "grad_norm": 0.815813422203064, + "learning_rate": 4.999961525649471e-05, + "loss": 0.8921, + "step": 240 + }, + { + "epoch": 0.004012905504101189, + "grad_norm": 0.9749583601951599, + "learning_rate": 4.9999579484988105e-05, + "loss": 0.9455, + "step": 250 + }, + { + "epoch": 0.004173421724265237, + "grad_norm": 1.185186743736267, + "learning_rate": 4.999954212365496e-05, + "loss": 0.9034, + "step": 260 + }, + { + "epoch": 0.004333937944429284, + "grad_norm": 1.3062928915023804, + "learning_rate": 4.9999503172497687e-05, + "loss": 0.7804, + "step": 270 + }, + { + "epoch": 0.004494454164593332, + "grad_norm": 0.870568573474884, + "learning_rate": 4.999946263151873e-05, + "loss": 0.8868, + "step": 280 + }, + { + "epoch": 0.00465497038475738, + "grad_norm": 0.44410598278045654, + "learning_rate": 4.999942050072068e-05, + "loss": 0.9318, + "step": 290 + }, + { + "epoch": 0.004815486604921427, + "grad_norm": 0.8807160258293152, + "learning_rate": 4.9999376780106233e-05, + "loss": 0.8933, + "step": 300 + }, + { + "epoch": 0.004976002825085475, + "grad_norm": 1.9746800661087036, + "learning_rate": 4.999933146967815e-05, + "loss": 0.9546, + "step": 310 + }, + { + "epoch": 0.005136519045249523, + "grad_norm": 0.6999930739402771, + "learning_rate": 4.999928456943931e-05, + "loss": 0.9294, + "step": 320 + }, + { + "epoch": 0.00529703526541357, + "grad_norm": 1.0425453186035156, + "learning_rate": 4.99992360793927e-05, + "loss": 1.0668, + "step": 330 + }, + { + "epoch": 0.005457551485577618, + "grad_norm": 1.026545763015747, + "learning_rate": 4.999918599954141e-05, + "loss": 0.9147, + "step": 340 + }, + { + "epoch": 0.005618067705741665, + "grad_norm": 0.6989656686782837, + "learning_rate": 4.999913432988862e-05, + "loss": 0.8674, + "step": 350 + }, + { + "epoch": 0.005778583925905713, + "grad_norm": 0.8753497004508972, + "learning_rate": 4.999908107043763e-05, + "loss": 0.8714, + "step": 360 + }, + { + "epoch": 0.0059391001460697606, + "grad_norm": 1.9304440021514893, + "learning_rate": 4.999902622119179e-05, + "loss": 0.9147, + "step": 370 + }, + { + "epoch": 0.006099616366233808, + "grad_norm": 0.6921795606613159, + "learning_rate": 4.9998969782154623e-05, + "loss": 0.9115, + "step": 380 + }, + { + "epoch": 0.0062601325863978555, + "grad_norm": 0.6538827419281006, + "learning_rate": 4.9998911753329714e-05, + "loss": 0.9487, + "step": 390 + }, + { + "epoch": 0.006420648806561903, + "grad_norm": 0.6469338536262512, + "learning_rate": 4.9998852134720735e-05, + "loss": 0.8542, + "step": 400 + }, + { + "epoch": 0.00658116502672595, + "grad_norm": 0.6401376128196716, + "learning_rate": 4.999879092633149e-05, + "loss": 0.8708, + "step": 410 + }, + { + "epoch": 0.006741681246889998, + "grad_norm": 0.5514658093452454, + "learning_rate": 4.999872812816588e-05, + "loss": 0.8776, + "step": 420 + }, + { + "epoch": 0.006902197467054046, + "grad_norm": 0.9358686208724976, + "learning_rate": 4.999866374022788e-05, + "loss": 0.8494, + "step": 430 + }, + { + "epoch": 0.007062713687218093, + "grad_norm": 0.5223158001899719, + "learning_rate": 4.9998597762521595e-05, + "loss": 0.9192, + "step": 440 + }, + { + "epoch": 0.007223229907382141, + "grad_norm": 0.6298983097076416, + "learning_rate": 4.999853019505122e-05, + "loss": 0.7895, + "step": 450 + }, + { + "epoch": 0.007383746127546189, + "grad_norm": 0.43558037281036377, + "learning_rate": 4.999846103782106e-05, + "loss": 0.8033, + "step": 460 + }, + { + "epoch": 0.007544262347710236, + "grad_norm": 1.4178178310394287, + "learning_rate": 4.999839029083548e-05, + "loss": 0.9224, + "step": 470 + }, + { + "epoch": 0.007704778567874284, + "grad_norm": 0.8426011800765991, + "learning_rate": 4.999831795409903e-05, + "loss": 0.8506, + "step": 480 + }, + { + "epoch": 0.007865294788038331, + "grad_norm": 0.7928465604782104, + "learning_rate": 4.999824402761626e-05, + "loss": 0.9702, + "step": 490 + }, + { + "epoch": 0.008025811008202379, + "grad_norm": 0.4446624517440796, + "learning_rate": 4.9998168511391904e-05, + "loss": 0.8899, + "step": 500 + }, + { + "epoch": 0.008186327228366427, + "grad_norm": 0.9177296757698059, + "learning_rate": 4.999809140543076e-05, + "loss": 0.8091, + "step": 510 + }, + { + "epoch": 0.008346843448530475, + "grad_norm": 0.5141400694847107, + "learning_rate": 4.999801270973773e-05, + "loss": 1.0028, + "step": 520 + }, + { + "epoch": 0.008507359668694522, + "grad_norm": 0.8639324307441711, + "learning_rate": 4.9997932424317806e-05, + "loss": 0.8406, + "step": 530 + }, + { + "epoch": 0.008667875888858569, + "grad_norm": 0.6465668082237244, + "learning_rate": 4.999785054917611e-05, + "loss": 0.6853, + "step": 540 + }, + { + "epoch": 0.008828392109022616, + "grad_norm": 0.8927873373031616, + "learning_rate": 4.9997767084317834e-05, + "loss": 0.7856, + "step": 550 + }, + { + "epoch": 0.008988908329186664, + "grad_norm": 0.8533749580383301, + "learning_rate": 4.999768202974829e-05, + "loss": 0.9435, + "step": 560 + }, + { + "epoch": 0.009149424549350712, + "grad_norm": 0.7166557908058167, + "learning_rate": 4.99975953854729e-05, + "loss": 0.9626, + "step": 570 + }, + { + "epoch": 0.00930994076951476, + "grad_norm": 0.97231125831604, + "learning_rate": 4.999750715149716e-05, + "loss": 0.8761, + "step": 580 + }, + { + "epoch": 0.009470456989678806, + "grad_norm": 0.502642810344696, + "learning_rate": 4.999741732782669e-05, + "loss": 0.8523, + "step": 590 + }, + { + "epoch": 0.009630973209842854, + "grad_norm": 0.6897047162055969, + "learning_rate": 4.99973259144672e-05, + "loss": 0.7906, + "step": 600 + }, + { + "epoch": 0.009791489430006902, + "grad_norm": 0.9354003071784973, + "learning_rate": 4.999723291142449e-05, + "loss": 0.8749, + "step": 610 + }, + { + "epoch": 0.00995200565017095, + "grad_norm": 0.5548325777053833, + "learning_rate": 4.9997138318704495e-05, + "loss": 0.8879, + "step": 620 + }, + { + "epoch": 0.010112521870334998, + "grad_norm": 0.7876558303833008, + "learning_rate": 4.999704213631321e-05, + "loss": 0.8767, + "step": 630 + }, + { + "epoch": 0.010273038090499046, + "grad_norm": 1.206376075744629, + "learning_rate": 4.999694436425678e-05, + "loss": 1.0581, + "step": 640 + }, + { + "epoch": 0.010433554310663092, + "grad_norm": 0.742038905620575, + "learning_rate": 4.9996845002541396e-05, + "loss": 0.829, + "step": 650 + }, + { + "epoch": 0.01059407053082714, + "grad_norm": 1.194812297821045, + "learning_rate": 4.999674405117338e-05, + "loss": 0.825, + "step": 660 + }, + { + "epoch": 0.010754586750991188, + "grad_norm": 1.003086805343628, + "learning_rate": 4.999664151015917e-05, + "loss": 1.0611, + "step": 670 + }, + { + "epoch": 0.010915102971155236, + "grad_norm": 0.6933590769767761, + "learning_rate": 4.999653737950527e-05, + "loss": 0.9274, + "step": 680 + }, + { + "epoch": 0.011075619191319283, + "grad_norm": 1.1371748447418213, + "learning_rate": 4.9996431659218305e-05, + "loss": 0.9283, + "step": 690 + }, + { + "epoch": 0.01123613541148333, + "grad_norm": 0.7525023221969604, + "learning_rate": 4.9996324349305e-05, + "loss": 0.8247, + "step": 700 + }, + { + "epoch": 0.011396651631647377, + "grad_norm": 1.0038493871688843, + "learning_rate": 4.9996215449772186e-05, + "loss": 0.8555, + "step": 710 + }, + { + "epoch": 0.011557167851811425, + "grad_norm": 1.0783543586730957, + "learning_rate": 4.9996104960626786e-05, + "loss": 0.9424, + "step": 720 + }, + { + "epoch": 0.011717684071975473, + "grad_norm": 1.1855032444000244, + "learning_rate": 4.999599288187581e-05, + "loss": 0.7228, + "step": 730 + }, + { + "epoch": 0.011878200292139521, + "grad_norm": 0.5774499773979187, + "learning_rate": 4.999587921352641e-05, + "loss": 0.956, + "step": 740 + }, + { + "epoch": 0.012038716512303569, + "grad_norm": 0.6870006918907166, + "learning_rate": 4.9995763955585795e-05, + "loss": 0.9294, + "step": 750 + }, + { + "epoch": 0.012199232732467615, + "grad_norm": 0.5765193104743958, + "learning_rate": 4.999564710806131e-05, + "loss": 0.7541, + "step": 760 + }, + { + "epoch": 0.012359748952631663, + "grad_norm": 0.539435088634491, + "learning_rate": 4.999552867096037e-05, + "loss": 0.7589, + "step": 770 + }, + { + "epoch": 0.012520265172795711, + "grad_norm": 0.730887234210968, + "learning_rate": 4.999540864429052e-05, + "loss": 0.8621, + "step": 780 + }, + { + "epoch": 0.012680781392959759, + "grad_norm": 0.6165319681167603, + "learning_rate": 4.999528702805939e-05, + "loss": 0.9571, + "step": 790 + }, + { + "epoch": 0.012841297613123807, + "grad_norm": 0.8952537775039673, + "learning_rate": 4.999516382227471e-05, + "loss": 0.801, + "step": 800 + }, + { + "epoch": 0.013001813833287855, + "grad_norm": 0.714994490146637, + "learning_rate": 4.999503902694431e-05, + "loss": 0.9462, + "step": 810 + }, + { + "epoch": 0.0131623300534519, + "grad_norm": 0.7236514091491699, + "learning_rate": 4.999491264207614e-05, + "loss": 0.9448, + "step": 820 + }, + { + "epoch": 0.013322846273615949, + "grad_norm": 0.7459760904312134, + "learning_rate": 4.999478466767824e-05, + "loss": 0.9468, + "step": 830 + }, + { + "epoch": 0.013483362493779997, + "grad_norm": 0.7269554138183594, + "learning_rate": 4.999465510375873e-05, + "loss": 0.9121, + "step": 840 + }, + { + "epoch": 0.013643878713944044, + "grad_norm": 0.5984081625938416, + "learning_rate": 4.9994523950325864e-05, + "loss": 0.7792, + "step": 850 + }, + { + "epoch": 0.013804394934108092, + "grad_norm": 0.6167006492614746, + "learning_rate": 4.999439120738797e-05, + "loss": 0.9401, + "step": 860 + }, + { + "epoch": 0.013964911154272138, + "grad_norm": 0.9107213020324707, + "learning_rate": 4.999425687495351e-05, + "loss": 1.0068, + "step": 870 + }, + { + "epoch": 0.014125427374436186, + "grad_norm": 1.2308069467544556, + "learning_rate": 4.999412095303101e-05, + "loss": 0.858, + "step": 880 + }, + { + "epoch": 0.014285943594600234, + "grad_norm": 0.7824222445487976, + "learning_rate": 4.999398344162911e-05, + "loss": 0.8337, + "step": 890 + }, + { + "epoch": 0.014446459814764282, + "grad_norm": 0.5971059799194336, + "learning_rate": 4.9993844340756567e-05, + "loss": 0.7371, + "step": 900 + }, + { + "epoch": 0.01460697603492833, + "grad_norm": 1.0220975875854492, + "learning_rate": 4.999370365042222e-05, + "loss": 0.8325, + "step": 910 + }, + { + "epoch": 0.014767492255092378, + "grad_norm": 0.6975373029708862, + "learning_rate": 4.999356137063502e-05, + "loss": 0.82, + "step": 920 + }, + { + "epoch": 0.014928008475256424, + "grad_norm": 0.7993868589401245, + "learning_rate": 4.999341750140402e-05, + "loss": 0.8224, + "step": 930 + }, + { + "epoch": 0.015088524695420472, + "grad_norm": 0.8275920152664185, + "learning_rate": 4.999327204273836e-05, + "loss": 0.9513, + "step": 940 + }, + { + "epoch": 0.01524904091558452, + "grad_norm": 0.9830689430236816, + "learning_rate": 4.99931249946473e-05, + "loss": 0.8016, + "step": 950 + }, + { + "epoch": 0.015409557135748568, + "grad_norm": 0.720641016960144, + "learning_rate": 4.9992976357140175e-05, + "loss": 1.0143, + "step": 960 + }, + { + "epoch": 0.015570073355912616, + "grad_norm": 0.6260520815849304, + "learning_rate": 4.9992826130226455e-05, + "loss": 0.7746, + "step": 970 + }, + { + "epoch": 0.015730589576076662, + "grad_norm": 0.6191791296005249, + "learning_rate": 4.999267431391568e-05, + "loss": 0.8957, + "step": 980 + }, + { + "epoch": 0.01589110579624071, + "grad_norm": 0.984703540802002, + "learning_rate": 4.999252090821751e-05, + "loss": 0.7049, + "step": 990 + }, + { + "epoch": 0.016051622016404758, + "grad_norm": 0.7037981152534485, + "learning_rate": 4.999236591314171e-05, + "loss": 0.7726, + "step": 1000 + }, + { + "epoch": 0.016212138236568804, + "grad_norm": 1.937394380569458, + "learning_rate": 4.999220932869812e-05, + "loss": 0.9138, + "step": 1010 + }, + { + "epoch": 0.016372654456732853, + "grad_norm": 1.3566086292266846, + "learning_rate": 4.999205115489671e-05, + "loss": 0.9289, + "step": 1020 + }, + { + "epoch": 0.0165331706768969, + "grad_norm": 0.6113929748535156, + "learning_rate": 4.9991891391747534e-05, + "loss": 0.8654, + "step": 1030 + }, + { + "epoch": 0.01669368689706095, + "grad_norm": 0.8540635704994202, + "learning_rate": 4.9991730039260756e-05, + "loss": 0.8875, + "step": 1040 + }, + { + "epoch": 0.016854203117224995, + "grad_norm": 0.541519045829773, + "learning_rate": 4.999156709744663e-05, + "loss": 0.915, + "step": 1050 + }, + { + "epoch": 0.017014719337389045, + "grad_norm": 0.7322198748588562, + "learning_rate": 4.999140256631552e-05, + "loss": 0.7674, + "step": 1060 + }, + { + "epoch": 0.01717523555755309, + "grad_norm": 0.7831448912620544, + "learning_rate": 4.99912364458779e-05, + "loss": 0.749, + "step": 1070 + }, + { + "epoch": 0.017335751777717137, + "grad_norm": 0.7108404636383057, + "learning_rate": 4.999106873614432e-05, + "loss": 0.9087, + "step": 1080 + }, + { + "epoch": 0.017496267997881187, + "grad_norm": 0.9266623854637146, + "learning_rate": 4.999089943712545e-05, + "loss": 0.7866, + "step": 1090 + }, + { + "epoch": 0.017656784218045233, + "grad_norm": 0.7533081769943237, + "learning_rate": 4.9990728548832056e-05, + "loss": 0.8282, + "step": 1100 + }, + { + "epoch": 0.017817300438209283, + "grad_norm": 1.2230699062347412, + "learning_rate": 4.9990556071275005e-05, + "loss": 0.8275, + "step": 1110 + }, + { + "epoch": 0.01797781665837333, + "grad_norm": 0.7302761077880859, + "learning_rate": 4.999038200446528e-05, + "loss": 0.8315, + "step": 1120 + }, + { + "epoch": 0.018138332878537375, + "grad_norm": 0.9341862201690674, + "learning_rate": 4.999020634841393e-05, + "loss": 0.9159, + "step": 1130 + }, + { + "epoch": 0.018298849098701424, + "grad_norm": 0.5557223558425903, + "learning_rate": 4.9990029103132136e-05, + "loss": 0.8997, + "step": 1140 + }, + { + "epoch": 0.01845936531886547, + "grad_norm": 0.7339600920677185, + "learning_rate": 4.998985026863116e-05, + "loss": 1.0367, + "step": 1150 + }, + { + "epoch": 0.01861988153902952, + "grad_norm": 0.7402278780937195, + "learning_rate": 4.9989669844922394e-05, + "loss": 0.9376, + "step": 1160 + }, + { + "epoch": 0.018780397759193566, + "grad_norm": 0.7352900505065918, + "learning_rate": 4.9989487832017293e-05, + "loss": 0.7813, + "step": 1170 + }, + { + "epoch": 0.018940913979357613, + "grad_norm": 0.6954396963119507, + "learning_rate": 4.998930422992745e-05, + "loss": 0.8534, + "step": 1180 + }, + { + "epoch": 0.019101430199521662, + "grad_norm": 0.9799904227256775, + "learning_rate": 4.998911903866451e-05, + "loss": 0.8083, + "step": 1190 + }, + { + "epoch": 0.01926194641968571, + "grad_norm": 0.8654493093490601, + "learning_rate": 4.998893225824029e-05, + "loss": 0.7894, + "step": 1200 + }, + { + "epoch": 0.019422462639849758, + "grad_norm": 0.6636609435081482, + "learning_rate": 4.998874388866663e-05, + "loss": 0.9344, + "step": 1210 + }, + { + "epoch": 0.019582978860013804, + "grad_norm": 0.5866648554801941, + "learning_rate": 4.9988553929955544e-05, + "loss": 0.9067, + "step": 1220 + }, + { + "epoch": 0.01974349508017785, + "grad_norm": 0.6995806097984314, + "learning_rate": 4.9988362382119094e-05, + "loss": 0.9644, + "step": 1230 + }, + { + "epoch": 0.0199040113003419, + "grad_norm": 0.9814038276672363, + "learning_rate": 4.998816924516946e-05, + "loss": 0.8865, + "step": 1240 + }, + { + "epoch": 0.020064527520505946, + "grad_norm": 0.5265271663665771, + "learning_rate": 4.9987974519118926e-05, + "loss": 0.8497, + "step": 1250 + }, + { + "epoch": 0.020225043740669996, + "grad_norm": 0.9725403785705566, + "learning_rate": 4.9987778203979875e-05, + "loss": 0.8603, + "step": 1260 + }, + { + "epoch": 0.020385559960834042, + "grad_norm": 0.5272873044013977, + "learning_rate": 4.9987580299764805e-05, + "loss": 0.9975, + "step": 1270 + }, + { + "epoch": 0.02054607618099809, + "grad_norm": 0.6069408655166626, + "learning_rate": 4.998738080648627e-05, + "loss": 0.9446, + "step": 1280 + }, + { + "epoch": 0.020706592401162138, + "grad_norm": 1.4176936149597168, + "learning_rate": 4.9987179724157e-05, + "loss": 0.8774, + "step": 1290 + }, + { + "epoch": 0.020867108621326184, + "grad_norm": 0.5864118337631226, + "learning_rate": 4.998697705278974e-05, + "loss": 0.8362, + "step": 1300 + }, + { + "epoch": 0.021027624841490233, + "grad_norm": 0.9885786771774292, + "learning_rate": 4.998677279239741e-05, + "loss": 0.8917, + "step": 1310 + }, + { + "epoch": 0.02118814106165428, + "grad_norm": 0.7973999381065369, + "learning_rate": 4.9986566942992985e-05, + "loss": 0.9041, + "step": 1320 + }, + { + "epoch": 0.02134865728181833, + "grad_norm": 1.0723477602005005, + "learning_rate": 4.9986359504589555e-05, + "loss": 0.8662, + "step": 1330 + }, + { + "epoch": 0.021509173501982375, + "grad_norm": 0.8739461898803711, + "learning_rate": 4.998615047720032e-05, + "loss": 0.8742, + "step": 1340 + }, + { + "epoch": 0.02166968972214642, + "grad_norm": 1.138653039932251, + "learning_rate": 4.9985939860838566e-05, + "loss": 0.7908, + "step": 1350 + }, + { + "epoch": 0.02183020594231047, + "grad_norm": 1.0549447536468506, + "learning_rate": 4.998572765551769e-05, + "loss": 0.9147, + "step": 1360 + }, + { + "epoch": 0.021990722162474517, + "grad_norm": 0.6096875071525574, + "learning_rate": 4.998551386125119e-05, + "loss": 0.878, + "step": 1370 + }, + { + "epoch": 0.022151238382638567, + "grad_norm": 0.715433657169342, + "learning_rate": 4.9985298478052655e-05, + "loss": 0.7844, + "step": 1380 + }, + { + "epoch": 0.022311754602802613, + "grad_norm": 0.5785250663757324, + "learning_rate": 4.998508150593578e-05, + "loss": 0.8046, + "step": 1390 + }, + { + "epoch": 0.02247227082296666, + "grad_norm": 0.5342602133750916, + "learning_rate": 4.998486294491438e-05, + "loss": 0.8562, + "step": 1400 + }, + { + "epoch": 0.02263278704313071, + "grad_norm": 0.7622414827346802, + "learning_rate": 4.998464279500233e-05, + "loss": 0.8883, + "step": 1410 + }, + { + "epoch": 0.022793303263294755, + "grad_norm": 0.7113037705421448, + "learning_rate": 4.998442105621366e-05, + "loss": 0.9299, + "step": 1420 + }, + { + "epoch": 0.022953819483458805, + "grad_norm": 0.7185383439064026, + "learning_rate": 4.998419772856244e-05, + "loss": 0.7962, + "step": 1430 + }, + { + "epoch": 0.02311433570362285, + "grad_norm": 0.7509153485298157, + "learning_rate": 4.99839728120629e-05, + "loss": 0.8607, + "step": 1440 + }, + { + "epoch": 0.0232748519237869, + "grad_norm": 0.6829584836959839, + "learning_rate": 4.998374630672932e-05, + "loss": 0.7685, + "step": 1450 + }, + { + "epoch": 0.023435368143950946, + "grad_norm": 0.6476813554763794, + "learning_rate": 4.9983518212576114e-05, + "loss": 0.8757, + "step": 1460 + }, + { + "epoch": 0.023595884364114993, + "grad_norm": 0.8469405174255371, + "learning_rate": 4.9983288529617786e-05, + "loss": 0.9059, + "step": 1470 + }, + { + "epoch": 0.023756400584279042, + "grad_norm": 0.6791167259216309, + "learning_rate": 4.998305725786896e-05, + "loss": 0.772, + "step": 1480 + }, + { + "epoch": 0.02391691680444309, + "grad_norm": 0.7000095248222351, + "learning_rate": 4.9982824397344305e-05, + "loss": 0.832, + "step": 1490 + }, + { + "epoch": 0.024077433024607138, + "grad_norm": 0.9232967495918274, + "learning_rate": 4.998258994805867e-05, + "loss": 0.8483, + "step": 1500 + }, + { + "epoch": 0.024237949244771184, + "grad_norm": 0.6151039600372314, + "learning_rate": 4.9982353910026946e-05, + "loss": 0.8375, + "step": 1510 + }, + { + "epoch": 0.02439846546493523, + "grad_norm": 0.7132372856140137, + "learning_rate": 4.998211628326414e-05, + "loss": 0.8732, + "step": 1520 + }, + { + "epoch": 0.02455898168509928, + "grad_norm": 0.5700934529304504, + "learning_rate": 4.998187706778537e-05, + "loss": 0.8976, + "step": 1530 + }, + { + "epoch": 0.024719497905263326, + "grad_norm": 0.8723342418670654, + "learning_rate": 4.998163626360585e-05, + "loss": 0.8034, + "step": 1540 + }, + { + "epoch": 0.024880014125427376, + "grad_norm": 0.7985078692436218, + "learning_rate": 4.998139387074088e-05, + "loss": 0.8151, + "step": 1550 + }, + { + "epoch": 0.025040530345591422, + "grad_norm": 0.5757223963737488, + "learning_rate": 4.99811498892059e-05, + "loss": 0.8728, + "step": 1560 + }, + { + "epoch": 0.025201046565755468, + "grad_norm": 1.3348075151443481, + "learning_rate": 4.9980904319016405e-05, + "loss": 0.8185, + "step": 1570 + }, + { + "epoch": 0.025361562785919518, + "grad_norm": 0.8669511079788208, + "learning_rate": 4.9980657160188015e-05, + "loss": 0.8652, + "step": 1580 + }, + { + "epoch": 0.025522079006083564, + "grad_norm": 0.5382086038589478, + "learning_rate": 4.998040841273646e-05, + "loss": 0.8643, + "step": 1590 + }, + { + "epoch": 0.025682595226247613, + "grad_norm": 0.747174859046936, + "learning_rate": 4.998015807667754e-05, + "loss": 0.7918, + "step": 1600 + }, + { + "epoch": 0.02584311144641166, + "grad_norm": 2.514329195022583, + "learning_rate": 4.997990615202719e-05, + "loss": 0.8165, + "step": 1610 + }, + { + "epoch": 0.02600362766657571, + "grad_norm": 0.5090530514717102, + "learning_rate": 4.9979652638801424e-05, + "loss": 0.7317, + "step": 1620 + }, + { + "epoch": 0.026164143886739755, + "grad_norm": 0.7556232213973999, + "learning_rate": 4.997939753701637e-05, + "loss": 0.9333, + "step": 1630 + }, + { + "epoch": 0.0263246601069038, + "grad_norm": 0.7190501093864441, + "learning_rate": 4.9979140846688246e-05, + "loss": 0.9107, + "step": 1640 + }, + { + "epoch": 0.02648517632706785, + "grad_norm": 0.5784563422203064, + "learning_rate": 4.997888256783337e-05, + "loss": 0.8589, + "step": 1650 + }, + { + "epoch": 0.026645692547231897, + "grad_norm": 0.5888465046882629, + "learning_rate": 4.997862270046818e-05, + "loss": 0.8205, + "step": 1660 + }, + { + "epoch": 0.026806208767395947, + "grad_norm": 0.9862402677536011, + "learning_rate": 4.997836124460919e-05, + "loss": 0.7859, + "step": 1670 + }, + { + "epoch": 0.026966724987559993, + "grad_norm": 0.6584517955780029, + "learning_rate": 4.997809820027305e-05, + "loss": 0.9107, + "step": 1680 + }, + { + "epoch": 0.02712724120772404, + "grad_norm": 0.5824911594390869, + "learning_rate": 4.997783356747645e-05, + "loss": 0.871, + "step": 1690 + }, + { + "epoch": 0.02728775742788809, + "grad_norm": 0.5213330388069153, + "learning_rate": 4.9977567346236254e-05, + "loss": 0.7931, + "step": 1700 + }, + { + "epoch": 0.027448273648052135, + "grad_norm": 0.5199209451675415, + "learning_rate": 4.997729953656937e-05, + "loss": 0.8166, + "step": 1710 + }, + { + "epoch": 0.027608789868216185, + "grad_norm": 0.9189457893371582, + "learning_rate": 4.997703013849284e-05, + "loss": 0.8479, + "step": 1720 + }, + { + "epoch": 0.02776930608838023, + "grad_norm": 0.42576467990875244, + "learning_rate": 4.99767591520238e-05, + "loss": 0.9045, + "step": 1730 + }, + { + "epoch": 0.027929822308544277, + "grad_norm": 0.3570556342601776, + "learning_rate": 4.997648657717947e-05, + "loss": 0.8075, + "step": 1740 + }, + { + "epoch": 0.028090338528708327, + "grad_norm": 0.7743598222732544, + "learning_rate": 4.9976212413977186e-05, + "loss": 0.8606, + "step": 1750 + }, + { + "epoch": 0.028250854748872373, + "grad_norm": 0.8254098892211914, + "learning_rate": 4.99759366624344e-05, + "loss": 0.8341, + "step": 1760 + }, + { + "epoch": 0.028411370969036422, + "grad_norm": 0.7195034027099609, + "learning_rate": 4.997565932256862e-05, + "loss": 0.881, + "step": 1770 + }, + { + "epoch": 0.02857188718920047, + "grad_norm": 0.7071020603179932, + "learning_rate": 4.9975380394397515e-05, + "loss": 0.8353, + "step": 1780 + }, + { + "epoch": 0.028732403409364515, + "grad_norm": 0.6236571669578552, + "learning_rate": 4.99750998779388e-05, + "loss": 0.8678, + "step": 1790 + }, + { + "epoch": 0.028892919629528564, + "grad_norm": 0.9052008986473083, + "learning_rate": 4.997481777321033e-05, + "loss": 0.8655, + "step": 1800 + }, + { + "epoch": 0.02905343584969261, + "grad_norm": 0.5774853229522705, + "learning_rate": 4.9974534080230026e-05, + "loss": 0.8115, + "step": 1810 + }, + { + "epoch": 0.02921395206985666, + "grad_norm": 0.7047586441040039, + "learning_rate": 4.997424879901594e-05, + "loss": 0.9133, + "step": 1820 + }, + { + "epoch": 0.029374468290020706, + "grad_norm": 0.6954417824745178, + "learning_rate": 4.997396192958622e-05, + "loss": 0.8808, + "step": 1830 + }, + { + "epoch": 0.029534984510184756, + "grad_norm": 0.6610208749771118, + "learning_rate": 4.9973673471959106e-05, + "loss": 0.7705, + "step": 1840 + }, + { + "epoch": 0.029695500730348802, + "grad_norm": 0.7739754319190979, + "learning_rate": 4.997338342615293e-05, + "loss": 0.7407, + "step": 1850 + }, + { + "epoch": 0.029856016950512848, + "grad_norm": 1.4850894212722778, + "learning_rate": 4.9973091792186154e-05, + "loss": 0.9753, + "step": 1860 + }, + { + "epoch": 0.030016533170676898, + "grad_norm": 0.9567078948020935, + "learning_rate": 4.9972798570077316e-05, + "loss": 0.8572, + "step": 1870 + }, + { + "epoch": 0.030177049390840944, + "grad_norm": 0.5358684659004211, + "learning_rate": 4.9972503759845066e-05, + "loss": 0.7801, + "step": 1880 + }, + { + "epoch": 0.030337565611004993, + "grad_norm": 1.079375982284546, + "learning_rate": 4.997220736150815e-05, + "loss": 0.8708, + "step": 1890 + }, + { + "epoch": 0.03049808183116904, + "grad_norm": 0.43019306659698486, + "learning_rate": 4.997190937508541e-05, + "loss": 0.9848, + "step": 1900 + }, + { + "epoch": 0.030658598051333086, + "grad_norm": 0.5155695676803589, + "learning_rate": 4.997160980059582e-05, + "loss": 0.9397, + "step": 1910 + }, + { + "epoch": 0.030819114271497135, + "grad_norm": 0.5319948792457581, + "learning_rate": 4.99713086380584e-05, + "loss": 0.8549, + "step": 1920 + }, + { + "epoch": 0.03097963049166118, + "grad_norm": 0.9198108911514282, + "learning_rate": 4.997100588749233e-05, + "loss": 0.8666, + "step": 1930 + }, + { + "epoch": 0.03114014671182523, + "grad_norm": 0.6465034484863281, + "learning_rate": 4.997070154891684e-05, + "loss": 0.7754, + "step": 1940 + }, + { + "epoch": 0.03130066293198928, + "grad_norm": 0.8467745780944824, + "learning_rate": 4.9970395622351296e-05, + "loss": 0.7839, + "step": 1950 + }, + { + "epoch": 0.031461179152153324, + "grad_norm": 0.5268826484680176, + "learning_rate": 4.9970088107815156e-05, + "loss": 0.8856, + "step": 1960 + }, + { + "epoch": 0.03162169537231737, + "grad_norm": 0.7875310778617859, + "learning_rate": 4.9969779005327966e-05, + "loss": 0.9103, + "step": 1970 + }, + { + "epoch": 0.03178221159248142, + "grad_norm": 0.6687209010124207, + "learning_rate": 4.9969468314909397e-05, + "loss": 0.8513, + "step": 1980 + }, + { + "epoch": 0.03194272781264547, + "grad_norm": 0.7546173334121704, + "learning_rate": 4.996915603657919e-05, + "loss": 0.881, + "step": 1990 + }, + { + "epoch": 0.032103244032809515, + "grad_norm": 0.688092827796936, + "learning_rate": 4.996884217035722e-05, + "loss": 0.8495, + "step": 2000 + }, + { + "epoch": 0.03226376025297356, + "grad_norm": 0.6381787061691284, + "learning_rate": 4.996852671626344e-05, + "loss": 0.7526, + "step": 2010 + }, + { + "epoch": 0.03242427647313761, + "grad_norm": 0.45199352502822876, + "learning_rate": 4.996820967431791e-05, + "loss": 0.7462, + "step": 2020 + }, + { + "epoch": 0.03258479269330166, + "grad_norm": 0.7628257870674133, + "learning_rate": 4.9967891044540796e-05, + "loss": 0.9483, + "step": 2030 + }, + { + "epoch": 0.03274530891346571, + "grad_norm": 0.8477115631103516, + "learning_rate": 4.996757082695235e-05, + "loss": 0.7421, + "step": 2040 + }, + { + "epoch": 0.03290582513362975, + "grad_norm": 1.6098670959472656, + "learning_rate": 4.9967249021572946e-05, + "loss": 0.8572, + "step": 2050 + }, + { + "epoch": 0.0330663413537938, + "grad_norm": 0.8056533932685852, + "learning_rate": 4.9966925628423053e-05, + "loss": 0.8795, + "step": 2060 + }, + { + "epoch": 0.033226857573957845, + "grad_norm": 0.6823061108589172, + "learning_rate": 4.996660064752323e-05, + "loss": 0.8182, + "step": 2070 + }, + { + "epoch": 0.0333873737941219, + "grad_norm": 0.7751241326332092, + "learning_rate": 4.9966274078894145e-05, + "loss": 0.8852, + "step": 2080 + }, + { + "epoch": 0.033547890014285944, + "grad_norm": 0.7673413753509521, + "learning_rate": 4.9965945922556565e-05, + "loss": 0.9195, + "step": 2090 + }, + { + "epoch": 0.03370840623444999, + "grad_norm": 0.6893574595451355, + "learning_rate": 4.9965616178531355e-05, + "loss": 0.8467, + "step": 2100 + }, + { + "epoch": 0.03386892245461404, + "grad_norm": 0.960910975933075, + "learning_rate": 4.9965284846839495e-05, + "loss": 0.8475, + "step": 2110 + }, + { + "epoch": 0.03402943867477809, + "grad_norm": 0.5912385582923889, + "learning_rate": 4.9964951927502046e-05, + "loss": 0.8889, + "step": 2120 + }, + { + "epoch": 0.034189954894942136, + "grad_norm": 0.9482006430625916, + "learning_rate": 4.996461742054018e-05, + "loss": 0.6803, + "step": 2130 + }, + { + "epoch": 0.03435047111510618, + "grad_norm": 0.783513069152832, + "learning_rate": 4.996428132597518e-05, + "loss": 0.8905, + "step": 2140 + }, + { + "epoch": 0.03451098733527023, + "grad_norm": 0.4125489294528961, + "learning_rate": 4.996394364382842e-05, + "loss": 0.8233, + "step": 2150 + }, + { + "epoch": 0.034671503555434274, + "grad_norm": 1.0927475690841675, + "learning_rate": 4.9963604374121356e-05, + "loss": 0.7269, + "step": 2160 + }, + { + "epoch": 0.03483201977559833, + "grad_norm": 0.7588656544685364, + "learning_rate": 4.996326351687558e-05, + "loss": 0.7631, + "step": 2170 + }, + { + "epoch": 0.034992535995762374, + "grad_norm": 0.544887900352478, + "learning_rate": 4.9962921072112766e-05, + "loss": 0.7945, + "step": 2180 + }, + { + "epoch": 0.03515305221592642, + "grad_norm": 0.5539636015892029, + "learning_rate": 4.996257703985469e-05, + "loss": 0.7643, + "step": 2190 + }, + { + "epoch": 0.035313568436090466, + "grad_norm": 0.6775923371315002, + "learning_rate": 4.996223142012322e-05, + "loss": 0.8405, + "step": 2200 + }, + { + "epoch": 0.03547408465625451, + "grad_norm": 0.6526904702186584, + "learning_rate": 4.996188421294035e-05, + "loss": 0.8095, + "step": 2210 + }, + { + "epoch": 0.035634600876418565, + "grad_norm": 0.7234871983528137, + "learning_rate": 4.996153541832816e-05, + "loss": 0.9221, + "step": 2220 + }, + { + "epoch": 0.03579511709658261, + "grad_norm": 0.50689297914505, + "learning_rate": 4.9961185036308824e-05, + "loss": 0.818, + "step": 2230 + }, + { + "epoch": 0.03595563331674666, + "grad_norm": 0.6010943055152893, + "learning_rate": 4.996083306690462e-05, + "loss": 0.8049, + "step": 2240 + }, + { + "epoch": 0.036116149536910704, + "grad_norm": 0.8233171105384827, + "learning_rate": 4.9960479510137945e-05, + "loss": 0.9811, + "step": 2250 + }, + { + "epoch": 0.03627666575707475, + "grad_norm": 0.7345826625823975, + "learning_rate": 4.996012436603128e-05, + "loss": 0.8951, + "step": 2260 + }, + { + "epoch": 0.0364371819772388, + "grad_norm": 0.5373842120170593, + "learning_rate": 4.99597676346072e-05, + "loss": 0.9217, + "step": 2270 + }, + { + "epoch": 0.03659769819740285, + "grad_norm": 1.1832997798919678, + "learning_rate": 4.995940931588841e-05, + "loss": 0.8409, + "step": 2280 + }, + { + "epoch": 0.036758214417566895, + "grad_norm": 0.8816674947738647, + "learning_rate": 4.995904940989766e-05, + "loss": 0.8836, + "step": 2290 + }, + { + "epoch": 0.03691873063773094, + "grad_norm": 0.6132352352142334, + "learning_rate": 4.995868791665788e-05, + "loss": 0.7748, + "step": 2300 + }, + { + "epoch": 0.03707924685789499, + "grad_norm": 0.6240204572677612, + "learning_rate": 4.995832483619204e-05, + "loss": 0.8774, + "step": 2310 + }, + { + "epoch": 0.03723976307805904, + "grad_norm": 0.6416323184967041, + "learning_rate": 4.9957960168523225e-05, + "loss": 0.7027, + "step": 2320 + }, + { + "epoch": 0.03740027929822309, + "grad_norm": 0.6976572275161743, + "learning_rate": 4.995759391367464e-05, + "loss": 0.7717, + "step": 2330 + }, + { + "epoch": 0.03756079551838713, + "grad_norm": 3.74861741065979, + "learning_rate": 4.995722607166956e-05, + "loss": 0.8162, + "step": 2340 + }, + { + "epoch": 0.03772131173855118, + "grad_norm": 2.118655204772949, + "learning_rate": 4.995685664253139e-05, + "loss": 0.8724, + "step": 2350 + }, + { + "epoch": 0.037881827958715225, + "grad_norm": 0.6331731677055359, + "learning_rate": 4.995648562628362e-05, + "loss": 0.8845, + "step": 2360 + }, + { + "epoch": 0.03804234417887928, + "grad_norm": 0.5381685495376587, + "learning_rate": 4.995611302294984e-05, + "loss": 0.8673, + "step": 2370 + }, + { + "epoch": 0.038202860399043324, + "grad_norm": 1.8272955417633057, + "learning_rate": 4.995573883255376e-05, + "loss": 0.9857, + "step": 2380 + }, + { + "epoch": 0.03836337661920737, + "grad_norm": 0.6454482078552246, + "learning_rate": 4.9955363055119156e-05, + "loss": 1.061, + "step": 2390 + }, + { + "epoch": 0.03852389283937142, + "grad_norm": 0.7973785996437073, + "learning_rate": 4.995498569066993e-05, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.03868440905953546, + "grad_norm": 0.7185001969337463, + "learning_rate": 4.995460673923009e-05, + "loss": 0.9035, + "step": 2410 + }, + { + "epoch": 0.038844925279699516, + "grad_norm": 0.7436386942863464, + "learning_rate": 4.995422620082373e-05, + "loss": 0.8305, + "step": 2420 + }, + { + "epoch": 0.03900544149986356, + "grad_norm": 0.6751053929328918, + "learning_rate": 4.995384407547506e-05, + "loss": 0.8371, + "step": 2430 + }, + { + "epoch": 0.03916595772002761, + "grad_norm": 0.9697868824005127, + "learning_rate": 4.995346036320836e-05, + "loss": 0.7688, + "step": 2440 + }, + { + "epoch": 0.039326473940191654, + "grad_norm": 0.975233793258667, + "learning_rate": 4.995307506404804e-05, + "loss": 0.7993, + "step": 2450 + }, + { + "epoch": 0.0394869901603557, + "grad_norm": 0.5289736390113831, + "learning_rate": 4.995268817801861e-05, + "loss": 0.8213, + "step": 2460 + }, + { + "epoch": 0.039647506380519754, + "grad_norm": 0.5036275386810303, + "learning_rate": 4.9952299705144666e-05, + "loss": 0.9674, + "step": 2470 + }, + { + "epoch": 0.0398080226006838, + "grad_norm": 1.447878122329712, + "learning_rate": 4.995190964545092e-05, + "loss": 0.8835, + "step": 2480 + }, + { + "epoch": 0.039968538820847846, + "grad_norm": 0.7461075782775879, + "learning_rate": 4.9951517998962175e-05, + "loss": 0.8119, + "step": 2490 + }, + { + "epoch": 0.04012905504101189, + "grad_norm": 0.5584686994552612, + "learning_rate": 4.995112476570334e-05, + "loss": 0.8959, + "step": 2500 + }, + { + "epoch": 0.040289571261175945, + "grad_norm": 0.9922535419464111, + "learning_rate": 4.9950729945699406e-05, + "loss": 0.8239, + "step": 2510 + }, + { + "epoch": 0.04045008748133999, + "grad_norm": 0.6139997839927673, + "learning_rate": 4.99503335389755e-05, + "loss": 1.0045, + "step": 2520 + }, + { + "epoch": 0.04061060370150404, + "grad_norm": 0.6656941771507263, + "learning_rate": 4.994993554555683e-05, + "loss": 0.8757, + "step": 2530 + }, + { + "epoch": 0.040771119921668084, + "grad_norm": 0.7273666858673096, + "learning_rate": 4.994953596546869e-05, + "loss": 0.7113, + "step": 2540 + }, + { + "epoch": 0.04093163614183213, + "grad_norm": 0.5209974050521851, + "learning_rate": 4.994913479873651e-05, + "loss": 0.8895, + "step": 2550 + }, + { + "epoch": 0.04109215236199618, + "grad_norm": 0.7448940277099609, + "learning_rate": 4.9948732045385786e-05, + "loss": 0.8022, + "step": 2560 + }, + { + "epoch": 0.04125266858216023, + "grad_norm": 0.8075540661811829, + "learning_rate": 4.9948327705442145e-05, + "loss": 0.8895, + "step": 2570 + }, + { + "epoch": 0.041413184802324275, + "grad_norm": 0.5702062845230103, + "learning_rate": 4.994792177893129e-05, + "loss": 0.8269, + "step": 2580 + }, + { + "epoch": 0.04157370102248832, + "grad_norm": 0.6395582556724548, + "learning_rate": 4.9947514265879035e-05, + "loss": 0.779, + "step": 2590 + }, + { + "epoch": 0.04173421724265237, + "grad_norm": 0.6368842720985413, + "learning_rate": 4.99471051663113e-05, + "loss": 0.8796, + "step": 2600 + }, + { + "epoch": 0.04189473346281642, + "grad_norm": 0.7900396585464478, + "learning_rate": 4.994669448025411e-05, + "loss": 0.9113, + "step": 2610 + }, + { + "epoch": 0.04205524968298047, + "grad_norm": 0.6968269348144531, + "learning_rate": 4.9946282207733574e-05, + "loss": 0.8294, + "step": 2620 + }, + { + "epoch": 0.04221576590314451, + "grad_norm": 0.7174068689346313, + "learning_rate": 4.9945868348775904e-05, + "loss": 0.7205, + "step": 2630 + }, + { + "epoch": 0.04237628212330856, + "grad_norm": 1.7732943296432495, + "learning_rate": 4.994545290340742e-05, + "loss": 0.8395, + "step": 2640 + }, + { + "epoch": 0.042536798343472605, + "grad_norm": 1.1805360317230225, + "learning_rate": 4.9945035871654555e-05, + "loss": 0.8429, + "step": 2650 + }, + { + "epoch": 0.04269731456363666, + "grad_norm": 0.5873373746871948, + "learning_rate": 4.994461725354381e-05, + "loss": 0.825, + "step": 2660 + }, + { + "epoch": 0.042857830783800704, + "grad_norm": 0.841671884059906, + "learning_rate": 4.994419704910183e-05, + "loss": 0.9324, + "step": 2670 + }, + { + "epoch": 0.04301834700396475, + "grad_norm": 0.7017927169799805, + "learning_rate": 4.994377525835532e-05, + "loss": 0.7808, + "step": 2680 + }, + { + "epoch": 0.0431788632241288, + "grad_norm": 0.8174642324447632, + "learning_rate": 4.9943351881331107e-05, + "loss": 0.7071, + "step": 2690 + }, + { + "epoch": 0.04333937944429284, + "grad_norm": 0.7433910369873047, + "learning_rate": 4.994292691805612e-05, + "loss": 0.8006, + "step": 2700 + }, + { + "epoch": 0.043499895664456896, + "grad_norm": 0.6868785619735718, + "learning_rate": 4.9942500368557376e-05, + "loss": 0.7952, + "step": 2710 + }, + { + "epoch": 0.04366041188462094, + "grad_norm": 0.7200313806533813, + "learning_rate": 4.9942072232862016e-05, + "loss": 0.8445, + "step": 2720 + }, + { + "epoch": 0.04382092810478499, + "grad_norm": 1.0808799266815186, + "learning_rate": 4.994164251099724e-05, + "loss": 0.7419, + "step": 2730 + }, + { + "epoch": 0.043981444324949034, + "grad_norm": 0.6139755249023438, + "learning_rate": 4.9941211202990415e-05, + "loss": 0.8413, + "step": 2740 + }, + { + "epoch": 0.04414196054511308, + "grad_norm": 0.7271484732627869, + "learning_rate": 4.994077830886893e-05, + "loss": 0.8343, + "step": 2750 + }, + { + "epoch": 0.044302476765277134, + "grad_norm": 0.943121612071991, + "learning_rate": 4.994034382866034e-05, + "loss": 0.9527, + "step": 2760 + }, + { + "epoch": 0.04446299298544118, + "grad_norm": 1.03134024143219, + "learning_rate": 4.993990776239227e-05, + "loss": 0.9253, + "step": 2770 + }, + { + "epoch": 0.044623509205605226, + "grad_norm": 0.8567516803741455, + "learning_rate": 4.993947011009244e-05, + "loss": 0.8859, + "step": 2780 + }, + { + "epoch": 0.04478402542576927, + "grad_norm": 0.9977815747261047, + "learning_rate": 4.99390308717887e-05, + "loss": 0.7696, + "step": 2790 + }, + { + "epoch": 0.04494454164593332, + "grad_norm": 0.8506883382797241, + "learning_rate": 4.993859004750896e-05, + "loss": 0.8843, + "step": 2800 + }, + { + "epoch": 0.04510505786609737, + "grad_norm": 0.6180737018585205, + "learning_rate": 4.993814763728129e-05, + "loss": 0.793, + "step": 2810 + }, + { + "epoch": 0.04526557408626142, + "grad_norm": 0.9680590629577637, + "learning_rate": 4.993770364113378e-05, + "loss": 0.9464, + "step": 2820 + }, + { + "epoch": 0.045426090306425464, + "grad_norm": 0.8054211139678955, + "learning_rate": 4.99372580590947e-05, + "loss": 0.8002, + "step": 2830 + }, + { + "epoch": 0.04558660652658951, + "grad_norm": 0.6427432298660278, + "learning_rate": 4.993681089119237e-05, + "loss": 0.877, + "step": 2840 + }, + { + "epoch": 0.045747122746753556, + "grad_norm": 0.639093279838562, + "learning_rate": 4.9936362137455234e-05, + "loss": 0.8048, + "step": 2850 + }, + { + "epoch": 0.04590763896691761, + "grad_norm": 0.7337633967399597, + "learning_rate": 4.993591179791183e-05, + "loss": 0.7931, + "step": 2860 + }, + { + "epoch": 0.046068155187081655, + "grad_norm": 0.7473687529563904, + "learning_rate": 4.993545987259078e-05, + "loss": 0.8375, + "step": 2870 + }, + { + "epoch": 0.0462286714072457, + "grad_norm": 0.5373056530952454, + "learning_rate": 4.993500636152085e-05, + "loss": 0.7359, + "step": 2880 + }, + { + "epoch": 0.04638918762740975, + "grad_norm": 0.8950625061988831, + "learning_rate": 4.993455126473088e-05, + "loss": 0.8986, + "step": 2890 + }, + { + "epoch": 0.0465497038475738, + "grad_norm": 0.6049714684486389, + "learning_rate": 4.993409458224978e-05, + "loss": 0.8017, + "step": 2900 + }, + { + "epoch": 0.04671022006773785, + "grad_norm": 0.6163057684898376, + "learning_rate": 4.993363631410662e-05, + "loss": 0.8693, + "step": 2910 + }, + { + "epoch": 0.04687073628790189, + "grad_norm": 0.7487789392471313, + "learning_rate": 4.9933176460330544e-05, + "loss": 0.8928, + "step": 2920 + }, + { + "epoch": 0.04703125250806594, + "grad_norm": 0.473751038312912, + "learning_rate": 4.993271502095078e-05, + "loss": 0.7566, + "step": 2930 + }, + { + "epoch": 0.047191768728229985, + "grad_norm": 0.7331180572509766, + "learning_rate": 4.9932251995996683e-05, + "loss": 0.7881, + "step": 2940 + }, + { + "epoch": 0.04735228494839404, + "grad_norm": 0.7221668362617493, + "learning_rate": 4.99317873854977e-05, + "loss": 0.9219, + "step": 2950 + }, + { + "epoch": 0.047512801168558084, + "grad_norm": 0.6089577078819275, + "learning_rate": 4.993132118948337e-05, + "loss": 0.7515, + "step": 2960 + }, + { + "epoch": 0.04767331738872213, + "grad_norm": 0.5228952765464783, + "learning_rate": 4.9930853407983344e-05, + "loss": 0.8704, + "step": 2970 + }, + { + "epoch": 0.04783383360888618, + "grad_norm": 0.7228468656539917, + "learning_rate": 4.9930384041027364e-05, + "loss": 0.8588, + "step": 2980 + }, + { + "epoch": 0.04799434982905022, + "grad_norm": 0.9450807571411133, + "learning_rate": 4.99299130886453e-05, + "loss": 0.7913, + "step": 2990 + }, + { + "epoch": 0.048154866049214276, + "grad_norm": 0.978137731552124, + "learning_rate": 4.992944055086708e-05, + "loss": 0.8482, + "step": 3000 + }, + { + "epoch": 0.04831538226937832, + "grad_norm": 0.6149356365203857, + "learning_rate": 4.9928966427722754e-05, + "loss": 0.8939, + "step": 3010 + }, + { + "epoch": 0.04847589848954237, + "grad_norm": 0.6904190182685852, + "learning_rate": 4.992849071924249e-05, + "loss": 0.8482, + "step": 3020 + }, + { + "epoch": 0.048636414709706415, + "grad_norm": 0.6471465826034546, + "learning_rate": 4.9928013425456534e-05, + "loss": 0.7764, + "step": 3030 + }, + { + "epoch": 0.04879693092987046, + "grad_norm": 0.985926628112793, + "learning_rate": 4.992753454639524e-05, + "loss": 0.7959, + "step": 3040 + }, + { + "epoch": 0.048957447150034514, + "grad_norm": 0.7977290749549866, + "learning_rate": 4.9927054082089045e-05, + "loss": 0.8771, + "step": 3050 + }, + { + "epoch": 0.04911796337019856, + "grad_norm": 0.6518970727920532, + "learning_rate": 4.992657203256853e-05, + "loss": 0.9357, + "step": 3060 + }, + { + "epoch": 0.049278479590362606, + "grad_norm": 0.6328456401824951, + "learning_rate": 4.992608839786433e-05, + "loss": 0.8856, + "step": 3070 + }, + { + "epoch": 0.04943899581052665, + "grad_norm": 0.6585646271705627, + "learning_rate": 4.992560317800721e-05, + "loss": 0.7295, + "step": 3080 + }, + { + "epoch": 0.0495995120306907, + "grad_norm": 0.5283005237579346, + "learning_rate": 4.992511637302803e-05, + "loss": 0.8418, + "step": 3090 + }, + { + "epoch": 0.04976002825085475, + "grad_norm": 0.7332683801651001, + "learning_rate": 4.992462798295774e-05, + "loss": 0.9007, + "step": 3100 + }, + { + "epoch": 0.0499205444710188, + "grad_norm": 0.665544867515564, + "learning_rate": 4.99241380078274e-05, + "loss": 0.8895, + "step": 3110 + }, + { + "epoch": 0.050081060691182844, + "grad_norm": 0.5490899682044983, + "learning_rate": 4.9923646447668185e-05, + "loss": 0.8446, + "step": 3120 + }, + { + "epoch": 0.05024157691134689, + "grad_norm": 0.7333801984786987, + "learning_rate": 4.992315330251134e-05, + "loss": 0.8417, + "step": 3130 + }, + { + "epoch": 0.050402093131510936, + "grad_norm": 0.6811645030975342, + "learning_rate": 4.992265857238822e-05, + "loss": 0.8592, + "step": 3140 + }, + { + "epoch": 0.05056260935167499, + "grad_norm": 0.7462317943572998, + "learning_rate": 4.99221622573303e-05, + "loss": 0.8462, + "step": 3150 + }, + { + "epoch": 0.050723125571839035, + "grad_norm": 0.5971294641494751, + "learning_rate": 4.992166435736914e-05, + "loss": 0.7506, + "step": 3160 + }, + { + "epoch": 0.05088364179200308, + "grad_norm": 0.7456640005111694, + "learning_rate": 4.9921164872536405e-05, + "loss": 0.8658, + "step": 3170 + }, + { + "epoch": 0.05104415801216713, + "grad_norm": 0.5070510506629944, + "learning_rate": 4.992066380286385e-05, + "loss": 0.8022, + "step": 3180 + }, + { + "epoch": 0.051204674232331174, + "grad_norm": 1.157918930053711, + "learning_rate": 4.992016114838335e-05, + "loss": 0.8357, + "step": 3190 + }, + { + "epoch": 0.05136519045249523, + "grad_norm": 0.709805965423584, + "learning_rate": 4.991965690912687e-05, + "loss": 0.7609, + "step": 3200 + }, + { + "epoch": 0.05152570667265927, + "grad_norm": 0.7477627396583557, + "learning_rate": 4.991915108512647e-05, + "loss": 0.9228, + "step": 3210 + }, + { + "epoch": 0.05168622289282332, + "grad_norm": 0.608065128326416, + "learning_rate": 4.9918643676414326e-05, + "loss": 0.8851, + "step": 3220 + }, + { + "epoch": 0.051846739112987365, + "grad_norm": 0.547233521938324, + "learning_rate": 4.9918134683022695e-05, + "loss": 0.8292, + "step": 3230 + }, + { + "epoch": 0.05200725533315142, + "grad_norm": 0.9720051884651184, + "learning_rate": 4.991762410498395e-05, + "loss": 0.7762, + "step": 3240 + }, + { + "epoch": 0.052167771553315465, + "grad_norm": 0.6115720868110657, + "learning_rate": 4.991711194233057e-05, + "loss": 0.6953, + "step": 3250 + }, + { + "epoch": 0.05232828777347951, + "grad_norm": 0.6271176338195801, + "learning_rate": 4.9916598195095124e-05, + "loss": 0.7572, + "step": 3260 + }, + { + "epoch": 0.05248880399364356, + "grad_norm": 0.6528588533401489, + "learning_rate": 4.991608286331027e-05, + "loss": 0.8558, + "step": 3270 + }, + { + "epoch": 0.0526493202138076, + "grad_norm": 0.7678706049919128, + "learning_rate": 4.991556594700879e-05, + "loss": 0.814, + "step": 3280 + }, + { + "epoch": 0.052809836433971656, + "grad_norm": 1.0857385396957397, + "learning_rate": 4.9915047446223564e-05, + "loss": 0.7628, + "step": 3290 + }, + { + "epoch": 0.0529703526541357, + "grad_norm": 0.7009933590888977, + "learning_rate": 4.9914527360987544e-05, + "loss": 0.9097, + "step": 3300 + }, + { + "epoch": 0.05313086887429975, + "grad_norm": 0.5350796580314636, + "learning_rate": 4.991400569133383e-05, + "loss": 0.8858, + "step": 3310 + }, + { + "epoch": 0.053291385094463795, + "grad_norm": 0.5796728134155273, + "learning_rate": 4.991348243729557e-05, + "loss": 0.8522, + "step": 3320 + }, + { + "epoch": 0.05345190131462784, + "grad_norm": 0.47660040855407715, + "learning_rate": 4.991295759890606e-05, + "loss": 0.8885, + "step": 3330 + }, + { + "epoch": 0.053612417534791894, + "grad_norm": 0.5526592135429382, + "learning_rate": 4.9912431176198674e-05, + "loss": 0.9435, + "step": 3340 + }, + { + "epoch": 0.05377293375495594, + "grad_norm": 0.6522344350814819, + "learning_rate": 4.991190316920689e-05, + "loss": 1.033, + "step": 3350 + }, + { + "epoch": 0.053933449975119986, + "grad_norm": 0.6035854816436768, + "learning_rate": 4.9911373577964276e-05, + "loss": 0.8746, + "step": 3360 + }, + { + "epoch": 0.05409396619528403, + "grad_norm": 0.9521782994270325, + "learning_rate": 4.991084240250452e-05, + "loss": 0.8489, + "step": 3370 + }, + { + "epoch": 0.05425448241544808, + "grad_norm": 0.5610072016716003, + "learning_rate": 4.9910309642861395e-05, + "loss": 0.8311, + "step": 3380 + }, + { + "epoch": 0.05441499863561213, + "grad_norm": 0.6748316884040833, + "learning_rate": 4.9909775299068786e-05, + "loss": 0.9182, + "step": 3390 + }, + { + "epoch": 0.05457551485577618, + "grad_norm": 0.486301064491272, + "learning_rate": 4.9909239371160676e-05, + "loss": 0.8221, + "step": 3400 + }, + { + "epoch": 0.054736031075940224, + "grad_norm": 0.5103740096092224, + "learning_rate": 4.990870185917115e-05, + "loss": 0.791, + "step": 3410 + }, + { + "epoch": 0.05489654729610427, + "grad_norm": 1.1566028594970703, + "learning_rate": 4.990816276313438e-05, + "loss": 0.8556, + "step": 3420 + }, + { + "epoch": 0.055057063516268316, + "grad_norm": 0.5324578881263733, + "learning_rate": 4.990762208308465e-05, + "loss": 0.7745, + "step": 3430 + }, + { + "epoch": 0.05521757973643237, + "grad_norm": 1.200225591659546, + "learning_rate": 4.9907079819056355e-05, + "loss": 0.7938, + "step": 3440 + }, + { + "epoch": 0.055378095956596415, + "grad_norm": 0.5813385844230652, + "learning_rate": 4.990653597108397e-05, + "loss": 0.8976, + "step": 3450 + }, + { + "epoch": 0.05553861217676046, + "grad_norm": 0.6667811274528503, + "learning_rate": 4.9905990539202084e-05, + "loss": 0.8747, + "step": 3460 + }, + { + "epoch": 0.05569912839692451, + "grad_norm": 1.685661792755127, + "learning_rate": 4.9905443523445385e-05, + "loss": 0.8028, + "step": 3470 + }, + { + "epoch": 0.055859644617088554, + "grad_norm": 1.422810435295105, + "learning_rate": 4.990489492384866e-05, + "loss": 0.6897, + "step": 3480 + }, + { + "epoch": 0.05602016083725261, + "grad_norm": 3.1430835723876953, + "learning_rate": 4.990434474044678e-05, + "loss": 0.7714, + "step": 3490 + }, + { + "epoch": 0.05618067705741665, + "grad_norm": 0.6666979193687439, + "learning_rate": 4.9903792973274764e-05, + "loss": 0.6677, + "step": 3500 + }, + { + "epoch": 0.0563411932775807, + "grad_norm": 0.5863631963729858, + "learning_rate": 4.9903239622367684e-05, + "loss": 0.8402, + "step": 3510 + }, + { + "epoch": 0.056501709497744745, + "grad_norm": 0.7695502042770386, + "learning_rate": 4.990268468776072e-05, + "loss": 0.8182, + "step": 3520 + }, + { + "epoch": 0.05666222571790879, + "grad_norm": 0.8387759923934937, + "learning_rate": 4.990212816948918e-05, + "loss": 0.7155, + "step": 3530 + }, + { + "epoch": 0.056822741938072845, + "grad_norm": 0.814034640789032, + "learning_rate": 4.990157006758845e-05, + "loss": 0.7612, + "step": 3540 + }, + { + "epoch": 0.05698325815823689, + "grad_norm": 0.6496325731277466, + "learning_rate": 4.9901010382094024e-05, + "loss": 0.8432, + "step": 3550 + }, + { + "epoch": 0.05714377437840094, + "grad_norm": 0.9213372468948364, + "learning_rate": 4.9900449113041495e-05, + "loss": 0.9179, + "step": 3560 + }, + { + "epoch": 0.05730429059856498, + "grad_norm": 3.3083972930908203, + "learning_rate": 4.989988626046654e-05, + "loss": 0.8018, + "step": 3570 + }, + { + "epoch": 0.05746480681872903, + "grad_norm": 1.7098815441131592, + "learning_rate": 4.989932182440498e-05, + "loss": 0.844, + "step": 3580 + }, + { + "epoch": 0.05762532303889308, + "grad_norm": 0.5278778076171875, + "learning_rate": 4.989875580489268e-05, + "loss": 0.8936, + "step": 3590 + }, + { + "epoch": 0.05778583925905713, + "grad_norm": 2.0705349445343018, + "learning_rate": 4.9898188201965665e-05, + "loss": 0.9053, + "step": 3600 + }, + { + "epoch": 0.057946355479221175, + "grad_norm": 0.45104074478149414, + "learning_rate": 4.989761901566001e-05, + "loss": 0.753, + "step": 3610 + }, + { + "epoch": 0.05810687169938522, + "grad_norm": 0.7248548865318298, + "learning_rate": 4.9897048246011936e-05, + "loss": 0.7593, + "step": 3620 + }, + { + "epoch": 0.058267387919549274, + "grad_norm": 0.9850057363510132, + "learning_rate": 4.9896475893057706e-05, + "loss": 0.8893, + "step": 3630 + }, + { + "epoch": 0.05842790413971332, + "grad_norm": 0.6829609274864197, + "learning_rate": 4.989590195683375e-05, + "loss": 0.8033, + "step": 3640 + }, + { + "epoch": 0.058588420359877366, + "grad_norm": 1.3653976917266846, + "learning_rate": 4.989532643737654e-05, + "loss": 0.8244, + "step": 3650 + }, + { + "epoch": 0.05874893658004141, + "grad_norm": 0.9167060256004333, + "learning_rate": 4.9894749334722704e-05, + "loss": 0.8357, + "step": 3660 + }, + { + "epoch": 0.05890945280020546, + "grad_norm": 0.7005366683006287, + "learning_rate": 4.989417064890891e-05, + "loss": 0.9093, + "step": 3670 + }, + { + "epoch": 0.05906996902036951, + "grad_norm": 0.7115045189857483, + "learning_rate": 4.9893590379971986e-05, + "loss": 0.8376, + "step": 3680 + }, + { + "epoch": 0.05923048524053356, + "grad_norm": 0.5496437549591064, + "learning_rate": 4.989300852794883e-05, + "loss": 0.7259, + "step": 3690 + }, + { + "epoch": 0.059391001460697604, + "grad_norm": 0.7465274930000305, + "learning_rate": 4.9892425092876436e-05, + "loss": 0.9612, + "step": 3700 + }, + { + "epoch": 0.05955151768086165, + "grad_norm": 2.5004208087921143, + "learning_rate": 4.989184007479191e-05, + "loss": 0.7713, + "step": 3710 + }, + { + "epoch": 0.059712033901025696, + "grad_norm": 0.5796469449996948, + "learning_rate": 4.989125347373245e-05, + "loss": 0.924, + "step": 3720 + }, + { + "epoch": 0.05987255012118975, + "grad_norm": 0.8845891952514648, + "learning_rate": 4.989066528973537e-05, + "loss": 0.8194, + "step": 3730 + }, + { + "epoch": 0.060033066341353795, + "grad_norm": 0.6454999446868896, + "learning_rate": 4.989007552283806e-05, + "loss": 0.8988, + "step": 3740 + }, + { + "epoch": 0.06019358256151784, + "grad_norm": 0.9743319749832153, + "learning_rate": 4.988948417307806e-05, + "loss": 0.7821, + "step": 3750 + }, + { + "epoch": 0.06035409878168189, + "grad_norm": 0.7008170485496521, + "learning_rate": 4.988889124049293e-05, + "loss": 0.7892, + "step": 3760 + }, + { + "epoch": 0.060514615001845934, + "grad_norm": 0.7502579092979431, + "learning_rate": 4.9888296725120414e-05, + "loss": 0.8698, + "step": 3770 + }, + { + "epoch": 0.06067513122200999, + "grad_norm": 0.9112087488174438, + "learning_rate": 4.9887700626998304e-05, + "loss": 0.9069, + "step": 3780 + }, + { + "epoch": 0.06083564744217403, + "grad_norm": 0.5786211490631104, + "learning_rate": 4.9887102946164504e-05, + "loss": 0.779, + "step": 3790 + }, + { + "epoch": 0.06099616366233808, + "grad_norm": 0.48969465494155884, + "learning_rate": 4.9886503682657035e-05, + "loss": 0.7792, + "step": 3800 + }, + { + "epoch": 0.061156679882502125, + "grad_norm": 0.5591505169868469, + "learning_rate": 4.9885902836514e-05, + "loss": 1.0106, + "step": 3810 + }, + { + "epoch": 0.06131719610266617, + "grad_norm": 0.7305964231491089, + "learning_rate": 4.98853004077736e-05, + "loss": 0.8644, + "step": 3820 + }, + { + "epoch": 0.061477712322830225, + "grad_norm": 1.0755397081375122, + "learning_rate": 4.988469639647417e-05, + "loss": 0.8506, + "step": 3830 + }, + { + "epoch": 0.06163822854299427, + "grad_norm": 0.5971508622169495, + "learning_rate": 4.9884090802654104e-05, + "loss": 0.9395, + "step": 3840 + }, + { + "epoch": 0.06179874476315832, + "grad_norm": 0.9282480478286743, + "learning_rate": 4.9883483626351914e-05, + "loss": 0.76, + "step": 3850 + }, + { + "epoch": 0.06195926098332236, + "grad_norm": 0.5379375219345093, + "learning_rate": 4.988287486760622e-05, + "loss": 0.7456, + "step": 3860 + }, + { + "epoch": 0.06211977720348641, + "grad_norm": 0.656202495098114, + "learning_rate": 4.988226452645573e-05, + "loss": 0.9439, + "step": 3870 + }, + { + "epoch": 0.06228029342365046, + "grad_norm": 0.8413389325141907, + "learning_rate": 4.988165260293926e-05, + "loss": 0.8367, + "step": 3880 + }, + { + "epoch": 0.06244080964381451, + "grad_norm": 0.7484732866287231, + "learning_rate": 4.9881039097095725e-05, + "loss": 0.7995, + "step": 3890 + }, + { + "epoch": 0.06260132586397855, + "grad_norm": 0.7127920985221863, + "learning_rate": 4.988042400896414e-05, + "loss": 0.7065, + "step": 3900 + }, + { + "epoch": 0.06276184208414261, + "grad_norm": 0.7519083619117737, + "learning_rate": 4.9879807338583615e-05, + "loss": 0.8589, + "step": 3910 + }, + { + "epoch": 0.06292235830430665, + "grad_norm": 0.7857540249824524, + "learning_rate": 4.9879189085993385e-05, + "loss": 0.8187, + "step": 3920 + }, + { + "epoch": 0.0630828745244707, + "grad_norm": 0.6778761744499207, + "learning_rate": 4.987856925123275e-05, + "loss": 0.832, + "step": 3930 + }, + { + "epoch": 0.06324339074463474, + "grad_norm": 0.5746778249740601, + "learning_rate": 4.9877947834341135e-05, + "loss": 0.8324, + "step": 3940 + }, + { + "epoch": 0.06340390696479879, + "grad_norm": 0.8190612196922302, + "learning_rate": 4.987732483535805e-05, + "loss": 0.8077, + "step": 3950 + }, + { + "epoch": 0.06356442318496285, + "grad_norm": 0.6815077066421509, + "learning_rate": 4.987670025432313e-05, + "loss": 0.8131, + "step": 3960 + }, + { + "epoch": 0.06372493940512688, + "grad_norm": 0.5266811847686768, + "learning_rate": 4.9876074091276085e-05, + "loss": 0.9279, + "step": 3970 + }, + { + "epoch": 0.06388545562529094, + "grad_norm": 0.5469416379928589, + "learning_rate": 4.987544634625673e-05, + "loss": 0.9088, + "step": 3980 + }, + { + "epoch": 0.06404597184545498, + "grad_norm": 1.888418197631836, + "learning_rate": 4.9874817019304984e-05, + "loss": 0.6911, + "step": 3990 + }, + { + "epoch": 0.06420648806561903, + "grad_norm": 0.7068798542022705, + "learning_rate": 4.987418611046089e-05, + "loss": 0.8426, + "step": 4000 + }, + { + "epoch": 0.06420648806561903, + "eval_loss": 0.8358421921730042, + "eval_runtime": 1832.6384, + "eval_samples_per_second": 14.313, + "eval_steps_per_second": 1.789, + "step": 4000 + }, + { + "epoch": 0.06436700428578308, + "grad_norm": 0.7507159113883972, + "learning_rate": 4.987355361976455e-05, + "loss": 0.747, + "step": 4010 + }, + { + "epoch": 0.06452752050594712, + "grad_norm": 0.5392659306526184, + "learning_rate": 4.987291954725619e-05, + "loss": 0.91, + "step": 4020 + }, + { + "epoch": 0.06468803672611118, + "grad_norm": 1.0730758905410767, + "learning_rate": 4.987228389297614e-05, + "loss": 0.8541, + "step": 4030 + }, + { + "epoch": 0.06484855294627521, + "grad_norm": 0.6917558908462524, + "learning_rate": 4.987164665696482e-05, + "loss": 0.7526, + "step": 4040 + }, + { + "epoch": 0.06500906916643927, + "grad_norm": 1.0618762969970703, + "learning_rate": 4.987100783926275e-05, + "loss": 0.8195, + "step": 4050 + }, + { + "epoch": 0.06516958538660332, + "grad_norm": 0.6466637253761292, + "learning_rate": 4.9870367439910565e-05, + "loss": 0.7317, + "step": 4060 + }, + { + "epoch": 0.06533010160676736, + "grad_norm": 0.688151478767395, + "learning_rate": 4.9869725458948976e-05, + "loss": 0.9373, + "step": 4070 + }, + { + "epoch": 0.06549061782693141, + "grad_norm": 0.6851164102554321, + "learning_rate": 4.986908189641883e-05, + "loss": 0.8585, + "step": 4080 + }, + { + "epoch": 0.06565113404709545, + "grad_norm": 0.7174627780914307, + "learning_rate": 4.9868436752361034e-05, + "loss": 0.9409, + "step": 4090 + }, + { + "epoch": 0.0658116502672595, + "grad_norm": 0.590286135673523, + "learning_rate": 4.986779002681663e-05, + "loss": 0.8813, + "step": 4100 + }, + { + "epoch": 0.06597216648742356, + "grad_norm": 0.6998488903045654, + "learning_rate": 4.986714171982674e-05, + "loss": 0.7804, + "step": 4110 + }, + { + "epoch": 0.0661326827075876, + "grad_norm": 0.5555098056793213, + "learning_rate": 4.986649183143258e-05, + "loss": 0.7455, + "step": 4120 + }, + { + "epoch": 0.06629319892775165, + "grad_norm": 0.7227287292480469, + "learning_rate": 4.98658403616755e-05, + "loss": 0.7841, + "step": 4130 + }, + { + "epoch": 0.06645371514791569, + "grad_norm": 0.7411467432975769, + "learning_rate": 4.986518731059692e-05, + "loss": 0.7846, + "step": 4140 + }, + { + "epoch": 0.06661423136807974, + "grad_norm": 0.8251270055770874, + "learning_rate": 4.9864532678238374e-05, + "loss": 0.9007, + "step": 4150 + }, + { + "epoch": 0.0667747475882438, + "grad_norm": 0.8036007881164551, + "learning_rate": 4.9863876464641485e-05, + "loss": 0.9157, + "step": 4160 + }, + { + "epoch": 0.06693526380840784, + "grad_norm": 0.7135885953903198, + "learning_rate": 4.986321866984799e-05, + "loss": 0.8454, + "step": 4170 + }, + { + "epoch": 0.06709578002857189, + "grad_norm": 0.4591521918773651, + "learning_rate": 4.986255929389972e-05, + "loss": 0.8012, + "step": 4180 + }, + { + "epoch": 0.06725629624873594, + "grad_norm": 0.8118031620979309, + "learning_rate": 4.986189833683861e-05, + "loss": 0.8181, + "step": 4190 + }, + { + "epoch": 0.06741681246889998, + "grad_norm": 0.5405042171478271, + "learning_rate": 4.986123579870669e-05, + "loss": 0.828, + "step": 4200 + }, + { + "epoch": 0.06757732868906403, + "grad_norm": 0.8515379428863525, + "learning_rate": 4.986057167954609e-05, + "loss": 0.8146, + "step": 4210 + }, + { + "epoch": 0.06773784490922807, + "grad_norm": 0.721128523349762, + "learning_rate": 4.9859905979399044e-05, + "loss": 0.7422, + "step": 4220 + }, + { + "epoch": 0.06789836112939213, + "grad_norm": 0.9396044015884399, + "learning_rate": 4.9859238698307896e-05, + "loss": 0.8526, + "step": 4230 + }, + { + "epoch": 0.06805887734955618, + "grad_norm": 0.6598013639450073, + "learning_rate": 4.985856983631507e-05, + "loss": 0.9101, + "step": 4240 + }, + { + "epoch": 0.06821939356972022, + "grad_norm": 0.6736505031585693, + "learning_rate": 4.985789939346312e-05, + "loss": 0.8183, + "step": 4250 + }, + { + "epoch": 0.06837990978988427, + "grad_norm": 0.9446700811386108, + "learning_rate": 4.9857227369794664e-05, + "loss": 0.8493, + "step": 4260 + }, + { + "epoch": 0.06854042601004831, + "grad_norm": 0.5616721510887146, + "learning_rate": 4.985655376535244e-05, + "loss": 0.9115, + "step": 4270 + }, + { + "epoch": 0.06870094223021236, + "grad_norm": 0.7519762516021729, + "learning_rate": 4.9855878580179294e-05, + "loss": 0.8474, + "step": 4280 + }, + { + "epoch": 0.06886145845037642, + "grad_norm": 0.9141714572906494, + "learning_rate": 4.9855201814318156e-05, + "loss": 0.8172, + "step": 4290 + }, + { + "epoch": 0.06902197467054046, + "grad_norm": 0.8487675786018372, + "learning_rate": 4.985452346781207e-05, + "loss": 0.8395, + "step": 4300 + }, + { + "epoch": 0.06918249089070451, + "grad_norm": 0.44210654497146606, + "learning_rate": 4.985384354070417e-05, + "loss": 0.8098, + "step": 4310 + }, + { + "epoch": 0.06934300711086855, + "grad_norm": 0.48085522651672363, + "learning_rate": 4.98531620330377e-05, + "loss": 0.834, + "step": 4320 + }, + { + "epoch": 0.0695035233310326, + "grad_norm": 0.5930798053741455, + "learning_rate": 4.985247894485601e-05, + "loss": 0.8461, + "step": 4330 + }, + { + "epoch": 0.06966403955119665, + "grad_norm": 0.5532956719398499, + "learning_rate": 4.9851794276202516e-05, + "loss": 0.7806, + "step": 4340 + }, + { + "epoch": 0.0698245557713607, + "grad_norm": 0.5843446254730225, + "learning_rate": 4.9851108027120765e-05, + "loss": 0.756, + "step": 4350 + }, + { + "epoch": 0.06998507199152475, + "grad_norm": 0.6215834021568298, + "learning_rate": 4.9850420197654414e-05, + "loss": 0.8625, + "step": 4360 + }, + { + "epoch": 0.07014558821168879, + "grad_norm": 0.823356568813324, + "learning_rate": 4.984973078784719e-05, + "loss": 0.7629, + "step": 4370 + }, + { + "epoch": 0.07030610443185284, + "grad_norm": 0.6540110111236572, + "learning_rate": 4.9849039797742945e-05, + "loss": 0.8084, + "step": 4380 + }, + { + "epoch": 0.07046662065201689, + "grad_norm": 0.4977014362812042, + "learning_rate": 4.984834722738562e-05, + "loss": 0.7975, + "step": 4390 + }, + { + "epoch": 0.07062713687218093, + "grad_norm": 0.8272045850753784, + "learning_rate": 4.984765307681925e-05, + "loss": 0.816, + "step": 4400 + }, + { + "epoch": 0.07078765309234498, + "grad_norm": 0.5218859314918518, + "learning_rate": 4.9846957346087994e-05, + "loss": 0.753, + "step": 4410 + }, + { + "epoch": 0.07094816931250902, + "grad_norm": 0.6901600360870361, + "learning_rate": 4.9846260035236084e-05, + "loss": 0.909, + "step": 4420 + }, + { + "epoch": 0.07110868553267308, + "grad_norm": 0.9629558324813843, + "learning_rate": 4.984556114430786e-05, + "loss": 0.8082, + "step": 4430 + }, + { + "epoch": 0.07126920175283713, + "grad_norm": 0.7795243263244629, + "learning_rate": 4.984486067334779e-05, + "loss": 0.836, + "step": 4440 + }, + { + "epoch": 0.07142971797300117, + "grad_norm": 0.6706166863441467, + "learning_rate": 4.98441586224004e-05, + "loss": 0.7804, + "step": 4450 + }, + { + "epoch": 0.07159023419316522, + "grad_norm": 0.6811739802360535, + "learning_rate": 4.984345499151034e-05, + "loss": 0.8111, + "step": 4460 + }, + { + "epoch": 0.07175075041332926, + "grad_norm": 0.5750110745429993, + "learning_rate": 4.984274978072236e-05, + "loss": 0.8333, + "step": 4470 + }, + { + "epoch": 0.07191126663349331, + "grad_norm": 1.889555811882019, + "learning_rate": 4.9842042990081304e-05, + "loss": 0.8059, + "step": 4480 + }, + { + "epoch": 0.07207178285365737, + "grad_norm": 0.5498510599136353, + "learning_rate": 4.9841334619632126e-05, + "loss": 0.8281, + "step": 4490 + }, + { + "epoch": 0.07223229907382141, + "grad_norm": 0.5821611881256104, + "learning_rate": 4.9840624669419866e-05, + "loss": 0.8625, + "step": 4500 + }, + { + "epoch": 0.07239281529398546, + "grad_norm": 0.734484076499939, + "learning_rate": 4.983991313948968e-05, + "loss": 0.8952, + "step": 4510 + }, + { + "epoch": 0.0725533315141495, + "grad_norm": 0.7995845675468445, + "learning_rate": 4.9839200029886814e-05, + "loss": 0.8329, + "step": 4520 + }, + { + "epoch": 0.07271384773431355, + "grad_norm": 0.6916190385818481, + "learning_rate": 4.983848534065661e-05, + "loss": 0.8466, + "step": 4530 + }, + { + "epoch": 0.0728743639544776, + "grad_norm": 0.6343372464179993, + "learning_rate": 4.983776907184453e-05, + "loss": 0.8051, + "step": 4540 + }, + { + "epoch": 0.07303488017464164, + "grad_norm": 0.5903369784355164, + "learning_rate": 4.9837051223496125e-05, + "loss": 0.9284, + "step": 4550 + }, + { + "epoch": 0.0731953963948057, + "grad_norm": 0.37210342288017273, + "learning_rate": 4.983633179565703e-05, + "loss": 0.9008, + "step": 4560 + }, + { + "epoch": 0.07335591261496974, + "grad_norm": 0.5192412734031677, + "learning_rate": 4.9835610788373024e-05, + "loss": 0.7832, + "step": 4570 + }, + { + "epoch": 0.07351642883513379, + "grad_norm": 0.5392162799835205, + "learning_rate": 4.983488820168992e-05, + "loss": 0.7748, + "step": 4580 + }, + { + "epoch": 0.07367694505529784, + "grad_norm": 0.755095899105072, + "learning_rate": 4.983416403565371e-05, + "loss": 0.8098, + "step": 4590 + }, + { + "epoch": 0.07383746127546188, + "grad_norm": 0.7568362951278687, + "learning_rate": 4.9833438290310426e-05, + "loss": 0.818, + "step": 4600 + }, + { + "epoch": 0.07399797749562594, + "grad_norm": 0.6639801263809204, + "learning_rate": 4.983271096570622e-05, + "loss": 0.8011, + "step": 4610 + }, + { + "epoch": 0.07415849371578997, + "grad_norm": 0.7641569375991821, + "learning_rate": 4.983198206188735e-05, + "loss": 0.8315, + "step": 4620 + }, + { + "epoch": 0.07431900993595403, + "grad_norm": 0.6164222955703735, + "learning_rate": 4.983125157890017e-05, + "loss": 0.8094, + "step": 4630 + }, + { + "epoch": 0.07447952615611808, + "grad_norm": 0.680185079574585, + "learning_rate": 4.983051951679114e-05, + "loss": 0.8129, + "step": 4640 + }, + { + "epoch": 0.07464004237628212, + "grad_norm": 1.4984862804412842, + "learning_rate": 4.9829785875606805e-05, + "loss": 0.7802, + "step": 4650 + }, + { + "epoch": 0.07480055859644617, + "grad_norm": 0.739003598690033, + "learning_rate": 4.982905065539382e-05, + "loss": 0.8073, + "step": 4660 + }, + { + "epoch": 0.07496107481661021, + "grad_norm": 0.5276608467102051, + "learning_rate": 4.982831385619895e-05, + "loss": 0.8837, + "step": 4670 + }, + { + "epoch": 0.07512159103677427, + "grad_norm": 0.5367230772972107, + "learning_rate": 4.982757547806904e-05, + "loss": 0.8444, + "step": 4680 + }, + { + "epoch": 0.07528210725693832, + "grad_norm": 1.1381862163543701, + "learning_rate": 4.982683552105106e-05, + "loss": 0.7094, + "step": 4690 + }, + { + "epoch": 0.07544262347710236, + "grad_norm": 0.8516503572463989, + "learning_rate": 4.9826093985192066e-05, + "loss": 0.8708, + "step": 4700 + }, + { + "epoch": 0.07560313969726641, + "grad_norm": 0.6097139716148376, + "learning_rate": 4.98253508705392e-05, + "loss": 0.8652, + "step": 4710 + }, + { + "epoch": 0.07576365591743045, + "grad_norm": 0.6801013350486755, + "learning_rate": 4.982460617713973e-05, + "loss": 0.7858, + "step": 4720 + }, + { + "epoch": 0.0759241721375945, + "grad_norm": 0.6138126254081726, + "learning_rate": 4.9823859905041006e-05, + "loss": 0.7776, + "step": 4730 + }, + { + "epoch": 0.07608468835775856, + "grad_norm": 1.023868441581726, + "learning_rate": 4.98231120542905e-05, + "loss": 0.9303, + "step": 4740 + }, + { + "epoch": 0.0762452045779226, + "grad_norm": 0.6560516953468323, + "learning_rate": 4.982236262493577e-05, + "loss": 0.7817, + "step": 4750 + }, + { + "epoch": 0.07640572079808665, + "grad_norm": 0.8147192001342773, + "learning_rate": 4.982161161702446e-05, + "loss": 0.844, + "step": 4760 + }, + { + "epoch": 0.07656623701825069, + "grad_norm": 0.7299351692199707, + "learning_rate": 4.982085903060434e-05, + "loss": 0.7914, + "step": 4770 + }, + { + "epoch": 0.07672675323841474, + "grad_norm": 0.6195127367973328, + "learning_rate": 4.982010486572327e-05, + "loss": 0.9299, + "step": 4780 + }, + { + "epoch": 0.0768872694585788, + "grad_norm": 0.8138960003852844, + "learning_rate": 4.9819349122429214e-05, + "loss": 0.8163, + "step": 4790 + }, + { + "epoch": 0.07704778567874283, + "grad_norm": 0.5003154277801514, + "learning_rate": 4.981859180077022e-05, + "loss": 0.8374, + "step": 4800 + }, + { + "epoch": 0.07720830189890689, + "grad_norm": 0.6467990875244141, + "learning_rate": 4.981783290079447e-05, + "loss": 0.9608, + "step": 4810 + }, + { + "epoch": 0.07736881811907093, + "grad_norm": 0.6087930202484131, + "learning_rate": 4.98170724225502e-05, + "loss": 0.9208, + "step": 4820 + }, + { + "epoch": 0.07752933433923498, + "grad_norm": 0.7973120808601379, + "learning_rate": 4.98163103660858e-05, + "loss": 0.7762, + "step": 4830 + }, + { + "epoch": 0.07768985055939903, + "grad_norm": 0.7601298689842224, + "learning_rate": 4.981554673144971e-05, + "loss": 0.7299, + "step": 4840 + }, + { + "epoch": 0.07785036677956307, + "grad_norm": 0.5610087513923645, + "learning_rate": 4.98147815186905e-05, + "loss": 0.7748, + "step": 4850 + }, + { + "epoch": 0.07801088299972712, + "grad_norm": 0.5368123650550842, + "learning_rate": 4.981401472785683e-05, + "loss": 0.7782, + "step": 4860 + }, + { + "epoch": 0.07817139921989116, + "grad_norm": 0.5782795548439026, + "learning_rate": 4.981324635899747e-05, + "loss": 0.8542, + "step": 4870 + }, + { + "epoch": 0.07833191544005522, + "grad_norm": 1.3043900728225708, + "learning_rate": 4.981247641216128e-05, + "loss": 0.8074, + "step": 4880 + }, + { + "epoch": 0.07849243166021927, + "grad_norm": 0.8274016976356506, + "learning_rate": 4.9811704887397225e-05, + "loss": 0.8595, + "step": 4890 + }, + { + "epoch": 0.07865294788038331, + "grad_norm": 0.7085676789283752, + "learning_rate": 4.9810931784754375e-05, + "loss": 0.8265, + "step": 4900 + }, + { + "epoch": 0.07881346410054736, + "grad_norm": 0.6755526065826416, + "learning_rate": 4.981015710428189e-05, + "loss": 0.8258, + "step": 4910 + }, + { + "epoch": 0.0789739803207114, + "grad_norm": 0.6607016324996948, + "learning_rate": 4.980938084602902e-05, + "loss": 0.7691, + "step": 4920 + }, + { + "epoch": 0.07913449654087545, + "grad_norm": 1.049286127090454, + "learning_rate": 4.980860301004515e-05, + "loss": 0.8641, + "step": 4930 + }, + { + "epoch": 0.07929501276103951, + "grad_norm": 0.6621214151382446, + "learning_rate": 4.980782359637974e-05, + "loss": 0.8625, + "step": 4940 + }, + { + "epoch": 0.07945552898120355, + "grad_norm": 0.6265298128128052, + "learning_rate": 4.9807042605082364e-05, + "loss": 0.7949, + "step": 4950 + }, + { + "epoch": 0.0796160452013676, + "grad_norm": 0.6270440816879272, + "learning_rate": 4.980626003620268e-05, + "loss": 0.9143, + "step": 4960 + }, + { + "epoch": 0.07977656142153165, + "grad_norm": 0.7270088195800781, + "learning_rate": 4.9805475889790446e-05, + "loss": 0.7329, + "step": 4970 + }, + { + "epoch": 0.07993707764169569, + "grad_norm": 0.5899733304977417, + "learning_rate": 4.980469016589554e-05, + "loss": 0.8865, + "step": 4980 + }, + { + "epoch": 0.08009759386185974, + "grad_norm": 0.9041550755500793, + "learning_rate": 4.9803902864567944e-05, + "loss": 0.8496, + "step": 4990 + }, + { + "epoch": 0.08025811008202378, + "grad_norm": 0.5721524357795715, + "learning_rate": 4.9803113985857697e-05, + "loss": 0.8068, + "step": 5000 + }, + { + "epoch": 0.08041862630218784, + "grad_norm": 0.658949613571167, + "learning_rate": 4.9802323529814974e-05, + "loss": 0.7371, + "step": 5010 + }, + { + "epoch": 0.08057914252235189, + "grad_norm": 0.6667006015777588, + "learning_rate": 4.980153149649006e-05, + "loss": 0.7961, + "step": 5020 + }, + { + "epoch": 0.08073965874251593, + "grad_norm": 0.8930113911628723, + "learning_rate": 4.9800737885933315e-05, + "loss": 0.7279, + "step": 5030 + }, + { + "epoch": 0.08090017496267998, + "grad_norm": 0.5951740145683289, + "learning_rate": 4.9799942698195197e-05, + "loss": 0.8665, + "step": 5040 + }, + { + "epoch": 0.08106069118284402, + "grad_norm": 0.4891197383403778, + "learning_rate": 4.9799145933326284e-05, + "loss": 0.9868, + "step": 5050 + }, + { + "epoch": 0.08122120740300807, + "grad_norm": 0.4886051416397095, + "learning_rate": 4.979834759137725e-05, + "loss": 0.7738, + "step": 5060 + }, + { + "epoch": 0.08138172362317213, + "grad_norm": 0.9526973366737366, + "learning_rate": 4.9797547672398864e-05, + "loss": 0.8482, + "step": 5070 + }, + { + "epoch": 0.08154223984333617, + "grad_norm": 0.581875205039978, + "learning_rate": 4.979674617644199e-05, + "loss": 0.7837, + "step": 5080 + }, + { + "epoch": 0.08170275606350022, + "grad_norm": 0.5297834873199463, + "learning_rate": 4.9795943103557604e-05, + "loss": 0.9312, + "step": 5090 + }, + { + "epoch": 0.08186327228366426, + "grad_norm": 0.5918177962303162, + "learning_rate": 4.979513845379677e-05, + "loss": 0.8669, + "step": 5100 + }, + { + "epoch": 0.08202378850382831, + "grad_norm": 0.499869704246521, + "learning_rate": 4.979433222721066e-05, + "loss": 0.7905, + "step": 5110 + }, + { + "epoch": 0.08218430472399237, + "grad_norm": 1.0463935136795044, + "learning_rate": 4.979352442385056e-05, + "loss": 0.8642, + "step": 5120 + }, + { + "epoch": 0.0823448209441564, + "grad_norm": 0.41697394847869873, + "learning_rate": 4.9792715043767815e-05, + "loss": 0.7562, + "step": 5130 + }, + { + "epoch": 0.08250533716432046, + "grad_norm": 1.1177881956100464, + "learning_rate": 4.979190408701392e-05, + "loss": 0.789, + "step": 5140 + }, + { + "epoch": 0.0826658533844845, + "grad_norm": 1.567852258682251, + "learning_rate": 4.979109155364043e-05, + "loss": 0.8806, + "step": 5150 + }, + { + "epoch": 0.08282636960464855, + "grad_norm": 0.4269421696662903, + "learning_rate": 4.979027744369904e-05, + "loss": 0.8118, + "step": 5160 + }, + { + "epoch": 0.0829868858248126, + "grad_norm": 0.5848773121833801, + "learning_rate": 4.978946175724151e-05, + "loss": 0.842, + "step": 5170 + }, + { + "epoch": 0.08314740204497664, + "grad_norm": 0.8039216995239258, + "learning_rate": 4.97886444943197e-05, + "loss": 0.7718, + "step": 5180 + }, + { + "epoch": 0.0833079182651407, + "grad_norm": 0.5829214453697205, + "learning_rate": 4.97878256549856e-05, + "loss": 0.7354, + "step": 5190 + }, + { + "epoch": 0.08346843448530473, + "grad_norm": 0.4820760190486908, + "learning_rate": 4.9787005239291276e-05, + "loss": 0.8718, + "step": 5200 + }, + { + "epoch": 0.08362895070546879, + "grad_norm": 0.9006637930870056, + "learning_rate": 4.97861832472889e-05, + "loss": 0.8186, + "step": 5210 + }, + { + "epoch": 0.08378946692563284, + "grad_norm": 1.207576036453247, + "learning_rate": 4.978535967903075e-05, + "loss": 0.7678, + "step": 5220 + }, + { + "epoch": 0.08394998314579688, + "grad_norm": 0.7606614828109741, + "learning_rate": 4.978453453456921e-05, + "loss": 0.8689, + "step": 5230 + }, + { + "epoch": 0.08411049936596093, + "grad_norm": 0.8895362019538879, + "learning_rate": 4.978370781395674e-05, + "loss": 0.8608, + "step": 5240 + }, + { + "epoch": 0.08427101558612497, + "grad_norm": 0.6444730162620544, + "learning_rate": 4.978287951724592e-05, + "loss": 0.8331, + "step": 5250 + }, + { + "epoch": 0.08443153180628903, + "grad_norm": 0.8593733906745911, + "learning_rate": 4.978204964448941e-05, + "loss": 0.7451, + "step": 5260 + }, + { + "epoch": 0.08459204802645308, + "grad_norm": 0.965161919593811, + "learning_rate": 4.978121819574001e-05, + "loss": 0.9464, + "step": 5270 + }, + { + "epoch": 0.08475256424661712, + "grad_norm": 0.9342039823532104, + "learning_rate": 4.9780385171050586e-05, + "loss": 0.9307, + "step": 5280 + }, + { + "epoch": 0.08491308046678117, + "grad_norm": 0.6402196288108826, + "learning_rate": 4.977955057047411e-05, + "loss": 0.9662, + "step": 5290 + }, + { + "epoch": 0.08507359668694521, + "grad_norm": 0.5332078337669373, + "learning_rate": 4.977871439406365e-05, + "loss": 0.8155, + "step": 5300 + }, + { + "epoch": 0.08523411290710926, + "grad_norm": 0.885236918926239, + "learning_rate": 4.9777876641872403e-05, + "loss": 0.9573, + "step": 5310 + }, + { + "epoch": 0.08539462912727332, + "grad_norm": 0.7114346623420715, + "learning_rate": 4.9777037313953625e-05, + "loss": 0.9361, + "step": 5320 + }, + { + "epoch": 0.08555514534743736, + "grad_norm": 0.8049807548522949, + "learning_rate": 4.97761964103607e-05, + "loss": 0.8305, + "step": 5330 + }, + { + "epoch": 0.08571566156760141, + "grad_norm": 0.6812551021575928, + "learning_rate": 4.977535393114711e-05, + "loss": 0.8538, + "step": 5340 + }, + { + "epoch": 0.08587617778776545, + "grad_norm": 0.4870963990688324, + "learning_rate": 4.9774509876366424e-05, + "loss": 0.9374, + "step": 5350 + }, + { + "epoch": 0.0860366940079295, + "grad_norm": 0.6675782203674316, + "learning_rate": 4.977366424607232e-05, + "loss": 0.8467, + "step": 5360 + }, + { + "epoch": 0.08619721022809355, + "grad_norm": 0.7909801006317139, + "learning_rate": 4.977281704031858e-05, + "loss": 0.8749, + "step": 5370 + }, + { + "epoch": 0.0863577264482576, + "grad_norm": 0.9837456941604614, + "learning_rate": 4.977196825915907e-05, + "loss": 0.971, + "step": 5380 + }, + { + "epoch": 0.08651824266842165, + "grad_norm": 0.6306933760643005, + "learning_rate": 4.9771117902647784e-05, + "loss": 1.0623, + "step": 5390 + }, + { + "epoch": 0.08667875888858569, + "grad_norm": 0.8662747144699097, + "learning_rate": 4.977026597083879e-05, + "loss": 0.846, + "step": 5400 + }, + { + "epoch": 0.08683927510874974, + "grad_norm": 0.8349126577377319, + "learning_rate": 4.976941246378627e-05, + "loss": 0.7465, + "step": 5410 + }, + { + "epoch": 0.08699979132891379, + "grad_norm": 0.8934556841850281, + "learning_rate": 4.976855738154449e-05, + "loss": 0.7782, + "step": 5420 + }, + { + "epoch": 0.08716030754907783, + "grad_norm": 0.5733688473701477, + "learning_rate": 4.9767700724167845e-05, + "loss": 0.8108, + "step": 5430 + }, + { + "epoch": 0.08732082376924188, + "grad_norm": 0.6906566619873047, + "learning_rate": 4.97668424917108e-05, + "loss": 0.8036, + "step": 5440 + }, + { + "epoch": 0.08748133998940592, + "grad_norm": 0.7255134582519531, + "learning_rate": 4.976598268422794e-05, + "loss": 0.8437, + "step": 5450 + }, + { + "epoch": 0.08764185620956998, + "grad_norm": 0.9949924945831299, + "learning_rate": 4.9765121301773946e-05, + "loss": 0.7983, + "step": 5460 + }, + { + "epoch": 0.08780237242973403, + "grad_norm": 0.5099851489067078, + "learning_rate": 4.976425834440359e-05, + "loss": 0.8442, + "step": 5470 + }, + { + "epoch": 0.08796288864989807, + "grad_norm": 0.6770345568656921, + "learning_rate": 4.9763393812171754e-05, + "loss": 0.7248, + "step": 5480 + }, + { + "epoch": 0.08812340487006212, + "grad_norm": 0.5260286927223206, + "learning_rate": 4.9762527705133425e-05, + "loss": 0.8668, + "step": 5490 + }, + { + "epoch": 0.08828392109022616, + "grad_norm": 0.6396322250366211, + "learning_rate": 4.976166002334367e-05, + "loss": 0.7534, + "step": 5500 + }, + { + "epoch": 0.08844443731039021, + "grad_norm": 0.697437047958374, + "learning_rate": 4.9760790766857666e-05, + "loss": 0.8802, + "step": 5510 + }, + { + "epoch": 0.08860495353055427, + "grad_norm": 0.9053968191146851, + "learning_rate": 4.975991993573072e-05, + "loss": 0.9082, + "step": 5520 + }, + { + "epoch": 0.0887654697507183, + "grad_norm": 0.630977988243103, + "learning_rate": 4.975904753001818e-05, + "loss": 0.928, + "step": 5530 + }, + { + "epoch": 0.08892598597088236, + "grad_norm": 0.6082474589347839, + "learning_rate": 4.975817354977553e-05, + "loss": 0.6979, + "step": 5540 + }, + { + "epoch": 0.0890865021910464, + "grad_norm": 0.648831844329834, + "learning_rate": 4.975729799505837e-05, + "loss": 0.7395, + "step": 5550 + }, + { + "epoch": 0.08924701841121045, + "grad_norm": 0.6335418820381165, + "learning_rate": 4.975642086592236e-05, + "loss": 0.9219, + "step": 5560 + }, + { + "epoch": 0.0894075346313745, + "grad_norm": 0.7699565291404724, + "learning_rate": 4.9755542162423294e-05, + "loss": 0.7935, + "step": 5570 + }, + { + "epoch": 0.08956805085153854, + "grad_norm": 0.8067440986633301, + "learning_rate": 4.9754661884617046e-05, + "loss": 0.7491, + "step": 5580 + }, + { + "epoch": 0.0897285670717026, + "grad_norm": 0.6367438435554504, + "learning_rate": 4.9753780032559604e-05, + "loss": 0.8713, + "step": 5590 + }, + { + "epoch": 0.08988908329186664, + "grad_norm": 0.7243168354034424, + "learning_rate": 4.975289660630703e-05, + "loss": 0.8379, + "step": 5600 + }, + { + "epoch": 0.09004959951203069, + "grad_norm": 0.54554682970047, + "learning_rate": 4.975201160591553e-05, + "loss": 0.8019, + "step": 5610 + }, + { + "epoch": 0.09021011573219474, + "grad_norm": 0.6722500920295715, + "learning_rate": 4.975112503144136e-05, + "loss": 0.7702, + "step": 5620 + }, + { + "epoch": 0.09037063195235878, + "grad_norm": 0.5174979567527771, + "learning_rate": 4.975023688294091e-05, + "loss": 0.8352, + "step": 5630 + }, + { + "epoch": 0.09053114817252284, + "grad_norm": 0.7191705703735352, + "learning_rate": 4.974934716047067e-05, + "loss": 0.8505, + "step": 5640 + }, + { + "epoch": 0.09069166439268687, + "grad_norm": 0.4101858139038086, + "learning_rate": 4.974845586408722e-05, + "loss": 0.827, + "step": 5650 + }, + { + "epoch": 0.09085218061285093, + "grad_norm": 0.5754759907722473, + "learning_rate": 4.9747562993847224e-05, + "loss": 0.8762, + "step": 5660 + }, + { + "epoch": 0.09101269683301498, + "grad_norm": 0.6240162253379822, + "learning_rate": 4.974666854980748e-05, + "loss": 0.891, + "step": 5670 + }, + { + "epoch": 0.09117321305317902, + "grad_norm": 0.6000036001205444, + "learning_rate": 4.974577253202487e-05, + "loss": 0.9223, + "step": 5680 + }, + { + "epoch": 0.09133372927334307, + "grad_norm": 0.559847891330719, + "learning_rate": 4.9744874940556356e-05, + "loss": 0.7893, + "step": 5690 + }, + { + "epoch": 0.09149424549350711, + "grad_norm": 1.095662236213684, + "learning_rate": 4.9743975775459044e-05, + "loss": 0.8646, + "step": 5700 + }, + { + "epoch": 0.09165476171367117, + "grad_norm": 0.4484492242336273, + "learning_rate": 4.97430750367901e-05, + "loss": 0.7727, + "step": 5710 + }, + { + "epoch": 0.09181527793383522, + "grad_norm": 0.5171186327934265, + "learning_rate": 4.974217272460682e-05, + "loss": 0.8204, + "step": 5720 + }, + { + "epoch": 0.09197579415399926, + "grad_norm": 0.6421565413475037, + "learning_rate": 4.974126883896657e-05, + "loss": 0.8024, + "step": 5730 + }, + { + "epoch": 0.09213631037416331, + "grad_norm": 0.7157865166664124, + "learning_rate": 4.974036337992684e-05, + "loss": 0.7474, + "step": 5740 + }, + { + "epoch": 0.09229682659432736, + "grad_norm": 0.5040503740310669, + "learning_rate": 4.973945634754521e-05, + "loss": 0.7859, + "step": 5750 + }, + { + "epoch": 0.0924573428144914, + "grad_norm": 0.6482319831848145, + "learning_rate": 4.973854774187936e-05, + "loss": 0.8166, + "step": 5760 + }, + { + "epoch": 0.09261785903465546, + "grad_norm": 0.5509939193725586, + "learning_rate": 4.973763756298707e-05, + "loss": 0.9123, + "step": 5770 + }, + { + "epoch": 0.0927783752548195, + "grad_norm": 0.4647476375102997, + "learning_rate": 4.973672581092623e-05, + "loss": 0.8205, + "step": 5780 + }, + { + "epoch": 0.09293889147498355, + "grad_norm": 0.6051394939422607, + "learning_rate": 4.973581248575482e-05, + "loss": 0.8551, + "step": 5790 + }, + { + "epoch": 0.0930994076951476, + "grad_norm": 0.5965158343315125, + "learning_rate": 4.973489758753092e-05, + "loss": 0.8347, + "step": 5800 + }, + { + "epoch": 0.09325992391531164, + "grad_norm": 0.6225267052650452, + "learning_rate": 4.973398111631271e-05, + "loss": 0.8196, + "step": 5810 + }, + { + "epoch": 0.0934204401354757, + "grad_norm": 0.49813467264175415, + "learning_rate": 4.973306307215848e-05, + "loss": 0.728, + "step": 5820 + }, + { + "epoch": 0.09358095635563973, + "grad_norm": 1.08168363571167, + "learning_rate": 4.973214345512661e-05, + "loss": 0.7016, + "step": 5830 + }, + { + "epoch": 0.09374147257580379, + "grad_norm": 0.826213538646698, + "learning_rate": 4.973122226527557e-05, + "loss": 0.9008, + "step": 5840 + }, + { + "epoch": 0.09390198879596784, + "grad_norm": 0.6427194476127625, + "learning_rate": 4.9730299502663955e-05, + "loss": 0.7945, + "step": 5850 + }, + { + "epoch": 0.09406250501613188, + "grad_norm": 0.7080214619636536, + "learning_rate": 4.9729375167350444e-05, + "loss": 0.8577, + "step": 5860 + }, + { + "epoch": 0.09422302123629593, + "grad_norm": 0.4240402579307556, + "learning_rate": 4.9728449259393815e-05, + "loss": 0.7914, + "step": 5870 + }, + { + "epoch": 0.09438353745645997, + "grad_norm": 0.5544884204864502, + "learning_rate": 4.972752177885296e-05, + "loss": 0.7989, + "step": 5880 + }, + { + "epoch": 0.09454405367662402, + "grad_norm": 0.793300449848175, + "learning_rate": 4.972659272578685e-05, + "loss": 0.7958, + "step": 5890 + }, + { + "epoch": 0.09470456989678808, + "grad_norm": 0.6919771432876587, + "learning_rate": 4.9725662100254576e-05, + "loss": 0.7577, + "step": 5900 + }, + { + "epoch": 0.09486508611695212, + "grad_norm": 1.2173265218734741, + "learning_rate": 4.9724729902315326e-05, + "loss": 0.8421, + "step": 5910 + }, + { + "epoch": 0.09502560233711617, + "grad_norm": 1.1644525527954102, + "learning_rate": 4.972379613202837e-05, + "loss": 0.8188, + "step": 5920 + }, + { + "epoch": 0.09518611855728021, + "grad_norm": 0.6745923161506653, + "learning_rate": 4.9722860789453086e-05, + "loss": 0.8061, + "step": 5930 + }, + { + "epoch": 0.09534663477744426, + "grad_norm": 0.8278532028198242, + "learning_rate": 4.9721923874648964e-05, + "loss": 0.9739, + "step": 5940 + }, + { + "epoch": 0.09550715099760831, + "grad_norm": 0.5676952004432678, + "learning_rate": 4.972098538767559e-05, + "loss": 0.7616, + "step": 5950 + }, + { + "epoch": 0.09566766721777235, + "grad_norm": 0.9391337633132935, + "learning_rate": 4.9720045328592644e-05, + "loss": 0.7889, + "step": 5960 + }, + { + "epoch": 0.0958281834379364, + "grad_norm": 1.2207417488098145, + "learning_rate": 4.971910369745991e-05, + "loss": 0.7806, + "step": 5970 + }, + { + "epoch": 0.09598869965810045, + "grad_norm": 1.053891897201538, + "learning_rate": 4.971816049433726e-05, + "loss": 0.784, + "step": 5980 + }, + { + "epoch": 0.0961492158782645, + "grad_norm": 0.8357020616531372, + "learning_rate": 4.9717215719284685e-05, + "loss": 0.8036, + "step": 5990 + }, + { + "epoch": 0.09630973209842855, + "grad_norm": 0.7009357213973999, + "learning_rate": 4.971626937236227e-05, + "loss": 0.8148, + "step": 6000 + }, + { + "epoch": 0.09647024831859259, + "grad_norm": 0.6102061867713928, + "learning_rate": 4.9715321453630195e-05, + "loss": 0.8692, + "step": 6010 + }, + { + "epoch": 0.09663076453875664, + "grad_norm": 0.6960811614990234, + "learning_rate": 4.971437196314873e-05, + "loss": 0.8336, + "step": 6020 + }, + { + "epoch": 0.09679128075892068, + "grad_norm": 0.4518715739250183, + "learning_rate": 4.971342090097827e-05, + "loss": 0.8842, + "step": 6030 + }, + { + "epoch": 0.09695179697908474, + "grad_norm": 0.7107439041137695, + "learning_rate": 4.97124682671793e-05, + "loss": 0.7806, + "step": 6040 + }, + { + "epoch": 0.09711231319924879, + "grad_norm": 0.5301147699356079, + "learning_rate": 4.9711514061812383e-05, + "loss": 0.7385, + "step": 6050 + }, + { + "epoch": 0.09727282941941283, + "grad_norm": 1.1728076934814453, + "learning_rate": 4.971055828493823e-05, + "loss": 0.8618, + "step": 6060 + }, + { + "epoch": 0.09743334563957688, + "grad_norm": 0.48140501976013184, + "learning_rate": 4.9709600936617594e-05, + "loss": 0.8255, + "step": 6070 + }, + { + "epoch": 0.09759386185974092, + "grad_norm": 0.4912091791629791, + "learning_rate": 4.970864201691138e-05, + "loss": 0.9792, + "step": 6080 + }, + { + "epoch": 0.09775437807990497, + "grad_norm": 0.7042690515518188, + "learning_rate": 4.970768152588055e-05, + "loss": 0.7714, + "step": 6090 + }, + { + "epoch": 0.09791489430006903, + "grad_norm": 0.6503696441650391, + "learning_rate": 4.9706719463586196e-05, + "loss": 0.8294, + "step": 6100 + }, + { + "epoch": 0.09807541052023307, + "grad_norm": 0.7590261101722717, + "learning_rate": 4.9705755830089494e-05, + "loss": 0.845, + "step": 6110 + }, + { + "epoch": 0.09823592674039712, + "grad_norm": 0.6959348320960999, + "learning_rate": 4.9704790625451735e-05, + "loss": 0.808, + "step": 6120 + }, + { + "epoch": 0.09839644296056116, + "grad_norm": 0.5665030479431152, + "learning_rate": 4.970382384973429e-05, + "loss": 0.8163, + "step": 6130 + }, + { + "epoch": 0.09855695918072521, + "grad_norm": 0.8008496165275574, + "learning_rate": 4.970285550299865e-05, + "loss": 0.7664, + "step": 6140 + }, + { + "epoch": 0.09871747540088927, + "grad_norm": 0.49030375480651855, + "learning_rate": 4.9701885585306385e-05, + "loss": 0.8843, + "step": 6150 + }, + { + "epoch": 0.0988779916210533, + "grad_norm": 0.6074206233024597, + "learning_rate": 4.9700914096719184e-05, + "loss": 0.9292, + "step": 6160 + }, + { + "epoch": 0.09903850784121736, + "grad_norm": 0.4551931917667389, + "learning_rate": 4.969994103729884e-05, + "loss": 0.8638, + "step": 6170 + }, + { + "epoch": 0.0991990240613814, + "grad_norm": 0.711082935333252, + "learning_rate": 4.96989664071072e-05, + "loss": 0.7177, + "step": 6180 + }, + { + "epoch": 0.09935954028154545, + "grad_norm": 1.3119475841522217, + "learning_rate": 4.969799020620627e-05, + "loss": 0.7595, + "step": 6190 + }, + { + "epoch": 0.0995200565017095, + "grad_norm": 0.6023959517478943, + "learning_rate": 4.969701243465813e-05, + "loss": 0.8263, + "step": 6200 + }, + { + "epoch": 0.09968057272187354, + "grad_norm": 0.627305805683136, + "learning_rate": 4.9696033092524956e-05, + "loss": 0.8064, + "step": 6210 + }, + { + "epoch": 0.0998410889420376, + "grad_norm": 0.7188780307769775, + "learning_rate": 4.9695052179869025e-05, + "loss": 0.7923, + "step": 6220 + }, + { + "epoch": 0.10000160516220163, + "grad_norm": 0.7978658080101013, + "learning_rate": 4.969406969675272e-05, + "loss": 0.8504, + "step": 6230 + }, + { + "epoch": 0.10016212138236569, + "grad_norm": 0.6654890775680542, + "learning_rate": 4.969308564323853e-05, + "loss": 0.7825, + "step": 6240 + }, + { + "epoch": 0.10032263760252974, + "grad_norm": 0.6158043742179871, + "learning_rate": 4.969210001938902e-05, + "loss": 0.6775, + "step": 6250 + }, + { + "epoch": 0.10048315382269378, + "grad_norm": 0.7158005237579346, + "learning_rate": 4.969111282526687e-05, + "loss": 0.787, + "step": 6260 + }, + { + "epoch": 0.10064367004285783, + "grad_norm": 0.6232050061225891, + "learning_rate": 4.969012406093488e-05, + "loss": 0.7805, + "step": 6270 + }, + { + "epoch": 0.10080418626302187, + "grad_norm": 0.5896686911582947, + "learning_rate": 4.9689133726455905e-05, + "loss": 0.7446, + "step": 6280 + }, + { + "epoch": 0.10096470248318593, + "grad_norm": 0.9042745232582092, + "learning_rate": 4.968814182189294e-05, + "loss": 0.7741, + "step": 6290 + }, + { + "epoch": 0.10112521870334998, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.968714834730907e-05, + "loss": 0.8896, + "step": 6300 + }, + { + "epoch": 0.10128573492351402, + "grad_norm": 1.1120280027389526, + "learning_rate": 4.9686153302767445e-05, + "loss": 0.8711, + "step": 6310 + }, + { + "epoch": 0.10144625114367807, + "grad_norm": 0.7423319816589355, + "learning_rate": 4.968515668833137e-05, + "loss": 0.7748, + "step": 6320 + }, + { + "epoch": 0.10160676736384211, + "grad_norm": 0.5252354741096497, + "learning_rate": 4.9684158504064224e-05, + "loss": 0.812, + "step": 6330 + }, + { + "epoch": 0.10176728358400616, + "grad_norm": 0.643298864364624, + "learning_rate": 4.968315875002947e-05, + "loss": 0.9017, + "step": 6340 + }, + { + "epoch": 0.10192779980417022, + "grad_norm": 0.5801575779914856, + "learning_rate": 4.9682157426290695e-05, + "loss": 0.8635, + "step": 6350 + }, + { + "epoch": 0.10208831602433426, + "grad_norm": 0.5492390394210815, + "learning_rate": 4.968115453291159e-05, + "loss": 0.8682, + "step": 6360 + }, + { + "epoch": 0.10224883224449831, + "grad_norm": 0.7266674637794495, + "learning_rate": 4.968015006995591e-05, + "loss": 0.8438, + "step": 6370 + }, + { + "epoch": 0.10240934846466235, + "grad_norm": 0.8347406387329102, + "learning_rate": 4.967914403748754e-05, + "loss": 0.6859, + "step": 6380 + }, + { + "epoch": 0.1025698646848264, + "grad_norm": 0.5172735452651978, + "learning_rate": 4.9678136435570464e-05, + "loss": 0.8719, + "step": 6390 + }, + { + "epoch": 0.10273038090499045, + "grad_norm": 0.736014723777771, + "learning_rate": 4.967712726426876e-05, + "loss": 0.7821, + "step": 6400 + }, + { + "epoch": 0.10289089712515449, + "grad_norm": 0.7579060792922974, + "learning_rate": 4.96761165236466e-05, + "loss": 0.7992, + "step": 6410 + }, + { + "epoch": 0.10305141334531855, + "grad_norm": 0.6723949313163757, + "learning_rate": 4.9675104213768265e-05, + "loss": 0.6671, + "step": 6420 + }, + { + "epoch": 0.10321192956548259, + "grad_norm": 0.6743065714836121, + "learning_rate": 4.967409033469813e-05, + "loss": 0.8176, + "step": 6430 + }, + { + "epoch": 0.10337244578564664, + "grad_norm": 0.7978116273880005, + "learning_rate": 4.967307488650067e-05, + "loss": 0.7697, + "step": 6440 + }, + { + "epoch": 0.10353296200581069, + "grad_norm": 0.622351348400116, + "learning_rate": 4.967205786924046e-05, + "loss": 0.8451, + "step": 6450 + }, + { + "epoch": 0.10369347822597473, + "grad_norm": 1.2404674291610718, + "learning_rate": 4.967103928298219e-05, + "loss": 0.8831, + "step": 6460 + }, + { + "epoch": 0.10385399444613878, + "grad_norm": 1.0042359828948975, + "learning_rate": 4.967001912779061e-05, + "loss": 0.7824, + "step": 6470 + }, + { + "epoch": 0.10401451066630284, + "grad_norm": 0.7587305307388306, + "learning_rate": 4.9668997403730624e-05, + "loss": 0.8869, + "step": 6480 + }, + { + "epoch": 0.10417502688646688, + "grad_norm": 0.491556316614151, + "learning_rate": 4.966797411086719e-05, + "loss": 0.8259, + "step": 6490 + }, + { + "epoch": 0.10433554310663093, + "grad_norm": 0.7956057190895081, + "learning_rate": 4.9666949249265396e-05, + "loss": 0.8685, + "step": 6500 + }, + { + "epoch": 0.10449605932679497, + "grad_norm": 1.0975209474563599, + "learning_rate": 4.9665922818990405e-05, + "loss": 0.7238, + "step": 6510 + }, + { + "epoch": 0.10465657554695902, + "grad_norm": 0.5236459374427795, + "learning_rate": 4.96648948201075e-05, + "loss": 0.7522, + "step": 6520 + }, + { + "epoch": 0.10481709176712307, + "grad_norm": 0.8335128426551819, + "learning_rate": 4.9663865252682046e-05, + "loss": 0.8378, + "step": 6530 + }, + { + "epoch": 0.10497760798728711, + "grad_norm": 0.5251429677009583, + "learning_rate": 4.9662834116779526e-05, + "loss": 0.8976, + "step": 6540 + }, + { + "epoch": 0.10513812420745117, + "grad_norm": 0.39923354983329773, + "learning_rate": 4.9661801412465515e-05, + "loss": 0.7441, + "step": 6550 + }, + { + "epoch": 0.1052986404276152, + "grad_norm": 0.5642669200897217, + "learning_rate": 4.966076713980569e-05, + "loss": 0.8824, + "step": 6560 + }, + { + "epoch": 0.10545915664777926, + "grad_norm": 0.5910675525665283, + "learning_rate": 4.965973129886581e-05, + "loss": 0.6561, + "step": 6570 + }, + { + "epoch": 0.10561967286794331, + "grad_norm": 0.6751990914344788, + "learning_rate": 4.965869388971176e-05, + "loss": 0.8171, + "step": 6580 + }, + { + "epoch": 0.10578018908810735, + "grad_norm": 0.9761715531349182, + "learning_rate": 4.965765491240951e-05, + "loss": 0.8425, + "step": 6590 + }, + { + "epoch": 0.1059407053082714, + "grad_norm": 0.9147478938102722, + "learning_rate": 4.9656614367025136e-05, + "loss": 0.8868, + "step": 6600 + }, + { + "epoch": 0.10610122152843544, + "grad_norm": 0.9975594282150269, + "learning_rate": 4.96555722536248e-05, + "loss": 0.8485, + "step": 6610 + }, + { + "epoch": 0.1062617377485995, + "grad_norm": 0.7568285465240479, + "learning_rate": 4.965452857227479e-05, + "loss": 0.9683, + "step": 6620 + }, + { + "epoch": 0.10642225396876355, + "grad_norm": 0.7107162475585938, + "learning_rate": 4.965348332304147e-05, + "loss": 0.8755, + "step": 6630 + }, + { + "epoch": 0.10658277018892759, + "grad_norm": 0.464067667722702, + "learning_rate": 4.9652436505991304e-05, + "loss": 0.741, + "step": 6640 + }, + { + "epoch": 0.10674328640909164, + "grad_norm": 0.6330915689468384, + "learning_rate": 4.9651388121190876e-05, + "loss": 0.8675, + "step": 6650 + }, + { + "epoch": 0.10690380262925568, + "grad_norm": 0.8672825694084167, + "learning_rate": 4.965033816870686e-05, + "loss": 0.7488, + "step": 6660 + }, + { + "epoch": 0.10706431884941973, + "grad_norm": 0.9625267386436462, + "learning_rate": 4.964928664860601e-05, + "loss": 0.754, + "step": 6670 + }, + { + "epoch": 0.10722483506958379, + "grad_norm": 0.8684001564979553, + "learning_rate": 4.9648233560955207e-05, + "loss": 0.9072, + "step": 6680 + }, + { + "epoch": 0.10738535128974783, + "grad_norm": 0.7753514051437378, + "learning_rate": 4.9647178905821426e-05, + "loss": 0.865, + "step": 6690 + }, + { + "epoch": 0.10754586750991188, + "grad_norm": 0.5667429566383362, + "learning_rate": 4.964612268327172e-05, + "loss": 0.853, + "step": 6700 + }, + { + "epoch": 0.10770638373007592, + "grad_norm": 0.5255639553070068, + "learning_rate": 4.9645064893373285e-05, + "loss": 0.7515, + "step": 6710 + }, + { + "epoch": 0.10786689995023997, + "grad_norm": 0.6378414034843445, + "learning_rate": 4.964400553619336e-05, + "loss": 0.8821, + "step": 6720 + }, + { + "epoch": 0.10802741617040403, + "grad_norm": 0.9916431307792664, + "learning_rate": 4.9642944611799334e-05, + "loss": 0.8417, + "step": 6730 + }, + { + "epoch": 0.10818793239056806, + "grad_norm": 0.5597088932991028, + "learning_rate": 4.9641882120258676e-05, + "loss": 0.7632, + "step": 6740 + }, + { + "epoch": 0.10834844861073212, + "grad_norm": 1.023353099822998, + "learning_rate": 4.9640818061638937e-05, + "loss": 0.9756, + "step": 6750 + }, + { + "epoch": 0.10850896483089616, + "grad_norm": 0.5876442790031433, + "learning_rate": 4.9639752436007805e-05, + "loss": 0.8349, + "step": 6760 + }, + { + "epoch": 0.10866948105106021, + "grad_norm": 0.6906800866127014, + "learning_rate": 4.9638685243433036e-05, + "loss": 0.7867, + "step": 6770 + }, + { + "epoch": 0.10882999727122426, + "grad_norm": 0.4739511013031006, + "learning_rate": 4.9637616483982494e-05, + "loss": 0.7642, + "step": 6780 + }, + { + "epoch": 0.1089905134913883, + "grad_norm": 0.8069915175437927, + "learning_rate": 4.963654615772416e-05, + "loss": 0.9303, + "step": 6790 + }, + { + "epoch": 0.10915102971155236, + "grad_norm": 0.40920764207839966, + "learning_rate": 4.963547426472609e-05, + "loss": 0.7689, + "step": 6800 + }, + { + "epoch": 0.1093115459317164, + "grad_norm": 0.4951220452785492, + "learning_rate": 4.9634400805056455e-05, + "loss": 0.893, + "step": 6810 + }, + { + "epoch": 0.10947206215188045, + "grad_norm": 0.465627521276474, + "learning_rate": 4.963332577878351e-05, + "loss": 0.8563, + "step": 6820 + }, + { + "epoch": 0.1096325783720445, + "grad_norm": 0.5285079479217529, + "learning_rate": 4.9632249185975634e-05, + "loss": 0.7752, + "step": 6830 + }, + { + "epoch": 0.10979309459220854, + "grad_norm": 0.5838503241539001, + "learning_rate": 4.9631171026701285e-05, + "loss": 0.8414, + "step": 6840 + }, + { + "epoch": 0.10995361081237259, + "grad_norm": 0.5998960733413696, + "learning_rate": 4.963009130102904e-05, + "loss": 0.809, + "step": 6850 + }, + { + "epoch": 0.11011412703253663, + "grad_norm": 0.5242504477500916, + "learning_rate": 4.962901000902753e-05, + "loss": 0.7574, + "step": 6860 + }, + { + "epoch": 0.11027464325270069, + "grad_norm": 0.5347145795822144, + "learning_rate": 4.9627927150765556e-05, + "loss": 0.8801, + "step": 6870 + }, + { + "epoch": 0.11043515947286474, + "grad_norm": 0.46791011095046997, + "learning_rate": 4.962684272631196e-05, + "loss": 0.7834, + "step": 6880 + }, + { + "epoch": 0.11059567569302878, + "grad_norm": 0.8309893608093262, + "learning_rate": 4.962575673573572e-05, + "loss": 0.7776, + "step": 6890 + }, + { + "epoch": 0.11075619191319283, + "grad_norm": 0.6618214845657349, + "learning_rate": 4.9624669179105884e-05, + "loss": 0.8557, + "step": 6900 + }, + { + "epoch": 0.11091670813335687, + "grad_norm": 0.7419338226318359, + "learning_rate": 4.9623580056491625e-05, + "loss": 0.8806, + "step": 6910 + }, + { + "epoch": 0.11107722435352092, + "grad_norm": 0.5460858941078186, + "learning_rate": 4.9622489367962196e-05, + "loss": 0.8375, + "step": 6920 + }, + { + "epoch": 0.11123774057368498, + "grad_norm": 6.020754814147949, + "learning_rate": 4.962139711358696e-05, + "loss": 0.7746, + "step": 6930 + }, + { + "epoch": 0.11139825679384902, + "grad_norm": 0.8417242765426636, + "learning_rate": 4.9620303293435386e-05, + "loss": 0.7509, + "step": 6940 + }, + { + "epoch": 0.11155877301401307, + "grad_norm": 0.6862554550170898, + "learning_rate": 4.9619207907577026e-05, + "loss": 0.8583, + "step": 6950 + }, + { + "epoch": 0.11171928923417711, + "grad_norm": 0.6542831063270569, + "learning_rate": 4.961811095608154e-05, + "loss": 0.8191, + "step": 6960 + }, + { + "epoch": 0.11187980545434116, + "grad_norm": 0.529766321182251, + "learning_rate": 4.96170124390187e-05, + "loss": 0.8648, + "step": 6970 + }, + { + "epoch": 0.11204032167450521, + "grad_norm": 0.6700744032859802, + "learning_rate": 4.961591235645835e-05, + "loss": 0.8076, + "step": 6980 + }, + { + "epoch": 0.11220083789466925, + "grad_norm": 0.5787862539291382, + "learning_rate": 4.961481070847045e-05, + "loss": 0.8717, + "step": 6990 + }, + { + "epoch": 0.1123613541148333, + "grad_norm": 0.5088741183280945, + "learning_rate": 4.9613707495125074e-05, + "loss": 0.8801, + "step": 7000 + }, + { + "epoch": 0.11252187033499735, + "grad_norm": 0.6575955748558044, + "learning_rate": 4.961260271649236e-05, + "loss": 0.8007, + "step": 7010 + }, + { + "epoch": 0.1126823865551614, + "grad_norm": 2.0116429328918457, + "learning_rate": 4.961149637264258e-05, + "loss": 0.7869, + "step": 7020 + }, + { + "epoch": 0.11284290277532545, + "grad_norm": 0.8368249535560608, + "learning_rate": 4.9610388463646085e-05, + "loss": 0.8903, + "step": 7030 + }, + { + "epoch": 0.11300341899548949, + "grad_norm": 0.6007033586502075, + "learning_rate": 4.9609278989573325e-05, + "loss": 0.8172, + "step": 7040 + }, + { + "epoch": 0.11316393521565354, + "grad_norm": 0.9996740818023682, + "learning_rate": 4.960816795049488e-05, + "loss": 0.8889, + "step": 7050 + }, + { + "epoch": 0.11332445143581758, + "grad_norm": 0.8898464441299438, + "learning_rate": 4.9607055346481366e-05, + "loss": 0.8362, + "step": 7060 + }, + { + "epoch": 0.11348496765598164, + "grad_norm": 0.7738519310951233, + "learning_rate": 4.9605941177603576e-05, + "loss": 0.8781, + "step": 7070 + }, + { + "epoch": 0.11364548387614569, + "grad_norm": 0.6867246627807617, + "learning_rate": 4.9604825443932344e-05, + "loss": 0.7854, + "step": 7080 + }, + { + "epoch": 0.11380600009630973, + "grad_norm": 0.4578961133956909, + "learning_rate": 4.960370814553863e-05, + "loss": 0.8011, + "step": 7090 + }, + { + "epoch": 0.11396651631647378, + "grad_norm": 0.4565252661705017, + "learning_rate": 4.960258928249349e-05, + "loss": 0.7852, + "step": 7100 + }, + { + "epoch": 0.11412703253663782, + "grad_norm": 0.7755783796310425, + "learning_rate": 4.960146885486807e-05, + "loss": 1.0006, + "step": 7110 + }, + { + "epoch": 0.11428754875680187, + "grad_norm": 0.7860531210899353, + "learning_rate": 4.960034686273363e-05, + "loss": 0.9579, + "step": 7120 + }, + { + "epoch": 0.11444806497696593, + "grad_norm": 0.7670453190803528, + "learning_rate": 4.959922330616152e-05, + "loss": 0.8708, + "step": 7130 + }, + { + "epoch": 0.11460858119712997, + "grad_norm": 0.6403518915176392, + "learning_rate": 4.959809818522319e-05, + "loss": 0.7281, + "step": 7140 + }, + { + "epoch": 0.11476909741729402, + "grad_norm": 0.7091251611709595, + "learning_rate": 4.9596971499990186e-05, + "loss": 0.7909, + "step": 7150 + }, + { + "epoch": 0.11492961363745806, + "grad_norm": 0.6626346707344055, + "learning_rate": 4.9595843250534166e-05, + "loss": 0.8842, + "step": 7160 + }, + { + "epoch": 0.11509012985762211, + "grad_norm": 0.9913311004638672, + "learning_rate": 4.9594713436926886e-05, + "loss": 0.8556, + "step": 7170 + }, + { + "epoch": 0.11525064607778616, + "grad_norm": 0.6676866412162781, + "learning_rate": 4.9593582059240186e-05, + "loss": 0.733, + "step": 7180 + }, + { + "epoch": 0.1154111622979502, + "grad_norm": 0.9559468030929565, + "learning_rate": 4.959244911754601e-05, + "loss": 0.769, + "step": 7190 + }, + { + "epoch": 0.11557167851811426, + "grad_norm": 1.2284274101257324, + "learning_rate": 4.959131461191642e-05, + "loss": 0.7624, + "step": 7200 + }, + { + "epoch": 0.1157321947382783, + "grad_norm": 0.6036985516548157, + "learning_rate": 4.9590178542423546e-05, + "loss": 0.7394, + "step": 7210 + }, + { + "epoch": 0.11589271095844235, + "grad_norm": 0.6309494376182556, + "learning_rate": 4.958904090913966e-05, + "loss": 0.7821, + "step": 7220 + }, + { + "epoch": 0.1160532271786064, + "grad_norm": 0.9127384424209595, + "learning_rate": 4.9587901712137085e-05, + "loss": 0.8124, + "step": 7230 + }, + { + "epoch": 0.11621374339877044, + "grad_norm": 0.6562053561210632, + "learning_rate": 4.958676095148828e-05, + "loss": 0.7019, + "step": 7240 + }, + { + "epoch": 0.1163742596189345, + "grad_norm": 0.43222248554229736, + "learning_rate": 4.958561862726579e-05, + "loss": 0.77, + "step": 7250 + }, + { + "epoch": 0.11653477583909855, + "grad_norm": 0.5966606736183167, + "learning_rate": 4.958447473954226e-05, + "loss": 0.7556, + "step": 7260 + }, + { + "epoch": 0.11669529205926259, + "grad_norm": 0.6489827036857605, + "learning_rate": 4.958332928839043e-05, + "loss": 0.8131, + "step": 7270 + }, + { + "epoch": 0.11685580827942664, + "grad_norm": 0.6865770816802979, + "learning_rate": 4.958218227388315e-05, + "loss": 0.7823, + "step": 7280 + }, + { + "epoch": 0.11701632449959068, + "grad_norm": 0.5792673230171204, + "learning_rate": 4.958103369609335e-05, + "loss": 0.7888, + "step": 7290 + }, + { + "epoch": 0.11717684071975473, + "grad_norm": 0.6628190875053406, + "learning_rate": 4.9579883555094094e-05, + "loss": 0.8084, + "step": 7300 + }, + { + "epoch": 0.11733735693991879, + "grad_norm": 0.7477450370788574, + "learning_rate": 4.957873185095851e-05, + "loss": 0.8379, + "step": 7310 + }, + { + "epoch": 0.11749787316008282, + "grad_norm": 0.5744036436080933, + "learning_rate": 4.9577578583759845e-05, + "loss": 0.743, + "step": 7320 + }, + { + "epoch": 0.11765838938024688, + "grad_norm": 0.6430788040161133, + "learning_rate": 4.957642375357143e-05, + "loss": 0.7225, + "step": 7330 + }, + { + "epoch": 0.11781890560041092, + "grad_norm": 0.5719828605651855, + "learning_rate": 4.957526736046672e-05, + "loss": 0.7624, + "step": 7340 + }, + { + "epoch": 0.11797942182057497, + "grad_norm": 1.268825650215149, + "learning_rate": 4.957410940451924e-05, + "loss": 0.7908, + "step": 7350 + }, + { + "epoch": 0.11813993804073902, + "grad_norm": 0.8671867251396179, + "learning_rate": 4.957294988580265e-05, + "loss": 0.8955, + "step": 7360 + }, + { + "epoch": 0.11830045426090306, + "grad_norm": 0.5530576705932617, + "learning_rate": 4.9571788804390665e-05, + "loss": 0.7048, + "step": 7370 + }, + { + "epoch": 0.11846097048106712, + "grad_norm": 0.9712499380111694, + "learning_rate": 4.957062616035714e-05, + "loss": 0.8613, + "step": 7380 + }, + { + "epoch": 0.11862148670123115, + "grad_norm": 0.8646267056465149, + "learning_rate": 4.9569461953776006e-05, + "loss": 0.8367, + "step": 7390 + }, + { + "epoch": 0.11878200292139521, + "grad_norm": 0.8489744663238525, + "learning_rate": 4.95682961847213e-05, + "loss": 0.8421, + "step": 7400 + }, + { + "epoch": 0.11894251914155926, + "grad_norm": 0.5692564845085144, + "learning_rate": 4.9567128853267156e-05, + "loss": 0.7631, + "step": 7410 + }, + { + "epoch": 0.1191030353617233, + "grad_norm": 1.177336573600769, + "learning_rate": 4.956595995948781e-05, + "loss": 0.7596, + "step": 7420 + }, + { + "epoch": 0.11926355158188735, + "grad_norm": 0.6668830513954163, + "learning_rate": 4.95647895034576e-05, + "loss": 0.7524, + "step": 7430 + }, + { + "epoch": 0.11942406780205139, + "grad_norm": 0.9313139319419861, + "learning_rate": 4.956361748525096e-05, + "loss": 0.7942, + "step": 7440 + }, + { + "epoch": 0.11958458402221545, + "grad_norm": 0.4326212406158447, + "learning_rate": 4.956244390494241e-05, + "loss": 0.807, + "step": 7450 + }, + { + "epoch": 0.1197451002423795, + "grad_norm": 0.9447592496871948, + "learning_rate": 4.9561268762606614e-05, + "loss": 0.7367, + "step": 7460 + }, + { + "epoch": 0.11990561646254354, + "grad_norm": 0.5198068618774414, + "learning_rate": 4.956009205831827e-05, + "loss": 0.8138, + "step": 7470 + }, + { + "epoch": 0.12006613268270759, + "grad_norm": 0.5498432517051697, + "learning_rate": 4.9558913792152225e-05, + "loss": 0.9098, + "step": 7480 + }, + { + "epoch": 0.12022664890287163, + "grad_norm": 0.7200393676757812, + "learning_rate": 4.9557733964183416e-05, + "loss": 0.8539, + "step": 7490 + }, + { + "epoch": 0.12038716512303568, + "grad_norm": 0.5442637801170349, + "learning_rate": 4.9556552574486864e-05, + "loss": 0.9415, + "step": 7500 + }, + { + "epoch": 0.12054768134319974, + "grad_norm": 0.6471676826477051, + "learning_rate": 4.95553696231377e-05, + "loss": 0.7057, + "step": 7510 + }, + { + "epoch": 0.12070819756336378, + "grad_norm": 0.5909531712532043, + "learning_rate": 4.9554185110211156e-05, + "loss": 0.7605, + "step": 7520 + }, + { + "epoch": 0.12086871378352783, + "grad_norm": 0.5825837254524231, + "learning_rate": 4.9552999035782557e-05, + "loss": 0.9078, + "step": 7530 + }, + { + "epoch": 0.12102923000369187, + "grad_norm": 1.0324580669403076, + "learning_rate": 4.955181139992732e-05, + "loss": 0.8276, + "step": 7540 + }, + { + "epoch": 0.12118974622385592, + "grad_norm": 0.6641337871551514, + "learning_rate": 4.9550622202721e-05, + "loss": 0.8105, + "step": 7550 + }, + { + "epoch": 0.12135026244401997, + "grad_norm": 0.8113497495651245, + "learning_rate": 4.954943144423919e-05, + "loss": 0.9215, + "step": 7560 + }, + { + "epoch": 0.12151077866418401, + "grad_norm": 0.5558072924613953, + "learning_rate": 4.954823912455764e-05, + "loss": 0.7831, + "step": 7570 + }, + { + "epoch": 0.12167129488434807, + "grad_norm": 0.8189428448677063, + "learning_rate": 4.9547045243752164e-05, + "loss": 0.8238, + "step": 7580 + }, + { + "epoch": 0.1218318111045121, + "grad_norm": 0.4845101237297058, + "learning_rate": 4.954584980189868e-05, + "loss": 0.81, + "step": 7590 + }, + { + "epoch": 0.12199232732467616, + "grad_norm": 0.45706677436828613, + "learning_rate": 4.954465279907322e-05, + "loss": 0.9084, + "step": 7600 + }, + { + "epoch": 0.12215284354484021, + "grad_norm": 0.6302416920661926, + "learning_rate": 4.954345423535191e-05, + "loss": 0.7863, + "step": 7610 + }, + { + "epoch": 0.12231335976500425, + "grad_norm": 0.5536419153213501, + "learning_rate": 4.954225411081097e-05, + "loss": 0.8713, + "step": 7620 + }, + { + "epoch": 0.1224738759851683, + "grad_norm": 0.7035227417945862, + "learning_rate": 4.954105242552671e-05, + "loss": 0.7607, + "step": 7630 + }, + { + "epoch": 0.12263439220533234, + "grad_norm": 0.5038375854492188, + "learning_rate": 4.9539849179575556e-05, + "loss": 0.6914, + "step": 7640 + }, + { + "epoch": 0.1227949084254964, + "grad_norm": 0.7105484008789062, + "learning_rate": 4.953864437303403e-05, + "loss": 0.7551, + "step": 7650 + }, + { + "epoch": 0.12295542464566045, + "grad_norm": 0.4519463777542114, + "learning_rate": 4.9537438005978747e-05, + "loss": 0.8369, + "step": 7660 + }, + { + "epoch": 0.12311594086582449, + "grad_norm": 0.6141395568847656, + "learning_rate": 4.953623007848643e-05, + "loss": 0.819, + "step": 7670 + }, + { + "epoch": 0.12327645708598854, + "grad_norm": 0.8037229776382446, + "learning_rate": 4.953502059063389e-05, + "loss": 0.8371, + "step": 7680 + }, + { + "epoch": 0.12343697330615258, + "grad_norm": 0.6665588021278381, + "learning_rate": 4.953380954249805e-05, + "loss": 0.797, + "step": 7690 + }, + { + "epoch": 0.12359748952631663, + "grad_norm": 0.7837285995483398, + "learning_rate": 4.953259693415592e-05, + "loss": 0.7621, + "step": 7700 + }, + { + "epoch": 0.12375800574648069, + "grad_norm": 1.0198335647583008, + "learning_rate": 4.953138276568462e-05, + "loss": 0.8935, + "step": 7710 + }, + { + "epoch": 0.12391852196664473, + "grad_norm": 0.5702205300331116, + "learning_rate": 4.953016703716136e-05, + "loss": 0.7643, + "step": 7720 + }, + { + "epoch": 0.12407903818680878, + "grad_norm": 0.4490784704685211, + "learning_rate": 4.952894974866345e-05, + "loss": 0.8711, + "step": 7730 + }, + { + "epoch": 0.12423955440697282, + "grad_norm": 0.6235283017158508, + "learning_rate": 4.952773090026831e-05, + "loss": 0.8659, + "step": 7740 + }, + { + "epoch": 0.12440007062713687, + "grad_norm": 0.851807713508606, + "learning_rate": 4.952651049205345e-05, + "loss": 0.8215, + "step": 7750 + }, + { + "epoch": 0.12456058684730092, + "grad_norm": 0.46224525570869446, + "learning_rate": 4.952528852409647e-05, + "loss": 0.7995, + "step": 7760 + }, + { + "epoch": 0.12472110306746496, + "grad_norm": 0.6914575099945068, + "learning_rate": 4.9524064996475096e-05, + "loss": 0.8142, + "step": 7770 + }, + { + "epoch": 0.12488161928762902, + "grad_norm": 0.6312603950500488, + "learning_rate": 4.952283990926713e-05, + "loss": 0.7826, + "step": 7780 + }, + { + "epoch": 0.12504213550779306, + "grad_norm": 0.6183178424835205, + "learning_rate": 4.952161326255047e-05, + "loss": 0.7839, + "step": 7790 + }, + { + "epoch": 0.1252026517279571, + "grad_norm": 0.5130000710487366, + "learning_rate": 4.952038505640314e-05, + "loss": 0.7457, + "step": 7800 + }, + { + "epoch": 0.12536316794812116, + "grad_norm": 0.6203510761260986, + "learning_rate": 4.951915529090325e-05, + "loss": 0.6855, + "step": 7810 + }, + { + "epoch": 0.12552368416828522, + "grad_norm": 0.6549975275993347, + "learning_rate": 4.951792396612899e-05, + "loss": 0.9076, + "step": 7820 + }, + { + "epoch": 0.12568420038844924, + "grad_norm": 0.5156996846199036, + "learning_rate": 4.951669108215866e-05, + "loss": 0.8059, + "step": 7830 + }, + { + "epoch": 0.1258447166086133, + "grad_norm": 0.4865977466106415, + "learning_rate": 4.951545663907069e-05, + "loss": 0.812, + "step": 7840 + }, + { + "epoch": 0.12600523282877735, + "grad_norm": 1.1439883708953857, + "learning_rate": 4.9514220636943564e-05, + "loss": 0.8168, + "step": 7850 + }, + { + "epoch": 0.1261657490489414, + "grad_norm": 0.4886620044708252, + "learning_rate": 4.951298307585589e-05, + "loss": 0.8018, + "step": 7860 + }, + { + "epoch": 0.12632626526910545, + "grad_norm": 0.7361254096031189, + "learning_rate": 4.951174395588637e-05, + "loss": 0.8049, + "step": 7870 + }, + { + "epoch": 0.12648678148926948, + "grad_norm": 0.48358526825904846, + "learning_rate": 4.95105032771138e-05, + "loss": 0.7624, + "step": 7880 + }, + { + "epoch": 0.12664729770943353, + "grad_norm": 0.8150381445884705, + "learning_rate": 4.950926103961708e-05, + "loss": 0.7731, + "step": 7890 + }, + { + "epoch": 0.12680781392959758, + "grad_norm": 0.6734673380851746, + "learning_rate": 4.9508017243475224e-05, + "loss": 0.7511, + "step": 7900 + }, + { + "epoch": 0.12696833014976164, + "grad_norm": 0.5873427987098694, + "learning_rate": 4.950677188876731e-05, + "loss": 0.8667, + "step": 7910 + }, + { + "epoch": 0.1271288463699257, + "grad_norm": 0.6206739544868469, + "learning_rate": 4.950552497557255e-05, + "loss": 0.6918, + "step": 7920 + }, + { + "epoch": 0.12728936259008972, + "grad_norm": 0.562235951423645, + "learning_rate": 4.950427650397023e-05, + "loss": 0.7523, + "step": 7930 + }, + { + "epoch": 0.12744987881025377, + "grad_norm": 0.6368486285209656, + "learning_rate": 4.9503026474039754e-05, + "loss": 0.7719, + "step": 7940 + }, + { + "epoch": 0.12761039503041782, + "grad_norm": 0.4996802508831024, + "learning_rate": 4.950177488586061e-05, + "loss": 0.8486, + "step": 7950 + }, + { + "epoch": 0.12777091125058188, + "grad_norm": 1.0380438566207886, + "learning_rate": 4.9500521739512396e-05, + "loss": 0.8769, + "step": 7960 + }, + { + "epoch": 0.12793142747074593, + "grad_norm": 0.7048365473747253, + "learning_rate": 4.9499267035074806e-05, + "loss": 0.7938, + "step": 7970 + }, + { + "epoch": 0.12809194369090995, + "grad_norm": 0.731074869632721, + "learning_rate": 4.9498010772627615e-05, + "loss": 0.8938, + "step": 7980 + }, + { + "epoch": 0.128252459911074, + "grad_norm": 0.6032593250274658, + "learning_rate": 4.9496752952250735e-05, + "loss": 0.7387, + "step": 7990 + }, + { + "epoch": 0.12841297613123806, + "grad_norm": 0.6821911334991455, + "learning_rate": 4.9495493574024154e-05, + "loss": 0.7477, + "step": 8000 + }, + { + "epoch": 0.12841297613123806, + "eval_loss": 0.8223568797111511, + "eval_runtime": 1833.345, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 1.789, + "step": 8000 + }, + { + "epoch": 0.1285734923514021, + "grad_norm": 0.48409780859947205, + "learning_rate": 4.9494232638027946e-05, + "loss": 0.7542, + "step": 8010 + }, + { + "epoch": 0.12873400857156617, + "grad_norm": 0.6881629824638367, + "learning_rate": 4.949297014434232e-05, + "loss": 0.7527, + "step": 8020 + }, + { + "epoch": 0.1288945247917302, + "grad_norm": 0.7070799469947815, + "learning_rate": 4.949170609304754e-05, + "loss": 0.7466, + "step": 8030 + }, + { + "epoch": 0.12905504101189424, + "grad_norm": 0.799085795879364, + "learning_rate": 4.9490440484224015e-05, + "loss": 0.7916, + "step": 8040 + }, + { + "epoch": 0.1292155572320583, + "grad_norm": 0.4917910695075989, + "learning_rate": 4.9489173317952214e-05, + "loss": 0.8531, + "step": 8050 + }, + { + "epoch": 0.12937607345222235, + "grad_norm": 0.711161732673645, + "learning_rate": 4.948790459431273e-05, + "loss": 0.7816, + "step": 8060 + }, + { + "epoch": 0.1295365896723864, + "grad_norm": 1.0423327684402466, + "learning_rate": 4.948663431338624e-05, + "loss": 0.8393, + "step": 8070 + }, + { + "epoch": 0.12969710589255043, + "grad_norm": 0.4255164861679077, + "learning_rate": 4.9485362475253526e-05, + "loss": 0.7175, + "step": 8080 + }, + { + "epoch": 0.12985762211271448, + "grad_norm": 0.7532893419265747, + "learning_rate": 4.948408907999548e-05, + "loss": 0.7847, + "step": 8090 + }, + { + "epoch": 0.13001813833287854, + "grad_norm": 0.6510115265846252, + "learning_rate": 4.9482814127693075e-05, + "loss": 0.7911, + "step": 8100 + }, + { + "epoch": 0.1301786545530426, + "grad_norm": 0.69105464220047, + "learning_rate": 4.9481537618427386e-05, + "loss": 0.8096, + "step": 8110 + }, + { + "epoch": 0.13033917077320664, + "grad_norm": 1.1142642498016357, + "learning_rate": 4.94802595522796e-05, + "loss": 0.7563, + "step": 8120 + }, + { + "epoch": 0.13049968699337067, + "grad_norm": 0.6555029153823853, + "learning_rate": 4.9478979929330995e-05, + "loss": 0.7579, + "step": 8130 + }, + { + "epoch": 0.13066020321353472, + "grad_norm": 0.6786519289016724, + "learning_rate": 4.9477698749662936e-05, + "loss": 0.8066, + "step": 8140 + }, + { + "epoch": 0.13082071943369877, + "grad_norm": 0.46242934465408325, + "learning_rate": 4.9476416013356907e-05, + "loss": 0.8767, + "step": 8150 + }, + { + "epoch": 0.13098123565386283, + "grad_norm": 1.0882372856140137, + "learning_rate": 4.947513172049448e-05, + "loss": 0.8606, + "step": 8160 + }, + { + "epoch": 0.13114175187402688, + "grad_norm": 0.4720267653465271, + "learning_rate": 4.9473845871157344e-05, + "loss": 0.8438, + "step": 8170 + }, + { + "epoch": 0.1313022680941909, + "grad_norm": 0.5936036109924316, + "learning_rate": 4.947255846542724e-05, + "loss": 0.7812, + "step": 8180 + }, + { + "epoch": 0.13146278431435496, + "grad_norm": 0.6407374739646912, + "learning_rate": 4.947126950338607e-05, + "loss": 0.7683, + "step": 8190 + }, + { + "epoch": 0.131623300534519, + "grad_norm": 0.5871694087982178, + "learning_rate": 4.946997898511578e-05, + "loss": 0.8045, + "step": 8200 + }, + { + "epoch": 0.13178381675468306, + "grad_norm": 0.636341392993927, + "learning_rate": 4.9468686910698455e-05, + "loss": 0.8198, + "step": 8210 + }, + { + "epoch": 0.13194433297484712, + "grad_norm": 0.5445073246955872, + "learning_rate": 4.946739328021627e-05, + "loss": 0.7956, + "step": 8220 + }, + { + "epoch": 0.13210484919501114, + "grad_norm": 0.8113119602203369, + "learning_rate": 4.946609809375147e-05, + "loss": 0.7389, + "step": 8230 + }, + { + "epoch": 0.1322653654151752, + "grad_norm": 0.5548624396324158, + "learning_rate": 4.946480135138643e-05, + "loss": 0.7931, + "step": 8240 + }, + { + "epoch": 0.13242588163533925, + "grad_norm": 0.6181760430335999, + "learning_rate": 4.9463503053203617e-05, + "loss": 0.7858, + "step": 8250 + }, + { + "epoch": 0.1325863978555033, + "grad_norm": 0.4675312042236328, + "learning_rate": 4.9462203199285606e-05, + "loss": 0.7832, + "step": 8260 + }, + { + "epoch": 0.13274691407566735, + "grad_norm": 0.4073457717895508, + "learning_rate": 4.946090178971505e-05, + "loss": 0.8521, + "step": 8270 + }, + { + "epoch": 0.13290743029583138, + "grad_norm": 0.6459699869155884, + "learning_rate": 4.945959882457471e-05, + "loss": 0.8454, + "step": 8280 + }, + { + "epoch": 0.13306794651599543, + "grad_norm": 0.5950840711593628, + "learning_rate": 4.945829430394744e-05, + "loss": 0.8055, + "step": 8290 + }, + { + "epoch": 0.1332284627361595, + "grad_norm": 0.6453191637992859, + "learning_rate": 4.9456988227916215e-05, + "loss": 0.8274, + "step": 8300 + }, + { + "epoch": 0.13338897895632354, + "grad_norm": 0.8541464805603027, + "learning_rate": 4.945568059656408e-05, + "loss": 0.7526, + "step": 8310 + }, + { + "epoch": 0.1335494951764876, + "grad_norm": 0.8791118860244751, + "learning_rate": 4.94543714099742e-05, + "loss": 0.8176, + "step": 8320 + }, + { + "epoch": 0.13371001139665162, + "grad_norm": 0.5349208116531372, + "learning_rate": 4.9453060668229835e-05, + "loss": 0.7783, + "step": 8330 + }, + { + "epoch": 0.13387052761681567, + "grad_norm": 0.6989527940750122, + "learning_rate": 4.9451748371414336e-05, + "loss": 0.7333, + "step": 8340 + }, + { + "epoch": 0.13403104383697972, + "grad_norm": 0.5725222229957581, + "learning_rate": 4.945043451961116e-05, + "loss": 0.8501, + "step": 8350 + }, + { + "epoch": 0.13419156005714378, + "grad_norm": 0.4307886064052582, + "learning_rate": 4.944911911290385e-05, + "loss": 0.8407, + "step": 8360 + }, + { + "epoch": 0.13435207627730783, + "grad_norm": 0.5525847673416138, + "learning_rate": 4.944780215137608e-05, + "loss": 0.7971, + "step": 8370 + }, + { + "epoch": 0.13451259249747188, + "grad_norm": 0.9002968668937683, + "learning_rate": 4.9446483635111574e-05, + "loss": 0.9049, + "step": 8380 + }, + { + "epoch": 0.1346731087176359, + "grad_norm": 0.8359232544898987, + "learning_rate": 4.94451635641942e-05, + "loss": 0.8002, + "step": 8390 + }, + { + "epoch": 0.13483362493779996, + "grad_norm": 0.7340505719184875, + "learning_rate": 4.94438419387079e-05, + "loss": 0.673, + "step": 8400 + }, + { + "epoch": 0.13499414115796401, + "grad_norm": 0.5400354266166687, + "learning_rate": 4.944251875873673e-05, + "loss": 0.7826, + "step": 8410 + }, + { + "epoch": 0.13515465737812807, + "grad_norm": 0.7891965508460999, + "learning_rate": 4.944119402436482e-05, + "loss": 0.6921, + "step": 8420 + }, + { + "epoch": 0.13531517359829212, + "grad_norm": 0.5523108839988708, + "learning_rate": 4.943986773567643e-05, + "loss": 0.8219, + "step": 8430 + }, + { + "epoch": 0.13547568981845615, + "grad_norm": 0.7268702387809753, + "learning_rate": 4.9438539892755905e-05, + "loss": 0.8132, + "step": 8440 + }, + { + "epoch": 0.1356362060386202, + "grad_norm": 0.6222569346427917, + "learning_rate": 4.9437210495687674e-05, + "loss": 0.7787, + "step": 8450 + }, + { + "epoch": 0.13579672225878425, + "grad_norm": 0.6029394865036011, + "learning_rate": 4.94358795445563e-05, + "loss": 0.8482, + "step": 8460 + }, + { + "epoch": 0.1359572384789483, + "grad_norm": 1.7519153356552124, + "learning_rate": 4.9434547039446397e-05, + "loss": 0.8441, + "step": 8470 + }, + { + "epoch": 0.13611775469911236, + "grad_norm": 0.5061825513839722, + "learning_rate": 4.943321298044272e-05, + "loss": 0.8776, + "step": 8480 + }, + { + "epoch": 0.13627827091927638, + "grad_norm": 0.49155721068382263, + "learning_rate": 4.943187736763011e-05, + "loss": 0.8241, + "step": 8490 + }, + { + "epoch": 0.13643878713944044, + "grad_norm": 0.8436824679374695, + "learning_rate": 4.94305402010935e-05, + "loss": 0.8889, + "step": 8500 + }, + { + "epoch": 0.1365993033596045, + "grad_norm": 0.594657301902771, + "learning_rate": 4.9429201480917925e-05, + "loss": 0.8216, + "step": 8510 + }, + { + "epoch": 0.13675981957976854, + "grad_norm": 0.466307669878006, + "learning_rate": 4.942786120718852e-05, + "loss": 0.8341, + "step": 8520 + }, + { + "epoch": 0.1369203357999326, + "grad_norm": 0.8644033670425415, + "learning_rate": 4.9426519379990524e-05, + "loss": 0.8078, + "step": 8530 + }, + { + "epoch": 0.13708085202009662, + "grad_norm": 0.5822124481201172, + "learning_rate": 4.9425175999409256e-05, + "loss": 0.9199, + "step": 8540 + }, + { + "epoch": 0.13724136824026067, + "grad_norm": 0.7931888699531555, + "learning_rate": 4.942383106553016e-05, + "loss": 0.8725, + "step": 8550 + }, + { + "epoch": 0.13740188446042473, + "grad_norm": 0.6234556436538696, + "learning_rate": 4.9422484578438765e-05, + "loss": 0.8099, + "step": 8560 + }, + { + "epoch": 0.13756240068058878, + "grad_norm": 0.8281916379928589, + "learning_rate": 4.942113653822069e-05, + "loss": 0.7863, + "step": 8570 + }, + { + "epoch": 0.13772291690075283, + "grad_norm": 0.9302041530609131, + "learning_rate": 4.941978694496167e-05, + "loss": 0.8056, + "step": 8580 + }, + { + "epoch": 0.13788343312091686, + "grad_norm": 0.5987799167633057, + "learning_rate": 4.941843579874753e-05, + "loss": 0.9196, + "step": 8590 + }, + { + "epoch": 0.1380439493410809, + "grad_norm": 0.6321382522583008, + "learning_rate": 4.94170830996642e-05, + "loss": 0.8355, + "step": 8600 + }, + { + "epoch": 0.13820446556124497, + "grad_norm": 0.6856383085250854, + "learning_rate": 4.9415728847797694e-05, + "loss": 0.7069, + "step": 8610 + }, + { + "epoch": 0.13836498178140902, + "grad_norm": 0.5345771312713623, + "learning_rate": 4.941437304323414e-05, + "loss": 0.7499, + "step": 8620 + }, + { + "epoch": 0.13852549800157307, + "grad_norm": 0.8807782530784607, + "learning_rate": 4.941301568605976e-05, + "loss": 0.7478, + "step": 8630 + }, + { + "epoch": 0.1386860142217371, + "grad_norm": 1.255810022354126, + "learning_rate": 4.941165677636087e-05, + "loss": 0.8977, + "step": 8640 + }, + { + "epoch": 0.13884653044190115, + "grad_norm": 0.6205518841743469, + "learning_rate": 4.941029631422389e-05, + "loss": 0.9095, + "step": 8650 + }, + { + "epoch": 0.1390070466620652, + "grad_norm": 0.591781735420227, + "learning_rate": 4.940893429973534e-05, + "loss": 0.8062, + "step": 8660 + }, + { + "epoch": 0.13916756288222926, + "grad_norm": 0.6991277933120728, + "learning_rate": 4.9407570732981834e-05, + "loss": 0.7443, + "step": 8670 + }, + { + "epoch": 0.1393280791023933, + "grad_norm": 0.794530987739563, + "learning_rate": 4.940620561405009e-05, + "loss": 0.9364, + "step": 8680 + }, + { + "epoch": 0.13948859532255733, + "grad_norm": 0.593948245048523, + "learning_rate": 4.940483894302692e-05, + "loss": 0.8058, + "step": 8690 + }, + { + "epoch": 0.1396491115427214, + "grad_norm": 0.730877697467804, + "learning_rate": 4.940347071999923e-05, + "loss": 0.806, + "step": 8700 + }, + { + "epoch": 0.13980962776288544, + "grad_norm": 0.6529077887535095, + "learning_rate": 4.940210094505404e-05, + "loss": 0.8593, + "step": 8710 + }, + { + "epoch": 0.1399701439830495, + "grad_norm": 0.9248010516166687, + "learning_rate": 4.9400729618278454e-05, + "loss": 0.8595, + "step": 8720 + }, + { + "epoch": 0.14013066020321355, + "grad_norm": 1.1374351978302002, + "learning_rate": 4.939935673975967e-05, + "loss": 0.7852, + "step": 8730 + }, + { + "epoch": 0.14029117642337757, + "grad_norm": 0.6035959124565125, + "learning_rate": 4.939798230958502e-05, + "loss": 0.8422, + "step": 8740 + }, + { + "epoch": 0.14045169264354163, + "grad_norm": 0.7135036587715149, + "learning_rate": 4.93966063278419e-05, + "loss": 0.8544, + "step": 8750 + }, + { + "epoch": 0.14061220886370568, + "grad_norm": 1.2750235795974731, + "learning_rate": 4.939522879461781e-05, + "loss": 0.8637, + "step": 8760 + }, + { + "epoch": 0.14077272508386973, + "grad_norm": 0.7933558225631714, + "learning_rate": 4.9393849710000345e-05, + "loss": 0.7721, + "step": 8770 + }, + { + "epoch": 0.14093324130403379, + "grad_norm": 0.5809767246246338, + "learning_rate": 4.939246907407722e-05, + "loss": 0.8477, + "step": 8780 + }, + { + "epoch": 0.1410937575241978, + "grad_norm": 0.5704832077026367, + "learning_rate": 4.939108688693623e-05, + "loss": 0.6545, + "step": 8790 + }, + { + "epoch": 0.14125427374436186, + "grad_norm": 0.6629199385643005, + "learning_rate": 4.938970314866528e-05, + "loss": 0.7978, + "step": 8800 + }, + { + "epoch": 0.14141478996452592, + "grad_norm": 0.6948702335357666, + "learning_rate": 4.938831785935236e-05, + "loss": 0.7563, + "step": 8810 + }, + { + "epoch": 0.14157530618468997, + "grad_norm": 0.5792553424835205, + "learning_rate": 4.938693101908557e-05, + "loss": 0.8331, + "step": 8820 + }, + { + "epoch": 0.14173582240485402, + "grad_norm": 0.5380318760871887, + "learning_rate": 4.9385542627953105e-05, + "loss": 0.8444, + "step": 8830 + }, + { + "epoch": 0.14189633862501805, + "grad_norm": 1.3333637714385986, + "learning_rate": 4.938415268604326e-05, + "loss": 0.7586, + "step": 8840 + }, + { + "epoch": 0.1420568548451821, + "grad_norm": 0.5236332416534424, + "learning_rate": 4.938276119344441e-05, + "loss": 0.9377, + "step": 8850 + }, + { + "epoch": 0.14221737106534615, + "grad_norm": 0.45762574672698975, + "learning_rate": 4.938136815024508e-05, + "loss": 0.8511, + "step": 8860 + }, + { + "epoch": 0.1423778872855102, + "grad_norm": 0.9582352042198181, + "learning_rate": 4.937997355653383e-05, + "loss": 0.8752, + "step": 8870 + }, + { + "epoch": 0.14253840350567426, + "grad_norm": 0.7155029773712158, + "learning_rate": 4.9378577412399355e-05, + "loss": 0.7949, + "step": 8880 + }, + { + "epoch": 0.14269891972583829, + "grad_norm": 0.5068984031677246, + "learning_rate": 4.937717971793044e-05, + "loss": 0.8316, + "step": 8890 + }, + { + "epoch": 0.14285943594600234, + "grad_norm": 0.6589087843894958, + "learning_rate": 4.937578047321599e-05, + "loss": 0.7708, + "step": 8900 + }, + { + "epoch": 0.1430199521661664, + "grad_norm": 0.5931029319763184, + "learning_rate": 4.937437967834496e-05, + "loss": 0.7176, + "step": 8910 + }, + { + "epoch": 0.14318046838633045, + "grad_norm": 0.9213016033172607, + "learning_rate": 4.937297733340646e-05, + "loss": 0.803, + "step": 8920 + }, + { + "epoch": 0.1433409846064945, + "grad_norm": 0.49457964301109314, + "learning_rate": 4.937157343848964e-05, + "loss": 0.7604, + "step": 8930 + }, + { + "epoch": 0.14350150082665852, + "grad_norm": 0.6976996660232544, + "learning_rate": 4.93701679936838e-05, + "loss": 0.7898, + "step": 8940 + }, + { + "epoch": 0.14366201704682258, + "grad_norm": 0.8648852705955505, + "learning_rate": 4.936876099907832e-05, + "loss": 0.8185, + "step": 8950 + }, + { + "epoch": 0.14382253326698663, + "grad_norm": 0.5931202173233032, + "learning_rate": 4.936735245476267e-05, + "loss": 0.8381, + "step": 8960 + }, + { + "epoch": 0.14398304948715068, + "grad_norm": 0.6218159198760986, + "learning_rate": 4.936594236082642e-05, + "loss": 0.8664, + "step": 8970 + }, + { + "epoch": 0.14414356570731474, + "grad_norm": 0.6545647978782654, + "learning_rate": 4.936453071735925e-05, + "loss": 0.9064, + "step": 8980 + }, + { + "epoch": 0.14430408192747876, + "grad_norm": 0.5656579732894897, + "learning_rate": 4.936311752445094e-05, + "loss": 0.849, + "step": 8990 + }, + { + "epoch": 0.14446459814764281, + "grad_norm": 0.5101510286331177, + "learning_rate": 4.9361702782191346e-05, + "loss": 0.7359, + "step": 9000 + }, + { + "epoch": 0.14462511436780687, + "grad_norm": 0.4775393605232239, + "learning_rate": 4.936028649067044e-05, + "loss": 0.8137, + "step": 9010 + }, + { + "epoch": 0.14478563058797092, + "grad_norm": 0.7609969973564148, + "learning_rate": 4.93588686499783e-05, + "loss": 0.7999, + "step": 9020 + }, + { + "epoch": 0.14494614680813497, + "grad_norm": 0.5407885313034058, + "learning_rate": 4.935744926020508e-05, + "loss": 0.8383, + "step": 9030 + }, + { + "epoch": 0.145106663028299, + "grad_norm": 0.5695872902870178, + "learning_rate": 4.935602832144106e-05, + "loss": 0.7108, + "step": 9040 + }, + { + "epoch": 0.14526717924846305, + "grad_norm": 0.6251730918884277, + "learning_rate": 4.935460583377659e-05, + "loss": 0.7516, + "step": 9050 + }, + { + "epoch": 0.1454276954686271, + "grad_norm": 0.887856662273407, + "learning_rate": 4.9353181797302134e-05, + "loss": 0.8316, + "step": 9060 + }, + { + "epoch": 0.14558821168879116, + "grad_norm": 0.6385762691497803, + "learning_rate": 4.9351756212108255e-05, + "loss": 0.7983, + "step": 9070 + }, + { + "epoch": 0.1457487279089552, + "grad_norm": 0.6145803928375244, + "learning_rate": 4.935032907828561e-05, + "loss": 0.8472, + "step": 9080 + }, + { + "epoch": 0.14590924412911924, + "grad_norm": 0.6733904480934143, + "learning_rate": 4.9348900395924956e-05, + "loss": 0.8305, + "step": 9090 + }, + { + "epoch": 0.1460697603492833, + "grad_norm": 0.6146711111068726, + "learning_rate": 4.934747016511716e-05, + "loss": 0.8392, + "step": 9100 + }, + { + "epoch": 0.14623027656944734, + "grad_norm": 0.5587322115898132, + "learning_rate": 4.934603838595316e-05, + "loss": 0.9436, + "step": 9110 + }, + { + "epoch": 0.1463907927896114, + "grad_norm": 0.8152000308036804, + "learning_rate": 4.9344605058524016e-05, + "loss": 0.8499, + "step": 9120 + }, + { + "epoch": 0.14655130900977545, + "grad_norm": 0.6170291304588318, + "learning_rate": 4.9343170182920875e-05, + "loss": 0.8434, + "step": 9130 + }, + { + "epoch": 0.14671182522993947, + "grad_norm": 1.2782976627349854, + "learning_rate": 4.934173375923499e-05, + "loss": 0.7103, + "step": 9140 + }, + { + "epoch": 0.14687234145010353, + "grad_norm": 0.4873592257499695, + "learning_rate": 4.9340295787557715e-05, + "loss": 0.8142, + "step": 9150 + }, + { + "epoch": 0.14703285767026758, + "grad_norm": 0.6120432615280151, + "learning_rate": 4.9338856267980485e-05, + "loss": 0.7471, + "step": 9160 + }, + { + "epoch": 0.14719337389043163, + "grad_norm": 0.5942836999893188, + "learning_rate": 4.933741520059486e-05, + "loss": 0.9172, + "step": 9170 + }, + { + "epoch": 0.1473538901105957, + "grad_norm": 0.45376843214035034, + "learning_rate": 4.933597258549247e-05, + "loss": 0.7419, + "step": 9180 + }, + { + "epoch": 0.1475144063307597, + "grad_norm": 0.49604564905166626, + "learning_rate": 4.933452842276506e-05, + "loss": 0.7989, + "step": 9190 + }, + { + "epoch": 0.14767492255092377, + "grad_norm": 0.621819019317627, + "learning_rate": 4.933308271250447e-05, + "loss": 0.9173, + "step": 9200 + }, + { + "epoch": 0.14783543877108782, + "grad_norm": 0.5055466294288635, + "learning_rate": 4.933163545480264e-05, + "loss": 0.6797, + "step": 9210 + }, + { + "epoch": 0.14799595499125187, + "grad_norm": 0.5114377737045288, + "learning_rate": 4.933018664975161e-05, + "loss": 0.8157, + "step": 9220 + }, + { + "epoch": 0.14815647121141592, + "grad_norm": 0.7457907199859619, + "learning_rate": 4.932873629744351e-05, + "loss": 0.8525, + "step": 9230 + }, + { + "epoch": 0.14831698743157995, + "grad_norm": 0.7940027713775635, + "learning_rate": 4.932728439797058e-05, + "loss": 0.8209, + "step": 9240 + }, + { + "epoch": 0.148477503651744, + "grad_norm": 0.8524682521820068, + "learning_rate": 4.932583095142515e-05, + "loss": 0.7039, + "step": 9250 + }, + { + "epoch": 0.14863801987190806, + "grad_norm": 0.7554528117179871, + "learning_rate": 4.932437595789964e-05, + "loss": 0.8756, + "step": 9260 + }, + { + "epoch": 0.1487985360920721, + "grad_norm": 0.8182176351547241, + "learning_rate": 4.93229194174866e-05, + "loss": 0.7777, + "step": 9270 + }, + { + "epoch": 0.14895905231223616, + "grad_norm": 0.4581071734428406, + "learning_rate": 4.932146133027864e-05, + "loss": 0.7715, + "step": 9280 + }, + { + "epoch": 0.1491195685324002, + "grad_norm": 0.5772873163223267, + "learning_rate": 4.93200016963685e-05, + "loss": 0.7277, + "step": 9290 + }, + { + "epoch": 0.14928008475256424, + "grad_norm": 0.5201414823532104, + "learning_rate": 4.931854051584899e-05, + "loss": 0.7986, + "step": 9300 + }, + { + "epoch": 0.1494406009727283, + "grad_norm": 0.7059119343757629, + "learning_rate": 4.931707778881303e-05, + "loss": 0.7422, + "step": 9310 + }, + { + "epoch": 0.14960111719289235, + "grad_norm": 0.9789541363716125, + "learning_rate": 4.9315613515353664e-05, + "loss": 0.9512, + "step": 9320 + }, + { + "epoch": 0.1497616334130564, + "grad_norm": 0.6077494025230408, + "learning_rate": 4.9314147695563986e-05, + "loss": 0.7389, + "step": 9330 + }, + { + "epoch": 0.14992214963322043, + "grad_norm": 0.4460678696632385, + "learning_rate": 4.931268032953723e-05, + "loss": 0.8025, + "step": 9340 + }, + { + "epoch": 0.15008266585338448, + "grad_norm": 0.9289214611053467, + "learning_rate": 4.93112114173667e-05, + "loss": 0.7502, + "step": 9350 + }, + { + "epoch": 0.15024318207354853, + "grad_norm": 0.5348705649375916, + "learning_rate": 4.9309740959145824e-05, + "loss": 0.8392, + "step": 9360 + }, + { + "epoch": 0.15040369829371258, + "grad_norm": 0.46966010332107544, + "learning_rate": 4.930826895496811e-05, + "loss": 0.8238, + "step": 9370 + }, + { + "epoch": 0.15056421451387664, + "grad_norm": 0.7592468857765198, + "learning_rate": 4.930679540492715e-05, + "loss": 0.9375, + "step": 9380 + }, + { + "epoch": 0.15072473073404066, + "grad_norm": 0.7148717641830444, + "learning_rate": 4.9305320309116677e-05, + "loss": 0.8018, + "step": 9390 + }, + { + "epoch": 0.15088524695420472, + "grad_norm": 1.4315929412841797, + "learning_rate": 4.9303843667630486e-05, + "loss": 0.8064, + "step": 9400 + }, + { + "epoch": 0.15104576317436877, + "grad_norm": 0.9749792218208313, + "learning_rate": 4.93023654805625e-05, + "loss": 0.9272, + "step": 9410 + }, + { + "epoch": 0.15120627939453282, + "grad_norm": 0.5720782279968262, + "learning_rate": 4.93008857480067e-05, + "loss": 0.9268, + "step": 9420 + }, + { + "epoch": 0.15136679561469688, + "grad_norm": 1.2671095132827759, + "learning_rate": 4.9299404470057196e-05, + "loss": 0.7826, + "step": 9430 + }, + { + "epoch": 0.1515273118348609, + "grad_norm": 0.5497077107429504, + "learning_rate": 4.92979216468082e-05, + "loss": 0.7208, + "step": 9440 + }, + { + "epoch": 0.15168782805502495, + "grad_norm": 0.48560959100723267, + "learning_rate": 4.929643727835399e-05, + "loss": 0.7516, + "step": 9450 + }, + { + "epoch": 0.151848344275189, + "grad_norm": 0.7097569108009338, + "learning_rate": 4.929495136478898e-05, + "loss": 0.8334, + "step": 9460 + }, + { + "epoch": 0.15200886049535306, + "grad_norm": 0.6942985653877258, + "learning_rate": 4.929346390620766e-05, + "loss": 0.8838, + "step": 9470 + }, + { + "epoch": 0.1521693767155171, + "grad_norm": 0.4978102445602417, + "learning_rate": 4.929197490270463e-05, + "loss": 0.8341, + "step": 9480 + }, + { + "epoch": 0.15232989293568114, + "grad_norm": 0.5511559844017029, + "learning_rate": 4.9290484354374565e-05, + "loss": 0.7284, + "step": 9490 + }, + { + "epoch": 0.1524904091558452, + "grad_norm": 0.8412987589836121, + "learning_rate": 4.928899226131227e-05, + "loss": 0.8216, + "step": 9500 + }, + { + "epoch": 0.15265092537600924, + "grad_norm": 0.8107486367225647, + "learning_rate": 4.928749862361263e-05, + "loss": 0.8413, + "step": 9510 + }, + { + "epoch": 0.1528114415961733, + "grad_norm": 0.6007057428359985, + "learning_rate": 4.928600344137063e-05, + "loss": 0.8854, + "step": 9520 + }, + { + "epoch": 0.15297195781633735, + "grad_norm": 0.5900388956069946, + "learning_rate": 4.9284506714681354e-05, + "loss": 0.7866, + "step": 9530 + }, + { + "epoch": 0.15313247403650138, + "grad_norm": 1.0158817768096924, + "learning_rate": 4.928300844363999e-05, + "loss": 0.776, + "step": 9540 + }, + { + "epoch": 0.15329299025666543, + "grad_norm": 0.8094683289527893, + "learning_rate": 4.9281508628341814e-05, + "loss": 0.6936, + "step": 9550 + }, + { + "epoch": 0.15345350647682948, + "grad_norm": 0.850523054599762, + "learning_rate": 4.928000726888221e-05, + "loss": 0.781, + "step": 9560 + }, + { + "epoch": 0.15361402269699354, + "grad_norm": 0.6288497447967529, + "learning_rate": 4.927850436535665e-05, + "loss": 0.652, + "step": 9570 + }, + { + "epoch": 0.1537745389171576, + "grad_norm": 0.8309193849563599, + "learning_rate": 4.927699991786071e-05, + "loss": 0.8856, + "step": 9580 + }, + { + "epoch": 0.1539350551373216, + "grad_norm": 0.6912679672241211, + "learning_rate": 4.9275493926490066e-05, + "loss": 0.8631, + "step": 9590 + }, + { + "epoch": 0.15409557135748567, + "grad_norm": 0.6084126830101013, + "learning_rate": 4.92739863913405e-05, + "loss": 0.8402, + "step": 9600 + }, + { + "epoch": 0.15425608757764972, + "grad_norm": 0.9142325520515442, + "learning_rate": 4.927247731250787e-05, + "loss": 0.7587, + "step": 9610 + }, + { + "epoch": 0.15441660379781377, + "grad_norm": 0.8059949278831482, + "learning_rate": 4.927096669008815e-05, + "loss": 0.9055, + "step": 9620 + }, + { + "epoch": 0.15457712001797783, + "grad_norm": 0.46955493092536926, + "learning_rate": 4.9269454524177395e-05, + "loss": 0.9552, + "step": 9630 + }, + { + "epoch": 0.15473763623814185, + "grad_norm": 0.9614445567131042, + "learning_rate": 4.926794081487178e-05, + "loss": 0.7585, + "step": 9640 + }, + { + "epoch": 0.1548981524583059, + "grad_norm": 0.6639200448989868, + "learning_rate": 4.9266425562267573e-05, + "loss": 0.7256, + "step": 9650 + }, + { + "epoch": 0.15505866867846996, + "grad_norm": 0.5765352845191956, + "learning_rate": 4.926490876646114e-05, + "loss": 0.8052, + "step": 9660 + }, + { + "epoch": 0.155219184898634, + "grad_norm": 0.7494244575500488, + "learning_rate": 4.926339042754892e-05, + "loss": 0.7778, + "step": 9670 + }, + { + "epoch": 0.15537970111879806, + "grad_norm": 0.5746403336524963, + "learning_rate": 4.9261870545627476e-05, + "loss": 0.7958, + "step": 9680 + }, + { + "epoch": 0.1555402173389621, + "grad_norm": 0.7931315898895264, + "learning_rate": 4.926034912079347e-05, + "loss": 0.7766, + "step": 9690 + }, + { + "epoch": 0.15570073355912614, + "grad_norm": 0.4971093535423279, + "learning_rate": 4.925882615314365e-05, + "loss": 0.7137, + "step": 9700 + }, + { + "epoch": 0.1558612497792902, + "grad_norm": 0.5430861115455627, + "learning_rate": 4.925730164277488e-05, + "loss": 0.713, + "step": 9710 + }, + { + "epoch": 0.15602176599945425, + "grad_norm": 0.4995921850204468, + "learning_rate": 4.9255775589784096e-05, + "loss": 0.837, + "step": 9720 + }, + { + "epoch": 0.1561822822196183, + "grad_norm": 0.3292381167411804, + "learning_rate": 4.9254247994268357e-05, + "loss": 0.8857, + "step": 9730 + }, + { + "epoch": 0.15634279843978233, + "grad_norm": 0.8479230403900146, + "learning_rate": 4.9252718856324797e-05, + "loss": 0.7751, + "step": 9740 + }, + { + "epoch": 0.15650331465994638, + "grad_norm": 0.7066742777824402, + "learning_rate": 4.925118817605067e-05, + "loss": 0.87, + "step": 9750 + }, + { + "epoch": 0.15666383088011043, + "grad_norm": 0.5896716117858887, + "learning_rate": 4.9249655953543316e-05, + "loss": 0.7302, + "step": 9760 + }, + { + "epoch": 0.15682434710027449, + "grad_norm": 0.5411186814308167, + "learning_rate": 4.924812218890017e-05, + "loss": 0.7953, + "step": 9770 + }, + { + "epoch": 0.15698486332043854, + "grad_norm": 0.9316664934158325, + "learning_rate": 4.924658688221878e-05, + "loss": 0.7751, + "step": 9780 + }, + { + "epoch": 0.15714537954060256, + "grad_norm": 0.8137153387069702, + "learning_rate": 4.924505003359678e-05, + "loss": 0.8087, + "step": 9790 + }, + { + "epoch": 0.15730589576076662, + "grad_norm": 0.5312881469726562, + "learning_rate": 4.9243511643131904e-05, + "loss": 0.7297, + "step": 9800 + }, + { + "epoch": 0.15746641198093067, + "grad_norm": 0.7728757858276367, + "learning_rate": 4.924197171092197e-05, + "loss": 0.777, + "step": 9810 + }, + { + "epoch": 0.15762692820109472, + "grad_norm": 0.9061352014541626, + "learning_rate": 4.9240430237064924e-05, + "loss": 0.8472, + "step": 9820 + }, + { + "epoch": 0.15778744442125878, + "grad_norm": 0.9561904668807983, + "learning_rate": 4.9238887221658797e-05, + "loss": 0.7615, + "step": 9830 + }, + { + "epoch": 0.1579479606414228, + "grad_norm": 0.7751533389091492, + "learning_rate": 4.923734266480171e-05, + "loss": 0.7535, + "step": 9840 + }, + { + "epoch": 0.15810847686158686, + "grad_norm": 0.6939054131507874, + "learning_rate": 4.923579656659189e-05, + "loss": 0.8548, + "step": 9850 + }, + { + "epoch": 0.1582689930817509, + "grad_norm": 1.5750150680541992, + "learning_rate": 4.9234248927127646e-05, + "loss": 0.7087, + "step": 9860 + }, + { + "epoch": 0.15842950930191496, + "grad_norm": 0.637879490852356, + "learning_rate": 4.923269974650743e-05, + "loss": 0.8586, + "step": 9870 + }, + { + "epoch": 0.15859002552207901, + "grad_norm": 0.4740722179412842, + "learning_rate": 4.9231149024829725e-05, + "loss": 0.8126, + "step": 9880 + }, + { + "epoch": 0.15875054174224304, + "grad_norm": 0.8732372522354126, + "learning_rate": 4.922959676219317e-05, + "loss": 0.8364, + "step": 9890 + }, + { + "epoch": 0.1589110579624071, + "grad_norm": 1.2889374494552612, + "learning_rate": 4.922804295869647e-05, + "loss": 0.8109, + "step": 9900 + }, + { + "epoch": 0.15907157418257115, + "grad_norm": 0.7183406949043274, + "learning_rate": 4.922648761443846e-05, + "loss": 0.8383, + "step": 9910 + }, + { + "epoch": 0.1592320904027352, + "grad_norm": 0.6521397829055786, + "learning_rate": 4.9224930729518014e-05, + "loss": 0.8614, + "step": 9920 + }, + { + "epoch": 0.15939260662289925, + "grad_norm": 0.7482319474220276, + "learning_rate": 4.9223372304034166e-05, + "loss": 0.9626, + "step": 9930 + }, + { + "epoch": 0.1595531228430633, + "grad_norm": 0.5910117030143738, + "learning_rate": 4.9221812338086014e-05, + "loss": 0.7976, + "step": 9940 + }, + { + "epoch": 0.15971363906322733, + "grad_norm": 0.5343197584152222, + "learning_rate": 4.9220250831772765e-05, + "loss": 0.9098, + "step": 9950 + }, + { + "epoch": 0.15987415528339138, + "grad_norm": 0.5926607251167297, + "learning_rate": 4.921868778519372e-05, + "loss": 0.9288, + "step": 9960 + }, + { + "epoch": 0.16003467150355544, + "grad_norm": 0.8713418245315552, + "learning_rate": 4.921712319844829e-05, + "loss": 0.7754, + "step": 9970 + }, + { + "epoch": 0.1601951877237195, + "grad_norm": 0.4283025562763214, + "learning_rate": 4.9215557071635954e-05, + "loss": 0.7728, + "step": 9980 + }, + { + "epoch": 0.16035570394388354, + "grad_norm": 0.6863767504692078, + "learning_rate": 4.921398940485633e-05, + "loss": 0.8693, + "step": 9990 + }, + { + "epoch": 0.16051622016404757, + "grad_norm": 0.6383336186408997, + "learning_rate": 4.921242019820909e-05, + "loss": 0.8031, + "step": 10000 + }, + { + "epoch": 0.16067673638421162, + "grad_norm": 0.6166431307792664, + "learning_rate": 4.921084945179404e-05, + "loss": 0.8035, + "step": 10010 + }, + { + "epoch": 0.16083725260437567, + "grad_norm": 0.7027831673622131, + "learning_rate": 4.920927716571107e-05, + "loss": 0.8515, + "step": 10020 + }, + { + "epoch": 0.16099776882453973, + "grad_norm": 0.7675538063049316, + "learning_rate": 4.9207703340060175e-05, + "loss": 0.6853, + "step": 10030 + }, + { + "epoch": 0.16115828504470378, + "grad_norm": 0.5839877724647522, + "learning_rate": 4.920612797494142e-05, + "loss": 0.7468, + "step": 10040 + }, + { + "epoch": 0.1613188012648678, + "grad_norm": 0.7493927478790283, + "learning_rate": 4.920455107045501e-05, + "loss": 0.8672, + "step": 10050 + }, + { + "epoch": 0.16147931748503186, + "grad_norm": 0.6170351505279541, + "learning_rate": 4.920297262670122e-05, + "loss": 0.7734, + "step": 10060 + }, + { + "epoch": 0.1616398337051959, + "grad_norm": 0.6521604657173157, + "learning_rate": 4.9201392643780416e-05, + "loss": 0.7278, + "step": 10070 + }, + { + "epoch": 0.16180034992535997, + "grad_norm": 0.6624895334243774, + "learning_rate": 4.919981112179309e-05, + "loss": 0.7708, + "step": 10080 + }, + { + "epoch": 0.16196086614552402, + "grad_norm": 0.6493077874183655, + "learning_rate": 4.9198228060839825e-05, + "loss": 0.7795, + "step": 10090 + }, + { + "epoch": 0.16212138236568804, + "grad_norm": 0.5198938846588135, + "learning_rate": 4.919664346102128e-05, + "loss": 0.8126, + "step": 10100 + }, + { + "epoch": 0.1622818985858521, + "grad_norm": 0.5299177169799805, + "learning_rate": 4.9195057322438224e-05, + "loss": 0.7882, + "step": 10110 + }, + { + "epoch": 0.16244241480601615, + "grad_norm": 0.5809518694877625, + "learning_rate": 4.919346964519154e-05, + "loss": 0.8235, + "step": 10120 + }, + { + "epoch": 0.1626029310261802, + "grad_norm": 1.0289487838745117, + "learning_rate": 4.919188042938219e-05, + "loss": 0.7922, + "step": 10130 + }, + { + "epoch": 0.16276344724634426, + "grad_norm": 0.5807226300239563, + "learning_rate": 4.919028967511123e-05, + "loss": 0.8457, + "step": 10140 + }, + { + "epoch": 0.16292396346650828, + "grad_norm": 0.6146193146705627, + "learning_rate": 4.918869738247983e-05, + "loss": 0.7808, + "step": 10150 + }, + { + "epoch": 0.16308447968667233, + "grad_norm": 0.6335358023643494, + "learning_rate": 4.9187103551589255e-05, + "loss": 0.7107, + "step": 10160 + }, + { + "epoch": 0.1632449959068364, + "grad_norm": 0.6423411965370178, + "learning_rate": 4.918550818254085e-05, + "loss": 0.7555, + "step": 10170 + }, + { + "epoch": 0.16340551212700044, + "grad_norm": 0.6658979654312134, + "learning_rate": 4.918391127543609e-05, + "loss": 0.8006, + "step": 10180 + }, + { + "epoch": 0.1635660283471645, + "grad_norm": 0.7223939299583435, + "learning_rate": 4.918231283037651e-05, + "loss": 0.7963, + "step": 10190 + }, + { + "epoch": 0.16372654456732852, + "grad_norm": 0.9772313833236694, + "learning_rate": 4.918071284746377e-05, + "loss": 0.7653, + "step": 10200 + }, + { + "epoch": 0.16388706078749257, + "grad_norm": 1.0045733451843262, + "learning_rate": 4.917911132679962e-05, + "loss": 0.7429, + "step": 10210 + }, + { + "epoch": 0.16404757700765663, + "grad_norm": 0.5584812164306641, + "learning_rate": 4.917750826848591e-05, + "loss": 0.8017, + "step": 10220 + }, + { + "epoch": 0.16420809322782068, + "grad_norm": 0.5612406134605408, + "learning_rate": 4.917590367262458e-05, + "loss": 0.8108, + "step": 10230 + }, + { + "epoch": 0.16436860944798473, + "grad_norm": 0.5013061165809631, + "learning_rate": 4.917429753931767e-05, + "loss": 0.7279, + "step": 10240 + }, + { + "epoch": 0.16452912566814876, + "grad_norm": 0.6464181542396545, + "learning_rate": 4.9172689868667335e-05, + "loss": 0.8401, + "step": 10250 + }, + { + "epoch": 0.1646896418883128, + "grad_norm": 0.9775183200836182, + "learning_rate": 4.9171080660775796e-05, + "loss": 0.7287, + "step": 10260 + }, + { + "epoch": 0.16485015810847686, + "grad_norm": 0.6618439555168152, + "learning_rate": 4.916946991574539e-05, + "loss": 0.8652, + "step": 10270 + }, + { + "epoch": 0.16501067432864092, + "grad_norm": 0.4188896119594574, + "learning_rate": 4.916785763367857e-05, + "loss": 0.7245, + "step": 10280 + }, + { + "epoch": 0.16517119054880497, + "grad_norm": 0.5474289655685425, + "learning_rate": 4.9166243814677855e-05, + "loss": 0.9952, + "step": 10290 + }, + { + "epoch": 0.165331706768969, + "grad_norm": 0.41949909925460815, + "learning_rate": 4.9164628458845876e-05, + "loss": 0.7359, + "step": 10300 + }, + { + "epoch": 0.16549222298913305, + "grad_norm": 0.9447283148765564, + "learning_rate": 4.916301156628535e-05, + "loss": 0.8178, + "step": 10310 + }, + { + "epoch": 0.1656527392092971, + "grad_norm": 0.9754018187522888, + "learning_rate": 4.916139313709911e-05, + "loss": 0.8022, + "step": 10320 + }, + { + "epoch": 0.16581325542946115, + "grad_norm": 0.6596773266792297, + "learning_rate": 4.9159773171390086e-05, + "loss": 0.8613, + "step": 10330 + }, + { + "epoch": 0.1659737716496252, + "grad_norm": 0.6182686686515808, + "learning_rate": 4.915815166926129e-05, + "loss": 0.7505, + "step": 10340 + }, + { + "epoch": 0.16613428786978923, + "grad_norm": 0.6671663522720337, + "learning_rate": 4.915652863081585e-05, + "loss": 0.834, + "step": 10350 + }, + { + "epoch": 0.16629480408995329, + "grad_norm": 0.8500170707702637, + "learning_rate": 4.915490405615696e-05, + "loss": 0.75, + "step": 10360 + }, + { + "epoch": 0.16645532031011734, + "grad_norm": 0.5699482560157776, + "learning_rate": 4.915327794538795e-05, + "loss": 0.8271, + "step": 10370 + }, + { + "epoch": 0.1666158365302814, + "grad_norm": 0.46715933084487915, + "learning_rate": 4.915165029861223e-05, + "loss": 0.7382, + "step": 10380 + }, + { + "epoch": 0.16677635275044544, + "grad_norm": 0.7244930267333984, + "learning_rate": 4.915002111593331e-05, + "loss": 0.9069, + "step": 10390 + }, + { + "epoch": 0.16693686897060947, + "grad_norm": 0.6618782877922058, + "learning_rate": 4.914839039745479e-05, + "loss": 0.8069, + "step": 10400 + }, + { + "epoch": 0.16709738519077352, + "grad_norm": 0.43329063057899475, + "learning_rate": 4.914675814328038e-05, + "loss": 0.7605, + "step": 10410 + }, + { + "epoch": 0.16725790141093758, + "grad_norm": 0.6046708822250366, + "learning_rate": 4.914512435351388e-05, + "loss": 0.7312, + "step": 10420 + }, + { + "epoch": 0.16741841763110163, + "grad_norm": 0.48591741919517517, + "learning_rate": 4.914348902825918e-05, + "loss": 0.7207, + "step": 10430 + }, + { + "epoch": 0.16757893385126568, + "grad_norm": 0.6001600027084351, + "learning_rate": 4.914185216762029e-05, + "loss": 0.7089, + "step": 10440 + }, + { + "epoch": 0.1677394500714297, + "grad_norm": 0.7008888721466064, + "learning_rate": 4.914021377170131e-05, + "loss": 0.7858, + "step": 10450 + }, + { + "epoch": 0.16789996629159376, + "grad_norm": 0.7884200215339661, + "learning_rate": 4.913857384060642e-05, + "loss": 0.8189, + "step": 10460 + }, + { + "epoch": 0.1680604825117578, + "grad_norm": 0.571982741355896, + "learning_rate": 4.91369323744399e-05, + "loss": 0.8575, + "step": 10470 + }, + { + "epoch": 0.16822099873192187, + "grad_norm": 0.7083830833435059, + "learning_rate": 4.9135289373306156e-05, + "loss": 0.7158, + "step": 10480 + }, + { + "epoch": 0.16838151495208592, + "grad_norm": 0.7038744688034058, + "learning_rate": 4.913364483730968e-05, + "loss": 0.797, + "step": 10490 + }, + { + "epoch": 0.16854203117224995, + "grad_norm": 0.8867170810699463, + "learning_rate": 4.913199876655502e-05, + "loss": 0.7415, + "step": 10500 + }, + { + "epoch": 0.168702547392414, + "grad_norm": 0.5323980450630188, + "learning_rate": 4.91303511611469e-05, + "loss": 0.7767, + "step": 10510 + }, + { + "epoch": 0.16886306361257805, + "grad_norm": 0.7018535137176514, + "learning_rate": 4.912870202119006e-05, + "loss": 0.7426, + "step": 10520 + }, + { + "epoch": 0.1690235798327421, + "grad_norm": 0.6567034125328064, + "learning_rate": 4.912705134678941e-05, + "loss": 0.8341, + "step": 10530 + }, + { + "epoch": 0.16918409605290616, + "grad_norm": 0.8170708417892456, + "learning_rate": 4.912539913804989e-05, + "loss": 0.9386, + "step": 10540 + }, + { + "epoch": 0.16934461227307018, + "grad_norm": 0.4879869222640991, + "learning_rate": 4.912374539507659e-05, + "loss": 0.7896, + "step": 10550 + }, + { + "epoch": 0.16950512849323424, + "grad_norm": 1.46728515625, + "learning_rate": 4.912209011797468e-05, + "loss": 0.9008, + "step": 10560 + }, + { + "epoch": 0.1696656447133983, + "grad_norm": 0.5582660436630249, + "learning_rate": 4.9120433306849423e-05, + "loss": 0.6834, + "step": 10570 + }, + { + "epoch": 0.16982616093356234, + "grad_norm": 0.7243114709854126, + "learning_rate": 4.911877496180617e-05, + "loss": 0.7772, + "step": 10580 + }, + { + "epoch": 0.1699866771537264, + "grad_norm": 0.6952108144760132, + "learning_rate": 4.91171150829504e-05, + "loss": 0.8321, + "step": 10590 + }, + { + "epoch": 0.17014719337389042, + "grad_norm": 0.5720369815826416, + "learning_rate": 4.911545367038767e-05, + "loss": 0.9308, + "step": 10600 + }, + { + "epoch": 0.17030770959405447, + "grad_norm": 0.7475024461746216, + "learning_rate": 4.9113790724223615e-05, + "loss": 0.79, + "step": 10610 + }, + { + "epoch": 0.17046822581421853, + "grad_norm": 0.964226484298706, + "learning_rate": 4.911212624456402e-05, + "loss": 0.7076, + "step": 10620 + }, + { + "epoch": 0.17062874203438258, + "grad_norm": 0.6514016389846802, + "learning_rate": 4.91104602315147e-05, + "loss": 0.8509, + "step": 10630 + }, + { + "epoch": 0.17078925825454663, + "grad_norm": 0.8975919485092163, + "learning_rate": 4.910879268518164e-05, + "loss": 0.8009, + "step": 10640 + }, + { + "epoch": 0.17094977447471066, + "grad_norm": 0.5019351840019226, + "learning_rate": 4.910712360567086e-05, + "loss": 0.8829, + "step": 10650 + }, + { + "epoch": 0.1711102906948747, + "grad_norm": 0.5511919856071472, + "learning_rate": 4.910545299308852e-05, + "loss": 0.7782, + "step": 10660 + }, + { + "epoch": 0.17127080691503876, + "grad_norm": 0.5189537405967712, + "learning_rate": 4.910378084754085e-05, + "loss": 0.8573, + "step": 10670 + }, + { + "epoch": 0.17143132313520282, + "grad_norm": 0.9278902411460876, + "learning_rate": 4.910210716913419e-05, + "loss": 0.8171, + "step": 10680 + }, + { + "epoch": 0.17159183935536687, + "grad_norm": 0.4458676278591156, + "learning_rate": 4.910043195797498e-05, + "loss": 0.7875, + "step": 10690 + }, + { + "epoch": 0.1717523555755309, + "grad_norm": 0.45472124218940735, + "learning_rate": 4.9098755214169755e-05, + "loss": 0.8384, + "step": 10700 + }, + { + "epoch": 0.17191287179569495, + "grad_norm": 0.7674721479415894, + "learning_rate": 4.909707693782515e-05, + "loss": 0.8508, + "step": 10710 + }, + { + "epoch": 0.172073388015859, + "grad_norm": 0.6027742624282837, + "learning_rate": 4.909539712904788e-05, + "loss": 0.8298, + "step": 10720 + }, + { + "epoch": 0.17223390423602306, + "grad_norm": 0.8294141292572021, + "learning_rate": 4.909371578794477e-05, + "loss": 0.8973, + "step": 10730 + }, + { + "epoch": 0.1723944204561871, + "grad_norm": 1.4726732969284058, + "learning_rate": 4.9092032914622766e-05, + "loss": 0.8312, + "step": 10740 + }, + { + "epoch": 0.17255493667635113, + "grad_norm": 0.8279612064361572, + "learning_rate": 4.909034850918887e-05, + "loss": 0.8909, + "step": 10750 + }, + { + "epoch": 0.1727154528965152, + "grad_norm": 0.6300029754638672, + "learning_rate": 4.90886625717502e-05, + "loss": 0.8616, + "step": 10760 + }, + { + "epoch": 0.17287596911667924, + "grad_norm": 0.6635826826095581, + "learning_rate": 4.908697510241398e-05, + "loss": 0.7283, + "step": 10770 + }, + { + "epoch": 0.1730364853368433, + "grad_norm": 0.5336458086967468, + "learning_rate": 4.908528610128752e-05, + "loss": 0.8229, + "step": 10780 + }, + { + "epoch": 0.17319700155700735, + "grad_norm": 0.8471100330352783, + "learning_rate": 4.908359556847823e-05, + "loss": 0.7531, + "step": 10790 + }, + { + "epoch": 0.17335751777717137, + "grad_norm": 0.6426544785499573, + "learning_rate": 4.9081903504093624e-05, + "loss": 0.7834, + "step": 10800 + }, + { + "epoch": 0.17351803399733542, + "grad_norm": 0.5473433136940002, + "learning_rate": 4.908020990824129e-05, + "loss": 0.8294, + "step": 10810 + }, + { + "epoch": 0.17367855021749948, + "grad_norm": 0.5126625895500183, + "learning_rate": 4.907851478102895e-05, + "loss": 0.8836, + "step": 10820 + }, + { + "epoch": 0.17383906643766353, + "grad_norm": 0.6371644139289856, + "learning_rate": 4.90768181225644e-05, + "loss": 0.9078, + "step": 10830 + }, + { + "epoch": 0.17399958265782758, + "grad_norm": 0.6427643895149231, + "learning_rate": 4.907511993295553e-05, + "loss": 0.833, + "step": 10840 + }, + { + "epoch": 0.1741600988779916, + "grad_norm": 0.38501980900764465, + "learning_rate": 4.907342021231034e-05, + "loss": 0.8293, + "step": 10850 + }, + { + "epoch": 0.17432061509815566, + "grad_norm": 0.9270192980766296, + "learning_rate": 4.907171896073691e-05, + "loss": 0.8232, + "step": 10860 + }, + { + "epoch": 0.17448113131831972, + "grad_norm": 0.8319985270500183, + "learning_rate": 4.907001617834345e-05, + "loss": 0.919, + "step": 10870 + }, + { + "epoch": 0.17464164753848377, + "grad_norm": 0.47695937752723694, + "learning_rate": 4.906831186523824e-05, + "loss": 0.7423, + "step": 10880 + }, + { + "epoch": 0.17480216375864782, + "grad_norm": 0.8398069739341736, + "learning_rate": 4.906660602152966e-05, + "loss": 0.9076, + "step": 10890 + }, + { + "epoch": 0.17496267997881185, + "grad_norm": 0.7106353640556335, + "learning_rate": 4.906489864732619e-05, + "loss": 0.5861, + "step": 10900 + }, + { + "epoch": 0.1751231961989759, + "grad_norm": 0.5744534134864807, + "learning_rate": 4.906318974273642e-05, + "loss": 0.7751, + "step": 10910 + }, + { + "epoch": 0.17528371241913995, + "grad_norm": 0.6399341821670532, + "learning_rate": 4.906147930786902e-05, + "loss": 0.8647, + "step": 10920 + }, + { + "epoch": 0.175444228639304, + "grad_norm": 0.5677672624588013, + "learning_rate": 4.9059767342832754e-05, + "loss": 0.5865, + "step": 10930 + }, + { + "epoch": 0.17560474485946806, + "grad_norm": 0.5180374979972839, + "learning_rate": 4.905805384773651e-05, + "loss": 0.9483, + "step": 10940 + }, + { + "epoch": 0.17576526107963208, + "grad_norm": 0.8110500574111938, + "learning_rate": 4.905633882268924e-05, + "loss": 0.9717, + "step": 10950 + }, + { + "epoch": 0.17592577729979614, + "grad_norm": 0.6621297001838684, + "learning_rate": 4.905462226780001e-05, + "loss": 0.9464, + "step": 10960 + }, + { + "epoch": 0.1760862935199602, + "grad_norm": 0.7507532835006714, + "learning_rate": 4.905290418317801e-05, + "loss": 0.8609, + "step": 10970 + }, + { + "epoch": 0.17624680974012424, + "grad_norm": 0.7834741473197937, + "learning_rate": 4.905118456893246e-05, + "loss": 0.8998, + "step": 10980 + }, + { + "epoch": 0.1764073259602883, + "grad_norm": 0.6373550891876221, + "learning_rate": 4.904946342517275e-05, + "loss": 0.7516, + "step": 10990 + }, + { + "epoch": 0.17656784218045232, + "grad_norm": 0.7723615765571594, + "learning_rate": 4.904774075200832e-05, + "loss": 0.7874, + "step": 11000 + }, + { + "epoch": 0.17672835840061638, + "grad_norm": 1.4654874801635742, + "learning_rate": 4.904601654954872e-05, + "loss": 0.8789, + "step": 11010 + }, + { + "epoch": 0.17688887462078043, + "grad_norm": 0.8759693503379822, + "learning_rate": 4.904429081790361e-05, + "loss": 0.7679, + "step": 11020 + }, + { + "epoch": 0.17704939084094448, + "grad_norm": 0.504368245601654, + "learning_rate": 4.904256355718273e-05, + "loss": 0.7372, + "step": 11030 + }, + { + "epoch": 0.17720990706110853, + "grad_norm": 0.8316324353218079, + "learning_rate": 4.904083476749592e-05, + "loss": 0.8608, + "step": 11040 + }, + { + "epoch": 0.17737042328127256, + "grad_norm": 0.6061356067657471, + "learning_rate": 4.9039104448953124e-05, + "loss": 0.6936, + "step": 11050 + }, + { + "epoch": 0.1775309395014366, + "grad_norm": 0.866585373878479, + "learning_rate": 4.903737260166438e-05, + "loss": 0.8714, + "step": 11060 + }, + { + "epoch": 0.17769145572160067, + "grad_norm": 0.5624390840530396, + "learning_rate": 4.9035639225739825e-05, + "loss": 0.9504, + "step": 11070 + }, + { + "epoch": 0.17785197194176472, + "grad_norm": 0.7574442625045776, + "learning_rate": 4.903390432128969e-05, + "loss": 0.8134, + "step": 11080 + }, + { + "epoch": 0.17801248816192877, + "grad_norm": 0.613847017288208, + "learning_rate": 4.903216788842431e-05, + "loss": 0.9091, + "step": 11090 + }, + { + "epoch": 0.1781730043820928, + "grad_norm": 1.5024479627609253, + "learning_rate": 4.903042992725409e-05, + "loss": 0.7372, + "step": 11100 + }, + { + "epoch": 0.17833352060225685, + "grad_norm": 0.8547614216804504, + "learning_rate": 4.9028690437889586e-05, + "loss": 0.8269, + "step": 11110 + }, + { + "epoch": 0.1784940368224209, + "grad_norm": 0.5690340399742126, + "learning_rate": 4.90269494204414e-05, + "loss": 0.752, + "step": 11120 + }, + { + "epoch": 0.17865455304258496, + "grad_norm": 0.7079929709434509, + "learning_rate": 4.902520687502026e-05, + "loss": 0.8041, + "step": 11130 + }, + { + "epoch": 0.178815069262749, + "grad_norm": 0.6027037501335144, + "learning_rate": 4.9023462801736964e-05, + "loss": 0.7461, + "step": 11140 + }, + { + "epoch": 0.17897558548291304, + "grad_norm": 0.497420072555542, + "learning_rate": 4.902171720070244e-05, + "loss": 0.8518, + "step": 11150 + }, + { + "epoch": 0.1791361017030771, + "grad_norm": 0.9100947380065918, + "learning_rate": 4.901997007202771e-05, + "loss": 0.7886, + "step": 11160 + }, + { + "epoch": 0.17929661792324114, + "grad_norm": 1.1102112531661987, + "learning_rate": 4.901822141582385e-05, + "loss": 0.8517, + "step": 11170 + }, + { + "epoch": 0.1794571341434052, + "grad_norm": 0.696198046207428, + "learning_rate": 4.901647123220209e-05, + "loss": 0.9015, + "step": 11180 + }, + { + "epoch": 0.17961765036356925, + "grad_norm": 0.6905004978179932, + "learning_rate": 4.9014719521273714e-05, + "loss": 0.7723, + "step": 11190 + }, + { + "epoch": 0.17977816658373327, + "grad_norm": 0.9232503771781921, + "learning_rate": 4.901296628315014e-05, + "loss": 0.8141, + "step": 11200 + }, + { + "epoch": 0.17993868280389733, + "grad_norm": 0.6662132143974304, + "learning_rate": 4.9011211517942845e-05, + "loss": 0.7695, + "step": 11210 + }, + { + "epoch": 0.18009919902406138, + "grad_norm": 0.5204128623008728, + "learning_rate": 4.900945522576342e-05, + "loss": 0.8082, + "step": 11220 + }, + { + "epoch": 0.18025971524422543, + "grad_norm": 0.5291982293128967, + "learning_rate": 4.900769740672358e-05, + "loss": 0.7867, + "step": 11230 + }, + { + "epoch": 0.18042023146438949, + "grad_norm": 0.5215858221054077, + "learning_rate": 4.9005938060935084e-05, + "loss": 0.8056, + "step": 11240 + }, + { + "epoch": 0.1805807476845535, + "grad_norm": 0.5328707098960876, + "learning_rate": 4.9004177188509837e-05, + "loss": 0.8337, + "step": 11250 + }, + { + "epoch": 0.18074126390471756, + "grad_norm": 0.6902971863746643, + "learning_rate": 4.9002414789559805e-05, + "loss": 0.7399, + "step": 11260 + }, + { + "epoch": 0.18090178012488162, + "grad_norm": 0.6105660796165466, + "learning_rate": 4.900065086419707e-05, + "loss": 0.8064, + "step": 11270 + }, + { + "epoch": 0.18106229634504567, + "grad_norm": 0.6839336156845093, + "learning_rate": 4.899888541253382e-05, + "loss": 0.8141, + "step": 11280 + }, + { + "epoch": 0.18122281256520972, + "grad_norm": 0.5222414135932922, + "learning_rate": 4.899711843468231e-05, + "loss": 0.88, + "step": 11290 + }, + { + "epoch": 0.18138332878537375, + "grad_norm": 0.6008721590042114, + "learning_rate": 4.899534993075492e-05, + "loss": 0.851, + "step": 11300 + }, + { + "epoch": 0.1815438450055378, + "grad_norm": 0.505827009677887, + "learning_rate": 4.899357990086411e-05, + "loss": 0.8433, + "step": 11310 + }, + { + "epoch": 0.18170436122570185, + "grad_norm": 0.5383577346801758, + "learning_rate": 4.899180834512244e-05, + "loss": 0.8024, + "step": 11320 + }, + { + "epoch": 0.1818648774458659, + "grad_norm": 0.5109674334526062, + "learning_rate": 4.899003526364259e-05, + "loss": 0.9077, + "step": 11330 + }, + { + "epoch": 0.18202539366602996, + "grad_norm": 1.098524570465088, + "learning_rate": 4.8988260656537294e-05, + "loss": 0.8724, + "step": 11340 + }, + { + "epoch": 0.182185909886194, + "grad_norm": 0.6726560592651367, + "learning_rate": 4.898648452391942e-05, + "loss": 0.7793, + "step": 11350 + }, + { + "epoch": 0.18234642610635804, + "grad_norm": 0.6418630480766296, + "learning_rate": 4.898470686590192e-05, + "loss": 0.7849, + "step": 11360 + }, + { + "epoch": 0.1825069423265221, + "grad_norm": 0.6708397269248962, + "learning_rate": 4.8982927682597833e-05, + "loss": 0.7716, + "step": 11370 + }, + { + "epoch": 0.18266745854668615, + "grad_norm": 0.5344885587692261, + "learning_rate": 4.898114697412031e-05, + "loss": 0.8254, + "step": 11380 + }, + { + "epoch": 0.1828279747668502, + "grad_norm": 0.9326540231704712, + "learning_rate": 4.89793647405826e-05, + "loss": 0.7097, + "step": 11390 + }, + { + "epoch": 0.18298849098701422, + "grad_norm": 0.6128734350204468, + "learning_rate": 4.897758098209804e-05, + "loss": 0.8761, + "step": 11400 + }, + { + "epoch": 0.18314900720717828, + "grad_norm": 0.9772377610206604, + "learning_rate": 4.8975795698780056e-05, + "loss": 0.7942, + "step": 11410 + }, + { + "epoch": 0.18330952342734233, + "grad_norm": 0.5638630390167236, + "learning_rate": 4.8974008890742196e-05, + "loss": 0.8319, + "step": 11420 + }, + { + "epoch": 0.18347003964750638, + "grad_norm": 0.6060901284217834, + "learning_rate": 4.897222055809808e-05, + "loss": 0.8398, + "step": 11430 + }, + { + "epoch": 0.18363055586767044, + "grad_norm": 0.5634408593177795, + "learning_rate": 4.8970430700961434e-05, + "loss": 0.7241, + "step": 11440 + }, + { + "epoch": 0.1837910720878345, + "grad_norm": 0.4987777769565582, + "learning_rate": 4.89686393194461e-05, + "loss": 0.859, + "step": 11450 + }, + { + "epoch": 0.18395158830799851, + "grad_norm": 0.8541100025177002, + "learning_rate": 4.896684641366598e-05, + "loss": 0.7393, + "step": 11460 + }, + { + "epoch": 0.18411210452816257, + "grad_norm": 0.5242561101913452, + "learning_rate": 4.89650519837351e-05, + "loss": 0.8291, + "step": 11470 + }, + { + "epoch": 0.18427262074832662, + "grad_norm": 0.705387532711029, + "learning_rate": 4.896325602976757e-05, + "loss": 0.8394, + "step": 11480 + }, + { + "epoch": 0.18443313696849067, + "grad_norm": 0.5540165901184082, + "learning_rate": 4.896145855187761e-05, + "loss": 0.7717, + "step": 11490 + }, + { + "epoch": 0.18459365318865473, + "grad_norm": 0.5656931400299072, + "learning_rate": 4.8959659550179523e-05, + "loss": 0.8728, + "step": 11500 + }, + { + "epoch": 0.18475416940881875, + "grad_norm": 0.6961113810539246, + "learning_rate": 4.8957859024787723e-05, + "loss": 0.798, + "step": 11510 + }, + { + "epoch": 0.1849146856289828, + "grad_norm": 0.5795508027076721, + "learning_rate": 4.89560569758167e-05, + "loss": 0.8797, + "step": 11520 + }, + { + "epoch": 0.18507520184914686, + "grad_norm": 0.5087544918060303, + "learning_rate": 4.895425340338107e-05, + "loss": 0.859, + "step": 11530 + }, + { + "epoch": 0.1852357180693109, + "grad_norm": 0.7043845653533936, + "learning_rate": 4.8952448307595516e-05, + "loss": 0.8274, + "step": 11540 + }, + { + "epoch": 0.18539623428947496, + "grad_norm": 0.81352698802948, + "learning_rate": 4.895064168857484e-05, + "loss": 0.9208, + "step": 11550 + }, + { + "epoch": 0.185556750509639, + "grad_norm": 0.6559650897979736, + "learning_rate": 4.894883354643392e-05, + "loss": 0.7067, + "step": 11560 + }, + { + "epoch": 0.18571726672980304, + "grad_norm": 1.0008224248886108, + "learning_rate": 4.894702388128776e-05, + "loss": 0.8512, + "step": 11570 + }, + { + "epoch": 0.1858777829499671, + "grad_norm": 0.7530532479286194, + "learning_rate": 4.894521269325143e-05, + "loss": 0.8827, + "step": 11580 + }, + { + "epoch": 0.18603829917013115, + "grad_norm": 0.5487101674079895, + "learning_rate": 4.894339998244012e-05, + "loss": 0.7643, + "step": 11590 + }, + { + "epoch": 0.1861988153902952, + "grad_norm": 0.520658552646637, + "learning_rate": 4.894158574896911e-05, + "loss": 0.8958, + "step": 11600 + }, + { + "epoch": 0.18635933161045923, + "grad_norm": 0.8748995065689087, + "learning_rate": 4.893976999295376e-05, + "loss": 0.7568, + "step": 11610 + }, + { + "epoch": 0.18651984783062328, + "grad_norm": 0.704588770866394, + "learning_rate": 4.8937952714509554e-05, + "loss": 0.8897, + "step": 11620 + }, + { + "epoch": 0.18668036405078733, + "grad_norm": 0.832264244556427, + "learning_rate": 4.8936133913752056e-05, + "loss": 0.8559, + "step": 11630 + }, + { + "epoch": 0.1868408802709514, + "grad_norm": 0.668847918510437, + "learning_rate": 4.8934313590796935e-05, + "loss": 0.9074, + "step": 11640 + }, + { + "epoch": 0.18700139649111544, + "grad_norm": 0.4560314118862152, + "learning_rate": 4.893249174575995e-05, + "loss": 0.8428, + "step": 11650 + }, + { + "epoch": 0.18716191271127947, + "grad_norm": 0.6987394690513611, + "learning_rate": 4.893066837875696e-05, + "loss": 0.9658, + "step": 11660 + }, + { + "epoch": 0.18732242893144352, + "grad_norm": 0.90873122215271, + "learning_rate": 4.892884348990391e-05, + "loss": 0.7619, + "step": 11670 + }, + { + "epoch": 0.18748294515160757, + "grad_norm": 0.7380156517028809, + "learning_rate": 4.892701707931687e-05, + "loss": 0.8696, + "step": 11680 + }, + { + "epoch": 0.18764346137177162, + "grad_norm": 0.5298711061477661, + "learning_rate": 4.8925189147111985e-05, + "loss": 0.947, + "step": 11690 + }, + { + "epoch": 0.18780397759193568, + "grad_norm": 0.7438448071479797, + "learning_rate": 4.892335969340549e-05, + "loss": 0.8806, + "step": 11700 + }, + { + "epoch": 0.1879644938120997, + "grad_norm": 0.8054751753807068, + "learning_rate": 4.8921528718313734e-05, + "loss": 0.9031, + "step": 11710 + }, + { + "epoch": 0.18812501003226376, + "grad_norm": 0.4906221032142639, + "learning_rate": 4.891969622195316e-05, + "loss": 0.9616, + "step": 11720 + }, + { + "epoch": 0.1882855262524278, + "grad_norm": 0.9057896733283997, + "learning_rate": 4.89178622044403e-05, + "loss": 0.8048, + "step": 11730 + }, + { + "epoch": 0.18844604247259186, + "grad_norm": 0.575579822063446, + "learning_rate": 4.891602666589179e-05, + "loss": 0.848, + "step": 11740 + }, + { + "epoch": 0.18860655869275592, + "grad_norm": 1.3101791143417358, + "learning_rate": 4.891418960642435e-05, + "loss": 0.7986, + "step": 11750 + }, + { + "epoch": 0.18876707491291994, + "grad_norm": 0.6164715886116028, + "learning_rate": 4.891235102615482e-05, + "loss": 0.9275, + "step": 11760 + }, + { + "epoch": 0.188927591133084, + "grad_norm": 0.7182970643043518, + "learning_rate": 4.8910510925200115e-05, + "loss": 0.8425, + "step": 11770 + }, + { + "epoch": 0.18908810735324805, + "grad_norm": 0.5174481868743896, + "learning_rate": 4.890866930367725e-05, + "loss": 0.8288, + "step": 11780 + }, + { + "epoch": 0.1892486235734121, + "grad_norm": 0.6631144285202026, + "learning_rate": 4.8906826161703355e-05, + "loss": 0.7348, + "step": 11790 + }, + { + "epoch": 0.18940913979357615, + "grad_norm": 0.680849015712738, + "learning_rate": 4.890498149939563e-05, + "loss": 0.8307, + "step": 11800 + }, + { + "epoch": 0.18956965601374018, + "grad_norm": 0.5609121322631836, + "learning_rate": 4.890313531687139e-05, + "loss": 0.8353, + "step": 11810 + }, + { + "epoch": 0.18973017223390423, + "grad_norm": 0.5094613432884216, + "learning_rate": 4.890128761424804e-05, + "loss": 0.852, + "step": 11820 + }, + { + "epoch": 0.18989068845406828, + "grad_norm": 3.5489320755004883, + "learning_rate": 4.889943839164309e-05, + "loss": 0.8015, + "step": 11830 + }, + { + "epoch": 0.19005120467423234, + "grad_norm": 0.436279833316803, + "learning_rate": 4.889758764917412e-05, + "loss": 0.7456, + "step": 11840 + }, + { + "epoch": 0.1902117208943964, + "grad_norm": 1.8942945003509521, + "learning_rate": 4.889573538695885e-05, + "loss": 0.8363, + "step": 11850 + }, + { + "epoch": 0.19037223711456042, + "grad_norm": 0.711031973361969, + "learning_rate": 4.889388160511506e-05, + "loss": 0.8225, + "step": 11860 + }, + { + "epoch": 0.19053275333472447, + "grad_norm": 0.8844591975212097, + "learning_rate": 4.8892026303760655e-05, + "loss": 0.8197, + "step": 11870 + }, + { + "epoch": 0.19069326955488852, + "grad_norm": 0.9295217394828796, + "learning_rate": 4.889016948301359e-05, + "loss": 0.8151, + "step": 11880 + }, + { + "epoch": 0.19085378577505258, + "grad_norm": 1.0230188369750977, + "learning_rate": 4.888831114299198e-05, + "loss": 0.8016, + "step": 11890 + }, + { + "epoch": 0.19101430199521663, + "grad_norm": 0.7457553744316101, + "learning_rate": 4.8886451283813994e-05, + "loss": 0.7102, + "step": 11900 + }, + { + "epoch": 0.19117481821538065, + "grad_norm": 0.5262829065322876, + "learning_rate": 4.88845899055979e-05, + "loss": 0.7502, + "step": 11910 + }, + { + "epoch": 0.1913353344355447, + "grad_norm": 1.521360993385315, + "learning_rate": 4.888272700846207e-05, + "loss": 0.8385, + "step": 11920 + }, + { + "epoch": 0.19149585065570876, + "grad_norm": 0.6730622053146362, + "learning_rate": 4.888086259252499e-05, + "loss": 0.7625, + "step": 11930 + }, + { + "epoch": 0.1916563668758728, + "grad_norm": 0.6845219135284424, + "learning_rate": 4.887899665790521e-05, + "loss": 0.885, + "step": 11940 + }, + { + "epoch": 0.19181688309603687, + "grad_norm": 0.6787387728691101, + "learning_rate": 4.887712920472139e-05, + "loss": 0.7132, + "step": 11950 + }, + { + "epoch": 0.1919773993162009, + "grad_norm": 0.25640878081321716, + "learning_rate": 4.887526023309231e-05, + "loss": 0.8774, + "step": 11960 + }, + { + "epoch": 0.19213791553636494, + "grad_norm": 0.6451675295829773, + "learning_rate": 4.887338974313681e-05, + "loss": 0.7807, + "step": 11970 + }, + { + "epoch": 0.192298431756529, + "grad_norm": 0.5974339842796326, + "learning_rate": 4.887151773497385e-05, + "loss": 0.844, + "step": 11980 + }, + { + "epoch": 0.19245894797669305, + "grad_norm": 0.42439132928848267, + "learning_rate": 4.886964420872247e-05, + "loss": 0.7911, + "step": 11990 + }, + { + "epoch": 0.1926194641968571, + "grad_norm": 1.0810766220092773, + "learning_rate": 4.886776916450181e-05, + "loss": 0.7546, + "step": 12000 + }, + { + "epoch": 0.1926194641968571, + "eval_loss": 0.8132067322731018, + "eval_runtime": 1833.4168, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 1.788, + "step": 12000 + }, + { + "epoch": 0.19277998041702113, + "grad_norm": 0.8802374601364136, + "learning_rate": 4.8865892602431124e-05, + "loss": 0.9244, + "step": 12010 + }, + { + "epoch": 0.19294049663718518, + "grad_norm": 0.8587452173233032, + "learning_rate": 4.8864014522629754e-05, + "loss": 0.8523, + "step": 12020 + }, + { + "epoch": 0.19310101285734924, + "grad_norm": 0.5683992505073547, + "learning_rate": 4.886213492521712e-05, + "loss": 0.7412, + "step": 12030 + }, + { + "epoch": 0.1932615290775133, + "grad_norm": 0.46220889687538147, + "learning_rate": 4.886025381031276e-05, + "loss": 0.8305, + "step": 12040 + }, + { + "epoch": 0.19342204529767734, + "grad_norm": 1.3861923217773438, + "learning_rate": 4.885837117803631e-05, + "loss": 0.8438, + "step": 12050 + }, + { + "epoch": 0.19358256151784137, + "grad_norm": 0.7454645037651062, + "learning_rate": 4.885648702850749e-05, + "loss": 0.8436, + "step": 12060 + }, + { + "epoch": 0.19374307773800542, + "grad_norm": 0.5441152453422546, + "learning_rate": 4.885460136184611e-05, + "loss": 0.764, + "step": 12070 + }, + { + "epoch": 0.19390359395816947, + "grad_norm": 0.8197323679924011, + "learning_rate": 4.88527141781721e-05, + "loss": 0.7927, + "step": 12080 + }, + { + "epoch": 0.19406411017833353, + "grad_norm": 0.7341776490211487, + "learning_rate": 4.8850825477605475e-05, + "loss": 0.8873, + "step": 12090 + }, + { + "epoch": 0.19422462639849758, + "grad_norm": 0.6845585107803345, + "learning_rate": 4.884893526026633e-05, + "loss": 0.8067, + "step": 12100 + }, + { + "epoch": 0.1943851426186616, + "grad_norm": 0.6292068958282471, + "learning_rate": 4.8847043526274885e-05, + "loss": 0.7866, + "step": 12110 + }, + { + "epoch": 0.19454565883882566, + "grad_norm": 0.6105372905731201, + "learning_rate": 4.884515027575144e-05, + "loss": 0.7092, + "step": 12120 + }, + { + "epoch": 0.1947061750589897, + "grad_norm": 0.9235106706619263, + "learning_rate": 4.88432555088164e-05, + "loss": 0.9024, + "step": 12130 + }, + { + "epoch": 0.19486669127915376, + "grad_norm": 0.5185039639472961, + "learning_rate": 4.884135922559024e-05, + "loss": 0.7645, + "step": 12140 + }, + { + "epoch": 0.19502720749931782, + "grad_norm": 0.5810807943344116, + "learning_rate": 4.8839461426193584e-05, + "loss": 0.7515, + "step": 12150 + }, + { + "epoch": 0.19518772371948184, + "grad_norm": 0.7526199221611023, + "learning_rate": 4.8837562110747096e-05, + "loss": 0.8337, + "step": 12160 + }, + { + "epoch": 0.1953482399396459, + "grad_norm": 0.8419879078865051, + "learning_rate": 4.8835661279371575e-05, + "loss": 0.8363, + "step": 12170 + }, + { + "epoch": 0.19550875615980995, + "grad_norm": 0.9004353284835815, + "learning_rate": 4.88337589321879e-05, + "loss": 0.8604, + "step": 12180 + }, + { + "epoch": 0.195669272379974, + "grad_norm": 0.5311726927757263, + "learning_rate": 4.8831855069317035e-05, + "loss": 0.87, + "step": 12190 + }, + { + "epoch": 0.19582978860013805, + "grad_norm": 0.7516449689865112, + "learning_rate": 4.882994969088007e-05, + "loss": 0.6459, + "step": 12200 + }, + { + "epoch": 0.19599030482030208, + "grad_norm": 0.7501307725906372, + "learning_rate": 4.882804279699818e-05, + "loss": 0.7784, + "step": 12210 + }, + { + "epoch": 0.19615082104046613, + "grad_norm": 1.1613565683364868, + "learning_rate": 4.882613438779262e-05, + "loss": 0.7226, + "step": 12220 + }, + { + "epoch": 0.1963113372606302, + "grad_norm": 0.6988255381584167, + "learning_rate": 4.882422446338476e-05, + "loss": 0.8108, + "step": 12230 + }, + { + "epoch": 0.19647185348079424, + "grad_norm": 1.152274489402771, + "learning_rate": 4.882231302389606e-05, + "loss": 0.8379, + "step": 12240 + }, + { + "epoch": 0.1966323697009583, + "grad_norm": 0.6547920107841492, + "learning_rate": 4.8820400069448066e-05, + "loss": 0.8416, + "step": 12250 + }, + { + "epoch": 0.19679288592112232, + "grad_norm": 0.5356700420379639, + "learning_rate": 4.881848560016244e-05, + "loss": 0.7473, + "step": 12260 + }, + { + "epoch": 0.19695340214128637, + "grad_norm": 0.6937645673751831, + "learning_rate": 4.8816569616160936e-05, + "loss": 0.9505, + "step": 12270 + }, + { + "epoch": 0.19711391836145042, + "grad_norm": 0.4069420397281647, + "learning_rate": 4.8814652117565396e-05, + "loss": 0.7711, + "step": 12280 + }, + { + "epoch": 0.19727443458161448, + "grad_norm": 1.081818699836731, + "learning_rate": 4.8812733104497755e-05, + "loss": 0.6925, + "step": 12290 + }, + { + "epoch": 0.19743495080177853, + "grad_norm": 0.7000516653060913, + "learning_rate": 4.8810812577080054e-05, + "loss": 0.8935, + "step": 12300 + }, + { + "epoch": 0.19759546702194256, + "grad_norm": 0.5766004920005798, + "learning_rate": 4.880889053543443e-05, + "loss": 0.739, + "step": 12310 + }, + { + "epoch": 0.1977559832421066, + "grad_norm": 0.5092904567718506, + "learning_rate": 4.8806966979683114e-05, + "loss": 0.9063, + "step": 12320 + }, + { + "epoch": 0.19791649946227066, + "grad_norm": 0.6116263270378113, + "learning_rate": 4.880504190994842e-05, + "loss": 0.799, + "step": 12330 + }, + { + "epoch": 0.19807701568243472, + "grad_norm": 0.5313858389854431, + "learning_rate": 4.88031153263528e-05, + "loss": 0.7538, + "step": 12340 + }, + { + "epoch": 0.19823753190259877, + "grad_norm": 0.7680122256278992, + "learning_rate": 4.880118722901874e-05, + "loss": 0.7452, + "step": 12350 + }, + { + "epoch": 0.1983980481227628, + "grad_norm": 1.0960530042648315, + "learning_rate": 4.879925761806887e-05, + "loss": 0.8384, + "step": 12360 + }, + { + "epoch": 0.19855856434292685, + "grad_norm": 0.7587502002716064, + "learning_rate": 4.879732649362592e-05, + "loss": 0.705, + "step": 12370 + }, + { + "epoch": 0.1987190805630909, + "grad_norm": 0.6712979078292847, + "learning_rate": 4.8795393855812665e-05, + "loss": 0.8748, + "step": 12380 + }, + { + "epoch": 0.19887959678325495, + "grad_norm": 0.856330931186676, + "learning_rate": 4.879345970475203e-05, + "loss": 0.7688, + "step": 12390 + }, + { + "epoch": 0.199040113003419, + "grad_norm": 0.6378706097602844, + "learning_rate": 4.879152404056702e-05, + "loss": 0.7321, + "step": 12400 + }, + { + "epoch": 0.19920062922358303, + "grad_norm": 0.6520065665245056, + "learning_rate": 4.878958686338071e-05, + "loss": 0.7892, + "step": 12410 + }, + { + "epoch": 0.19936114544374708, + "grad_norm": 0.8331012725830078, + "learning_rate": 4.8787648173316316e-05, + "loss": 0.9396, + "step": 12420 + }, + { + "epoch": 0.19952166166391114, + "grad_norm": 0.9308022856712341, + "learning_rate": 4.878570797049711e-05, + "loss": 0.6441, + "step": 12430 + }, + { + "epoch": 0.1996821778840752, + "grad_norm": 0.7707381844520569, + "learning_rate": 4.878376625504648e-05, + "loss": 0.8357, + "step": 12440 + }, + { + "epoch": 0.19984269410423924, + "grad_norm": 0.7358359694480896, + "learning_rate": 4.878182302708793e-05, + "loss": 0.7989, + "step": 12450 + }, + { + "epoch": 0.20000321032440327, + "grad_norm": 0.5844005942344666, + "learning_rate": 4.8779878286745e-05, + "loss": 0.895, + "step": 12460 + }, + { + "epoch": 0.20016372654456732, + "grad_norm": 0.6421613693237305, + "learning_rate": 4.8777932034141405e-05, + "loss": 0.9052, + "step": 12470 + }, + { + "epoch": 0.20032424276473138, + "grad_norm": 0.6037919521331787, + "learning_rate": 4.877598426940089e-05, + "loss": 0.8092, + "step": 12480 + }, + { + "epoch": 0.20048475898489543, + "grad_norm": 0.77273029088974, + "learning_rate": 4.8774034992647314e-05, + "loss": 0.8653, + "step": 12490 + }, + { + "epoch": 0.20064527520505948, + "grad_norm": 0.7956115007400513, + "learning_rate": 4.877208420400466e-05, + "loss": 0.8583, + "step": 12500 + }, + { + "epoch": 0.2008057914252235, + "grad_norm": 0.7663795351982117, + "learning_rate": 4.877013190359698e-05, + "loss": 0.7225, + "step": 12510 + }, + { + "epoch": 0.20096630764538756, + "grad_norm": 0.5765401124954224, + "learning_rate": 4.876817809154843e-05, + "loss": 0.9007, + "step": 12520 + }, + { + "epoch": 0.2011268238655516, + "grad_norm": 0.7025519013404846, + "learning_rate": 4.876622276798325e-05, + "loss": 0.7579, + "step": 12530 + }, + { + "epoch": 0.20128734008571567, + "grad_norm": 0.5902249813079834, + "learning_rate": 4.87642659330258e-05, + "loss": 0.9054, + "step": 12540 + }, + { + "epoch": 0.20144785630587972, + "grad_norm": 0.6048217415809631, + "learning_rate": 4.8762307586800524e-05, + "loss": 0.8682, + "step": 12550 + }, + { + "epoch": 0.20160837252604374, + "grad_norm": 0.8684887290000916, + "learning_rate": 4.8760347729431946e-05, + "loss": 0.7521, + "step": 12560 + }, + { + "epoch": 0.2017688887462078, + "grad_norm": 0.8083131909370422, + "learning_rate": 4.8758386361044716e-05, + "loss": 0.8565, + "step": 12570 + }, + { + "epoch": 0.20192940496637185, + "grad_norm": 0.6710031628608704, + "learning_rate": 4.8756423481763564e-05, + "loss": 0.7301, + "step": 12580 + }, + { + "epoch": 0.2020899211865359, + "grad_norm": 0.5912664532661438, + "learning_rate": 4.875445909171331e-05, + "loss": 0.8147, + "step": 12590 + }, + { + "epoch": 0.20225043740669996, + "grad_norm": 0.5153264999389648, + "learning_rate": 4.875249319101889e-05, + "loss": 0.8969, + "step": 12600 + }, + { + "epoch": 0.20241095362686398, + "grad_norm": 0.8006905913352966, + "learning_rate": 4.875052577980531e-05, + "loss": 0.7789, + "step": 12610 + }, + { + "epoch": 0.20257146984702804, + "grad_norm": 0.806279718875885, + "learning_rate": 4.874855685819769e-05, + "loss": 0.8689, + "step": 12620 + }, + { + "epoch": 0.2027319860671921, + "grad_norm": 0.5468489527702332, + "learning_rate": 4.8746586426321244e-05, + "loss": 0.7215, + "step": 12630 + }, + { + "epoch": 0.20289250228735614, + "grad_norm": 0.35073840618133545, + "learning_rate": 4.8744614484301285e-05, + "loss": 0.7387, + "step": 12640 + }, + { + "epoch": 0.2030530185075202, + "grad_norm": 0.46115097403526306, + "learning_rate": 4.87426410322632e-05, + "loss": 0.8136, + "step": 12650 + }, + { + "epoch": 0.20321353472768422, + "grad_norm": 0.8414697647094727, + "learning_rate": 4.874066607033252e-05, + "loss": 0.7697, + "step": 12660 + }, + { + "epoch": 0.20337405094784827, + "grad_norm": 0.5322723388671875, + "learning_rate": 4.87386895986348e-05, + "loss": 0.7372, + "step": 12670 + }, + { + "epoch": 0.20353456716801233, + "grad_norm": 0.6220024824142456, + "learning_rate": 4.873671161729577e-05, + "loss": 0.7756, + "step": 12680 + }, + { + "epoch": 0.20369508338817638, + "grad_norm": 0.8658562898635864, + "learning_rate": 4.8734732126441196e-05, + "loss": 0.844, + "step": 12690 + }, + { + "epoch": 0.20385559960834043, + "grad_norm": 3.5017833709716797, + "learning_rate": 4.873275112619696e-05, + "loss": 0.72, + "step": 12700 + }, + { + "epoch": 0.20401611582850446, + "grad_norm": 0.6345027089118958, + "learning_rate": 4.873076861668906e-05, + "loss": 0.9038, + "step": 12710 + }, + { + "epoch": 0.2041766320486685, + "grad_norm": 0.9347177743911743, + "learning_rate": 4.8728784598043556e-05, + "loss": 0.812, + "step": 12720 + }, + { + "epoch": 0.20433714826883256, + "grad_norm": 0.3330080509185791, + "learning_rate": 4.872679907038663e-05, + "loss": 0.8021, + "step": 12730 + }, + { + "epoch": 0.20449766448899662, + "grad_norm": 0.6588268876075745, + "learning_rate": 4.872481203384454e-05, + "loss": 0.8127, + "step": 12740 + }, + { + "epoch": 0.20465818070916067, + "grad_norm": 0.4912445545196533, + "learning_rate": 4.872282348854366e-05, + "loss": 0.7959, + "step": 12750 + }, + { + "epoch": 0.2048186969293247, + "grad_norm": 0.4277562201023102, + "learning_rate": 4.872083343461044e-05, + "loss": 0.7059, + "step": 12760 + }, + { + "epoch": 0.20497921314948875, + "grad_norm": 3.2408454418182373, + "learning_rate": 4.8718841872171446e-05, + "loss": 0.7798, + "step": 12770 + }, + { + "epoch": 0.2051397293696528, + "grad_norm": 0.5868845582008362, + "learning_rate": 4.871684880135332e-05, + "loss": 0.9376, + "step": 12780 + }, + { + "epoch": 0.20530024558981685, + "grad_norm": 0.6987655758857727, + "learning_rate": 4.8714854222282815e-05, + "loss": 0.7153, + "step": 12790 + }, + { + "epoch": 0.2054607618099809, + "grad_norm": 0.5804741382598877, + "learning_rate": 4.871285813508678e-05, + "loss": 0.8214, + "step": 12800 + }, + { + "epoch": 0.20562127803014493, + "grad_norm": 0.7555805444717407, + "learning_rate": 4.871086053989213e-05, + "loss": 0.7628, + "step": 12810 + }, + { + "epoch": 0.20578179425030899, + "grad_norm": 0.6590709686279297, + "learning_rate": 4.870886143682593e-05, + "loss": 0.8159, + "step": 12820 + }, + { + "epoch": 0.20594231047047304, + "grad_norm": 0.8610821962356567, + "learning_rate": 4.8706860826015305e-05, + "loss": 0.8224, + "step": 12830 + }, + { + "epoch": 0.2061028266906371, + "grad_norm": 1.0005724430084229, + "learning_rate": 4.870485870758747e-05, + "loss": 0.7642, + "step": 12840 + }, + { + "epoch": 0.20626334291080115, + "grad_norm": 0.523483395576477, + "learning_rate": 4.870285508166976e-05, + "loss": 0.832, + "step": 12850 + }, + { + "epoch": 0.20642385913096517, + "grad_norm": 0.6513292193412781, + "learning_rate": 4.870084994838958e-05, + "loss": 0.6602, + "step": 12860 + }, + { + "epoch": 0.20658437535112922, + "grad_norm": 0.5926905274391174, + "learning_rate": 4.869884330787446e-05, + "loss": 0.8309, + "step": 12870 + }, + { + "epoch": 0.20674489157129328, + "grad_norm": 0.4304194450378418, + "learning_rate": 4.8696835160252e-05, + "loss": 0.7802, + "step": 12880 + }, + { + "epoch": 0.20690540779145733, + "grad_norm": 0.9892992973327637, + "learning_rate": 4.869482550564991e-05, + "loss": 0.7657, + "step": 12890 + }, + { + "epoch": 0.20706592401162138, + "grad_norm": 0.674333393573761, + "learning_rate": 4.8692814344196e-05, + "loss": 0.8954, + "step": 12900 + }, + { + "epoch": 0.2072264402317854, + "grad_norm": 2.8069934844970703, + "learning_rate": 4.869080167601815e-05, + "loss": 0.8113, + "step": 12910 + }, + { + "epoch": 0.20738695645194946, + "grad_norm": 0.7997820377349854, + "learning_rate": 4.868878750124437e-05, + "loss": 0.7255, + "step": 12920 + }, + { + "epoch": 0.20754747267211351, + "grad_norm": 0.46556082367897034, + "learning_rate": 4.868677182000274e-05, + "loss": 0.6889, + "step": 12930 + }, + { + "epoch": 0.20770798889227757, + "grad_norm": 0.6469829082489014, + "learning_rate": 4.8684754632421456e-05, + "loss": 0.9389, + "step": 12940 + }, + { + "epoch": 0.20786850511244162, + "grad_norm": 0.8764975070953369, + "learning_rate": 4.868273593862879e-05, + "loss": 0.8323, + "step": 12950 + }, + { + "epoch": 0.20802902133260567, + "grad_norm": 0.6434170603752136, + "learning_rate": 4.868071573875311e-05, + "loss": 0.7311, + "step": 12960 + }, + { + "epoch": 0.2081895375527697, + "grad_norm": 0.38058286905288696, + "learning_rate": 4.867869403292292e-05, + "loss": 0.8407, + "step": 12970 + }, + { + "epoch": 0.20835005377293375, + "grad_norm": 0.7341654300689697, + "learning_rate": 4.867667082126676e-05, + "loss": 0.7753, + "step": 12980 + }, + { + "epoch": 0.2085105699930978, + "grad_norm": 0.6672992706298828, + "learning_rate": 4.86746461039133e-05, + "loss": 0.7907, + "step": 12990 + }, + { + "epoch": 0.20867108621326186, + "grad_norm": 0.9220626354217529, + "learning_rate": 4.867261988099131e-05, + "loss": 0.8581, + "step": 13000 + }, + { + "epoch": 0.2088316024334259, + "grad_norm": 0.6023329496383667, + "learning_rate": 4.867059215262964e-05, + "loss": 0.7083, + "step": 13010 + }, + { + "epoch": 0.20899211865358994, + "grad_norm": 0.6796655654907227, + "learning_rate": 4.866856291895724e-05, + "loss": 0.7828, + "step": 13020 + }, + { + "epoch": 0.209152634873754, + "grad_norm": 0.6681435704231262, + "learning_rate": 4.866653218010315e-05, + "loss": 0.8905, + "step": 13030 + }, + { + "epoch": 0.20931315109391804, + "grad_norm": 0.985504686832428, + "learning_rate": 4.866449993619654e-05, + "loss": 0.8972, + "step": 13040 + }, + { + "epoch": 0.2094736673140821, + "grad_norm": 1.033186674118042, + "learning_rate": 4.866246618736662e-05, + "loss": 0.8741, + "step": 13050 + }, + { + "epoch": 0.20963418353424615, + "grad_norm": 0.7205432653427124, + "learning_rate": 4.866043093374273e-05, + "loss": 0.8088, + "step": 13060 + }, + { + "epoch": 0.20979469975441017, + "grad_norm": 1.0006065368652344, + "learning_rate": 4.8658394175454315e-05, + "loss": 0.6488, + "step": 13070 + }, + { + "epoch": 0.20995521597457423, + "grad_norm": 0.5281996130943298, + "learning_rate": 4.865635591263088e-05, + "loss": 0.8431, + "step": 13080 + }, + { + "epoch": 0.21011573219473828, + "grad_norm": 0.3656073808670044, + "learning_rate": 4.8654316145402065e-05, + "loss": 0.7504, + "step": 13090 + }, + { + "epoch": 0.21027624841490233, + "grad_norm": 0.890722393989563, + "learning_rate": 4.8652274873897584e-05, + "loss": 0.6791, + "step": 13100 + }, + { + "epoch": 0.2104367646350664, + "grad_norm": 0.642337441444397, + "learning_rate": 4.865023209824724e-05, + "loss": 0.738, + "step": 13110 + }, + { + "epoch": 0.2105972808552304, + "grad_norm": 0.6358085870742798, + "learning_rate": 4.8648187818580945e-05, + "loss": 0.7684, + "step": 13120 + }, + { + "epoch": 0.21075779707539447, + "grad_norm": 0.44458454847335815, + "learning_rate": 4.864614203502871e-05, + "loss": 0.6716, + "step": 13130 + }, + { + "epoch": 0.21091831329555852, + "grad_norm": 0.7113595604896545, + "learning_rate": 4.864409474772063e-05, + "loss": 0.7073, + "step": 13140 + }, + { + "epoch": 0.21107882951572257, + "grad_norm": 0.7350926399230957, + "learning_rate": 4.8642045956786905e-05, + "loss": 0.7707, + "step": 13150 + }, + { + "epoch": 0.21123934573588662, + "grad_norm": 0.7003143429756165, + "learning_rate": 4.863999566235781e-05, + "loss": 0.7943, + "step": 13160 + }, + { + "epoch": 0.21139986195605065, + "grad_norm": 0.838293731212616, + "learning_rate": 4.863794386456375e-05, + "loss": 0.8123, + "step": 13170 + }, + { + "epoch": 0.2115603781762147, + "grad_norm": 0.6116281747817993, + "learning_rate": 4.86358905635352e-05, + "loss": 0.7295, + "step": 13180 + }, + { + "epoch": 0.21172089439637876, + "grad_norm": 0.6048389077186584, + "learning_rate": 4.8633835759402745e-05, + "loss": 0.7909, + "step": 13190 + }, + { + "epoch": 0.2118814106165428, + "grad_norm": 0.8293211460113525, + "learning_rate": 4.863177945229704e-05, + "loss": 0.7939, + "step": 13200 + }, + { + "epoch": 0.21204192683670686, + "grad_norm": 0.7670401334762573, + "learning_rate": 4.862972164234888e-05, + "loss": 0.7868, + "step": 13210 + }, + { + "epoch": 0.2122024430568709, + "grad_norm": 0.5840343236923218, + "learning_rate": 4.8627662329689114e-05, + "loss": 0.8959, + "step": 13220 + }, + { + "epoch": 0.21236295927703494, + "grad_norm": 0.5325667858123779, + "learning_rate": 4.862560151444869e-05, + "loss": 0.7167, + "step": 13230 + }, + { + "epoch": 0.212523475497199, + "grad_norm": 0.5467821955680847, + "learning_rate": 4.862353919675869e-05, + "loss": 0.7751, + "step": 13240 + }, + { + "epoch": 0.21268399171736305, + "grad_norm": 0.5489977598190308, + "learning_rate": 4.862147537675026e-05, + "loss": 0.8228, + "step": 13250 + }, + { + "epoch": 0.2128445079375271, + "grad_norm": 0.9548693895339966, + "learning_rate": 4.8619410054554625e-05, + "loss": 0.7746, + "step": 13260 + }, + { + "epoch": 0.21300502415769113, + "grad_norm": 0.4440135955810547, + "learning_rate": 4.8617343230303146e-05, + "loss": 0.7611, + "step": 13270 + }, + { + "epoch": 0.21316554037785518, + "grad_norm": 0.5570626258850098, + "learning_rate": 4.861527490412726e-05, + "loss": 0.8055, + "step": 13280 + }, + { + "epoch": 0.21332605659801923, + "grad_norm": 0.7159692645072937, + "learning_rate": 4.86132050761585e-05, + "loss": 0.8394, + "step": 13290 + }, + { + "epoch": 0.21348657281818328, + "grad_norm": 0.9184303879737854, + "learning_rate": 4.861113374652849e-05, + "loss": 0.6441, + "step": 13300 + }, + { + "epoch": 0.21364708903834734, + "grad_norm": 1.1139557361602783, + "learning_rate": 4.8609060915368956e-05, + "loss": 0.9352, + "step": 13310 + }, + { + "epoch": 0.21380760525851136, + "grad_norm": 0.7093321681022644, + "learning_rate": 4.860698658281172e-05, + "loss": 0.7737, + "step": 13320 + }, + { + "epoch": 0.21396812147867542, + "grad_norm": 0.8025379180908203, + "learning_rate": 4.8604910748988694e-05, + "loss": 0.7934, + "step": 13330 + }, + { + "epoch": 0.21412863769883947, + "grad_norm": 0.6841307282447815, + "learning_rate": 4.86028334140319e-05, + "loss": 0.788, + "step": 13340 + }, + { + "epoch": 0.21428915391900352, + "grad_norm": 1.1647191047668457, + "learning_rate": 4.860075457807343e-05, + "loss": 0.8216, + "step": 13350 + }, + { + "epoch": 0.21444967013916758, + "grad_norm": 0.5426352024078369, + "learning_rate": 4.859867424124548e-05, + "loss": 0.9177, + "step": 13360 + }, + { + "epoch": 0.2146101863593316, + "grad_norm": 0.683161735534668, + "learning_rate": 4.859659240368037e-05, + "loss": 0.7969, + "step": 13370 + }, + { + "epoch": 0.21477070257949565, + "grad_norm": 0.572302520275116, + "learning_rate": 4.8594509065510485e-05, + "loss": 0.7564, + "step": 13380 + }, + { + "epoch": 0.2149312187996597, + "grad_norm": 0.6772443652153015, + "learning_rate": 4.85924242268683e-05, + "loss": 0.6944, + "step": 13390 + }, + { + "epoch": 0.21509173501982376, + "grad_norm": 0.3862336277961731, + "learning_rate": 4.859033788788641e-05, + "loss": 0.7431, + "step": 13400 + }, + { + "epoch": 0.2152522512399878, + "grad_norm": 0.73334139585495, + "learning_rate": 4.858825004869749e-05, + "loss": 0.7819, + "step": 13410 + }, + { + "epoch": 0.21541276746015184, + "grad_norm": 0.9421506524085999, + "learning_rate": 4.8586160709434325e-05, + "loss": 0.7482, + "step": 13420 + }, + { + "epoch": 0.2155732836803159, + "grad_norm": 0.4510180354118347, + "learning_rate": 4.858406987022977e-05, + "loss": 0.8517, + "step": 13430 + }, + { + "epoch": 0.21573379990047994, + "grad_norm": 0.9711111783981323, + "learning_rate": 4.8581977531216795e-05, + "loss": 0.6764, + "step": 13440 + }, + { + "epoch": 0.215894316120644, + "grad_norm": 0.761773943901062, + "learning_rate": 4.857988369252846e-05, + "loss": 0.7242, + "step": 13450 + }, + { + "epoch": 0.21605483234080805, + "grad_norm": 0.9307165741920471, + "learning_rate": 4.857778835429792e-05, + "loss": 0.8162, + "step": 13460 + }, + { + "epoch": 0.21621534856097208, + "grad_norm": 0.7644413709640503, + "learning_rate": 4.857569151665843e-05, + "loss": 0.7672, + "step": 13470 + }, + { + "epoch": 0.21637586478113613, + "grad_norm": 0.8483150601387024, + "learning_rate": 4.857359317974334e-05, + "loss": 0.696, + "step": 13480 + }, + { + "epoch": 0.21653638100130018, + "grad_norm": 0.6497825384140015, + "learning_rate": 4.857149334368608e-05, + "loss": 0.7425, + "step": 13490 + }, + { + "epoch": 0.21669689722146424, + "grad_norm": 0.4644838571548462, + "learning_rate": 4.8569392008620194e-05, + "loss": 0.728, + "step": 13500 + }, + { + "epoch": 0.2168574134416283, + "grad_norm": 0.6722881197929382, + "learning_rate": 4.856728917467932e-05, + "loss": 0.8209, + "step": 13510 + }, + { + "epoch": 0.2170179296617923, + "grad_norm": 0.6383801698684692, + "learning_rate": 4.856518484199718e-05, + "loss": 0.8846, + "step": 13520 + }, + { + "epoch": 0.21717844588195637, + "grad_norm": 0.4729352295398712, + "learning_rate": 4.856307901070759e-05, + "loss": 0.8848, + "step": 13530 + }, + { + "epoch": 0.21733896210212042, + "grad_norm": 0.5868790745735168, + "learning_rate": 4.856097168094448e-05, + "loss": 0.885, + "step": 13540 + }, + { + "epoch": 0.21749947832228447, + "grad_norm": 0.7673324942588806, + "learning_rate": 4.855886285284187e-05, + "loss": 0.8196, + "step": 13550 + }, + { + "epoch": 0.21765999454244853, + "grad_norm": 1.2812150716781616, + "learning_rate": 4.8556752526533845e-05, + "loss": 0.9778, + "step": 13560 + }, + { + "epoch": 0.21782051076261255, + "grad_norm": 0.5946052670478821, + "learning_rate": 4.855464070215463e-05, + "loss": 0.8175, + "step": 13570 + }, + { + "epoch": 0.2179810269827766, + "grad_norm": 0.937473475933075, + "learning_rate": 4.855252737983852e-05, + "loss": 0.8613, + "step": 13580 + }, + { + "epoch": 0.21814154320294066, + "grad_norm": 0.6231008768081665, + "learning_rate": 4.85504125597199e-05, + "loss": 0.913, + "step": 13590 + }, + { + "epoch": 0.2183020594231047, + "grad_norm": 0.4079624116420746, + "learning_rate": 4.854829624193328e-05, + "loss": 0.7529, + "step": 13600 + }, + { + "epoch": 0.21846257564326876, + "grad_norm": 0.6336984038352966, + "learning_rate": 4.854617842661322e-05, + "loss": 0.9183, + "step": 13610 + }, + { + "epoch": 0.2186230918634328, + "grad_norm": 0.5060218572616577, + "learning_rate": 4.8544059113894425e-05, + "loss": 0.7857, + "step": 13620 + }, + { + "epoch": 0.21878360808359684, + "grad_norm": 0.6043427586555481, + "learning_rate": 4.854193830391165e-05, + "loss": 0.7394, + "step": 13630 + }, + { + "epoch": 0.2189441243037609, + "grad_norm": 0.7516580820083618, + "learning_rate": 4.8539815996799784e-05, + "loss": 0.9399, + "step": 13640 + }, + { + "epoch": 0.21910464052392495, + "grad_norm": 0.6188008189201355, + "learning_rate": 4.853769219269379e-05, + "loss": 0.8602, + "step": 13650 + }, + { + "epoch": 0.219265156744089, + "grad_norm": 0.7574178576469421, + "learning_rate": 4.853556689172871e-05, + "loss": 0.9059, + "step": 13660 + }, + { + "epoch": 0.21942567296425303, + "grad_norm": 0.7071336507797241, + "learning_rate": 4.853344009403972e-05, + "loss": 0.7056, + "step": 13670 + }, + { + "epoch": 0.21958618918441708, + "grad_norm": 0.6208372116088867, + "learning_rate": 4.853131179976207e-05, + "loss": 0.7558, + "step": 13680 + }, + { + "epoch": 0.21974670540458113, + "grad_norm": 0.6016774773597717, + "learning_rate": 4.8529182009031106e-05, + "loss": 0.8255, + "step": 13690 + }, + { + "epoch": 0.21990722162474519, + "grad_norm": 0.7173605561256409, + "learning_rate": 4.852705072198227e-05, + "loss": 0.7293, + "step": 13700 + }, + { + "epoch": 0.22006773784490924, + "grad_norm": 0.591118574142456, + "learning_rate": 4.852491793875109e-05, + "loss": 0.8333, + "step": 13710 + }, + { + "epoch": 0.22022825406507326, + "grad_norm": 0.5249565839767456, + "learning_rate": 4.852278365947321e-05, + "loss": 0.6992, + "step": 13720 + }, + { + "epoch": 0.22038877028523732, + "grad_norm": 0.6611059308052063, + "learning_rate": 4.852064788428436e-05, + "loss": 0.7971, + "step": 13730 + }, + { + "epoch": 0.22054928650540137, + "grad_norm": 0.8350611329078674, + "learning_rate": 4.8518510613320355e-05, + "loss": 0.6873, + "step": 13740 + }, + { + "epoch": 0.22070980272556542, + "grad_norm": 0.8024451732635498, + "learning_rate": 4.8516371846717115e-05, + "loss": 0.687, + "step": 13750 + }, + { + "epoch": 0.22087031894572948, + "grad_norm": 1.2762309312820435, + "learning_rate": 4.851423158461065e-05, + "loss": 0.7638, + "step": 13760 + }, + { + "epoch": 0.2210308351658935, + "grad_norm": 0.7608177065849304, + "learning_rate": 4.851208982713706e-05, + "loss": 0.6597, + "step": 13770 + }, + { + "epoch": 0.22119135138605756, + "grad_norm": 0.6529844999313354, + "learning_rate": 4.850994657443257e-05, + "loss": 0.8813, + "step": 13780 + }, + { + "epoch": 0.2213518676062216, + "grad_norm": 0.9952006340026855, + "learning_rate": 4.850780182663347e-05, + "loss": 0.8334, + "step": 13790 + }, + { + "epoch": 0.22151238382638566, + "grad_norm": 0.5059579610824585, + "learning_rate": 4.850565558387615e-05, + "loss": 0.9252, + "step": 13800 + }, + { + "epoch": 0.22167290004654971, + "grad_norm": 0.6789982914924622, + "learning_rate": 4.850350784629709e-05, + "loss": 0.7137, + "step": 13810 + }, + { + "epoch": 0.22183341626671374, + "grad_norm": 0.5182785987854004, + "learning_rate": 4.850135861403289e-05, + "loss": 0.7943, + "step": 13820 + }, + { + "epoch": 0.2219939324868778, + "grad_norm": 0.8834701776504517, + "learning_rate": 4.849920788722022e-05, + "loss": 0.9162, + "step": 13830 + }, + { + "epoch": 0.22215444870704185, + "grad_norm": 0.6992667317390442, + "learning_rate": 4.8497055665995854e-05, + "loss": 0.7412, + "step": 13840 + }, + { + "epoch": 0.2223149649272059, + "grad_norm": 0.42296481132507324, + "learning_rate": 4.8494901950496665e-05, + "loss": 0.8301, + "step": 13850 + }, + { + "epoch": 0.22247548114736995, + "grad_norm": 0.610984742641449, + "learning_rate": 4.849274674085961e-05, + "loss": 0.807, + "step": 13860 + }, + { + "epoch": 0.22263599736753398, + "grad_norm": 0.755004346370697, + "learning_rate": 4.849059003722175e-05, + "loss": 0.8933, + "step": 13870 + }, + { + "epoch": 0.22279651358769803, + "grad_norm": 0.5799021124839783, + "learning_rate": 4.848843183972024e-05, + "loss": 0.829, + "step": 13880 + }, + { + "epoch": 0.22295702980786208, + "grad_norm": 0.9081137180328369, + "learning_rate": 4.848627214849233e-05, + "loss": 0.7962, + "step": 13890 + }, + { + "epoch": 0.22311754602802614, + "grad_norm": 0.67518150806427, + "learning_rate": 4.8484110963675354e-05, + "loss": 0.9081, + "step": 13900 + }, + { + "epoch": 0.2232780622481902, + "grad_norm": 0.5436902642250061, + "learning_rate": 4.848194828540676e-05, + "loss": 0.7019, + "step": 13910 + }, + { + "epoch": 0.22343857846835422, + "grad_norm": 0.9985558986663818, + "learning_rate": 4.847978411382409e-05, + "loss": 0.8431, + "step": 13920 + }, + { + "epoch": 0.22359909468851827, + "grad_norm": 1.3048858642578125, + "learning_rate": 4.847761844906495e-05, + "loss": 0.8131, + "step": 13930 + }, + { + "epoch": 0.22375961090868232, + "grad_norm": 0.8139610886573792, + "learning_rate": 4.847545129126709e-05, + "loss": 0.7428, + "step": 13940 + }, + { + "epoch": 0.22392012712884637, + "grad_norm": 1.1074861288070679, + "learning_rate": 4.847328264056831e-05, + "loss": 0.7167, + "step": 13950 + }, + { + "epoch": 0.22408064334901043, + "grad_norm": 0.82290118932724, + "learning_rate": 4.8471112497106524e-05, + "loss": 0.6778, + "step": 13960 + }, + { + "epoch": 0.22424115956917445, + "grad_norm": 0.6848230957984924, + "learning_rate": 4.8468940861019754e-05, + "loss": 0.7589, + "step": 13970 + }, + { + "epoch": 0.2244016757893385, + "grad_norm": 0.5549249053001404, + "learning_rate": 4.8466767732446084e-05, + "loss": 0.7552, + "step": 13980 + }, + { + "epoch": 0.22456219200950256, + "grad_norm": 0.7346891760826111, + "learning_rate": 4.846459311152372e-05, + "loss": 0.8555, + "step": 13990 + }, + { + "epoch": 0.2247227082296666, + "grad_norm": 0.5167901515960693, + "learning_rate": 4.846241699839096e-05, + "loss": 0.7615, + "step": 14000 + }, + { + "epoch": 0.22488322444983067, + "grad_norm": 0.5887021422386169, + "learning_rate": 4.8460239393186196e-05, + "loss": 0.8153, + "step": 14010 + }, + { + "epoch": 0.2250437406699947, + "grad_norm": 0.5884560346603394, + "learning_rate": 4.84580602960479e-05, + "loss": 0.8351, + "step": 14020 + }, + { + "epoch": 0.22520425689015874, + "grad_norm": 0.5950517654418945, + "learning_rate": 4.845587970711465e-05, + "loss": 0.9171, + "step": 14030 + }, + { + "epoch": 0.2253647731103228, + "grad_norm": 0.5549572110176086, + "learning_rate": 4.845369762652513e-05, + "loss": 0.7261, + "step": 14040 + }, + { + "epoch": 0.22552528933048685, + "grad_norm": 0.6446338295936584, + "learning_rate": 4.845151405441809e-05, + "loss": 0.8158, + "step": 14050 + }, + { + "epoch": 0.2256858055506509, + "grad_norm": 0.7633540630340576, + "learning_rate": 4.844932899093242e-05, + "loss": 0.7201, + "step": 14060 + }, + { + "epoch": 0.22584632177081493, + "grad_norm": 0.5280660390853882, + "learning_rate": 4.844714243620704e-05, + "loss": 0.7456, + "step": 14070 + }, + { + "epoch": 0.22600683799097898, + "grad_norm": 0.7394150495529175, + "learning_rate": 4.8444954390381035e-05, + "loss": 0.9771, + "step": 14080 + }, + { + "epoch": 0.22616735421114303, + "grad_norm": 0.9522365927696228, + "learning_rate": 4.8442764853593534e-05, + "loss": 0.7923, + "step": 14090 + }, + { + "epoch": 0.2263278704313071, + "grad_norm": 0.7995865941047668, + "learning_rate": 4.844057382598378e-05, + "loss": 0.7481, + "step": 14100 + }, + { + "epoch": 0.22648838665147114, + "grad_norm": 1.8771640062332153, + "learning_rate": 4.843838130769112e-05, + "loss": 0.7385, + "step": 14110 + }, + { + "epoch": 0.22664890287163517, + "grad_norm": 0.90758216381073, + "learning_rate": 4.8436187298854976e-05, + "loss": 0.9146, + "step": 14120 + }, + { + "epoch": 0.22680941909179922, + "grad_norm": 0.5905675888061523, + "learning_rate": 4.843399179961488e-05, + "loss": 0.9061, + "step": 14130 + }, + { + "epoch": 0.22696993531196327, + "grad_norm": 0.6033791303634644, + "learning_rate": 4.8431794810110445e-05, + "loss": 0.8191, + "step": 14140 + }, + { + "epoch": 0.22713045153212733, + "grad_norm": 0.7500686049461365, + "learning_rate": 4.842959633048139e-05, + "loss": 0.7387, + "step": 14150 + }, + { + "epoch": 0.22729096775229138, + "grad_norm": 0.9297177791595459, + "learning_rate": 4.8427396360867524e-05, + "loss": 0.7273, + "step": 14160 + }, + { + "epoch": 0.2274514839724554, + "grad_norm": 1.0234181880950928, + "learning_rate": 4.842519490140876e-05, + "loss": 0.8484, + "step": 14170 + }, + { + "epoch": 0.22761200019261946, + "grad_norm": 0.6142573952674866, + "learning_rate": 4.842299195224508e-05, + "loss": 0.826, + "step": 14180 + }, + { + "epoch": 0.2277725164127835, + "grad_norm": 0.6258146166801453, + "learning_rate": 4.8420787513516604e-05, + "loss": 0.7599, + "step": 14190 + }, + { + "epoch": 0.22793303263294756, + "grad_norm": 0.8744311928749084, + "learning_rate": 4.8418581585363496e-05, + "loss": 0.8634, + "step": 14200 + }, + { + "epoch": 0.22809354885311162, + "grad_norm": 0.7451462745666504, + "learning_rate": 4.841637416792607e-05, + "loss": 0.8218, + "step": 14210 + }, + { + "epoch": 0.22825406507327564, + "grad_norm": 0.43015557527542114, + "learning_rate": 4.8414165261344676e-05, + "loss": 0.8676, + "step": 14220 + }, + { + "epoch": 0.2284145812934397, + "grad_norm": 0.7763070464134216, + "learning_rate": 4.84119548657598e-05, + "loss": 0.7707, + "step": 14230 + }, + { + "epoch": 0.22857509751360375, + "grad_norm": 0.6814308166503906, + "learning_rate": 4.8409742981312e-05, + "loss": 0.9265, + "step": 14240 + }, + { + "epoch": 0.2287356137337678, + "grad_norm": 0.6307319402694702, + "learning_rate": 4.840752960814197e-05, + "loss": 0.7913, + "step": 14250 + }, + { + "epoch": 0.22889612995393185, + "grad_norm": 0.6133452653884888, + "learning_rate": 4.8405314746390424e-05, + "loss": 0.7896, + "step": 14260 + }, + { + "epoch": 0.22905664617409588, + "grad_norm": 0.5222846865653992, + "learning_rate": 4.8403098396198243e-05, + "loss": 0.7761, + "step": 14270 + }, + { + "epoch": 0.22921716239425993, + "grad_norm": 0.5986525416374207, + "learning_rate": 4.8400880557706365e-05, + "loss": 0.8821, + "step": 14280 + }, + { + "epoch": 0.22937767861442399, + "grad_norm": 0.6499159336090088, + "learning_rate": 4.839866123105583e-05, + "loss": 0.7973, + "step": 14290 + }, + { + "epoch": 0.22953819483458804, + "grad_norm": 0.6045275330543518, + "learning_rate": 4.839644041638778e-05, + "loss": 0.7991, + "step": 14300 + }, + { + "epoch": 0.2296987110547521, + "grad_norm": 0.7771904468536377, + "learning_rate": 4.8394218113843445e-05, + "loss": 0.8734, + "step": 14310 + }, + { + "epoch": 0.22985922727491612, + "grad_norm": 0.6543309092521667, + "learning_rate": 4.839199432356415e-05, + "loss": 0.7864, + "step": 14320 + }, + { + "epoch": 0.23001974349508017, + "grad_norm": 0.4549863338470459, + "learning_rate": 4.8389769045691306e-05, + "loss": 0.7777, + "step": 14330 + }, + { + "epoch": 0.23018025971524422, + "grad_norm": 0.7188288569450378, + "learning_rate": 4.838754228036644e-05, + "loss": 0.8142, + "step": 14340 + }, + { + "epoch": 0.23034077593540828, + "grad_norm": 0.5713337063789368, + "learning_rate": 4.838531402773115e-05, + "loss": 0.8555, + "step": 14350 + }, + { + "epoch": 0.23050129215557233, + "grad_norm": 0.6994351744651794, + "learning_rate": 4.838308428792716e-05, + "loss": 0.7662, + "step": 14360 + }, + { + "epoch": 0.23066180837573635, + "grad_norm": 0.5737929344177246, + "learning_rate": 4.838085306109624e-05, + "loss": 0.9961, + "step": 14370 + }, + { + "epoch": 0.2308223245959004, + "grad_norm": 0.49600979685783386, + "learning_rate": 4.83786203473803e-05, + "loss": 0.773, + "step": 14380 + }, + { + "epoch": 0.23098284081606446, + "grad_norm": 0.4189739227294922, + "learning_rate": 4.837638614692133e-05, + "loss": 0.8373, + "step": 14390 + }, + { + "epoch": 0.23114335703622851, + "grad_norm": 0.4438650906085968, + "learning_rate": 4.8374150459861395e-05, + "loss": 0.6157, + "step": 14400 + }, + { + "epoch": 0.23130387325639257, + "grad_norm": 0.5696516036987305, + "learning_rate": 4.837191328634269e-05, + "loss": 0.7136, + "step": 14410 + }, + { + "epoch": 0.2314643894765566, + "grad_norm": 0.777023434638977, + "learning_rate": 4.836967462650748e-05, + "loss": 0.8115, + "step": 14420 + }, + { + "epoch": 0.23162490569672065, + "grad_norm": 0.6263437867164612, + "learning_rate": 4.836743448049813e-05, + "loss": 0.9264, + "step": 14430 + }, + { + "epoch": 0.2317854219168847, + "grad_norm": 0.6344231963157654, + "learning_rate": 4.8365192848457095e-05, + "loss": 0.7335, + "step": 14440 + }, + { + "epoch": 0.23194593813704875, + "grad_norm": 0.6128970384597778, + "learning_rate": 4.836294973052694e-05, + "loss": 0.8023, + "step": 14450 + }, + { + "epoch": 0.2321064543572128, + "grad_norm": 0.7868950366973877, + "learning_rate": 4.8360705126850305e-05, + "loss": 0.782, + "step": 14460 + }, + { + "epoch": 0.23226697057737683, + "grad_norm": 0.5872012376785278, + "learning_rate": 4.835845903756994e-05, + "loss": 0.8689, + "step": 14470 + }, + { + "epoch": 0.23242748679754088, + "grad_norm": 0.3408048450946808, + "learning_rate": 4.835621146282868e-05, + "loss": 0.7538, + "step": 14480 + }, + { + "epoch": 0.23258800301770494, + "grad_norm": 0.575236976146698, + "learning_rate": 4.835396240276946e-05, + "loss": 0.7215, + "step": 14490 + }, + { + "epoch": 0.232748519237869, + "grad_norm": 0.590338408946991, + "learning_rate": 4.8351711857535305e-05, + "loss": 0.9276, + "step": 14500 + }, + { + "epoch": 0.23290903545803304, + "grad_norm": 0.9277654886245728, + "learning_rate": 4.8349459827269334e-05, + "loss": 0.765, + "step": 14510 + }, + { + "epoch": 0.2330695516781971, + "grad_norm": 0.6697081923484802, + "learning_rate": 4.834720631211477e-05, + "loss": 0.8355, + "step": 14520 + }, + { + "epoch": 0.23323006789836112, + "grad_norm": 0.844781219959259, + "learning_rate": 4.834495131221491e-05, + "loss": 0.8531, + "step": 14530 + }, + { + "epoch": 0.23339058411852517, + "grad_norm": 0.7250335216522217, + "learning_rate": 4.8342694827713173e-05, + "loss": 0.695, + "step": 14540 + }, + { + "epoch": 0.23355110033868923, + "grad_norm": 0.4732781648635864, + "learning_rate": 4.834043685875305e-05, + "loss": 0.8097, + "step": 14550 + }, + { + "epoch": 0.23371161655885328, + "grad_norm": 0.7548187971115112, + "learning_rate": 4.833817740547814e-05, + "loss": 0.7479, + "step": 14560 + }, + { + "epoch": 0.23387213277901733, + "grad_norm": 0.885338544845581, + "learning_rate": 4.833591646803213e-05, + "loss": 0.8116, + "step": 14570 + }, + { + "epoch": 0.23403264899918136, + "grad_norm": 0.798949122428894, + "learning_rate": 4.8333654046558794e-05, + "loss": 0.7861, + "step": 14580 + }, + { + "epoch": 0.2341931652193454, + "grad_norm": 0.5753592252731323, + "learning_rate": 4.833139014120202e-05, + "loss": 0.739, + "step": 14590 + }, + { + "epoch": 0.23435368143950946, + "grad_norm": 0.685437798500061, + "learning_rate": 4.832912475210578e-05, + "loss": 0.8135, + "step": 14600 + }, + { + "epoch": 0.23451419765967352, + "grad_norm": 0.6089953780174255, + "learning_rate": 4.8326857879414125e-05, + "loss": 0.9204, + "step": 14610 + }, + { + "epoch": 0.23467471387983757, + "grad_norm": 0.4205803871154785, + "learning_rate": 4.832458952327122e-05, + "loss": 0.6984, + "step": 14620 + }, + { + "epoch": 0.2348352301000016, + "grad_norm": 0.4616701900959015, + "learning_rate": 4.8322319683821335e-05, + "loss": 0.877, + "step": 14630 + }, + { + "epoch": 0.23499574632016565, + "grad_norm": 0.7745165228843689, + "learning_rate": 4.83200483612088e-05, + "loss": 0.7399, + "step": 14640 + }, + { + "epoch": 0.2351562625403297, + "grad_norm": 0.7561837434768677, + "learning_rate": 4.8317775555578056e-05, + "loss": 0.8355, + "step": 14650 + }, + { + "epoch": 0.23531677876049376, + "grad_norm": 0.6511695981025696, + "learning_rate": 4.831550126707366e-05, + "loss": 0.8467, + "step": 14660 + }, + { + "epoch": 0.2354772949806578, + "grad_norm": 0.9767116904258728, + "learning_rate": 4.831322549584023e-05, + "loss": 0.7896, + "step": 14670 + }, + { + "epoch": 0.23563781120082183, + "grad_norm": 0.8324074745178223, + "learning_rate": 4.831094824202249e-05, + "loss": 0.7156, + "step": 14680 + }, + { + "epoch": 0.2357983274209859, + "grad_norm": 0.8110487461090088, + "learning_rate": 4.8308669505765266e-05, + "loss": 0.8113, + "step": 14690 + }, + { + "epoch": 0.23595884364114994, + "grad_norm": 0.6916394829750061, + "learning_rate": 4.830638928721347e-05, + "loss": 0.6978, + "step": 14700 + }, + { + "epoch": 0.236119359861314, + "grad_norm": 0.7289244532585144, + "learning_rate": 4.8304107586512104e-05, + "loss": 0.7562, + "step": 14710 + }, + { + "epoch": 0.23627987608147805, + "grad_norm": 0.6905339956283569, + "learning_rate": 4.830182440380628e-05, + "loss": 0.7529, + "step": 14720 + }, + { + "epoch": 0.23644039230164207, + "grad_norm": 0.689193606376648, + "learning_rate": 4.829953973924119e-05, + "loss": 0.7214, + "step": 14730 + }, + { + "epoch": 0.23660090852180612, + "grad_norm": 0.5075684189796448, + "learning_rate": 4.8297253592962136e-05, + "loss": 0.7087, + "step": 14740 + }, + { + "epoch": 0.23676142474197018, + "grad_norm": 0.6125611662864685, + "learning_rate": 4.829496596511448e-05, + "loss": 1.0088, + "step": 14750 + }, + { + "epoch": 0.23692194096213423, + "grad_norm": 0.5236259698867798, + "learning_rate": 4.829267685584372e-05, + "loss": 0.8576, + "step": 14760 + }, + { + "epoch": 0.23708245718229828, + "grad_norm": 0.43868839740753174, + "learning_rate": 4.829038626529543e-05, + "loss": 0.7769, + "step": 14770 + }, + { + "epoch": 0.2372429734024623, + "grad_norm": 1.002180576324463, + "learning_rate": 4.828809419361527e-05, + "loss": 0.8075, + "step": 14780 + }, + { + "epoch": 0.23740348962262636, + "grad_norm": 0.6141275763511658, + "learning_rate": 4.828580064094902e-05, + "loss": 0.7208, + "step": 14790 + }, + { + "epoch": 0.23756400584279042, + "grad_norm": 0.6364452242851257, + "learning_rate": 4.8283505607442515e-05, + "loss": 0.7301, + "step": 14800 + }, + { + "epoch": 0.23772452206295447, + "grad_norm": 0.5992729067802429, + "learning_rate": 4.828120909324171e-05, + "loss": 0.7376, + "step": 14810 + }, + { + "epoch": 0.23788503828311852, + "grad_norm": 0.7901551723480225, + "learning_rate": 4.827891109849265e-05, + "loss": 0.7725, + "step": 14820 + }, + { + "epoch": 0.23804555450328255, + "grad_norm": 0.8955700993537903, + "learning_rate": 4.827661162334149e-05, + "loss": 0.8557, + "step": 14830 + }, + { + "epoch": 0.2382060707234466, + "grad_norm": 0.6759335398674011, + "learning_rate": 4.8274310667934444e-05, + "loss": 0.8676, + "step": 14840 + }, + { + "epoch": 0.23836658694361065, + "grad_norm": 0.44429272413253784, + "learning_rate": 4.827200823241784e-05, + "loss": 0.7936, + "step": 14850 + }, + { + "epoch": 0.2385271031637747, + "grad_norm": 0.7910467386245728, + "learning_rate": 4.826970431693811e-05, + "loss": 0.8095, + "step": 14860 + }, + { + "epoch": 0.23868761938393876, + "grad_norm": 0.8068251013755798, + "learning_rate": 4.826739892164177e-05, + "loss": 0.7304, + "step": 14870 + }, + { + "epoch": 0.23884813560410278, + "grad_norm": 0.7361287474632263, + "learning_rate": 4.8265092046675416e-05, + "loss": 0.755, + "step": 14880 + }, + { + "epoch": 0.23900865182426684, + "grad_norm": 0.7092207670211792, + "learning_rate": 4.8262783692185765e-05, + "loss": 0.6821, + "step": 14890 + }, + { + "epoch": 0.2391691680444309, + "grad_norm": 0.49865809082984924, + "learning_rate": 4.826047385831961e-05, + "loss": 0.9177, + "step": 14900 + }, + { + "epoch": 0.23932968426459494, + "grad_norm": 0.5236395001411438, + "learning_rate": 4.825816254522384e-05, + "loss": 0.8403, + "step": 14910 + }, + { + "epoch": 0.239490200484759, + "grad_norm": 0.8613777756690979, + "learning_rate": 4.825584975304545e-05, + "loss": 0.8112, + "step": 14920 + }, + { + "epoch": 0.23965071670492302, + "grad_norm": 0.9249621033668518, + "learning_rate": 4.825353548193151e-05, + "loss": 0.7709, + "step": 14930 + }, + { + "epoch": 0.23981123292508708, + "grad_norm": 0.6940563917160034, + "learning_rate": 4.82512197320292e-05, + "loss": 0.8338, + "step": 14940 + }, + { + "epoch": 0.23997174914525113, + "grad_norm": 0.688951849937439, + "learning_rate": 4.824890250348578e-05, + "loss": 0.7461, + "step": 14950 + }, + { + "epoch": 0.24013226536541518, + "grad_norm": 0.5069225430488586, + "learning_rate": 4.824658379644862e-05, + "loss": 0.7859, + "step": 14960 + }, + { + "epoch": 0.24029278158557923, + "grad_norm": 0.6696979999542236, + "learning_rate": 4.8244263611065174e-05, + "loss": 0.8848, + "step": 14970 + }, + { + "epoch": 0.24045329780574326, + "grad_norm": 0.4867374002933502, + "learning_rate": 4.824194194748299e-05, + "loss": 0.8753, + "step": 14980 + }, + { + "epoch": 0.2406138140259073, + "grad_norm": 1.083250641822815, + "learning_rate": 4.823961880584972e-05, + "loss": 0.8496, + "step": 14990 + }, + { + "epoch": 0.24077433024607137, + "grad_norm": 0.8277796506881714, + "learning_rate": 4.823729418631309e-05, + "loss": 0.731, + "step": 15000 + }, + { + "epoch": 0.24093484646623542, + "grad_norm": 0.6538226008415222, + "learning_rate": 4.823496808902094e-05, + "loss": 0.9306, + "step": 15010 + }, + { + "epoch": 0.24109536268639947, + "grad_norm": 0.6083245873451233, + "learning_rate": 4.82326405141212e-05, + "loss": 0.7298, + "step": 15020 + }, + { + "epoch": 0.2412558789065635, + "grad_norm": 0.6032076478004456, + "learning_rate": 4.823031146176188e-05, + "loss": 0.818, + "step": 15030 + }, + { + "epoch": 0.24141639512672755, + "grad_norm": 0.8936601281166077, + "learning_rate": 4.822798093209111e-05, + "loss": 0.6651, + "step": 15040 + }, + { + "epoch": 0.2415769113468916, + "grad_norm": 0.5797984004020691, + "learning_rate": 4.8225648925257076e-05, + "loss": 0.8197, + "step": 15050 + }, + { + "epoch": 0.24173742756705566, + "grad_norm": 0.8111540079116821, + "learning_rate": 4.82233154414081e-05, + "loss": 0.7945, + "step": 15060 + }, + { + "epoch": 0.2418979437872197, + "grad_norm": 0.6489710807800293, + "learning_rate": 4.8220980480692556e-05, + "loss": 0.8545, + "step": 15070 + }, + { + "epoch": 0.24205846000738374, + "grad_norm": 0.4926404058933258, + "learning_rate": 4.8218644043258955e-05, + "loss": 0.9236, + "step": 15080 + }, + { + "epoch": 0.2422189762275478, + "grad_norm": 0.7634552121162415, + "learning_rate": 4.821630612925587e-05, + "loss": 0.8581, + "step": 15090 + }, + { + "epoch": 0.24237949244771184, + "grad_norm": 0.5391568541526794, + "learning_rate": 4.8213966738831986e-05, + "loss": 0.8388, + "step": 15100 + }, + { + "epoch": 0.2425400086678759, + "grad_norm": 0.5054505467414856, + "learning_rate": 4.821162587213607e-05, + "loss": 0.8402, + "step": 15110 + }, + { + "epoch": 0.24270052488803995, + "grad_norm": 0.6762367486953735, + "learning_rate": 4.820928352931698e-05, + "loss": 0.8057, + "step": 15120 + }, + { + "epoch": 0.24286104110820397, + "grad_norm": 0.7394393086433411, + "learning_rate": 4.820693971052368e-05, + "loss": 0.707, + "step": 15130 + }, + { + "epoch": 0.24302155732836803, + "grad_norm": 0.5590454936027527, + "learning_rate": 4.820459441590524e-05, + "loss": 1.0176, + "step": 15140 + }, + { + "epoch": 0.24318207354853208, + "grad_norm": 0.7493161559104919, + "learning_rate": 4.8202247645610775e-05, + "loss": 0.8846, + "step": 15150 + }, + { + "epoch": 0.24334258976869613, + "grad_norm": 1.5133421421051025, + "learning_rate": 4.819989939978955e-05, + "loss": 0.8461, + "step": 15160 + }, + { + "epoch": 0.24350310598886019, + "grad_norm": 0.9439326524734497, + "learning_rate": 4.819754967859089e-05, + "loss": 0.7979, + "step": 15170 + }, + { + "epoch": 0.2436636222090242, + "grad_norm": 0.7156214714050293, + "learning_rate": 4.819519848216424e-05, + "loss": 0.8153, + "step": 15180 + }, + { + "epoch": 0.24382413842918826, + "grad_norm": 1.0778024196624756, + "learning_rate": 4.8192845810659096e-05, + "loss": 0.8642, + "step": 15190 + }, + { + "epoch": 0.24398465464935232, + "grad_norm": 0.4866945445537567, + "learning_rate": 4.819049166422509e-05, + "loss": 0.761, + "step": 15200 + }, + { + "epoch": 0.24414517086951637, + "grad_norm": 0.6040950417518616, + "learning_rate": 4.8188136043011925e-05, + "loss": 0.7543, + "step": 15210 + }, + { + "epoch": 0.24430568708968042, + "grad_norm": 0.846322774887085, + "learning_rate": 4.8185778947169415e-05, + "loss": 0.8041, + "step": 15220 + }, + { + "epoch": 0.24446620330984445, + "grad_norm": 0.7580413222312927, + "learning_rate": 4.818342037684744e-05, + "loss": 0.7825, + "step": 15230 + }, + { + "epoch": 0.2446267195300085, + "grad_norm": 0.5066349506378174, + "learning_rate": 4.818106033219601e-05, + "loss": 0.7716, + "step": 15240 + }, + { + "epoch": 0.24478723575017255, + "grad_norm": 0.9556266069412231, + "learning_rate": 4.81786988133652e-05, + "loss": 0.7471, + "step": 15250 + }, + { + "epoch": 0.2449477519703366, + "grad_norm": 1.3518913984298706, + "learning_rate": 4.8176335820505194e-05, + "loss": 0.7893, + "step": 15260 + }, + { + "epoch": 0.24510826819050066, + "grad_norm": 0.7178061604499817, + "learning_rate": 4.8173971353766255e-05, + "loss": 0.9106, + "step": 15270 + }, + { + "epoch": 0.2452687844106647, + "grad_norm": 0.9257813096046448, + "learning_rate": 4.817160541329876e-05, + "loss": 0.8617, + "step": 15280 + }, + { + "epoch": 0.24542930063082874, + "grad_norm": 0.4955219626426697, + "learning_rate": 4.816923799925316e-05, + "loss": 0.8205, + "step": 15290 + }, + { + "epoch": 0.2455898168509928, + "grad_norm": 1.034417748451233, + "learning_rate": 4.8166869111780014e-05, + "loss": 0.8762, + "step": 15300 + }, + { + "epoch": 0.24575033307115685, + "grad_norm": 0.7103487849235535, + "learning_rate": 4.816449875102997e-05, + "loss": 0.8824, + "step": 15310 + }, + { + "epoch": 0.2459108492913209, + "grad_norm": 0.6207161545753479, + "learning_rate": 4.816212691715377e-05, + "loss": 0.8683, + "step": 15320 + }, + { + "epoch": 0.24607136551148492, + "grad_norm": 0.6626294851303101, + "learning_rate": 4.815975361030224e-05, + "loss": 0.7538, + "step": 15330 + }, + { + "epoch": 0.24623188173164898, + "grad_norm": 0.5730000734329224, + "learning_rate": 4.815737883062632e-05, + "loss": 0.8559, + "step": 15340 + }, + { + "epoch": 0.24639239795181303, + "grad_norm": 0.6241885423660278, + "learning_rate": 4.815500257827702e-05, + "loss": 0.7265, + "step": 15350 + }, + { + "epoch": 0.24655291417197708, + "grad_norm": 1.2518810033798218, + "learning_rate": 4.815262485340547e-05, + "loss": 0.878, + "step": 15360 + }, + { + "epoch": 0.24671343039214114, + "grad_norm": 0.776299238204956, + "learning_rate": 4.815024565616286e-05, + "loss": 0.7618, + "step": 15370 + }, + { + "epoch": 0.24687394661230516, + "grad_norm": 0.6517278552055359, + "learning_rate": 4.814786498670052e-05, + "loss": 0.8432, + "step": 15380 + }, + { + "epoch": 0.24703446283246921, + "grad_norm": 0.7477286458015442, + "learning_rate": 4.814548284516982e-05, + "loss": 0.8722, + "step": 15390 + }, + { + "epoch": 0.24719497905263327, + "grad_norm": 0.7205628752708435, + "learning_rate": 4.814309923172227e-05, + "loss": 0.8149, + "step": 15400 + }, + { + "epoch": 0.24735549527279732, + "grad_norm": 1.2258459329605103, + "learning_rate": 4.814071414650943e-05, + "loss": 0.7168, + "step": 15410 + }, + { + "epoch": 0.24751601149296137, + "grad_norm": 0.7187419533729553, + "learning_rate": 4.8138327589683e-05, + "loss": 0.8821, + "step": 15420 + }, + { + "epoch": 0.2476765277131254, + "grad_norm": 1.162879467010498, + "learning_rate": 4.813593956139475e-05, + "loss": 0.8116, + "step": 15430 + }, + { + "epoch": 0.24783704393328945, + "grad_norm": 0.7006785273551941, + "learning_rate": 4.813355006179653e-05, + "loss": 0.8791, + "step": 15440 + }, + { + "epoch": 0.2479975601534535, + "grad_norm": 1.0724836587905884, + "learning_rate": 4.813115909104031e-05, + "loss": 0.7813, + "step": 15450 + }, + { + "epoch": 0.24815807637361756, + "grad_norm": 0.5649840831756592, + "learning_rate": 4.8128766649278136e-05, + "loss": 0.815, + "step": 15460 + }, + { + "epoch": 0.2483185925937816, + "grad_norm": 0.5587908029556274, + "learning_rate": 4.812637273666216e-05, + "loss": 0.7269, + "step": 15470 + }, + { + "epoch": 0.24847910881394564, + "grad_norm": 0.6698653101921082, + "learning_rate": 4.8123977353344604e-05, + "loss": 0.8549, + "step": 15480 + }, + { + "epoch": 0.2486396250341097, + "grad_norm": 1.2181439399719238, + "learning_rate": 4.812158049947783e-05, + "loss": 0.7975, + "step": 15490 + }, + { + "epoch": 0.24880014125427374, + "grad_norm": 0.6637641787528992, + "learning_rate": 4.811918217521423e-05, + "loss": 0.8054, + "step": 15500 + }, + { + "epoch": 0.2489606574744378, + "grad_norm": 0.7509404420852661, + "learning_rate": 4.811678238070635e-05, + "loss": 0.8768, + "step": 15510 + }, + { + "epoch": 0.24912117369460185, + "grad_norm": 0.8445881605148315, + "learning_rate": 4.8114381116106786e-05, + "loss": 0.9612, + "step": 15520 + }, + { + "epoch": 0.24928168991476587, + "grad_norm": 0.6622403860092163, + "learning_rate": 4.8111978381568254e-05, + "loss": 0.8186, + "step": 15530 + }, + { + "epoch": 0.24944220613492993, + "grad_norm": 0.5675623416900635, + "learning_rate": 4.810957417724355e-05, + "loss": 0.79, + "step": 15540 + }, + { + "epoch": 0.24960272235509398, + "grad_norm": 0.5572085976600647, + "learning_rate": 4.810716850328556e-05, + "loss": 0.7922, + "step": 15550 + }, + { + "epoch": 0.24976323857525803, + "grad_norm": 0.7334800958633423, + "learning_rate": 4.81047613598473e-05, + "loss": 0.79, + "step": 15560 + }, + { + "epoch": 0.2499237547954221, + "grad_norm": 0.5873247981071472, + "learning_rate": 4.81023527470818e-05, + "loss": 0.8177, + "step": 15570 + }, + { + "epoch": 0.2500842710155861, + "grad_norm": 0.6377978920936584, + "learning_rate": 4.809994266514228e-05, + "loss": 0.784, + "step": 15580 + }, + { + "epoch": 0.25024478723575017, + "grad_norm": 0.574284017086029, + "learning_rate": 4.8097531114181996e-05, + "loss": 0.8156, + "step": 15590 + }, + { + "epoch": 0.2504053034559142, + "grad_norm": 0.5264580845832825, + "learning_rate": 4.809511809435429e-05, + "loss": 0.726, + "step": 15600 + }, + { + "epoch": 0.25056581967607827, + "grad_norm": 0.5220769047737122, + "learning_rate": 4.809270360581263e-05, + "loss": 0.8812, + "step": 15610 + }, + { + "epoch": 0.2507263358962423, + "grad_norm": 0.6588559150695801, + "learning_rate": 4.809028764871055e-05, + "loss": 0.782, + "step": 15620 + }, + { + "epoch": 0.2508868521164064, + "grad_norm": 0.6239479184150696, + "learning_rate": 4.808787022320172e-05, + "loss": 0.6833, + "step": 15630 + }, + { + "epoch": 0.25104736833657043, + "grad_norm": 0.5192775130271912, + "learning_rate": 4.808545132943984e-05, + "loss": 0.8254, + "step": 15640 + }, + { + "epoch": 0.2512078845567345, + "grad_norm": 0.6934115886688232, + "learning_rate": 4.808303096757877e-05, + "loss": 0.8532, + "step": 15650 + }, + { + "epoch": 0.2513684007768985, + "grad_norm": 0.6492730975151062, + "learning_rate": 4.80806091377724e-05, + "loss": 0.9395, + "step": 15660 + }, + { + "epoch": 0.25152891699706253, + "grad_norm": 1.0295647382736206, + "learning_rate": 4.8078185840174774e-05, + "loss": 0.6954, + "step": 15670 + }, + { + "epoch": 0.2516894332172266, + "grad_norm": 0.4895052909851074, + "learning_rate": 4.807576107493997e-05, + "loss": 0.8165, + "step": 15680 + }, + { + "epoch": 0.25184994943739064, + "grad_norm": 0.5137814283370972, + "learning_rate": 4.8073334842222206e-05, + "loss": 0.8377, + "step": 15690 + }, + { + "epoch": 0.2520104656575547, + "grad_norm": 0.9995269775390625, + "learning_rate": 4.8070907142175774e-05, + "loss": 0.8479, + "step": 15700 + }, + { + "epoch": 0.25217098187771875, + "grad_norm": 0.810484766960144, + "learning_rate": 4.806847797495506e-05, + "loss": 0.8397, + "step": 15710 + }, + { + "epoch": 0.2523314980978828, + "grad_norm": 0.9456301331520081, + "learning_rate": 4.806604734071454e-05, + "loss": 0.7774, + "step": 15720 + }, + { + "epoch": 0.25249201431804685, + "grad_norm": 0.5646312236785889, + "learning_rate": 4.806361523960881e-05, + "loss": 0.7641, + "step": 15730 + }, + { + "epoch": 0.2526525305382109, + "grad_norm": 0.5984691977500916, + "learning_rate": 4.80611816717925e-05, + "loss": 0.7996, + "step": 15740 + }, + { + "epoch": 0.25281304675837496, + "grad_norm": 0.6021932363510132, + "learning_rate": 4.80587466374204e-05, + "loss": 0.8499, + "step": 15750 + }, + { + "epoch": 0.25297356297853896, + "grad_norm": 0.5682885050773621, + "learning_rate": 4.8056310136647356e-05, + "loss": 0.7855, + "step": 15760 + }, + { + "epoch": 0.253134079198703, + "grad_norm": 0.5695458054542542, + "learning_rate": 4.805387216962831e-05, + "loss": 0.7705, + "step": 15770 + }, + { + "epoch": 0.25329459541886706, + "grad_norm": 0.6581767201423645, + "learning_rate": 4.805143273651831e-05, + "loss": 0.7271, + "step": 15780 + }, + { + "epoch": 0.2534551116390311, + "grad_norm": 0.5934671759605408, + "learning_rate": 4.804899183747248e-05, + "loss": 0.8278, + "step": 15790 + }, + { + "epoch": 0.25361562785919517, + "grad_norm": 0.5937804579734802, + "learning_rate": 4.804654947264606e-05, + "loss": 0.8152, + "step": 15800 + }, + { + "epoch": 0.2537761440793592, + "grad_norm": 1.107985019683838, + "learning_rate": 4.8044105642194356e-05, + "loss": 0.7226, + "step": 15810 + }, + { + "epoch": 0.2539366602995233, + "grad_norm": 0.5367686152458191, + "learning_rate": 4.8041660346272797e-05, + "loss": 0.7122, + "step": 15820 + }, + { + "epoch": 0.25409717651968733, + "grad_norm": 0.605766236782074, + "learning_rate": 4.803921358503688e-05, + "loss": 0.7733, + "step": 15830 + }, + { + "epoch": 0.2542576927398514, + "grad_norm": 0.7217006087303162, + "learning_rate": 4.80367653586422e-05, + "loss": 0.7857, + "step": 15840 + }, + { + "epoch": 0.25441820896001544, + "grad_norm": 0.6244975328445435, + "learning_rate": 4.803431566724446e-05, + "loss": 0.7823, + "step": 15850 + }, + { + "epoch": 0.25457872518017943, + "grad_norm": 0.6938493251800537, + "learning_rate": 4.803186451099944e-05, + "loss": 0.7901, + "step": 15860 + }, + { + "epoch": 0.2547392414003435, + "grad_norm": 0.5168447494506836, + "learning_rate": 4.802941189006302e-05, + "loss": 0.7304, + "step": 15870 + }, + { + "epoch": 0.25489975762050754, + "grad_norm": 0.527228593826294, + "learning_rate": 4.802695780459117e-05, + "loss": 0.894, + "step": 15880 + }, + { + "epoch": 0.2550602738406716, + "grad_norm": 0.5455058217048645, + "learning_rate": 4.802450225473996e-05, + "loss": 0.8762, + "step": 15890 + }, + { + "epoch": 0.25522079006083565, + "grad_norm": 0.5691890120506287, + "learning_rate": 4.8022045240665545e-05, + "loss": 0.7392, + "step": 15900 + }, + { + "epoch": 0.2553813062809997, + "grad_norm": 0.40384453535079956, + "learning_rate": 4.8019586762524184e-05, + "loss": 0.7918, + "step": 15910 + }, + { + "epoch": 0.25554182250116375, + "grad_norm": 0.6135216951370239, + "learning_rate": 4.801712682047221e-05, + "loss": 0.8394, + "step": 15920 + }, + { + "epoch": 0.2557023387213278, + "grad_norm": 0.7364075779914856, + "learning_rate": 4.8014665414666074e-05, + "loss": 0.784, + "step": 15930 + }, + { + "epoch": 0.25586285494149186, + "grad_norm": 0.6119745969772339, + "learning_rate": 4.8012202545262294e-05, + "loss": 0.8697, + "step": 15940 + }, + { + "epoch": 0.2560233711616559, + "grad_norm": 1.0958137512207031, + "learning_rate": 4.80097382124175e-05, + "loss": 0.7711, + "step": 15950 + }, + { + "epoch": 0.2561838873818199, + "grad_norm": 0.7495238184928894, + "learning_rate": 4.800727241628841e-05, + "loss": 0.8732, + "step": 15960 + }, + { + "epoch": 0.25634440360198396, + "grad_norm": 0.684649646282196, + "learning_rate": 4.800480515703184e-05, + "loss": 0.7977, + "step": 15970 + }, + { + "epoch": 0.256504919822148, + "grad_norm": 0.7530116438865662, + "learning_rate": 4.8002336434804676e-05, + "loss": 0.79, + "step": 15980 + }, + { + "epoch": 0.25666543604231207, + "grad_norm": 0.879090428352356, + "learning_rate": 4.799986624976393e-05, + "loss": 0.8133, + "step": 15990 + }, + { + "epoch": 0.2568259522624761, + "grad_norm": 0.7083548903465271, + "learning_rate": 4.799739460206669e-05, + "loss": 0.6699, + "step": 16000 + }, + { + "epoch": 0.2568259522624761, + "eval_loss": 0.8066674470901489, + "eval_runtime": 1833.4303, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 1.788, + "step": 16000 + }, + { + "epoch": 0.2569864684826402, + "grad_norm": 0.6517744660377502, + "learning_rate": 4.799492149187013e-05, + "loss": 0.9126, + "step": 16010 + }, + { + "epoch": 0.2571469847028042, + "grad_norm": 0.675533652305603, + "learning_rate": 4.799244691933152e-05, + "loss": 0.8461, + "step": 16020 + }, + { + "epoch": 0.2573075009229683, + "grad_norm": 0.7077280282974243, + "learning_rate": 4.7989970884608255e-05, + "loss": 0.7133, + "step": 16030 + }, + { + "epoch": 0.25746801714313233, + "grad_norm": 0.6264495253562927, + "learning_rate": 4.798749338785777e-05, + "loss": 0.8618, + "step": 16040 + }, + { + "epoch": 0.2576285333632964, + "grad_norm": 0.5425000190734863, + "learning_rate": 4.7985014429237626e-05, + "loss": 0.8856, + "step": 16050 + }, + { + "epoch": 0.2577890495834604, + "grad_norm": 1.0632232427597046, + "learning_rate": 4.798253400890548e-05, + "loss": 0.8027, + "step": 16060 + }, + { + "epoch": 0.25794956580362444, + "grad_norm": 0.5992028713226318, + "learning_rate": 4.798005212701906e-05, + "loss": 0.85, + "step": 16070 + }, + { + "epoch": 0.2581100820237885, + "grad_norm": 0.7159598469734192, + "learning_rate": 4.797756878373621e-05, + "loss": 0.7862, + "step": 16080 + }, + { + "epoch": 0.25827059824395254, + "grad_norm": 0.4884517192840576, + "learning_rate": 4.797508397921484e-05, + "loss": 0.7906, + "step": 16090 + }, + { + "epoch": 0.2584311144641166, + "grad_norm": 0.9861000180244446, + "learning_rate": 4.797259771361299e-05, + "loss": 0.8786, + "step": 16100 + }, + { + "epoch": 0.25859163068428065, + "grad_norm": 0.6144243478775024, + "learning_rate": 4.797010998708875e-05, + "loss": 0.7866, + "step": 16110 + }, + { + "epoch": 0.2587521469044447, + "grad_norm": 1.016109824180603, + "learning_rate": 4.7967620799800336e-05, + "loss": 0.8665, + "step": 16120 + }, + { + "epoch": 0.25891266312460876, + "grad_norm": 0.5869709253311157, + "learning_rate": 4.7965130151906057e-05, + "loss": 0.7296, + "step": 16130 + }, + { + "epoch": 0.2590731793447728, + "grad_norm": 0.803551971912384, + "learning_rate": 4.7962638043564285e-05, + "loss": 0.8892, + "step": 16140 + }, + { + "epoch": 0.25923369556493686, + "grad_norm": 0.7160310745239258, + "learning_rate": 4.796014447493351e-05, + "loss": 0.9203, + "step": 16150 + }, + { + "epoch": 0.25939421178510086, + "grad_norm": 0.8699983358383179, + "learning_rate": 4.795764944617231e-05, + "loss": 0.7303, + "step": 16160 + }, + { + "epoch": 0.2595547280052649, + "grad_norm": 0.4295152425765991, + "learning_rate": 4.7955152957439355e-05, + "loss": 0.8318, + "step": 16170 + }, + { + "epoch": 0.25971524422542897, + "grad_norm": 0.607502818107605, + "learning_rate": 4.79526550088934e-05, + "loss": 0.7425, + "step": 16180 + }, + { + "epoch": 0.259875760445593, + "grad_norm": 0.8842675089836121, + "learning_rate": 4.79501556006933e-05, + "loss": 0.7919, + "step": 16190 + }, + { + "epoch": 0.26003627666575707, + "grad_norm": 0.7976868748664856, + "learning_rate": 4.794765473299802e-05, + "loss": 0.7461, + "step": 16200 + }, + { + "epoch": 0.2601967928859211, + "grad_norm": 0.5506873726844788, + "learning_rate": 4.7945152405966586e-05, + "loss": 0.7923, + "step": 16210 + }, + { + "epoch": 0.2603573091060852, + "grad_norm": 0.4843102991580963, + "learning_rate": 4.7942648619758126e-05, + "loss": 0.7382, + "step": 16220 + }, + { + "epoch": 0.26051782532624923, + "grad_norm": 0.628411591053009, + "learning_rate": 4.794014337453188e-05, + "loss": 0.7672, + "step": 16230 + }, + { + "epoch": 0.2606783415464133, + "grad_norm": 0.7728725671768188, + "learning_rate": 4.793763667044716e-05, + "loss": 0.7837, + "step": 16240 + }, + { + "epoch": 0.26083885776657734, + "grad_norm": 0.6196062564849854, + "learning_rate": 4.7935128507663374e-05, + "loss": 0.7953, + "step": 16250 + }, + { + "epoch": 0.26099937398674133, + "grad_norm": 0.6246039271354675, + "learning_rate": 4.793261888634003e-05, + "loss": 0.7503, + "step": 16260 + }, + { + "epoch": 0.2611598902069054, + "grad_norm": 0.6988003253936768, + "learning_rate": 4.793010780663673e-05, + "loss": 0.7113, + "step": 16270 + }, + { + "epoch": 0.26132040642706944, + "grad_norm": 0.6660541296005249, + "learning_rate": 4.792759526871316e-05, + "loss": 0.6846, + "step": 16280 + }, + { + "epoch": 0.2614809226472335, + "grad_norm": 0.6419786214828491, + "learning_rate": 4.79250812727291e-05, + "loss": 0.8006, + "step": 16290 + }, + { + "epoch": 0.26164143886739755, + "grad_norm": 0.5048224925994873, + "learning_rate": 4.792256581884442e-05, + "loss": 0.7296, + "step": 16300 + }, + { + "epoch": 0.2618019550875616, + "grad_norm": 0.7442851662635803, + "learning_rate": 4.7920048907219104e-05, + "loss": 0.7939, + "step": 16310 + }, + { + "epoch": 0.26196247130772565, + "grad_norm": 0.5673630833625793, + "learning_rate": 4.791753053801321e-05, + "loss": 0.7103, + "step": 16320 + }, + { + "epoch": 0.2621229875278897, + "grad_norm": 0.5997719764709473, + "learning_rate": 4.7915010711386874e-05, + "loss": 0.7162, + "step": 16330 + }, + { + "epoch": 0.26228350374805376, + "grad_norm": 0.5474566221237183, + "learning_rate": 4.791248942750036e-05, + "loss": 0.8411, + "step": 16340 + }, + { + "epoch": 0.2624440199682178, + "grad_norm": 0.8503760099411011, + "learning_rate": 4.7909966686514004e-05, + "loss": 0.7229, + "step": 16350 + }, + { + "epoch": 0.2626045361883818, + "grad_norm": 0.6091781258583069, + "learning_rate": 4.790744248858823e-05, + "loss": 0.9025, + "step": 16360 + }, + { + "epoch": 0.26276505240854586, + "grad_norm": 0.5771880745887756, + "learning_rate": 4.7904916833883574e-05, + "loss": 0.9422, + "step": 16370 + }, + { + "epoch": 0.2629255686287099, + "grad_norm": 0.5568141341209412, + "learning_rate": 4.7902389722560636e-05, + "loss": 0.7734, + "step": 16380 + }, + { + "epoch": 0.26308608484887397, + "grad_norm": 0.7337527275085449, + "learning_rate": 4.789986115478013e-05, + "loss": 0.7764, + "step": 16390 + }, + { + "epoch": 0.263246601069038, + "grad_norm": 0.49977874755859375, + "learning_rate": 4.789733113070288e-05, + "loss": 0.8426, + "step": 16400 + }, + { + "epoch": 0.2634071172892021, + "grad_norm": 0.5780274271965027, + "learning_rate": 4.789479965048976e-05, + "loss": 0.8755, + "step": 16410 + }, + { + "epoch": 0.26356763350936613, + "grad_norm": 0.6562539935112, + "learning_rate": 4.7892266714301755e-05, + "loss": 0.7452, + "step": 16420 + }, + { + "epoch": 0.2637281497295302, + "grad_norm": 0.8614453673362732, + "learning_rate": 4.788973232229995e-05, + "loss": 0.7827, + "step": 16430 + }, + { + "epoch": 0.26388866594969423, + "grad_norm": 0.6614359617233276, + "learning_rate": 4.7887196474645524e-05, + "loss": 0.825, + "step": 16440 + }, + { + "epoch": 0.2640491821698583, + "grad_norm": 0.6949373483657837, + "learning_rate": 4.788465917149974e-05, + "loss": 0.8683, + "step": 16450 + }, + { + "epoch": 0.2642096983900223, + "grad_norm": 0.8293480277061462, + "learning_rate": 4.788212041302393e-05, + "loss": 0.7946, + "step": 16460 + }, + { + "epoch": 0.26437021461018634, + "grad_norm": 0.5784265398979187, + "learning_rate": 4.787958019937959e-05, + "loss": 0.7758, + "step": 16470 + }, + { + "epoch": 0.2645307308303504, + "grad_norm": 1.0124883651733398, + "learning_rate": 4.787703853072823e-05, + "loss": 0.8407, + "step": 16480 + }, + { + "epoch": 0.26469124705051444, + "grad_norm": 0.4468782842159271, + "learning_rate": 4.7874495407231496e-05, + "loss": 0.811, + "step": 16490 + }, + { + "epoch": 0.2648517632706785, + "grad_norm": 1.0265731811523438, + "learning_rate": 4.787195082905111e-05, + "loss": 0.732, + "step": 16500 + }, + { + "epoch": 0.26501227949084255, + "grad_norm": 0.6299212574958801, + "learning_rate": 4.78694047963489e-05, + "loss": 0.8473, + "step": 16510 + }, + { + "epoch": 0.2651727957110066, + "grad_norm": 0.6708685159683228, + "learning_rate": 4.786685730928677e-05, + "loss": 0.7159, + "step": 16520 + }, + { + "epoch": 0.26533331193117066, + "grad_norm": 0.9455600380897522, + "learning_rate": 4.786430836802673e-05, + "loss": 0.6943, + "step": 16530 + }, + { + "epoch": 0.2654938281513347, + "grad_norm": 0.6185752749443054, + "learning_rate": 4.786175797273089e-05, + "loss": 0.7779, + "step": 16540 + }, + { + "epoch": 0.26565434437149876, + "grad_norm": 0.7738140821456909, + "learning_rate": 4.785920612356142e-05, + "loss": 0.8058, + "step": 16550 + }, + { + "epoch": 0.26581486059166276, + "grad_norm": 0.5631877779960632, + "learning_rate": 4.78566528206806e-05, + "loss": 0.8717, + "step": 16560 + }, + { + "epoch": 0.2659753768118268, + "grad_norm": 0.9598537683486938, + "learning_rate": 4.785409806425083e-05, + "loss": 0.8944, + "step": 16570 + }, + { + "epoch": 0.26613589303199087, + "grad_norm": 0.9290374517440796, + "learning_rate": 4.785154185443456e-05, + "loss": 0.7823, + "step": 16580 + }, + { + "epoch": 0.2662964092521549, + "grad_norm": 0.5466352701187134, + "learning_rate": 4.784898419139435e-05, + "loss": 0.7206, + "step": 16590 + }, + { + "epoch": 0.266456925472319, + "grad_norm": 0.5536758303642273, + "learning_rate": 4.784642507529286e-05, + "loss": 0.924, + "step": 16600 + }, + { + "epoch": 0.266617441692483, + "grad_norm": 0.7524639964103699, + "learning_rate": 4.7843864506292815e-05, + "loss": 0.849, + "step": 16610 + }, + { + "epoch": 0.2667779579126471, + "grad_norm": 0.5917685031890869, + "learning_rate": 4.784130248455708e-05, + "loss": 0.8773, + "step": 16620 + }, + { + "epoch": 0.26693847413281113, + "grad_norm": 0.5129404067993164, + "learning_rate": 4.783873901024857e-05, + "loss": 0.7818, + "step": 16630 + }, + { + "epoch": 0.2670989903529752, + "grad_norm": 0.6915667057037354, + "learning_rate": 4.783617408353031e-05, + "loss": 0.9129, + "step": 16640 + }, + { + "epoch": 0.26725950657313924, + "grad_norm": 0.728692889213562, + "learning_rate": 4.7833607704565415e-05, + "loss": 0.773, + "step": 16650 + }, + { + "epoch": 0.26742002279330324, + "grad_norm": 0.8049497604370117, + "learning_rate": 4.783103987351709e-05, + "loss": 0.9128, + "step": 16660 + }, + { + "epoch": 0.2675805390134673, + "grad_norm": 0.7093148827552795, + "learning_rate": 4.782847059054862e-05, + "loss": 0.7712, + "step": 16670 + }, + { + "epoch": 0.26774105523363134, + "grad_norm": 0.588988721370697, + "learning_rate": 4.782589985582343e-05, + "loss": 0.8397, + "step": 16680 + }, + { + "epoch": 0.2679015714537954, + "grad_norm": 0.9920754432678223, + "learning_rate": 4.782332766950497e-05, + "loss": 0.7752, + "step": 16690 + }, + { + "epoch": 0.26806208767395945, + "grad_norm": 0.6157994866371155, + "learning_rate": 4.782075403175683e-05, + "loss": 0.8258, + "step": 16700 + }, + { + "epoch": 0.2682226038941235, + "grad_norm": 0.9361831545829773, + "learning_rate": 4.7818178942742685e-05, + "loss": 0.8665, + "step": 16710 + }, + { + "epoch": 0.26838312011428755, + "grad_norm": 0.74977707862854, + "learning_rate": 4.7815602402626285e-05, + "loss": 0.8886, + "step": 16720 + }, + { + "epoch": 0.2685436363344516, + "grad_norm": 0.572387158870697, + "learning_rate": 4.781302441157148e-05, + "loss": 0.8489, + "step": 16730 + }, + { + "epoch": 0.26870415255461566, + "grad_norm": 0.5368378162384033, + "learning_rate": 4.781044496974223e-05, + "loss": 0.7203, + "step": 16740 + }, + { + "epoch": 0.2688646687747797, + "grad_norm": 0.662004828453064, + "learning_rate": 4.7807864077302566e-05, + "loss": 0.8815, + "step": 16750 + }, + { + "epoch": 0.26902518499494377, + "grad_norm": 0.7131313681602478, + "learning_rate": 4.7805281734416606e-05, + "loss": 0.875, + "step": 16760 + }, + { + "epoch": 0.26918570121510776, + "grad_norm": 0.6501905918121338, + "learning_rate": 4.7802697941248585e-05, + "loss": 0.7948, + "step": 16770 + }, + { + "epoch": 0.2693462174352718, + "grad_norm": 1.26823890209198, + "learning_rate": 4.7800112697962815e-05, + "loss": 0.7463, + "step": 16780 + }, + { + "epoch": 0.26950673365543587, + "grad_norm": 0.4727516770362854, + "learning_rate": 4.7797526004723695e-05, + "loss": 0.8191, + "step": 16790 + }, + { + "epoch": 0.2696672498755999, + "grad_norm": 0.7914152145385742, + "learning_rate": 4.779493786169573e-05, + "loss": 0.8283, + "step": 16800 + }, + { + "epoch": 0.269827766095764, + "grad_norm": 0.6244381666183472, + "learning_rate": 4.779234826904352e-05, + "loss": 0.8582, + "step": 16810 + }, + { + "epoch": 0.26998828231592803, + "grad_norm": 0.8491032123565674, + "learning_rate": 4.778975722693173e-05, + "loss": 0.7909, + "step": 16820 + }, + { + "epoch": 0.2701487985360921, + "grad_norm": 0.600550651550293, + "learning_rate": 4.778716473552514e-05, + "loss": 0.8239, + "step": 16830 + }, + { + "epoch": 0.27030931475625614, + "grad_norm": 0.5665378570556641, + "learning_rate": 4.778457079498863e-05, + "loss": 0.8148, + "step": 16840 + }, + { + "epoch": 0.2704698309764202, + "grad_norm": 0.5673250555992126, + "learning_rate": 4.7781975405487136e-05, + "loss": 0.8524, + "step": 16850 + }, + { + "epoch": 0.27063034719658424, + "grad_norm": 0.9270396828651428, + "learning_rate": 4.777937856718573e-05, + "loss": 0.8314, + "step": 16860 + }, + { + "epoch": 0.27079086341674824, + "grad_norm": 0.8021392822265625, + "learning_rate": 4.777678028024954e-05, + "loss": 0.8614, + "step": 16870 + }, + { + "epoch": 0.2709513796369123, + "grad_norm": 0.5818308591842651, + "learning_rate": 4.777418054484382e-05, + "loss": 0.7995, + "step": 16880 + }, + { + "epoch": 0.27111189585707635, + "grad_norm": 0.6289852261543274, + "learning_rate": 4.777157936113389e-05, + "loss": 0.7967, + "step": 16890 + }, + { + "epoch": 0.2712724120772404, + "grad_norm": 1.096947193145752, + "learning_rate": 4.7768976729285166e-05, + "loss": 0.8779, + "step": 16900 + }, + { + "epoch": 0.27143292829740445, + "grad_norm": 0.6340774297714233, + "learning_rate": 4.776637264946316e-05, + "loss": 0.7598, + "step": 16910 + }, + { + "epoch": 0.2715934445175685, + "grad_norm": 0.5556398034095764, + "learning_rate": 4.776376712183347e-05, + "loss": 0.7419, + "step": 16920 + }, + { + "epoch": 0.27175396073773256, + "grad_norm": 0.8345324993133545, + "learning_rate": 4.776116014656181e-05, + "loss": 0.8052, + "step": 16930 + }, + { + "epoch": 0.2719144769578966, + "grad_norm": 0.709220826625824, + "learning_rate": 4.7758551723813966e-05, + "loss": 0.8999, + "step": 16940 + }, + { + "epoch": 0.27207499317806066, + "grad_norm": 0.3802470564842224, + "learning_rate": 4.77559418537558e-05, + "loss": 0.9377, + "step": 16950 + }, + { + "epoch": 0.2722355093982247, + "grad_norm": 0.6437955498695374, + "learning_rate": 4.77533305365533e-05, + "loss": 0.8976, + "step": 16960 + }, + { + "epoch": 0.2723960256183887, + "grad_norm": 0.48675209283828735, + "learning_rate": 4.775071777237253e-05, + "loss": 0.8266, + "step": 16970 + }, + { + "epoch": 0.27255654183855277, + "grad_norm": 0.5760757327079773, + "learning_rate": 4.7748103561379644e-05, + "loss": 0.823, + "step": 16980 + }, + { + "epoch": 0.2727170580587168, + "grad_norm": 0.601068913936615, + "learning_rate": 4.7745487903740885e-05, + "loss": 0.8285, + "step": 16990 + }, + { + "epoch": 0.2728775742788809, + "grad_norm": 0.5054754614830017, + "learning_rate": 4.774287079962259e-05, + "loss": 0.7752, + "step": 17000 + }, + { + "epoch": 0.2730380904990449, + "grad_norm": 0.6444161534309387, + "learning_rate": 4.774025224919121e-05, + "loss": 0.6357, + "step": 17010 + }, + { + "epoch": 0.273198606719209, + "grad_norm": 0.4748232066631317, + "learning_rate": 4.7737632252613254e-05, + "loss": 0.7608, + "step": 17020 + }, + { + "epoch": 0.27335912293937303, + "grad_norm": 0.5307843089103699, + "learning_rate": 4.773501081005535e-05, + "loss": 0.8283, + "step": 17030 + }, + { + "epoch": 0.2735196391595371, + "grad_norm": 0.7355693578720093, + "learning_rate": 4.7732387921684185e-05, + "loss": 0.7722, + "step": 17040 + }, + { + "epoch": 0.27368015537970114, + "grad_norm": 0.485502690076828, + "learning_rate": 4.772976358766659e-05, + "loss": 0.87, + "step": 17050 + }, + { + "epoch": 0.2738406715998652, + "grad_norm": 0.539783775806427, + "learning_rate": 4.772713780816943e-05, + "loss": 0.693, + "step": 17060 + }, + { + "epoch": 0.2740011878200292, + "grad_norm": 0.6077142357826233, + "learning_rate": 4.77245105833597e-05, + "loss": 0.861, + "step": 17070 + }, + { + "epoch": 0.27416170404019324, + "grad_norm": 1.735343098640442, + "learning_rate": 4.7721881913404487e-05, + "loss": 0.7579, + "step": 17080 + }, + { + "epoch": 0.2743222202603573, + "grad_norm": 0.8336731791496277, + "learning_rate": 4.771925179847094e-05, + "loss": 0.6897, + "step": 17090 + }, + { + "epoch": 0.27448273648052135, + "grad_norm": 0.8977804183959961, + "learning_rate": 4.771662023872633e-05, + "loss": 0.7779, + "step": 17100 + }, + { + "epoch": 0.2746432527006854, + "grad_norm": 0.39159390330314636, + "learning_rate": 4.7713987234337996e-05, + "loss": 0.7398, + "step": 17110 + }, + { + "epoch": 0.27480376892084946, + "grad_norm": 0.6881596446037292, + "learning_rate": 4.7711352785473405e-05, + "loss": 0.7401, + "step": 17120 + }, + { + "epoch": 0.2749642851410135, + "grad_norm": 0.9957073926925659, + "learning_rate": 4.770871689230008e-05, + "loss": 0.7853, + "step": 17130 + }, + { + "epoch": 0.27512480136117756, + "grad_norm": 0.7392405867576599, + "learning_rate": 4.770607955498564e-05, + "loss": 0.8293, + "step": 17140 + }, + { + "epoch": 0.2752853175813416, + "grad_norm": 0.7355318665504456, + "learning_rate": 4.770344077369781e-05, + "loss": 0.7318, + "step": 17150 + }, + { + "epoch": 0.27544583380150567, + "grad_norm": 0.5525994300842285, + "learning_rate": 4.7700800548604404e-05, + "loss": 0.853, + "step": 17160 + }, + { + "epoch": 0.27560635002166967, + "grad_norm": 0.639967143535614, + "learning_rate": 4.7698158879873325e-05, + "loss": 0.7746, + "step": 17170 + }, + { + "epoch": 0.2757668662418337, + "grad_norm": 0.6170299053192139, + "learning_rate": 4.769551576767256e-05, + "loss": 0.7091, + "step": 17180 + }, + { + "epoch": 0.27592738246199777, + "grad_norm": 0.578981339931488, + "learning_rate": 4.7692871212170206e-05, + "loss": 0.8159, + "step": 17190 + }, + { + "epoch": 0.2760878986821618, + "grad_norm": 0.5659828186035156, + "learning_rate": 4.769022521353444e-05, + "loss": 0.8802, + "step": 17200 + }, + { + "epoch": 0.2762484149023259, + "grad_norm": 0.6970683932304382, + "learning_rate": 4.768757777193352e-05, + "loss": 0.6859, + "step": 17210 + }, + { + "epoch": 0.27640893112248993, + "grad_norm": 0.6407801508903503, + "learning_rate": 4.768492888753583e-05, + "loss": 0.8244, + "step": 17220 + }, + { + "epoch": 0.276569447342654, + "grad_norm": 0.568252444267273, + "learning_rate": 4.76822785605098e-05, + "loss": 0.7537, + "step": 17230 + }, + { + "epoch": 0.27672996356281804, + "grad_norm": 0.5851914882659912, + "learning_rate": 4.7679626791023986e-05, + "loss": 0.8017, + "step": 17240 + }, + { + "epoch": 0.2768904797829821, + "grad_norm": 0.5348182320594788, + "learning_rate": 4.767697357924702e-05, + "loss": 0.8517, + "step": 17250 + }, + { + "epoch": 0.27705099600314614, + "grad_norm": 0.5235812067985535, + "learning_rate": 4.767431892534764e-05, + "loss": 0.8148, + "step": 17260 + }, + { + "epoch": 0.27721151222331014, + "grad_norm": 0.7636592388153076, + "learning_rate": 4.7671662829494665e-05, + "loss": 0.8025, + "step": 17270 + }, + { + "epoch": 0.2773720284434742, + "grad_norm": 0.736334502696991, + "learning_rate": 4.7669005291856994e-05, + "loss": 0.8172, + "step": 17280 + }, + { + "epoch": 0.27753254466363825, + "grad_norm": 0.6361430287361145, + "learning_rate": 4.766634631260365e-05, + "loss": 0.8717, + "step": 17290 + }, + { + "epoch": 0.2776930608838023, + "grad_norm": 1.237838864326477, + "learning_rate": 4.7663685891903706e-05, + "loss": 0.8085, + "step": 17300 + }, + { + "epoch": 0.27785357710396635, + "grad_norm": 0.7284049987792969, + "learning_rate": 4.7661024029926374e-05, + "loss": 0.702, + "step": 17310 + }, + { + "epoch": 0.2780140933241304, + "grad_norm": 1.0462013483047485, + "learning_rate": 4.765836072684091e-05, + "loss": 0.7956, + "step": 17320 + }, + { + "epoch": 0.27817460954429446, + "grad_norm": 0.5609205365180969, + "learning_rate": 4.7655695982816704e-05, + "loss": 0.6566, + "step": 17330 + }, + { + "epoch": 0.2783351257644585, + "grad_norm": 0.5365473031997681, + "learning_rate": 4.7653029798023206e-05, + "loss": 0.8562, + "step": 17340 + }, + { + "epoch": 0.27849564198462257, + "grad_norm": 0.5952497124671936, + "learning_rate": 4.765036217262997e-05, + "loss": 0.7507, + "step": 17350 + }, + { + "epoch": 0.2786561582047866, + "grad_norm": 0.9335415959358215, + "learning_rate": 4.764769310680665e-05, + "loss": 0.7144, + "step": 17360 + }, + { + "epoch": 0.2788166744249506, + "grad_norm": 0.645261287689209, + "learning_rate": 4.7645022600722965e-05, + "loss": 0.8143, + "step": 17370 + }, + { + "epoch": 0.27897719064511467, + "grad_norm": 0.50775545835495, + "learning_rate": 4.764235065454876e-05, + "loss": 0.6992, + "step": 17380 + }, + { + "epoch": 0.2791377068652787, + "grad_norm": 0.5382126569747925, + "learning_rate": 4.763967726845396e-05, + "loss": 0.6501, + "step": 17390 + }, + { + "epoch": 0.2792982230854428, + "grad_norm": 0.5423483848571777, + "learning_rate": 4.763700244260856e-05, + "loss": 0.7953, + "step": 17400 + }, + { + "epoch": 0.27945873930560683, + "grad_norm": 0.7748095989227295, + "learning_rate": 4.763432617718267e-05, + "loss": 0.7713, + "step": 17410 + }, + { + "epoch": 0.2796192555257709, + "grad_norm": 0.6577771306037903, + "learning_rate": 4.7631648472346484e-05, + "loss": 0.763, + "step": 17420 + }, + { + "epoch": 0.27977977174593494, + "grad_norm": 0.5155861973762512, + "learning_rate": 4.762896932827029e-05, + "loss": 0.7115, + "step": 17430 + }, + { + "epoch": 0.279940287966099, + "grad_norm": 0.5297248959541321, + "learning_rate": 4.762628874512447e-05, + "loss": 0.7897, + "step": 17440 + }, + { + "epoch": 0.28010080418626304, + "grad_norm": 0.6344960927963257, + "learning_rate": 4.7623606723079486e-05, + "loss": 0.8501, + "step": 17450 + }, + { + "epoch": 0.2802613204064271, + "grad_norm": 1.3175835609436035, + "learning_rate": 4.7620923262305897e-05, + "loss": 0.775, + "step": 17460 + }, + { + "epoch": 0.2804218366265911, + "grad_norm": 0.918360710144043, + "learning_rate": 4.761823836297437e-05, + "loss": 0.8116, + "step": 17470 + }, + { + "epoch": 0.28058235284675515, + "grad_norm": 0.5104888081550598, + "learning_rate": 4.761555202525563e-05, + "loss": 0.8499, + "step": 17480 + }, + { + "epoch": 0.2807428690669192, + "grad_norm": 0.7081618309020996, + "learning_rate": 4.7612864249320524e-05, + "loss": 0.8388, + "step": 17490 + }, + { + "epoch": 0.28090338528708325, + "grad_norm": 0.678294837474823, + "learning_rate": 4.761017503533997e-05, + "loss": 0.7969, + "step": 17500 + }, + { + "epoch": 0.2810639015072473, + "grad_norm": 0.746579647064209, + "learning_rate": 4.7607484383485e-05, + "loss": 0.8265, + "step": 17510 + }, + { + "epoch": 0.28122441772741136, + "grad_norm": 0.7211241126060486, + "learning_rate": 4.7604792293926714e-05, + "loss": 0.7662, + "step": 17520 + }, + { + "epoch": 0.2813849339475754, + "grad_norm": 0.5042513012886047, + "learning_rate": 4.760209876683631e-05, + "loss": 0.8274, + "step": 17530 + }, + { + "epoch": 0.28154545016773946, + "grad_norm": 0.574884831905365, + "learning_rate": 4.759940380238509e-05, + "loss": 0.8105, + "step": 17540 + }, + { + "epoch": 0.2817059663879035, + "grad_norm": 0.7494096159934998, + "learning_rate": 4.759670740074443e-05, + "loss": 0.7964, + "step": 17550 + }, + { + "epoch": 0.28186648260806757, + "grad_norm": 0.61440110206604, + "learning_rate": 4.759400956208581e-05, + "loss": 0.7772, + "step": 17560 + }, + { + "epoch": 0.28202699882823157, + "grad_norm": 0.6371849179267883, + "learning_rate": 4.75913102865808e-05, + "loss": 0.7577, + "step": 17570 + }, + { + "epoch": 0.2821875150483956, + "grad_norm": 0.5974372625350952, + "learning_rate": 4.7588609574401046e-05, + "loss": 0.7866, + "step": 17580 + }, + { + "epoch": 0.2823480312685597, + "grad_norm": 0.7063047885894775, + "learning_rate": 4.7585907425718306e-05, + "loss": 0.6535, + "step": 17590 + }, + { + "epoch": 0.2825085474887237, + "grad_norm": 0.7525917291641235, + "learning_rate": 4.758320384070443e-05, + "loss": 0.8851, + "step": 17600 + }, + { + "epoch": 0.2826690637088878, + "grad_norm": 0.39431098103523254, + "learning_rate": 4.758049881953133e-05, + "loss": 0.7467, + "step": 17610 + }, + { + "epoch": 0.28282957992905183, + "grad_norm": 0.6390130519866943, + "learning_rate": 4.757779236237103e-05, + "loss": 0.7587, + "step": 17620 + }, + { + "epoch": 0.2829900961492159, + "grad_norm": 0.7626198530197144, + "learning_rate": 4.757508446939567e-05, + "loss": 0.8118, + "step": 17630 + }, + { + "epoch": 0.28315061236937994, + "grad_norm": 0.5095770359039307, + "learning_rate": 4.7572375140777435e-05, + "loss": 0.8137, + "step": 17640 + }, + { + "epoch": 0.283311128589544, + "grad_norm": 1.0803956985473633, + "learning_rate": 4.7569664376688624e-05, + "loss": 0.8295, + "step": 17650 + }, + { + "epoch": 0.28347164480970805, + "grad_norm": 0.6011291742324829, + "learning_rate": 4.756695217730163e-05, + "loss": 0.8753, + "step": 17660 + }, + { + "epoch": 0.28363216102987204, + "grad_norm": 0.6754547953605652, + "learning_rate": 4.7564238542788936e-05, + "loss": 0.9007, + "step": 17670 + }, + { + "epoch": 0.2837926772500361, + "grad_norm": 0.5892003774642944, + "learning_rate": 4.756152347332311e-05, + "loss": 0.7016, + "step": 17680 + }, + { + "epoch": 0.28395319347020015, + "grad_norm": 0.743270754814148, + "learning_rate": 4.755880696907681e-05, + "loss": 0.6879, + "step": 17690 + }, + { + "epoch": 0.2841137096903642, + "grad_norm": 0.7437613010406494, + "learning_rate": 4.755608903022279e-05, + "loss": 0.8477, + "step": 17700 + }, + { + "epoch": 0.28427422591052826, + "grad_norm": 0.6061498522758484, + "learning_rate": 4.7553369656933905e-05, + "loss": 0.7627, + "step": 17710 + }, + { + "epoch": 0.2844347421306923, + "grad_norm": 0.8391322493553162, + "learning_rate": 4.755064884938308e-05, + "loss": 0.8771, + "step": 17720 + }, + { + "epoch": 0.28459525835085636, + "grad_norm": 0.44497421383857727, + "learning_rate": 4.754792660774335e-05, + "loss": 0.8462, + "step": 17730 + }, + { + "epoch": 0.2847557745710204, + "grad_norm": 1.0510557889938354, + "learning_rate": 4.754520293218783e-05, + "loss": 0.8095, + "step": 17740 + }, + { + "epoch": 0.28491629079118447, + "grad_norm": 0.6247544884681702, + "learning_rate": 4.754247782288973e-05, + "loss": 0.7396, + "step": 17750 + }, + { + "epoch": 0.2850768070113485, + "grad_norm": 0.6862468719482422, + "learning_rate": 4.753975128002235e-05, + "loss": 0.8385, + "step": 17760 + }, + { + "epoch": 0.2852373232315125, + "grad_norm": 0.9485924243927002, + "learning_rate": 4.7537023303759085e-05, + "loss": 0.7641, + "step": 17770 + }, + { + "epoch": 0.28539783945167657, + "grad_norm": 0.7141318321228027, + "learning_rate": 4.753429389427342e-05, + "loss": 0.7586, + "step": 17780 + }, + { + "epoch": 0.2855583556718406, + "grad_norm": 0.37865516543388367, + "learning_rate": 4.7531563051738923e-05, + "loss": 0.807, + "step": 17790 + }, + { + "epoch": 0.2857188718920047, + "grad_norm": 0.523504912853241, + "learning_rate": 4.752883077632927e-05, + "loss": 0.8795, + "step": 17800 + }, + { + "epoch": 0.28587938811216873, + "grad_norm": 0.7844220995903015, + "learning_rate": 4.75260970682182e-05, + "loss": 0.848, + "step": 17810 + }, + { + "epoch": 0.2860399043323328, + "grad_norm": 0.5537945032119751, + "learning_rate": 4.752336192757957e-05, + "loss": 0.8551, + "step": 17820 + }, + { + "epoch": 0.28620042055249684, + "grad_norm": 0.503201425075531, + "learning_rate": 4.752062535458733e-05, + "loss": 0.9617, + "step": 17830 + }, + { + "epoch": 0.2863609367726609, + "grad_norm": 0.7125167846679688, + "learning_rate": 4.751788734941549e-05, + "loss": 0.8603, + "step": 17840 + }, + { + "epoch": 0.28652145299282494, + "grad_norm": 0.6251235008239746, + "learning_rate": 4.751514791223819e-05, + "loss": 0.7585, + "step": 17850 + }, + { + "epoch": 0.286681969212989, + "grad_norm": 1.0592995882034302, + "learning_rate": 4.7512407043229636e-05, + "loss": 0.7657, + "step": 17860 + }, + { + "epoch": 0.286842485433153, + "grad_norm": 1.2189550399780273, + "learning_rate": 4.750966474256412e-05, + "loss": 0.9143, + "step": 17870 + }, + { + "epoch": 0.28700300165331705, + "grad_norm": 0.8036786913871765, + "learning_rate": 4.750692101041605e-05, + "loss": 0.8511, + "step": 17880 + }, + { + "epoch": 0.2871635178734811, + "grad_norm": 0.5292152762413025, + "learning_rate": 4.7504175846959896e-05, + "loss": 0.8155, + "step": 17890 + }, + { + "epoch": 0.28732403409364515, + "grad_norm": 0.9692254662513733, + "learning_rate": 4.750142925237025e-05, + "loss": 0.8424, + "step": 17900 + }, + { + "epoch": 0.2874845503138092, + "grad_norm": 0.4792726933956146, + "learning_rate": 4.749868122682177e-05, + "loss": 0.8334, + "step": 17910 + }, + { + "epoch": 0.28764506653397326, + "grad_norm": 0.7929838299751282, + "learning_rate": 4.7495931770489235e-05, + "loss": 0.7956, + "step": 17920 + }, + { + "epoch": 0.2878055827541373, + "grad_norm": 0.5596081018447876, + "learning_rate": 4.749318088354746e-05, + "loss": 0.8981, + "step": 17930 + }, + { + "epoch": 0.28796609897430137, + "grad_norm": 0.8276767134666443, + "learning_rate": 4.749042856617141e-05, + "loss": 0.7506, + "step": 17940 + }, + { + "epoch": 0.2881266151944654, + "grad_norm": 0.5455018877983093, + "learning_rate": 4.74876748185361e-05, + "loss": 0.8803, + "step": 17950 + }, + { + "epoch": 0.28828713141462947, + "grad_norm": 0.5499597191810608, + "learning_rate": 4.7484919640816666e-05, + "loss": 0.7997, + "step": 17960 + }, + { + "epoch": 0.28844764763479347, + "grad_norm": 0.690523624420166, + "learning_rate": 4.748216303318832e-05, + "loss": 0.7043, + "step": 17970 + }, + { + "epoch": 0.2886081638549575, + "grad_norm": 0.6027265787124634, + "learning_rate": 4.7479404995826355e-05, + "loss": 0.8269, + "step": 17980 + }, + { + "epoch": 0.2887686800751216, + "grad_norm": 0.6171227097511292, + "learning_rate": 4.747664552890617e-05, + "loss": 0.8487, + "step": 17990 + }, + { + "epoch": 0.28892919629528563, + "grad_norm": 0.7227647304534912, + "learning_rate": 4.7473884632603266e-05, + "loss": 0.8254, + "step": 18000 + }, + { + "epoch": 0.2890897125154497, + "grad_norm": 0.6248397827148438, + "learning_rate": 4.7471122307093205e-05, + "loss": 0.8595, + "step": 18010 + }, + { + "epoch": 0.28925022873561373, + "grad_norm": 0.5586646795272827, + "learning_rate": 4.746835855255165e-05, + "loss": 0.7884, + "step": 18020 + }, + { + "epoch": 0.2894107449557778, + "grad_norm": 0.7580485939979553, + "learning_rate": 4.746559336915437e-05, + "loss": 0.8037, + "step": 18030 + }, + { + "epoch": 0.28957126117594184, + "grad_norm": 1.0993611812591553, + "learning_rate": 4.7462826757077215e-05, + "loss": 0.7463, + "step": 18040 + }, + { + "epoch": 0.2897317773961059, + "grad_norm": 1.010907530784607, + "learning_rate": 4.746005871649611e-05, + "loss": 0.8276, + "step": 18050 + }, + { + "epoch": 0.28989229361626995, + "grad_norm": 0.8964709639549255, + "learning_rate": 4.7457289247587114e-05, + "loss": 0.8311, + "step": 18060 + }, + { + "epoch": 0.29005280983643394, + "grad_norm": 1.227984070777893, + "learning_rate": 4.7454518350526324e-05, + "loss": 0.8064, + "step": 18070 + }, + { + "epoch": 0.290213326056598, + "grad_norm": 0.671233594417572, + "learning_rate": 4.745174602548996e-05, + "loss": 0.7299, + "step": 18080 + }, + { + "epoch": 0.29037384227676205, + "grad_norm": 1.0590484142303467, + "learning_rate": 4.744897227265434e-05, + "loss": 0.7315, + "step": 18090 + }, + { + "epoch": 0.2905343584969261, + "grad_norm": 0.6138243675231934, + "learning_rate": 4.7446197092195835e-05, + "loss": 0.8436, + "step": 18100 + }, + { + "epoch": 0.29069487471709016, + "grad_norm": 0.5970408916473389, + "learning_rate": 4.7443420484290934e-05, + "loss": 0.7376, + "step": 18110 + }, + { + "epoch": 0.2908553909372542, + "grad_norm": 0.6940883994102478, + "learning_rate": 4.744064244911624e-05, + "loss": 0.8113, + "step": 18120 + }, + { + "epoch": 0.29101590715741826, + "grad_norm": 0.7792630791664124, + "learning_rate": 4.743786298684838e-05, + "loss": 0.8552, + "step": 18130 + }, + { + "epoch": 0.2911764233775823, + "grad_norm": 0.6445935368537903, + "learning_rate": 4.743508209766413e-05, + "loss": 0.7169, + "step": 18140 + }, + { + "epoch": 0.29133693959774637, + "grad_norm": 0.5781461000442505, + "learning_rate": 4.7432299781740355e-05, + "loss": 0.8002, + "step": 18150 + }, + { + "epoch": 0.2914974558179104, + "grad_norm": 0.7960131764411926, + "learning_rate": 4.742951603925397e-05, + "loss": 0.8707, + "step": 18160 + }, + { + "epoch": 0.2916579720380744, + "grad_norm": 0.9116689562797546, + "learning_rate": 4.7426730870382007e-05, + "loss": 0.9107, + "step": 18170 + }, + { + "epoch": 0.2918184882582385, + "grad_norm": 0.728033721446991, + "learning_rate": 4.7423944275301604e-05, + "loss": 0.7344, + "step": 18180 + }, + { + "epoch": 0.2919790044784025, + "grad_norm": 0.5908055305480957, + "learning_rate": 4.7421156254189945e-05, + "loss": 0.7975, + "step": 18190 + }, + { + "epoch": 0.2921395206985666, + "grad_norm": 0.5678460597991943, + "learning_rate": 4.7418366807224364e-05, + "loss": 0.9057, + "step": 18200 + }, + { + "epoch": 0.29230003691873063, + "grad_norm": 0.7046424150466919, + "learning_rate": 4.7415575934582224e-05, + "loss": 0.8112, + "step": 18210 + }, + { + "epoch": 0.2924605531388947, + "grad_norm": 0.7491626739501953, + "learning_rate": 4.741278363644103e-05, + "loss": 0.6796, + "step": 18220 + }, + { + "epoch": 0.29262106935905874, + "grad_norm": 0.5524681210517883, + "learning_rate": 4.740998991297834e-05, + "loss": 0.6912, + "step": 18230 + }, + { + "epoch": 0.2927815855792228, + "grad_norm": 0.5028963685035706, + "learning_rate": 4.740719476437183e-05, + "loss": 0.8246, + "step": 18240 + }, + { + "epoch": 0.29294210179938684, + "grad_norm": 0.5380961894989014, + "learning_rate": 4.7404398190799244e-05, + "loss": 0.6003, + "step": 18250 + }, + { + "epoch": 0.2931026180195509, + "grad_norm": 0.8473597168922424, + "learning_rate": 4.7401600192438435e-05, + "loss": 0.8205, + "step": 18260 + }, + { + "epoch": 0.2932631342397149, + "grad_norm": 0.9900450706481934, + "learning_rate": 4.739880076946734e-05, + "loss": 0.75, + "step": 18270 + }, + { + "epoch": 0.29342365045987895, + "grad_norm": 0.7254894375801086, + "learning_rate": 4.739599992206398e-05, + "loss": 0.7546, + "step": 18280 + }, + { + "epoch": 0.293584166680043, + "grad_norm": 1.0066196918487549, + "learning_rate": 4.739319765040648e-05, + "loss": 0.7764, + "step": 18290 + }, + { + "epoch": 0.29374468290020705, + "grad_norm": 1.0690150260925293, + "learning_rate": 4.739039395467304e-05, + "loss": 0.735, + "step": 18300 + }, + { + "epoch": 0.2939051991203711, + "grad_norm": 1.176764726638794, + "learning_rate": 4.738758883504197e-05, + "loss": 0.8202, + "step": 18310 + }, + { + "epoch": 0.29406571534053516, + "grad_norm": 0.6089344620704651, + "learning_rate": 4.738478229169164e-05, + "loss": 0.7229, + "step": 18320 + }, + { + "epoch": 0.2942262315606992, + "grad_norm": 0.6435672640800476, + "learning_rate": 4.7381974324800554e-05, + "loss": 0.7915, + "step": 18330 + }, + { + "epoch": 0.29438674778086327, + "grad_norm": 0.7333799600601196, + "learning_rate": 4.737916493454726e-05, + "loss": 0.8486, + "step": 18340 + }, + { + "epoch": 0.2945472640010273, + "grad_norm": 0.5215854048728943, + "learning_rate": 4.737635412111044e-05, + "loss": 0.7856, + "step": 18350 + }, + { + "epoch": 0.2947077802211914, + "grad_norm": 1.7937767505645752, + "learning_rate": 4.7373541884668825e-05, + "loss": 0.75, + "step": 18360 + }, + { + "epoch": 0.2948682964413554, + "grad_norm": 0.616848349571228, + "learning_rate": 4.737072822540127e-05, + "loss": 0.87, + "step": 18370 + }, + { + "epoch": 0.2950288126615194, + "grad_norm": 0.747935950756073, + "learning_rate": 4.73679131434867e-05, + "loss": 0.7828, + "step": 18380 + }, + { + "epoch": 0.2951893288816835, + "grad_norm": 0.655630886554718, + "learning_rate": 4.736509663910414e-05, + "loss": 0.8092, + "step": 18390 + }, + { + "epoch": 0.29534984510184753, + "grad_norm": 0.5675885677337646, + "learning_rate": 4.736227871243271e-05, + "loss": 0.813, + "step": 18400 + }, + { + "epoch": 0.2955103613220116, + "grad_norm": 0.9600654244422913, + "learning_rate": 4.7359459363651594e-05, + "loss": 0.829, + "step": 18410 + }, + { + "epoch": 0.29567087754217564, + "grad_norm": 0.7537384033203125, + "learning_rate": 4.73566385929401e-05, + "loss": 0.7555, + "step": 18420 + }, + { + "epoch": 0.2958313937623397, + "grad_norm": 0.6121558547019958, + "learning_rate": 4.735381640047762e-05, + "loss": 0.8043, + "step": 18430 + }, + { + "epoch": 0.29599190998250374, + "grad_norm": 0.5588534474372864, + "learning_rate": 4.735099278644363e-05, + "loss": 0.8544, + "step": 18440 + }, + { + "epoch": 0.2961524262026678, + "grad_norm": 0.7123005390167236, + "learning_rate": 4.734816775101767e-05, + "loss": 0.6369, + "step": 18450 + }, + { + "epoch": 0.29631294242283185, + "grad_norm": 0.687372624874115, + "learning_rate": 4.734534129437942e-05, + "loss": 0.8389, + "step": 18460 + }, + { + "epoch": 0.2964734586429959, + "grad_norm": 1.154371738433838, + "learning_rate": 4.734251341670861e-05, + "loss": 0.9186, + "step": 18470 + }, + { + "epoch": 0.2966339748631599, + "grad_norm": 0.5778452754020691, + "learning_rate": 4.733968411818509e-05, + "loss": 0.7985, + "step": 18480 + }, + { + "epoch": 0.29679449108332395, + "grad_norm": 1.0746386051177979, + "learning_rate": 4.733685339898878e-05, + "loss": 0.5987, + "step": 18490 + }, + { + "epoch": 0.296955007303488, + "grad_norm": 0.5637757778167725, + "learning_rate": 4.733402125929969e-05, + "loss": 0.6845, + "step": 18500 + }, + { + "epoch": 0.29711552352365206, + "grad_norm": 0.589174211025238, + "learning_rate": 4.733118769929794e-05, + "loss": 0.8774, + "step": 18510 + }, + { + "epoch": 0.2972760397438161, + "grad_norm": 0.893913984298706, + "learning_rate": 4.732835271916373e-05, + "loss": 0.6593, + "step": 18520 + }, + { + "epoch": 0.29743655596398016, + "grad_norm": 0.6390604376792908, + "learning_rate": 4.7325516319077335e-05, + "loss": 0.8642, + "step": 18530 + }, + { + "epoch": 0.2975970721841442, + "grad_norm": 0.6444327235221863, + "learning_rate": 4.7322678499219135e-05, + "loss": 0.8411, + "step": 18540 + }, + { + "epoch": 0.29775758840430827, + "grad_norm": 0.546097457408905, + "learning_rate": 4.731983925976961e-05, + "loss": 0.7632, + "step": 18550 + }, + { + "epoch": 0.2979181046244723, + "grad_norm": 0.7329803705215454, + "learning_rate": 4.731699860090931e-05, + "loss": 0.887, + "step": 18560 + }, + { + "epoch": 0.2980786208446364, + "grad_norm": 0.6713755130767822, + "learning_rate": 4.7314156522818884e-05, + "loss": 0.837, + "step": 18570 + }, + { + "epoch": 0.2982391370648004, + "grad_norm": 0.7681695222854614, + "learning_rate": 4.731131302567908e-05, + "loss": 0.7009, + "step": 18580 + }, + { + "epoch": 0.29839965328496443, + "grad_norm": 0.6455259323120117, + "learning_rate": 4.7308468109670724e-05, + "loss": 0.8295, + "step": 18590 + }, + { + "epoch": 0.2985601695051285, + "grad_norm": 0.47563159465789795, + "learning_rate": 4.730562177497473e-05, + "loss": 0.8843, + "step": 18600 + }, + { + "epoch": 0.29872068572529253, + "grad_norm": 1.0737448930740356, + "learning_rate": 4.730277402177211e-05, + "loss": 0.6841, + "step": 18610 + }, + { + "epoch": 0.2988812019454566, + "grad_norm": 0.6389158964157104, + "learning_rate": 4.729992485024396e-05, + "loss": 0.8315, + "step": 18620 + }, + { + "epoch": 0.29904171816562064, + "grad_norm": 0.6511031985282898, + "learning_rate": 4.7297074260571486e-05, + "loss": 0.8956, + "step": 18630 + }, + { + "epoch": 0.2992022343857847, + "grad_norm": 0.7631812691688538, + "learning_rate": 4.729422225293596e-05, + "loss": 0.8074, + "step": 18640 + }, + { + "epoch": 0.29936275060594875, + "grad_norm": 0.5304189920425415, + "learning_rate": 4.729136882751874e-05, + "loss": 0.8494, + "step": 18650 + }, + { + "epoch": 0.2995232668261128, + "grad_norm": 0.5866103768348694, + "learning_rate": 4.7288513984501314e-05, + "loss": 0.8036, + "step": 18660 + }, + { + "epoch": 0.29968378304627685, + "grad_norm": 1.115495204925537, + "learning_rate": 4.728565772406521e-05, + "loss": 0.8114, + "step": 18670 + }, + { + "epoch": 0.29984429926644085, + "grad_norm": 0.6416015028953552, + "learning_rate": 4.728280004639208e-05, + "loss": 0.8336, + "step": 18680 + }, + { + "epoch": 0.3000048154866049, + "grad_norm": 0.6445107460021973, + "learning_rate": 4.727994095166366e-05, + "loss": 0.7681, + "step": 18690 + }, + { + "epoch": 0.30016533170676896, + "grad_norm": 0.43859556317329407, + "learning_rate": 4.7277080440061765e-05, + "loss": 0.8223, + "step": 18700 + }, + { + "epoch": 0.300325847926933, + "grad_norm": 0.7264307141304016, + "learning_rate": 4.7274218511768306e-05, + "loss": 0.7244, + "step": 18710 + }, + { + "epoch": 0.30048636414709706, + "grad_norm": 0.7324657440185547, + "learning_rate": 4.7271355166965284e-05, + "loss": 0.7531, + "step": 18720 + }, + { + "epoch": 0.3006468803672611, + "grad_norm": 0.5821089744567871, + "learning_rate": 4.7268490405834785e-05, + "loss": 0.8676, + "step": 18730 + }, + { + "epoch": 0.30080739658742517, + "grad_norm": 0.8080869317054749, + "learning_rate": 4.7265624228559016e-05, + "loss": 0.8041, + "step": 18740 + }, + { + "epoch": 0.3009679128075892, + "grad_norm": 0.8913848996162415, + "learning_rate": 4.726275663532022e-05, + "loss": 0.7144, + "step": 18750 + }, + { + "epoch": 0.3011284290277533, + "grad_norm": 0.876528263092041, + "learning_rate": 4.725988762630077e-05, + "loss": 0.7607, + "step": 18760 + }, + { + "epoch": 0.30128894524791733, + "grad_norm": 0.46937036514282227, + "learning_rate": 4.7257017201683124e-05, + "loss": 0.7077, + "step": 18770 + }, + { + "epoch": 0.3014494614680813, + "grad_norm": 0.7273525595664978, + "learning_rate": 4.725414536164981e-05, + "loss": 0.7001, + "step": 18780 + }, + { + "epoch": 0.3016099776882454, + "grad_norm": 0.5185369849205017, + "learning_rate": 4.7251272106383485e-05, + "loss": 0.6412, + "step": 18790 + }, + { + "epoch": 0.30177049390840943, + "grad_norm": 0.6824181079864502, + "learning_rate": 4.724839743606685e-05, + "loss": 0.7711, + "step": 18800 + }, + { + "epoch": 0.3019310101285735, + "grad_norm": 0.6424569487571716, + "learning_rate": 4.724552135088272e-05, + "loss": 0.8293, + "step": 18810 + }, + { + "epoch": 0.30209152634873754, + "grad_norm": 0.8032526969909668, + "learning_rate": 4.7242643851014e-05, + "loss": 0.6727, + "step": 18820 + }, + { + "epoch": 0.3022520425689016, + "grad_norm": 1.0124889612197876, + "learning_rate": 4.723976493664369e-05, + "loss": 0.7462, + "step": 18830 + }, + { + "epoch": 0.30241255878906564, + "grad_norm": 1.3537452220916748, + "learning_rate": 4.723688460795485e-05, + "loss": 0.8173, + "step": 18840 + }, + { + "epoch": 0.3025730750092297, + "grad_norm": 0.6548802852630615, + "learning_rate": 4.7234002865130674e-05, + "loss": 0.8622, + "step": 18850 + }, + { + "epoch": 0.30273359122939375, + "grad_norm": 0.7598744034767151, + "learning_rate": 4.7231119708354424e-05, + "loss": 0.7726, + "step": 18860 + }, + { + "epoch": 0.3028941074495578, + "grad_norm": 0.8300503492355347, + "learning_rate": 4.7228235137809433e-05, + "loss": 0.7995, + "step": 18870 + }, + { + "epoch": 0.3030546236697218, + "grad_norm": 0.7055060863494873, + "learning_rate": 4.722534915367916e-05, + "loss": 0.7551, + "step": 18880 + }, + { + "epoch": 0.30321513988988585, + "grad_norm": 0.7297700047492981, + "learning_rate": 4.722246175614712e-05, + "loss": 0.7642, + "step": 18890 + }, + { + "epoch": 0.3033756561100499, + "grad_norm": 0.5687710642814636, + "learning_rate": 4.721957294539696e-05, + "loss": 0.8015, + "step": 18900 + }, + { + "epoch": 0.30353617233021396, + "grad_norm": 0.605600893497467, + "learning_rate": 4.721668272161237e-05, + "loss": 0.7429, + "step": 18910 + }, + { + "epoch": 0.303696688550378, + "grad_norm": 0.6588074564933777, + "learning_rate": 4.721379108497716e-05, + "loss": 0.832, + "step": 18920 + }, + { + "epoch": 0.30385720477054207, + "grad_norm": 0.5994177460670471, + "learning_rate": 4.721089803567521e-05, + "loss": 0.7768, + "step": 18930 + }, + { + "epoch": 0.3040177209907061, + "grad_norm": 0.6763418912887573, + "learning_rate": 4.720800357389051e-05, + "loss": 0.764, + "step": 18940 + }, + { + "epoch": 0.3041782372108702, + "grad_norm": 0.5827973484992981, + "learning_rate": 4.720510769980714e-05, + "loss": 0.78, + "step": 18950 + }, + { + "epoch": 0.3043387534310342, + "grad_norm": 1.131916880607605, + "learning_rate": 4.7202210413609245e-05, + "loss": 0.7539, + "step": 18960 + }, + { + "epoch": 0.3044992696511983, + "grad_norm": 0.4261746406555176, + "learning_rate": 4.7199311715481084e-05, + "loss": 0.7348, + "step": 18970 + }, + { + "epoch": 0.3046597858713623, + "grad_norm": 0.5913972854614258, + "learning_rate": 4.719641160560699e-05, + "loss": 0.8103, + "step": 18980 + }, + { + "epoch": 0.30482030209152633, + "grad_norm": 0.6242275834083557, + "learning_rate": 4.71935100841714e-05, + "loss": 0.7882, + "step": 18990 + }, + { + "epoch": 0.3049808183116904, + "grad_norm": 0.5429794788360596, + "learning_rate": 4.719060715135883e-05, + "loss": 0.7511, + "step": 19000 + }, + { + "epoch": 0.30514133453185444, + "grad_norm": 0.8199679255485535, + "learning_rate": 4.718770280735389e-05, + "loss": 0.8613, + "step": 19010 + }, + { + "epoch": 0.3053018507520185, + "grad_norm": 0.6794747710227966, + "learning_rate": 4.718479705234129e-05, + "loss": 0.7998, + "step": 19020 + }, + { + "epoch": 0.30546236697218254, + "grad_norm": 2.0105581283569336, + "learning_rate": 4.71818898865058e-05, + "loss": 0.7364, + "step": 19030 + }, + { + "epoch": 0.3056228831923466, + "grad_norm": 0.7274786233901978, + "learning_rate": 4.717898131003231e-05, + "loss": 0.9135, + "step": 19040 + }, + { + "epoch": 0.30578339941251065, + "grad_norm": 0.45931723713874817, + "learning_rate": 4.7176071323105784e-05, + "loss": 0.852, + "step": 19050 + }, + { + "epoch": 0.3059439156326747, + "grad_norm": 1.0266475677490234, + "learning_rate": 4.717315992591129e-05, + "loss": 0.7057, + "step": 19060 + }, + { + "epoch": 0.30610443185283875, + "grad_norm": 0.47959402203559875, + "learning_rate": 4.717024711863396e-05, + "loss": 0.8061, + "step": 19070 + }, + { + "epoch": 0.30626494807300275, + "grad_norm": 0.5507435202598572, + "learning_rate": 4.7167332901459035e-05, + "loss": 0.8355, + "step": 19080 + }, + { + "epoch": 0.3064254642931668, + "grad_norm": 0.6578642725944519, + "learning_rate": 4.716441727457186e-05, + "loss": 0.7372, + "step": 19090 + }, + { + "epoch": 0.30658598051333086, + "grad_norm": 0.5130446553230286, + "learning_rate": 4.716150023815783e-05, + "loss": 0.7797, + "step": 19100 + }, + { + "epoch": 0.3067464967334949, + "grad_norm": 0.5004091858863831, + "learning_rate": 4.7158581792402454e-05, + "loss": 0.7471, + "step": 19110 + }, + { + "epoch": 0.30690701295365896, + "grad_norm": 0.8051559925079346, + "learning_rate": 4.715566193749135e-05, + "loss": 0.8862, + "step": 19120 + }, + { + "epoch": 0.307067529173823, + "grad_norm": 0.9079741835594177, + "learning_rate": 4.715274067361017e-05, + "loss": 0.7993, + "step": 19130 + }, + { + "epoch": 0.30722804539398707, + "grad_norm": 0.755452573299408, + "learning_rate": 4.7149818000944716e-05, + "loss": 0.7815, + "step": 19140 + }, + { + "epoch": 0.3073885616141511, + "grad_norm": 0.8961483836174011, + "learning_rate": 4.7146893919680844e-05, + "loss": 0.7839, + "step": 19150 + }, + { + "epoch": 0.3075490778343152, + "grad_norm": 0.6553107500076294, + "learning_rate": 4.71439684300045e-05, + "loss": 0.6874, + "step": 19160 + }, + { + "epoch": 0.30770959405447923, + "grad_norm": 0.6166565418243408, + "learning_rate": 4.714104153210175e-05, + "loss": 0.7181, + "step": 19170 + }, + { + "epoch": 0.3078701102746432, + "grad_norm": 0.8068774342536926, + "learning_rate": 4.7138113226158696e-05, + "loss": 0.7229, + "step": 19180 + }, + { + "epoch": 0.3080306264948073, + "grad_norm": 0.78798508644104, + "learning_rate": 4.713518351236159e-05, + "loss": 0.7687, + "step": 19190 + }, + { + "epoch": 0.30819114271497133, + "grad_norm": 0.7515683770179749, + "learning_rate": 4.713225239089673e-05, + "loss": 0.7928, + "step": 19200 + }, + { + "epoch": 0.3083516589351354, + "grad_norm": 0.8746390342712402, + "learning_rate": 4.712931986195052e-05, + "loss": 0.8311, + "step": 19210 + }, + { + "epoch": 0.30851217515529944, + "grad_norm": 0.6215503811836243, + "learning_rate": 4.7126385925709456e-05, + "loss": 0.911, + "step": 19220 + }, + { + "epoch": 0.3086726913754635, + "grad_norm": 0.6278584003448486, + "learning_rate": 4.7123450582360115e-05, + "loss": 0.7925, + "step": 19230 + }, + { + "epoch": 0.30883320759562755, + "grad_norm": 0.699363648891449, + "learning_rate": 4.7120513832089165e-05, + "loss": 0.7716, + "step": 19240 + }, + { + "epoch": 0.3089937238157916, + "grad_norm": 0.6741727590560913, + "learning_rate": 4.711757567508337e-05, + "loss": 0.8227, + "step": 19250 + }, + { + "epoch": 0.30915424003595565, + "grad_norm": 0.6587752103805542, + "learning_rate": 4.711463611152958e-05, + "loss": 0.8389, + "step": 19260 + }, + { + "epoch": 0.3093147562561197, + "grad_norm": 0.602256715297699, + "learning_rate": 4.711169514161473e-05, + "loss": 0.7656, + "step": 19270 + }, + { + "epoch": 0.3094752724762837, + "grad_norm": 0.5741090178489685, + "learning_rate": 4.710875276552586e-05, + "loss": 0.7618, + "step": 19280 + }, + { + "epoch": 0.30963578869644776, + "grad_norm": 0.702176570892334, + "learning_rate": 4.710580898345007e-05, + "loss": 0.7933, + "step": 19290 + }, + { + "epoch": 0.3097963049166118, + "grad_norm": 0.6441376209259033, + "learning_rate": 4.710286379557458e-05, + "loss": 0.7996, + "step": 19300 + }, + { + "epoch": 0.30995682113677586, + "grad_norm": 0.5637532472610474, + "learning_rate": 4.7099917202086685e-05, + "loss": 0.8061, + "step": 19310 + }, + { + "epoch": 0.3101173373569399, + "grad_norm": 0.7694031596183777, + "learning_rate": 4.709696920317377e-05, + "loss": 0.8713, + "step": 19320 + }, + { + "epoch": 0.31027785357710397, + "grad_norm": 0.9769605994224548, + "learning_rate": 4.709401979902331e-05, + "loss": 0.8317, + "step": 19330 + }, + { + "epoch": 0.310438369797268, + "grad_norm": 0.8914659023284912, + "learning_rate": 4.709106898982287e-05, + "loss": 0.7635, + "step": 19340 + }, + { + "epoch": 0.3105988860174321, + "grad_norm": 0.6866865158081055, + "learning_rate": 4.7088116775760096e-05, + "loss": 0.8607, + "step": 19350 + }, + { + "epoch": 0.3107594022375961, + "grad_norm": 0.8454020023345947, + "learning_rate": 4.708516315702275e-05, + "loss": 0.7346, + "step": 19360 + }, + { + "epoch": 0.3109199184577602, + "grad_norm": 0.6230736970901489, + "learning_rate": 4.708220813379864e-05, + "loss": 0.783, + "step": 19370 + }, + { + "epoch": 0.3110804346779242, + "grad_norm": 0.7801986336708069, + "learning_rate": 4.707925170627572e-05, + "loss": 0.6883, + "step": 19380 + }, + { + "epoch": 0.31124095089808823, + "grad_norm": 0.5438198447227478, + "learning_rate": 4.707629387464197e-05, + "loss": 0.8643, + "step": 19390 + }, + { + "epoch": 0.3114014671182523, + "grad_norm": 0.7491708993911743, + "learning_rate": 4.707333463908551e-05, + "loss": 0.8396, + "step": 19400 + }, + { + "epoch": 0.31156198333841634, + "grad_norm": 0.6488578915596008, + "learning_rate": 4.707037399979452e-05, + "loss": 0.9204, + "step": 19410 + }, + { + "epoch": 0.3117224995585804, + "grad_norm": 0.5728543400764465, + "learning_rate": 4.7067411956957284e-05, + "loss": 0.7356, + "step": 19420 + }, + { + "epoch": 0.31188301577874444, + "grad_norm": 0.5538259744644165, + "learning_rate": 4.706444851076217e-05, + "loss": 0.8686, + "step": 19430 + }, + { + "epoch": 0.3120435319989085, + "grad_norm": 0.7063315510749817, + "learning_rate": 4.7061483661397644e-05, + "loss": 0.7461, + "step": 19440 + }, + { + "epoch": 0.31220404821907255, + "grad_norm": 1.0832927227020264, + "learning_rate": 4.705851740905224e-05, + "loss": 0.8856, + "step": 19450 + }, + { + "epoch": 0.3123645644392366, + "grad_norm": 0.4529445469379425, + "learning_rate": 4.7055549753914595e-05, + "loss": 0.8726, + "step": 19460 + }, + { + "epoch": 0.31252508065940066, + "grad_norm": 0.6413730382919312, + "learning_rate": 4.705258069617344e-05, + "loss": 0.8526, + "step": 19470 + }, + { + "epoch": 0.31268559687956465, + "grad_norm": 0.6191111207008362, + "learning_rate": 4.704961023601759e-05, + "loss": 0.7082, + "step": 19480 + }, + { + "epoch": 0.3128461130997287, + "grad_norm": 0.9617066383361816, + "learning_rate": 4.7046638373635945e-05, + "loss": 0.7553, + "step": 19490 + }, + { + "epoch": 0.31300662931989276, + "grad_norm": 0.6199307441711426, + "learning_rate": 4.704366510921751e-05, + "loss": 0.8712, + "step": 19500 + }, + { + "epoch": 0.3131671455400568, + "grad_norm": 1.0873644351959229, + "learning_rate": 4.7040690442951344e-05, + "loss": 0.7668, + "step": 19510 + }, + { + "epoch": 0.31332766176022087, + "grad_norm": 0.5351448655128479, + "learning_rate": 4.703771437502664e-05, + "loss": 0.924, + "step": 19520 + }, + { + "epoch": 0.3134881779803849, + "grad_norm": 0.5819586515426636, + "learning_rate": 4.703473690563265e-05, + "loss": 0.7119, + "step": 19530 + }, + { + "epoch": 0.31364869420054897, + "grad_norm": 0.724324643611908, + "learning_rate": 4.7031758034958714e-05, + "loss": 0.8164, + "step": 19540 + }, + { + "epoch": 0.313809210420713, + "grad_norm": 0.7656018137931824, + "learning_rate": 4.702877776319429e-05, + "loss": 0.8519, + "step": 19550 + }, + { + "epoch": 0.3139697266408771, + "grad_norm": 1.6010020971298218, + "learning_rate": 4.702579609052889e-05, + "loss": 0.8093, + "step": 19560 + }, + { + "epoch": 0.31413024286104113, + "grad_norm": 2.2795510292053223, + "learning_rate": 4.702281301715214e-05, + "loss": 0.8321, + "step": 19570 + }, + { + "epoch": 0.31429075908120513, + "grad_norm": 1.0586585998535156, + "learning_rate": 4.701982854325374e-05, + "loss": 0.8768, + "step": 19580 + }, + { + "epoch": 0.3144512753013692, + "grad_norm": 0.5033349990844727, + "learning_rate": 4.701684266902349e-05, + "loss": 0.7088, + "step": 19590 + }, + { + "epoch": 0.31461179152153324, + "grad_norm": 0.5792729258537292, + "learning_rate": 4.7013855394651276e-05, + "loss": 0.8276, + "step": 19600 + }, + { + "epoch": 0.3147723077416973, + "grad_norm": 0.7063154578208923, + "learning_rate": 4.7010866720327065e-05, + "loss": 0.7297, + "step": 19610 + }, + { + "epoch": 0.31493282396186134, + "grad_norm": 0.5521560907363892, + "learning_rate": 4.7007876646240916e-05, + "loss": 0.7839, + "step": 19620 + }, + { + "epoch": 0.3150933401820254, + "grad_norm": 0.5696978569030762, + "learning_rate": 4.700488517258299e-05, + "loss": 0.7348, + "step": 19630 + }, + { + "epoch": 0.31525385640218945, + "grad_norm": 0.53019118309021, + "learning_rate": 4.7001892299543525e-05, + "loss": 0.7159, + "step": 19640 + }, + { + "epoch": 0.3154143726223535, + "grad_norm": 0.5081175565719604, + "learning_rate": 4.6998898027312844e-05, + "loss": 0.7777, + "step": 19650 + }, + { + "epoch": 0.31557488884251755, + "grad_norm": 0.7862869501113892, + "learning_rate": 4.699590235608137e-05, + "loss": 0.7923, + "step": 19660 + }, + { + "epoch": 0.3157354050626816, + "grad_norm": 0.977203905582428, + "learning_rate": 4.699290528603961e-05, + "loss": 0.7348, + "step": 19670 + }, + { + "epoch": 0.3158959212828456, + "grad_norm": 0.9169418215751648, + "learning_rate": 4.698990681737815e-05, + "loss": 0.8347, + "step": 19680 + }, + { + "epoch": 0.31605643750300966, + "grad_norm": 0.7115100026130676, + "learning_rate": 4.69869069502877e-05, + "loss": 0.7771, + "step": 19690 + }, + { + "epoch": 0.3162169537231737, + "grad_norm": 0.6924024820327759, + "learning_rate": 4.698390568495901e-05, + "loss": 0.8283, + "step": 19700 + }, + { + "epoch": 0.31637746994333776, + "grad_norm": 0.6899773478507996, + "learning_rate": 4.6980903021582946e-05, + "loss": 0.7633, + "step": 19710 + }, + { + "epoch": 0.3165379861635018, + "grad_norm": 0.5910447239875793, + "learning_rate": 4.697789896035047e-05, + "loss": 0.8661, + "step": 19720 + }, + { + "epoch": 0.31669850238366587, + "grad_norm": 0.7411358952522278, + "learning_rate": 4.6974893501452616e-05, + "loss": 0.6829, + "step": 19730 + }, + { + "epoch": 0.3168590186038299, + "grad_norm": 0.7992265224456787, + "learning_rate": 4.697188664508052e-05, + "loss": 0.8963, + "step": 19740 + }, + { + "epoch": 0.317019534823994, + "grad_norm": 1.1055302619934082, + "learning_rate": 4.696887839142539e-05, + "loss": 0.8494, + "step": 19750 + }, + { + "epoch": 0.31718005104415803, + "grad_norm": 0.4741075932979584, + "learning_rate": 4.696586874067853e-05, + "loss": 0.9035, + "step": 19760 + }, + { + "epoch": 0.3173405672643221, + "grad_norm": 0.743175208568573, + "learning_rate": 4.696285769303136e-05, + "loss": 0.7525, + "step": 19770 + }, + { + "epoch": 0.3175010834844861, + "grad_norm": 0.5927312970161438, + "learning_rate": 4.695984524867534e-05, + "loss": 0.8913, + "step": 19780 + }, + { + "epoch": 0.31766159970465013, + "grad_norm": 0.8793267607688904, + "learning_rate": 4.695683140780206e-05, + "loss": 0.7244, + "step": 19790 + }, + { + "epoch": 0.3178221159248142, + "grad_norm": 0.49607688188552856, + "learning_rate": 4.695381617060317e-05, + "loss": 0.7169, + "step": 19800 + }, + { + "epoch": 0.31798263214497824, + "grad_norm": 0.6057771444320679, + "learning_rate": 4.695079953727043e-05, + "loss": 0.8216, + "step": 19810 + }, + { + "epoch": 0.3181431483651423, + "grad_norm": 0.5300577282905579, + "learning_rate": 4.6947781507995683e-05, + "loss": 0.6605, + "step": 19820 + }, + { + "epoch": 0.31830366458530635, + "grad_norm": 0.4860164225101471, + "learning_rate": 4.694476208297084e-05, + "loss": 0.784, + "step": 19830 + }, + { + "epoch": 0.3184641808054704, + "grad_norm": 0.8039216995239258, + "learning_rate": 4.694174126238794e-05, + "loss": 0.8634, + "step": 19840 + }, + { + "epoch": 0.31862469702563445, + "grad_norm": 0.5519948601722717, + "learning_rate": 4.6938719046439087e-05, + "loss": 0.6922, + "step": 19850 + }, + { + "epoch": 0.3187852132457985, + "grad_norm": 0.5139049291610718, + "learning_rate": 4.6935695435316454e-05, + "loss": 0.8584, + "step": 19860 + }, + { + "epoch": 0.31894572946596256, + "grad_norm": 0.7781885862350464, + "learning_rate": 4.693267042921235e-05, + "loss": 0.7832, + "step": 19870 + }, + { + "epoch": 0.3191062456861266, + "grad_norm": 0.46826839447021484, + "learning_rate": 4.6929644028319136e-05, + "loss": 0.7253, + "step": 19880 + }, + { + "epoch": 0.3192667619062906, + "grad_norm": 0.7184168100357056, + "learning_rate": 4.692661623282928e-05, + "loss": 0.8112, + "step": 19890 + }, + { + "epoch": 0.31942727812645466, + "grad_norm": 0.5986685156822205, + "learning_rate": 4.6923587042935324e-05, + "loss": 0.8575, + "step": 19900 + }, + { + "epoch": 0.3195877943466187, + "grad_norm": 0.6878703236579895, + "learning_rate": 4.692055645882991e-05, + "loss": 0.8715, + "step": 19910 + }, + { + "epoch": 0.31974831056678277, + "grad_norm": 0.9332549571990967, + "learning_rate": 4.6917524480705765e-05, + "loss": 0.8312, + "step": 19920 + }, + { + "epoch": 0.3199088267869468, + "grad_norm": 1.170471429824829, + "learning_rate": 4.691449110875571e-05, + "loss": 0.7957, + "step": 19930 + }, + { + "epoch": 0.3200693430071109, + "grad_norm": 0.5972083210945129, + "learning_rate": 4.691145634317265e-05, + "loss": 0.751, + "step": 19940 + }, + { + "epoch": 0.3202298592272749, + "grad_norm": 0.7739942073822021, + "learning_rate": 4.690842018414956e-05, + "loss": 0.8748, + "step": 19950 + }, + { + "epoch": 0.320390375447439, + "grad_norm": 0.5972018837928772, + "learning_rate": 4.690538263187954e-05, + "loss": 0.7859, + "step": 19960 + }, + { + "epoch": 0.32055089166760303, + "grad_norm": 0.6361020803451538, + "learning_rate": 4.690234368655576e-05, + "loss": 0.836, + "step": 19970 + }, + { + "epoch": 0.3207114078877671, + "grad_norm": 0.9547080993652344, + "learning_rate": 4.689930334837147e-05, + "loss": 0.8598, + "step": 19980 + }, + { + "epoch": 0.3208719241079311, + "grad_norm": 0.4484705924987793, + "learning_rate": 4.689626161752003e-05, + "loss": 0.8861, + "step": 19990 + }, + { + "epoch": 0.32103244032809514, + "grad_norm": 0.583728015422821, + "learning_rate": 4.689321849419487e-05, + "loss": 0.8431, + "step": 20000 + }, + { + "epoch": 0.32103244032809514, + "eval_loss": 0.8029146194458008, + "eval_runtime": 1833.1444, + "eval_samples_per_second": 14.309, + "eval_steps_per_second": 1.789, + "step": 20000 + }, + { + "epoch": 0.3211929565482592, + "grad_norm": 1.2327808141708374, + "learning_rate": 4.689017397858951e-05, + "loss": 0.7538, + "step": 20010 + }, + { + "epoch": 0.32135347276842324, + "grad_norm": 0.5040415525436401, + "learning_rate": 4.688712807089756e-05, + "loss": 0.7756, + "step": 20020 + }, + { + "epoch": 0.3215139889885873, + "grad_norm": 0.8134372234344482, + "learning_rate": 4.688408077131274e-05, + "loss": 0.6812, + "step": 20030 + }, + { + "epoch": 0.32167450520875135, + "grad_norm": 0.5920359492301941, + "learning_rate": 4.6881032080028826e-05, + "loss": 0.8459, + "step": 20040 + }, + { + "epoch": 0.3218350214289154, + "grad_norm": 0.731289803981781, + "learning_rate": 4.68779819972397e-05, + "loss": 0.8506, + "step": 20050 + }, + { + "epoch": 0.32199553764907946, + "grad_norm": 0.5938640236854553, + "learning_rate": 4.687493052313933e-05, + "loss": 0.829, + "step": 20060 + }, + { + "epoch": 0.3221560538692435, + "grad_norm": 0.5469821691513062, + "learning_rate": 4.687187765792177e-05, + "loss": 0.8751, + "step": 20070 + }, + { + "epoch": 0.32231657008940756, + "grad_norm": 0.6215882301330566, + "learning_rate": 4.686882340178117e-05, + "loss": 0.7891, + "step": 20080 + }, + { + "epoch": 0.32247708630957156, + "grad_norm": 0.5203871726989746, + "learning_rate": 4.686576775491175e-05, + "loss": 0.7796, + "step": 20090 + }, + { + "epoch": 0.3226376025297356, + "grad_norm": 0.588094174861908, + "learning_rate": 4.686271071750785e-05, + "loss": 0.8297, + "step": 20100 + }, + { + "epoch": 0.32279811874989967, + "grad_norm": 0.6865624189376831, + "learning_rate": 4.685965228976387e-05, + "loss": 0.7169, + "step": 20110 + }, + { + "epoch": 0.3229586349700637, + "grad_norm": 0.5805290937423706, + "learning_rate": 4.6856592471874295e-05, + "loss": 0.7718, + "step": 20120 + }, + { + "epoch": 0.32311915119022777, + "grad_norm": 0.5227637887001038, + "learning_rate": 4.6853531264033735e-05, + "loss": 0.8575, + "step": 20130 + }, + { + "epoch": 0.3232796674103918, + "grad_norm": 0.6600632667541504, + "learning_rate": 4.685046866643685e-05, + "loss": 0.8856, + "step": 20140 + }, + { + "epoch": 0.3234401836305559, + "grad_norm": 0.8544142842292786, + "learning_rate": 4.68474046792784e-05, + "loss": 0.9212, + "step": 20150 + }, + { + "epoch": 0.32360069985071993, + "grad_norm": 0.5303751230239868, + "learning_rate": 4.684433930275326e-05, + "loss": 0.8409, + "step": 20160 + }, + { + "epoch": 0.323761216070884, + "grad_norm": 0.5132800340652466, + "learning_rate": 4.684127253705634e-05, + "loss": 0.7827, + "step": 20170 + }, + { + "epoch": 0.32392173229104804, + "grad_norm": 0.6366455554962158, + "learning_rate": 4.683820438238269e-05, + "loss": 0.7189, + "step": 20180 + }, + { + "epoch": 0.32408224851121203, + "grad_norm": 0.44401633739471436, + "learning_rate": 4.683513483892741e-05, + "loss": 0.8562, + "step": 20190 + }, + { + "epoch": 0.3242427647313761, + "grad_norm": 0.7061275243759155, + "learning_rate": 4.683206390688572e-05, + "loss": 0.8701, + "step": 20200 + }, + { + "epoch": 0.32440328095154014, + "grad_norm": 0.8356586694717407, + "learning_rate": 4.682899158645291e-05, + "loss": 0.7994, + "step": 20210 + }, + { + "epoch": 0.3245637971717042, + "grad_norm": 0.5950855016708374, + "learning_rate": 4.682591787782435e-05, + "loss": 0.7973, + "step": 20220 + }, + { + "epoch": 0.32472431339186825, + "grad_norm": 0.4964255392551422, + "learning_rate": 4.682284278119552e-05, + "loss": 0.8248, + "step": 20230 + }, + { + "epoch": 0.3248848296120323, + "grad_norm": 0.4775400757789612, + "learning_rate": 4.681976629676198e-05, + "loss": 0.7716, + "step": 20240 + }, + { + "epoch": 0.32504534583219635, + "grad_norm": 0.9203125238418579, + "learning_rate": 4.681668842471937e-05, + "loss": 0.6834, + "step": 20250 + }, + { + "epoch": 0.3252058620523604, + "grad_norm": 1.0049426555633545, + "learning_rate": 4.6813609165263426e-05, + "loss": 0.787, + "step": 20260 + }, + { + "epoch": 0.32536637827252446, + "grad_norm": 0.6308603286743164, + "learning_rate": 4.681052851858998e-05, + "loss": 0.7434, + "step": 20270 + }, + { + "epoch": 0.3255268944926885, + "grad_norm": 0.7726183533668518, + "learning_rate": 4.680744648489492e-05, + "loss": 0.8495, + "step": 20280 + }, + { + "epoch": 0.3256874107128525, + "grad_norm": 1.0201821327209473, + "learning_rate": 4.680436306437427e-05, + "loss": 0.768, + "step": 20290 + }, + { + "epoch": 0.32584792693301656, + "grad_norm": 0.7632128596305847, + "learning_rate": 4.680127825722411e-05, + "loss": 0.8191, + "step": 20300 + }, + { + "epoch": 0.3260084431531806, + "grad_norm": 0.5708560943603516, + "learning_rate": 4.6798192063640614e-05, + "loss": 0.7365, + "step": 20310 + }, + { + "epoch": 0.32616895937334467, + "grad_norm": 0.864834189414978, + "learning_rate": 4.6795104483820035e-05, + "loss": 0.729, + "step": 20320 + }, + { + "epoch": 0.3263294755935087, + "grad_norm": 0.8001208901405334, + "learning_rate": 4.6792015517958746e-05, + "loss": 0.8816, + "step": 20330 + }, + { + "epoch": 0.3264899918136728, + "grad_norm": 0.5884109735488892, + "learning_rate": 4.6788925166253164e-05, + "loss": 0.9167, + "step": 20340 + }, + { + "epoch": 0.32665050803383683, + "grad_norm": 0.7132514715194702, + "learning_rate": 4.678583342889984e-05, + "loss": 0.7637, + "step": 20350 + }, + { + "epoch": 0.3268110242540009, + "grad_norm": 1.073782205581665, + "learning_rate": 4.678274030609538e-05, + "loss": 0.8572, + "step": 20360 + }, + { + "epoch": 0.32697154047416493, + "grad_norm": 0.6243094205856323, + "learning_rate": 4.677964579803648e-05, + "loss": 0.9315, + "step": 20370 + }, + { + "epoch": 0.327132056694329, + "grad_norm": 0.4673779308795929, + "learning_rate": 4.677654990491995e-05, + "loss": 0.7639, + "step": 20380 + }, + { + "epoch": 0.327292572914493, + "grad_norm": 1.1014937162399292, + "learning_rate": 4.6773452626942655e-05, + "loss": 0.8476, + "step": 20390 + }, + { + "epoch": 0.32745308913465704, + "grad_norm": 0.7402098774909973, + "learning_rate": 4.677035396430157e-05, + "loss": 0.6677, + "step": 20400 + }, + { + "epoch": 0.3276136053548211, + "grad_norm": 1.5181245803833008, + "learning_rate": 4.676725391719376e-05, + "loss": 0.916, + "step": 20410 + }, + { + "epoch": 0.32777412157498514, + "grad_norm": 1.1885316371917725, + "learning_rate": 4.676415248581635e-05, + "loss": 0.7861, + "step": 20420 + }, + { + "epoch": 0.3279346377951492, + "grad_norm": 0.48845162987709045, + "learning_rate": 4.676104967036659e-05, + "loss": 0.699, + "step": 20430 + }, + { + "epoch": 0.32809515401531325, + "grad_norm": 0.6955060958862305, + "learning_rate": 4.6757945471041796e-05, + "loss": 0.8365, + "step": 20440 + }, + { + "epoch": 0.3282556702354773, + "grad_norm": 0.5299034714698792, + "learning_rate": 4.675483988803937e-05, + "loss": 0.8273, + "step": 20450 + }, + { + "epoch": 0.32841618645564136, + "grad_norm": 1.197553038597107, + "learning_rate": 4.6751732921556826e-05, + "loss": 0.7516, + "step": 20460 + }, + { + "epoch": 0.3285767026758054, + "grad_norm": 0.6526255011558533, + "learning_rate": 4.674862457179173e-05, + "loss": 0.8398, + "step": 20470 + }, + { + "epoch": 0.32873721889596946, + "grad_norm": 0.6649579405784607, + "learning_rate": 4.674551483894177e-05, + "loss": 0.72, + "step": 20480 + }, + { + "epoch": 0.32889773511613346, + "grad_norm": 0.8787596225738525, + "learning_rate": 4.6742403723204696e-05, + "loss": 0.8911, + "step": 20490 + }, + { + "epoch": 0.3290582513362975, + "grad_norm": 0.48834794759750366, + "learning_rate": 4.6739291224778357e-05, + "loss": 0.8773, + "step": 20500 + }, + { + "epoch": 0.32921876755646157, + "grad_norm": 0.8119209408760071, + "learning_rate": 4.67361773438607e-05, + "loss": 0.7065, + "step": 20510 + }, + { + "epoch": 0.3293792837766256, + "grad_norm": 0.6051014065742493, + "learning_rate": 4.673306208064974e-05, + "loss": 0.8006, + "step": 20520 + }, + { + "epoch": 0.3295397999967897, + "grad_norm": 0.7364853024482727, + "learning_rate": 4.672994543534358e-05, + "loss": 0.7606, + "step": 20530 + }, + { + "epoch": 0.3297003162169537, + "grad_norm": 0.9555888175964355, + "learning_rate": 4.672682740814044e-05, + "loss": 0.684, + "step": 20540 + }, + { + "epoch": 0.3298608324371178, + "grad_norm": 0.7596815824508667, + "learning_rate": 4.672370799923861e-05, + "loss": 0.7942, + "step": 20550 + }, + { + "epoch": 0.33002134865728183, + "grad_norm": 0.5103685259819031, + "learning_rate": 4.672058720883644e-05, + "loss": 0.7906, + "step": 20560 + }, + { + "epoch": 0.3301818648774459, + "grad_norm": 0.7453975677490234, + "learning_rate": 4.6717465037132426e-05, + "loss": 0.8463, + "step": 20570 + }, + { + "epoch": 0.33034238109760994, + "grad_norm": 0.8390095829963684, + "learning_rate": 4.671434148432509e-05, + "loss": 0.7963, + "step": 20580 + }, + { + "epoch": 0.33050289731777394, + "grad_norm": 0.6743981242179871, + "learning_rate": 4.671121655061309e-05, + "loss": 0.9429, + "step": 20590 + }, + { + "epoch": 0.330663413537938, + "grad_norm": 0.524705708026886, + "learning_rate": 4.670809023619516e-05, + "loss": 0.7698, + "step": 20600 + }, + { + "epoch": 0.33082392975810204, + "grad_norm": 0.9771848917007446, + "learning_rate": 4.670496254127009e-05, + "loss": 0.7978, + "step": 20610 + }, + { + "epoch": 0.3309844459782661, + "grad_norm": 0.8314015865325928, + "learning_rate": 4.670183346603681e-05, + "loss": 0.8833, + "step": 20620 + }, + { + "epoch": 0.33114496219843015, + "grad_norm": 0.6680324077606201, + "learning_rate": 4.669870301069429e-05, + "loss": 0.7881, + "step": 20630 + }, + { + "epoch": 0.3313054784185942, + "grad_norm": 0.4894575774669647, + "learning_rate": 4.669557117544162e-05, + "loss": 0.6946, + "step": 20640 + }, + { + "epoch": 0.33146599463875825, + "grad_norm": 0.5085126161575317, + "learning_rate": 4.669243796047796e-05, + "loss": 0.7631, + "step": 20650 + }, + { + "epoch": 0.3316265108589223, + "grad_norm": 0.5779781937599182, + "learning_rate": 4.6689303366002566e-05, + "loss": 0.8773, + "step": 20660 + }, + { + "epoch": 0.33178702707908636, + "grad_norm": 0.8042623996734619, + "learning_rate": 4.668616739221479e-05, + "loss": 0.6767, + "step": 20670 + }, + { + "epoch": 0.3319475432992504, + "grad_norm": 0.6267244219779968, + "learning_rate": 4.668303003931405e-05, + "loss": 0.7576, + "step": 20680 + }, + { + "epoch": 0.3321080595194144, + "grad_norm": 0.684073805809021, + "learning_rate": 4.667989130749986e-05, + "loss": 0.8094, + "step": 20690 + }, + { + "epoch": 0.33226857573957846, + "grad_norm": 0.7759580612182617, + "learning_rate": 4.667675119697183e-05, + "loss": 0.7744, + "step": 20700 + }, + { + "epoch": 0.3324290919597425, + "grad_norm": 0.8852571249008179, + "learning_rate": 4.6673609707929664e-05, + "loss": 0.8039, + "step": 20710 + }, + { + "epoch": 0.33258960817990657, + "grad_norm": 0.693628191947937, + "learning_rate": 4.667046684057312e-05, + "loss": 0.7047, + "step": 20720 + }, + { + "epoch": 0.3327501244000706, + "grad_norm": 0.9847446084022522, + "learning_rate": 4.666732259510209e-05, + "loss": 0.7733, + "step": 20730 + }, + { + "epoch": 0.3329106406202347, + "grad_norm": 0.5932134985923767, + "learning_rate": 4.666417697171651e-05, + "loss": 0.8793, + "step": 20740 + }, + { + "epoch": 0.33307115684039873, + "grad_norm": 1.4177590608596802, + "learning_rate": 4.6661029970616435e-05, + "loss": 0.7959, + "step": 20750 + }, + { + "epoch": 0.3332316730605628, + "grad_norm": 0.6112030744552612, + "learning_rate": 4.6657881592001987e-05, + "loss": 0.8554, + "step": 20760 + }, + { + "epoch": 0.33339218928072684, + "grad_norm": 0.8117067217826843, + "learning_rate": 4.665473183607338e-05, + "loss": 0.9213, + "step": 20770 + }, + { + "epoch": 0.3335527055008909, + "grad_norm": 0.6746890544891357, + "learning_rate": 4.665158070303095e-05, + "loss": 0.8497, + "step": 20780 + }, + { + "epoch": 0.3337132217210549, + "grad_norm": 0.48267242312431335, + "learning_rate": 4.664842819307506e-05, + "loss": 0.8129, + "step": 20790 + }, + { + "epoch": 0.33387373794121894, + "grad_norm": 0.7122848629951477, + "learning_rate": 4.664527430640619e-05, + "loss": 0.7531, + "step": 20800 + }, + { + "epoch": 0.334034254161383, + "grad_norm": 0.5495506525039673, + "learning_rate": 4.664211904322492e-05, + "loss": 0.7161, + "step": 20810 + }, + { + "epoch": 0.33419477038154705, + "grad_norm": 0.8692046403884888, + "learning_rate": 4.663896240373192e-05, + "loss": 0.8488, + "step": 20820 + }, + { + "epoch": 0.3343552866017111, + "grad_norm": 0.7632601857185364, + "learning_rate": 4.663580438812791e-05, + "loss": 0.8265, + "step": 20830 + }, + { + "epoch": 0.33451580282187515, + "grad_norm": 0.7879081964492798, + "learning_rate": 4.6632644996613726e-05, + "loss": 0.8466, + "step": 20840 + }, + { + "epoch": 0.3346763190420392, + "grad_norm": 0.9454790353775024, + "learning_rate": 4.66294842293903e-05, + "loss": 0.8888, + "step": 20850 + }, + { + "epoch": 0.33483683526220326, + "grad_norm": 0.9076864719390869, + "learning_rate": 4.6626322086658624e-05, + "loss": 0.6793, + "step": 20860 + }, + { + "epoch": 0.3349973514823673, + "grad_norm": 0.5791592001914978, + "learning_rate": 4.6623158568619793e-05, + "loss": 0.8405, + "step": 20870 + }, + { + "epoch": 0.33515786770253136, + "grad_norm": 0.5927159190177917, + "learning_rate": 4.6619993675475e-05, + "loss": 0.9141, + "step": 20880 + }, + { + "epoch": 0.33531838392269536, + "grad_norm": 0.7655045986175537, + "learning_rate": 4.66168274074255e-05, + "loss": 0.8888, + "step": 20890 + }, + { + "epoch": 0.3354789001428594, + "grad_norm": 0.31444117426872253, + "learning_rate": 4.661365976467266e-05, + "loss": 0.8249, + "step": 20900 + }, + { + "epoch": 0.33563941636302347, + "grad_norm": 0.8973138928413391, + "learning_rate": 4.661049074741791e-05, + "loss": 0.7499, + "step": 20910 + }, + { + "epoch": 0.3357999325831875, + "grad_norm": 0.5999227166175842, + "learning_rate": 4.660732035586279e-05, + "loss": 0.7134, + "step": 20920 + }, + { + "epoch": 0.3359604488033516, + "grad_norm": 1.1423169374465942, + "learning_rate": 4.660414859020893e-05, + "loss": 0.8325, + "step": 20930 + }, + { + "epoch": 0.3361209650235156, + "grad_norm": 0.7782297134399414, + "learning_rate": 4.660097545065801e-05, + "loss": 0.6777, + "step": 20940 + }, + { + "epoch": 0.3362814812436797, + "grad_norm": 1.091866374015808, + "learning_rate": 4.6597800937411854e-05, + "loss": 0.7727, + "step": 20950 + }, + { + "epoch": 0.33644199746384373, + "grad_norm": 0.6118764877319336, + "learning_rate": 4.65946250506723e-05, + "loss": 0.7831, + "step": 20960 + }, + { + "epoch": 0.3366025136840078, + "grad_norm": 0.6724144816398621, + "learning_rate": 4.6591447790641354e-05, + "loss": 0.7777, + "step": 20970 + }, + { + "epoch": 0.33676302990417184, + "grad_norm": 0.6121292114257812, + "learning_rate": 4.658826915752106e-05, + "loss": 0.8195, + "step": 20980 + }, + { + "epoch": 0.33692354612433584, + "grad_norm": 0.582399845123291, + "learning_rate": 4.658508915151355e-05, + "loss": 0.8068, + "step": 20990 + }, + { + "epoch": 0.3370840623444999, + "grad_norm": 0.712377667427063, + "learning_rate": 4.6581907772821065e-05, + "loss": 0.853, + "step": 21000 + }, + { + "epoch": 0.33724457856466394, + "grad_norm": 0.8814164400100708, + "learning_rate": 4.657872502164592e-05, + "loss": 0.8617, + "step": 21010 + }, + { + "epoch": 0.337405094784828, + "grad_norm": 0.44924628734588623, + "learning_rate": 4.6575540898190516e-05, + "loss": 0.7539, + "step": 21020 + }, + { + "epoch": 0.33756561100499205, + "grad_norm": 0.6449866890907288, + "learning_rate": 4.657235540265735e-05, + "loss": 0.8008, + "step": 21030 + }, + { + "epoch": 0.3377261272251561, + "grad_norm": 0.7958306074142456, + "learning_rate": 4.6569168535249e-05, + "loss": 0.7288, + "step": 21040 + }, + { + "epoch": 0.33788664344532016, + "grad_norm": 0.6748181581497192, + "learning_rate": 4.656598029616812e-05, + "loss": 0.8113, + "step": 21050 + }, + { + "epoch": 0.3380471596654842, + "grad_norm": 0.5492510795593262, + "learning_rate": 4.656279068561748e-05, + "loss": 0.7637, + "step": 21060 + }, + { + "epoch": 0.33820767588564826, + "grad_norm": 0.5792149901390076, + "learning_rate": 4.6559599703799904e-05, + "loss": 0.7861, + "step": 21070 + }, + { + "epoch": 0.3383681921058123, + "grad_norm": 0.5693486332893372, + "learning_rate": 4.6556407350918336e-05, + "loss": 0.8041, + "step": 21080 + }, + { + "epoch": 0.3385287083259763, + "grad_norm": 0.7105019688606262, + "learning_rate": 4.655321362717578e-05, + "loss": 0.7816, + "step": 21090 + }, + { + "epoch": 0.33868922454614037, + "grad_norm": 0.5358953475952148, + "learning_rate": 4.6550018532775344e-05, + "loss": 0.7495, + "step": 21100 + }, + { + "epoch": 0.3388497407663044, + "grad_norm": 0.6369436979293823, + "learning_rate": 4.654682206792022e-05, + "loss": 0.8722, + "step": 21110 + }, + { + "epoch": 0.3390102569864685, + "grad_norm": 0.861910879611969, + "learning_rate": 4.654362423281367e-05, + "loss": 0.8495, + "step": 21120 + }, + { + "epoch": 0.3391707732066325, + "grad_norm": 0.7397525310516357, + "learning_rate": 4.6540425027659076e-05, + "loss": 0.8156, + "step": 21130 + }, + { + "epoch": 0.3393312894267966, + "grad_norm": 1.5414807796478271, + "learning_rate": 4.6537224452659867e-05, + "loss": 0.7689, + "step": 21140 + }, + { + "epoch": 0.33949180564696063, + "grad_norm": 0.43827158212661743, + "learning_rate": 4.65340225080196e-05, + "loss": 0.9089, + "step": 21150 + }, + { + "epoch": 0.3396523218671247, + "grad_norm": 0.8097846508026123, + "learning_rate": 4.65308191939419e-05, + "loss": 0.7061, + "step": 21160 + }, + { + "epoch": 0.33981283808728874, + "grad_norm": 0.575946033000946, + "learning_rate": 4.652761451063047e-05, + "loss": 0.6663, + "step": 21170 + }, + { + "epoch": 0.3399733543074528, + "grad_norm": 0.6448619961738586, + "learning_rate": 4.65244084582891e-05, + "loss": 0.8779, + "step": 21180 + }, + { + "epoch": 0.3401338705276168, + "grad_norm": 1.190937876701355, + "learning_rate": 4.65212010371217e-05, + "loss": 0.7798, + "step": 21190 + }, + { + "epoch": 0.34029438674778084, + "grad_norm": 0.5569213032722473, + "learning_rate": 4.6517992247332234e-05, + "loss": 0.861, + "step": 21200 + }, + { + "epoch": 0.3404549029679449, + "grad_norm": 0.9274473190307617, + "learning_rate": 4.651478208912475e-05, + "loss": 0.7445, + "step": 21210 + }, + { + "epoch": 0.34061541918810895, + "grad_norm": 0.6113201975822449, + "learning_rate": 4.65115705627034e-05, + "loss": 0.8907, + "step": 21220 + }, + { + "epoch": 0.340775935408273, + "grad_norm": 2.4110774993896484, + "learning_rate": 4.650835766827243e-05, + "loss": 0.7443, + "step": 21230 + }, + { + "epoch": 0.34093645162843705, + "grad_norm": 0.9068794250488281, + "learning_rate": 4.6505143406036154e-05, + "loss": 0.8243, + "step": 21240 + }, + { + "epoch": 0.3410969678486011, + "grad_norm": 0.7678611874580383, + "learning_rate": 4.650192777619898e-05, + "loss": 0.7572, + "step": 21250 + }, + { + "epoch": 0.34125748406876516, + "grad_norm": 0.8370381593704224, + "learning_rate": 4.6498710778965396e-05, + "loss": 0.7213, + "step": 21260 + }, + { + "epoch": 0.3414180002889292, + "grad_norm": 0.6660412549972534, + "learning_rate": 4.649549241454e-05, + "loss": 0.8173, + "step": 21270 + }, + { + "epoch": 0.34157851650909327, + "grad_norm": 0.7190191745758057, + "learning_rate": 4.649227268312745e-05, + "loss": 0.855, + "step": 21280 + }, + { + "epoch": 0.34173903272925726, + "grad_norm": 0.6027255654335022, + "learning_rate": 4.6489051584932494e-05, + "loss": 0.7652, + "step": 21290 + }, + { + "epoch": 0.3418995489494213, + "grad_norm": 0.708335816860199, + "learning_rate": 4.648582912015999e-05, + "loss": 0.8245, + "step": 21300 + }, + { + "epoch": 0.34206006516958537, + "grad_norm": 0.5824891328811646, + "learning_rate": 4.648260528901487e-05, + "loss": 0.7424, + "step": 21310 + }, + { + "epoch": 0.3422205813897494, + "grad_norm": 0.9214373230934143, + "learning_rate": 4.647938009170214e-05, + "loss": 0.7311, + "step": 21320 + }, + { + "epoch": 0.3423810976099135, + "grad_norm": 0.6121675372123718, + "learning_rate": 4.647615352842691e-05, + "loss": 0.8041, + "step": 21330 + }, + { + "epoch": 0.34254161383007753, + "grad_norm": 0.5577830076217651, + "learning_rate": 4.647292559939437e-05, + "loss": 0.7175, + "step": 21340 + }, + { + "epoch": 0.3427021300502416, + "grad_norm": 0.636573314666748, + "learning_rate": 4.646969630480978e-05, + "loss": 0.871, + "step": 21350 + }, + { + "epoch": 0.34286264627040564, + "grad_norm": 0.9957367181777954, + "learning_rate": 4.646646564487853e-05, + "loss": 0.8048, + "step": 21360 + }, + { + "epoch": 0.3430231624905697, + "grad_norm": 0.9663298726081848, + "learning_rate": 4.646323361980607e-05, + "loss": 0.8336, + "step": 21370 + }, + { + "epoch": 0.34318367871073374, + "grad_norm": 0.7004472017288208, + "learning_rate": 4.646000022979791e-05, + "loss": 0.7829, + "step": 21380 + }, + { + "epoch": 0.3433441949308978, + "grad_norm": 0.8016113638877869, + "learning_rate": 4.6456765475059706e-05, + "loss": 0.834, + "step": 21390 + }, + { + "epoch": 0.3435047111510618, + "grad_norm": 0.9582916498184204, + "learning_rate": 4.645352935579715e-05, + "loss": 0.8107, + "step": 21400 + }, + { + "epoch": 0.34366522737122585, + "grad_norm": 1.017874002456665, + "learning_rate": 4.645029187221604e-05, + "loss": 0.7366, + "step": 21410 + }, + { + "epoch": 0.3438257435913899, + "grad_norm": 0.8042618036270142, + "learning_rate": 4.644705302452228e-05, + "loss": 0.76, + "step": 21420 + }, + { + "epoch": 0.34398625981155395, + "grad_norm": 0.5278865098953247, + "learning_rate": 4.644381281292183e-05, + "loss": 0.783, + "step": 21430 + }, + { + "epoch": 0.344146776031718, + "grad_norm": 0.9297468066215515, + "learning_rate": 4.644057123762073e-05, + "loss": 0.7063, + "step": 21440 + }, + { + "epoch": 0.34430729225188206, + "grad_norm": 1.1126030683517456, + "learning_rate": 4.643732829882515e-05, + "loss": 0.7605, + "step": 21450 + }, + { + "epoch": 0.3444678084720461, + "grad_norm": 0.6513435244560242, + "learning_rate": 4.643408399674133e-05, + "loss": 0.8165, + "step": 21460 + }, + { + "epoch": 0.34462832469221016, + "grad_norm": 1.0915026664733887, + "learning_rate": 4.643083833157556e-05, + "loss": 0.8249, + "step": 21470 + }, + { + "epoch": 0.3447888409123742, + "grad_norm": 0.5816873908042908, + "learning_rate": 4.6427591303534254e-05, + "loss": 0.6603, + "step": 21480 + }, + { + "epoch": 0.34494935713253827, + "grad_norm": 1.061826467514038, + "learning_rate": 4.642434291282392e-05, + "loss": 0.8453, + "step": 21490 + }, + { + "epoch": 0.34510987335270227, + "grad_norm": 0.962732195854187, + "learning_rate": 4.642109315965112e-05, + "loss": 0.81, + "step": 21500 + }, + { + "epoch": 0.3452703895728663, + "grad_norm": 0.5544922351837158, + "learning_rate": 4.641784204422252e-05, + "loss": 0.8622, + "step": 21510 + }, + { + "epoch": 0.3454309057930304, + "grad_norm": 0.577987015247345, + "learning_rate": 4.6414589566744884e-05, + "loss": 0.7742, + "step": 21520 + }, + { + "epoch": 0.3455914220131944, + "grad_norm": 0.5307692885398865, + "learning_rate": 4.6411335727425034e-05, + "loss": 0.7933, + "step": 21530 + }, + { + "epoch": 0.3457519382333585, + "grad_norm": 0.4474155008792877, + "learning_rate": 4.640808052646991e-05, + "loss": 0.8969, + "step": 21540 + }, + { + "epoch": 0.34591245445352253, + "grad_norm": 0.5672807097434998, + "learning_rate": 4.6404823964086513e-05, + "loss": 0.6848, + "step": 21550 + }, + { + "epoch": 0.3460729706736866, + "grad_norm": 0.716658890247345, + "learning_rate": 4.640156604048195e-05, + "loss": 0.8702, + "step": 21560 + }, + { + "epoch": 0.34623348689385064, + "grad_norm": 0.6536041498184204, + "learning_rate": 4.63983067558634e-05, + "loss": 0.7447, + "step": 21570 + }, + { + "epoch": 0.3463940031140147, + "grad_norm": 0.9859740734100342, + "learning_rate": 4.6395046110438126e-05, + "loss": 0.7198, + "step": 21580 + }, + { + "epoch": 0.34655451933417875, + "grad_norm": 0.7735467553138733, + "learning_rate": 4.63917841044135e-05, + "loss": 0.6976, + "step": 21590 + }, + { + "epoch": 0.34671503555434274, + "grad_norm": 0.6089875102043152, + "learning_rate": 4.638852073799697e-05, + "loss": 0.6984, + "step": 21600 + }, + { + "epoch": 0.3468755517745068, + "grad_norm": 1.0000568628311157, + "learning_rate": 4.638525601139605e-05, + "loss": 0.8431, + "step": 21610 + }, + { + "epoch": 0.34703606799467085, + "grad_norm": 0.7513206005096436, + "learning_rate": 4.6381989924818376e-05, + "loss": 0.7761, + "step": 21620 + }, + { + "epoch": 0.3471965842148349, + "grad_norm": 0.7533838152885437, + "learning_rate": 4.6378722478471636e-05, + "loss": 0.7737, + "step": 21630 + }, + { + "epoch": 0.34735710043499896, + "grad_norm": 0.637352705001831, + "learning_rate": 4.637545367256363e-05, + "loss": 0.763, + "step": 21640 + }, + { + "epoch": 0.347517616655163, + "grad_norm": 0.6224169731140137, + "learning_rate": 4.6372183507302225e-05, + "loss": 0.8386, + "step": 21650 + }, + { + "epoch": 0.34767813287532706, + "grad_norm": 0.7498801350593567, + "learning_rate": 4.63689119828954e-05, + "loss": 0.82, + "step": 21660 + }, + { + "epoch": 0.3478386490954911, + "grad_norm": 0.5264923572540283, + "learning_rate": 4.636563909955118e-05, + "loss": 0.7851, + "step": 21670 + }, + { + "epoch": 0.34799916531565517, + "grad_norm": 0.6816280484199524, + "learning_rate": 4.6362364857477734e-05, + "loss": 0.8977, + "step": 21680 + }, + { + "epoch": 0.3481596815358192, + "grad_norm": 0.6122158765792847, + "learning_rate": 4.6359089256883264e-05, + "loss": 0.7972, + "step": 21690 + }, + { + "epoch": 0.3483201977559832, + "grad_norm": 0.8479586839675903, + "learning_rate": 4.6355812297976074e-05, + "loss": 0.8879, + "step": 21700 + }, + { + "epoch": 0.34848071397614727, + "grad_norm": 0.862382709980011, + "learning_rate": 4.6352533980964575e-05, + "loss": 0.8513, + "step": 21710 + }, + { + "epoch": 0.3486412301963113, + "grad_norm": 0.5838552117347717, + "learning_rate": 4.6349254306057244e-05, + "loss": 0.7832, + "step": 21720 + }, + { + "epoch": 0.3488017464164754, + "grad_norm": 0.6843566298484802, + "learning_rate": 4.634597327346264e-05, + "loss": 0.7713, + "step": 21730 + }, + { + "epoch": 0.34896226263663943, + "grad_norm": 0.6465508341789246, + "learning_rate": 4.634269088338943e-05, + "loss": 0.8142, + "step": 21740 + }, + { + "epoch": 0.3491227788568035, + "grad_norm": 0.6180947422981262, + "learning_rate": 4.6339407136046344e-05, + "loss": 0.789, + "step": 21750 + }, + { + "epoch": 0.34928329507696754, + "grad_norm": 0.4731852412223816, + "learning_rate": 4.6336122031642215e-05, + "loss": 0.931, + "step": 21760 + }, + { + "epoch": 0.3494438112971316, + "grad_norm": 0.7998809218406677, + "learning_rate": 4.633283557038595e-05, + "loss": 0.7544, + "step": 21770 + }, + { + "epoch": 0.34960432751729564, + "grad_norm": 0.5640689730644226, + "learning_rate": 4.632954775248656e-05, + "loss": 0.7302, + "step": 21780 + }, + { + "epoch": 0.3497648437374597, + "grad_norm": 0.742168664932251, + "learning_rate": 4.632625857815313e-05, + "loss": 0.7655, + "step": 21790 + }, + { + "epoch": 0.3499253599576237, + "grad_norm": 1.0289870500564575, + "learning_rate": 4.632296804759482e-05, + "loss": 0.7232, + "step": 21800 + }, + { + "epoch": 0.35008587617778775, + "grad_norm": 0.6740556359291077, + "learning_rate": 4.6319676161020886e-05, + "loss": 0.833, + "step": 21810 + }, + { + "epoch": 0.3502463923979518, + "grad_norm": 0.4919874370098114, + "learning_rate": 4.631638291864069e-05, + "loss": 0.8369, + "step": 21820 + }, + { + "epoch": 0.35040690861811585, + "grad_norm": 1.0558432340621948, + "learning_rate": 4.6313088320663664e-05, + "loss": 0.7461, + "step": 21830 + }, + { + "epoch": 0.3505674248382799, + "grad_norm": 0.8790060877799988, + "learning_rate": 4.63097923672993e-05, + "loss": 0.76, + "step": 21840 + }, + { + "epoch": 0.35072794105844396, + "grad_norm": 0.6708624362945557, + "learning_rate": 4.6306495058757226e-05, + "loss": 0.862, + "step": 21850 + }, + { + "epoch": 0.350888457278608, + "grad_norm": 0.9446417093276978, + "learning_rate": 4.6303196395247125e-05, + "loss": 0.7591, + "step": 21860 + }, + { + "epoch": 0.35104897349877207, + "grad_norm": 0.708417534828186, + "learning_rate": 4.629989637697877e-05, + "loss": 0.8126, + "step": 21870 + }, + { + "epoch": 0.3512094897189361, + "grad_norm": 0.9033117890357971, + "learning_rate": 4.629659500416202e-05, + "loss": 0.8035, + "step": 21880 + }, + { + "epoch": 0.35137000593910017, + "grad_norm": 0.5593352317810059, + "learning_rate": 4.629329227700683e-05, + "loss": 0.93, + "step": 21890 + }, + { + "epoch": 0.35153052215926417, + "grad_norm": 0.7034105062484741, + "learning_rate": 4.6289988195723225e-05, + "loss": 0.7386, + "step": 21900 + }, + { + "epoch": 0.3516910383794282, + "grad_norm": 0.6641986966133118, + "learning_rate": 4.628668276052133e-05, + "loss": 0.8175, + "step": 21910 + }, + { + "epoch": 0.3518515545995923, + "grad_norm": 0.920795738697052, + "learning_rate": 4.628337597161135e-05, + "loss": 0.8251, + "step": 21920 + }, + { + "epoch": 0.35201207081975633, + "grad_norm": 0.7028707265853882, + "learning_rate": 4.628006782920359e-05, + "loss": 0.7479, + "step": 21930 + }, + { + "epoch": 0.3521725870399204, + "grad_norm": 0.5255286693572998, + "learning_rate": 4.627675833350841e-05, + "loss": 0.853, + "step": 21940 + }, + { + "epoch": 0.35233310326008443, + "grad_norm": 0.6818188428878784, + "learning_rate": 4.627344748473628e-05, + "loss": 0.8857, + "step": 21950 + }, + { + "epoch": 0.3524936194802485, + "grad_norm": 0.8017117381095886, + "learning_rate": 4.627013528309775e-05, + "loss": 0.684, + "step": 21960 + }, + { + "epoch": 0.35265413570041254, + "grad_norm": 0.6457096934318542, + "learning_rate": 4.6266821728803465e-05, + "loss": 0.8218, + "step": 21970 + }, + { + "epoch": 0.3528146519205766, + "grad_norm": 0.8979635834693909, + "learning_rate": 4.626350682206414e-05, + "loss": 0.6068, + "step": 21980 + }, + { + "epoch": 0.35297516814074065, + "grad_norm": 0.434284508228302, + "learning_rate": 4.6260190563090584e-05, + "loss": 0.8894, + "step": 21990 + }, + { + "epoch": 0.35313568436090464, + "grad_norm": 0.5166978240013123, + "learning_rate": 4.6256872952093694e-05, + "loss": 0.9484, + "step": 22000 + }, + { + "epoch": 0.3532962005810687, + "grad_norm": 0.5511228442192078, + "learning_rate": 4.6253553989284445e-05, + "loss": 0.8254, + "step": 22010 + }, + { + "epoch": 0.35345671680123275, + "grad_norm": 0.8663944602012634, + "learning_rate": 4.625023367487392e-05, + "loss": 0.8336, + "step": 22020 + }, + { + "epoch": 0.3536172330213968, + "grad_norm": 0.8029462695121765, + "learning_rate": 4.624691200907325e-05, + "loss": 0.8307, + "step": 22030 + }, + { + "epoch": 0.35377774924156086, + "grad_norm": 1.109559416770935, + "learning_rate": 4.624358899209368e-05, + "loss": 0.7364, + "step": 22040 + }, + { + "epoch": 0.3539382654617249, + "grad_norm": 0.6176952719688416, + "learning_rate": 4.624026462414655e-05, + "loss": 0.7819, + "step": 22050 + }, + { + "epoch": 0.35409878168188896, + "grad_norm": 0.7159645557403564, + "learning_rate": 4.623693890544324e-05, + "loss": 0.6737, + "step": 22060 + }, + { + "epoch": 0.354259297902053, + "grad_norm": 0.7181645631790161, + "learning_rate": 4.623361183619528e-05, + "loss": 0.8194, + "step": 22070 + }, + { + "epoch": 0.35441981412221707, + "grad_norm": 0.6925590634346008, + "learning_rate": 4.623028341661423e-05, + "loss": 0.7478, + "step": 22080 + }, + { + "epoch": 0.3545803303423811, + "grad_norm": 0.7585506439208984, + "learning_rate": 4.6226953646911765e-05, + "loss": 0.8595, + "step": 22090 + }, + { + "epoch": 0.3547408465625451, + "grad_norm": 0.8090323209762573, + "learning_rate": 4.622362252729963e-05, + "loss": 0.7871, + "step": 22100 + }, + { + "epoch": 0.3549013627827092, + "grad_norm": 0.5448397994041443, + "learning_rate": 4.622029005798968e-05, + "loss": 0.837, + "step": 22110 + }, + { + "epoch": 0.3550618790028732, + "grad_norm": 0.5719908475875854, + "learning_rate": 4.621695623919383e-05, + "loss": 0.8393, + "step": 22120 + }, + { + "epoch": 0.3552223952230373, + "grad_norm": 0.751198410987854, + "learning_rate": 4.6213621071124094e-05, + "loss": 0.8132, + "step": 22130 + }, + { + "epoch": 0.35538291144320133, + "grad_norm": 0.6494233012199402, + "learning_rate": 4.6210284553992565e-05, + "loss": 0.7917, + "step": 22140 + }, + { + "epoch": 0.3555434276633654, + "grad_norm": 0.8040640354156494, + "learning_rate": 4.620694668801144e-05, + "loss": 0.7288, + "step": 22150 + }, + { + "epoch": 0.35570394388352944, + "grad_norm": 0.815883457660675, + "learning_rate": 4.620360747339297e-05, + "loss": 0.7302, + "step": 22160 + }, + { + "epoch": 0.3558644601036935, + "grad_norm": 0.7437810301780701, + "learning_rate": 4.620026691034952e-05, + "loss": 0.8147, + "step": 22170 + }, + { + "epoch": 0.35602497632385754, + "grad_norm": 0.5854519605636597, + "learning_rate": 4.6196924999093526e-05, + "loss": 0.8891, + "step": 22180 + }, + { + "epoch": 0.3561854925440216, + "grad_norm": 0.7826079726219177, + "learning_rate": 4.619358173983752e-05, + "loss": 0.9335, + "step": 22190 + }, + { + "epoch": 0.3563460087641856, + "grad_norm": 0.6797077059745789, + "learning_rate": 4.6190237132794103e-05, + "loss": 0.7613, + "step": 22200 + }, + { + "epoch": 0.35650652498434965, + "grad_norm": 0.7483430504798889, + "learning_rate": 4.618689117817598e-05, + "loss": 0.8212, + "step": 22210 + }, + { + "epoch": 0.3566670412045137, + "grad_norm": 1.4763262271881104, + "learning_rate": 4.6183543876195946e-05, + "loss": 0.7812, + "step": 22220 + }, + { + "epoch": 0.35682755742467775, + "grad_norm": 1.4058552980422974, + "learning_rate": 4.6180195227066834e-05, + "loss": 0.8093, + "step": 22230 + }, + { + "epoch": 0.3569880736448418, + "grad_norm": 1.1750872135162354, + "learning_rate": 4.617684523100164e-05, + "loss": 0.7615, + "step": 22240 + }, + { + "epoch": 0.35714858986500586, + "grad_norm": 1.3582264184951782, + "learning_rate": 4.6173493888213374e-05, + "loss": 0.8372, + "step": 22250 + }, + { + "epoch": 0.3573091060851699, + "grad_norm": 0.8028630614280701, + "learning_rate": 4.617014119891517e-05, + "loss": 0.7873, + "step": 22260 + }, + { + "epoch": 0.35746962230533397, + "grad_norm": 0.6224728226661682, + "learning_rate": 4.616678716332025e-05, + "loss": 0.8265, + "step": 22270 + }, + { + "epoch": 0.357630138525498, + "grad_norm": 0.5409374237060547, + "learning_rate": 4.61634317816419e-05, + "loss": 0.694, + "step": 22280 + }, + { + "epoch": 0.3577906547456621, + "grad_norm": 0.4836641550064087, + "learning_rate": 4.61600750540935e-05, + "loss": 0.7788, + "step": 22290 + }, + { + "epoch": 0.35795117096582607, + "grad_norm": 0.5411898493766785, + "learning_rate": 4.615671698088853e-05, + "loss": 0.7265, + "step": 22300 + }, + { + "epoch": 0.3581116871859901, + "grad_norm": 0.5355805158615112, + "learning_rate": 4.6153357562240535e-05, + "loss": 0.7468, + "step": 22310 + }, + { + "epoch": 0.3582722034061542, + "grad_norm": 1.5743606090545654, + "learning_rate": 4.614999679836315e-05, + "loss": 0.7816, + "step": 22320 + }, + { + "epoch": 0.35843271962631823, + "grad_norm": 0.5029131770133972, + "learning_rate": 4.614663468947012e-05, + "loss": 0.9576, + "step": 22330 + }, + { + "epoch": 0.3585932358464823, + "grad_norm": 0.9086703658103943, + "learning_rate": 4.614327123577523e-05, + "loss": 0.7673, + "step": 22340 + }, + { + "epoch": 0.35875375206664634, + "grad_norm": 0.7776360511779785, + "learning_rate": 4.61399064374924e-05, + "loss": 0.8034, + "step": 22350 + }, + { + "epoch": 0.3589142682868104, + "grad_norm": 1.046457290649414, + "learning_rate": 4.613654029483559e-05, + "loss": 0.9025, + "step": 22360 + }, + { + "epoch": 0.35907478450697444, + "grad_norm": 0.8527055978775024, + "learning_rate": 4.613317280801888e-05, + "loss": 0.837, + "step": 22370 + }, + { + "epoch": 0.3592353007271385, + "grad_norm": 0.6108635067939758, + "learning_rate": 4.612980397725641e-05, + "loss": 0.7369, + "step": 22380 + }, + { + "epoch": 0.35939581694730255, + "grad_norm": 0.8282217383384705, + "learning_rate": 4.6126433802762434e-05, + "loss": 0.7591, + "step": 22390 + }, + { + "epoch": 0.35955633316746655, + "grad_norm": 0.4301918148994446, + "learning_rate": 4.6123062284751276e-05, + "loss": 0.7004, + "step": 22400 + }, + { + "epoch": 0.3597168493876306, + "grad_norm": 0.497417688369751, + "learning_rate": 4.611968942343732e-05, + "loss": 0.7982, + "step": 22410 + }, + { + "epoch": 0.35987736560779465, + "grad_norm": 0.6199057698249817, + "learning_rate": 4.611631521903509e-05, + "loss": 0.8375, + "step": 22420 + }, + { + "epoch": 0.3600378818279587, + "grad_norm": 0.7507228255271912, + "learning_rate": 4.611293967175914e-05, + "loss": 0.7639, + "step": 22430 + }, + { + "epoch": 0.36019839804812276, + "grad_norm": 0.7843051552772522, + "learning_rate": 4.6109562781824156e-05, + "loss": 0.8096, + "step": 22440 + }, + { + "epoch": 0.3603589142682868, + "grad_norm": 1.566915512084961, + "learning_rate": 4.6106184549444874e-05, + "loss": 0.7368, + "step": 22450 + }, + { + "epoch": 0.36051943048845086, + "grad_norm": 0.6397568583488464, + "learning_rate": 4.6102804974836144e-05, + "loss": 0.7659, + "step": 22460 + }, + { + "epoch": 0.3606799467086149, + "grad_norm": 0.7581170201301575, + "learning_rate": 4.609942405821287e-05, + "loss": 0.7536, + "step": 22470 + }, + { + "epoch": 0.36084046292877897, + "grad_norm": 0.604504406452179, + "learning_rate": 4.609604179979007e-05, + "loss": 0.9362, + "step": 22480 + }, + { + "epoch": 0.361000979148943, + "grad_norm": 0.9673589468002319, + "learning_rate": 4.6092658199782836e-05, + "loss": 0.7423, + "step": 22490 + }, + { + "epoch": 0.361161495369107, + "grad_norm": 0.869315505027771, + "learning_rate": 4.608927325840634e-05, + "loss": 0.7961, + "step": 22500 + }, + { + "epoch": 0.3613220115892711, + "grad_norm": 0.757993757724762, + "learning_rate": 4.608588697587585e-05, + "loss": 0.8031, + "step": 22510 + }, + { + "epoch": 0.36148252780943513, + "grad_norm": 0.9239144325256348, + "learning_rate": 4.60824993524067e-05, + "loss": 0.8866, + "step": 22520 + }, + { + "epoch": 0.3616430440295992, + "grad_norm": 0.5219390392303467, + "learning_rate": 4.607911038821434e-05, + "loss": 0.7321, + "step": 22530 + }, + { + "epoch": 0.36180356024976323, + "grad_norm": 0.6632308959960938, + "learning_rate": 4.6075720083514275e-05, + "loss": 0.8691, + "step": 22540 + }, + { + "epoch": 0.3619640764699273, + "grad_norm": 0.825103223323822, + "learning_rate": 4.6072328438522116e-05, + "loss": 0.8504, + "step": 22550 + }, + { + "epoch": 0.36212459269009134, + "grad_norm": 1.2998589277267456, + "learning_rate": 4.6068935453453557e-05, + "loss": 0.8594, + "step": 22560 + }, + { + "epoch": 0.3622851089102554, + "grad_norm": 1.0322076082229614, + "learning_rate": 4.606554112852436e-05, + "loss": 0.7275, + "step": 22570 + }, + { + "epoch": 0.36244562513041945, + "grad_norm": 0.6764788627624512, + "learning_rate": 4.606214546395039e-05, + "loss": 0.8077, + "step": 22580 + }, + { + "epoch": 0.3626061413505835, + "grad_norm": 0.9324022531509399, + "learning_rate": 4.6058748459947596e-05, + "loss": 0.7651, + "step": 22590 + }, + { + "epoch": 0.3627666575707475, + "grad_norm": 0.6156123876571655, + "learning_rate": 4.605535011673199e-05, + "loss": 0.7914, + "step": 22600 + }, + { + "epoch": 0.36292717379091155, + "grad_norm": 0.9105247259140015, + "learning_rate": 4.605195043451971e-05, + "loss": 0.7147, + "step": 22610 + }, + { + "epoch": 0.3630876900110756, + "grad_norm": 0.5956509709358215, + "learning_rate": 4.6048549413526945e-05, + "loss": 0.7259, + "step": 22620 + }, + { + "epoch": 0.36324820623123966, + "grad_norm": 1.051916480064392, + "learning_rate": 4.604514705396997e-05, + "loss": 0.8577, + "step": 22630 + }, + { + "epoch": 0.3634087224514037, + "grad_norm": 0.5614705681800842, + "learning_rate": 4.604174335606517e-05, + "loss": 0.8491, + "step": 22640 + }, + { + "epoch": 0.36356923867156776, + "grad_norm": 0.7012106776237488, + "learning_rate": 4.6038338320029e-05, + "loss": 0.7844, + "step": 22650 + }, + { + "epoch": 0.3637297548917318, + "grad_norm": 0.744378924369812, + "learning_rate": 4.6034931946077994e-05, + "loss": 0.8689, + "step": 22660 + }, + { + "epoch": 0.36389027111189587, + "grad_norm": 0.7036871314048767, + "learning_rate": 4.6031524234428776e-05, + "loss": 0.8659, + "step": 22670 + }, + { + "epoch": 0.3640507873320599, + "grad_norm": 0.8747723698616028, + "learning_rate": 4.602811518529806e-05, + "loss": 0.848, + "step": 22680 + }, + { + "epoch": 0.364211303552224, + "grad_norm": 0.6074729561805725, + "learning_rate": 4.602470479890264e-05, + "loss": 0.7572, + "step": 22690 + }, + { + "epoch": 0.364371819772388, + "grad_norm": 0.6035022139549255, + "learning_rate": 4.6021293075459394e-05, + "loss": 0.8131, + "step": 22700 + }, + { + "epoch": 0.364532335992552, + "grad_norm": 0.6070325374603271, + "learning_rate": 4.601788001518529e-05, + "loss": 0.848, + "step": 22710 + }, + { + "epoch": 0.3646928522127161, + "grad_norm": 0.5724703669548035, + "learning_rate": 4.601446561829739e-05, + "loss": 0.7451, + "step": 22720 + }, + { + "epoch": 0.36485336843288013, + "grad_norm": 0.6547208428382874, + "learning_rate": 4.601104988501281e-05, + "loss": 0.9496, + "step": 22730 + }, + { + "epoch": 0.3650138846530442, + "grad_norm": 0.6288239359855652, + "learning_rate": 4.6007632815548784e-05, + "loss": 0.7365, + "step": 22740 + }, + { + "epoch": 0.36517440087320824, + "grad_norm": 0.5909592509269714, + "learning_rate": 4.600421441012261e-05, + "loss": 0.7475, + "step": 22750 + }, + { + "epoch": 0.3653349170933723, + "grad_norm": 0.5226154327392578, + "learning_rate": 4.600079466895169e-05, + "loss": 0.7738, + "step": 22760 + }, + { + "epoch": 0.36549543331353634, + "grad_norm": 0.49801647663116455, + "learning_rate": 4.599737359225348e-05, + "loss": 0.8365, + "step": 22770 + }, + { + "epoch": 0.3656559495337004, + "grad_norm": 0.9397007822990417, + "learning_rate": 4.5993951180245555e-05, + "loss": 0.6659, + "step": 22780 + }, + { + "epoch": 0.36581646575386445, + "grad_norm": 0.5491136312484741, + "learning_rate": 4.599052743314556e-05, + "loss": 0.8115, + "step": 22790 + }, + { + "epoch": 0.36597698197402845, + "grad_norm": 0.9416855573654175, + "learning_rate": 4.598710235117122e-05, + "loss": 0.7425, + "step": 22800 + }, + { + "epoch": 0.3661374981941925, + "grad_norm": 0.7795324325561523, + "learning_rate": 4.598367593454035e-05, + "loss": 0.6967, + "step": 22810 + }, + { + "epoch": 0.36629801441435655, + "grad_norm": 0.6830748915672302, + "learning_rate": 4.5980248183470855e-05, + "loss": 0.7433, + "step": 22820 + }, + { + "epoch": 0.3664585306345206, + "grad_norm": 0.6831149458885193, + "learning_rate": 4.597681909818071e-05, + "loss": 0.8206, + "step": 22830 + }, + { + "epoch": 0.36661904685468466, + "grad_norm": 0.6852809190750122, + "learning_rate": 4.5973388678888006e-05, + "loss": 0.8042, + "step": 22840 + }, + { + "epoch": 0.3667795630748487, + "grad_norm": 0.6128742098808289, + "learning_rate": 4.5969956925810875e-05, + "loss": 0.896, + "step": 22850 + }, + { + "epoch": 0.36694007929501277, + "grad_norm": 0.5458340644836426, + "learning_rate": 4.5966523839167566e-05, + "loss": 0.8764, + "step": 22860 + }, + { + "epoch": 0.3671005955151768, + "grad_norm": 0.7620818018913269, + "learning_rate": 4.5963089419176396e-05, + "loss": 0.7963, + "step": 22870 + }, + { + "epoch": 0.3672611117353409, + "grad_norm": 0.5906163454055786, + "learning_rate": 4.595965366605579e-05, + "loss": 0.6822, + "step": 22880 + }, + { + "epoch": 0.3674216279555049, + "grad_norm": 0.705746591091156, + "learning_rate": 4.5956216580024224e-05, + "loss": 0.9274, + "step": 22890 + }, + { + "epoch": 0.367582144175669, + "grad_norm": 0.5981976985931396, + "learning_rate": 4.5952778161300286e-05, + "loss": 0.733, + "step": 22900 + }, + { + "epoch": 0.367742660395833, + "grad_norm": 0.9638124108314514, + "learning_rate": 4.594933841010264e-05, + "loss": 0.7133, + "step": 22910 + }, + { + "epoch": 0.36790317661599703, + "grad_norm": 0.6463102698326111, + "learning_rate": 4.5945897326650025e-05, + "loss": 1.0117, + "step": 22920 + }, + { + "epoch": 0.3680636928361611, + "grad_norm": 0.6187431812286377, + "learning_rate": 4.594245491116129e-05, + "loss": 0.818, + "step": 22930 + }, + { + "epoch": 0.36822420905632514, + "grad_norm": 0.5668833255767822, + "learning_rate": 4.593901116385533e-05, + "loss": 0.7952, + "step": 22940 + }, + { + "epoch": 0.3683847252764892, + "grad_norm": 0.594817578792572, + "learning_rate": 4.593556608495117e-05, + "loss": 0.795, + "step": 22950 + }, + { + "epoch": 0.36854524149665324, + "grad_norm": 0.510923445224762, + "learning_rate": 4.593211967466788e-05, + "loss": 0.815, + "step": 22960 + }, + { + "epoch": 0.3687057577168173, + "grad_norm": 3.5334794521331787, + "learning_rate": 4.592867193322464e-05, + "loss": 0.7831, + "step": 22970 + }, + { + "epoch": 0.36886627393698135, + "grad_norm": 0.5729717016220093, + "learning_rate": 4.592522286084071e-05, + "loss": 0.7338, + "step": 22980 + }, + { + "epoch": 0.3690267901571454, + "grad_norm": 0.6657503843307495, + "learning_rate": 4.5921772457735414e-05, + "loss": 0.6599, + "step": 22990 + }, + { + "epoch": 0.36918730637730945, + "grad_norm": 0.5805906653404236, + "learning_rate": 4.5918320724128194e-05, + "loss": 0.6931, + "step": 23000 + }, + { + "epoch": 0.36934782259747345, + "grad_norm": 0.7658077478408813, + "learning_rate": 4.591486766023855e-05, + "loss": 0.7051, + "step": 23010 + }, + { + "epoch": 0.3695083388176375, + "grad_norm": 0.8324223756790161, + "learning_rate": 4.591141326628608e-05, + "loss": 0.7752, + "step": 23020 + }, + { + "epoch": 0.36966885503780156, + "grad_norm": 0.565764307975769, + "learning_rate": 4.5907957542490475e-05, + "loss": 0.7959, + "step": 23030 + }, + { + "epoch": 0.3698293712579656, + "grad_norm": 0.8167107105255127, + "learning_rate": 4.590450048907148e-05, + "loss": 0.7136, + "step": 23040 + }, + { + "epoch": 0.36998988747812966, + "grad_norm": 0.6801853179931641, + "learning_rate": 4.590104210624895e-05, + "loss": 0.6574, + "step": 23050 + }, + { + "epoch": 0.3701504036982937, + "grad_norm": 0.8040290474891663, + "learning_rate": 4.5897582394242815e-05, + "loss": 0.815, + "step": 23060 + }, + { + "epoch": 0.37031091991845777, + "grad_norm": 0.7362086772918701, + "learning_rate": 4.58941213532731e-05, + "loss": 0.8321, + "step": 23070 + }, + { + "epoch": 0.3704714361386218, + "grad_norm": 0.700812816619873, + "learning_rate": 4.589065898355991e-05, + "loss": 0.8919, + "step": 23080 + }, + { + "epoch": 0.3706319523587859, + "grad_norm": 0.9948484897613525, + "learning_rate": 4.588719528532342e-05, + "loss": 0.9166, + "step": 23090 + }, + { + "epoch": 0.37079246857894993, + "grad_norm": 0.7245079278945923, + "learning_rate": 4.58837302587839e-05, + "loss": 0.7956, + "step": 23100 + }, + { + "epoch": 0.3709529847991139, + "grad_norm": 0.4015323221683502, + "learning_rate": 4.5880263904161715e-05, + "loss": 0.7506, + "step": 23110 + }, + { + "epoch": 0.371113501019278, + "grad_norm": 0.5833758115768433, + "learning_rate": 4.5876796221677294e-05, + "loss": 0.8118, + "step": 23120 + }, + { + "epoch": 0.37127401723944203, + "grad_norm": 0.6006354689598083, + "learning_rate": 4.587332721155117e-05, + "loss": 0.8133, + "step": 23130 + }, + { + "epoch": 0.3714345334596061, + "grad_norm": 0.5327298641204834, + "learning_rate": 4.586985687400396e-05, + "loss": 0.7537, + "step": 23140 + }, + { + "epoch": 0.37159504967977014, + "grad_norm": 0.7171925902366638, + "learning_rate": 4.5866385209256336e-05, + "loss": 0.7846, + "step": 23150 + }, + { + "epoch": 0.3717555658999342, + "grad_norm": 0.9267024993896484, + "learning_rate": 4.586291221752908e-05, + "loss": 0.8326, + "step": 23160 + }, + { + "epoch": 0.37191608212009825, + "grad_norm": 0.5947498679161072, + "learning_rate": 4.585943789904307e-05, + "loss": 0.8203, + "step": 23170 + }, + { + "epoch": 0.3720765983402623, + "grad_norm": 0.7526811361312866, + "learning_rate": 4.5855962254019244e-05, + "loss": 0.7218, + "step": 23180 + }, + { + "epoch": 0.37223711456042635, + "grad_norm": 0.7850362658500671, + "learning_rate": 4.5852485282678626e-05, + "loss": 0.8304, + "step": 23190 + }, + { + "epoch": 0.3723976307805904, + "grad_norm": 1.0868958234786987, + "learning_rate": 4.5849006985242335e-05, + "loss": 0.8431, + "step": 23200 + }, + { + "epoch": 0.3725581470007544, + "grad_norm": 0.5976437330245972, + "learning_rate": 4.5845527361931575e-05, + "loss": 0.7186, + "step": 23210 + }, + { + "epoch": 0.37271866322091846, + "grad_norm": 1.6988844871520996, + "learning_rate": 4.584204641296762e-05, + "loss": 0.7459, + "step": 23220 + }, + { + "epoch": 0.3728791794410825, + "grad_norm": 1.550459384918213, + "learning_rate": 4.5838564138571846e-05, + "loss": 0.7807, + "step": 23230 + }, + { + "epoch": 0.37303969566124656, + "grad_norm": 0.6035885810852051, + "learning_rate": 4.5835080538965705e-05, + "loss": 0.8304, + "step": 23240 + }, + { + "epoch": 0.3732002118814106, + "grad_norm": 0.5907220244407654, + "learning_rate": 4.583159561437073e-05, + "loss": 0.7908, + "step": 23250 + }, + { + "epoch": 0.37336072810157467, + "grad_norm": 0.7619858980178833, + "learning_rate": 4.582810936500854e-05, + "loss": 0.7794, + "step": 23260 + }, + { + "epoch": 0.3735212443217387, + "grad_norm": 0.6596529483795166, + "learning_rate": 4.582462179110085e-05, + "loss": 0.9044, + "step": 23270 + }, + { + "epoch": 0.3736817605419028, + "grad_norm": 0.6034225225448608, + "learning_rate": 4.582113289286943e-05, + "loss": 0.7223, + "step": 23280 + }, + { + "epoch": 0.3738422767620668, + "grad_norm": 1.7812895774841309, + "learning_rate": 4.581764267053618e-05, + "loss": 0.7921, + "step": 23290 + }, + { + "epoch": 0.3740027929822309, + "grad_norm": 0.7158597111701965, + "learning_rate": 4.5814151124323036e-05, + "loss": 0.7958, + "step": 23300 + }, + { + "epoch": 0.3741633092023949, + "grad_norm": 0.6867296695709229, + "learning_rate": 4.581065825445205e-05, + "loss": 0.7653, + "step": 23310 + }, + { + "epoch": 0.37432382542255893, + "grad_norm": 0.7133305072784424, + "learning_rate": 4.580716406114534e-05, + "loss": 0.8817, + "step": 23320 + }, + { + "epoch": 0.374484341642723, + "grad_norm": 0.5379882454872131, + "learning_rate": 4.5803668544625125e-05, + "loss": 0.8487, + "step": 23330 + }, + { + "epoch": 0.37464485786288704, + "grad_norm": 0.5826191902160645, + "learning_rate": 4.580017170511369e-05, + "loss": 0.8048, + "step": 23340 + }, + { + "epoch": 0.3748053740830511, + "grad_norm": 0.7290608882904053, + "learning_rate": 4.579667354283343e-05, + "loss": 0.6935, + "step": 23350 + }, + { + "epoch": 0.37496589030321514, + "grad_norm": 0.6665160059928894, + "learning_rate": 4.579317405800679e-05, + "loss": 0.7149, + "step": 23360 + }, + { + "epoch": 0.3751264065233792, + "grad_norm": 1.0511313676834106, + "learning_rate": 4.578967325085632e-05, + "loss": 0.8042, + "step": 23370 + }, + { + "epoch": 0.37528692274354325, + "grad_norm": 0.7534080743789673, + "learning_rate": 4.5786171121604664e-05, + "loss": 0.8528, + "step": 23380 + }, + { + "epoch": 0.3754474389637073, + "grad_norm": 0.48247018456459045, + "learning_rate": 4.578266767047452e-05, + "loss": 0.7519, + "step": 23390 + }, + { + "epoch": 0.37560795518387136, + "grad_norm": 0.8346527814865112, + "learning_rate": 4.57791628976887e-05, + "loss": 0.8201, + "step": 23400 + }, + { + "epoch": 0.37576847140403535, + "grad_norm": 0.4536258578300476, + "learning_rate": 4.577565680347008e-05, + "loss": 0.766, + "step": 23410 + }, + { + "epoch": 0.3759289876241994, + "grad_norm": 0.5089630484580994, + "learning_rate": 4.577214938804162e-05, + "loss": 0.7804, + "step": 23420 + }, + { + "epoch": 0.37608950384436346, + "grad_norm": 0.6607599258422852, + "learning_rate": 4.576864065162638e-05, + "loss": 0.8253, + "step": 23430 + }, + { + "epoch": 0.3762500200645275, + "grad_norm": 0.517585039138794, + "learning_rate": 4.57651305944475e-05, + "loss": 0.8294, + "step": 23440 + }, + { + "epoch": 0.37641053628469157, + "grad_norm": 0.5855730175971985, + "learning_rate": 4.57616192167282e-05, + "loss": 0.6539, + "step": 23450 + }, + { + "epoch": 0.3765710525048556, + "grad_norm": 0.8179181814193726, + "learning_rate": 4.575810651869176e-05, + "loss": 0.6778, + "step": 23460 + }, + { + "epoch": 0.37673156872501967, + "grad_norm": 0.8776317834854126, + "learning_rate": 4.5754592500561597e-05, + "loss": 0.7605, + "step": 23470 + }, + { + "epoch": 0.3768920849451837, + "grad_norm": 0.5281404852867126, + "learning_rate": 4.575107716256116e-05, + "loss": 0.8415, + "step": 23480 + }, + { + "epoch": 0.3770526011653478, + "grad_norm": 0.42407017946243286, + "learning_rate": 4.574756050491401e-05, + "loss": 0.7176, + "step": 23490 + }, + { + "epoch": 0.37721311738551183, + "grad_norm": 0.6243557929992676, + "learning_rate": 4.574404252784379e-05, + "loss": 0.7453, + "step": 23500 + }, + { + "epoch": 0.37737363360567583, + "grad_norm": 0.5536555647850037, + "learning_rate": 4.5740523231574223e-05, + "loss": 0.7076, + "step": 23510 + }, + { + "epoch": 0.3775341498258399, + "grad_norm": 0.515758752822876, + "learning_rate": 4.573700261632911e-05, + "loss": 0.7423, + "step": 23520 + }, + { + "epoch": 0.37769466604600394, + "grad_norm": 0.6168943643569946, + "learning_rate": 4.5733480682332355e-05, + "loss": 0.7302, + "step": 23530 + }, + { + "epoch": 0.377855182266168, + "grad_norm": 0.5707930326461792, + "learning_rate": 4.572995742980791e-05, + "loss": 0.7748, + "step": 23540 + }, + { + "epoch": 0.37801569848633204, + "grad_norm": 0.5863920450210571, + "learning_rate": 4.572643285897984e-05, + "loss": 0.883, + "step": 23550 + }, + { + "epoch": 0.3781762147064961, + "grad_norm": 0.5148717164993286, + "learning_rate": 4.57229069700723e-05, + "loss": 0.8692, + "step": 23560 + }, + { + "epoch": 0.37833673092666015, + "grad_norm": 0.4728262722492218, + "learning_rate": 4.571937976330951e-05, + "loss": 0.7161, + "step": 23570 + }, + { + "epoch": 0.3784972471468242, + "grad_norm": 0.8802021145820618, + "learning_rate": 4.571585123891577e-05, + "loss": 0.8541, + "step": 23580 + }, + { + "epoch": 0.37865776336698825, + "grad_norm": 0.7392709255218506, + "learning_rate": 4.571232139711549e-05, + "loss": 0.7207, + "step": 23590 + }, + { + "epoch": 0.3788182795871523, + "grad_norm": 0.8767966628074646, + "learning_rate": 4.570879023813314e-05, + "loss": 0.7783, + "step": 23600 + }, + { + "epoch": 0.3789787958073163, + "grad_norm": 0.49736180901527405, + "learning_rate": 4.5705257762193273e-05, + "loss": 0.7235, + "step": 23610 + }, + { + "epoch": 0.37913931202748036, + "grad_norm": 0.6473224759101868, + "learning_rate": 4.570172396952054e-05, + "loss": 0.9119, + "step": 23620 + }, + { + "epoch": 0.3792998282476444, + "grad_norm": 0.846282422542572, + "learning_rate": 4.569818886033966e-05, + "loss": 0.7271, + "step": 23630 + }, + { + "epoch": 0.37946034446780846, + "grad_norm": 0.6917523741722107, + "learning_rate": 4.5694652434875474e-05, + "loss": 0.8758, + "step": 23640 + }, + { + "epoch": 0.3796208606879725, + "grad_norm": 4.209697246551514, + "learning_rate": 4.5691114693352844e-05, + "loss": 0.7974, + "step": 23650 + }, + { + "epoch": 0.37978137690813657, + "grad_norm": 0.474092960357666, + "learning_rate": 4.568757563599677e-05, + "loss": 0.7933, + "step": 23660 + }, + { + "epoch": 0.3799418931283006, + "grad_norm": 1.1997113227844238, + "learning_rate": 4.568403526303231e-05, + "loss": 0.8324, + "step": 23670 + }, + { + "epoch": 0.3801024093484647, + "grad_norm": 0.6084169745445251, + "learning_rate": 4.56804935746846e-05, + "loss": 0.7881, + "step": 23680 + }, + { + "epoch": 0.38026292556862873, + "grad_norm": 0.5473749041557312, + "learning_rate": 4.5676950571178894e-05, + "loss": 0.8441, + "step": 23690 + }, + { + "epoch": 0.3804234417887928, + "grad_norm": 1.2384791374206543, + "learning_rate": 4.5673406252740495e-05, + "loss": 0.7289, + "step": 23700 + }, + { + "epoch": 0.3805839580089568, + "grad_norm": 0.6605951189994812, + "learning_rate": 4.566986061959479e-05, + "loss": 0.7778, + "step": 23710 + }, + { + "epoch": 0.38074447422912083, + "grad_norm": 0.6466602683067322, + "learning_rate": 4.5666313671967265e-05, + "loss": 0.8859, + "step": 23720 + }, + { + "epoch": 0.3809049904492849, + "grad_norm": 1.6250874996185303, + "learning_rate": 4.56627654100835e-05, + "loss": 0.8543, + "step": 23730 + }, + { + "epoch": 0.38106550666944894, + "grad_norm": 0.7567691206932068, + "learning_rate": 4.5659215834169125e-05, + "loss": 0.7134, + "step": 23740 + }, + { + "epoch": 0.381226022889613, + "grad_norm": 0.6296892166137695, + "learning_rate": 4.565566494444988e-05, + "loss": 0.821, + "step": 23750 + }, + { + "epoch": 0.38138653910977705, + "grad_norm": 0.5961968302726746, + "learning_rate": 4.565211274115159e-05, + "loss": 0.7403, + "step": 23760 + }, + { + "epoch": 0.3815470553299411, + "grad_norm": 1.9127998352050781, + "learning_rate": 4.564855922450014e-05, + "loss": 0.8367, + "step": 23770 + }, + { + "epoch": 0.38170757155010515, + "grad_norm": 0.8796523809432983, + "learning_rate": 4.564500439472151e-05, + "loss": 0.8188, + "step": 23780 + }, + { + "epoch": 0.3818680877702692, + "grad_norm": 0.728481113910675, + "learning_rate": 4.564144825204179e-05, + "loss": 0.7612, + "step": 23790 + }, + { + "epoch": 0.38202860399043326, + "grad_norm": 0.6583888530731201, + "learning_rate": 4.56378907966871e-05, + "loss": 0.7883, + "step": 23800 + }, + { + "epoch": 0.38218912021059726, + "grad_norm": 1.1330615282058716, + "learning_rate": 4.56343320288837e-05, + "loss": 0.8509, + "step": 23810 + }, + { + "epoch": 0.3823496364307613, + "grad_norm": 0.5186707973480225, + "learning_rate": 4.5630771948857886e-05, + "loss": 0.8855, + "step": 23820 + }, + { + "epoch": 0.38251015265092536, + "grad_norm": 0.5887451171875, + "learning_rate": 4.562721055683607e-05, + "loss": 0.8516, + "step": 23830 + }, + { + "epoch": 0.3826706688710894, + "grad_norm": 0.9954094886779785, + "learning_rate": 4.562364785304473e-05, + "loss": 0.8433, + "step": 23840 + }, + { + "epoch": 0.38283118509125347, + "grad_norm": 0.9373178482055664, + "learning_rate": 4.5620083837710436e-05, + "loss": 0.8328, + "step": 23850 + }, + { + "epoch": 0.3829917013114175, + "grad_norm": 0.669390857219696, + "learning_rate": 4.561651851105984e-05, + "loss": 0.8539, + "step": 23860 + }, + { + "epoch": 0.3831522175315816, + "grad_norm": 0.49178019165992737, + "learning_rate": 4.561295187331967e-05, + "loss": 0.8911, + "step": 23870 + }, + { + "epoch": 0.3833127337517456, + "grad_norm": 0.8359445333480835, + "learning_rate": 4.560938392471674e-05, + "loss": 0.811, + "step": 23880 + }, + { + "epoch": 0.3834732499719097, + "grad_norm": 0.6635581851005554, + "learning_rate": 4.5605814665477975e-05, + "loss": 0.7113, + "step": 23890 + }, + { + "epoch": 0.38363376619207373, + "grad_norm": 0.4507424235343933, + "learning_rate": 4.560224409583033e-05, + "loss": 0.6616, + "step": 23900 + }, + { + "epoch": 0.38379428241223773, + "grad_norm": 0.6536629796028137, + "learning_rate": 4.559867221600088e-05, + "loss": 0.7455, + "step": 23910 + }, + { + "epoch": 0.3839547986324018, + "grad_norm": 1.7415928840637207, + "learning_rate": 4.559509902621678e-05, + "loss": 0.9262, + "step": 23920 + }, + { + "epoch": 0.38411531485256584, + "grad_norm": 0.8906059861183167, + "learning_rate": 4.559152452670527e-05, + "loss": 0.7307, + "step": 23930 + }, + { + "epoch": 0.3842758310727299, + "grad_norm": 0.5937884449958801, + "learning_rate": 4.5587948717693655e-05, + "loss": 0.8123, + "step": 23940 + }, + { + "epoch": 0.38443634729289394, + "grad_norm": 0.6662039756774902, + "learning_rate": 4.558437159940934e-05, + "loss": 0.7055, + "step": 23950 + }, + { + "epoch": 0.384596863513058, + "grad_norm": 0.5371652245521545, + "learning_rate": 4.5580793172079815e-05, + "loss": 0.7488, + "step": 23960 + }, + { + "epoch": 0.38475737973322205, + "grad_norm": 0.7653747200965881, + "learning_rate": 4.557721343593263e-05, + "loss": 0.8513, + "step": 23970 + }, + { + "epoch": 0.3849178959533861, + "grad_norm": 0.9889721870422363, + "learning_rate": 4.557363239119546e-05, + "loss": 0.815, + "step": 23980 + }, + { + "epoch": 0.38507841217355016, + "grad_norm": 0.8755155801773071, + "learning_rate": 4.5570050038096014e-05, + "loss": 0.7534, + "step": 23990 + }, + { + "epoch": 0.3852389283937142, + "grad_norm": 0.6618263721466064, + "learning_rate": 4.556646637686213e-05, + "loss": 0.76, + "step": 24000 + }, + { + "epoch": 0.3852389283937142, + "eval_loss": 0.799217700958252, + "eval_runtime": 1833.4635, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 1.788, + "step": 24000 + }, + { + "epoch": 0.3853994446138782, + "grad_norm": 0.6454071402549744, + "learning_rate": 4.556288140772169e-05, + "loss": 0.7493, + "step": 24010 + }, + { + "epoch": 0.38555996083404226, + "grad_norm": 0.5538886785507202, + "learning_rate": 4.5559295130902676e-05, + "loss": 0.7471, + "step": 24020 + }, + { + "epoch": 0.3857204770542063, + "grad_norm": 0.5739086866378784, + "learning_rate": 4.555570754663318e-05, + "loss": 0.778, + "step": 24030 + }, + { + "epoch": 0.38588099327437037, + "grad_norm": 0.6470929980278015, + "learning_rate": 4.555211865514133e-05, + "loss": 0.7001, + "step": 24040 + }, + { + "epoch": 0.3860415094945344, + "grad_norm": 1.022923231124878, + "learning_rate": 4.554852845665535e-05, + "loss": 0.7099, + "step": 24050 + }, + { + "epoch": 0.38620202571469847, + "grad_norm": 0.786246657371521, + "learning_rate": 4.5544936951403585e-05, + "loss": 0.7529, + "step": 24060 + }, + { + "epoch": 0.3863625419348625, + "grad_norm": 0.7856319546699524, + "learning_rate": 4.5541344139614405e-05, + "loss": 0.6663, + "step": 24070 + }, + { + "epoch": 0.3865230581550266, + "grad_norm": 0.7413445115089417, + "learning_rate": 4.55377500215163e-05, + "loss": 0.9612, + "step": 24080 + }, + { + "epoch": 0.38668357437519063, + "grad_norm": 0.7457872033119202, + "learning_rate": 4.553415459733785e-05, + "loss": 0.7643, + "step": 24090 + }, + { + "epoch": 0.3868440905953547, + "grad_norm": 0.571061909198761, + "learning_rate": 4.553055786730768e-05, + "loss": 0.8576, + "step": 24100 + }, + { + "epoch": 0.3870046068155187, + "grad_norm": 0.5689486861228943, + "learning_rate": 4.5526959831654536e-05, + "loss": 0.6946, + "step": 24110 + }, + { + "epoch": 0.38716512303568273, + "grad_norm": 0.7376084923744202, + "learning_rate": 4.5523360490607225e-05, + "loss": 0.8631, + "step": 24120 + }, + { + "epoch": 0.3873256392558468, + "grad_norm": 1.0781927108764648, + "learning_rate": 4.551975984439465e-05, + "loss": 0.7813, + "step": 24130 + }, + { + "epoch": 0.38748615547601084, + "grad_norm": 0.6681115031242371, + "learning_rate": 4.551615789324579e-05, + "loss": 0.7509, + "step": 24140 + }, + { + "epoch": 0.3876466716961749, + "grad_norm": 0.6831288933753967, + "learning_rate": 4.551255463738971e-05, + "loss": 0.7115, + "step": 24150 + }, + { + "epoch": 0.38780718791633895, + "grad_norm": 1.1624404191970825, + "learning_rate": 4.550895007705553e-05, + "loss": 0.8075, + "step": 24160 + }, + { + "epoch": 0.387967704136503, + "grad_norm": 0.7589865326881409, + "learning_rate": 4.550534421247252e-05, + "loss": 0.8273, + "step": 24170 + }, + { + "epoch": 0.38812822035666705, + "grad_norm": 0.6146861910820007, + "learning_rate": 4.550173704386997e-05, + "loss": 0.6485, + "step": 24180 + }, + { + "epoch": 0.3882887365768311, + "grad_norm": 0.787503182888031, + "learning_rate": 4.5498128571477274e-05, + "loss": 0.6966, + "step": 24190 + }, + { + "epoch": 0.38844925279699516, + "grad_norm": 0.6921367049217224, + "learning_rate": 4.549451879552391e-05, + "loss": 0.7534, + "step": 24200 + }, + { + "epoch": 0.38860976901715916, + "grad_norm": 0.7584224343299866, + "learning_rate": 4.549090771623945e-05, + "loss": 0.8118, + "step": 24210 + }, + { + "epoch": 0.3887702852373232, + "grad_norm": 1.418880820274353, + "learning_rate": 4.548729533385352e-05, + "loss": 0.7651, + "step": 24220 + }, + { + "epoch": 0.38893080145748726, + "grad_norm": 0.9400140643119812, + "learning_rate": 4.5483681648595856e-05, + "loss": 0.9064, + "step": 24230 + }, + { + "epoch": 0.3890913176776513, + "grad_norm": 0.7684749960899353, + "learning_rate": 4.548006666069626e-05, + "loss": 0.6838, + "step": 24240 + }, + { + "epoch": 0.38925183389781537, + "grad_norm": 0.7030231952667236, + "learning_rate": 4.5476450370384645e-05, + "loss": 0.7556, + "step": 24250 + }, + { + "epoch": 0.3894123501179794, + "grad_norm": 0.6187105774879456, + "learning_rate": 4.547283277789096e-05, + "loss": 0.8249, + "step": 24260 + }, + { + "epoch": 0.3895728663381435, + "grad_norm": 0.9427775144577026, + "learning_rate": 4.5469213883445285e-05, + "loss": 0.9291, + "step": 24270 + }, + { + "epoch": 0.38973338255830753, + "grad_norm": 0.5715904235839844, + "learning_rate": 4.546559368727774e-05, + "loss": 0.74, + "step": 24280 + }, + { + "epoch": 0.3898938987784716, + "grad_norm": 0.5233327150344849, + "learning_rate": 4.5461972189618554e-05, + "loss": 0.7604, + "step": 24290 + }, + { + "epoch": 0.39005441499863563, + "grad_norm": 0.5863792300224304, + "learning_rate": 4.545834939069804e-05, + "loss": 0.7522, + "step": 24300 + }, + { + "epoch": 0.39021493121879963, + "grad_norm": 0.6770857572555542, + "learning_rate": 4.5454725290746585e-05, + "loss": 0.6389, + "step": 24310 + }, + { + "epoch": 0.3903754474389637, + "grad_norm": 0.5156643390655518, + "learning_rate": 4.545109988999466e-05, + "loss": 0.7756, + "step": 24320 + }, + { + "epoch": 0.39053596365912774, + "grad_norm": 0.9823703765869141, + "learning_rate": 4.5447473188672816e-05, + "loss": 0.8086, + "step": 24330 + }, + { + "epoch": 0.3906964798792918, + "grad_norm": 0.6309024691581726, + "learning_rate": 4.544384518701169e-05, + "loss": 0.7816, + "step": 24340 + }, + { + "epoch": 0.39085699609945584, + "grad_norm": 0.5780825018882751, + "learning_rate": 4.544021588524201e-05, + "loss": 0.7156, + "step": 24350 + }, + { + "epoch": 0.3910175123196199, + "grad_norm": 0.9141623377799988, + "learning_rate": 4.543658528359456e-05, + "loss": 0.8516, + "step": 24360 + }, + { + "epoch": 0.39117802853978395, + "grad_norm": 0.6925287842750549, + "learning_rate": 4.5432953382300245e-05, + "loss": 0.8436, + "step": 24370 + }, + { + "epoch": 0.391338544759948, + "grad_norm": 0.6072632670402527, + "learning_rate": 4.542932018159002e-05, + "loss": 0.7856, + "step": 24380 + }, + { + "epoch": 0.39149906098011206, + "grad_norm": 0.695770263671875, + "learning_rate": 4.542568568169494e-05, + "loss": 0.828, + "step": 24390 + }, + { + "epoch": 0.3916595772002761, + "grad_norm": 1.002071738243103, + "learning_rate": 4.542204988284614e-05, + "loss": 0.8811, + "step": 24400 + }, + { + "epoch": 0.39182009342044016, + "grad_norm": 0.8940895199775696, + "learning_rate": 4.541841278527483e-05, + "loss": 0.7918, + "step": 24410 + }, + { + "epoch": 0.39198060964060416, + "grad_norm": 0.6040815711021423, + "learning_rate": 4.541477438921232e-05, + "loss": 0.7464, + "step": 24420 + }, + { + "epoch": 0.3921411258607682, + "grad_norm": 0.879496157169342, + "learning_rate": 4.541113469488997e-05, + "loss": 0.7942, + "step": 24430 + }, + { + "epoch": 0.39230164208093227, + "grad_norm": 0.8317579030990601, + "learning_rate": 4.540749370253925e-05, + "loss": 0.8502, + "step": 24440 + }, + { + "epoch": 0.3924621583010963, + "grad_norm": 0.5303470492362976, + "learning_rate": 4.5403851412391726e-05, + "loss": 0.7239, + "step": 24450 + }, + { + "epoch": 0.3926226745212604, + "grad_norm": 0.8070871829986572, + "learning_rate": 4.5400207824679e-05, + "loss": 0.8896, + "step": 24460 + }, + { + "epoch": 0.3927831907414244, + "grad_norm": 0.7398529052734375, + "learning_rate": 4.53965629396328e-05, + "loss": 0.9194, + "step": 24470 + }, + { + "epoch": 0.3929437069615885, + "grad_norm": 0.9029960036277771, + "learning_rate": 4.53929167574849e-05, + "loss": 0.9052, + "step": 24480 + }, + { + "epoch": 0.39310422318175253, + "grad_norm": 0.5188467502593994, + "learning_rate": 4.538926927846721e-05, + "loss": 0.7623, + "step": 24490 + }, + { + "epoch": 0.3932647394019166, + "grad_norm": 0.8813090920448303, + "learning_rate": 4.538562050281165e-05, + "loss": 0.8609, + "step": 24500 + }, + { + "epoch": 0.39342525562208064, + "grad_norm": 0.6028708815574646, + "learning_rate": 4.538197043075028e-05, + "loss": 0.7565, + "step": 24510 + }, + { + "epoch": 0.39358577184224464, + "grad_norm": 0.8088762760162354, + "learning_rate": 4.5378319062515223e-05, + "loss": 0.7336, + "step": 24520 + }, + { + "epoch": 0.3937462880624087, + "grad_norm": 0.5796242952346802, + "learning_rate": 4.537466639833868e-05, + "loss": 0.725, + "step": 24530 + }, + { + "epoch": 0.39390680428257274, + "grad_norm": 0.46934226155281067, + "learning_rate": 4.537101243845295e-05, + "loss": 0.8238, + "step": 24540 + }, + { + "epoch": 0.3940673205027368, + "grad_norm": 0.614686131477356, + "learning_rate": 4.5367357183090384e-05, + "loss": 0.8637, + "step": 24550 + }, + { + "epoch": 0.39422783672290085, + "grad_norm": 0.8760473132133484, + "learning_rate": 4.536370063248345e-05, + "loss": 0.8053, + "step": 24560 + }, + { + "epoch": 0.3943883529430649, + "grad_norm": 0.5828807950019836, + "learning_rate": 4.5360042786864674e-05, + "loss": 0.691, + "step": 24570 + }, + { + "epoch": 0.39454886916322895, + "grad_norm": 0.7572222352027893, + "learning_rate": 4.5356383646466684e-05, + "loss": 0.8255, + "step": 24580 + }, + { + "epoch": 0.394709385383393, + "grad_norm": 0.5499728322029114, + "learning_rate": 4.535272321152217e-05, + "loss": 0.8174, + "step": 24590 + }, + { + "epoch": 0.39486990160355706, + "grad_norm": 0.7506630420684814, + "learning_rate": 4.5349061482263923e-05, + "loss": 0.7741, + "step": 24600 + }, + { + "epoch": 0.3950304178237211, + "grad_norm": 0.6165923476219177, + "learning_rate": 4.534539845892479e-05, + "loss": 0.7386, + "step": 24610 + }, + { + "epoch": 0.3951909340438851, + "grad_norm": 0.5080572366714478, + "learning_rate": 4.534173414173774e-05, + "loss": 0.8171, + "step": 24620 + }, + { + "epoch": 0.39535145026404916, + "grad_norm": 0.46716031432151794, + "learning_rate": 4.5338068530935794e-05, + "loss": 0.7312, + "step": 24630 + }, + { + "epoch": 0.3955119664842132, + "grad_norm": 0.7674124240875244, + "learning_rate": 4.533440162675207e-05, + "loss": 0.7986, + "step": 24640 + }, + { + "epoch": 0.39567248270437727, + "grad_norm": 0.8052200675010681, + "learning_rate": 4.533073342941974e-05, + "loss": 0.8095, + "step": 24650 + }, + { + "epoch": 0.3958329989245413, + "grad_norm": 0.4918213188648224, + "learning_rate": 4.5327063939172094e-05, + "loss": 0.6853, + "step": 24660 + }, + { + "epoch": 0.3959935151447054, + "grad_norm": 0.5860998034477234, + "learning_rate": 4.5323393156242486e-05, + "loss": 0.8525, + "step": 24670 + }, + { + "epoch": 0.39615403136486943, + "grad_norm": 0.6232370734214783, + "learning_rate": 4.5319721080864375e-05, + "loss": 0.8487, + "step": 24680 + }, + { + "epoch": 0.3963145475850335, + "grad_norm": 1.1743518114089966, + "learning_rate": 4.531604771327125e-05, + "loss": 0.8491, + "step": 24690 + }, + { + "epoch": 0.39647506380519754, + "grad_norm": 0.6020656824111938, + "learning_rate": 4.531237305369674e-05, + "loss": 0.7851, + "step": 24700 + }, + { + "epoch": 0.3966355800253616, + "grad_norm": 0.5549682974815369, + "learning_rate": 4.530869710237453e-05, + "loss": 0.7092, + "step": 24710 + }, + { + "epoch": 0.3967960962455256, + "grad_norm": 0.8065360188484192, + "learning_rate": 4.5305019859538385e-05, + "loss": 0.8434, + "step": 24720 + }, + { + "epoch": 0.39695661246568964, + "grad_norm": 0.9088863730430603, + "learning_rate": 4.5301341325422155e-05, + "loss": 0.8094, + "step": 24730 + }, + { + "epoch": 0.3971171286858537, + "grad_norm": 0.7540697455406189, + "learning_rate": 4.5297661500259764e-05, + "loss": 0.878, + "step": 24740 + }, + { + "epoch": 0.39727764490601775, + "grad_norm": 0.6445760130882263, + "learning_rate": 4.529398038428524e-05, + "loss": 0.734, + "step": 24750 + }, + { + "epoch": 0.3974381611261818, + "grad_norm": 0.6647154092788696, + "learning_rate": 4.529029797773268e-05, + "loss": 0.8562, + "step": 24760 + }, + { + "epoch": 0.39759867734634585, + "grad_norm": 0.699026346206665, + "learning_rate": 4.528661428083626e-05, + "loss": 0.7086, + "step": 24770 + }, + { + "epoch": 0.3977591935665099, + "grad_norm": 0.5066846013069153, + "learning_rate": 4.5282929293830246e-05, + "loss": 0.8477, + "step": 24780 + }, + { + "epoch": 0.39791970978667396, + "grad_norm": 0.8635606169700623, + "learning_rate": 4.5279243016948976e-05, + "loss": 0.8262, + "step": 24790 + }, + { + "epoch": 0.398080226006838, + "grad_norm": 0.5619319677352905, + "learning_rate": 4.5275555450426874e-05, + "loss": 0.7553, + "step": 24800 + }, + { + "epoch": 0.39824074222700206, + "grad_norm": 0.5164029598236084, + "learning_rate": 4.5271866594498446e-05, + "loss": 0.8701, + "step": 24810 + }, + { + "epoch": 0.39840125844716606, + "grad_norm": 0.468504399061203, + "learning_rate": 4.52681764493983e-05, + "loss": 0.6765, + "step": 24820 + }, + { + "epoch": 0.3985617746673301, + "grad_norm": 0.5469168424606323, + "learning_rate": 4.526448501536108e-05, + "loss": 0.7744, + "step": 24830 + }, + { + "epoch": 0.39872229088749417, + "grad_norm": 0.7058166861534119, + "learning_rate": 4.526079229262156e-05, + "loss": 0.8329, + "step": 24840 + }, + { + "epoch": 0.3988828071076582, + "grad_norm": 0.6004014611244202, + "learning_rate": 4.525709828141457e-05, + "loss": 0.7797, + "step": 24850 + }, + { + "epoch": 0.3990433233278223, + "grad_norm": 0.5698897242546082, + "learning_rate": 4.525340298197502e-05, + "loss": 0.7078, + "step": 24860 + }, + { + "epoch": 0.3992038395479863, + "grad_norm": 0.6155770421028137, + "learning_rate": 4.524970639453793e-05, + "loss": 0.9668, + "step": 24870 + }, + { + "epoch": 0.3993643557681504, + "grad_norm": 0.6789118051528931, + "learning_rate": 4.524600851933836e-05, + "loss": 0.8537, + "step": 24880 + }, + { + "epoch": 0.39952487198831443, + "grad_norm": 0.4898076057434082, + "learning_rate": 4.524230935661147e-05, + "loss": 0.742, + "step": 24890 + }, + { + "epoch": 0.3996853882084785, + "grad_norm": 0.7299210429191589, + "learning_rate": 4.523860890659253e-05, + "loss": 0.8702, + "step": 24900 + }, + { + "epoch": 0.39984590442864254, + "grad_norm": 0.6520401835441589, + "learning_rate": 4.523490716951684e-05, + "loss": 0.8081, + "step": 24910 + }, + { + "epoch": 0.40000642064880654, + "grad_norm": 0.8714832663536072, + "learning_rate": 4.523120414561983e-05, + "loss": 0.8355, + "step": 24920 + }, + { + "epoch": 0.4001669368689706, + "grad_norm": 0.653069257736206, + "learning_rate": 4.522749983513698e-05, + "loss": 0.9109, + "step": 24930 + }, + { + "epoch": 0.40032745308913464, + "grad_norm": 0.5572654008865356, + "learning_rate": 4.522379423830386e-05, + "loss": 0.7446, + "step": 24940 + }, + { + "epoch": 0.4004879693092987, + "grad_norm": 0.9861459732055664, + "learning_rate": 4.522008735535613e-05, + "loss": 0.692, + "step": 24950 + }, + { + "epoch": 0.40064848552946275, + "grad_norm": 0.9940382838249207, + "learning_rate": 4.521637918652952e-05, + "loss": 0.8473, + "step": 24960 + }, + { + "epoch": 0.4008090017496268, + "grad_norm": 0.7015661001205444, + "learning_rate": 4.5212669732059856e-05, + "loss": 0.7705, + "step": 24970 + }, + { + "epoch": 0.40096951796979086, + "grad_norm": 0.8791458010673523, + "learning_rate": 4.520895899218304e-05, + "loss": 0.8137, + "step": 24980 + }, + { + "epoch": 0.4011300341899549, + "grad_norm": 0.7917571067810059, + "learning_rate": 4.5205246967135034e-05, + "loss": 0.7644, + "step": 24990 + }, + { + "epoch": 0.40129055041011896, + "grad_norm": 0.6815536022186279, + "learning_rate": 4.520153365715192e-05, + "loss": 0.7258, + "step": 25000 + }, + { + "epoch": 0.401451066630283, + "grad_norm": 0.815666675567627, + "learning_rate": 4.519781906246984e-05, + "loss": 0.7981, + "step": 25010 + }, + { + "epoch": 0.401611582850447, + "grad_norm": 0.6237431168556213, + "learning_rate": 4.519410318332501e-05, + "loss": 0.7788, + "step": 25020 + }, + { + "epoch": 0.40177209907061107, + "grad_norm": 1.030613660812378, + "learning_rate": 4.519038601995375e-05, + "loss": 0.8537, + "step": 25030 + }, + { + "epoch": 0.4019326152907751, + "grad_norm": 0.7832012176513672, + "learning_rate": 4.518666757259243e-05, + "loss": 0.8662, + "step": 25040 + }, + { + "epoch": 0.4020931315109392, + "grad_norm": 0.5063576102256775, + "learning_rate": 4.518294784147755e-05, + "loss": 0.8419, + "step": 25050 + }, + { + "epoch": 0.4022536477311032, + "grad_norm": 0.9271500110626221, + "learning_rate": 4.517922682684565e-05, + "loss": 0.9761, + "step": 25060 + }, + { + "epoch": 0.4024141639512673, + "grad_norm": 0.45984119176864624, + "learning_rate": 4.517550452893337e-05, + "loss": 0.8871, + "step": 25070 + }, + { + "epoch": 0.40257468017143133, + "grad_norm": 0.7789679765701294, + "learning_rate": 4.5171780947977406e-05, + "loss": 0.7832, + "step": 25080 + }, + { + "epoch": 0.4027351963915954, + "grad_norm": 0.5886250734329224, + "learning_rate": 4.516805608421457e-05, + "loss": 0.8046, + "step": 25090 + }, + { + "epoch": 0.40289571261175944, + "grad_norm": 0.7177681922912598, + "learning_rate": 4.516432993788175e-05, + "loss": 0.8583, + "step": 25100 + }, + { + "epoch": 0.4030562288319235, + "grad_norm": 2.339966058731079, + "learning_rate": 4.51606025092159e-05, + "loss": 0.7466, + "step": 25110 + }, + { + "epoch": 0.4032167450520875, + "grad_norm": 1.0352290868759155, + "learning_rate": 4.515687379845405e-05, + "loss": 0.8183, + "step": 25120 + }, + { + "epoch": 0.40337726127225154, + "grad_norm": 1.1042932271957397, + "learning_rate": 4.515314380583334e-05, + "loss": 0.8371, + "step": 25130 + }, + { + "epoch": 0.4035377774924156, + "grad_norm": 0.6539503335952759, + "learning_rate": 4.514941253159097e-05, + "loss": 0.8534, + "step": 25140 + }, + { + "epoch": 0.40369829371257965, + "grad_norm": 0.826859712600708, + "learning_rate": 4.5145679975964236e-05, + "loss": 0.8539, + "step": 25150 + }, + { + "epoch": 0.4038588099327437, + "grad_norm": 0.5967040657997131, + "learning_rate": 4.514194613919049e-05, + "loss": 0.7393, + "step": 25160 + }, + { + "epoch": 0.40401932615290775, + "grad_norm": 0.7431623339653015, + "learning_rate": 4.51382110215072e-05, + "loss": 0.7353, + "step": 25170 + }, + { + "epoch": 0.4041798423730718, + "grad_norm": 0.5350872874259949, + "learning_rate": 4.513447462315188e-05, + "loss": 0.8583, + "step": 25180 + }, + { + "epoch": 0.40434035859323586, + "grad_norm": 0.47206249833106995, + "learning_rate": 4.513073694436216e-05, + "loss": 0.7303, + "step": 25190 + }, + { + "epoch": 0.4045008748133999, + "grad_norm": 0.6374521851539612, + "learning_rate": 4.5126997985375717e-05, + "loss": 0.7589, + "step": 25200 + }, + { + "epoch": 0.40466139103356397, + "grad_norm": 1.0443916320800781, + "learning_rate": 4.5123257746430334e-05, + "loss": 0.8063, + "step": 25210 + }, + { + "epoch": 0.40482190725372796, + "grad_norm": 0.6520248651504517, + "learning_rate": 4.511951622776388e-05, + "loss": 0.6654, + "step": 25220 + }, + { + "epoch": 0.404982423473892, + "grad_norm": 1.280554175376892, + "learning_rate": 4.511577342961427e-05, + "loss": 0.788, + "step": 25230 + }, + { + "epoch": 0.40514293969405607, + "grad_norm": 1.081368088722229, + "learning_rate": 4.5112029352219546e-05, + "loss": 0.7686, + "step": 25240 + }, + { + "epoch": 0.4053034559142201, + "grad_norm": 1.1077321767807007, + "learning_rate": 4.5108283995817805e-05, + "loss": 0.8106, + "step": 25250 + }, + { + "epoch": 0.4054639721343842, + "grad_norm": 1.1062424182891846, + "learning_rate": 4.510453736064721e-05, + "loss": 0.8504, + "step": 25260 + }, + { + "epoch": 0.40562448835454823, + "grad_norm": 0.7002018690109253, + "learning_rate": 4.510078944694606e-05, + "loss": 0.7367, + "step": 25270 + }, + { + "epoch": 0.4057850045747123, + "grad_norm": 0.7555592656135559, + "learning_rate": 4.509704025495267e-05, + "loss": 0.7302, + "step": 25280 + }, + { + "epoch": 0.40594552079487634, + "grad_norm": 0.5861351490020752, + "learning_rate": 4.509328978490548e-05, + "loss": 0.772, + "step": 25290 + }, + { + "epoch": 0.4061060370150404, + "grad_norm": 1.0273548364639282, + "learning_rate": 4.5089538037042994e-05, + "loss": 0.7587, + "step": 25300 + }, + { + "epoch": 0.40626655323520444, + "grad_norm": 1.065994143486023, + "learning_rate": 4.50857850116038e-05, + "loss": 0.7428, + "step": 25310 + }, + { + "epoch": 0.40642706945536844, + "grad_norm": 0.7554126977920532, + "learning_rate": 4.508203070882658e-05, + "loss": 0.8659, + "step": 25320 + }, + { + "epoch": 0.4065875856755325, + "grad_norm": 0.6028984189033508, + "learning_rate": 4.507827512895006e-05, + "loss": 0.7562, + "step": 25330 + }, + { + "epoch": 0.40674810189569655, + "grad_norm": 0.48964154720306396, + "learning_rate": 4.5074518272213105e-05, + "loss": 0.8076, + "step": 25340 + }, + { + "epoch": 0.4069086181158606, + "grad_norm": 0.6770232319831848, + "learning_rate": 4.5070760138854605e-05, + "loss": 0.7994, + "step": 25350 + }, + { + "epoch": 0.40706913433602465, + "grad_norm": 0.6591051816940308, + "learning_rate": 4.5067000729113575e-05, + "loss": 0.6815, + "step": 25360 + }, + { + "epoch": 0.4072296505561887, + "grad_norm": 0.6788187026977539, + "learning_rate": 4.506324004322907e-05, + "loss": 0.7853, + "step": 25370 + }, + { + "epoch": 0.40739016677635276, + "grad_norm": 0.7069053649902344, + "learning_rate": 4.505947808144026e-05, + "loss": 0.7078, + "step": 25380 + }, + { + "epoch": 0.4075506829965168, + "grad_norm": 0.9749802947044373, + "learning_rate": 4.5055714843986374e-05, + "loss": 0.7754, + "step": 25390 + }, + { + "epoch": 0.40771119921668086, + "grad_norm": 1.0444962978363037, + "learning_rate": 4.505195033110675e-05, + "loss": 0.8137, + "step": 25400 + }, + { + "epoch": 0.4078717154368449, + "grad_norm": 0.7288854718208313, + "learning_rate": 4.5048184543040774e-05, + "loss": 0.8419, + "step": 25410 + }, + { + "epoch": 0.4080322316570089, + "grad_norm": 0.7674839496612549, + "learning_rate": 4.504441748002793e-05, + "loss": 0.7541, + "step": 25420 + }, + { + "epoch": 0.40819274787717297, + "grad_norm": 0.6403718590736389, + "learning_rate": 4.504064914230778e-05, + "loss": 0.7149, + "step": 25430 + }, + { + "epoch": 0.408353264097337, + "grad_norm": 0.7159228920936584, + "learning_rate": 4.503687953011998e-05, + "loss": 0.7518, + "step": 25440 + }, + { + "epoch": 0.4085137803175011, + "grad_norm": 0.7931586503982544, + "learning_rate": 4.5033108643704236e-05, + "loss": 0.7752, + "step": 25450 + }, + { + "epoch": 0.4086742965376651, + "grad_norm": 0.9666417837142944, + "learning_rate": 4.5029336483300375e-05, + "loss": 0.8228, + "step": 25460 + }, + { + "epoch": 0.4088348127578292, + "grad_norm": 0.5619932413101196, + "learning_rate": 4.502556304914827e-05, + "loss": 0.7763, + "step": 25470 + }, + { + "epoch": 0.40899532897799323, + "grad_norm": 0.5086804628372192, + "learning_rate": 4.502178834148789e-05, + "loss": 0.7509, + "step": 25480 + }, + { + "epoch": 0.4091558451981573, + "grad_norm": 0.8416545391082764, + "learning_rate": 4.5018012360559294e-05, + "loss": 0.6955, + "step": 25490 + }, + { + "epoch": 0.40931636141832134, + "grad_norm": 0.6998310089111328, + "learning_rate": 4.5014235106602596e-05, + "loss": 0.744, + "step": 25500 + }, + { + "epoch": 0.4094768776384854, + "grad_norm": 0.9111866354942322, + "learning_rate": 4.501045657985803e-05, + "loss": 0.7362, + "step": 25510 + }, + { + "epoch": 0.4096373938586494, + "grad_norm": 0.5711603760719299, + "learning_rate": 4.500667678056586e-05, + "loss": 0.7358, + "step": 25520 + }, + { + "epoch": 0.40979791007881344, + "grad_norm": 0.6176214218139648, + "learning_rate": 4.5002895708966484e-05, + "loss": 0.7028, + "step": 25530 + }, + { + "epoch": 0.4099584262989775, + "grad_norm": 0.662450909614563, + "learning_rate": 4.499911336530034e-05, + "loss": 0.9169, + "step": 25540 + }, + { + "epoch": 0.41011894251914155, + "grad_norm": 0.6060582399368286, + "learning_rate": 4.499532974980798e-05, + "loss": 0.8691, + "step": 25550 + }, + { + "epoch": 0.4102794587393056, + "grad_norm": 1.2691251039505005, + "learning_rate": 4.499154486273e-05, + "loss": 0.7529, + "step": 25560 + }, + { + "epoch": 0.41043997495946966, + "grad_norm": 0.5733831524848938, + "learning_rate": 4.4987758704307104e-05, + "loss": 0.8862, + "step": 25570 + }, + { + "epoch": 0.4106004911796337, + "grad_norm": 0.8366230726242065, + "learning_rate": 4.498397127478007e-05, + "loss": 0.8463, + "step": 25580 + }, + { + "epoch": 0.41076100739979776, + "grad_norm": 0.5599640607833862, + "learning_rate": 4.498018257438976e-05, + "loss": 0.7743, + "step": 25590 + }, + { + "epoch": 0.4109215236199618, + "grad_norm": 0.771367073059082, + "learning_rate": 4.497639260337711e-05, + "loss": 0.8415, + "step": 25600 + }, + { + "epoch": 0.41108203984012587, + "grad_norm": 0.7658674716949463, + "learning_rate": 4.497260136198314e-05, + "loss": 0.7644, + "step": 25610 + }, + { + "epoch": 0.41124255606028987, + "grad_norm": 0.8896645903587341, + "learning_rate": 4.4968808850448954e-05, + "loss": 0.7541, + "step": 25620 + }, + { + "epoch": 0.4114030722804539, + "grad_norm": 0.744754433631897, + "learning_rate": 4.4965015069015724e-05, + "loss": 0.8328, + "step": 25630 + }, + { + "epoch": 0.41156358850061797, + "grad_norm": 0.8065279126167297, + "learning_rate": 4.496122001792472e-05, + "loss": 0.8102, + "step": 25640 + }, + { + "epoch": 0.411724104720782, + "grad_norm": 0.5003470182418823, + "learning_rate": 4.495742369741729e-05, + "loss": 0.8043, + "step": 25650 + }, + { + "epoch": 0.4118846209409461, + "grad_norm": 0.7646490931510925, + "learning_rate": 4.495362610773484e-05, + "loss": 0.8178, + "step": 25660 + }, + { + "epoch": 0.41204513716111013, + "grad_norm": 0.7647542953491211, + "learning_rate": 4.494982724911889e-05, + "loss": 0.8476, + "step": 25670 + }, + { + "epoch": 0.4122056533812742, + "grad_norm": 0.7367228865623474, + "learning_rate": 4.4946027121811015e-05, + "loss": 0.7038, + "step": 25680 + }, + { + "epoch": 0.41236616960143824, + "grad_norm": 0.6035006046295166, + "learning_rate": 4.49422257260529e-05, + "loss": 0.7185, + "step": 25690 + }, + { + "epoch": 0.4125266858216023, + "grad_norm": 0.8641030788421631, + "learning_rate": 4.4938423062086274e-05, + "loss": 0.7714, + "step": 25700 + }, + { + "epoch": 0.41268720204176634, + "grad_norm": 0.893911600112915, + "learning_rate": 4.493461913015296e-05, + "loss": 0.8288, + "step": 25710 + }, + { + "epoch": 0.41284771826193034, + "grad_norm": 0.8260449171066284, + "learning_rate": 4.493081393049488e-05, + "loss": 0.693, + "step": 25720 + }, + { + "epoch": 0.4130082344820944, + "grad_norm": 0.58067786693573, + "learning_rate": 4.492700746335401e-05, + "loss": 0.6781, + "step": 25730 + }, + { + "epoch": 0.41316875070225845, + "grad_norm": 1.0043631792068481, + "learning_rate": 4.492319972897243e-05, + "loss": 0.7569, + "step": 25740 + }, + { + "epoch": 0.4133292669224225, + "grad_norm": 0.5949000716209412, + "learning_rate": 4.4919390727592284e-05, + "loss": 0.8653, + "step": 25750 + }, + { + "epoch": 0.41348978314258655, + "grad_norm": 0.596773087978363, + "learning_rate": 4.491558045945581e-05, + "loss": 0.7355, + "step": 25760 + }, + { + "epoch": 0.4136502993627506, + "grad_norm": 0.5546436309814453, + "learning_rate": 4.4911768924805295e-05, + "loss": 0.7464, + "step": 25770 + }, + { + "epoch": 0.41381081558291466, + "grad_norm": 0.6991166472434998, + "learning_rate": 4.490795612388315e-05, + "loss": 0.7736, + "step": 25780 + }, + { + "epoch": 0.4139713318030787, + "grad_norm": 0.7279170751571655, + "learning_rate": 4.490414205693186e-05, + "loss": 0.8785, + "step": 25790 + }, + { + "epoch": 0.41413184802324277, + "grad_norm": 1.007216453552246, + "learning_rate": 4.4900326724193944e-05, + "loss": 0.8706, + "step": 25800 + }, + { + "epoch": 0.4142923642434068, + "grad_norm": 0.5356006622314453, + "learning_rate": 4.4896510125912064e-05, + "loss": 0.7693, + "step": 25810 + }, + { + "epoch": 0.4144528804635708, + "grad_norm": 0.4963458180427551, + "learning_rate": 4.4892692262328905e-05, + "loss": 0.8858, + "step": 25820 + }, + { + "epoch": 0.41461339668373487, + "grad_norm": 0.8027575612068176, + "learning_rate": 4.488887313368729e-05, + "loss": 0.7289, + "step": 25830 + }, + { + "epoch": 0.4147739129038989, + "grad_norm": 0.561345636844635, + "learning_rate": 4.488505274023007e-05, + "loss": 0.7589, + "step": 25840 + }, + { + "epoch": 0.414934429124063, + "grad_norm": 0.5900092720985413, + "learning_rate": 4.4881231082200215e-05, + "loss": 0.8618, + "step": 25850 + }, + { + "epoch": 0.41509494534422703, + "grad_norm": 0.7609391808509827, + "learning_rate": 4.487740815984075e-05, + "loss": 0.9149, + "step": 25860 + }, + { + "epoch": 0.4152554615643911, + "grad_norm": 0.9404726028442383, + "learning_rate": 4.4873583973394796e-05, + "loss": 0.7462, + "step": 25870 + }, + { + "epoch": 0.41541597778455513, + "grad_norm": 0.5283678770065308, + "learning_rate": 4.486975852310555e-05, + "loss": 0.7031, + "step": 25880 + }, + { + "epoch": 0.4155764940047192, + "grad_norm": 0.7788367867469788, + "learning_rate": 4.4865931809216285e-05, + "loss": 0.7722, + "step": 25890 + }, + { + "epoch": 0.41573701022488324, + "grad_norm": 0.8642920851707458, + "learning_rate": 4.486210383197036e-05, + "loss": 0.7287, + "step": 25900 + }, + { + "epoch": 0.4158975264450473, + "grad_norm": 0.5500311851501465, + "learning_rate": 4.4858274591611205e-05, + "loss": 0.7386, + "step": 25910 + }, + { + "epoch": 0.41605804266521135, + "grad_norm": 0.7521904706954956, + "learning_rate": 4.485444408838234e-05, + "loss": 0.8609, + "step": 25920 + }, + { + "epoch": 0.41621855888537534, + "grad_norm": 0.7565950751304626, + "learning_rate": 4.485061232252737e-05, + "loss": 0.7932, + "step": 25930 + }, + { + "epoch": 0.4163790751055394, + "grad_norm": 0.49843865633010864, + "learning_rate": 4.484677929428997e-05, + "loss": 0.7022, + "step": 25940 + }, + { + "epoch": 0.41653959132570345, + "grad_norm": 0.8471740484237671, + "learning_rate": 4.4842945003913896e-05, + "loss": 0.8238, + "step": 25950 + }, + { + "epoch": 0.4167001075458675, + "grad_norm": 0.8572263717651367, + "learning_rate": 4.483910945164298e-05, + "loss": 0.8827, + "step": 25960 + }, + { + "epoch": 0.41686062376603156, + "grad_norm": 0.5333188772201538, + "learning_rate": 4.483527263772115e-05, + "loss": 0.7999, + "step": 25970 + }, + { + "epoch": 0.4170211399861956, + "grad_norm": 0.9517744183540344, + "learning_rate": 4.4831434562392396e-05, + "loss": 0.7639, + "step": 25980 + }, + { + "epoch": 0.41718165620635966, + "grad_norm": 0.665369987487793, + "learning_rate": 4.482759522590081e-05, + "loss": 0.8042, + "step": 25990 + }, + { + "epoch": 0.4173421724265237, + "grad_norm": 0.7651581764221191, + "learning_rate": 4.4823754628490536e-05, + "loss": 0.7476, + "step": 26000 + }, + { + "epoch": 0.41750268864668777, + "grad_norm": 0.7233784198760986, + "learning_rate": 4.481991277040583e-05, + "loss": 0.8596, + "step": 26010 + }, + { + "epoch": 0.4176632048668518, + "grad_norm": 0.8313022255897522, + "learning_rate": 4.4816069651891004e-05, + "loss": 0.756, + "step": 26020 + }, + { + "epoch": 0.4178237210870158, + "grad_norm": 0.5175731182098389, + "learning_rate": 4.4812225273190446e-05, + "loss": 0.7559, + "step": 26030 + }, + { + "epoch": 0.4179842373071799, + "grad_norm": 0.689504861831665, + "learning_rate": 4.480837963454865e-05, + "loss": 0.7965, + "step": 26040 + }, + { + "epoch": 0.4181447535273439, + "grad_norm": 0.8751513957977295, + "learning_rate": 4.480453273621018e-05, + "loss": 0.8922, + "step": 26050 + }, + { + "epoch": 0.418305269747508, + "grad_norm": 0.5681271553039551, + "learning_rate": 4.480068457841966e-05, + "loss": 0.7717, + "step": 26060 + }, + { + "epoch": 0.41846578596767203, + "grad_norm": 0.8880943655967712, + "learning_rate": 4.479683516142183e-05, + "loss": 0.8509, + "step": 26070 + }, + { + "epoch": 0.4186263021878361, + "grad_norm": 0.6971122622489929, + "learning_rate": 4.479298448546146e-05, + "loss": 0.7083, + "step": 26080 + }, + { + "epoch": 0.41878681840800014, + "grad_norm": 0.5813639760017395, + "learning_rate": 4.478913255078346e-05, + "loss": 0.7973, + "step": 26090 + }, + { + "epoch": 0.4189473346281642, + "grad_norm": 0.8776058554649353, + "learning_rate": 4.478527935763279e-05, + "loss": 0.8481, + "step": 26100 + }, + { + "epoch": 0.41910785084832824, + "grad_norm": 1.487004041671753, + "learning_rate": 4.478142490625447e-05, + "loss": 0.7543, + "step": 26110 + }, + { + "epoch": 0.4192683670684923, + "grad_norm": 0.8894581198692322, + "learning_rate": 4.4777569196893633e-05, + "loss": 0.8182, + "step": 26120 + }, + { + "epoch": 0.4194288832886563, + "grad_norm": 1.1202421188354492, + "learning_rate": 4.477371222979548e-05, + "loss": 0.8123, + "step": 26130 + }, + { + "epoch": 0.41958939950882035, + "grad_norm": 0.6067134737968445, + "learning_rate": 4.476985400520529e-05, + "loss": 0.7525, + "step": 26140 + }, + { + "epoch": 0.4197499157289844, + "grad_norm": 0.562217652797699, + "learning_rate": 4.476599452336841e-05, + "loss": 0.7298, + "step": 26150 + }, + { + "epoch": 0.41991043194914845, + "grad_norm": 0.748976469039917, + "learning_rate": 4.476213378453031e-05, + "loss": 0.7779, + "step": 26160 + }, + { + "epoch": 0.4200709481693125, + "grad_norm": 0.794520914554596, + "learning_rate": 4.4758271788936484e-05, + "loss": 0.7646, + "step": 26170 + }, + { + "epoch": 0.42023146438947656, + "grad_norm": 0.8652470707893372, + "learning_rate": 4.475440853683255e-05, + "loss": 0.8593, + "step": 26180 + }, + { + "epoch": 0.4203919806096406, + "grad_norm": 0.6524502038955688, + "learning_rate": 4.475054402846417e-05, + "loss": 0.7368, + "step": 26190 + }, + { + "epoch": 0.42055249682980467, + "grad_norm": 0.8017826080322266, + "learning_rate": 4.4746678264077113e-05, + "loss": 0.8269, + "step": 26200 + }, + { + "epoch": 0.4207130130499687, + "grad_norm": 0.7184417843818665, + "learning_rate": 4.474281124391723e-05, + "loss": 0.8322, + "step": 26210 + }, + { + "epoch": 0.4208735292701328, + "grad_norm": 0.7461978793144226, + "learning_rate": 4.473894296823043e-05, + "loss": 0.8096, + "step": 26220 + }, + { + "epoch": 0.42103404549029677, + "grad_norm": 0.7483953237533569, + "learning_rate": 4.47350734372627e-05, + "loss": 0.7546, + "step": 26230 + }, + { + "epoch": 0.4211945617104608, + "grad_norm": 0.5690829157829285, + "learning_rate": 4.473120265126014e-05, + "loss": 0.8473, + "step": 26240 + }, + { + "epoch": 0.4213550779306249, + "grad_norm": 0.7566683888435364, + "learning_rate": 4.47273306104689e-05, + "loss": 0.7402, + "step": 26250 + }, + { + "epoch": 0.42151559415078893, + "grad_norm": 0.5186206698417664, + "learning_rate": 4.472345731513522e-05, + "loss": 0.9424, + "step": 26260 + }, + { + "epoch": 0.421676110370953, + "grad_norm": 0.6196357607841492, + "learning_rate": 4.4719582765505425e-05, + "loss": 0.7707, + "step": 26270 + }, + { + "epoch": 0.42183662659111704, + "grad_norm": 0.6412242650985718, + "learning_rate": 4.47157069618259e-05, + "loss": 0.8171, + "step": 26280 + }, + { + "epoch": 0.4219971428112811, + "grad_norm": 0.5959380269050598, + "learning_rate": 4.4711829904343136e-05, + "loss": 0.7867, + "step": 26290 + }, + { + "epoch": 0.42215765903144514, + "grad_norm": 0.8295561671257019, + "learning_rate": 4.4707951593303685e-05, + "loss": 0.7497, + "step": 26300 + }, + { + "epoch": 0.4223181752516092, + "grad_norm": 0.7001942992210388, + "learning_rate": 4.470407202895419e-05, + "loss": 0.891, + "step": 26310 + }, + { + "epoch": 0.42247869147177325, + "grad_norm": 0.5174795985221863, + "learning_rate": 4.470019121154137e-05, + "loss": 0.7099, + "step": 26320 + }, + { + "epoch": 0.42263920769193725, + "grad_norm": 0.4643927216529846, + "learning_rate": 4.469630914131201e-05, + "loss": 0.816, + "step": 26330 + }, + { + "epoch": 0.4227997239121013, + "grad_norm": 0.4730410873889923, + "learning_rate": 4.4692425818512997e-05, + "loss": 0.7967, + "step": 26340 + }, + { + "epoch": 0.42296024013226535, + "grad_norm": 0.9865112900733948, + "learning_rate": 4.468854124339129e-05, + "loss": 0.8965, + "step": 26350 + }, + { + "epoch": 0.4231207563524294, + "grad_norm": 1.7899413108825684, + "learning_rate": 4.468465541619391e-05, + "loss": 0.7127, + "step": 26360 + }, + { + "epoch": 0.42328127257259346, + "grad_norm": 1.0242228507995605, + "learning_rate": 4.468076833716799e-05, + "loss": 0.8408, + "step": 26370 + }, + { + "epoch": 0.4234417887927575, + "grad_norm": 0.9304213523864746, + "learning_rate": 4.467688000656071e-05, + "loss": 0.7773, + "step": 26380 + }, + { + "epoch": 0.42360230501292157, + "grad_norm": 0.41810446977615356, + "learning_rate": 4.4672990424619355e-05, + "loss": 0.7774, + "step": 26390 + }, + { + "epoch": 0.4237628212330856, + "grad_norm": 0.5603705048561096, + "learning_rate": 4.4669099591591276e-05, + "loss": 0.6967, + "step": 26400 + }, + { + "epoch": 0.42392333745324967, + "grad_norm": 0.7455936670303345, + "learning_rate": 4.466520750772392e-05, + "loss": 0.8537, + "step": 26410 + }, + { + "epoch": 0.4240838536734137, + "grad_norm": 0.5762329697608948, + "learning_rate": 4.466131417326478e-05, + "loss": 0.8023, + "step": 26420 + }, + { + "epoch": 0.4242443698935777, + "grad_norm": 0.7126042246818542, + "learning_rate": 4.465741958846147e-05, + "loss": 0.7806, + "step": 26430 + }, + { + "epoch": 0.4244048861137418, + "grad_norm": 0.7919391989707947, + "learning_rate": 4.465352375356164e-05, + "loss": 0.7201, + "step": 26440 + }, + { + "epoch": 0.42456540233390583, + "grad_norm": 0.791644811630249, + "learning_rate": 4.464962666881305e-05, + "loss": 0.839, + "step": 26450 + }, + { + "epoch": 0.4247259185540699, + "grad_norm": 0.4493503272533417, + "learning_rate": 4.4645728334463545e-05, + "loss": 0.773, + "step": 26460 + }, + { + "epoch": 0.42488643477423393, + "grad_norm": 0.6722261309623718, + "learning_rate": 4.4641828750761026e-05, + "loss": 0.7407, + "step": 26470 + }, + { + "epoch": 0.425046950994398, + "grad_norm": 0.5178573727607727, + "learning_rate": 4.463792791795348e-05, + "loss": 0.7656, + "step": 26480 + }, + { + "epoch": 0.42520746721456204, + "grad_norm": 0.5902160406112671, + "learning_rate": 4.4634025836288986e-05, + "loss": 0.7251, + "step": 26490 + }, + { + "epoch": 0.4253679834347261, + "grad_norm": 0.5122612714767456, + "learning_rate": 4.4630122506015685e-05, + "loss": 0.8524, + "step": 26500 + }, + { + "epoch": 0.42552849965489015, + "grad_norm": 0.6587738990783691, + "learning_rate": 4.462621792738181e-05, + "loss": 0.8467, + "step": 26510 + }, + { + "epoch": 0.4256890158750542, + "grad_norm": 0.6771251559257507, + "learning_rate": 4.462231210063568e-05, + "loss": 0.9072, + "step": 26520 + }, + { + "epoch": 0.4258495320952182, + "grad_norm": 0.7958436608314514, + "learning_rate": 4.4618405026025656e-05, + "loss": 0.8534, + "step": 26530 + }, + { + "epoch": 0.42601004831538225, + "grad_norm": 0.8012518882751465, + "learning_rate": 4.4614496703800225e-05, + "loss": 0.8421, + "step": 26540 + }, + { + "epoch": 0.4261705645355463, + "grad_norm": 0.6730217337608337, + "learning_rate": 4.461058713420793e-05, + "loss": 0.7616, + "step": 26550 + }, + { + "epoch": 0.42633108075571036, + "grad_norm": 0.6121853590011597, + "learning_rate": 4.46066763174974e-05, + "loss": 0.8726, + "step": 26560 + }, + { + "epoch": 0.4264915969758744, + "grad_norm": 0.6428724527359009, + "learning_rate": 4.460276425391733e-05, + "loss": 0.7955, + "step": 26570 + }, + { + "epoch": 0.42665211319603846, + "grad_norm": 0.7657767534255981, + "learning_rate": 4.4598850943716514e-05, + "loss": 0.8339, + "step": 26580 + }, + { + "epoch": 0.4268126294162025, + "grad_norm": 1.1492704153060913, + "learning_rate": 4.45949363871438e-05, + "loss": 0.6923, + "step": 26590 + }, + { + "epoch": 0.42697314563636657, + "grad_norm": 0.9895263314247131, + "learning_rate": 4.459102058444816e-05, + "loss": 0.8428, + "step": 26600 + }, + { + "epoch": 0.4271336618565306, + "grad_norm": 0.753165602684021, + "learning_rate": 4.458710353587859e-05, + "loss": 0.8501, + "step": 26610 + }, + { + "epoch": 0.4272941780766947, + "grad_norm": 0.5952677726745605, + "learning_rate": 4.45831852416842e-05, + "loss": 0.7372, + "step": 26620 + }, + { + "epoch": 0.4274546942968587, + "grad_norm": 1.4860609769821167, + "learning_rate": 4.457926570211417e-05, + "loss": 0.7221, + "step": 26630 + }, + { + "epoch": 0.4276152105170227, + "grad_norm": 0.5871453285217285, + "learning_rate": 4.457534491741776e-05, + "loss": 0.7978, + "step": 26640 + }, + { + "epoch": 0.4277757267371868, + "grad_norm": 2.7540345191955566, + "learning_rate": 4.457142288784431e-05, + "loss": 0.8985, + "step": 26650 + }, + { + "epoch": 0.42793624295735083, + "grad_norm": 0.5464576482772827, + "learning_rate": 4.456749961364324e-05, + "loss": 0.7836, + "step": 26660 + }, + { + "epoch": 0.4280967591775149, + "grad_norm": 0.6010030508041382, + "learning_rate": 4.456357509506404e-05, + "loss": 0.698, + "step": 26670 + }, + { + "epoch": 0.42825727539767894, + "grad_norm": 0.7287788391113281, + "learning_rate": 4.455964933235629e-05, + "loss": 0.7418, + "step": 26680 + }, + { + "epoch": 0.428417791617843, + "grad_norm": 0.589003324508667, + "learning_rate": 4.455572232576964e-05, + "loss": 0.7113, + "step": 26690 + }, + { + "epoch": 0.42857830783800704, + "grad_norm": 0.9084124565124512, + "learning_rate": 4.455179407555384e-05, + "loss": 0.7717, + "step": 26700 + }, + { + "epoch": 0.4287388240581711, + "grad_norm": 0.7361232042312622, + "learning_rate": 4.454786458195869e-05, + "loss": 0.8314, + "step": 26710 + }, + { + "epoch": 0.42889934027833515, + "grad_norm": 0.6028701066970825, + "learning_rate": 4.4543933845234096e-05, + "loss": 0.7555, + "step": 26720 + }, + { + "epoch": 0.42905985649849915, + "grad_norm": 0.6133211851119995, + "learning_rate": 4.454000186563001e-05, + "loss": 0.6657, + "step": 26730 + }, + { + "epoch": 0.4292203727186632, + "grad_norm": 0.6498537659645081, + "learning_rate": 4.45360686433965e-05, + "loss": 0.8047, + "step": 26740 + }, + { + "epoch": 0.42938088893882725, + "grad_norm": 0.5860034227371216, + "learning_rate": 4.453213417878368e-05, + "loss": 0.7979, + "step": 26750 + }, + { + "epoch": 0.4295414051589913, + "grad_norm": 1.127522587776184, + "learning_rate": 4.452819847204177e-05, + "loss": 0.8215, + "step": 26760 + }, + { + "epoch": 0.42970192137915536, + "grad_norm": 0.4870767295360565, + "learning_rate": 4.4524261523421065e-05, + "loss": 0.8709, + "step": 26770 + }, + { + "epoch": 0.4298624375993194, + "grad_norm": 0.6741671562194824, + "learning_rate": 4.4520323333171923e-05, + "loss": 0.7419, + "step": 26780 + }, + { + "epoch": 0.43002295381948347, + "grad_norm": 0.3895234763622284, + "learning_rate": 4.4516383901544775e-05, + "loss": 0.7114, + "step": 26790 + }, + { + "epoch": 0.4301834700396475, + "grad_norm": 0.9474360942840576, + "learning_rate": 4.451244322879018e-05, + "loss": 0.775, + "step": 26800 + }, + { + "epoch": 0.4303439862598116, + "grad_norm": 1.8713579177856445, + "learning_rate": 4.450850131515871e-05, + "loss": 0.68, + "step": 26810 + }, + { + "epoch": 0.4305045024799756, + "grad_norm": 0.8230732679367065, + "learning_rate": 4.450455816090106e-05, + "loss": 0.7098, + "step": 26820 + }, + { + "epoch": 0.4306650187001396, + "grad_norm": 0.966434895992279, + "learning_rate": 4.4500613766268e-05, + "loss": 0.672, + "step": 26830 + }, + { + "epoch": 0.4308255349203037, + "grad_norm": 0.6243548393249512, + "learning_rate": 4.4496668131510345e-05, + "loss": 0.9278, + "step": 26840 + }, + { + "epoch": 0.43098605114046773, + "grad_norm": 1.1842705011367798, + "learning_rate": 4.4492721256879044e-05, + "loss": 0.7451, + "step": 26850 + }, + { + "epoch": 0.4311465673606318, + "grad_norm": 0.9384071826934814, + "learning_rate": 4.448877314262508e-05, + "loss": 0.8473, + "step": 26860 + }, + { + "epoch": 0.43130708358079584, + "grad_norm": 0.6714761257171631, + "learning_rate": 4.448482378899953e-05, + "loss": 0.7911, + "step": 26870 + }, + { + "epoch": 0.4314675998009599, + "grad_norm": 0.930238664150238, + "learning_rate": 4.4480873196253545e-05, + "loss": 0.8687, + "step": 26880 + }, + { + "epoch": 0.43162811602112394, + "grad_norm": 0.6251193881034851, + "learning_rate": 4.447692136463838e-05, + "loss": 0.7577, + "step": 26890 + }, + { + "epoch": 0.431788632241288, + "grad_norm": 0.7241990566253662, + "learning_rate": 4.447296829440532e-05, + "loss": 0.764, + "step": 26900 + }, + { + "epoch": 0.43194914846145205, + "grad_norm": 0.809296727180481, + "learning_rate": 4.4469013985805777e-05, + "loss": 0.7361, + "step": 26910 + }, + { + "epoch": 0.4321096646816161, + "grad_norm": 0.6928038597106934, + "learning_rate": 4.4465058439091215e-05, + "loss": 0.7062, + "step": 26920 + }, + { + "epoch": 0.4322701809017801, + "grad_norm": 0.8356100916862488, + "learning_rate": 4.446110165451319e-05, + "loss": 0.8185, + "step": 26930 + }, + { + "epoch": 0.43243069712194415, + "grad_norm": 0.7882551550865173, + "learning_rate": 4.445714363232332e-05, + "loss": 0.7772, + "step": 26940 + }, + { + "epoch": 0.4325912133421082, + "grad_norm": 0.72243332862854, + "learning_rate": 4.445318437277332e-05, + "loss": 0.7201, + "step": 26950 + }, + { + "epoch": 0.43275172956227226, + "grad_norm": 0.6982499361038208, + "learning_rate": 4.444922387611497e-05, + "loss": 0.8265, + "step": 26960 + }, + { + "epoch": 0.4329122457824363, + "grad_norm": 0.6321655511856079, + "learning_rate": 4.4445262142600133e-05, + "loss": 0.7846, + "step": 26970 + }, + { + "epoch": 0.43307276200260036, + "grad_norm": 0.553848922252655, + "learning_rate": 4.444129917248076e-05, + "loss": 0.8332, + "step": 26980 + }, + { + "epoch": 0.4332332782227644, + "grad_norm": 0.490133672952652, + "learning_rate": 4.4437334966008864e-05, + "loss": 0.7218, + "step": 26990 + }, + { + "epoch": 0.43339379444292847, + "grad_norm": 0.5971039533615112, + "learning_rate": 4.4433369523436555e-05, + "loss": 0.7588, + "step": 27000 + }, + { + "epoch": 0.4335543106630925, + "grad_norm": 0.8737935423851013, + "learning_rate": 4.4429402845016e-05, + "loss": 0.7423, + "step": 27010 + }, + { + "epoch": 0.4337148268832566, + "grad_norm": 0.5920777320861816, + "learning_rate": 4.4425434930999475e-05, + "loss": 0.7575, + "step": 27020 + }, + { + "epoch": 0.4338753431034206, + "grad_norm": 0.7200000286102295, + "learning_rate": 4.44214657816393e-05, + "loss": 0.8391, + "step": 27030 + }, + { + "epoch": 0.4340358593235846, + "grad_norm": 0.866863489151001, + "learning_rate": 4.441749539718789e-05, + "loss": 0.8549, + "step": 27040 + }, + { + "epoch": 0.4341963755437487, + "grad_norm": 0.9664353132247925, + "learning_rate": 4.441352377789774e-05, + "loss": 0.8094, + "step": 27050 + }, + { + "epoch": 0.43435689176391273, + "grad_norm": 0.7067638635635376, + "learning_rate": 4.4409550924021424e-05, + "loss": 0.8271, + "step": 27060 + }, + { + "epoch": 0.4345174079840768, + "grad_norm": 0.7469492554664612, + "learning_rate": 4.44055768358116e-05, + "loss": 0.8255, + "step": 27070 + }, + { + "epoch": 0.43467792420424084, + "grad_norm": 0.7472586035728455, + "learning_rate": 4.4401601513520976e-05, + "loss": 0.6996, + "step": 27080 + }, + { + "epoch": 0.4348384404244049, + "grad_norm": 1.0484514236450195, + "learning_rate": 4.4397624957402386e-05, + "loss": 0.7963, + "step": 27090 + }, + { + "epoch": 0.43499895664456895, + "grad_norm": 0.558362603187561, + "learning_rate": 4.439364716770869e-05, + "loss": 0.7574, + "step": 27100 + }, + { + "epoch": 0.435159472864733, + "grad_norm": 0.9162964224815369, + "learning_rate": 4.4389668144692864e-05, + "loss": 0.8811, + "step": 27110 + }, + { + "epoch": 0.43531998908489705, + "grad_norm": 0.6889044046401978, + "learning_rate": 4.438568788860795e-05, + "loss": 0.8404, + "step": 27120 + }, + { + "epoch": 0.43548050530506105, + "grad_norm": 0.7744122743606567, + "learning_rate": 4.438170639970708e-05, + "loss": 0.7509, + "step": 27130 + }, + { + "epoch": 0.4356410215252251, + "grad_norm": 0.5462216734886169, + "learning_rate": 4.4377723678243435e-05, + "loss": 0.8192, + "step": 27140 + }, + { + "epoch": 0.43580153774538916, + "grad_norm": 0.5279184579849243, + "learning_rate": 4.4373739724470294e-05, + "loss": 0.7285, + "step": 27150 + }, + { + "epoch": 0.4359620539655532, + "grad_norm": 0.6063891053199768, + "learning_rate": 4.436975453864102e-05, + "loss": 0.8295, + "step": 27160 + }, + { + "epoch": 0.43612257018571726, + "grad_norm": 0.6753969788551331, + "learning_rate": 4.436576812100905e-05, + "loss": 0.787, + "step": 27170 + }, + { + "epoch": 0.4362830864058813, + "grad_norm": 0.8856828212738037, + "learning_rate": 4.43617804718279e-05, + "loss": 0.8285, + "step": 27180 + }, + { + "epoch": 0.43644360262604537, + "grad_norm": 0.7369129657745361, + "learning_rate": 4.435779159135114e-05, + "loss": 0.7349, + "step": 27190 + }, + { + "epoch": 0.4366041188462094, + "grad_norm": 0.6938046216964722, + "learning_rate": 4.435380147983246e-05, + "loss": 0.9913, + "step": 27200 + }, + { + "epoch": 0.4367646350663735, + "grad_norm": 0.7514333724975586, + "learning_rate": 4.4349810137525605e-05, + "loss": 0.7942, + "step": 27210 + }, + { + "epoch": 0.4369251512865375, + "grad_norm": 0.6844658255577087, + "learning_rate": 4.434581756468439e-05, + "loss": 0.9214, + "step": 27220 + }, + { + "epoch": 0.4370856675067015, + "grad_norm": 0.9211117029190063, + "learning_rate": 4.434182376156273e-05, + "loss": 0.7274, + "step": 27230 + }, + { + "epoch": 0.4372461837268656, + "grad_norm": 0.6270101070404053, + "learning_rate": 4.43378287284146e-05, + "loss": 0.7955, + "step": 27240 + }, + { + "epoch": 0.43740669994702963, + "grad_norm": 1.0281156301498413, + "learning_rate": 4.433383246549407e-05, + "loss": 0.7701, + "step": 27250 + }, + { + "epoch": 0.4375672161671937, + "grad_norm": 0.6229696273803711, + "learning_rate": 4.432983497305527e-05, + "loss": 0.7569, + "step": 27260 + }, + { + "epoch": 0.43772773238735774, + "grad_norm": 0.6804062724113464, + "learning_rate": 4.432583625135242e-05, + "loss": 0.7212, + "step": 27270 + }, + { + "epoch": 0.4378882486075218, + "grad_norm": 0.6662002801895142, + "learning_rate": 4.432183630063982e-05, + "loss": 0.837, + "step": 27280 + }, + { + "epoch": 0.43804876482768584, + "grad_norm": 1.0897363424301147, + "learning_rate": 4.431783512117184e-05, + "loss": 0.7869, + "step": 27290 + }, + { + "epoch": 0.4382092810478499, + "grad_norm": 1.356842041015625, + "learning_rate": 4.431383271320292e-05, + "loss": 0.8499, + "step": 27300 + }, + { + "epoch": 0.43836979726801395, + "grad_norm": 1.0216854810714722, + "learning_rate": 4.430982907698761e-05, + "loss": 0.8511, + "step": 27310 + }, + { + "epoch": 0.438530313488178, + "grad_norm": 0.6667689681053162, + "learning_rate": 4.43058242127805e-05, + "loss": 0.7731, + "step": 27320 + }, + { + "epoch": 0.438690829708342, + "grad_norm": 0.5599516034126282, + "learning_rate": 4.430181812083629e-05, + "loss": 0.7988, + "step": 27330 + }, + { + "epoch": 0.43885134592850605, + "grad_norm": 0.7028535604476929, + "learning_rate": 4.4297810801409735e-05, + "loss": 0.7771, + "step": 27340 + }, + { + "epoch": 0.4390118621486701, + "grad_norm": 0.7905014157295227, + "learning_rate": 4.429380225475568e-05, + "loss": 0.781, + "step": 27350 + }, + { + "epoch": 0.43917237836883416, + "grad_norm": 0.750091016292572, + "learning_rate": 4.428979248112904e-05, + "loss": 0.9501, + "step": 27360 + }, + { + "epoch": 0.4393328945889982, + "grad_norm": 0.7221774458885193, + "learning_rate": 4.428578148078483e-05, + "loss": 0.7726, + "step": 27370 + }, + { + "epoch": 0.43949341080916227, + "grad_norm": 0.6588048338890076, + "learning_rate": 4.42817692539781e-05, + "loss": 0.8475, + "step": 27380 + }, + { + "epoch": 0.4396539270293263, + "grad_norm": 0.561495840549469, + "learning_rate": 4.4277755800964026e-05, + "loss": 0.7054, + "step": 27390 + }, + { + "epoch": 0.43981444324949037, + "grad_norm": 0.9468235373497009, + "learning_rate": 4.427374112199784e-05, + "loss": 0.7908, + "step": 27400 + }, + { + "epoch": 0.4399749594696544, + "grad_norm": 0.46467891335487366, + "learning_rate": 4.4269725217334826e-05, + "loss": 0.819, + "step": 27410 + }, + { + "epoch": 0.4401354756898185, + "grad_norm": 0.8108429908752441, + "learning_rate": 4.42657080872304e-05, + "loss": 0.7914, + "step": 27420 + }, + { + "epoch": 0.44029599190998253, + "grad_norm": 0.6685031056404114, + "learning_rate": 4.4261689731940015e-05, + "loss": 0.6377, + "step": 27430 + }, + { + "epoch": 0.44045650813014653, + "grad_norm": 0.6427508592605591, + "learning_rate": 4.425767015171922e-05, + "loss": 0.7657, + "step": 27440 + }, + { + "epoch": 0.4406170243503106, + "grad_norm": 0.7445377707481384, + "learning_rate": 4.425364934682363e-05, + "loss": 0.781, + "step": 27450 + }, + { + "epoch": 0.44077754057047464, + "grad_norm": 0.5718990564346313, + "learning_rate": 4.4249627317508956e-05, + "loss": 0.7576, + "step": 27460 + }, + { + "epoch": 0.4409380567906387, + "grad_norm": 0.37249574065208435, + "learning_rate": 4.424560406403097e-05, + "loss": 0.6378, + "step": 27470 + }, + { + "epoch": 0.44109857301080274, + "grad_norm": 0.5916839241981506, + "learning_rate": 4.424157958664552e-05, + "loss": 0.9204, + "step": 27480 + }, + { + "epoch": 0.4412590892309668, + "grad_norm": 0.7071251273155212, + "learning_rate": 4.423755388560854e-05, + "loss": 0.8255, + "step": 27490 + }, + { + "epoch": 0.44141960545113085, + "grad_norm": 0.757023811340332, + "learning_rate": 4.423352696117605e-05, + "loss": 0.8675, + "step": 27500 + }, + { + "epoch": 0.4415801216712949, + "grad_norm": 0.8819933533668518, + "learning_rate": 4.422949881360414e-05, + "loss": 0.8657, + "step": 27510 + }, + { + "epoch": 0.44174063789145895, + "grad_norm": 0.8755509853363037, + "learning_rate": 4.422546944314897e-05, + "loss": 0.7458, + "step": 27520 + }, + { + "epoch": 0.441901154111623, + "grad_norm": 0.6529979705810547, + "learning_rate": 4.422143885006678e-05, + "loss": 0.8044, + "step": 27530 + }, + { + "epoch": 0.442061670331787, + "grad_norm": 0.566278874874115, + "learning_rate": 4.42174070346139e-05, + "loss": 0.7993, + "step": 27540 + }, + { + "epoch": 0.44222218655195106, + "grad_norm": 0.789067804813385, + "learning_rate": 4.421337399704672e-05, + "loss": 0.7848, + "step": 27550 + }, + { + "epoch": 0.4423827027721151, + "grad_norm": 2.0815443992614746, + "learning_rate": 4.4209339737621746e-05, + "loss": 0.7078, + "step": 27560 + }, + { + "epoch": 0.44254321899227916, + "grad_norm": 0.7802446484565735, + "learning_rate": 4.420530425659549e-05, + "loss": 0.7158, + "step": 27570 + }, + { + "epoch": 0.4427037352124432, + "grad_norm": 0.7983391880989075, + "learning_rate": 4.4201267554224624e-05, + "loss": 0.8311, + "step": 27580 + }, + { + "epoch": 0.44286425143260727, + "grad_norm": 0.6692320704460144, + "learning_rate": 4.419722963076583e-05, + "loss": 0.82, + "step": 27590 + }, + { + "epoch": 0.4430247676527713, + "grad_norm": 0.7410433888435364, + "learning_rate": 4.419319048647591e-05, + "loss": 0.7259, + "step": 27600 + }, + { + "epoch": 0.4431852838729354, + "grad_norm": 0.9267972111701965, + "learning_rate": 4.4189150121611736e-05, + "loss": 0.7395, + "step": 27610 + }, + { + "epoch": 0.44334580009309943, + "grad_norm": 0.5969142913818359, + "learning_rate": 4.418510853643023e-05, + "loss": 0.6921, + "step": 27620 + }, + { + "epoch": 0.4435063163132635, + "grad_norm": 0.8814087510108948, + "learning_rate": 4.418106573118843e-05, + "loss": 0.8688, + "step": 27630 + }, + { + "epoch": 0.4436668325334275, + "grad_norm": 0.6739051342010498, + "learning_rate": 4.4177021706143434e-05, + "loss": 0.8218, + "step": 27640 + }, + { + "epoch": 0.44382734875359153, + "grad_norm": 0.7281884551048279, + "learning_rate": 4.417297646155242e-05, + "loss": 0.7574, + "step": 27650 + }, + { + "epoch": 0.4439878649737556, + "grad_norm": 1.3058174848556519, + "learning_rate": 4.416892999767262e-05, + "loss": 0.8481, + "step": 27660 + }, + { + "epoch": 0.44414838119391964, + "grad_norm": 0.7837875485420227, + "learning_rate": 4.41648823147614e-05, + "loss": 0.711, + "step": 27670 + }, + { + "epoch": 0.4443088974140837, + "grad_norm": 0.8412690758705139, + "learning_rate": 4.4160833413076145e-05, + "loss": 0.8457, + "step": 27680 + }, + { + "epoch": 0.44446941363424775, + "grad_norm": 0.7972171306610107, + "learning_rate": 4.4156783292874344e-05, + "loss": 0.7518, + "step": 27690 + }, + { + "epoch": 0.4446299298544118, + "grad_norm": 1.036928653717041, + "learning_rate": 4.415273195441357e-05, + "loss": 0.7152, + "step": 27700 + }, + { + "epoch": 0.44479044607457585, + "grad_norm": 1.1777012348175049, + "learning_rate": 4.4148679397951456e-05, + "loss": 0.7578, + "step": 27710 + }, + { + "epoch": 0.4449509622947399, + "grad_norm": 0.49075058102607727, + "learning_rate": 4.4144625623745724e-05, + "loss": 0.9078, + "step": 27720 + }, + { + "epoch": 0.44511147851490396, + "grad_norm": 0.7180351614952087, + "learning_rate": 4.414057063205417e-05, + "loss": 0.7169, + "step": 27730 + }, + { + "epoch": 0.44527199473506796, + "grad_norm": 0.7162033319473267, + "learning_rate": 4.413651442313467e-05, + "loss": 0.694, + "step": 27740 + }, + { + "epoch": 0.445432510955232, + "grad_norm": 0.6978956460952759, + "learning_rate": 4.4132456997245165e-05, + "loss": 0.8306, + "step": 27750 + }, + { + "epoch": 0.44559302717539606, + "grad_norm": 1.300992488861084, + "learning_rate": 4.412839835464369e-05, + "loss": 0.6604, + "step": 27760 + }, + { + "epoch": 0.4457535433955601, + "grad_norm": 0.8368169069290161, + "learning_rate": 4.412433849558837e-05, + "loss": 0.9056, + "step": 27770 + }, + { + "epoch": 0.44591405961572417, + "grad_norm": 0.7826017737388611, + "learning_rate": 4.412027742033735e-05, + "loss": 0.8746, + "step": 27780 + }, + { + "epoch": 0.4460745758358882, + "grad_norm": 0.6479897499084473, + "learning_rate": 4.411621512914893e-05, + "loss": 0.6988, + "step": 27790 + }, + { + "epoch": 0.4462350920560523, + "grad_norm": 2.544811964035034, + "learning_rate": 4.411215162228141e-05, + "loss": 0.7223, + "step": 27800 + }, + { + "epoch": 0.4463956082762163, + "grad_norm": 0.595211386680603, + "learning_rate": 4.4108086899993235e-05, + "loss": 0.8353, + "step": 27810 + }, + { + "epoch": 0.4465561244963804, + "grad_norm": 0.7154767513275146, + "learning_rate": 4.410402096254288e-05, + "loss": 0.8006, + "step": 27820 + }, + { + "epoch": 0.44671664071654443, + "grad_norm": 1.085875153541565, + "learning_rate": 4.4099953810188924e-05, + "loss": 0.7824, + "step": 27830 + }, + { + "epoch": 0.44687715693670843, + "grad_norm": 0.5088634490966797, + "learning_rate": 4.409588544319001e-05, + "loss": 0.733, + "step": 27840 + }, + { + "epoch": 0.4470376731568725, + "grad_norm": 0.6802466511726379, + "learning_rate": 4.409181586180486e-05, + "loss": 0.8271, + "step": 27850 + }, + { + "epoch": 0.44719818937703654, + "grad_norm": 0.6940221190452576, + "learning_rate": 4.4087745066292286e-05, + "loss": 0.9062, + "step": 27860 + }, + { + "epoch": 0.4473587055972006, + "grad_norm": 0.8006892204284668, + "learning_rate": 4.408367305691116e-05, + "loss": 0.8061, + "step": 27870 + }, + { + "epoch": 0.44751922181736464, + "grad_norm": 0.5013044476509094, + "learning_rate": 4.407959983392043e-05, + "loss": 0.8305, + "step": 27880 + }, + { + "epoch": 0.4476797380375287, + "grad_norm": 0.7768628597259521, + "learning_rate": 4.407552539757914e-05, + "loss": 0.8679, + "step": 27890 + }, + { + "epoch": 0.44784025425769275, + "grad_norm": 0.5721296072006226, + "learning_rate": 4.40714497481464e-05, + "loss": 0.8562, + "step": 27900 + }, + { + "epoch": 0.4480007704778568, + "grad_norm": 1.0607661008834839, + "learning_rate": 4.406737288588139e-05, + "loss": 0.7989, + "step": 27910 + }, + { + "epoch": 0.44816128669802086, + "grad_norm": 0.9355794787406921, + "learning_rate": 4.406329481104338e-05, + "loss": 0.8048, + "step": 27920 + }, + { + "epoch": 0.4483218029181849, + "grad_norm": 0.9326393008232117, + "learning_rate": 4.405921552389171e-05, + "loss": 0.8301, + "step": 27930 + }, + { + "epoch": 0.4484823191383489, + "grad_norm": 0.8735690116882324, + "learning_rate": 4.40551350246858e-05, + "loss": 0.7217, + "step": 27940 + }, + { + "epoch": 0.44864283535851296, + "grad_norm": 0.6691705584526062, + "learning_rate": 4.405105331368514e-05, + "loss": 0.8651, + "step": 27950 + }, + { + "epoch": 0.448803351578677, + "grad_norm": 0.6603869795799255, + "learning_rate": 4.404697039114931e-05, + "loss": 0.8376, + "step": 27960 + }, + { + "epoch": 0.44896386779884107, + "grad_norm": 0.6665390729904175, + "learning_rate": 4.4042886257337954e-05, + "loss": 0.7798, + "step": 27970 + }, + { + "epoch": 0.4491243840190051, + "grad_norm": 0.5204685926437378, + "learning_rate": 4.403880091251081e-05, + "loss": 0.8024, + "step": 27980 + }, + { + "epoch": 0.44928490023916917, + "grad_norm": 0.5900185704231262, + "learning_rate": 4.4034714356927675e-05, + "loss": 0.9261, + "step": 27990 + }, + { + "epoch": 0.4494454164593332, + "grad_norm": 1.117396354675293, + "learning_rate": 4.4030626590848425e-05, + "loss": 0.8465, + "step": 28000 + }, + { + "epoch": 0.4494454164593332, + "eval_loss": 0.7943926453590393, + "eval_runtime": 1820.5868, + "eval_samples_per_second": 14.408, + "eval_steps_per_second": 1.801, + "step": 28000 + }, + { + "epoch": 0.4496059326794973, + "grad_norm": 0.6931092143058777, + "learning_rate": 4.4026537614533026e-05, + "loss": 0.8412, + "step": 28010 + }, + { + "epoch": 0.44976644889966133, + "grad_norm": 0.605839729309082, + "learning_rate": 4.40224474282415e-05, + "loss": 0.7418, + "step": 28020 + }, + { + "epoch": 0.4499269651198254, + "grad_norm": 0.6419365406036377, + "learning_rate": 4.401835603223398e-05, + "loss": 0.7829, + "step": 28030 + }, + { + "epoch": 0.4500874813399894, + "grad_norm": 1.0612138509750366, + "learning_rate": 4.401426342677063e-05, + "loss": 0.8014, + "step": 28040 + }, + { + "epoch": 0.45024799756015343, + "grad_norm": 0.9188058376312256, + "learning_rate": 4.4010169612111743e-05, + "loss": 0.7274, + "step": 28050 + }, + { + "epoch": 0.4504085137803175, + "grad_norm": 0.9512656927108765, + "learning_rate": 4.400607458851763e-05, + "loss": 0.7576, + "step": 28060 + }, + { + "epoch": 0.45056903000048154, + "grad_norm": 0.9576501250267029, + "learning_rate": 4.400197835624874e-05, + "loss": 0.7123, + "step": 28070 + }, + { + "epoch": 0.4507295462206456, + "grad_norm": 0.6672779321670532, + "learning_rate": 4.3997880915565556e-05, + "loss": 0.7198, + "step": 28080 + }, + { + "epoch": 0.45089006244080965, + "grad_norm": 0.7291007041931152, + "learning_rate": 4.3993782266728645e-05, + "loss": 0.7407, + "step": 28090 + }, + { + "epoch": 0.4510505786609737, + "grad_norm": 0.6929943561553955, + "learning_rate": 4.398968240999867e-05, + "loss": 0.7098, + "step": 28100 + }, + { + "epoch": 0.45121109488113775, + "grad_norm": 0.646226167678833, + "learning_rate": 4.398558134563635e-05, + "loss": 0.7911, + "step": 28110 + }, + { + "epoch": 0.4513716111013018, + "grad_norm": 0.7653489708900452, + "learning_rate": 4.3981479073902495e-05, + "loss": 0.7805, + "step": 28120 + }, + { + "epoch": 0.45153212732146586, + "grad_norm": 0.9290266036987305, + "learning_rate": 4.3977375595057974e-05, + "loss": 0.6997, + "step": 28130 + }, + { + "epoch": 0.45169264354162986, + "grad_norm": 0.6910009980201721, + "learning_rate": 4.397327090936375e-05, + "loss": 0.7573, + "step": 28140 + }, + { + "epoch": 0.4518531597617939, + "grad_norm": 0.681077241897583, + "learning_rate": 4.396916501708086e-05, + "loss": 0.9551, + "step": 28150 + }, + { + "epoch": 0.45201367598195796, + "grad_norm": 0.7271087765693665, + "learning_rate": 4.396505791847042e-05, + "loss": 0.8689, + "step": 28160 + }, + { + "epoch": 0.452174192202122, + "grad_norm": 0.7999036312103271, + "learning_rate": 4.39609496137936e-05, + "loss": 0.8442, + "step": 28170 + }, + { + "epoch": 0.45233470842228607, + "grad_norm": 0.6026002168655396, + "learning_rate": 4.395684010331168e-05, + "loss": 0.8542, + "step": 28180 + }, + { + "epoch": 0.4524952246424501, + "grad_norm": 0.751765787601471, + "learning_rate": 4.3952729387286e-05, + "loss": 0.7311, + "step": 28190 + }, + { + "epoch": 0.4526557408626142, + "grad_norm": 1.1355946063995361, + "learning_rate": 4.394861746597796e-05, + "loss": 0.7324, + "step": 28200 + }, + { + "epoch": 0.45281625708277823, + "grad_norm": 0.9288970232009888, + "learning_rate": 4.3944504339649075e-05, + "loss": 0.8262, + "step": 28210 + }, + { + "epoch": 0.4529767733029423, + "grad_norm": 1.3700438737869263, + "learning_rate": 4.39403900085609e-05, + "loss": 0.7184, + "step": 28220 + }, + { + "epoch": 0.45313728952310633, + "grad_norm": 1.380336880683899, + "learning_rate": 4.393627447297509e-05, + "loss": 0.7902, + "step": 28230 + }, + { + "epoch": 0.45329780574327033, + "grad_norm": 0.8012150526046753, + "learning_rate": 4.3932157733153374e-05, + "loss": 0.9167, + "step": 28240 + }, + { + "epoch": 0.4534583219634344, + "grad_norm": 0.6956283450126648, + "learning_rate": 4.392803978935754e-05, + "loss": 0.8539, + "step": 28250 + }, + { + "epoch": 0.45361883818359844, + "grad_norm": 0.7548871040344238, + "learning_rate": 4.392392064184948e-05, + "loss": 0.7754, + "step": 28260 + }, + { + "epoch": 0.4537793544037625, + "grad_norm": 0.6622401475906372, + "learning_rate": 4.391980029089113e-05, + "loss": 0.7984, + "step": 28270 + }, + { + "epoch": 0.45393987062392654, + "grad_norm": 0.9521957635879517, + "learning_rate": 4.391567873674454e-05, + "loss": 0.7707, + "step": 28280 + }, + { + "epoch": 0.4541003868440906, + "grad_norm": 0.8882354497909546, + "learning_rate": 4.39115559796718e-05, + "loss": 0.8595, + "step": 28290 + }, + { + "epoch": 0.45426090306425465, + "grad_norm": 1.0319477319717407, + "learning_rate": 4.3907432019935097e-05, + "loss": 0.8535, + "step": 28300 + }, + { + "epoch": 0.4544214192844187, + "grad_norm": 0.4857092499732971, + "learning_rate": 4.3903306857796696e-05, + "loss": 0.7799, + "step": 28310 + }, + { + "epoch": 0.45458193550458276, + "grad_norm": 0.7268199324607849, + "learning_rate": 4.389918049351893e-05, + "loss": 0.7937, + "step": 28320 + }, + { + "epoch": 0.4547424517247468, + "grad_norm": 0.764670193195343, + "learning_rate": 4.389505292736421e-05, + "loss": 0.7732, + "step": 28330 + }, + { + "epoch": 0.4549029679449108, + "grad_norm": 0.6096231341362, + "learning_rate": 4.389092415959502e-05, + "loss": 0.77, + "step": 28340 + }, + { + "epoch": 0.45506348416507486, + "grad_norm": 0.5912678241729736, + "learning_rate": 4.3886794190473944e-05, + "loss": 0.725, + "step": 28350 + }, + { + "epoch": 0.4552240003852389, + "grad_norm": 0.677013099193573, + "learning_rate": 4.388266302026361e-05, + "loss": 0.7325, + "step": 28360 + }, + { + "epoch": 0.45538451660540297, + "grad_norm": 0.6365012526512146, + "learning_rate": 4.387853064922674e-05, + "loss": 0.8707, + "step": 28370 + }, + { + "epoch": 0.455545032825567, + "grad_norm": 0.7310138940811157, + "learning_rate": 4.387439707762612e-05, + "loss": 0.7864, + "step": 28380 + }, + { + "epoch": 0.4557055490457311, + "grad_norm": 0.9219204783439636, + "learning_rate": 4.3870262305724625e-05, + "loss": 0.7586, + "step": 28390 + }, + { + "epoch": 0.4558660652658951, + "grad_norm": 0.5998927354812622, + "learning_rate": 4.3866126333785206e-05, + "loss": 0.7625, + "step": 28400 + }, + { + "epoch": 0.4560265814860592, + "grad_norm": 1.104305624961853, + "learning_rate": 4.386198916207089e-05, + "loss": 0.7522, + "step": 28410 + }, + { + "epoch": 0.45618709770622323, + "grad_norm": 1.35391366481781, + "learning_rate": 4.385785079084477e-05, + "loss": 0.6611, + "step": 28420 + }, + { + "epoch": 0.4563476139263873, + "grad_norm": 0.6589605808258057, + "learning_rate": 4.385371122037002e-05, + "loss": 0.8639, + "step": 28430 + }, + { + "epoch": 0.4565081301465513, + "grad_norm": 0.6269050240516663, + "learning_rate": 4.3849570450909904e-05, + "loss": 0.8169, + "step": 28440 + }, + { + "epoch": 0.45666864636671534, + "grad_norm": 0.9773812890052795, + "learning_rate": 4.384542848272774e-05, + "loss": 0.7509, + "step": 28450 + }, + { + "epoch": 0.4568291625868794, + "grad_norm": 0.6572521328926086, + "learning_rate": 4.3841285316086935e-05, + "loss": 0.717, + "step": 28460 + }, + { + "epoch": 0.45698967880704344, + "grad_norm": 0.6002798676490784, + "learning_rate": 4.383714095125096e-05, + "loss": 0.8048, + "step": 28470 + }, + { + "epoch": 0.4571501950272075, + "grad_norm": 1.0349558591842651, + "learning_rate": 4.38329953884834e-05, + "loss": 0.6111, + "step": 28480 + }, + { + "epoch": 0.45731071124737155, + "grad_norm": 0.5172317624092102, + "learning_rate": 4.382884862804787e-05, + "loss": 0.8134, + "step": 28490 + }, + { + "epoch": 0.4574712274675356, + "grad_norm": 1.1347928047180176, + "learning_rate": 4.382470067020807e-05, + "loss": 0.7289, + "step": 28500 + }, + { + "epoch": 0.45763174368769965, + "grad_norm": 0.6180327534675598, + "learning_rate": 4.382055151522781e-05, + "loss": 0.9812, + "step": 28510 + }, + { + "epoch": 0.4577922599078637, + "grad_norm": 1.4759896993637085, + "learning_rate": 4.3816401163370936e-05, + "loss": 0.7862, + "step": 28520 + }, + { + "epoch": 0.45795277612802776, + "grad_norm": 0.5476906299591064, + "learning_rate": 4.381224961490138e-05, + "loss": 0.7628, + "step": 28530 + }, + { + "epoch": 0.45811329234819176, + "grad_norm": 0.8859686851501465, + "learning_rate": 4.380809687008317e-05, + "loss": 0.8508, + "step": 28540 + }, + { + "epoch": 0.4582738085683558, + "grad_norm": 0.6758394241333008, + "learning_rate": 4.3803942929180395e-05, + "loss": 0.7844, + "step": 28550 + }, + { + "epoch": 0.45843432478851986, + "grad_norm": 0.6077747941017151, + "learning_rate": 4.379978779245722e-05, + "loss": 0.7416, + "step": 28560 + }, + { + "epoch": 0.4585948410086839, + "grad_norm": 0.9498460292816162, + "learning_rate": 4.379563146017788e-05, + "loss": 0.7554, + "step": 28570 + }, + { + "epoch": 0.45875535722884797, + "grad_norm": 0.8298545479774475, + "learning_rate": 4.3791473932606696e-05, + "loss": 0.7805, + "step": 28580 + }, + { + "epoch": 0.458915873449012, + "grad_norm": 0.8243091702461243, + "learning_rate": 4.378731521000807e-05, + "loss": 0.6603, + "step": 28590 + }, + { + "epoch": 0.4590763896691761, + "grad_norm": 1.171275019645691, + "learning_rate": 4.378315529264646e-05, + "loss": 0.9216, + "step": 28600 + }, + { + "epoch": 0.45923690588934013, + "grad_norm": 0.4842662513256073, + "learning_rate": 4.377899418078643e-05, + "loss": 0.7424, + "step": 28610 + }, + { + "epoch": 0.4593974221095042, + "grad_norm": 0.45504334568977356, + "learning_rate": 4.3774831874692576e-05, + "loss": 0.7635, + "step": 28620 + }, + { + "epoch": 0.45955793832966824, + "grad_norm": 0.648007869720459, + "learning_rate": 4.3770668374629625e-05, + "loss": 0.8143, + "step": 28630 + }, + { + "epoch": 0.45971845454983223, + "grad_norm": 0.6428667306900024, + "learning_rate": 4.376650368086234e-05, + "loss": 0.7931, + "step": 28640 + }, + { + "epoch": 0.4598789707699963, + "grad_norm": 0.7624646425247192, + "learning_rate": 4.376233779365556e-05, + "loss": 0.8219, + "step": 28650 + }, + { + "epoch": 0.46003948699016034, + "grad_norm": 0.6698193550109863, + "learning_rate": 4.375817071327423e-05, + "loss": 0.6926, + "step": 28660 + }, + { + "epoch": 0.4602000032103244, + "grad_norm": 1.0482820272445679, + "learning_rate": 4.375400243998333e-05, + "loss": 0.7475, + "step": 28670 + }, + { + "epoch": 0.46036051943048845, + "grad_norm": 0.8840123414993286, + "learning_rate": 4.374983297404796e-05, + "loss": 0.8832, + "step": 28680 + }, + { + "epoch": 0.4605210356506525, + "grad_norm": 0.7830825448036194, + "learning_rate": 4.374566231573325e-05, + "loss": 0.7645, + "step": 28690 + }, + { + "epoch": 0.46068155187081655, + "grad_norm": 0.8536558747291565, + "learning_rate": 4.374149046530446e-05, + "loss": 0.849, + "step": 28700 + }, + { + "epoch": 0.4608420680909806, + "grad_norm": 0.505868673324585, + "learning_rate": 4.373731742302686e-05, + "loss": 0.9303, + "step": 28710 + }, + { + "epoch": 0.46100258431114466, + "grad_norm": 0.9979052543640137, + "learning_rate": 4.3733143189165856e-05, + "loss": 0.7621, + "step": 28720 + }, + { + "epoch": 0.4611631005313087, + "grad_norm": 0.7040022015571594, + "learning_rate": 4.372896776398691e-05, + "loss": 0.8668, + "step": 28730 + }, + { + "epoch": 0.4613236167514727, + "grad_norm": 0.9015858173370361, + "learning_rate": 4.372479114775553e-05, + "loss": 0.7293, + "step": 28740 + }, + { + "epoch": 0.46148413297163676, + "grad_norm": 0.6607682108879089, + "learning_rate": 4.372061334073733e-05, + "loss": 0.8043, + "step": 28750 + }, + { + "epoch": 0.4616446491918008, + "grad_norm": 0.7500408291816711, + "learning_rate": 4.371643434319801e-05, + "loss": 0.8416, + "step": 28760 + }, + { + "epoch": 0.46180516541196487, + "grad_norm": 1.2551764249801636, + "learning_rate": 4.371225415540332e-05, + "loss": 0.7386, + "step": 28770 + }, + { + "epoch": 0.4619656816321289, + "grad_norm": 0.501532793045044, + "learning_rate": 4.370807277761909e-05, + "loss": 0.9084, + "step": 28780 + }, + { + "epoch": 0.462126197852293, + "grad_norm": 0.7609218955039978, + "learning_rate": 4.3703890210111245e-05, + "loss": 0.7308, + "step": 28790 + }, + { + "epoch": 0.46228671407245703, + "grad_norm": 0.9288061857223511, + "learning_rate": 4.369970645314575e-05, + "loss": 0.7941, + "step": 28800 + }, + { + "epoch": 0.4624472302926211, + "grad_norm": 0.5450596809387207, + "learning_rate": 4.369552150698869e-05, + "loss": 0.663, + "step": 28810 + }, + { + "epoch": 0.46260774651278513, + "grad_norm": 0.7796692848205566, + "learning_rate": 4.3691335371906196e-05, + "loss": 0.8925, + "step": 28820 + }, + { + "epoch": 0.4627682627329492, + "grad_norm": 0.7628099322319031, + "learning_rate": 4.368714804816447e-05, + "loss": 0.7456, + "step": 28830 + }, + { + "epoch": 0.4629287789531132, + "grad_norm": 0.9282774925231934, + "learning_rate": 4.368295953602982e-05, + "loss": 0.8495, + "step": 28840 + }, + { + "epoch": 0.46308929517327724, + "grad_norm": 1.2956024408340454, + "learning_rate": 4.36787698357686e-05, + "loss": 0.7729, + "step": 28850 + }, + { + "epoch": 0.4632498113934413, + "grad_norm": 0.6601397395133972, + "learning_rate": 4.367457894764725e-05, + "loss": 0.7658, + "step": 28860 + }, + { + "epoch": 0.46341032761360534, + "grad_norm": 0.4742456376552582, + "learning_rate": 4.367038687193229e-05, + "loss": 0.7721, + "step": 28870 + }, + { + "epoch": 0.4635708438337694, + "grad_norm": 0.9957752227783203, + "learning_rate": 4.3666193608890305e-05, + "loss": 0.8536, + "step": 28880 + }, + { + "epoch": 0.46373136005393345, + "grad_norm": 0.6655280590057373, + "learning_rate": 4.366199915878797e-05, + "loss": 0.7544, + "step": 28890 + }, + { + "epoch": 0.4638918762740975, + "grad_norm": 0.6445185542106628, + "learning_rate": 4.3657803521892014e-05, + "loss": 0.7907, + "step": 28900 + }, + { + "epoch": 0.46405239249426156, + "grad_norm": 0.5519848465919495, + "learning_rate": 4.365360669846927e-05, + "loss": 0.8216, + "step": 28910 + }, + { + "epoch": 0.4642129087144256, + "grad_norm": 0.7921583652496338, + "learning_rate": 4.364940868878663e-05, + "loss": 0.8712, + "step": 28920 + }, + { + "epoch": 0.46437342493458966, + "grad_norm": 0.779462993144989, + "learning_rate": 4.364520949311106e-05, + "loss": 0.9258, + "step": 28930 + }, + { + "epoch": 0.46453394115475366, + "grad_norm": 0.8022353053092957, + "learning_rate": 4.3641009111709594e-05, + "loss": 0.7078, + "step": 28940 + }, + { + "epoch": 0.4646944573749177, + "grad_norm": 0.8986223340034485, + "learning_rate": 4.3636807544849364e-05, + "loss": 0.8708, + "step": 28950 + }, + { + "epoch": 0.46485497359508177, + "grad_norm": 0.8981783986091614, + "learning_rate": 4.363260479279756e-05, + "loss": 0.684, + "step": 28960 + }, + { + "epoch": 0.4650154898152458, + "grad_norm": 1.0687438249588013, + "learning_rate": 4.3628400855821447e-05, + "loss": 0.7722, + "step": 28970 + }, + { + "epoch": 0.4651760060354099, + "grad_norm": 0.8285211324691772, + "learning_rate": 4.362419573418839e-05, + "loss": 0.7555, + "step": 28980 + }, + { + "epoch": 0.4653365222555739, + "grad_norm": 0.6209325790405273, + "learning_rate": 4.361998942816579e-05, + "loss": 0.8679, + "step": 28990 + }, + { + "epoch": 0.465497038475738, + "grad_norm": 0.6233692169189453, + "learning_rate": 4.361578193802115e-05, + "loss": 0.7906, + "step": 29000 + }, + { + "epoch": 0.46565755469590203, + "grad_norm": 0.9567122459411621, + "learning_rate": 4.361157326402204e-05, + "loss": 0.7536, + "step": 29010 + }, + { + "epoch": 0.4658180709160661, + "grad_norm": 0.8930002450942993, + "learning_rate": 4.360736340643612e-05, + "loss": 0.8105, + "step": 29020 + }, + { + "epoch": 0.46597858713623014, + "grad_norm": 0.6602556109428406, + "learning_rate": 4.36031523655311e-05, + "loss": 0.8, + "step": 29030 + }, + { + "epoch": 0.4661391033563942, + "grad_norm": 2.091876268386841, + "learning_rate": 4.359894014157477e-05, + "loss": 0.8823, + "step": 29040 + }, + { + "epoch": 0.4662996195765582, + "grad_norm": 0.7717236876487732, + "learning_rate": 4.359472673483501e-05, + "loss": 0.7887, + "step": 29050 + }, + { + "epoch": 0.46646013579672224, + "grad_norm": 0.6584618091583252, + "learning_rate": 4.359051214557978e-05, + "loss": 0.8021, + "step": 29060 + }, + { + "epoch": 0.4666206520168863, + "grad_norm": 0.752008318901062, + "learning_rate": 4.358629637407709e-05, + "loss": 0.8581, + "step": 29070 + }, + { + "epoch": 0.46678116823705035, + "grad_norm": 0.6912227272987366, + "learning_rate": 4.3582079420595037e-05, + "loss": 0.7872, + "step": 29080 + }, + { + "epoch": 0.4669416844572144, + "grad_norm": 0.659751296043396, + "learning_rate": 4.35778612854018e-05, + "loss": 0.7261, + "step": 29090 + }, + { + "epoch": 0.46710220067737845, + "grad_norm": 0.781707227230072, + "learning_rate": 4.357364196876563e-05, + "loss": 0.8286, + "step": 29100 + }, + { + "epoch": 0.4672627168975425, + "grad_norm": 0.6234973669052124, + "learning_rate": 4.356942147095484e-05, + "loss": 0.8105, + "step": 29110 + }, + { + "epoch": 0.46742323311770656, + "grad_norm": 0.5165020227432251, + "learning_rate": 4.356519979223784e-05, + "loss": 0.771, + "step": 29120 + }, + { + "epoch": 0.4675837493378706, + "grad_norm": 0.8332949876785278, + "learning_rate": 4.3560976932883104e-05, + "loss": 0.8077, + "step": 29130 + }, + { + "epoch": 0.46774426555803467, + "grad_norm": 0.8012968897819519, + "learning_rate": 4.355675289315917e-05, + "loss": 0.8148, + "step": 29140 + }, + { + "epoch": 0.46790478177819866, + "grad_norm": 0.6727684736251831, + "learning_rate": 4.355252767333467e-05, + "loss": 0.7241, + "step": 29150 + }, + { + "epoch": 0.4680652979983627, + "grad_norm": 0.6484970450401306, + "learning_rate": 4.3548301273678304e-05, + "loss": 0.6983, + "step": 29160 + }, + { + "epoch": 0.46822581421852677, + "grad_norm": 0.6436958909034729, + "learning_rate": 4.354407369445884e-05, + "loss": 0.8143, + "step": 29170 + }, + { + "epoch": 0.4683863304386908, + "grad_norm": 0.6954262852668762, + "learning_rate": 4.353984493594514e-05, + "loss": 0.7033, + "step": 29180 + }, + { + "epoch": 0.4685468466588549, + "grad_norm": 0.7868225574493408, + "learning_rate": 4.353561499840611e-05, + "loss": 0.6903, + "step": 29190 + }, + { + "epoch": 0.46870736287901893, + "grad_norm": 0.7303930521011353, + "learning_rate": 4.353138388211077e-05, + "loss": 0.834, + "step": 29200 + }, + { + "epoch": 0.468867879099183, + "grad_norm": 0.3097763657569885, + "learning_rate": 4.352715158732818e-05, + "loss": 0.8127, + "step": 29210 + }, + { + "epoch": 0.46902839531934704, + "grad_norm": 0.5527016520500183, + "learning_rate": 4.352291811432749e-05, + "loss": 0.8079, + "step": 29220 + }, + { + "epoch": 0.4691889115395111, + "grad_norm": 1.2938857078552246, + "learning_rate": 4.3518683463377926e-05, + "loss": 0.8852, + "step": 29230 + }, + { + "epoch": 0.46934942775967514, + "grad_norm": 0.7087628841400146, + "learning_rate": 4.351444763474879e-05, + "loss": 0.7105, + "step": 29240 + }, + { + "epoch": 0.46950994397983914, + "grad_norm": 0.7062098979949951, + "learning_rate": 4.351021062870945e-05, + "loss": 0.7822, + "step": 29250 + }, + { + "epoch": 0.4696704602000032, + "grad_norm": 0.7596428394317627, + "learning_rate": 4.3505972445529366e-05, + "loss": 0.6654, + "step": 29260 + }, + { + "epoch": 0.46983097642016725, + "grad_norm": 0.6849297285079956, + "learning_rate": 4.3501733085478046e-05, + "loss": 0.886, + "step": 29270 + }, + { + "epoch": 0.4699914926403313, + "grad_norm": 0.658161461353302, + "learning_rate": 4.349749254882509e-05, + "loss": 0.8276, + "step": 29280 + }, + { + "epoch": 0.47015200886049535, + "grad_norm": 0.7268000245094299, + "learning_rate": 4.349325083584019e-05, + "loss": 0.7801, + "step": 29290 + }, + { + "epoch": 0.4703125250806594, + "grad_norm": 0.8180345892906189, + "learning_rate": 4.348900794679307e-05, + "loss": 0.7937, + "step": 29300 + }, + { + "epoch": 0.47047304130082346, + "grad_norm": 0.4046365022659302, + "learning_rate": 4.3484763881953574e-05, + "loss": 0.7495, + "step": 29310 + }, + { + "epoch": 0.4706335575209875, + "grad_norm": 0.6541575193405151, + "learning_rate": 4.3480518641591585e-05, + "loss": 0.6818, + "step": 29320 + }, + { + "epoch": 0.47079407374115156, + "grad_norm": 0.7584018707275391, + "learning_rate": 4.347627222597708e-05, + "loss": 0.8987, + "step": 29330 + }, + { + "epoch": 0.4709545899613156, + "grad_norm": 0.9362959265708923, + "learning_rate": 4.347202463538011e-05, + "loss": 0.8337, + "step": 29340 + }, + { + "epoch": 0.4711151061814796, + "grad_norm": 0.7821162343025208, + "learning_rate": 4.346777587007079e-05, + "loss": 0.7889, + "step": 29350 + }, + { + "epoch": 0.47127562240164367, + "grad_norm": 0.5021392703056335, + "learning_rate": 4.3463525930319315e-05, + "loss": 0.8446, + "step": 29360 + }, + { + "epoch": 0.4714361386218077, + "grad_norm": 0.7175103425979614, + "learning_rate": 4.345927481639597e-05, + "loss": 0.8907, + "step": 29370 + }, + { + "epoch": 0.4715966548419718, + "grad_norm": 0.7335010766983032, + "learning_rate": 4.345502252857107e-05, + "loss": 0.8993, + "step": 29380 + }, + { + "epoch": 0.4717571710621358, + "grad_norm": 0.9173566699028015, + "learning_rate": 4.3450769067115085e-05, + "loss": 0.8807, + "step": 29390 + }, + { + "epoch": 0.4719176872822999, + "grad_norm": 0.45690029859542847, + "learning_rate": 4.344651443229847e-05, + "loss": 0.8262, + "step": 29400 + }, + { + "epoch": 0.47207820350246393, + "grad_norm": 1.059487223625183, + "learning_rate": 4.344225862439181e-05, + "loss": 0.702, + "step": 29410 + }, + { + "epoch": 0.472238719722628, + "grad_norm": 0.6763082146644592, + "learning_rate": 4.343800164366574e-05, + "loss": 0.8444, + "step": 29420 + }, + { + "epoch": 0.47239923594279204, + "grad_norm": 0.9105213284492493, + "learning_rate": 4.343374349039099e-05, + "loss": 0.7475, + "step": 29430 + }, + { + "epoch": 0.4725597521629561, + "grad_norm": 0.6635963320732117, + "learning_rate": 4.3429484164838356e-05, + "loss": 0.7761, + "step": 29440 + }, + { + "epoch": 0.4727202683831201, + "grad_norm": 0.7060190439224243, + "learning_rate": 4.342522366727869e-05, + "loss": 0.7527, + "step": 29450 + }, + { + "epoch": 0.47288078460328414, + "grad_norm": 0.8705611228942871, + "learning_rate": 4.342096199798294e-05, + "loss": 0.8483, + "step": 29460 + }, + { + "epoch": 0.4730413008234482, + "grad_norm": 0.48388218879699707, + "learning_rate": 4.3416699157222133e-05, + "loss": 0.8061, + "step": 29470 + }, + { + "epoch": 0.47320181704361225, + "grad_norm": 0.6512983441352844, + "learning_rate": 4.3412435145267364e-05, + "loss": 0.8105, + "step": 29480 + }, + { + "epoch": 0.4733623332637763, + "grad_norm": 0.5832868218421936, + "learning_rate": 4.340816996238978e-05, + "loss": 0.8502, + "step": 29490 + }, + { + "epoch": 0.47352284948394036, + "grad_norm": 0.8378017544746399, + "learning_rate": 4.3403903608860626e-05, + "loss": 0.8001, + "step": 29500 + }, + { + "epoch": 0.4736833657041044, + "grad_norm": 1.0181732177734375, + "learning_rate": 4.339963608495123e-05, + "loss": 0.8199, + "step": 29510 + }, + { + "epoch": 0.47384388192426846, + "grad_norm": 0.6476365327835083, + "learning_rate": 4.339536739093297e-05, + "loss": 0.7573, + "step": 29520 + }, + { + "epoch": 0.4740043981444325, + "grad_norm": 0.8465306162834167, + "learning_rate": 4.339109752707732e-05, + "loss": 0.7548, + "step": 29530 + }, + { + "epoch": 0.47416491436459657, + "grad_norm": 0.5893762707710266, + "learning_rate": 4.33868264936558e-05, + "loss": 0.8289, + "step": 29540 + }, + { + "epoch": 0.47432543058476057, + "grad_norm": 0.4058952331542969, + "learning_rate": 4.338255429094005e-05, + "loss": 0.7585, + "step": 29550 + }, + { + "epoch": 0.4744859468049246, + "grad_norm": 0.6057544350624084, + "learning_rate": 4.337828091920173e-05, + "loss": 0.7828, + "step": 29560 + }, + { + "epoch": 0.47464646302508867, + "grad_norm": 1.2511169910430908, + "learning_rate": 4.3374006378712617e-05, + "loss": 0.8377, + "step": 29570 + }, + { + "epoch": 0.4748069792452527, + "grad_norm": 0.6020371913909912, + "learning_rate": 4.336973066974455e-05, + "loss": 0.8176, + "step": 29580 + }, + { + "epoch": 0.4749674954654168, + "grad_norm": 0.6637784242630005, + "learning_rate": 4.336545379256942e-05, + "loss": 0.8687, + "step": 29590 + }, + { + "epoch": 0.47512801168558083, + "grad_norm": 0.5814506411552429, + "learning_rate": 4.336117574745923e-05, + "loss": 0.807, + "step": 29600 + }, + { + "epoch": 0.4752885279057449, + "grad_norm": 0.6854686737060547, + "learning_rate": 4.335689653468603e-05, + "loss": 0.8496, + "step": 29610 + }, + { + "epoch": 0.47544904412590894, + "grad_norm": 0.7349613308906555, + "learning_rate": 4.335261615452195e-05, + "loss": 0.819, + "step": 29620 + }, + { + "epoch": 0.475609560346073, + "grad_norm": 0.5658441781997681, + "learning_rate": 4.3348334607239213e-05, + "loss": 0.8542, + "step": 29630 + }, + { + "epoch": 0.47577007656623704, + "grad_norm": 0.6305376291275024, + "learning_rate": 4.334405189311008e-05, + "loss": 0.7219, + "step": 29640 + }, + { + "epoch": 0.47593059278640104, + "grad_norm": 0.8253692984580994, + "learning_rate": 4.333976801240691e-05, + "loss": 0.7762, + "step": 29650 + }, + { + "epoch": 0.4760911090065651, + "grad_norm": 1.1223697662353516, + "learning_rate": 4.333548296540215e-05, + "loss": 0.7697, + "step": 29660 + }, + { + "epoch": 0.47625162522672915, + "grad_norm": 0.4996967613697052, + "learning_rate": 4.3331196752368286e-05, + "loss": 0.8882, + "step": 29670 + }, + { + "epoch": 0.4764121414468932, + "grad_norm": 1.2250174283981323, + "learning_rate": 4.332690937357791e-05, + "loss": 0.8389, + "step": 29680 + }, + { + "epoch": 0.47657265766705725, + "grad_norm": 0.6846598982810974, + "learning_rate": 4.332262082930365e-05, + "loss": 0.9114, + "step": 29690 + }, + { + "epoch": 0.4767331738872213, + "grad_norm": 0.9290558695793152, + "learning_rate": 4.331833111981826e-05, + "loss": 0.7583, + "step": 29700 + }, + { + "epoch": 0.47689369010738536, + "grad_norm": 0.6650123596191406, + "learning_rate": 4.331404024539452e-05, + "loss": 0.8328, + "step": 29710 + }, + { + "epoch": 0.4770542063275494, + "grad_norm": 0.7360630631446838, + "learning_rate": 4.330974820630532e-05, + "loss": 0.8619, + "step": 29720 + }, + { + "epoch": 0.47721472254771347, + "grad_norm": 1.1315349340438843, + "learning_rate": 4.33054550028236e-05, + "loss": 0.8351, + "step": 29730 + }, + { + "epoch": 0.4773752387678775, + "grad_norm": 0.6896051168441772, + "learning_rate": 4.330116063522238e-05, + "loss": 0.8914, + "step": 29740 + }, + { + "epoch": 0.4775357549880415, + "grad_norm": 0.8569513559341431, + "learning_rate": 4.329686510377476e-05, + "loss": 0.8757, + "step": 29750 + }, + { + "epoch": 0.47769627120820557, + "grad_norm": 0.7677883505821228, + "learning_rate": 4.329256840875392e-05, + "loss": 0.7, + "step": 29760 + }, + { + "epoch": 0.4778567874283696, + "grad_norm": 0.667560338973999, + "learning_rate": 4.328827055043309e-05, + "loss": 0.7486, + "step": 29770 + }, + { + "epoch": 0.4780173036485337, + "grad_norm": 0.7758519649505615, + "learning_rate": 4.328397152908559e-05, + "loss": 0.7003, + "step": 29780 + }, + { + "epoch": 0.47817781986869773, + "grad_norm": 1.0441042184829712, + "learning_rate": 4.327967134498483e-05, + "loss": 0.729, + "step": 29790 + }, + { + "epoch": 0.4783383360888618, + "grad_norm": 0.7854068279266357, + "learning_rate": 4.3275369998404244e-05, + "loss": 0.7849, + "step": 29800 + }, + { + "epoch": 0.47849885230902583, + "grad_norm": 0.6319164037704468, + "learning_rate": 4.32710674896174e-05, + "loss": 0.9261, + "step": 29810 + }, + { + "epoch": 0.4786593685291899, + "grad_norm": 1.0244877338409424, + "learning_rate": 4.326676381889792e-05, + "loss": 0.7795, + "step": 29820 + }, + { + "epoch": 0.47881988474935394, + "grad_norm": 0.9103929400444031, + "learning_rate": 4.326245898651945e-05, + "loss": 0.7319, + "step": 29830 + }, + { + "epoch": 0.478980400969518, + "grad_norm": 0.6928017735481262, + "learning_rate": 4.32581529927558e-05, + "loss": 0.8603, + "step": 29840 + }, + { + "epoch": 0.479140917189682, + "grad_norm": 0.5793690085411072, + "learning_rate": 4.325384583788077e-05, + "loss": 0.7693, + "step": 29850 + }, + { + "epoch": 0.47930143340984604, + "grad_norm": 0.7657708525657654, + "learning_rate": 4.3249537522168295e-05, + "loss": 0.826, + "step": 29860 + }, + { + "epoch": 0.4794619496300101, + "grad_norm": 0.7046738266944885, + "learning_rate": 4.324522804589234e-05, + "loss": 0.6993, + "step": 29870 + }, + { + "epoch": 0.47962246585017415, + "grad_norm": 0.6710096001625061, + "learning_rate": 4.3240917409326965e-05, + "loss": 0.7341, + "step": 29880 + }, + { + "epoch": 0.4797829820703382, + "grad_norm": 0.5445494055747986, + "learning_rate": 4.323660561274631e-05, + "loss": 0.755, + "step": 29890 + }, + { + "epoch": 0.47994349829050226, + "grad_norm": 0.5835373401641846, + "learning_rate": 4.3232292656424575e-05, + "loss": 0.7612, + "step": 29900 + }, + { + "epoch": 0.4801040145106663, + "grad_norm": 0.7528091073036194, + "learning_rate": 4.322797854063605e-05, + "loss": 0.7781, + "step": 29910 + }, + { + "epoch": 0.48026453073083036, + "grad_norm": 0.5030126571655273, + "learning_rate": 4.322366326565507e-05, + "loss": 0.7685, + "step": 29920 + }, + { + "epoch": 0.4804250469509944, + "grad_norm": 0.5784837007522583, + "learning_rate": 4.321934683175607e-05, + "loss": 0.7232, + "step": 29930 + }, + { + "epoch": 0.48058556317115847, + "grad_norm": 0.5963630676269531, + "learning_rate": 4.321502923921355e-05, + "loss": 0.8449, + "step": 29940 + }, + { + "epoch": 0.48074607939132247, + "grad_norm": 0.8827456831932068, + "learning_rate": 4.321071048830208e-05, + "loss": 0.7701, + "step": 29950 + }, + { + "epoch": 0.4809065956114865, + "grad_norm": 0.5178059935569763, + "learning_rate": 4.3206390579296295e-05, + "loss": 0.8404, + "step": 29960 + }, + { + "epoch": 0.4810671118316506, + "grad_norm": 0.9829713702201843, + "learning_rate": 4.3202069512470945e-05, + "loss": 0.7103, + "step": 29970 + }, + { + "epoch": 0.4812276280518146, + "grad_norm": 0.8353303670883179, + "learning_rate": 4.319774728810081e-05, + "loss": 0.7972, + "step": 29980 + }, + { + "epoch": 0.4813881442719787, + "grad_norm": 1.0501463413238525, + "learning_rate": 4.3193423906460754e-05, + "loss": 0.8002, + "step": 29990 + }, + { + "epoch": 0.48154866049214273, + "grad_norm": 0.7143295407295227, + "learning_rate": 4.318909936782572e-05, + "loss": 0.7623, + "step": 30000 + }, + { + "epoch": 0.4817091767123068, + "grad_norm": 0.7371399998664856, + "learning_rate": 4.3184773672470726e-05, + "loss": 0.786, + "step": 30010 + }, + { + "epoch": 0.48186969293247084, + "grad_norm": 0.7729040384292603, + "learning_rate": 4.318044682067086e-05, + "loss": 0.8712, + "step": 30020 + }, + { + "epoch": 0.4820302091526349, + "grad_norm": 0.5732158422470093, + "learning_rate": 4.317611881270129e-05, + "loss": 0.8098, + "step": 30030 + }, + { + "epoch": 0.48219072537279895, + "grad_norm": 0.815724790096283, + "learning_rate": 4.317178964883724e-05, + "loss": 0.7699, + "step": 30040 + }, + { + "epoch": 0.48235124159296294, + "grad_norm": 0.6834113001823425, + "learning_rate": 4.3167459329354034e-05, + "loss": 0.7757, + "step": 30050 + }, + { + "epoch": 0.482511757813127, + "grad_norm": 0.7599489092826843, + "learning_rate": 4.3163127854527035e-05, + "loss": 0.7372, + "step": 30060 + }, + { + "epoch": 0.48267227403329105, + "grad_norm": 0.9214230179786682, + "learning_rate": 4.315879522463172e-05, + "loss": 0.6987, + "step": 30070 + }, + { + "epoch": 0.4828327902534551, + "grad_norm": 0.5849825143814087, + "learning_rate": 4.315446143994361e-05, + "loss": 0.8928, + "step": 30080 + }, + { + "epoch": 0.48299330647361916, + "grad_norm": 0.7596210837364197, + "learning_rate": 4.31501265007383e-05, + "loss": 0.7962, + "step": 30090 + }, + { + "epoch": 0.4831538226937832, + "grad_norm": 0.5409344434738159, + "learning_rate": 4.3145790407291485e-05, + "loss": 0.655, + "step": 30100 + }, + { + "epoch": 0.48331433891394726, + "grad_norm": 1.1639219522476196, + "learning_rate": 4.31414531598789e-05, + "loss": 0.8739, + "step": 30110 + }, + { + "epoch": 0.4834748551341113, + "grad_norm": 0.6200413703918457, + "learning_rate": 4.313711475877638e-05, + "loss": 0.7908, + "step": 30120 + }, + { + "epoch": 0.48363537135427537, + "grad_norm": 0.6156409978866577, + "learning_rate": 4.31327752042598e-05, + "loss": 0.6811, + "step": 30130 + }, + { + "epoch": 0.4837958875744394, + "grad_norm": 0.614716649055481, + "learning_rate": 4.3128434496605156e-05, + "loss": 0.7402, + "step": 30140 + }, + { + "epoch": 0.4839564037946034, + "grad_norm": 0.8543381690979004, + "learning_rate": 4.3124092636088486e-05, + "loss": 0.789, + "step": 30150 + }, + { + "epoch": 0.48411692001476747, + "grad_norm": 0.9037030935287476, + "learning_rate": 4.3119749622985896e-05, + "loss": 0.7597, + "step": 30160 + }, + { + "epoch": 0.4842774362349315, + "grad_norm": 0.8781686425209045, + "learning_rate": 4.311540545757358e-05, + "loss": 0.6962, + "step": 30170 + }, + { + "epoch": 0.4844379524550956, + "grad_norm": 0.6169934272766113, + "learning_rate": 4.311106014012781e-05, + "loss": 0.8279, + "step": 30180 + }, + { + "epoch": 0.48459846867525963, + "grad_norm": 0.45732492208480835, + "learning_rate": 4.310671367092491e-05, + "loss": 0.6946, + "step": 30190 + }, + { + "epoch": 0.4847589848954237, + "grad_norm": 1.0509765148162842, + "learning_rate": 4.3102366050241295e-05, + "loss": 0.7644, + "step": 30200 + }, + { + "epoch": 0.48491950111558774, + "grad_norm": 0.6497449278831482, + "learning_rate": 4.309801727835345e-05, + "loss": 0.8295, + "step": 30210 + }, + { + "epoch": 0.4850800173357518, + "grad_norm": 0.7979898452758789, + "learning_rate": 4.309366735553794e-05, + "loss": 0.8385, + "step": 30220 + }, + { + "epoch": 0.48524053355591584, + "grad_norm": 0.42666828632354736, + "learning_rate": 4.308931628207138e-05, + "loss": 0.7719, + "step": 30230 + }, + { + "epoch": 0.4854010497760799, + "grad_norm": 0.8658029437065125, + "learning_rate": 4.308496405823048e-05, + "loss": 0.8867, + "step": 30240 + }, + { + "epoch": 0.4855615659962439, + "grad_norm": 1.0071920156478882, + "learning_rate": 4.308061068429201e-05, + "loss": 0.6695, + "step": 30250 + }, + { + "epoch": 0.48572208221640795, + "grad_norm": 0.5819129347801208, + "learning_rate": 4.307625616053282e-05, + "loss": 0.7853, + "step": 30260 + }, + { + "epoch": 0.485882598436572, + "grad_norm": 0.5760812163352966, + "learning_rate": 4.307190048722984e-05, + "loss": 0.8013, + "step": 30270 + }, + { + "epoch": 0.48604311465673605, + "grad_norm": 0.7665581703186035, + "learning_rate": 4.306754366466006e-05, + "loss": 0.7905, + "step": 30280 + }, + { + "epoch": 0.4862036308769001, + "grad_norm": 0.8012306094169617, + "learning_rate": 4.306318569310054e-05, + "loss": 0.8757, + "step": 30290 + }, + { + "epoch": 0.48636414709706416, + "grad_norm": 0.9509938955307007, + "learning_rate": 4.305882657282844e-05, + "loss": 0.8674, + "step": 30300 + }, + { + "epoch": 0.4865246633172282, + "grad_norm": 0.80370032787323, + "learning_rate": 4.3054466304120965e-05, + "loss": 0.7777, + "step": 30310 + }, + { + "epoch": 0.48668517953739227, + "grad_norm": 0.6109313368797302, + "learning_rate": 4.305010488725539e-05, + "loss": 0.7282, + "step": 30320 + }, + { + "epoch": 0.4868456957575563, + "grad_norm": 0.8264633417129517, + "learning_rate": 4.3045742322509104e-05, + "loss": 0.8363, + "step": 30330 + }, + { + "epoch": 0.48700621197772037, + "grad_norm": 0.6213743686676025, + "learning_rate": 4.304137861015952e-05, + "loss": 0.816, + "step": 30340 + }, + { + "epoch": 0.48716672819788437, + "grad_norm": 0.7835739254951477, + "learning_rate": 4.3037013750484144e-05, + "loss": 0.7384, + "step": 30350 + }, + { + "epoch": 0.4873272444180484, + "grad_norm": 0.7170085310935974, + "learning_rate": 4.3032647743760556e-05, + "loss": 0.8027, + "step": 30360 + }, + { + "epoch": 0.4874877606382125, + "grad_norm": 0.6107996702194214, + "learning_rate": 4.3028280590266423e-05, + "loss": 0.696, + "step": 30370 + }, + { + "epoch": 0.48764827685837653, + "grad_norm": 0.7407366037368774, + "learning_rate": 4.302391229027946e-05, + "loss": 0.7173, + "step": 30380 + }, + { + "epoch": 0.4878087930785406, + "grad_norm": 0.617738664150238, + "learning_rate": 4.301954284407745e-05, + "loss": 0.7939, + "step": 30390 + }, + { + "epoch": 0.48796930929870463, + "grad_norm": 0.8868194818496704, + "learning_rate": 4.301517225193829e-05, + "loss": 0.8332, + "step": 30400 + }, + { + "epoch": 0.4881298255188687, + "grad_norm": 0.4311992824077606, + "learning_rate": 4.301080051413992e-05, + "loss": 0.8299, + "step": 30410 + }, + { + "epoch": 0.48829034173903274, + "grad_norm": 0.4186556041240692, + "learning_rate": 4.3006427630960334e-05, + "loss": 0.8286, + "step": 30420 + }, + { + "epoch": 0.4884508579591968, + "grad_norm": 0.8194505572319031, + "learning_rate": 4.300205360267764e-05, + "loss": 0.7974, + "step": 30430 + }, + { + "epoch": 0.48861137417936085, + "grad_norm": 1.3275690078735352, + "learning_rate": 4.299767842957001e-05, + "loss": 0.7608, + "step": 30440 + }, + { + "epoch": 0.48877189039952484, + "grad_norm": 1.5765355825424194, + "learning_rate": 4.299330211191566e-05, + "loss": 0.8036, + "step": 30450 + }, + { + "epoch": 0.4889324066196889, + "grad_norm": 0.65500807762146, + "learning_rate": 4.298892464999291e-05, + "loss": 0.7376, + "step": 30460 + }, + { + "epoch": 0.48909292283985295, + "grad_norm": 0.4505905210971832, + "learning_rate": 4.2984546044080124e-05, + "loss": 0.6929, + "step": 30470 + }, + { + "epoch": 0.489253439060017, + "grad_norm": 2.696746826171875, + "learning_rate": 4.2980166294455784e-05, + "loss": 0.8211, + "step": 30480 + }, + { + "epoch": 0.48941395528018106, + "grad_norm": 1.2329789400100708, + "learning_rate": 4.2975785401398385e-05, + "loss": 0.7898, + "step": 30490 + }, + { + "epoch": 0.4895744715003451, + "grad_norm": 0.94597327709198, + "learning_rate": 4.297140336518656e-05, + "loss": 0.7505, + "step": 30500 + }, + { + "epoch": 0.48973498772050916, + "grad_norm": 1.0234047174453735, + "learning_rate": 4.296702018609895e-05, + "loss": 0.7586, + "step": 30510 + }, + { + "epoch": 0.4898955039406732, + "grad_norm": 0.5776852965354919, + "learning_rate": 4.296263586441431e-05, + "loss": 0.7751, + "step": 30520 + }, + { + "epoch": 0.49005602016083727, + "grad_norm": 0.6829172968864441, + "learning_rate": 4.295825040041146e-05, + "loss": 0.6626, + "step": 30530 + }, + { + "epoch": 0.4902165363810013, + "grad_norm": 0.7741425633430481, + "learning_rate": 4.29538637943693e-05, + "loss": 0.7258, + "step": 30540 + }, + { + "epoch": 0.4903770526011654, + "grad_norm": 1.4075734615325928, + "learning_rate": 4.2949476046566775e-05, + "loss": 0.8775, + "step": 30550 + }, + { + "epoch": 0.4905375688213294, + "grad_norm": 0.6303108334541321, + "learning_rate": 4.294508715728292e-05, + "loss": 0.7395, + "step": 30560 + }, + { + "epoch": 0.4906980850414934, + "grad_norm": 0.6584925651550293, + "learning_rate": 4.294069712679686e-05, + "loss": 1.0341, + "step": 30570 + }, + { + "epoch": 0.4908586012616575, + "grad_norm": 0.6336269378662109, + "learning_rate": 4.293630595538776e-05, + "loss": 0.7628, + "step": 30580 + }, + { + "epoch": 0.49101911748182153, + "grad_norm": 0.6883237361907959, + "learning_rate": 4.293191364333486e-05, + "loss": 0.7767, + "step": 30590 + }, + { + "epoch": 0.4911796337019856, + "grad_norm": 0.48670822381973267, + "learning_rate": 4.2927520190917526e-05, + "loss": 0.6164, + "step": 30600 + }, + { + "epoch": 0.49134014992214964, + "grad_norm": 0.6513984799385071, + "learning_rate": 4.2923125598415114e-05, + "loss": 0.6951, + "step": 30610 + }, + { + "epoch": 0.4915006661423137, + "grad_norm": 0.8256345987319946, + "learning_rate": 4.291872986610712e-05, + "loss": 0.6326, + "step": 30620 + }, + { + "epoch": 0.49166118236247774, + "grad_norm": 0.7616339325904846, + "learning_rate": 4.291433299427308e-05, + "loss": 0.8228, + "step": 30630 + }, + { + "epoch": 0.4918216985826418, + "grad_norm": 0.5234552621841431, + "learning_rate": 4.290993498319261e-05, + "loss": 0.7347, + "step": 30640 + }, + { + "epoch": 0.49198221480280585, + "grad_norm": 0.770200788974762, + "learning_rate": 4.2905535833145395e-05, + "loss": 0.8297, + "step": 30650 + }, + { + "epoch": 0.49214273102296985, + "grad_norm": 0.7441433668136597, + "learning_rate": 4.2901135544411186e-05, + "loss": 0.8292, + "step": 30660 + }, + { + "epoch": 0.4923032472431339, + "grad_norm": 0.8735257983207703, + "learning_rate": 4.289673411726983e-05, + "loss": 0.7973, + "step": 30670 + }, + { + "epoch": 0.49246376346329795, + "grad_norm": 0.5921701192855835, + "learning_rate": 4.2892331552001235e-05, + "loss": 0.7522, + "step": 30680 + }, + { + "epoch": 0.492624279683462, + "grad_norm": 0.6008188724517822, + "learning_rate": 4.288792784888537e-05, + "loss": 0.8956, + "step": 30690 + }, + { + "epoch": 0.49278479590362606, + "grad_norm": 0.8645350933074951, + "learning_rate": 4.2883523008202274e-05, + "loss": 0.7832, + "step": 30700 + }, + { + "epoch": 0.4929453121237901, + "grad_norm": 0.7033547759056091, + "learning_rate": 4.287911703023209e-05, + "loss": 0.7003, + "step": 30710 + }, + { + "epoch": 0.49310582834395417, + "grad_norm": 0.7422193288803101, + "learning_rate": 4.2874709915255e-05, + "loss": 0.7323, + "step": 30720 + }, + { + "epoch": 0.4932663445641182, + "grad_norm": 0.9374138116836548, + "learning_rate": 4.287030166355127e-05, + "loss": 0.7843, + "step": 30730 + }, + { + "epoch": 0.4934268607842823, + "grad_norm": 0.6293550133705139, + "learning_rate": 4.286589227540125e-05, + "loss": 0.6504, + "step": 30740 + }, + { + "epoch": 0.4935873770044463, + "grad_norm": 0.7147255539894104, + "learning_rate": 4.286148175108534e-05, + "loss": 0.7976, + "step": 30750 + }, + { + "epoch": 0.4937478932246103, + "grad_norm": 0.571442186832428, + "learning_rate": 4.2857070090884024e-05, + "loss": 0.7241, + "step": 30760 + }, + { + "epoch": 0.4939084094447744, + "grad_norm": 0.5547153353691101, + "learning_rate": 4.285265729507786e-05, + "loss": 0.7778, + "step": 30770 + }, + { + "epoch": 0.49406892566493843, + "grad_norm": 1.3406944274902344, + "learning_rate": 4.2848243363947484e-05, + "loss": 0.7659, + "step": 30780 + }, + { + "epoch": 0.4942294418851025, + "grad_norm": 0.6264866590499878, + "learning_rate": 4.284382829777358e-05, + "loss": 0.9109, + "step": 30790 + }, + { + "epoch": 0.49438995810526654, + "grad_norm": 0.6864950656890869, + "learning_rate": 4.283941209683693e-05, + "loss": 0.7787, + "step": 30800 + }, + { + "epoch": 0.4945504743254306, + "grad_norm": 0.5272615551948547, + "learning_rate": 4.2834994761418376e-05, + "loss": 0.7169, + "step": 30810 + }, + { + "epoch": 0.49471099054559464, + "grad_norm": 0.6958242058753967, + "learning_rate": 4.283057629179884e-05, + "loss": 0.741, + "step": 30820 + }, + { + "epoch": 0.4948715067657587, + "grad_norm": 0.7249924540519714, + "learning_rate": 4.282615668825931e-05, + "loss": 0.8055, + "step": 30830 + }, + { + "epoch": 0.49503202298592275, + "grad_norm": 0.8055647611618042, + "learning_rate": 4.282173595108084e-05, + "loss": 0.6666, + "step": 30840 + }, + { + "epoch": 0.4951925392060868, + "grad_norm": 0.842509925365448, + "learning_rate": 4.2817314080544565e-05, + "loss": 0.7529, + "step": 30850 + }, + { + "epoch": 0.4953530554262508, + "grad_norm": 0.6819928288459778, + "learning_rate": 4.281289107693169e-05, + "loss": 0.8184, + "step": 30860 + }, + { + "epoch": 0.49551357164641485, + "grad_norm": 1.3579487800598145, + "learning_rate": 4.280846694052349e-05, + "loss": 0.7104, + "step": 30870 + }, + { + "epoch": 0.4956740878665789, + "grad_norm": 0.5135917663574219, + "learning_rate": 4.2804041671601336e-05, + "loss": 0.7153, + "step": 30880 + }, + { + "epoch": 0.49583460408674296, + "grad_norm": 0.7079076170921326, + "learning_rate": 4.279961527044661e-05, + "loss": 0.6501, + "step": 30890 + }, + { + "epoch": 0.495995120306907, + "grad_norm": 0.7970148921012878, + "learning_rate": 4.2795187737340836e-05, + "loss": 0.7005, + "step": 30900 + }, + { + "epoch": 0.49615563652707106, + "grad_norm": 1.1683986186981201, + "learning_rate": 4.2790759072565575e-05, + "loss": 0.7589, + "step": 30910 + }, + { + "epoch": 0.4963161527472351, + "grad_norm": 0.6088690161705017, + "learning_rate": 4.278632927640245e-05, + "loss": 0.8718, + "step": 30920 + }, + { + "epoch": 0.49647666896739917, + "grad_norm": 0.9043493270874023, + "learning_rate": 4.2781898349133177e-05, + "loss": 0.9386, + "step": 30930 + }, + { + "epoch": 0.4966371851875632, + "grad_norm": 0.8023169636726379, + "learning_rate": 4.277746629103953e-05, + "loss": 0.7627, + "step": 30940 + }, + { + "epoch": 0.4967977014077273, + "grad_norm": 0.7447545528411865, + "learning_rate": 4.2773033102403385e-05, + "loss": 0.8213, + "step": 30950 + }, + { + "epoch": 0.4969582176278913, + "grad_norm": 0.7254095673561096, + "learning_rate": 4.276859878350665e-05, + "loss": 0.8247, + "step": 30960 + }, + { + "epoch": 0.4971187338480553, + "grad_norm": 0.6311300992965698, + "learning_rate": 4.276416333463132e-05, + "loss": 0.8963, + "step": 30970 + }, + { + "epoch": 0.4972792500682194, + "grad_norm": 0.7828834056854248, + "learning_rate": 4.275972675605947e-05, + "loss": 0.8506, + "step": 30980 + }, + { + "epoch": 0.49743976628838343, + "grad_norm": 0.6756554245948792, + "learning_rate": 4.275528904807323e-05, + "loss": 0.665, + "step": 30990 + }, + { + "epoch": 0.4976002825085475, + "grad_norm": 0.5704131722450256, + "learning_rate": 4.275085021095483e-05, + "loss": 0.8053, + "step": 31000 + }, + { + "epoch": 0.49776079872871154, + "grad_norm": 0.8062161207199097, + "learning_rate": 4.2746410244986535e-05, + "loss": 0.7323, + "step": 31010 + }, + { + "epoch": 0.4979213149488756, + "grad_norm": 0.5049219727516174, + "learning_rate": 4.274196915045071e-05, + "loss": 0.8345, + "step": 31020 + }, + { + "epoch": 0.49808183116903965, + "grad_norm": 0.6776707172393799, + "learning_rate": 4.273752692762979e-05, + "loss": 0.8867, + "step": 31030 + }, + { + "epoch": 0.4982423473892037, + "grad_norm": 0.556952714920044, + "learning_rate": 4.2733083576806255e-05, + "loss": 0.7879, + "step": 31040 + }, + { + "epoch": 0.49840286360936775, + "grad_norm": 0.6277993321418762, + "learning_rate": 4.272863909826269e-05, + "loss": 0.881, + "step": 31050 + }, + { + "epoch": 0.49856337982953175, + "grad_norm": 1.0031967163085938, + "learning_rate": 4.272419349228174e-05, + "loss": 0.8484, + "step": 31060 + }, + { + "epoch": 0.4987238960496958, + "grad_norm": 0.6998121738433838, + "learning_rate": 4.271974675914611e-05, + "loss": 0.6948, + "step": 31070 + }, + { + "epoch": 0.49888441226985986, + "grad_norm": 0.6211245059967041, + "learning_rate": 4.271529889913859e-05, + "loss": 0.9276, + "step": 31080 + }, + { + "epoch": 0.4990449284900239, + "grad_norm": 0.6871326565742493, + "learning_rate": 4.271084991254204e-05, + "loss": 0.9103, + "step": 31090 + }, + { + "epoch": 0.49920544471018796, + "grad_norm": 0.5606653094291687, + "learning_rate": 4.270639979963939e-05, + "loss": 0.68, + "step": 31100 + }, + { + "epoch": 0.499365960930352, + "grad_norm": 0.7777308225631714, + "learning_rate": 4.2701948560713636e-05, + "loss": 0.7468, + "step": 31110 + }, + { + "epoch": 0.49952647715051607, + "grad_norm": 1.1671674251556396, + "learning_rate": 4.2697496196047854e-05, + "loss": 0.8123, + "step": 31120 + }, + { + "epoch": 0.4996869933706801, + "grad_norm": 0.8598347306251526, + "learning_rate": 4.269304270592519e-05, + "loss": 0.9071, + "step": 31130 + }, + { + "epoch": 0.4998475095908442, + "grad_norm": 0.5631603002548218, + "learning_rate": 4.2688588090628856e-05, + "loss": 0.7554, + "step": 31140 + }, + { + "epoch": 0.5000080258110082, + "grad_norm": 0.6317096948623657, + "learning_rate": 4.268413235044214e-05, + "loss": 0.913, + "step": 31150 + }, + { + "epoch": 0.5001685420311722, + "grad_norm": 0.869176983833313, + "learning_rate": 4.2679675485648396e-05, + "loss": 0.797, + "step": 31160 + }, + { + "epoch": 0.5003290582513363, + "grad_norm": 0.6496331691741943, + "learning_rate": 4.2675217496531074e-05, + "loss": 0.661, + "step": 31170 + }, + { + "epoch": 0.5004895744715003, + "grad_norm": 0.6055790185928345, + "learning_rate": 4.2670758383373654e-05, + "loss": 0.8106, + "step": 31180 + }, + { + "epoch": 0.5006500906916644, + "grad_norm": 0.7637755274772644, + "learning_rate": 4.266629814645972e-05, + "loss": 0.8771, + "step": 31190 + }, + { + "epoch": 0.5008106069118284, + "grad_norm": 0.6173350214958191, + "learning_rate": 4.2661836786072905e-05, + "loss": 0.7268, + "step": 31200 + }, + { + "epoch": 0.5009711231319924, + "grad_norm": 0.7140908241271973, + "learning_rate": 4.2657374302496944e-05, + "loss": 0.8712, + "step": 31210 + }, + { + "epoch": 0.5011316393521565, + "grad_norm": 0.6632696390151978, + "learning_rate": 4.265291069601561e-05, + "loss": 0.7653, + "step": 31220 + }, + { + "epoch": 0.5012921555723205, + "grad_norm": 1.0593868494033813, + "learning_rate": 4.2648445966912774e-05, + "loss": 0.8841, + "step": 31230 + }, + { + "epoch": 0.5014526717924846, + "grad_norm": 2.657588243484497, + "learning_rate": 4.264398011547236e-05, + "loss": 0.8565, + "step": 31240 + }, + { + "epoch": 0.5016131880126486, + "grad_norm": 0.9940372705459595, + "learning_rate": 4.263951314197837e-05, + "loss": 0.7764, + "step": 31250 + }, + { + "epoch": 0.5017737042328128, + "grad_norm": 0.5826925039291382, + "learning_rate": 4.2635045046714874e-05, + "loss": 0.6533, + "step": 31260 + }, + { + "epoch": 0.5019342204529768, + "grad_norm": 0.6115134954452515, + "learning_rate": 4.263057582996602e-05, + "loss": 0.7471, + "step": 31270 + }, + { + "epoch": 0.5020947366731409, + "grad_norm": 0.7115318179130554, + "learning_rate": 4.262610549201603e-05, + "loss": 0.8096, + "step": 31280 + }, + { + "epoch": 0.5022552528933049, + "grad_norm": 0.7930119037628174, + "learning_rate": 4.262163403314918e-05, + "loss": 0.7174, + "step": 31290 + }, + { + "epoch": 0.502415769113469, + "grad_norm": 0.798816978931427, + "learning_rate": 4.2617161453649836e-05, + "loss": 0.7437, + "step": 31300 + }, + { + "epoch": 0.502576285333633, + "grad_norm": 0.6926154494285583, + "learning_rate": 4.261268775380243e-05, + "loss": 0.6807, + "step": 31310 + }, + { + "epoch": 0.502736801553797, + "grad_norm": 0.7722102403640747, + "learning_rate": 4.2608212933891456e-05, + "loss": 0.6207, + "step": 31320 + }, + { + "epoch": 0.5028973177739611, + "grad_norm": 0.7107654213905334, + "learning_rate": 4.2603736994201485e-05, + "loss": 0.8327, + "step": 31330 + }, + { + "epoch": 0.5030578339941251, + "grad_norm": 0.6628431677818298, + "learning_rate": 4.2599259935017165e-05, + "loss": 0.8441, + "step": 31340 + }, + { + "epoch": 0.5032183502142892, + "grad_norm": 0.49170514941215515, + "learning_rate": 4.2594781756623214e-05, + "loss": 0.8238, + "step": 31350 + }, + { + "epoch": 0.5033788664344532, + "grad_norm": 0.9698971509933472, + "learning_rate": 4.259030245930441e-05, + "loss": 0.7838, + "step": 31360 + }, + { + "epoch": 0.5035393826546173, + "grad_norm": 0.8171994090080261, + "learning_rate": 4.2585822043345625e-05, + "loss": 0.7168, + "step": 31370 + }, + { + "epoch": 0.5036998988747813, + "grad_norm": 0.7956370115280151, + "learning_rate": 4.258134050903176e-05, + "loss": 0.6877, + "step": 31380 + }, + { + "epoch": 0.5038604150949454, + "grad_norm": 0.5938856601715088, + "learning_rate": 4.2576857856647847e-05, + "loss": 0.7885, + "step": 31390 + }, + { + "epoch": 0.5040209313151094, + "grad_norm": 1.041290521621704, + "learning_rate": 4.257237408647893e-05, + "loss": 0.7109, + "step": 31400 + }, + { + "epoch": 0.5041814475352734, + "grad_norm": 0.9835456609725952, + "learning_rate": 4.2567889198810165e-05, + "loss": 0.7265, + "step": 31410 + }, + { + "epoch": 0.5043419637554375, + "grad_norm": 0.908500075340271, + "learning_rate": 4.256340319392675e-05, + "loss": 0.7711, + "step": 31420 + }, + { + "epoch": 0.5045024799756015, + "grad_norm": 0.9308565258979797, + "learning_rate": 4.255891607211399e-05, + "loss": 0.7666, + "step": 31430 + }, + { + "epoch": 0.5046629961957656, + "grad_norm": 0.730609655380249, + "learning_rate": 4.255442783365722e-05, + "loss": 0.8759, + "step": 31440 + }, + { + "epoch": 0.5048235124159296, + "grad_norm": 0.6961125731468201, + "learning_rate": 4.2549938478841875e-05, + "loss": 0.8003, + "step": 31450 + }, + { + "epoch": 0.5049840286360937, + "grad_norm": 0.646355152130127, + "learning_rate": 4.254544800795346e-05, + "loss": 0.88, + "step": 31460 + }, + { + "epoch": 0.5051445448562577, + "grad_norm": 0.6098170280456543, + "learning_rate": 4.254095642127752e-05, + "loss": 0.6829, + "step": 31470 + }, + { + "epoch": 0.5053050610764218, + "grad_norm": 0.7889835238456726, + "learning_rate": 4.253646371909972e-05, + "loss": 0.6461, + "step": 31480 + }, + { + "epoch": 0.5054655772965858, + "grad_norm": 1.3065271377563477, + "learning_rate": 4.2531969901705747e-05, + "loss": 0.7517, + "step": 31490 + }, + { + "epoch": 0.5056260935167499, + "grad_norm": 0.7620862126350403, + "learning_rate": 4.25274749693814e-05, + "loss": 0.6383, + "step": 31500 + }, + { + "epoch": 0.5057866097369139, + "grad_norm": 0.6097342371940613, + "learning_rate": 4.25229789224125e-05, + "loss": 0.788, + "step": 31510 + }, + { + "epoch": 0.5059471259570779, + "grad_norm": 0.8966332077980042, + "learning_rate": 4.251848176108501e-05, + "loss": 0.7776, + "step": 31520 + }, + { + "epoch": 0.506107642177242, + "grad_norm": 0.5242586135864258, + "learning_rate": 4.25139834856849e-05, + "loss": 0.7843, + "step": 31530 + }, + { + "epoch": 0.506268158397406, + "grad_norm": 0.6619998812675476, + "learning_rate": 4.250948409649823e-05, + "loss": 0.6729, + "step": 31540 + }, + { + "epoch": 0.5064286746175701, + "grad_norm": 0.7191646695137024, + "learning_rate": 4.250498359381114e-05, + "loss": 0.8493, + "step": 31550 + }, + { + "epoch": 0.5065891908377341, + "grad_norm": 0.7884981632232666, + "learning_rate": 4.250048197790984e-05, + "loss": 0.8078, + "step": 31560 + }, + { + "epoch": 0.5067497070578982, + "grad_norm": 0.9405893087387085, + "learning_rate": 4.2495979249080606e-05, + "loss": 0.8002, + "step": 31570 + }, + { + "epoch": 0.5069102232780622, + "grad_norm": 2.501591682434082, + "learning_rate": 4.249147540760979e-05, + "loss": 0.8437, + "step": 31580 + }, + { + "epoch": 0.5070707394982263, + "grad_norm": 1.4558464288711548, + "learning_rate": 4.248697045378379e-05, + "loss": 0.6691, + "step": 31590 + }, + { + "epoch": 0.5072312557183903, + "grad_norm": 0.7304548621177673, + "learning_rate": 4.248246438788912e-05, + "loss": 0.7987, + "step": 31600 + }, + { + "epoch": 0.5073917719385543, + "grad_norm": 0.8382411599159241, + "learning_rate": 4.247795721021232e-05, + "loss": 0.723, + "step": 31610 + }, + { + "epoch": 0.5075522881587184, + "grad_norm": 0.8799975514411926, + "learning_rate": 4.247344892104003e-05, + "loss": 0.6878, + "step": 31620 + }, + { + "epoch": 0.5077128043788824, + "grad_norm": 0.6284953355789185, + "learning_rate": 4.246893952065894e-05, + "loss": 0.7752, + "step": 31630 + }, + { + "epoch": 0.5078733205990466, + "grad_norm": 0.6350350379943848, + "learning_rate": 4.246442900935582e-05, + "loss": 0.782, + "step": 31640 + }, + { + "epoch": 0.5080338368192105, + "grad_norm": 0.9287889003753662, + "learning_rate": 4.245991738741754e-05, + "loss": 0.7852, + "step": 31650 + }, + { + "epoch": 0.5081943530393747, + "grad_norm": 3.399247646331787, + "learning_rate": 4.245540465513098e-05, + "loss": 0.9516, + "step": 31660 + }, + { + "epoch": 0.5083548692595387, + "grad_norm": 0.5698428750038147, + "learning_rate": 4.245089081278315e-05, + "loss": 0.857, + "step": 31670 + }, + { + "epoch": 0.5085153854797028, + "grad_norm": 0.724086582660675, + "learning_rate": 4.244637586066108e-05, + "loss": 0.7266, + "step": 31680 + }, + { + "epoch": 0.5086759016998668, + "grad_norm": 0.5891421437263489, + "learning_rate": 4.244185979905191e-05, + "loss": 0.7893, + "step": 31690 + }, + { + "epoch": 0.5088364179200309, + "grad_norm": 1.259103775024414, + "learning_rate": 4.243734262824283e-05, + "loss": 0.7409, + "step": 31700 + }, + { + "epoch": 0.5089969341401949, + "grad_norm": 0.4008646309375763, + "learning_rate": 4.243282434852111e-05, + "loss": 0.787, + "step": 31710 + }, + { + "epoch": 0.5091574503603589, + "grad_norm": 0.795930027961731, + "learning_rate": 4.242830496017407e-05, + "loss": 0.8343, + "step": 31720 + }, + { + "epoch": 0.509317966580523, + "grad_norm": 0.9630099534988403, + "learning_rate": 4.242378446348914e-05, + "loss": 0.7786, + "step": 31730 + }, + { + "epoch": 0.509478482800687, + "grad_norm": 0.7813802361488342, + "learning_rate": 4.2419262858753774e-05, + "loss": 0.8686, + "step": 31740 + }, + { + "epoch": 0.5096389990208511, + "grad_norm": 0.7775646448135376, + "learning_rate": 4.241474014625554e-05, + "loss": 0.8241, + "step": 31750 + }, + { + "epoch": 0.5097995152410151, + "grad_norm": 2.0779528617858887, + "learning_rate": 4.241021632628204e-05, + "loss": 0.7717, + "step": 31760 + }, + { + "epoch": 0.5099600314611792, + "grad_norm": 0.5520005822181702, + "learning_rate": 4.240569139912097e-05, + "loss": 0.8343, + "step": 31770 + }, + { + "epoch": 0.5101205476813432, + "grad_norm": 0.6182143688201904, + "learning_rate": 4.2401165365060095e-05, + "loss": 0.7798, + "step": 31780 + }, + { + "epoch": 0.5102810639015073, + "grad_norm": 0.7559928297996521, + "learning_rate": 4.239663822438723e-05, + "loss": 0.6911, + "step": 31790 + }, + { + "epoch": 0.5104415801216713, + "grad_norm": 0.5666390061378479, + "learning_rate": 4.239210997739029e-05, + "loss": 0.7646, + "step": 31800 + }, + { + "epoch": 0.5106020963418353, + "grad_norm": 0.9307319521903992, + "learning_rate": 4.238758062435723e-05, + "loss": 0.824, + "step": 31810 + }, + { + "epoch": 0.5107626125619994, + "grad_norm": 0.7129333019256592, + "learning_rate": 4.23830501655761e-05, + "loss": 0.82, + "step": 31820 + }, + { + "epoch": 0.5109231287821634, + "grad_norm": 1.444498062133789, + "learning_rate": 4.2378518601335005e-05, + "loss": 0.7217, + "step": 31830 + }, + { + "epoch": 0.5110836450023275, + "grad_norm": 0.7177569270133972, + "learning_rate": 4.2373985931922125e-05, + "loss": 0.7721, + "step": 31840 + }, + { + "epoch": 0.5112441612224915, + "grad_norm": 0.7271772623062134, + "learning_rate": 4.236945215762572e-05, + "loss": 0.8791, + "step": 31850 + }, + { + "epoch": 0.5114046774426556, + "grad_norm": 0.6103725433349609, + "learning_rate": 4.2364917278734106e-05, + "loss": 0.82, + "step": 31860 + }, + { + "epoch": 0.5115651936628196, + "grad_norm": 1.023320198059082, + "learning_rate": 4.236038129553568e-05, + "loss": 0.8163, + "step": 31870 + }, + { + "epoch": 0.5117257098829837, + "grad_norm": 0.8288299441337585, + "learning_rate": 4.235584420831888e-05, + "loss": 0.7287, + "step": 31880 + }, + { + "epoch": 0.5118862261031477, + "grad_norm": 0.7091042995452881, + "learning_rate": 4.235130601737227e-05, + "loss": 0.7601, + "step": 31890 + }, + { + "epoch": 0.5120467423233118, + "grad_norm": 0.4839083254337311, + "learning_rate": 4.234676672298444e-05, + "loss": 0.7432, + "step": 31900 + }, + { + "epoch": 0.5122072585434758, + "grad_norm": 0.9354607462882996, + "learning_rate": 4.2342226325444065e-05, + "loss": 0.704, + "step": 31910 + }, + { + "epoch": 0.5123677747636398, + "grad_norm": 1.0363391637802124, + "learning_rate": 4.233768482503988e-05, + "loss": 0.8599, + "step": 31920 + }, + { + "epoch": 0.5125282909838039, + "grad_norm": 0.5536139011383057, + "learning_rate": 4.233314222206071e-05, + "loss": 0.8276, + "step": 31930 + }, + { + "epoch": 0.5126888072039679, + "grad_norm": 0.5340831875801086, + "learning_rate": 4.2328598516795416e-05, + "loss": 0.7103, + "step": 31940 + }, + { + "epoch": 0.512849323424132, + "grad_norm": 0.7700903415679932, + "learning_rate": 4.2324053709532966e-05, + "loss": 0.876, + "step": 31950 + }, + { + "epoch": 0.513009839644296, + "grad_norm": 0.6049041748046875, + "learning_rate": 4.231950780056239e-05, + "loss": 0.7896, + "step": 31960 + }, + { + "epoch": 0.5131703558644601, + "grad_norm": 0.8790847063064575, + "learning_rate": 4.231496079017277e-05, + "loss": 0.7564, + "step": 31970 + }, + { + "epoch": 0.5133308720846241, + "grad_norm": 0.6073773503303528, + "learning_rate": 4.2310412678653274e-05, + "loss": 0.7748, + "step": 31980 + }, + { + "epoch": 0.5134913883047882, + "grad_norm": 0.7983996272087097, + "learning_rate": 4.2305863466293125e-05, + "loss": 0.7884, + "step": 31990 + }, + { + "epoch": 0.5136519045249522, + "grad_norm": 0.9207543730735779, + "learning_rate": 4.230131315338165e-05, + "loss": 0.7503, + "step": 32000 + }, + { + "epoch": 0.5136519045249522, + "eval_loss": 0.7924630641937256, + "eval_runtime": 1817.7397, + "eval_samples_per_second": 14.431, + "eval_steps_per_second": 1.804, + "step": 32000 + }, + { + "epoch": 0.5138124207451163, + "grad_norm": 0.6239632368087769, + "learning_rate": 4.229676174020819e-05, + "loss": 0.7754, + "step": 32010 + }, + { + "epoch": 0.5139729369652803, + "grad_norm": 0.8973591923713684, + "learning_rate": 4.229220922706222e-05, + "loss": 0.8938, + "step": 32020 + }, + { + "epoch": 0.5141334531854443, + "grad_norm": 0.6258607506752014, + "learning_rate": 4.228765561423322e-05, + "loss": 0.6123, + "step": 32030 + }, + { + "epoch": 0.5142939694056085, + "grad_norm": 1.1978086233139038, + "learning_rate": 4.228310090201081e-05, + "loss": 0.6563, + "step": 32040 + }, + { + "epoch": 0.5144544856257725, + "grad_norm": 0.7432782053947449, + "learning_rate": 4.2278545090684616e-05, + "loss": 0.7537, + "step": 32050 + }, + { + "epoch": 0.5146150018459366, + "grad_norm": 0.6395102143287659, + "learning_rate": 4.2273988180544364e-05, + "loss": 0.8568, + "step": 32060 + }, + { + "epoch": 0.5147755180661006, + "grad_norm": 0.7499186396598816, + "learning_rate": 4.226943017187986e-05, + "loss": 0.8293, + "step": 32070 + }, + { + "epoch": 0.5149360342862647, + "grad_norm": 0.5911881327629089, + "learning_rate": 4.2264871064980956e-05, + "loss": 0.864, + "step": 32080 + }, + { + "epoch": 0.5150965505064287, + "grad_norm": 0.7238508462905884, + "learning_rate": 4.2260310860137584e-05, + "loss": 0.6239, + "step": 32090 + }, + { + "epoch": 0.5152570667265928, + "grad_norm": 0.6771308779716492, + "learning_rate": 4.225574955763976e-05, + "loss": 0.8119, + "step": 32100 + }, + { + "epoch": 0.5154175829467568, + "grad_norm": 0.8339574933052063, + "learning_rate": 4.225118715777753e-05, + "loss": 0.7605, + "step": 32110 + }, + { + "epoch": 0.5155780991669208, + "grad_norm": 0.8179017901420593, + "learning_rate": 4.224662366084106e-05, + "loss": 0.7621, + "step": 32120 + }, + { + "epoch": 0.5157386153870849, + "grad_norm": 1.0342864990234375, + "learning_rate": 4.2242059067120545e-05, + "loss": 0.7732, + "step": 32130 + }, + { + "epoch": 0.5158991316072489, + "grad_norm": 0.5973677039146423, + "learning_rate": 4.223749337690628e-05, + "loss": 0.7225, + "step": 32140 + }, + { + "epoch": 0.516059647827413, + "grad_norm": 0.8442959189414978, + "learning_rate": 4.223292659048861e-05, + "loss": 0.7896, + "step": 32150 + }, + { + "epoch": 0.516220164047577, + "grad_norm": 0.8290172219276428, + "learning_rate": 4.2228358708157954e-05, + "loss": 0.8084, + "step": 32160 + }, + { + "epoch": 0.5163806802677411, + "grad_norm": 0.5138882398605347, + "learning_rate": 4.222378973020481e-05, + "loss": 0.7092, + "step": 32170 + }, + { + "epoch": 0.5165411964879051, + "grad_norm": 0.7751917839050293, + "learning_rate": 4.2219219656919725e-05, + "loss": 0.753, + "step": 32180 + }, + { + "epoch": 0.5167017127080692, + "grad_norm": 0.5899917483329773, + "learning_rate": 4.2214648488593336e-05, + "loss": 0.6922, + "step": 32190 + }, + { + "epoch": 0.5168622289282332, + "grad_norm": 1.145951509475708, + "learning_rate": 4.221007622551634e-05, + "loss": 0.8109, + "step": 32200 + }, + { + "epoch": 0.5170227451483973, + "grad_norm": 0.6243473887443542, + "learning_rate": 4.2205502867979516e-05, + "loss": 0.7473, + "step": 32210 + }, + { + "epoch": 0.5171832613685613, + "grad_norm": 0.5716540813446045, + "learning_rate": 4.22009284162737e-05, + "loss": 0.863, + "step": 32220 + }, + { + "epoch": 0.5173437775887253, + "grad_norm": 0.9998703598976135, + "learning_rate": 4.219635287068979e-05, + "loss": 0.8237, + "step": 32230 + }, + { + "epoch": 0.5175042938088894, + "grad_norm": 1.4995415210723877, + "learning_rate": 4.219177623151877e-05, + "loss": 0.8297, + "step": 32240 + }, + { + "epoch": 0.5176648100290534, + "grad_norm": 0.6767054200172424, + "learning_rate": 4.218719849905168e-05, + "loss": 0.7477, + "step": 32250 + }, + { + "epoch": 0.5178253262492175, + "grad_norm": 1.1411861181259155, + "learning_rate": 4.218261967357966e-05, + "loss": 0.7327, + "step": 32260 + }, + { + "epoch": 0.5179858424693815, + "grad_norm": 0.9188896417617798, + "learning_rate": 4.217803975539387e-05, + "loss": 0.7706, + "step": 32270 + }, + { + "epoch": 0.5181463586895456, + "grad_norm": 0.6651268601417542, + "learning_rate": 4.217345874478558e-05, + "loss": 0.7926, + "step": 32280 + }, + { + "epoch": 0.5183068749097096, + "grad_norm": 0.6675897836685181, + "learning_rate": 4.2168876642046105e-05, + "loss": 0.7602, + "step": 32290 + }, + { + "epoch": 0.5184673911298737, + "grad_norm": 0.8952314257621765, + "learning_rate": 4.216429344746685e-05, + "loss": 0.8736, + "step": 32300 + }, + { + "epoch": 0.5186279073500377, + "grad_norm": 0.6726568937301636, + "learning_rate": 4.2159709161339285e-05, + "loss": 0.811, + "step": 32310 + }, + { + "epoch": 0.5187884235702017, + "grad_norm": 1.4663333892822266, + "learning_rate": 4.215512378395493e-05, + "loss": 0.7737, + "step": 32320 + }, + { + "epoch": 0.5189489397903658, + "grad_norm": 1.1713261604309082, + "learning_rate": 4.215053731560539e-05, + "loss": 0.9134, + "step": 32330 + }, + { + "epoch": 0.5191094560105298, + "grad_norm": 0.6209425330162048, + "learning_rate": 4.214594975658234e-05, + "loss": 0.7369, + "step": 32340 + }, + { + "epoch": 0.5192699722306939, + "grad_norm": 0.5083849430084229, + "learning_rate": 4.2141361107177514e-05, + "loss": 0.8479, + "step": 32350 + }, + { + "epoch": 0.5194304884508579, + "grad_norm": 0.7706215977668762, + "learning_rate": 4.213677136768274e-05, + "loss": 0.8101, + "step": 32360 + }, + { + "epoch": 0.519591004671022, + "grad_norm": 0.6244750022888184, + "learning_rate": 4.2132180538389884e-05, + "loss": 0.8727, + "step": 32370 + }, + { + "epoch": 0.519751520891186, + "grad_norm": 1.3525649309158325, + "learning_rate": 4.2127588619590906e-05, + "loss": 0.7898, + "step": 32380 + }, + { + "epoch": 0.5199120371113501, + "grad_norm": 0.610212504863739, + "learning_rate": 4.212299561157783e-05, + "loss": 0.7594, + "step": 32390 + }, + { + "epoch": 0.5200725533315141, + "grad_norm": 0.6718541979789734, + "learning_rate": 4.211840151464273e-05, + "loss": 0.8157, + "step": 32400 + }, + { + "epoch": 0.5202330695516783, + "grad_norm": 0.7326704263687134, + "learning_rate": 4.2113806329077755e-05, + "loss": 0.7324, + "step": 32410 + }, + { + "epoch": 0.5203935857718422, + "grad_norm": 0.6388335227966309, + "learning_rate": 4.210921005517515e-05, + "loss": 0.6584, + "step": 32420 + }, + { + "epoch": 0.5205541019920062, + "grad_norm": 0.813713788986206, + "learning_rate": 4.210461269322722e-05, + "loss": 0.7102, + "step": 32430 + }, + { + "epoch": 0.5207146182121704, + "grad_norm": 0.7218260169029236, + "learning_rate": 4.21000142435263e-05, + "loss": 0.862, + "step": 32440 + }, + { + "epoch": 0.5208751344323344, + "grad_norm": 0.6985005140304565, + "learning_rate": 4.209541470636485e-05, + "loss": 0.7336, + "step": 32450 + }, + { + "epoch": 0.5210356506524985, + "grad_norm": 0.47204655408859253, + "learning_rate": 4.209081408203537e-05, + "loss": 0.6571, + "step": 32460 + }, + { + "epoch": 0.5211961668726625, + "grad_norm": 0.8179554343223572, + "learning_rate": 4.208621237083041e-05, + "loss": 0.8471, + "step": 32470 + }, + { + "epoch": 0.5213566830928266, + "grad_norm": 0.6630221605300903, + "learning_rate": 4.208160957304265e-05, + "loss": 0.8009, + "step": 32480 + }, + { + "epoch": 0.5215171993129906, + "grad_norm": 0.6019934415817261, + "learning_rate": 4.2077005688964765e-05, + "loss": 0.6889, + "step": 32490 + }, + { + "epoch": 0.5216777155331547, + "grad_norm": 1.032981038093567, + "learning_rate": 4.2072400718889555e-05, + "loss": 0.7348, + "step": 32500 + }, + { + "epoch": 0.5218382317533187, + "grad_norm": 1.9597060680389404, + "learning_rate": 4.206779466310986e-05, + "loss": 0.8577, + "step": 32510 + }, + { + "epoch": 0.5219987479734827, + "grad_norm": 0.6442403793334961, + "learning_rate": 4.2063187521918614e-05, + "loss": 0.7428, + "step": 32520 + }, + { + "epoch": 0.5221592641936468, + "grad_norm": 0.5207464694976807, + "learning_rate": 4.205857929560878e-05, + "loss": 0.6865, + "step": 32530 + }, + { + "epoch": 0.5223197804138108, + "grad_norm": 0.8052921295166016, + "learning_rate": 4.205396998447343e-05, + "loss": 0.7402, + "step": 32540 + }, + { + "epoch": 0.5224802966339749, + "grad_norm": 0.7850711941719055, + "learning_rate": 4.204935958880569e-05, + "loss": 0.6988, + "step": 32550 + }, + { + "epoch": 0.5226408128541389, + "grad_norm": 0.9929980635643005, + "learning_rate": 4.204474810889875e-05, + "loss": 0.7107, + "step": 32560 + }, + { + "epoch": 0.522801329074303, + "grad_norm": 0.8615232110023499, + "learning_rate": 4.2040135545045854e-05, + "loss": 0.6912, + "step": 32570 + }, + { + "epoch": 0.522961845294467, + "grad_norm": 0.6796423196792603, + "learning_rate": 4.2035521897540375e-05, + "loss": 0.7934, + "step": 32580 + }, + { + "epoch": 0.5231223615146311, + "grad_norm": 0.7275878190994263, + "learning_rate": 4.2030907166675684e-05, + "loss": 0.7244, + "step": 32590 + }, + { + "epoch": 0.5232828777347951, + "grad_norm": 0.6414124369621277, + "learning_rate": 4.202629135274526e-05, + "loss": 0.7143, + "step": 32600 + }, + { + "epoch": 0.5234433939549592, + "grad_norm": 0.7687493562698364, + "learning_rate": 4.2021674456042634e-05, + "loss": 0.6961, + "step": 32610 + }, + { + "epoch": 0.5236039101751232, + "grad_norm": 3.997098684310913, + "learning_rate": 4.201705647686143e-05, + "loss": 0.7811, + "step": 32620 + }, + { + "epoch": 0.5237644263952872, + "grad_norm": 2.812927007675171, + "learning_rate": 4.201243741549531e-05, + "loss": 0.7189, + "step": 32630 + }, + { + "epoch": 0.5239249426154513, + "grad_norm": 0.639051616191864, + "learning_rate": 4.2007817272238017e-05, + "loss": 0.7878, + "step": 32640 + }, + { + "epoch": 0.5240854588356153, + "grad_norm": 0.6656807065010071, + "learning_rate": 4.2003196047383376e-05, + "loss": 0.9898, + "step": 32650 + }, + { + "epoch": 0.5242459750557794, + "grad_norm": 0.6485801935195923, + "learning_rate": 4.1998573741225265e-05, + "loss": 0.8579, + "step": 32660 + }, + { + "epoch": 0.5244064912759434, + "grad_norm": 1.228444218635559, + "learning_rate": 4.199395035405763e-05, + "loss": 0.7175, + "step": 32670 + }, + { + "epoch": 0.5245670074961075, + "grad_norm": 0.7228076457977295, + "learning_rate": 4.198932588617451e-05, + "loss": 0.6976, + "step": 32680 + }, + { + "epoch": 0.5247275237162715, + "grad_norm": 0.6841951012611389, + "learning_rate": 4.1984700337869975e-05, + "loss": 0.7978, + "step": 32690 + }, + { + "epoch": 0.5248880399364356, + "grad_norm": 0.8914342522621155, + "learning_rate": 4.1980073709438185e-05, + "loss": 0.8205, + "step": 32700 + }, + { + "epoch": 0.5250485561565996, + "grad_norm": 1.2906630039215088, + "learning_rate": 4.197544600117337e-05, + "loss": 0.7968, + "step": 32710 + }, + { + "epoch": 0.5252090723767636, + "grad_norm": 0.6466856598854065, + "learning_rate": 4.197081721336983e-05, + "loss": 0.82, + "step": 32720 + }, + { + "epoch": 0.5253695885969277, + "grad_norm": 0.9564992785453796, + "learning_rate": 4.196618734632192e-05, + "loss": 0.8645, + "step": 32730 + }, + { + "epoch": 0.5255301048170917, + "grad_norm": 0.5533783435821533, + "learning_rate": 4.1961556400324076e-05, + "loss": 0.8033, + "step": 32740 + }, + { + "epoch": 0.5256906210372558, + "grad_norm": 0.6107707023620605, + "learning_rate": 4.1956924375670805e-05, + "loss": 0.7208, + "step": 32750 + }, + { + "epoch": 0.5258511372574198, + "grad_norm": 0.7974788546562195, + "learning_rate": 4.1952291272656663e-05, + "loss": 0.9411, + "step": 32760 + }, + { + "epoch": 0.5260116534775839, + "grad_norm": 0.6421275734901428, + "learning_rate": 4.19476570915763e-05, + "loss": 0.7225, + "step": 32770 + }, + { + "epoch": 0.5261721696977479, + "grad_norm": 0.8147570490837097, + "learning_rate": 4.194302183272442e-05, + "loss": 0.6826, + "step": 32780 + }, + { + "epoch": 0.526332685917912, + "grad_norm": 0.6522143483161926, + "learning_rate": 4.193838549639579e-05, + "loss": 0.829, + "step": 32790 + }, + { + "epoch": 0.526493202138076, + "grad_norm": 0.8373857736587524, + "learning_rate": 4.193374808288526e-05, + "loss": 0.8474, + "step": 32800 + }, + { + "epoch": 0.5266537183582402, + "grad_norm": 1.4037081003189087, + "learning_rate": 4.192910959248775e-05, + "loss": 0.6551, + "step": 32810 + }, + { + "epoch": 0.5268142345784042, + "grad_norm": 0.8050301671028137, + "learning_rate": 4.192447002549823e-05, + "loss": 0.6785, + "step": 32820 + }, + { + "epoch": 0.5269747507985681, + "grad_norm": 0.7416819334030151, + "learning_rate": 4.191982938221176e-05, + "loss": 0.8163, + "step": 32830 + }, + { + "epoch": 0.5271352670187323, + "grad_norm": 0.7565349340438843, + "learning_rate": 4.1915187662923444e-05, + "loss": 0.8742, + "step": 32840 + }, + { + "epoch": 0.5272957832388963, + "grad_norm": 0.4744003415107727, + "learning_rate": 4.1910544867928476e-05, + "loss": 0.676, + "step": 32850 + }, + { + "epoch": 0.5274562994590604, + "grad_norm": 1.5922305583953857, + "learning_rate": 4.190590099752211e-05, + "loss": 0.7974, + "step": 32860 + }, + { + "epoch": 0.5276168156792244, + "grad_norm": 0.6926668882369995, + "learning_rate": 4.190125605199967e-05, + "loss": 0.8163, + "step": 32870 + }, + { + "epoch": 0.5277773318993885, + "grad_norm": 0.9570972919464111, + "learning_rate": 4.189661003165654e-05, + "loss": 0.8241, + "step": 32880 + }, + { + "epoch": 0.5279378481195525, + "grad_norm": 0.7233933210372925, + "learning_rate": 4.1891962936788184e-05, + "loss": 0.7066, + "step": 32890 + }, + { + "epoch": 0.5280983643397166, + "grad_norm": 1.2809197902679443, + "learning_rate": 4.188731476769014e-05, + "loss": 0.6935, + "step": 32900 + }, + { + "epoch": 0.5282588805598806, + "grad_norm": 0.7466334700584412, + "learning_rate": 4.188266552465799e-05, + "loss": 0.7186, + "step": 32910 + }, + { + "epoch": 0.5284193967800446, + "grad_norm": 1.2174291610717773, + "learning_rate": 4.18780152079874e-05, + "loss": 0.7297, + "step": 32920 + }, + { + "epoch": 0.5285799130002087, + "grad_norm": 0.7874540686607361, + "learning_rate": 4.187336381797411e-05, + "loss": 0.785, + "step": 32930 + }, + { + "epoch": 0.5287404292203727, + "grad_norm": 0.8389835357666016, + "learning_rate": 4.186871135491392e-05, + "loss": 0.8085, + "step": 32940 + }, + { + "epoch": 0.5289009454405368, + "grad_norm": 0.5677398443222046, + "learning_rate": 4.18640578191027e-05, + "loss": 0.8847, + "step": 32950 + }, + { + "epoch": 0.5290614616607008, + "grad_norm": 2.146815061569214, + "learning_rate": 4.1859403210836376e-05, + "loss": 0.8069, + "step": 32960 + }, + { + "epoch": 0.5292219778808649, + "grad_norm": 0.8241454362869263, + "learning_rate": 4.185474753041097e-05, + "loss": 0.7539, + "step": 32970 + }, + { + "epoch": 0.5293824941010289, + "grad_norm": 1.1075246334075928, + "learning_rate": 4.185009077812254e-05, + "loss": 0.7563, + "step": 32980 + }, + { + "epoch": 0.529543010321193, + "grad_norm": 0.8522462844848633, + "learning_rate": 4.1845432954267235e-05, + "loss": 0.6846, + "step": 32990 + }, + { + "epoch": 0.529703526541357, + "grad_norm": 0.7734183669090271, + "learning_rate": 4.184077405914128e-05, + "loss": 0.7768, + "step": 33000 + }, + { + "epoch": 0.5298640427615211, + "grad_norm": 0.7622288465499878, + "learning_rate": 4.1836114093040934e-05, + "loss": 0.7755, + "step": 33010 + }, + { + "epoch": 0.5300245589816851, + "grad_norm": 1.1768383979797363, + "learning_rate": 4.183145305626255e-05, + "loss": 0.7572, + "step": 33020 + }, + { + "epoch": 0.5301850752018491, + "grad_norm": 0.8353914022445679, + "learning_rate": 4.182679094910254e-05, + "loss": 0.7815, + "step": 33030 + }, + { + "epoch": 0.5303455914220132, + "grad_norm": 0.6166107058525085, + "learning_rate": 4.182212777185739e-05, + "loss": 0.8543, + "step": 33040 + }, + { + "epoch": 0.5305061076421772, + "grad_norm": 0.9405189752578735, + "learning_rate": 4.181746352482365e-05, + "loss": 0.7904, + "step": 33050 + }, + { + "epoch": 0.5306666238623413, + "grad_norm": 1.096293568611145, + "learning_rate": 4.181279820829794e-05, + "loss": 0.727, + "step": 33060 + }, + { + "epoch": 0.5308271400825053, + "grad_norm": 0.6748653650283813, + "learning_rate": 4.180813182257693e-05, + "loss": 0.8903, + "step": 33070 + }, + { + "epoch": 0.5309876563026694, + "grad_norm": 0.6751288771629333, + "learning_rate": 4.180346436795741e-05, + "loss": 0.7507, + "step": 33080 + }, + { + "epoch": 0.5311481725228334, + "grad_norm": 1.4667162895202637, + "learning_rate": 4.179879584473618e-05, + "loss": 0.7321, + "step": 33090 + }, + { + "epoch": 0.5313086887429975, + "grad_norm": 0.7877492904663086, + "learning_rate": 4.179412625321012e-05, + "loss": 0.7696, + "step": 33100 + }, + { + "epoch": 0.5314692049631615, + "grad_norm": 0.979097843170166, + "learning_rate": 4.178945559367622e-05, + "loss": 0.7677, + "step": 33110 + }, + { + "epoch": 0.5316297211833255, + "grad_norm": 0.6016647815704346, + "learning_rate": 4.178478386643148e-05, + "loss": 0.7131, + "step": 33120 + }, + { + "epoch": 0.5317902374034896, + "grad_norm": 0.4882412850856781, + "learning_rate": 4.178011107177299e-05, + "loss": 0.8421, + "step": 33130 + }, + { + "epoch": 0.5319507536236536, + "grad_norm": 0.7520220875740051, + "learning_rate": 4.177543720999795e-05, + "loss": 0.6767, + "step": 33140 + }, + { + "epoch": 0.5321112698438177, + "grad_norm": 0.7596615552902222, + "learning_rate": 4.177076228140354e-05, + "loss": 0.7558, + "step": 33150 + }, + { + "epoch": 0.5322717860639817, + "grad_norm": 1.1586612462997437, + "learning_rate": 4.1766086286287095e-05, + "loss": 0.7834, + "step": 33160 + }, + { + "epoch": 0.5324323022841458, + "grad_norm": 0.7505896687507629, + "learning_rate": 4.176140922494597e-05, + "loss": 0.7553, + "step": 33170 + }, + { + "epoch": 0.5325928185043098, + "grad_norm": 0.7300997376441956, + "learning_rate": 4.17567310976776e-05, + "loss": 0.8169, + "step": 33180 + }, + { + "epoch": 0.532753334724474, + "grad_norm": 0.9721516966819763, + "learning_rate": 4.1752051904779475e-05, + "loss": 0.7478, + "step": 33190 + }, + { + "epoch": 0.532913850944638, + "grad_norm": 0.517260730266571, + "learning_rate": 4.174737164654918e-05, + "loss": 0.8044, + "step": 33200 + }, + { + "epoch": 0.533074367164802, + "grad_norm": 0.6487359404563904, + "learning_rate": 4.174269032328435e-05, + "loss": 0.7319, + "step": 33210 + }, + { + "epoch": 0.533234883384966, + "grad_norm": 0.6028271317481995, + "learning_rate": 4.173800793528268e-05, + "loss": 0.7933, + "step": 33220 + }, + { + "epoch": 0.53339539960513, + "grad_norm": 0.7288081645965576, + "learning_rate": 4.1733324482841944e-05, + "loss": 0.8012, + "step": 33230 + }, + { + "epoch": 0.5335559158252942, + "grad_norm": 1.3787254095077515, + "learning_rate": 4.172863996625999e-05, + "loss": 0.8005, + "step": 33240 + }, + { + "epoch": 0.5337164320454582, + "grad_norm": 0.8908776640892029, + "learning_rate": 4.1723954385834725e-05, + "loss": 0.7197, + "step": 33250 + }, + { + "epoch": 0.5338769482656223, + "grad_norm": 0.932698130607605, + "learning_rate": 4.171926774186411e-05, + "loss": 0.85, + "step": 33260 + }, + { + "epoch": 0.5340374644857863, + "grad_norm": 0.4542863070964813, + "learning_rate": 4.171458003464621e-05, + "loss": 0.6717, + "step": 33270 + }, + { + "epoch": 0.5341979807059504, + "grad_norm": 0.8011603951454163, + "learning_rate": 4.170989126447912e-05, + "loss": 0.7209, + "step": 33280 + }, + { + "epoch": 0.5343584969261144, + "grad_norm": 0.7942695021629333, + "learning_rate": 4.170520143166102e-05, + "loss": 0.7463, + "step": 33290 + }, + { + "epoch": 0.5345190131462785, + "grad_norm": 0.6266053318977356, + "learning_rate": 4.170051053649016e-05, + "loss": 0.819, + "step": 33300 + }, + { + "epoch": 0.5346795293664425, + "grad_norm": 1.539284586906433, + "learning_rate": 4.169581857926486e-05, + "loss": 0.837, + "step": 33310 + }, + { + "epoch": 0.5348400455866065, + "grad_norm": 0.518083393573761, + "learning_rate": 4.1691125560283475e-05, + "loss": 0.768, + "step": 33320 + }, + { + "epoch": 0.5350005618067706, + "grad_norm": 1.2952311038970947, + "learning_rate": 4.168643147984448e-05, + "loss": 0.8219, + "step": 33330 + }, + { + "epoch": 0.5351610780269346, + "grad_norm": 0.761269211769104, + "learning_rate": 4.168173633824639e-05, + "loss": 0.6577, + "step": 33340 + }, + { + "epoch": 0.5353215942470987, + "grad_norm": 0.6563074588775635, + "learning_rate": 4.167704013578777e-05, + "loss": 0.7426, + "step": 33350 + }, + { + "epoch": 0.5354821104672627, + "grad_norm": 0.7652223706245422, + "learning_rate": 4.167234287276729e-05, + "loss": 0.7963, + "step": 33360 + }, + { + "epoch": 0.5356426266874268, + "grad_norm": 0.923707127571106, + "learning_rate": 4.166764454948365e-05, + "loss": 0.8595, + "step": 33370 + }, + { + "epoch": 0.5358031429075908, + "grad_norm": 0.4496426582336426, + "learning_rate": 4.166294516623566e-05, + "loss": 0.7011, + "step": 33380 + }, + { + "epoch": 0.5359636591277549, + "grad_norm": 0.7854878306388855, + "learning_rate": 4.1658244723322146e-05, + "loss": 0.6685, + "step": 33390 + }, + { + "epoch": 0.5361241753479189, + "grad_norm": 0.6694662570953369, + "learning_rate": 4.1653543221042044e-05, + "loss": 0.8127, + "step": 33400 + }, + { + "epoch": 0.536284691568083, + "grad_norm": 0.6011378169059753, + "learning_rate": 4.164884065969434e-05, + "loss": 0.8671, + "step": 33410 + }, + { + "epoch": 0.536445207788247, + "grad_norm": 0.938650906085968, + "learning_rate": 4.16441370395781e-05, + "loss": 0.8611, + "step": 33420 + }, + { + "epoch": 0.536605724008411, + "grad_norm": 0.7139781713485718, + "learning_rate": 4.1639432360992426e-05, + "loss": 0.7871, + "step": 33430 + }, + { + "epoch": 0.5367662402285751, + "grad_norm": 1.0984851121902466, + "learning_rate": 4.163472662423652e-05, + "loss": 0.8489, + "step": 33440 + }, + { + "epoch": 0.5369267564487391, + "grad_norm": 0.5331991910934448, + "learning_rate": 4.163001982960962e-05, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 0.5370872726689032, + "grad_norm": 0.7238531708717346, + "learning_rate": 4.162531197741109e-05, + "loss": 0.7541, + "step": 33460 + }, + { + "epoch": 0.5372477888890672, + "grad_norm": 0.6373295783996582, + "learning_rate": 4.162060306794029e-05, + "loss": 0.7333, + "step": 33470 + }, + { + "epoch": 0.5374083051092313, + "grad_norm": 0.7363165616989136, + "learning_rate": 4.161589310149668e-05, + "loss": 0.7491, + "step": 33480 + }, + { + "epoch": 0.5375688213293953, + "grad_norm": 0.8164400458335876, + "learning_rate": 4.161118207837981e-05, + "loss": 0.8473, + "step": 33490 + }, + { + "epoch": 0.5377293375495594, + "grad_norm": 0.5919418931007385, + "learning_rate": 4.1606469998889255e-05, + "loss": 0.6516, + "step": 33500 + }, + { + "epoch": 0.5378898537697234, + "grad_norm": 0.5232661366462708, + "learning_rate": 4.160175686332467e-05, + "loss": 0.8531, + "step": 33510 + }, + { + "epoch": 0.5380503699898875, + "grad_norm": 0.945315957069397, + "learning_rate": 4.15970426719858e-05, + "loss": 0.803, + "step": 33520 + }, + { + "epoch": 0.5382108862100515, + "grad_norm": 0.8264966011047363, + "learning_rate": 4.1592327425172415e-05, + "loss": 0.8237, + "step": 33530 + }, + { + "epoch": 0.5383714024302155, + "grad_norm": 0.6652306318283081, + "learning_rate": 4.1587611123184406e-05, + "loss": 0.8116, + "step": 33540 + }, + { + "epoch": 0.5385319186503796, + "grad_norm": 0.80149906873703, + "learning_rate": 4.1582893766321684e-05, + "loss": 0.7313, + "step": 33550 + }, + { + "epoch": 0.5386924348705436, + "grad_norm": 0.6538906097412109, + "learning_rate": 4.157817535488425e-05, + "loss": 0.7294, + "step": 33560 + }, + { + "epoch": 0.5388529510907077, + "grad_norm": 0.8926109671592712, + "learning_rate": 4.1573455889172174e-05, + "loss": 0.8294, + "step": 33570 + }, + { + "epoch": 0.5390134673108717, + "grad_norm": 0.9933943152427673, + "learning_rate": 4.156873536948558e-05, + "loss": 0.8027, + "step": 33580 + }, + { + "epoch": 0.5391739835310358, + "grad_norm": 0.7277526259422302, + "learning_rate": 4.1564013796124654e-05, + "loss": 0.7873, + "step": 33590 + }, + { + "epoch": 0.5393344997511998, + "grad_norm": 0.5975390672683716, + "learning_rate": 4.155929116938968e-05, + "loss": 0.8538, + "step": 33600 + }, + { + "epoch": 0.539495015971364, + "grad_norm": 0.8460789918899536, + "learning_rate": 4.155456748958098e-05, + "loss": 0.8614, + "step": 33610 + }, + { + "epoch": 0.539655532191528, + "grad_norm": 0.7842246890068054, + "learning_rate": 4.1549842756998944e-05, + "loss": 0.7788, + "step": 33620 + }, + { + "epoch": 0.539816048411692, + "grad_norm": 0.6007382869720459, + "learning_rate": 4.1545116971944056e-05, + "loss": 0.6954, + "step": 33630 + }, + { + "epoch": 0.5399765646318561, + "grad_norm": 0.7254155278205872, + "learning_rate": 4.154039013471684e-05, + "loss": 0.804, + "step": 33640 + }, + { + "epoch": 0.5401370808520201, + "grad_norm": 0.6683213114738464, + "learning_rate": 4.153566224561789e-05, + "loss": 0.6619, + "step": 33650 + }, + { + "epoch": 0.5402975970721842, + "grad_norm": 0.7764713168144226, + "learning_rate": 4.153093330494787e-05, + "loss": 0.8814, + "step": 33660 + }, + { + "epoch": 0.5404581132923482, + "grad_norm": 0.8366062641143799, + "learning_rate": 4.152620331300753e-05, + "loss": 0.7956, + "step": 33670 + }, + { + "epoch": 0.5406186295125123, + "grad_norm": 0.681605339050293, + "learning_rate": 4.152147227009765e-05, + "loss": 0.7588, + "step": 33680 + }, + { + "epoch": 0.5407791457326763, + "grad_norm": 0.820444643497467, + "learning_rate": 4.151674017651911e-05, + "loss": 0.8285, + "step": 33690 + }, + { + "epoch": 0.5409396619528404, + "grad_norm": 0.7192450761795044, + "learning_rate": 4.151200703257284e-05, + "loss": 0.7357, + "step": 33700 + }, + { + "epoch": 0.5411001781730044, + "grad_norm": 1.0289722681045532, + "learning_rate": 4.150727283855983e-05, + "loss": 0.9474, + "step": 33710 + }, + { + "epoch": 0.5412606943931685, + "grad_norm": 0.5375916957855225, + "learning_rate": 4.150253759478116e-05, + "loss": 0.709, + "step": 33720 + }, + { + "epoch": 0.5414212106133325, + "grad_norm": 0.5887445211410522, + "learning_rate": 4.149780130153795e-05, + "loss": 0.6977, + "step": 33730 + }, + { + "epoch": 0.5415817268334965, + "grad_norm": 0.8859736323356628, + "learning_rate": 4.149306395913143e-05, + "loss": 0.8188, + "step": 33740 + }, + { + "epoch": 0.5417422430536606, + "grad_norm": 0.8983931541442871, + "learning_rate": 4.148832556786283e-05, + "loss": 0.6311, + "step": 33750 + }, + { + "epoch": 0.5419027592738246, + "grad_norm": 1.0813406705856323, + "learning_rate": 4.14835861280335e-05, + "loss": 0.8347, + "step": 33760 + }, + { + "epoch": 0.5420632754939887, + "grad_norm": 0.561225950717926, + "learning_rate": 4.1478845639944853e-05, + "loss": 0.8556, + "step": 33770 + }, + { + "epoch": 0.5422237917141527, + "grad_norm": 0.7689872980117798, + "learning_rate": 4.1474104103898336e-05, + "loss": 0.8524, + "step": 33780 + }, + { + "epoch": 0.5423843079343168, + "grad_norm": 0.7003741264343262, + "learning_rate": 4.146936152019549e-05, + "loss": 0.9608, + "step": 33790 + }, + { + "epoch": 0.5425448241544808, + "grad_norm": 0.6603992581367493, + "learning_rate": 4.146461788913793e-05, + "loss": 0.6916, + "step": 33800 + }, + { + "epoch": 0.5427053403746449, + "grad_norm": 0.6627947092056274, + "learning_rate": 4.145987321102729e-05, + "loss": 0.76, + "step": 33810 + }, + { + "epoch": 0.5428658565948089, + "grad_norm": 0.9388800263404846, + "learning_rate": 4.145512748616534e-05, + "loss": 0.8804, + "step": 33820 + }, + { + "epoch": 0.5430263728149729, + "grad_norm": 1.309693455696106, + "learning_rate": 4.145038071485385e-05, + "loss": 0.8143, + "step": 33830 + }, + { + "epoch": 0.543186889035137, + "grad_norm": 0.6096706986427307, + "learning_rate": 4.14456328973947e-05, + "loss": 0.8019, + "step": 33840 + }, + { + "epoch": 0.543347405255301, + "grad_norm": 0.5230737328529358, + "learning_rate": 4.144088403408983e-05, + "loss": 0.8657, + "step": 33850 + }, + { + "epoch": 0.5435079214754651, + "grad_norm": 0.7134584188461304, + "learning_rate": 4.1436134125241235e-05, + "loss": 0.9368, + "step": 33860 + }, + { + "epoch": 0.5436684376956291, + "grad_norm": 0.8122164607048035, + "learning_rate": 4.143138317115098e-05, + "loss": 0.7955, + "step": 33870 + }, + { + "epoch": 0.5438289539157932, + "grad_norm": 0.9274346232414246, + "learning_rate": 4.14266311721212e-05, + "loss": 0.8049, + "step": 33880 + }, + { + "epoch": 0.5439894701359572, + "grad_norm": 0.473124623298645, + "learning_rate": 4.142187812845409e-05, + "loss": 0.7954, + "step": 33890 + }, + { + "epoch": 0.5441499863561213, + "grad_norm": 0.5776810050010681, + "learning_rate": 4.1417124040451914e-05, + "loss": 0.8288, + "step": 33900 + }, + { + "epoch": 0.5443105025762853, + "grad_norm": 0.7850297093391418, + "learning_rate": 4.141236890841701e-05, + "loss": 0.7221, + "step": 33910 + }, + { + "epoch": 0.5444710187964494, + "grad_norm": 0.7124345302581787, + "learning_rate": 4.140761273265178e-05, + "loss": 0.7639, + "step": 33920 + }, + { + "epoch": 0.5446315350166134, + "grad_norm": 0.776951014995575, + "learning_rate": 4.1402855513458674e-05, + "loss": 0.8711, + "step": 33930 + }, + { + "epoch": 0.5447920512367774, + "grad_norm": 1.1084684133529663, + "learning_rate": 4.139809725114023e-05, + "loss": 0.7399, + "step": 33940 + }, + { + "epoch": 0.5449525674569415, + "grad_norm": 0.5762219429016113, + "learning_rate": 4.139333794599906e-05, + "loss": 0.8156, + "step": 33950 + }, + { + "epoch": 0.5451130836771055, + "grad_norm": 0.8049226999282837, + "learning_rate": 4.138857759833781e-05, + "loss": 0.7508, + "step": 33960 + }, + { + "epoch": 0.5452735998972696, + "grad_norm": 0.6092366576194763, + "learning_rate": 4.138381620845921e-05, + "loss": 0.7708, + "step": 33970 + }, + { + "epoch": 0.5454341161174336, + "grad_norm": 0.9193657636642456, + "learning_rate": 4.1379053776666075e-05, + "loss": 0.7436, + "step": 33980 + }, + { + "epoch": 0.5455946323375978, + "grad_norm": 0.6834642291069031, + "learning_rate": 4.137429030326124e-05, + "loss": 0.8574, + "step": 33990 + }, + { + "epoch": 0.5457551485577617, + "grad_norm": 0.6989868879318237, + "learning_rate": 4.136952578854766e-05, + "loss": 0.8412, + "step": 34000 + }, + { + "epoch": 0.5459156647779259, + "grad_norm": 0.8961001038551331, + "learning_rate": 4.1364760232828315e-05, + "loss": 0.8623, + "step": 34010 + }, + { + "epoch": 0.5460761809980899, + "grad_norm": 0.4630860388278961, + "learning_rate": 4.1359993636406277e-05, + "loss": 0.8736, + "step": 34020 + }, + { + "epoch": 0.5462366972182539, + "grad_norm": 0.8421053290367126, + "learning_rate": 4.135522599958466e-05, + "loss": 0.7445, + "step": 34030 + }, + { + "epoch": 0.546397213438418, + "grad_norm": 0.5906469821929932, + "learning_rate": 4.135045732266666e-05, + "loss": 0.9089, + "step": 34040 + }, + { + "epoch": 0.546557729658582, + "grad_norm": 0.9692172408103943, + "learning_rate": 4.134568760595555e-05, + "loss": 0.7872, + "step": 34050 + }, + { + "epoch": 0.5467182458787461, + "grad_norm": 1.3541032075881958, + "learning_rate": 4.1340916849754653e-05, + "loss": 0.6245, + "step": 34060 + }, + { + "epoch": 0.5468787620989101, + "grad_norm": 0.6540245413780212, + "learning_rate": 4.133614505436734e-05, + "loss": 0.7918, + "step": 34070 + }, + { + "epoch": 0.5470392783190742, + "grad_norm": 0.5179737210273743, + "learning_rate": 4.13313722200971e-05, + "loss": 0.7967, + "step": 34080 + }, + { + "epoch": 0.5471997945392382, + "grad_norm": 0.5116573572158813, + "learning_rate": 4.132659834724744e-05, + "loss": 0.9153, + "step": 34090 + }, + { + "epoch": 0.5473603107594023, + "grad_norm": 0.6793535351753235, + "learning_rate": 4.132182343612194e-05, + "loss": 0.8568, + "step": 34100 + }, + { + "epoch": 0.5475208269795663, + "grad_norm": 0.5629017353057861, + "learning_rate": 4.1317047487024286e-05, + "loss": 0.8899, + "step": 34110 + }, + { + "epoch": 0.5476813431997304, + "grad_norm": 0.6273512244224548, + "learning_rate": 4.131227050025817e-05, + "loss": 0.868, + "step": 34120 + }, + { + "epoch": 0.5478418594198944, + "grad_norm": 0.7909152507781982, + "learning_rate": 4.13074924761274e-05, + "loss": 0.7497, + "step": 34130 + }, + { + "epoch": 0.5480023756400584, + "grad_norm": 0.6701558828353882, + "learning_rate": 4.130271341493582e-05, + "loss": 0.7273, + "step": 34140 + }, + { + "epoch": 0.5481628918602225, + "grad_norm": 0.7887182235717773, + "learning_rate": 4.1297933316987356e-05, + "loss": 0.8841, + "step": 34150 + }, + { + "epoch": 0.5483234080803865, + "grad_norm": 0.7982737421989441, + "learning_rate": 4.129315218258599e-05, + "loss": 0.7501, + "step": 34160 + }, + { + "epoch": 0.5484839243005506, + "grad_norm": 0.6284668445587158, + "learning_rate": 4.128837001203578e-05, + "loss": 0.7544, + "step": 34170 + }, + { + "epoch": 0.5486444405207146, + "grad_norm": 0.656151533126831, + "learning_rate": 4.128358680564084e-05, + "loss": 0.7342, + "step": 34180 + }, + { + "epoch": 0.5488049567408787, + "grad_norm": 0.5014504790306091, + "learning_rate": 4.1278802563705353e-05, + "loss": 0.7573, + "step": 34190 + }, + { + "epoch": 0.5489654729610427, + "grad_norm": 0.7546064853668213, + "learning_rate": 4.127401728653357e-05, + "loss": 0.805, + "step": 34200 + }, + { + "epoch": 0.5491259891812068, + "grad_norm": 0.7332468032836914, + "learning_rate": 4.126923097442981e-05, + "loss": 0.8952, + "step": 34210 + }, + { + "epoch": 0.5492865054013708, + "grad_norm": 0.5509406924247742, + "learning_rate": 4.126444362769844e-05, + "loss": 0.6797, + "step": 34220 + }, + { + "epoch": 0.5494470216215348, + "grad_norm": 0.5889402031898499, + "learning_rate": 4.125965524664392e-05, + "loss": 0.8392, + "step": 34230 + }, + { + "epoch": 0.5496075378416989, + "grad_norm": 0.6074414849281311, + "learning_rate": 4.125486583157077e-05, + "loss": 0.8307, + "step": 34240 + }, + { + "epoch": 0.5497680540618629, + "grad_norm": 0.5884202122688293, + "learning_rate": 4.125007538278356e-05, + "loss": 0.7128, + "step": 34250 + }, + { + "epoch": 0.549928570282027, + "grad_norm": 0.7446367144584656, + "learning_rate": 4.124528390058693e-05, + "loss": 0.7333, + "step": 34260 + }, + { + "epoch": 0.550089086502191, + "grad_norm": 0.6684912443161011, + "learning_rate": 4.124049138528559e-05, + "loss": 0.799, + "step": 34270 + }, + { + "epoch": 0.5502496027223551, + "grad_norm": 1.0224424600601196, + "learning_rate": 4.123569783718432e-05, + "loss": 0.7658, + "step": 34280 + }, + { + "epoch": 0.5504101189425191, + "grad_norm": 0.5992064476013184, + "learning_rate": 4.123090325658796e-05, + "loss": 0.8992, + "step": 34290 + }, + { + "epoch": 0.5505706351626832, + "grad_norm": 0.9953117966651917, + "learning_rate": 4.1226107643801426e-05, + "loss": 0.6968, + "step": 34300 + }, + { + "epoch": 0.5507311513828472, + "grad_norm": 0.9961788058280945, + "learning_rate": 4.122131099912968e-05, + "loss": 0.7082, + "step": 34310 + }, + { + "epoch": 0.5508916676030113, + "grad_norm": 1.198551058769226, + "learning_rate": 4.121651332287777e-05, + "loss": 0.8208, + "step": 34320 + }, + { + "epoch": 0.5510521838231753, + "grad_norm": 0.5073196291923523, + "learning_rate": 4.121171461535078e-05, + "loss": 0.8166, + "step": 34330 + }, + { + "epoch": 0.5512127000433393, + "grad_norm": 0.929816484451294, + "learning_rate": 4.1206914876853906e-05, + "loss": 0.7827, + "step": 34340 + }, + { + "epoch": 0.5513732162635034, + "grad_norm": 0.8879731297492981, + "learning_rate": 4.120211410769237e-05, + "loss": 0.771, + "step": 34350 + }, + { + "epoch": 0.5515337324836674, + "grad_norm": 0.9742740392684937, + "learning_rate": 4.119731230817146e-05, + "loss": 0.7367, + "step": 34360 + }, + { + "epoch": 0.5516942487038315, + "grad_norm": 0.6395083665847778, + "learning_rate": 4.1192509478596574e-05, + "loss": 0.7233, + "step": 34370 + }, + { + "epoch": 0.5518547649239955, + "grad_norm": 0.5449427962303162, + "learning_rate": 4.118770561927311e-05, + "loss": 0.8984, + "step": 34380 + }, + { + "epoch": 0.5520152811441597, + "grad_norm": 0.5121623277664185, + "learning_rate": 4.11829007305066e-05, + "loss": 0.7519, + "step": 34390 + }, + { + "epoch": 0.5521757973643237, + "grad_norm": 1.1930886507034302, + "learning_rate": 4.117809481260257e-05, + "loss": 0.845, + "step": 34400 + }, + { + "epoch": 0.5523363135844878, + "grad_norm": 0.5388491749763489, + "learning_rate": 4.117328786586667e-05, + "loss": 0.841, + "step": 34410 + }, + { + "epoch": 0.5524968298046518, + "grad_norm": 0.6218625903129578, + "learning_rate": 4.116847989060459e-05, + "loss": 0.8829, + "step": 34420 + }, + { + "epoch": 0.5526573460248158, + "grad_norm": 0.5587846040725708, + "learning_rate": 4.116367088712209e-05, + "loss": 0.916, + "step": 34430 + }, + { + "epoch": 0.5528178622449799, + "grad_norm": 0.7198716998100281, + "learning_rate": 4.115886085572499e-05, + "loss": 0.7196, + "step": 34440 + }, + { + "epoch": 0.5529783784651439, + "grad_norm": 0.671875, + "learning_rate": 4.1154049796719183e-05, + "loss": 0.7069, + "step": 34450 + }, + { + "epoch": 0.553138894685308, + "grad_norm": 0.6865593791007996, + "learning_rate": 4.114923771041063e-05, + "loss": 0.7248, + "step": 34460 + }, + { + "epoch": 0.553299410905472, + "grad_norm": 0.7356981039047241, + "learning_rate": 4.114442459710534e-05, + "loss": 0.7969, + "step": 34470 + }, + { + "epoch": 0.5534599271256361, + "grad_norm": 1.807395577430725, + "learning_rate": 4.11396104571094e-05, + "loss": 0.8393, + "step": 34480 + }, + { + "epoch": 0.5536204433458001, + "grad_norm": 0.8109560608863831, + "learning_rate": 4.1134795290728966e-05, + "loss": 0.8049, + "step": 34490 + }, + { + "epoch": 0.5537809595659642, + "grad_norm": 0.6673467755317688, + "learning_rate": 4.112997909827025e-05, + "loss": 0.7874, + "step": 34500 + }, + { + "epoch": 0.5539414757861282, + "grad_norm": 0.7599040269851685, + "learning_rate": 4.1125161880039555e-05, + "loss": 0.7504, + "step": 34510 + }, + { + "epoch": 0.5541019920062923, + "grad_norm": 0.48313093185424805, + "learning_rate": 4.1120343636343196e-05, + "loss": 0.6852, + "step": 34520 + }, + { + "epoch": 0.5542625082264563, + "grad_norm": 0.762444019317627, + "learning_rate": 4.111552436748759e-05, + "loss": 0.6781, + "step": 34530 + }, + { + "epoch": 0.5544230244466203, + "grad_norm": 1.026167392730713, + "learning_rate": 4.1110704073779236e-05, + "loss": 0.8483, + "step": 34540 + }, + { + "epoch": 0.5545835406667844, + "grad_norm": 4.25227165222168, + "learning_rate": 4.110588275552466e-05, + "loss": 0.833, + "step": 34550 + }, + { + "epoch": 0.5547440568869484, + "grad_norm": 0.6271966695785522, + "learning_rate": 4.1101060413030466e-05, + "loss": 0.7977, + "step": 34560 + }, + { + "epoch": 0.5549045731071125, + "grad_norm": 0.8609335422515869, + "learning_rate": 4.109623704660334e-05, + "loss": 0.778, + "step": 34570 + }, + { + "epoch": 0.5550650893272765, + "grad_norm": 0.5632936954498291, + "learning_rate": 4.109141265655002e-05, + "loss": 0.7479, + "step": 34580 + }, + { + "epoch": 0.5552256055474406, + "grad_norm": 0.621466338634491, + "learning_rate": 4.108658724317729e-05, + "loss": 0.702, + "step": 34590 + }, + { + "epoch": 0.5553861217676046, + "grad_norm": 0.6267712712287903, + "learning_rate": 4.108176080679203e-05, + "loss": 0.6951, + "step": 34600 + }, + { + "epoch": 0.5555466379877687, + "grad_norm": 1.8469659090042114, + "learning_rate": 4.107693334770119e-05, + "loss": 0.8509, + "step": 34610 + }, + { + "epoch": 0.5557071542079327, + "grad_norm": 0.7590411305427551, + "learning_rate": 4.107210486621174e-05, + "loss": 0.7595, + "step": 34620 + }, + { + "epoch": 0.5558676704280967, + "grad_norm": 1.0387099981307983, + "learning_rate": 4.106727536263074e-05, + "loss": 0.6651, + "step": 34630 + }, + { + "epoch": 0.5560281866482608, + "grad_norm": 0.8106061220169067, + "learning_rate": 4.1062444837265345e-05, + "loss": 0.9182, + "step": 34640 + }, + { + "epoch": 0.5561887028684248, + "grad_norm": 0.6998714804649353, + "learning_rate": 4.1057613290422736e-05, + "loss": 0.7273, + "step": 34650 + }, + { + "epoch": 0.5563492190885889, + "grad_norm": 1.2504839897155762, + "learning_rate": 4.1052780722410175e-05, + "loss": 0.7622, + "step": 34660 + }, + { + "epoch": 0.5565097353087529, + "grad_norm": 0.5386705994606018, + "learning_rate": 4.104794713353497e-05, + "loss": 0.8531, + "step": 34670 + }, + { + "epoch": 0.556670251528917, + "grad_norm": 0.6393482089042664, + "learning_rate": 4.104311252410453e-05, + "loss": 0.8047, + "step": 34680 + }, + { + "epoch": 0.556830767749081, + "grad_norm": 0.7615498304367065, + "learning_rate": 4.103827689442629e-05, + "loss": 0.7372, + "step": 34690 + }, + { + "epoch": 0.5569912839692451, + "grad_norm": 0.9042619466781616, + "learning_rate": 4.103344024480778e-05, + "loss": 0.7674, + "step": 34700 + }, + { + "epoch": 0.5571518001894091, + "grad_norm": 1.7895212173461914, + "learning_rate": 4.1028602575556564e-05, + "loss": 0.7103, + "step": 34710 + }, + { + "epoch": 0.5573123164095732, + "grad_norm": 0.5806755423545837, + "learning_rate": 4.102376388698031e-05, + "loss": 0.838, + "step": 34720 + }, + { + "epoch": 0.5574728326297372, + "grad_norm": 0.7656424641609192, + "learning_rate": 4.101892417938673e-05, + "loss": 0.7488, + "step": 34730 + }, + { + "epoch": 0.5576333488499012, + "grad_norm": 1.4596829414367676, + "learning_rate": 4.1014083453083586e-05, + "loss": 0.7549, + "step": 34740 + }, + { + "epoch": 0.5577938650700653, + "grad_norm": 1.2473405599594116, + "learning_rate": 4.100924170837873e-05, + "loss": 0.8871, + "step": 34750 + }, + { + "epoch": 0.5579543812902293, + "grad_norm": 0.665269136428833, + "learning_rate": 4.100439894558007e-05, + "loss": 0.7524, + "step": 34760 + }, + { + "epoch": 0.5581148975103934, + "grad_norm": 0.8557600975036621, + "learning_rate": 4.0999555164995564e-05, + "loss": 0.8329, + "step": 34770 + }, + { + "epoch": 0.5582754137305574, + "grad_norm": 0.7234677076339722, + "learning_rate": 4.0994710366933266e-05, + "loss": 0.8099, + "step": 34780 + }, + { + "epoch": 0.5584359299507216, + "grad_norm": 0.7696384191513062, + "learning_rate": 4.098986455170126e-05, + "loss": 0.8089, + "step": 34790 + }, + { + "epoch": 0.5585964461708856, + "grad_norm": 0.5301506519317627, + "learning_rate": 4.0985017719607724e-05, + "loss": 0.7508, + "step": 34800 + }, + { + "epoch": 0.5587569623910497, + "grad_norm": 1.39701509475708, + "learning_rate": 4.098016987096088e-05, + "loss": 0.7813, + "step": 34810 + }, + { + "epoch": 0.5589174786112137, + "grad_norm": 0.7749080061912537, + "learning_rate": 4.0975321006069036e-05, + "loss": 0.7738, + "step": 34820 + }, + { + "epoch": 0.5590779948313777, + "grad_norm": 0.5500766634941101, + "learning_rate": 4.0970471125240534e-05, + "loss": 0.7127, + "step": 34830 + }, + { + "epoch": 0.5592385110515418, + "grad_norm": 0.7680782675743103, + "learning_rate": 4.096562022878381e-05, + "loss": 0.7683, + "step": 34840 + }, + { + "epoch": 0.5593990272717058, + "grad_norm": 0.34369027614593506, + "learning_rate": 4.0960768317007334e-05, + "loss": 0.8147, + "step": 34850 + }, + { + "epoch": 0.5595595434918699, + "grad_norm": 0.6990272402763367, + "learning_rate": 4.095591539021969e-05, + "loss": 0.8006, + "step": 34860 + }, + { + "epoch": 0.5597200597120339, + "grad_norm": 0.8191579580307007, + "learning_rate": 4.0951061448729475e-05, + "loss": 0.8626, + "step": 34870 + }, + { + "epoch": 0.559880575932198, + "grad_norm": 0.6218969821929932, + "learning_rate": 4.0946206492845376e-05, + "loss": 0.8078, + "step": 34880 + }, + { + "epoch": 0.560041092152362, + "grad_norm": 0.7335914373397827, + "learning_rate": 4.094135052287613e-05, + "loss": 0.7243, + "step": 34890 + }, + { + "epoch": 0.5602016083725261, + "grad_norm": 1.027144193649292, + "learning_rate": 4.0936493539130566e-05, + "loss": 0.6833, + "step": 34900 + }, + { + "epoch": 0.5603621245926901, + "grad_norm": 0.7444616556167603, + "learning_rate": 4.0931635541917546e-05, + "loss": 0.7675, + "step": 34910 + }, + { + "epoch": 0.5605226408128542, + "grad_norm": 3.942539930343628, + "learning_rate": 4.0926776531546016e-05, + "loss": 0.8526, + "step": 34920 + }, + { + "epoch": 0.5606831570330182, + "grad_norm": 0.526620626449585, + "learning_rate": 4.092191650832498e-05, + "loss": 0.8043, + "step": 34930 + }, + { + "epoch": 0.5608436732531822, + "grad_norm": 0.6320582032203674, + "learning_rate": 4.09170554725635e-05, + "loss": 0.7741, + "step": 34940 + }, + { + "epoch": 0.5610041894733463, + "grad_norm": 1.5831085443496704, + "learning_rate": 4.091219342457073e-05, + "loss": 0.82, + "step": 34950 + }, + { + "epoch": 0.5611647056935103, + "grad_norm": 0.9906290173530579, + "learning_rate": 4.090733036465584e-05, + "loss": 0.7758, + "step": 34960 + }, + { + "epoch": 0.5613252219136744, + "grad_norm": 0.47555768489837646, + "learning_rate": 4.090246629312811e-05, + "loss": 0.8432, + "step": 34970 + }, + { + "epoch": 0.5614857381338384, + "grad_norm": 0.9650971293449402, + "learning_rate": 4.089760121029686e-05, + "loss": 0.9287, + "step": 34980 + }, + { + "epoch": 0.5616462543540025, + "grad_norm": 0.5900084972381592, + "learning_rate": 4.0892735116471494e-05, + "loss": 0.8027, + "step": 34990 + }, + { + "epoch": 0.5618067705741665, + "grad_norm": 0.6211195588111877, + "learning_rate": 4.088786801196144e-05, + "loss": 0.8002, + "step": 35000 + }, + { + "epoch": 0.5619672867943306, + "grad_norm": 1.2061971426010132, + "learning_rate": 4.088299989707624e-05, + "loss": 0.8277, + "step": 35010 + }, + { + "epoch": 0.5621278030144946, + "grad_norm": 0.7027051448822021, + "learning_rate": 4.0878130772125466e-05, + "loss": 0.827, + "step": 35020 + }, + { + "epoch": 0.5622883192346586, + "grad_norm": 0.5584590435028076, + "learning_rate": 4.087326063741877e-05, + "loss": 0.6847, + "step": 35030 + }, + { + "epoch": 0.5624488354548227, + "grad_norm": 0.6898772716522217, + "learning_rate": 4.0868389493265876e-05, + "loss": 0.6934, + "step": 35040 + }, + { + "epoch": 0.5626093516749867, + "grad_norm": 0.9483516812324524, + "learning_rate": 4.0863517339976534e-05, + "loss": 0.8136, + "step": 35050 + }, + { + "epoch": 0.5627698678951508, + "grad_norm": 0.6987947821617126, + "learning_rate": 4.0858644177860604e-05, + "loss": 0.813, + "step": 35060 + }, + { + "epoch": 0.5629303841153148, + "grad_norm": 0.548069179058075, + "learning_rate": 4.0853770007227986e-05, + "loss": 0.7733, + "step": 35070 + }, + { + "epoch": 0.5630909003354789, + "grad_norm": 0.8747618198394775, + "learning_rate": 4.084889482838865e-05, + "loss": 0.8518, + "step": 35080 + }, + { + "epoch": 0.5632514165556429, + "grad_norm": 0.6611346006393433, + "learning_rate": 4.084401864165262e-05, + "loss": 0.8398, + "step": 35090 + }, + { + "epoch": 0.563411932775807, + "grad_norm": 1.3021435737609863, + "learning_rate": 4.083914144733e-05, + "loss": 0.8103, + "step": 35100 + }, + { + "epoch": 0.563572448995971, + "grad_norm": 0.5760120749473572, + "learning_rate": 4.0834263245730955e-05, + "loss": 0.7702, + "step": 35110 + }, + { + "epoch": 0.5637329652161351, + "grad_norm": 0.6920192837715149, + "learning_rate": 4.08293840371657e-05, + "loss": 0.8082, + "step": 35120 + }, + { + "epoch": 0.5638934814362991, + "grad_norm": 0.6268174648284912, + "learning_rate": 4.082450382194454e-05, + "loss": 0.9559, + "step": 35130 + }, + { + "epoch": 0.5640539976564631, + "grad_norm": 0.6310826539993286, + "learning_rate": 4.0819622600377805e-05, + "loss": 0.7816, + "step": 35140 + }, + { + "epoch": 0.5642145138766272, + "grad_norm": 0.5229182243347168, + "learning_rate": 4.081474037277593e-05, + "loss": 0.8037, + "step": 35150 + }, + { + "epoch": 0.5643750300967912, + "grad_norm": 0.6522727012634277, + "learning_rate": 4.080985713944938e-05, + "loss": 0.8448, + "step": 35160 + }, + { + "epoch": 0.5645355463169554, + "grad_norm": 0.3864535987377167, + "learning_rate": 4.0804972900708724e-05, + "loss": 0.8043, + "step": 35170 + }, + { + "epoch": 0.5646960625371193, + "grad_norm": 0.6956907510757446, + "learning_rate": 4.080008765686456e-05, + "loss": 0.8467, + "step": 35180 + }, + { + "epoch": 0.5648565787572835, + "grad_norm": 0.9465835690498352, + "learning_rate": 4.0795201408227544e-05, + "loss": 0.8496, + "step": 35190 + }, + { + "epoch": 0.5650170949774475, + "grad_norm": 0.7282283902168274, + "learning_rate": 4.0790314155108436e-05, + "loss": 0.7097, + "step": 35200 + }, + { + "epoch": 0.5651776111976116, + "grad_norm": 0.707148015499115, + "learning_rate": 4.0785425897818027e-05, + "loss": 0.7959, + "step": 35210 + }, + { + "epoch": 0.5653381274177756, + "grad_norm": 1.7742736339569092, + "learning_rate": 4.078053663666718e-05, + "loss": 0.7389, + "step": 35220 + }, + { + "epoch": 0.5654986436379397, + "grad_norm": 0.6361789107322693, + "learning_rate": 4.077564637196682e-05, + "loss": 0.8013, + "step": 35230 + }, + { + "epoch": 0.5656591598581037, + "grad_norm": 0.6761757731437683, + "learning_rate": 4.0770755104027955e-05, + "loss": 0.8779, + "step": 35240 + }, + { + "epoch": 0.5658196760782677, + "grad_norm": 0.689520537853241, + "learning_rate": 4.076586283316163e-05, + "loss": 0.7342, + "step": 35250 + }, + { + "epoch": 0.5659801922984318, + "grad_norm": 0.8196476697921753, + "learning_rate": 4.076096955967895e-05, + "loss": 0.7687, + "step": 35260 + }, + { + "epoch": 0.5661407085185958, + "grad_norm": 0.6328451633453369, + "learning_rate": 4.075607528389113e-05, + "loss": 0.8168, + "step": 35270 + }, + { + "epoch": 0.5663012247387599, + "grad_norm": 1.082281470298767, + "learning_rate": 4.0751180006109394e-05, + "loss": 0.9208, + "step": 35280 + }, + { + "epoch": 0.5664617409589239, + "grad_norm": 0.4900531768798828, + "learning_rate": 4.074628372664506e-05, + "loss": 0.7787, + "step": 35290 + }, + { + "epoch": 0.566622257179088, + "grad_norm": 0.7204940319061279, + "learning_rate": 4.0741386445809504e-05, + "loss": 0.7652, + "step": 35300 + }, + { + "epoch": 0.566782773399252, + "grad_norm": 0.5037017464637756, + "learning_rate": 4.0736488163914156e-05, + "loss": 0.6972, + "step": 35310 + }, + { + "epoch": 0.5669432896194161, + "grad_norm": 0.8006725907325745, + "learning_rate": 4.0731588881270536e-05, + "loss": 0.6722, + "step": 35320 + }, + { + "epoch": 0.5671038058395801, + "grad_norm": 0.622901201248169, + "learning_rate": 4.072668859819019e-05, + "loss": 0.9348, + "step": 35330 + }, + { + "epoch": 0.5672643220597441, + "grad_norm": 1.1717931032180786, + "learning_rate": 4.072178731498476e-05, + "loss": 0.7752, + "step": 35340 + }, + { + "epoch": 0.5674248382799082, + "grad_norm": 0.49536818265914917, + "learning_rate": 4.0716885031965935e-05, + "loss": 0.8945, + "step": 35350 + }, + { + "epoch": 0.5675853545000722, + "grad_norm": 0.6012334227561951, + "learning_rate": 4.071198174944546e-05, + "loss": 0.8225, + "step": 35360 + }, + { + "epoch": 0.5677458707202363, + "grad_norm": 0.6746866106987, + "learning_rate": 4.070707746773518e-05, + "loss": 0.7756, + "step": 35370 + }, + { + "epoch": 0.5679063869404003, + "grad_norm": 0.6839065551757812, + "learning_rate": 4.070217218714695e-05, + "loss": 0.7602, + "step": 35380 + }, + { + "epoch": 0.5680669031605644, + "grad_norm": 0.7013005614280701, + "learning_rate": 4.069726590799274e-05, + "loss": 0.7418, + "step": 35390 + }, + { + "epoch": 0.5682274193807284, + "grad_norm": 0.6720461249351501, + "learning_rate": 4.069235863058455e-05, + "loss": 0.6625, + "step": 35400 + }, + { + "epoch": 0.5683879356008925, + "grad_norm": 0.4959336817264557, + "learning_rate": 4.068745035523446e-05, + "loss": 0.7813, + "step": 35410 + }, + { + "epoch": 0.5685484518210565, + "grad_norm": 0.5670273303985596, + "learning_rate": 4.06825410822546e-05, + "loss": 0.782, + "step": 35420 + }, + { + "epoch": 0.5687089680412206, + "grad_norm": 0.633188009262085, + "learning_rate": 4.067763081195717e-05, + "loss": 0.7362, + "step": 35430 + }, + { + "epoch": 0.5688694842613846, + "grad_norm": 1.1757333278656006, + "learning_rate": 4.067271954465444e-05, + "loss": 0.8456, + "step": 35440 + }, + { + "epoch": 0.5690300004815486, + "grad_norm": 1.0846099853515625, + "learning_rate": 4.0667807280658746e-05, + "loss": 0.9, + "step": 35450 + }, + { + "epoch": 0.5691905167017127, + "grad_norm": 0.4582432210445404, + "learning_rate": 4.066289402028246e-05, + "loss": 0.8187, + "step": 35460 + }, + { + "epoch": 0.5693510329218767, + "grad_norm": 0.6259080171585083, + "learning_rate": 4.065797976383805e-05, + "loss": 0.9205, + "step": 35470 + }, + { + "epoch": 0.5695115491420408, + "grad_norm": 0.6870418190956116, + "learning_rate": 4.065306451163803e-05, + "loss": 0.7444, + "step": 35480 + }, + { + "epoch": 0.5696720653622048, + "grad_norm": 0.9470134973526001, + "learning_rate": 4.064814826399498e-05, + "loss": 0.8919, + "step": 35490 + }, + { + "epoch": 0.5698325815823689, + "grad_norm": 1.8796249628067017, + "learning_rate": 4.064323102122154e-05, + "loss": 0.7267, + "step": 35500 + }, + { + "epoch": 0.5699930978025329, + "grad_norm": 0.6204911470413208, + "learning_rate": 4.063831278363044e-05, + "loss": 0.7788, + "step": 35510 + }, + { + "epoch": 0.570153614022697, + "grad_norm": 0.610247015953064, + "learning_rate": 4.0633393551534415e-05, + "loss": 0.8763, + "step": 35520 + }, + { + "epoch": 0.570314130242861, + "grad_norm": 0.7366138100624084, + "learning_rate": 4.062847332524633e-05, + "loss": 0.8456, + "step": 35530 + }, + { + "epoch": 0.570474646463025, + "grad_norm": 0.5201377272605896, + "learning_rate": 4.062355210507907e-05, + "loss": 0.852, + "step": 35540 + }, + { + "epoch": 0.5706351626831891, + "grad_norm": 0.6136771440505981, + "learning_rate": 4.0618629891345595e-05, + "loss": 0.8298, + "step": 35550 + }, + { + "epoch": 0.5707956789033531, + "grad_norm": 0.6575615406036377, + "learning_rate": 4.061370668435893e-05, + "loss": 0.8259, + "step": 35560 + }, + { + "epoch": 0.5709561951235173, + "grad_norm": 0.5918340086936951, + "learning_rate": 4.060878248443217e-05, + "loss": 0.777, + "step": 35570 + }, + { + "epoch": 0.5711167113436812, + "grad_norm": 0.9569399356842041, + "learning_rate": 4.0603857291878455e-05, + "loss": 0.7592, + "step": 35580 + }, + { + "epoch": 0.5712772275638454, + "grad_norm": 0.9958445429801941, + "learning_rate": 4.0598931107011e-05, + "loss": 0.935, + "step": 35590 + }, + { + "epoch": 0.5714377437840094, + "grad_norm": 0.4502687454223633, + "learning_rate": 4.059400393014308e-05, + "loss": 0.9162, + "step": 35600 + }, + { + "epoch": 0.5715982600041735, + "grad_norm": 0.9299821853637695, + "learning_rate": 4.0589075761588044e-05, + "loss": 0.8176, + "step": 35610 + }, + { + "epoch": 0.5717587762243375, + "grad_norm": 0.7311582565307617, + "learning_rate": 4.0584146601659275e-05, + "loss": 0.7944, + "step": 35620 + }, + { + "epoch": 0.5719192924445016, + "grad_norm": 0.9702523946762085, + "learning_rate": 4.0579216450670266e-05, + "loss": 0.8441, + "step": 35630 + }, + { + "epoch": 0.5720798086646656, + "grad_norm": 0.8632036447525024, + "learning_rate": 4.057428530893453e-05, + "loss": 0.7291, + "step": 35640 + }, + { + "epoch": 0.5722403248848296, + "grad_norm": 1.1548035144805908, + "learning_rate": 4.0569353176765655e-05, + "loss": 0.7908, + "step": 35650 + }, + { + "epoch": 0.5724008411049937, + "grad_norm": 0.6654862761497498, + "learning_rate": 4.0564420054477306e-05, + "loss": 0.7747, + "step": 35660 + }, + { + "epoch": 0.5725613573251577, + "grad_norm": 0.7837027907371521, + "learning_rate": 4.055948594238319e-05, + "loss": 0.9439, + "step": 35670 + }, + { + "epoch": 0.5727218735453218, + "grad_norm": 1.0406510829925537, + "learning_rate": 4.05545508407971e-05, + "loss": 0.7688, + "step": 35680 + }, + { + "epoch": 0.5728823897654858, + "grad_norm": 0.6176325678825378, + "learning_rate": 4.054961475003286e-05, + "loss": 0.7689, + "step": 35690 + }, + { + "epoch": 0.5730429059856499, + "grad_norm": 0.5449222922325134, + "learning_rate": 4.054467767040441e-05, + "loss": 0.9563, + "step": 35700 + }, + { + "epoch": 0.5732034222058139, + "grad_norm": 0.7573217749595642, + "learning_rate": 4.053973960222568e-05, + "loss": 0.6127, + "step": 35710 + }, + { + "epoch": 0.573363938425978, + "grad_norm": 0.6467248797416687, + "learning_rate": 4.053480054581073e-05, + "loss": 0.9426, + "step": 35720 + }, + { + "epoch": 0.573524454646142, + "grad_norm": 0.6534467935562134, + "learning_rate": 4.052986050147363e-05, + "loss": 0.9197, + "step": 35730 + }, + { + "epoch": 0.573684970866306, + "grad_norm": 0.6843330264091492, + "learning_rate": 4.0524919469528565e-05, + "loss": 0.8961, + "step": 35740 + }, + { + "epoch": 0.5738454870864701, + "grad_norm": 0.6680182218551636, + "learning_rate": 4.051997745028975e-05, + "loss": 0.7604, + "step": 35750 + }, + { + "epoch": 0.5740060033066341, + "grad_norm": 0.877436637878418, + "learning_rate": 4.0515034444071457e-05, + "loss": 0.8421, + "step": 35760 + }, + { + "epoch": 0.5741665195267982, + "grad_norm": 0.8607841730117798, + "learning_rate": 4.0510090451188027e-05, + "loss": 0.8256, + "step": 35770 + }, + { + "epoch": 0.5743270357469622, + "grad_norm": 1.039703130722046, + "learning_rate": 4.0505145471953886e-05, + "loss": 0.7974, + "step": 35780 + }, + { + "epoch": 0.5744875519671263, + "grad_norm": 1.2523432970046997, + "learning_rate": 4.05001995066835e-05, + "loss": 0.6544, + "step": 35790 + }, + { + "epoch": 0.5746480681872903, + "grad_norm": 0.5991509556770325, + "learning_rate": 4.0495252555691405e-05, + "loss": 0.8729, + "step": 35800 + }, + { + "epoch": 0.5748085844074544, + "grad_norm": 1.3034154176712036, + "learning_rate": 4.049030461929219e-05, + "loss": 0.7719, + "step": 35810 + }, + { + "epoch": 0.5749691006276184, + "grad_norm": 0.7173115015029907, + "learning_rate": 4.048535569780053e-05, + "loss": 0.8294, + "step": 35820 + }, + { + "epoch": 0.5751296168477825, + "grad_norm": 1.3822613954544067, + "learning_rate": 4.048040579153113e-05, + "loss": 0.7892, + "step": 35830 + }, + { + "epoch": 0.5752901330679465, + "grad_norm": 0.6448975801467896, + "learning_rate": 4.047545490079877e-05, + "loss": 0.7576, + "step": 35840 + }, + { + "epoch": 0.5754506492881105, + "grad_norm": 0.885646641254425, + "learning_rate": 4.0470503025918334e-05, + "loss": 0.8336, + "step": 35850 + }, + { + "epoch": 0.5756111655082746, + "grad_norm": 0.5219306349754333, + "learning_rate": 4.0465550167204694e-05, + "loss": 0.7493, + "step": 35860 + }, + { + "epoch": 0.5757716817284386, + "grad_norm": 0.6691696643829346, + "learning_rate": 4.046059632497284e-05, + "loss": 0.74, + "step": 35870 + }, + { + "epoch": 0.5759321979486027, + "grad_norm": 0.45305126905441284, + "learning_rate": 4.045564149953781e-05, + "loss": 0.6858, + "step": 35880 + }, + { + "epoch": 0.5760927141687667, + "grad_norm": 0.6955490112304688, + "learning_rate": 4.0450685691214685e-05, + "loss": 0.769, + "step": 35890 + }, + { + "epoch": 0.5762532303889308, + "grad_norm": 0.9219595789909363, + "learning_rate": 4.044572890031864e-05, + "loss": 0.7519, + "step": 35900 + }, + { + "epoch": 0.5764137466090948, + "grad_norm": 0.8848703503608704, + "learning_rate": 4.0440771127164895e-05, + "loss": 0.7966, + "step": 35910 + }, + { + "epoch": 0.5765742628292589, + "grad_norm": 0.5764527916908264, + "learning_rate": 4.043581237206874e-05, + "loss": 0.7609, + "step": 35920 + }, + { + "epoch": 0.5767347790494229, + "grad_norm": 0.9867295026779175, + "learning_rate": 4.0430852635345504e-05, + "loss": 0.889, + "step": 35930 + }, + { + "epoch": 0.5768952952695869, + "grad_norm": 1.6472734212875366, + "learning_rate": 4.042589191731062e-05, + "loss": 0.8321, + "step": 35940 + }, + { + "epoch": 0.577055811489751, + "grad_norm": 0.6475095748901367, + "learning_rate": 4.042093021827955e-05, + "loss": 0.7937, + "step": 35950 + }, + { + "epoch": 0.577216327709915, + "grad_norm": 0.534731924533844, + "learning_rate": 4.041596753856783e-05, + "loss": 0.7261, + "step": 35960 + }, + { + "epoch": 0.5773768439300792, + "grad_norm": 0.6448218822479248, + "learning_rate": 4.041100387849106e-05, + "loss": 0.8588, + "step": 35970 + }, + { + "epoch": 0.5775373601502432, + "grad_norm": 0.9210166931152344, + "learning_rate": 4.040603923836489e-05, + "loss": 0.7598, + "step": 35980 + }, + { + "epoch": 0.5776978763704073, + "grad_norm": 0.5217772722244263, + "learning_rate": 4.040107361850505e-05, + "loss": 0.7627, + "step": 35990 + }, + { + "epoch": 0.5778583925905713, + "grad_norm": 0.6340890526771545, + "learning_rate": 4.0396107019227326e-05, + "loss": 0.6978, + "step": 36000 + }, + { + "epoch": 0.5778583925905713, + "eval_loss": 0.7891989946365356, + "eval_runtime": 1831.8851, + "eval_samples_per_second": 14.319, + "eval_steps_per_second": 1.79, + "step": 36000 + }, + { + "epoch": 0.5780189088107354, + "grad_norm": 0.7844753265380859, + "learning_rate": 4.039113944084756e-05, + "loss": 0.8622, + "step": 36010 + }, + { + "epoch": 0.5781794250308994, + "grad_norm": 1.1523690223693848, + "learning_rate": 4.0386170883681665e-05, + "loss": 0.8162, + "step": 36020 + }, + { + "epoch": 0.5783399412510635, + "grad_norm": 0.5563941597938538, + "learning_rate": 4.03812013480456e-05, + "loss": 0.7172, + "step": 36030 + }, + { + "epoch": 0.5785004574712275, + "grad_norm": 0.9390705227851868, + "learning_rate": 4.0376230834255425e-05, + "loss": 0.7763, + "step": 36040 + }, + { + "epoch": 0.5786609736913915, + "grad_norm": 0.6679772734642029, + "learning_rate": 4.0371259342627204e-05, + "loss": 0.8318, + "step": 36050 + }, + { + "epoch": 0.5788214899115556, + "grad_norm": 0.9268079400062561, + "learning_rate": 4.0366286873477114e-05, + "loss": 0.8202, + "step": 36060 + }, + { + "epoch": 0.5789820061317196, + "grad_norm": 0.9446184039115906, + "learning_rate": 4.036131342712137e-05, + "loss": 0.7534, + "step": 36070 + }, + { + "epoch": 0.5791425223518837, + "grad_norm": 0.9851905107498169, + "learning_rate": 4.035633900387625e-05, + "loss": 0.8607, + "step": 36080 + }, + { + "epoch": 0.5793030385720477, + "grad_norm": 0.9195594191551208, + "learning_rate": 4.0351363604058104e-05, + "loss": 0.8004, + "step": 36090 + }, + { + "epoch": 0.5794635547922118, + "grad_norm": 0.6574283838272095, + "learning_rate": 4.034638722798334e-05, + "loss": 0.7621, + "step": 36100 + }, + { + "epoch": 0.5796240710123758, + "grad_norm": 0.6723333597183228, + "learning_rate": 4.034140987596842e-05, + "loss": 0.8373, + "step": 36110 + }, + { + "epoch": 0.5797845872325399, + "grad_norm": 0.6727994084358215, + "learning_rate": 4.0336431548329876e-05, + "loss": 0.6727, + "step": 36120 + }, + { + "epoch": 0.5799451034527039, + "grad_norm": 0.6110647320747375, + "learning_rate": 4.033145224538431e-05, + "loss": 0.7295, + "step": 36130 + }, + { + "epoch": 0.5801056196728679, + "grad_norm": 0.7365300059318542, + "learning_rate": 4.032647196744835e-05, + "loss": 0.8011, + "step": 36140 + }, + { + "epoch": 0.580266135893032, + "grad_norm": 0.7084189653396606, + "learning_rate": 4.032149071483874e-05, + "loss": 0.82, + "step": 36150 + }, + { + "epoch": 0.580426652113196, + "grad_norm": 0.807822585105896, + "learning_rate": 4.031650848787225e-05, + "loss": 0.7297, + "step": 36160 + }, + { + "epoch": 0.5805871683333601, + "grad_norm": 1.0231974124908447, + "learning_rate": 4.0311525286865716e-05, + "loss": 0.6347, + "step": 36170 + }, + { + "epoch": 0.5807476845535241, + "grad_norm": 0.5928379893302917, + "learning_rate": 4.0306541112136046e-05, + "loss": 0.7502, + "step": 36180 + }, + { + "epoch": 0.5809082007736882, + "grad_norm": 0.6683404445648193, + "learning_rate": 4.030155596400019e-05, + "loss": 0.7182, + "step": 36190 + }, + { + "epoch": 0.5810687169938522, + "grad_norm": 0.7753119468688965, + "learning_rate": 4.02965698427752e-05, + "loss": 0.7972, + "step": 36200 + }, + { + "epoch": 0.5812292332140163, + "grad_norm": 0.8151746988296509, + "learning_rate": 4.0291582748778145e-05, + "loss": 0.651, + "step": 36210 + }, + { + "epoch": 0.5813897494341803, + "grad_norm": 0.7202187180519104, + "learning_rate": 4.0286594682326176e-05, + "loss": 0.8884, + "step": 36220 + }, + { + "epoch": 0.5815502656543444, + "grad_norm": 0.5583833456039429, + "learning_rate": 4.028160564373652e-05, + "loss": 0.7026, + "step": 36230 + }, + { + "epoch": 0.5817107818745084, + "grad_norm": 0.8517146706581116, + "learning_rate": 4.027661563332643e-05, + "loss": 0.7768, + "step": 36240 + }, + { + "epoch": 0.5818712980946724, + "grad_norm": 0.5898287296295166, + "learning_rate": 4.0271624651413244e-05, + "loss": 0.7332, + "step": 36250 + }, + { + "epoch": 0.5820318143148365, + "grad_norm": 0.7190230488777161, + "learning_rate": 4.026663269831438e-05, + "loss": 0.8994, + "step": 36260 + }, + { + "epoch": 0.5821923305350005, + "grad_norm": 0.5976014137268066, + "learning_rate": 4.026163977434727e-05, + "loss": 0.7592, + "step": 36270 + }, + { + "epoch": 0.5823528467551646, + "grad_norm": 1.2017242908477783, + "learning_rate": 4.025664587982945e-05, + "loss": 0.8047, + "step": 36280 + }, + { + "epoch": 0.5825133629753286, + "grad_norm": 0.696507453918457, + "learning_rate": 4.025165101507851e-05, + "loss": 0.8209, + "step": 36290 + }, + { + "epoch": 0.5826738791954927, + "grad_norm": 0.6401665806770325, + "learning_rate": 4.0246655180412074e-05, + "loss": 0.7367, + "step": 36300 + }, + { + "epoch": 0.5828343954156567, + "grad_norm": 1.8609676361083984, + "learning_rate": 4.024165837614785e-05, + "loss": 0.7817, + "step": 36310 + }, + { + "epoch": 0.5829949116358208, + "grad_norm": 1.2039878368377686, + "learning_rate": 4.0236660602603624e-05, + "loss": 0.8189, + "step": 36320 + }, + { + "epoch": 0.5831554278559848, + "grad_norm": 0.6874423027038574, + "learning_rate": 4.0231661860097214e-05, + "loss": 0.8511, + "step": 36330 + }, + { + "epoch": 0.5833159440761488, + "grad_norm": 0.9722986221313477, + "learning_rate": 4.022666214894651e-05, + "loss": 0.7644, + "step": 36340 + }, + { + "epoch": 0.583476460296313, + "grad_norm": 1.0077062845230103, + "learning_rate": 4.022166146946946e-05, + "loss": 0.8717, + "step": 36350 + }, + { + "epoch": 0.583636976516477, + "grad_norm": 0.9290789365768433, + "learning_rate": 4.021665982198408e-05, + "loss": 0.8505, + "step": 36360 + }, + { + "epoch": 0.583797492736641, + "grad_norm": 0.6762076020240784, + "learning_rate": 4.021165720680846e-05, + "loss": 0.672, + "step": 36370 + }, + { + "epoch": 0.583958008956805, + "grad_norm": 1.225083827972412, + "learning_rate": 4.0206653624260714e-05, + "loss": 0.7355, + "step": 36380 + }, + { + "epoch": 0.5841185251769692, + "grad_norm": 0.5415376424789429, + "learning_rate": 4.020164907465906e-05, + "loss": 0.8422, + "step": 36390 + }, + { + "epoch": 0.5842790413971332, + "grad_norm": 0.6345513463020325, + "learning_rate": 4.0196643558321745e-05, + "loss": 0.6998, + "step": 36400 + }, + { + "epoch": 0.5844395576172973, + "grad_norm": 1.042907953262329, + "learning_rate": 4.0191637075567093e-05, + "loss": 0.9194, + "step": 36410 + }, + { + "epoch": 0.5846000738374613, + "grad_norm": 0.864111602306366, + "learning_rate": 4.018662962671349e-05, + "loss": 0.835, + "step": 36420 + }, + { + "epoch": 0.5847605900576254, + "grad_norm": 0.4797331690788269, + "learning_rate": 4.0181621212079377e-05, + "loss": 0.7603, + "step": 36430 + }, + { + "epoch": 0.5849211062777894, + "grad_norm": 0.8135360479354858, + "learning_rate": 4.0176611831983267e-05, + "loss": 0.7965, + "step": 36440 + }, + { + "epoch": 0.5850816224979534, + "grad_norm": 0.5516186356544495, + "learning_rate": 4.0171601486743725e-05, + "loss": 0.7192, + "step": 36450 + }, + { + "epoch": 0.5852421387181175, + "grad_norm": 0.4954865574836731, + "learning_rate": 4.016659017667937e-05, + "loss": 0.7418, + "step": 36460 + }, + { + "epoch": 0.5854026549382815, + "grad_norm": 0.709283709526062, + "learning_rate": 4.01615779021089e-05, + "loss": 0.8127, + "step": 36470 + }, + { + "epoch": 0.5855631711584456, + "grad_norm": 1.9759629964828491, + "learning_rate": 4.015656466335107e-05, + "loss": 0.8446, + "step": 36480 + }, + { + "epoch": 0.5857236873786096, + "grad_norm": 0.6203747987747192, + "learning_rate": 4.015155046072468e-05, + "loss": 0.7485, + "step": 36490 + }, + { + "epoch": 0.5858842035987737, + "grad_norm": 1.1151671409606934, + "learning_rate": 4.0146535294548614e-05, + "loss": 0.728, + "step": 36500 + }, + { + "epoch": 0.5860447198189377, + "grad_norm": 0.8537413477897644, + "learning_rate": 4.0141519165141806e-05, + "loss": 0.7581, + "step": 36510 + }, + { + "epoch": 0.5862052360391018, + "grad_norm": 0.7932829856872559, + "learning_rate": 4.013650207282325e-05, + "loss": 0.9027, + "step": 36520 + }, + { + "epoch": 0.5863657522592658, + "grad_norm": 1.3194361925125122, + "learning_rate": 4.0131484017912e-05, + "loss": 0.8039, + "step": 36530 + }, + { + "epoch": 0.5865262684794298, + "grad_norm": 3.304394245147705, + "learning_rate": 4.012646500072719e-05, + "loss": 0.8567, + "step": 36540 + }, + { + "epoch": 0.5866867846995939, + "grad_norm": 0.7391279935836792, + "learning_rate": 4.012144502158798e-05, + "loss": 0.8483, + "step": 36550 + }, + { + "epoch": 0.5868473009197579, + "grad_norm": 0.5603487491607666, + "learning_rate": 4.011642408081362e-05, + "loss": 0.7965, + "step": 36560 + }, + { + "epoch": 0.587007817139922, + "grad_norm": 0.8482446670532227, + "learning_rate": 4.011140217872341e-05, + "loss": 0.8064, + "step": 36570 + }, + { + "epoch": 0.587168333360086, + "grad_norm": 0.8865063786506653, + "learning_rate": 4.0106379315636726e-05, + "loss": 0.7553, + "step": 36580 + }, + { + "epoch": 0.5873288495802501, + "grad_norm": 0.6926368474960327, + "learning_rate": 4.010135549187298e-05, + "loss": 0.649, + "step": 36590 + }, + { + "epoch": 0.5874893658004141, + "grad_norm": 0.6363588571548462, + "learning_rate": 4.009633070775166e-05, + "loss": 0.7485, + "step": 36600 + }, + { + "epoch": 0.5876498820205782, + "grad_norm": 0.5571781992912292, + "learning_rate": 4.0091304963592315e-05, + "loss": 0.7858, + "step": 36610 + }, + { + "epoch": 0.5878103982407422, + "grad_norm": 0.605617880821228, + "learning_rate": 4.008627825971455e-05, + "loss": 0.7736, + "step": 36620 + }, + { + "epoch": 0.5879709144609063, + "grad_norm": 0.6736018657684326, + "learning_rate": 4.0081250596438035e-05, + "loss": 0.8218, + "step": 36630 + }, + { + "epoch": 0.5881314306810703, + "grad_norm": 0.556492030620575, + "learning_rate": 4.0076221974082494e-05, + "loss": 0.8667, + "step": 36640 + }, + { + "epoch": 0.5882919469012343, + "grad_norm": 0.7912616729736328, + "learning_rate": 4.0071192392967724e-05, + "loss": 0.7582, + "step": 36650 + }, + { + "epoch": 0.5884524631213984, + "grad_norm": 0.6351788640022278, + "learning_rate": 4.0066161853413585e-05, + "loss": 0.7425, + "step": 36660 + }, + { + "epoch": 0.5886129793415624, + "grad_norm": 0.9441076517105103, + "learning_rate": 4.0061130355739984e-05, + "loss": 0.9587, + "step": 36670 + }, + { + "epoch": 0.5887734955617265, + "grad_norm": 0.7481928467750549, + "learning_rate": 4.0056097900266884e-05, + "loss": 0.7771, + "step": 36680 + }, + { + "epoch": 0.5889340117818905, + "grad_norm": 0.6233396530151367, + "learning_rate": 4.005106448731433e-05, + "loss": 0.8223, + "step": 36690 + }, + { + "epoch": 0.5890945280020546, + "grad_norm": 0.9144085645675659, + "learning_rate": 4.0046030117202415e-05, + "loss": 0.8171, + "step": 36700 + }, + { + "epoch": 0.5892550442222186, + "grad_norm": 0.7877093553543091, + "learning_rate": 4.0040994790251296e-05, + "loss": 0.7407, + "step": 36710 + }, + { + "epoch": 0.5894155604423827, + "grad_norm": 0.8860836029052734, + "learning_rate": 4.003595850678119e-05, + "loss": 0.7909, + "step": 36720 + }, + { + "epoch": 0.5895760766625467, + "grad_norm": 0.8961226344108582, + "learning_rate": 4.003092126711238e-05, + "loss": 0.7277, + "step": 36730 + }, + { + "epoch": 0.5897365928827109, + "grad_norm": 0.6664389967918396, + "learning_rate": 4.0025883071565196e-05, + "loss": 0.7662, + "step": 36740 + }, + { + "epoch": 0.5898971091028749, + "grad_norm": 0.42634594440460205, + "learning_rate": 4.002084392046004e-05, + "loss": 0.8021, + "step": 36750 + }, + { + "epoch": 0.5900576253230388, + "grad_norm": 1.5837301015853882, + "learning_rate": 4.001580381411738e-05, + "loss": 0.6847, + "step": 36760 + }, + { + "epoch": 0.590218141543203, + "grad_norm": 0.4988773763179779, + "learning_rate": 4.001076275285772e-05, + "loss": 0.7769, + "step": 36770 + }, + { + "epoch": 0.590378657763367, + "grad_norm": 0.9028951525688171, + "learning_rate": 4.0005720737001663e-05, + "loss": 0.7772, + "step": 36780 + }, + { + "epoch": 0.5905391739835311, + "grad_norm": 0.6206338405609131, + "learning_rate": 4.0000677766869846e-05, + "loss": 0.7159, + "step": 36790 + }, + { + "epoch": 0.5906996902036951, + "grad_norm": 0.44510018825531006, + "learning_rate": 3.999563384278295e-05, + "loss": 0.9236, + "step": 36800 + }, + { + "epoch": 0.5908602064238592, + "grad_norm": 1.293382167816162, + "learning_rate": 3.999058896506177e-05, + "loss": 0.918, + "step": 36810 + }, + { + "epoch": 0.5910207226440232, + "grad_norm": 0.6884348392486572, + "learning_rate": 3.998554313402712e-05, + "loss": 0.8115, + "step": 36820 + }, + { + "epoch": 0.5911812388641873, + "grad_norm": 0.8421921133995056, + "learning_rate": 3.998049634999987e-05, + "loss": 0.8271, + "step": 36830 + }, + { + "epoch": 0.5913417550843513, + "grad_norm": 1.1043190956115723, + "learning_rate": 3.997544861330099e-05, + "loss": 0.783, + "step": 36840 + }, + { + "epoch": 0.5915022713045153, + "grad_norm": 1.3711042404174805, + "learning_rate": 3.9970399924251476e-05, + "loss": 0.7653, + "step": 36850 + }, + { + "epoch": 0.5916627875246794, + "grad_norm": 1.0495439767837524, + "learning_rate": 3.996535028317239e-05, + "loss": 0.7487, + "step": 36860 + }, + { + "epoch": 0.5918233037448434, + "grad_norm": 0.8064780235290527, + "learning_rate": 3.9960299690384864e-05, + "loss": 0.7359, + "step": 36870 + }, + { + "epoch": 0.5919838199650075, + "grad_norm": 0.9818336367607117, + "learning_rate": 3.995524814621009e-05, + "loss": 0.7723, + "step": 36880 + }, + { + "epoch": 0.5921443361851715, + "grad_norm": 0.8348130583763123, + "learning_rate": 3.9950195650969305e-05, + "loss": 0.7368, + "step": 36890 + }, + { + "epoch": 0.5923048524053356, + "grad_norm": 0.7028447389602661, + "learning_rate": 3.994514220498383e-05, + "loss": 0.8492, + "step": 36900 + }, + { + "epoch": 0.5924653686254996, + "grad_norm": 0.49457409977912903, + "learning_rate": 3.9940087808575034e-05, + "loss": 0.7253, + "step": 36910 + }, + { + "epoch": 0.5926258848456637, + "grad_norm": 0.89185631275177, + "learning_rate": 3.993503246206433e-05, + "loss": 0.7781, + "step": 36920 + }, + { + "epoch": 0.5927864010658277, + "grad_norm": 0.9073711037635803, + "learning_rate": 3.992997616577323e-05, + "loss": 0.8085, + "step": 36930 + }, + { + "epoch": 0.5929469172859918, + "grad_norm": 0.8012301921844482, + "learning_rate": 3.992491892002328e-05, + "loss": 0.7535, + "step": 36940 + }, + { + "epoch": 0.5931074335061558, + "grad_norm": 0.49172443151474, + "learning_rate": 3.991986072513608e-05, + "loss": 0.7493, + "step": 36950 + }, + { + "epoch": 0.5932679497263198, + "grad_norm": 1.0191590785980225, + "learning_rate": 3.99148015814333e-05, + "loss": 0.7781, + "step": 36960 + }, + { + "epoch": 0.5934284659464839, + "grad_norm": 0.6562097072601318, + "learning_rate": 3.99097414892367e-05, + "loss": 0.7676, + "step": 36970 + }, + { + "epoch": 0.5935889821666479, + "grad_norm": 0.8100775480270386, + "learning_rate": 3.990468044886804e-05, + "loss": 0.6887, + "step": 36980 + }, + { + "epoch": 0.593749498386812, + "grad_norm": 0.8319634795188904, + "learning_rate": 3.989961846064919e-05, + "loss": 0.7487, + "step": 36990 + }, + { + "epoch": 0.593910014606976, + "grad_norm": 0.7279748320579529, + "learning_rate": 3.9894555524902055e-05, + "loss": 0.8387, + "step": 37000 + }, + { + "epoch": 0.5940705308271401, + "grad_norm": 0.6251834034919739, + "learning_rate": 3.9889491641948605e-05, + "loss": 0.8559, + "step": 37010 + }, + { + "epoch": 0.5942310470473041, + "grad_norm": 1.012477993965149, + "learning_rate": 3.988442681211089e-05, + "loss": 0.7258, + "step": 37020 + }, + { + "epoch": 0.5943915632674682, + "grad_norm": 0.6363523006439209, + "learning_rate": 3.987936103571098e-05, + "loss": 0.8136, + "step": 37030 + }, + { + "epoch": 0.5945520794876322, + "grad_norm": 1.1769131422042847, + "learning_rate": 3.987429431307106e-05, + "loss": 0.7719, + "step": 37040 + }, + { + "epoch": 0.5947125957077962, + "grad_norm": 1.0872807502746582, + "learning_rate": 3.986922664451331e-05, + "loss": 0.934, + "step": 37050 + }, + { + "epoch": 0.5948731119279603, + "grad_norm": 0.8428120017051697, + "learning_rate": 3.986415803036002e-05, + "loss": 0.7682, + "step": 37060 + }, + { + "epoch": 0.5950336281481243, + "grad_norm": 0.683157205581665, + "learning_rate": 3.9859088470933534e-05, + "loss": 0.8123, + "step": 37070 + }, + { + "epoch": 0.5951941443682884, + "grad_norm": 0.6649780869483948, + "learning_rate": 3.985401796655622e-05, + "loss": 0.8671, + "step": 37080 + }, + { + "epoch": 0.5953546605884524, + "grad_norm": 2.0497260093688965, + "learning_rate": 3.984894651755056e-05, + "loss": 0.7844, + "step": 37090 + }, + { + "epoch": 0.5955151768086165, + "grad_norm": 0.7505207061767578, + "learning_rate": 3.984387412423905e-05, + "loss": 0.7093, + "step": 37100 + }, + { + "epoch": 0.5956756930287805, + "grad_norm": 0.8952100872993469, + "learning_rate": 3.9838800786944276e-05, + "loss": 0.7653, + "step": 37110 + }, + { + "epoch": 0.5958362092489446, + "grad_norm": 0.9246242642402649, + "learning_rate": 3.9833726505988855e-05, + "loss": 0.784, + "step": 37120 + }, + { + "epoch": 0.5959967254691086, + "grad_norm": 0.7178996801376343, + "learning_rate": 3.9828651281695504e-05, + "loss": 0.7726, + "step": 37130 + }, + { + "epoch": 0.5961572416892728, + "grad_norm": 0.7582240104675293, + "learning_rate": 3.9823575114386955e-05, + "loss": 0.7561, + "step": 37140 + }, + { + "epoch": 0.5963177579094368, + "grad_norm": 0.9726042151451111, + "learning_rate": 3.981849800438605e-05, + "loss": 0.7959, + "step": 37150 + }, + { + "epoch": 0.5964782741296007, + "grad_norm": 0.6401000618934631, + "learning_rate": 3.981341995201564e-05, + "loss": 0.7295, + "step": 37160 + }, + { + "epoch": 0.5966387903497649, + "grad_norm": 0.9473204016685486, + "learning_rate": 3.9808340957598664e-05, + "loss": 0.6725, + "step": 37170 + }, + { + "epoch": 0.5967993065699289, + "grad_norm": 1.1960269212722778, + "learning_rate": 3.980326102145812e-05, + "loss": 0.8762, + "step": 37180 + }, + { + "epoch": 0.596959822790093, + "grad_norm": 0.7847097516059875, + "learning_rate": 3.979818014391707e-05, + "loss": 0.8836, + "step": 37190 + }, + { + "epoch": 0.597120339010257, + "grad_norm": 0.67250657081604, + "learning_rate": 3.979309832529861e-05, + "loss": 0.7483, + "step": 37200 + }, + { + "epoch": 0.5972808552304211, + "grad_norm": 0.6315157413482666, + "learning_rate": 3.978801556592593e-05, + "loss": 0.8032, + "step": 37210 + }, + { + "epoch": 0.5974413714505851, + "grad_norm": 0.7019423246383667, + "learning_rate": 3.978293186612225e-05, + "loss": 0.7009, + "step": 37220 + }, + { + "epoch": 0.5976018876707492, + "grad_norm": 0.5296964049339294, + "learning_rate": 3.9777847226210874e-05, + "loss": 0.8369, + "step": 37230 + }, + { + "epoch": 0.5977624038909132, + "grad_norm": 0.5927723050117493, + "learning_rate": 3.977276164651515e-05, + "loss": 0.7044, + "step": 37240 + }, + { + "epoch": 0.5979229201110772, + "grad_norm": 0.6333862543106079, + "learning_rate": 3.976767512735849e-05, + "loss": 0.9035, + "step": 37250 + }, + { + "epoch": 0.5980834363312413, + "grad_norm": 0.6433098912239075, + "learning_rate": 3.9762587669064375e-05, + "loss": 0.8633, + "step": 37260 + }, + { + "epoch": 0.5982439525514053, + "grad_norm": 1.1758434772491455, + "learning_rate": 3.975749927195633e-05, + "loss": 0.7381, + "step": 37270 + }, + { + "epoch": 0.5984044687715694, + "grad_norm": 1.0726325511932373, + "learning_rate": 3.9752409936357945e-05, + "loss": 0.6415, + "step": 37280 + }, + { + "epoch": 0.5985649849917334, + "grad_norm": 1.1273441314697266, + "learning_rate": 3.974731966259289e-05, + "loss": 0.8496, + "step": 37290 + }, + { + "epoch": 0.5987255012118975, + "grad_norm": 1.0153250694274902, + "learning_rate": 3.974222845098485e-05, + "loss": 0.8801, + "step": 37300 + }, + { + "epoch": 0.5988860174320615, + "grad_norm": 0.7318530678749084, + "learning_rate": 3.973713630185761e-05, + "loss": 0.7906, + "step": 37310 + }, + { + "epoch": 0.5990465336522256, + "grad_norm": 0.7686176300048828, + "learning_rate": 3.9732043215535e-05, + "loss": 0.7588, + "step": 37320 + }, + { + "epoch": 0.5992070498723896, + "grad_norm": 0.5385235548019409, + "learning_rate": 3.972694919234091e-05, + "loss": 0.7595, + "step": 37330 + }, + { + "epoch": 0.5993675660925537, + "grad_norm": 0.6187199950218201, + "learning_rate": 3.972185423259929e-05, + "loss": 0.7731, + "step": 37340 + }, + { + "epoch": 0.5995280823127177, + "grad_norm": 0.8673129081726074, + "learning_rate": 3.9716758336634154e-05, + "loss": 0.7541, + "step": 37350 + }, + { + "epoch": 0.5996885985328817, + "grad_norm": 0.7093130946159363, + "learning_rate": 3.971166150476956e-05, + "loss": 0.6831, + "step": 37360 + }, + { + "epoch": 0.5998491147530458, + "grad_norm": 0.8425043821334839, + "learning_rate": 3.970656373732964e-05, + "loss": 0.7668, + "step": 37370 + }, + { + "epoch": 0.6000096309732098, + "grad_norm": 0.5954310297966003, + "learning_rate": 3.970146503463859e-05, + "loss": 0.8055, + "step": 37380 + }, + { + "epoch": 0.6001701471933739, + "grad_norm": 0.7442566156387329, + "learning_rate": 3.969636539702065e-05, + "loss": 0.8887, + "step": 37390 + }, + { + "epoch": 0.6003306634135379, + "grad_norm": 0.8808811902999878, + "learning_rate": 3.9691264824800134e-05, + "loss": 0.8158, + "step": 37400 + }, + { + "epoch": 0.600491179633702, + "grad_norm": 1.417517900466919, + "learning_rate": 3.968616331830141e-05, + "loss": 0.8184, + "step": 37410 + }, + { + "epoch": 0.600651695853866, + "grad_norm": 0.7371473908424377, + "learning_rate": 3.9681060877848886e-05, + "loss": 0.7027, + "step": 37420 + }, + { + "epoch": 0.6008122120740301, + "grad_norm": 0.8388651609420776, + "learning_rate": 3.967595750376706e-05, + "loss": 0.6824, + "step": 37430 + }, + { + "epoch": 0.6009727282941941, + "grad_norm": 0.6809362769126892, + "learning_rate": 3.9670853196380474e-05, + "loss": 0.9307, + "step": 37440 + }, + { + "epoch": 0.6011332445143581, + "grad_norm": 0.6995666027069092, + "learning_rate": 3.966574795601374e-05, + "loss": 0.862, + "step": 37450 + }, + { + "epoch": 0.6012937607345222, + "grad_norm": 0.7661793231964111, + "learning_rate": 3.9660641782991513e-05, + "loss": 0.8059, + "step": 37460 + }, + { + "epoch": 0.6014542769546862, + "grad_norm": 0.8158515095710754, + "learning_rate": 3.965553467763853e-05, + "loss": 0.8333, + "step": 37470 + }, + { + "epoch": 0.6016147931748503, + "grad_norm": 0.85378497838974, + "learning_rate": 3.9650426640279544e-05, + "loss": 0.8211, + "step": 37480 + }, + { + "epoch": 0.6017753093950143, + "grad_norm": 0.6210297346115112, + "learning_rate": 3.9645317671239416e-05, + "loss": 0.8578, + "step": 37490 + }, + { + "epoch": 0.6019358256151784, + "grad_norm": 0.46719712018966675, + "learning_rate": 3.964020777084305e-05, + "loss": 0.8473, + "step": 37500 + }, + { + "epoch": 0.6020963418353424, + "grad_norm": 0.6271728873252869, + "learning_rate": 3.9635096939415394e-05, + "loss": 0.675, + "step": 37510 + }, + { + "epoch": 0.6022568580555065, + "grad_norm": 0.7361509203910828, + "learning_rate": 3.962998517728147e-05, + "loss": 0.834, + "step": 37520 + }, + { + "epoch": 0.6024173742756705, + "grad_norm": 0.6509928703308105, + "learning_rate": 3.9624872484766365e-05, + "loss": 0.8438, + "step": 37530 + }, + { + "epoch": 0.6025778904958347, + "grad_norm": 0.8887346982955933, + "learning_rate": 3.9619758862195204e-05, + "loss": 0.6498, + "step": 37540 + }, + { + "epoch": 0.6027384067159987, + "grad_norm": 1.777078628540039, + "learning_rate": 3.961464430989319e-05, + "loss": 0.8227, + "step": 37550 + }, + { + "epoch": 0.6028989229361627, + "grad_norm": 0.5870428681373596, + "learning_rate": 3.9609528828185574e-05, + "loss": 0.8566, + "step": 37560 + }, + { + "epoch": 0.6030594391563268, + "grad_norm": 0.8179250955581665, + "learning_rate": 3.9604412417397686e-05, + "loss": 0.7457, + "step": 37570 + }, + { + "epoch": 0.6032199553764908, + "grad_norm": 0.5670053362846375, + "learning_rate": 3.959929507785487e-05, + "loss": 0.7869, + "step": 37580 + }, + { + "epoch": 0.6033804715966549, + "grad_norm": 0.5869922041893005, + "learning_rate": 3.9594176809882586e-05, + "loss": 0.7248, + "step": 37590 + }, + { + "epoch": 0.6035409878168189, + "grad_norm": 0.7177290916442871, + "learning_rate": 3.9589057613806314e-05, + "loss": 0.7632, + "step": 37600 + }, + { + "epoch": 0.603701504036983, + "grad_norm": 0.7386598587036133, + "learning_rate": 3.9583937489951606e-05, + "loss": 0.7896, + "step": 37610 + }, + { + "epoch": 0.603862020257147, + "grad_norm": 0.8497257828712463, + "learning_rate": 3.957881643864408e-05, + "loss": 0.689, + "step": 37620 + }, + { + "epoch": 0.6040225364773111, + "grad_norm": 0.56931072473526, + "learning_rate": 3.9573694460209386e-05, + "loss": 0.8229, + "step": 37630 + }, + { + "epoch": 0.6041830526974751, + "grad_norm": 0.8127453327178955, + "learning_rate": 3.956857155497327e-05, + "loss": 0.8337, + "step": 37640 + }, + { + "epoch": 0.6043435689176391, + "grad_norm": 1.5092357397079468, + "learning_rate": 3.9563447723261515e-05, + "loss": 0.8669, + "step": 37650 + }, + { + "epoch": 0.6045040851378032, + "grad_norm": 0.6576908826828003, + "learning_rate": 3.9558322965399964e-05, + "loss": 0.779, + "step": 37660 + }, + { + "epoch": 0.6046646013579672, + "grad_norm": 0.6040767431259155, + "learning_rate": 3.955319728171451e-05, + "loss": 0.7785, + "step": 37670 + }, + { + "epoch": 0.6048251175781313, + "grad_norm": 0.647095263004303, + "learning_rate": 3.9548070672531136e-05, + "loss": 0.7623, + "step": 37680 + }, + { + "epoch": 0.6049856337982953, + "grad_norm": 0.7080425024032593, + "learning_rate": 3.9542943138175856e-05, + "loss": 0.7269, + "step": 37690 + }, + { + "epoch": 0.6051461500184594, + "grad_norm": 0.573911190032959, + "learning_rate": 3.9537814678974744e-05, + "loss": 0.8294, + "step": 37700 + }, + { + "epoch": 0.6053066662386234, + "grad_norm": 1.3584871292114258, + "learning_rate": 3.9532685295253955e-05, + "loss": 0.7398, + "step": 37710 + }, + { + "epoch": 0.6054671824587875, + "grad_norm": 1.0353766679763794, + "learning_rate": 3.952755498733968e-05, + "loss": 0.7267, + "step": 37720 + }, + { + "epoch": 0.6056276986789515, + "grad_norm": 1.054316520690918, + "learning_rate": 3.952242375555817e-05, + "loss": 0.837, + "step": 37730 + }, + { + "epoch": 0.6057882148991156, + "grad_norm": 0.5551409125328064, + "learning_rate": 3.951729160023574e-05, + "loss": 0.867, + "step": 37740 + }, + { + "epoch": 0.6059487311192796, + "grad_norm": 1.3289270401000977, + "learning_rate": 3.951215852169878e-05, + "loss": 0.7875, + "step": 37750 + }, + { + "epoch": 0.6061092473394436, + "grad_norm": 0.5417864322662354, + "learning_rate": 3.950702452027372e-05, + "loss": 0.8288, + "step": 37760 + }, + { + "epoch": 0.6062697635596077, + "grad_norm": 0.729145348072052, + "learning_rate": 3.950188959628704e-05, + "loss": 0.8301, + "step": 37770 + }, + { + "epoch": 0.6064302797797717, + "grad_norm": 0.9074498414993286, + "learning_rate": 3.949675375006531e-05, + "loss": 0.8481, + "step": 37780 + }, + { + "epoch": 0.6065907959999358, + "grad_norm": 0.8220153450965881, + "learning_rate": 3.949161698193512e-05, + "loss": 0.716, + "step": 37790 + }, + { + "epoch": 0.6067513122200998, + "grad_norm": 0.4932674765586853, + "learning_rate": 3.948647929222315e-05, + "loss": 0.7909, + "step": 37800 + }, + { + "epoch": 0.6069118284402639, + "grad_norm": 0.3918898105621338, + "learning_rate": 3.948134068125612e-05, + "loss": 0.7838, + "step": 37810 + }, + { + "epoch": 0.6070723446604279, + "grad_norm": 0.9462749361991882, + "learning_rate": 3.9476201149360826e-05, + "loss": 0.8162, + "step": 37820 + }, + { + "epoch": 0.607232860880592, + "grad_norm": 1.8753472566604614, + "learning_rate": 3.94710606968641e-05, + "loss": 0.7887, + "step": 37830 + }, + { + "epoch": 0.607393377100756, + "grad_norm": 0.6074420809745789, + "learning_rate": 3.9465919324092854e-05, + "loss": 0.7473, + "step": 37840 + }, + { + "epoch": 0.60755389332092, + "grad_norm": 0.7443304061889648, + "learning_rate": 3.946077703137405e-05, + "loss": 0.7349, + "step": 37850 + }, + { + "epoch": 0.6077144095410841, + "grad_norm": 0.6612790822982788, + "learning_rate": 3.945563381903469e-05, + "loss": 0.7559, + "step": 37860 + }, + { + "epoch": 0.6078749257612481, + "grad_norm": 0.689988911151886, + "learning_rate": 3.945048968740188e-05, + "loss": 0.9061, + "step": 37870 + }, + { + "epoch": 0.6080354419814122, + "grad_norm": 0.4929659962654114, + "learning_rate": 3.944534463680273e-05, + "loss": 0.7682, + "step": 37880 + }, + { + "epoch": 0.6081959582015762, + "grad_norm": 0.6002166867256165, + "learning_rate": 3.944019866756446e-05, + "loss": 0.7044, + "step": 37890 + }, + { + "epoch": 0.6083564744217403, + "grad_norm": 0.8303666710853577, + "learning_rate": 3.943505178001429e-05, + "loss": 0.748, + "step": 37900 + }, + { + "epoch": 0.6085169906419043, + "grad_norm": 0.6792557835578918, + "learning_rate": 3.9429903974479566e-05, + "loss": 0.7213, + "step": 37910 + }, + { + "epoch": 0.6086775068620685, + "grad_norm": 0.8786492347717285, + "learning_rate": 3.942475525128764e-05, + "loss": 0.7519, + "step": 37920 + }, + { + "epoch": 0.6088380230822324, + "grad_norm": 0.7663836479187012, + "learning_rate": 3.941960561076594e-05, + "loss": 0.8188, + "step": 37930 + }, + { + "epoch": 0.6089985393023966, + "grad_norm": 0.6867387890815735, + "learning_rate": 3.9414455053241974e-05, + "loss": 0.7565, + "step": 37940 + }, + { + "epoch": 0.6091590555225606, + "grad_norm": 0.5119057893753052, + "learning_rate": 3.940930357904325e-05, + "loss": 0.7879, + "step": 37950 + }, + { + "epoch": 0.6093195717427246, + "grad_norm": 0.7302525043487549, + "learning_rate": 3.940415118849741e-05, + "loss": 0.7962, + "step": 37960 + }, + { + "epoch": 0.6094800879628887, + "grad_norm": 0.8698124885559082, + "learning_rate": 3.9398997881932086e-05, + "loss": 0.7184, + "step": 37970 + }, + { + "epoch": 0.6096406041830527, + "grad_norm": 0.8504899740219116, + "learning_rate": 3.939384365967502e-05, + "loss": 0.7378, + "step": 37980 + }, + { + "epoch": 0.6098011204032168, + "grad_norm": 0.6334413290023804, + "learning_rate": 3.9388688522053976e-05, + "loss": 0.7162, + "step": 37990 + }, + { + "epoch": 0.6099616366233808, + "grad_norm": 0.8022338151931763, + "learning_rate": 3.9383532469396787e-05, + "loss": 0.7773, + "step": 38000 + }, + { + "epoch": 0.6101221528435449, + "grad_norm": 1.2882499694824219, + "learning_rate": 3.9378375502031367e-05, + "loss": 0.7959, + "step": 38010 + }, + { + "epoch": 0.6102826690637089, + "grad_norm": 2.134371757507324, + "learning_rate": 3.9373217620285654e-05, + "loss": 0.8407, + "step": 38020 + }, + { + "epoch": 0.610443185283873, + "grad_norm": 0.6704578995704651, + "learning_rate": 3.9368058824487666e-05, + "loss": 0.8149, + "step": 38030 + }, + { + "epoch": 0.610603701504037, + "grad_norm": 0.37847965955734253, + "learning_rate": 3.9362899114965464e-05, + "loss": 0.7502, + "step": 38040 + }, + { + "epoch": 0.610764217724201, + "grad_norm": 1.2796109914779663, + "learning_rate": 3.935773849204718e-05, + "loss": 0.6621, + "step": 38050 + }, + { + "epoch": 0.6109247339443651, + "grad_norm": 0.5993833541870117, + "learning_rate": 3.9352576956060996e-05, + "loss": 0.8199, + "step": 38060 + }, + { + "epoch": 0.6110852501645291, + "grad_norm": 0.9239700436592102, + "learning_rate": 3.934741450733517e-05, + "loss": 0.7141, + "step": 38070 + }, + { + "epoch": 0.6112457663846932, + "grad_norm": 0.8174496293067932, + "learning_rate": 3.934225114619798e-05, + "loss": 0.8042, + "step": 38080 + }, + { + "epoch": 0.6114062826048572, + "grad_norm": 0.6069329977035522, + "learning_rate": 3.9337086872977805e-05, + "loss": 0.7504, + "step": 38090 + }, + { + "epoch": 0.6115667988250213, + "grad_norm": 0.8428872227668762, + "learning_rate": 3.9331921688003046e-05, + "loss": 0.7538, + "step": 38100 + }, + { + "epoch": 0.6117273150451853, + "grad_norm": 0.7161558270454407, + "learning_rate": 3.9326755591602194e-05, + "loss": 0.8893, + "step": 38110 + }, + { + "epoch": 0.6118878312653494, + "grad_norm": 0.6207103133201599, + "learning_rate": 3.932158858410378e-05, + "loss": 0.7803, + "step": 38120 + }, + { + "epoch": 0.6120483474855134, + "grad_norm": 0.7723967432975769, + "learning_rate": 3.931642066583639e-05, + "loss": 0.8321, + "step": 38130 + }, + { + "epoch": 0.6122088637056775, + "grad_norm": 0.7207299470901489, + "learning_rate": 3.931125183712867e-05, + "loss": 0.7539, + "step": 38140 + }, + { + "epoch": 0.6123693799258415, + "grad_norm": 1.9104727506637573, + "learning_rate": 3.9306082098309324e-05, + "loss": 0.7195, + "step": 38150 + }, + { + "epoch": 0.6125298961460055, + "grad_norm": 0.6953368186950684, + "learning_rate": 3.9300911449707144e-05, + "loss": 0.7804, + "step": 38160 + }, + { + "epoch": 0.6126904123661696, + "grad_norm": 0.7201284170150757, + "learning_rate": 3.929573989165093e-05, + "loss": 0.7466, + "step": 38170 + }, + { + "epoch": 0.6128509285863336, + "grad_norm": 0.481998085975647, + "learning_rate": 3.929056742446955e-05, + "loss": 0.7359, + "step": 38180 + }, + { + "epoch": 0.6130114448064977, + "grad_norm": 0.7106379270553589, + "learning_rate": 3.928539404849198e-05, + "loss": 0.8601, + "step": 38190 + }, + { + "epoch": 0.6131719610266617, + "grad_norm": 1.0681968927383423, + "learning_rate": 3.928021976404718e-05, + "loss": 0.8234, + "step": 38200 + }, + { + "epoch": 0.6133324772468258, + "grad_norm": 0.554364800453186, + "learning_rate": 3.927504457146422e-05, + "loss": 0.8632, + "step": 38210 + }, + { + "epoch": 0.6134929934669898, + "grad_norm": 0.5681819915771484, + "learning_rate": 3.926986847107221e-05, + "loss": 0.8023, + "step": 38220 + }, + { + "epoch": 0.6136535096871539, + "grad_norm": 1.9080106019973755, + "learning_rate": 3.926469146320033e-05, + "loss": 0.7491, + "step": 38230 + }, + { + "epoch": 0.6138140259073179, + "grad_norm": 0.8618949055671692, + "learning_rate": 3.92595135481778e-05, + "loss": 0.8623, + "step": 38240 + }, + { + "epoch": 0.613974542127482, + "grad_norm": 0.6629480719566345, + "learning_rate": 3.925433472633389e-05, + "loss": 0.7762, + "step": 38250 + }, + { + "epoch": 0.614135058347646, + "grad_norm": 0.6345796585083008, + "learning_rate": 3.924915499799796e-05, + "loss": 0.8692, + "step": 38260 + }, + { + "epoch": 0.61429557456781, + "grad_norm": 0.4921918213367462, + "learning_rate": 3.924397436349941e-05, + "loss": 0.8123, + "step": 38270 + }, + { + "epoch": 0.6144560907879741, + "grad_norm": 0.7029829025268555, + "learning_rate": 3.9238792823167705e-05, + "loss": 0.8281, + "step": 38280 + }, + { + "epoch": 0.6146166070081381, + "grad_norm": 0.6954795122146606, + "learning_rate": 3.923361037733233e-05, + "loss": 0.7262, + "step": 38290 + }, + { + "epoch": 0.6147771232283022, + "grad_norm": 1.2569034099578857, + "learning_rate": 3.922842702632289e-05, + "loss": 0.6913, + "step": 38300 + }, + { + "epoch": 0.6149376394484662, + "grad_norm": 0.8009582757949829, + "learning_rate": 3.9223242770469e-05, + "loss": 0.748, + "step": 38310 + }, + { + "epoch": 0.6150981556686304, + "grad_norm": 0.7248051166534424, + "learning_rate": 3.921805761010035e-05, + "loss": 0.6516, + "step": 38320 + }, + { + "epoch": 0.6152586718887944, + "grad_norm": 0.8301505446434021, + "learning_rate": 3.921287154554668e-05, + "loss": 0.7168, + "step": 38330 + }, + { + "epoch": 0.6154191881089585, + "grad_norm": 0.5859376192092896, + "learning_rate": 3.9207684577137815e-05, + "loss": 0.7759, + "step": 38340 + }, + { + "epoch": 0.6155797043291225, + "grad_norm": 0.8399546146392822, + "learning_rate": 3.9202496705203594e-05, + "loss": 0.7853, + "step": 38350 + }, + { + "epoch": 0.6157402205492865, + "grad_norm": 0.8448347449302673, + "learning_rate": 3.919730793007395e-05, + "loss": 0.8776, + "step": 38360 + }, + { + "epoch": 0.6159007367694506, + "grad_norm": 0.8245463371276855, + "learning_rate": 3.919211825207884e-05, + "loss": 0.7867, + "step": 38370 + }, + { + "epoch": 0.6160612529896146, + "grad_norm": 1.0247737169265747, + "learning_rate": 3.918692767154832e-05, + "loss": 0.7295, + "step": 38380 + }, + { + "epoch": 0.6162217692097787, + "grad_norm": 0.7011905908584595, + "learning_rate": 3.9181736188812456e-05, + "loss": 0.8679, + "step": 38390 + }, + { + "epoch": 0.6163822854299427, + "grad_norm": 0.8252625465393066, + "learning_rate": 3.9176543804201416e-05, + "loss": 0.7612, + "step": 38400 + }, + { + "epoch": 0.6165428016501068, + "grad_norm": 0.6579002141952515, + "learning_rate": 3.9171350518045404e-05, + "loss": 0.7537, + "step": 38410 + }, + { + "epoch": 0.6167033178702708, + "grad_norm": 0.7029485106468201, + "learning_rate": 3.916615633067467e-05, + "loss": 0.8399, + "step": 38420 + }, + { + "epoch": 0.6168638340904349, + "grad_norm": 0.573489248752594, + "learning_rate": 3.9160961242419535e-05, + "loss": 0.7585, + "step": 38430 + }, + { + "epoch": 0.6170243503105989, + "grad_norm": 0.6778472661972046, + "learning_rate": 3.915576525361039e-05, + "loss": 0.6708, + "step": 38440 + }, + { + "epoch": 0.617184866530763, + "grad_norm": 0.8818039298057556, + "learning_rate": 3.915056836457766e-05, + "loss": 0.9058, + "step": 38450 + }, + { + "epoch": 0.617345382750927, + "grad_norm": 0.6717743873596191, + "learning_rate": 3.9145370575651835e-05, + "loss": 0.7574, + "step": 38460 + }, + { + "epoch": 0.617505898971091, + "grad_norm": 0.5097578167915344, + "learning_rate": 3.914017188716347e-05, + "loss": 0.7424, + "step": 38470 + }, + { + "epoch": 0.6176664151912551, + "grad_norm": 0.6047983169555664, + "learning_rate": 3.913497229944316e-05, + "loss": 0.7341, + "step": 38480 + }, + { + "epoch": 0.6178269314114191, + "grad_norm": 1.2544664144515991, + "learning_rate": 3.912977181282158e-05, + "loss": 0.7481, + "step": 38490 + }, + { + "epoch": 0.6179874476315832, + "grad_norm": 1.1290738582611084, + "learning_rate": 3.912457042762945e-05, + "loss": 0.808, + "step": 38500 + }, + { + "epoch": 0.6181479638517472, + "grad_norm": 0.6708670854568481, + "learning_rate": 3.9119368144197536e-05, + "loss": 0.7776, + "step": 38510 + }, + { + "epoch": 0.6183084800719113, + "grad_norm": 0.7654004693031311, + "learning_rate": 3.911416496285668e-05, + "loss": 0.8072, + "step": 38520 + }, + { + "epoch": 0.6184689962920753, + "grad_norm": 0.6487962603569031, + "learning_rate": 3.9108960883937785e-05, + "loss": 0.8634, + "step": 38530 + }, + { + "epoch": 0.6186295125122394, + "grad_norm": 0.7295904159545898, + "learning_rate": 3.9103755907771776e-05, + "loss": 0.7332, + "step": 38540 + }, + { + "epoch": 0.6187900287324034, + "grad_norm": 1.149078130722046, + "learning_rate": 3.909855003468968e-05, + "loss": 0.8765, + "step": 38550 + }, + { + "epoch": 0.6189505449525674, + "grad_norm": 0.6436233520507812, + "learning_rate": 3.9093343265022553e-05, + "loss": 0.7927, + "step": 38560 + }, + { + "epoch": 0.6191110611727315, + "grad_norm": 0.8902339339256287, + "learning_rate": 3.90881355991015e-05, + "loss": 0.7362, + "step": 38570 + }, + { + "epoch": 0.6192715773928955, + "grad_norm": 0.6229974627494812, + "learning_rate": 3.9082927037257725e-05, + "loss": 0.774, + "step": 38580 + }, + { + "epoch": 0.6194320936130596, + "grad_norm": 0.508955180644989, + "learning_rate": 3.907771757982245e-05, + "loss": 0.7911, + "step": 38590 + }, + { + "epoch": 0.6195926098332236, + "grad_norm": 0.8855603337287903, + "learning_rate": 3.907250722712697e-05, + "loss": 0.7461, + "step": 38600 + }, + { + "epoch": 0.6197531260533877, + "grad_norm": 0.8658965229988098, + "learning_rate": 3.906729597950261e-05, + "loss": 0.7248, + "step": 38610 + }, + { + "epoch": 0.6199136422735517, + "grad_norm": 0.6366375684738159, + "learning_rate": 3.9062083837280806e-05, + "loss": 0.6719, + "step": 38620 + }, + { + "epoch": 0.6200741584937158, + "grad_norm": 1.5778453350067139, + "learning_rate": 3.9056870800793e-05, + "loss": 0.8892, + "step": 38630 + }, + { + "epoch": 0.6202346747138798, + "grad_norm": 0.6174941062927246, + "learning_rate": 3.9051656870370715e-05, + "loss": 0.6952, + "step": 38640 + }, + { + "epoch": 0.6203951909340439, + "grad_norm": 0.6001375317573547, + "learning_rate": 3.9046442046345524e-05, + "loss": 0.8322, + "step": 38650 + }, + { + "epoch": 0.6205557071542079, + "grad_norm": 1.3570482730865479, + "learning_rate": 3.904122632904908e-05, + "loss": 0.7535, + "step": 38660 + }, + { + "epoch": 0.6207162233743719, + "grad_norm": 0.5283072590827942, + "learning_rate": 3.903600971881304e-05, + "loss": 0.7954, + "step": 38670 + }, + { + "epoch": 0.620876739594536, + "grad_norm": 1.1652617454528809, + "learning_rate": 3.903079221596917e-05, + "loss": 0.737, + "step": 38680 + }, + { + "epoch": 0.6210372558147, + "grad_norm": 0.882037878036499, + "learning_rate": 3.902557382084926e-05, + "loss": 0.6685, + "step": 38690 + }, + { + "epoch": 0.6211977720348641, + "grad_norm": 0.8619228601455688, + "learning_rate": 3.9020354533785185e-05, + "loss": 0.7833, + "step": 38700 + }, + { + "epoch": 0.6213582882550281, + "grad_norm": 1.1697067022323608, + "learning_rate": 3.901513435510885e-05, + "loss": 0.7294, + "step": 38710 + }, + { + "epoch": 0.6215188044751923, + "grad_norm": 0.856677234172821, + "learning_rate": 3.9009913285152235e-05, + "loss": 0.9293, + "step": 38720 + }, + { + "epoch": 0.6216793206953563, + "grad_norm": 0.6528865694999695, + "learning_rate": 3.900469132424736e-05, + "loss": 0.7826, + "step": 38730 + }, + { + "epoch": 0.6218398369155204, + "grad_norm": 0.8786482810974121, + "learning_rate": 3.899946847272632e-05, + "loss": 0.7668, + "step": 38740 + }, + { + "epoch": 0.6220003531356844, + "grad_norm": 0.6822080612182617, + "learning_rate": 3.899424473092125e-05, + "loss": 0.8483, + "step": 38750 + }, + { + "epoch": 0.6221608693558484, + "grad_norm": 0.5774383544921875, + "learning_rate": 3.898902009916435e-05, + "loss": 0.7195, + "step": 38760 + }, + { + "epoch": 0.6223213855760125, + "grad_norm": 0.8303529620170593, + "learning_rate": 3.8983794577787893e-05, + "loss": 0.7621, + "step": 38770 + }, + { + "epoch": 0.6224819017961765, + "grad_norm": 0.6057352423667908, + "learning_rate": 3.897856816712416e-05, + "loss": 0.754, + "step": 38780 + }, + { + "epoch": 0.6226424180163406, + "grad_norm": 0.8845207691192627, + "learning_rate": 3.897334086750555e-05, + "loss": 0.8673, + "step": 38790 + }, + { + "epoch": 0.6228029342365046, + "grad_norm": 0.719015896320343, + "learning_rate": 3.896811267926447e-05, + "loss": 0.8861, + "step": 38800 + }, + { + "epoch": 0.6229634504566687, + "grad_norm": 0.5614177584648132, + "learning_rate": 3.896288360273341e-05, + "loss": 0.7597, + "step": 38810 + }, + { + "epoch": 0.6231239666768327, + "grad_norm": 0.8061092495918274, + "learning_rate": 3.895765363824491e-05, + "loss": 0.8726, + "step": 38820 + }, + { + "epoch": 0.6232844828969968, + "grad_norm": 0.8996514678001404, + "learning_rate": 3.8952422786131546e-05, + "loss": 0.8335, + "step": 38830 + }, + { + "epoch": 0.6234449991171608, + "grad_norm": 0.658605694770813, + "learning_rate": 3.8947191046726004e-05, + "loss": 0.9637, + "step": 38840 + }, + { + "epoch": 0.6236055153373249, + "grad_norm": 0.7160141468048096, + "learning_rate": 3.8941958420360963e-05, + "loss": 0.7861, + "step": 38850 + }, + { + "epoch": 0.6237660315574889, + "grad_norm": 1.227826714515686, + "learning_rate": 3.8936724907369206e-05, + "loss": 0.8138, + "step": 38860 + }, + { + "epoch": 0.6239265477776529, + "grad_norm": 0.5395148992538452, + "learning_rate": 3.8931490508083536e-05, + "loss": 0.8923, + "step": 38870 + }, + { + "epoch": 0.624087063997817, + "grad_norm": 0.6440101861953735, + "learning_rate": 3.892625522283684e-05, + "loss": 0.8314, + "step": 38880 + }, + { + "epoch": 0.624247580217981, + "grad_norm": 1.0478951930999756, + "learning_rate": 3.892101905196206e-05, + "loss": 0.9799, + "step": 38890 + }, + { + "epoch": 0.6244080964381451, + "grad_norm": 0.5907064080238342, + "learning_rate": 3.8915781995792166e-05, + "loss": 0.7634, + "step": 38900 + }, + { + "epoch": 0.6245686126583091, + "grad_norm": 0.605556845664978, + "learning_rate": 3.891054405466022e-05, + "loss": 0.7759, + "step": 38910 + }, + { + "epoch": 0.6247291288784732, + "grad_norm": 0.5810546278953552, + "learning_rate": 3.890530522889931e-05, + "loss": 0.7171, + "step": 38920 + }, + { + "epoch": 0.6248896450986372, + "grad_norm": 0.6882740259170532, + "learning_rate": 3.890006551884261e-05, + "loss": 0.8102, + "step": 38930 + }, + { + "epoch": 0.6250501613188013, + "grad_norm": 0.48953354358673096, + "learning_rate": 3.889482492482334e-05, + "loss": 0.725, + "step": 38940 + }, + { + "epoch": 0.6252106775389653, + "grad_norm": 1.1582659482955933, + "learning_rate": 3.8889583447174735e-05, + "loss": 0.6425, + "step": 38950 + }, + { + "epoch": 0.6253711937591293, + "grad_norm": 0.5571473836898804, + "learning_rate": 3.8884341086230165e-05, + "loss": 0.8508, + "step": 38960 + }, + { + "epoch": 0.6255317099792934, + "grad_norm": 0.6823153495788574, + "learning_rate": 3.887909784232299e-05, + "loss": 0.8447, + "step": 38970 + }, + { + "epoch": 0.6256922261994574, + "grad_norm": 1.1175944805145264, + "learning_rate": 3.8873853715786654e-05, + "loss": 0.7647, + "step": 38980 + }, + { + "epoch": 0.6258527424196215, + "grad_norm": 0.6182888150215149, + "learning_rate": 3.886860870695465e-05, + "loss": 0.9271, + "step": 38990 + }, + { + "epoch": 0.6260132586397855, + "grad_norm": 0.5734162330627441, + "learning_rate": 3.886336281616053e-05, + "loss": 0.7798, + "step": 39000 + }, + { + "epoch": 0.6261737748599496, + "grad_norm": 0.8643282651901245, + "learning_rate": 3.885811604373791e-05, + "loss": 0.7269, + "step": 39010 + }, + { + "epoch": 0.6263342910801136, + "grad_norm": 0.6926249265670776, + "learning_rate": 3.885286839002045e-05, + "loss": 0.7977, + "step": 39020 + }, + { + "epoch": 0.6264948073002777, + "grad_norm": 0.6150055527687073, + "learning_rate": 3.884761985534187e-05, + "loss": 0.8622, + "step": 39030 + }, + { + "epoch": 0.6266553235204417, + "grad_norm": 0.832316517829895, + "learning_rate": 3.884237044003594e-05, + "loss": 0.7032, + "step": 39040 + }, + { + "epoch": 0.6268158397406058, + "grad_norm": 0.7060390710830688, + "learning_rate": 3.883712014443649e-05, + "loss": 0.8706, + "step": 39050 + }, + { + "epoch": 0.6269763559607698, + "grad_norm": 0.6310925483703613, + "learning_rate": 3.8831868968877426e-05, + "loss": 0.7701, + "step": 39060 + }, + { + "epoch": 0.6271368721809338, + "grad_norm": 0.6766690015792847, + "learning_rate": 3.8826616913692674e-05, + "loss": 0.8708, + "step": 39070 + }, + { + "epoch": 0.6272973884010979, + "grad_norm": 0.967619001865387, + "learning_rate": 3.882136397921624e-05, + "loss": 0.6846, + "step": 39080 + }, + { + "epoch": 0.6274579046212619, + "grad_norm": 0.8793244361877441, + "learning_rate": 3.8816110165782186e-05, + "loss": 0.7775, + "step": 39090 + }, + { + "epoch": 0.627618420841426, + "grad_norm": 0.7329443693161011, + "learning_rate": 3.881085547372462e-05, + "loss": 0.806, + "step": 39100 + }, + { + "epoch": 0.62777893706159, + "grad_norm": 0.7828921675682068, + "learning_rate": 3.880559990337769e-05, + "loss": 0.8161, + "step": 39110 + }, + { + "epoch": 0.6279394532817542, + "grad_norm": 0.7719807028770447, + "learning_rate": 3.880034345507565e-05, + "loss": 0.8454, + "step": 39120 + }, + { + "epoch": 0.6280999695019182, + "grad_norm": 0.7153074741363525, + "learning_rate": 3.879508612915277e-05, + "loss": 0.865, + "step": 39130 + }, + { + "epoch": 0.6282604857220823, + "grad_norm": 0.7088330388069153, + "learning_rate": 3.878982792594337e-05, + "loss": 0.7454, + "step": 39140 + }, + { + "epoch": 0.6284210019422463, + "grad_norm": 0.5399261713027954, + "learning_rate": 3.878456884578186e-05, + "loss": 0.6726, + "step": 39150 + }, + { + "epoch": 0.6285815181624103, + "grad_norm": 0.648587703704834, + "learning_rate": 3.8779308889002676e-05, + "loss": 0.7683, + "step": 39160 + }, + { + "epoch": 0.6287420343825744, + "grad_norm": 0.4183673858642578, + "learning_rate": 3.877404805594032e-05, + "loss": 0.6671, + "step": 39170 + }, + { + "epoch": 0.6289025506027384, + "grad_norm": 0.7793266177177429, + "learning_rate": 3.8768786346929356e-05, + "loss": 0.8139, + "step": 39180 + }, + { + "epoch": 0.6290630668229025, + "grad_norm": 0.6110801100730896, + "learning_rate": 3.8763523762304396e-05, + "loss": 0.8473, + "step": 39190 + }, + { + "epoch": 0.6292235830430665, + "grad_norm": 0.6376945376396179, + "learning_rate": 3.875826030240011e-05, + "loss": 0.7625, + "step": 39200 + }, + { + "epoch": 0.6293840992632306, + "grad_norm": 0.6309436559677124, + "learning_rate": 3.8752995967551214e-05, + "loss": 0.8856, + "step": 39210 + }, + { + "epoch": 0.6295446154833946, + "grad_norm": 1.2552645206451416, + "learning_rate": 3.8747730758092506e-05, + "loss": 0.8989, + "step": 39220 + }, + { + "epoch": 0.6297051317035587, + "grad_norm": 0.8188368082046509, + "learning_rate": 3.87424646743588e-05, + "loss": 0.7538, + "step": 39230 + }, + { + "epoch": 0.6298656479237227, + "grad_norm": 0.9643368721008301, + "learning_rate": 3.8737197716685016e-05, + "loss": 0.8248, + "step": 39240 + }, + { + "epoch": 0.6300261641438868, + "grad_norm": 0.6017472743988037, + "learning_rate": 3.873192988540608e-05, + "loss": 0.7921, + "step": 39250 + }, + { + "epoch": 0.6301866803640508, + "grad_norm": 0.6743314266204834, + "learning_rate": 3.8726661180857e-05, + "loss": 0.8089, + "step": 39260 + }, + { + "epoch": 0.6303471965842148, + "grad_norm": 0.6778842210769653, + "learning_rate": 3.8721391603372834e-05, + "loss": 0.7824, + "step": 39270 + }, + { + "epoch": 0.6305077128043789, + "grad_norm": 0.743521511554718, + "learning_rate": 3.87161211532887e-05, + "loss": 0.7502, + "step": 39280 + }, + { + "epoch": 0.6306682290245429, + "grad_norm": 0.690519392490387, + "learning_rate": 3.871084983093977e-05, + "loss": 0.7656, + "step": 39290 + }, + { + "epoch": 0.630828745244707, + "grad_norm": 0.9039788842201233, + "learning_rate": 3.8705577636661264e-05, + "loss": 0.7449, + "step": 39300 + }, + { + "epoch": 0.630989261464871, + "grad_norm": 0.7963625192642212, + "learning_rate": 3.870030457078846e-05, + "loss": 0.7235, + "step": 39310 + }, + { + "epoch": 0.6311497776850351, + "grad_norm": 0.9618449807167053, + "learning_rate": 3.8695030633656705e-05, + "loss": 0.763, + "step": 39320 + }, + { + "epoch": 0.6313102939051991, + "grad_norm": 0.8159765005111694, + "learning_rate": 3.8689755825601374e-05, + "loss": 0.7457, + "step": 39330 + }, + { + "epoch": 0.6314708101253632, + "grad_norm": 1.0109665393829346, + "learning_rate": 3.868448014695794e-05, + "loss": 0.8527, + "step": 39340 + }, + { + "epoch": 0.6316313263455272, + "grad_norm": 0.9094493389129639, + "learning_rate": 3.8679203598061865e-05, + "loss": 0.8012, + "step": 39350 + }, + { + "epoch": 0.6317918425656912, + "grad_norm": 0.7967928647994995, + "learning_rate": 3.867392617924874e-05, + "loss": 0.695, + "step": 39360 + }, + { + "epoch": 0.6319523587858553, + "grad_norm": 0.7721534371376038, + "learning_rate": 3.8668647890854176e-05, + "loss": 0.8059, + "step": 39370 + }, + { + "epoch": 0.6321128750060193, + "grad_norm": 0.743707537651062, + "learning_rate": 3.866336873321382e-05, + "loss": 0.8136, + "step": 39380 + }, + { + "epoch": 0.6322733912261834, + "grad_norm": 0.5097008943557739, + "learning_rate": 3.865808870666342e-05, + "loss": 0.7765, + "step": 39390 + }, + { + "epoch": 0.6324339074463474, + "grad_norm": 0.8358458876609802, + "learning_rate": 3.865280781153874e-05, + "loss": 0.6758, + "step": 39400 + }, + { + "epoch": 0.6325944236665115, + "grad_norm": 1.1094107627868652, + "learning_rate": 3.8647526048175605e-05, + "loss": 0.6712, + "step": 39410 + }, + { + "epoch": 0.6327549398866755, + "grad_norm": 0.5389155149459839, + "learning_rate": 3.864224341690993e-05, + "loss": 0.6651, + "step": 39420 + }, + { + "epoch": 0.6329154561068396, + "grad_norm": 0.8623170256614685, + "learning_rate": 3.863695991807764e-05, + "loss": 0.7651, + "step": 39430 + }, + { + "epoch": 0.6330759723270036, + "grad_norm": 0.8383775949478149, + "learning_rate": 3.863167555201474e-05, + "loss": 0.8055, + "step": 39440 + }, + { + "epoch": 0.6332364885471677, + "grad_norm": 0.5349963903427124, + "learning_rate": 3.8626390319057293e-05, + "loss": 0.8115, + "step": 39450 + }, + { + "epoch": 0.6333970047673317, + "grad_norm": 0.7257948517799377, + "learning_rate": 3.862110421954139e-05, + "loss": 0.8227, + "step": 39460 + }, + { + "epoch": 0.6335575209874957, + "grad_norm": 0.5577735304832458, + "learning_rate": 3.861581725380321e-05, + "loss": 0.8243, + "step": 39470 + }, + { + "epoch": 0.6337180372076598, + "grad_norm": 0.5767903923988342, + "learning_rate": 3.861052942217897e-05, + "loss": 0.6703, + "step": 39480 + }, + { + "epoch": 0.6338785534278238, + "grad_norm": 1.0658624172210693, + "learning_rate": 3.8605240725004946e-05, + "loss": 0.8399, + "step": 39490 + }, + { + "epoch": 0.634039069647988, + "grad_norm": 0.6903990507125854, + "learning_rate": 3.859995116261747e-05, + "loss": 0.7705, + "step": 39500 + }, + { + "epoch": 0.634199585868152, + "grad_norm": 1.0357047319412231, + "learning_rate": 3.859466073535291e-05, + "loss": 0.7511, + "step": 39510 + }, + { + "epoch": 0.6343601020883161, + "grad_norm": 0.5914377570152283, + "learning_rate": 3.858936944354773e-05, + "loss": 0.8429, + "step": 39520 + }, + { + "epoch": 0.6345206183084801, + "grad_norm": 1.3885873556137085, + "learning_rate": 3.858407728753841e-05, + "loss": 0.7398, + "step": 39530 + }, + { + "epoch": 0.6346811345286442, + "grad_norm": 0.5460153222084045, + "learning_rate": 3.857878426766151e-05, + "loss": 0.7982, + "step": 39540 + }, + { + "epoch": 0.6348416507488082, + "grad_norm": 0.7979714870452881, + "learning_rate": 3.857349038425363e-05, + "loss": 0.7825, + "step": 39550 + }, + { + "epoch": 0.6350021669689722, + "grad_norm": 0.5885058641433716, + "learning_rate": 3.8568195637651425e-05, + "loss": 0.6416, + "step": 39560 + }, + { + "epoch": 0.6351626831891363, + "grad_norm": 0.5465879440307617, + "learning_rate": 3.856290002819162e-05, + "loss": 0.7285, + "step": 39570 + }, + { + "epoch": 0.6353231994093003, + "grad_norm": 0.9527516961097717, + "learning_rate": 3.8557603556210974e-05, + "loss": 0.759, + "step": 39580 + }, + { + "epoch": 0.6354837156294644, + "grad_norm": 0.7033668160438538, + "learning_rate": 3.855230622204632e-05, + "loss": 0.8385, + "step": 39590 + }, + { + "epoch": 0.6356442318496284, + "grad_norm": 0.5614866018295288, + "learning_rate": 3.854700802603454e-05, + "loss": 0.6642, + "step": 39600 + }, + { + "epoch": 0.6358047480697925, + "grad_norm": 0.8587833642959595, + "learning_rate": 3.8541708968512555e-05, + "loss": 0.8257, + "step": 39610 + }, + { + "epoch": 0.6359652642899565, + "grad_norm": 1.1065160036087036, + "learning_rate": 3.853640904981736e-05, + "loss": 0.8488, + "step": 39620 + }, + { + "epoch": 0.6361257805101206, + "grad_norm": 0.9200987815856934, + "learning_rate": 3.8531108270286e-05, + "loss": 0.7477, + "step": 39630 + }, + { + "epoch": 0.6362862967302846, + "grad_norm": 0.735983669757843, + "learning_rate": 3.852580663025558e-05, + "loss": 0.7733, + "step": 39640 + }, + { + "epoch": 0.6364468129504487, + "grad_norm": 0.7974525094032288, + "learning_rate": 3.852050413006324e-05, + "loss": 0.711, + "step": 39650 + }, + { + "epoch": 0.6366073291706127, + "grad_norm": 0.6517489552497864, + "learning_rate": 3.85152007700462e-05, + "loss": 0.8662, + "step": 39660 + }, + { + "epoch": 0.6367678453907767, + "grad_norm": 0.7777332067489624, + "learning_rate": 3.850989655054171e-05, + "loss": 0.7565, + "step": 39670 + }, + { + "epoch": 0.6369283616109408, + "grad_norm": 0.9857264757156372, + "learning_rate": 3.850459147188711e-05, + "loss": 0.8281, + "step": 39680 + }, + { + "epoch": 0.6370888778311048, + "grad_norm": 0.840593695640564, + "learning_rate": 3.8499285534419736e-05, + "loss": 0.7185, + "step": 39690 + }, + { + "epoch": 0.6372493940512689, + "grad_norm": 0.6453313827514648, + "learning_rate": 3.849397873847704e-05, + "loss": 0.9086, + "step": 39700 + }, + { + "epoch": 0.6374099102714329, + "grad_norm": 0.8974493741989136, + "learning_rate": 3.848867108439651e-05, + "loss": 0.8095, + "step": 39710 + }, + { + "epoch": 0.637570426491597, + "grad_norm": 0.6186667680740356, + "learning_rate": 3.8483362572515655e-05, + "loss": 0.739, + "step": 39720 + }, + { + "epoch": 0.637730942711761, + "grad_norm": 0.6679637432098389, + "learning_rate": 3.847805320317209e-05, + "loss": 0.7823, + "step": 39730 + }, + { + "epoch": 0.6378914589319251, + "grad_norm": 0.6926317811012268, + "learning_rate": 3.847274297670345e-05, + "loss": 0.7668, + "step": 39740 + }, + { + "epoch": 0.6380519751520891, + "grad_norm": 0.5793929696083069, + "learning_rate": 3.846743189344743e-05, + "loss": 0.7467, + "step": 39750 + }, + { + "epoch": 0.6382124913722532, + "grad_norm": 1.0341182947158813, + "learning_rate": 3.846211995374178e-05, + "loss": 0.8369, + "step": 39760 + }, + { + "epoch": 0.6383730075924172, + "grad_norm": 0.7414236068725586, + "learning_rate": 3.8456807157924326e-05, + "loss": 0.6976, + "step": 39770 + }, + { + "epoch": 0.6385335238125812, + "grad_norm": 0.701656699180603, + "learning_rate": 3.845149350633292e-05, + "loss": 0.8154, + "step": 39780 + }, + { + "epoch": 0.6386940400327453, + "grad_norm": 1.1818305253982544, + "learning_rate": 3.8446178999305474e-05, + "loss": 0.8031, + "step": 39790 + }, + { + "epoch": 0.6388545562529093, + "grad_norm": 0.5518404841423035, + "learning_rate": 3.8440863637179964e-05, + "loss": 0.8022, + "step": 39800 + }, + { + "epoch": 0.6390150724730734, + "grad_norm": 0.9845978617668152, + "learning_rate": 3.843554742029442e-05, + "loss": 0.7177, + "step": 39810 + }, + { + "epoch": 0.6391755886932374, + "grad_norm": 0.6715173721313477, + "learning_rate": 3.843023034898692e-05, + "loss": 0.7192, + "step": 39820 + }, + { + "epoch": 0.6393361049134015, + "grad_norm": 0.747756838798523, + "learning_rate": 3.842491242359559e-05, + "loss": 0.6778, + "step": 39830 + }, + { + "epoch": 0.6394966211335655, + "grad_norm": 0.7844328284263611, + "learning_rate": 3.841959364445863e-05, + "loss": 0.8052, + "step": 39840 + }, + { + "epoch": 0.6396571373537296, + "grad_norm": 0.6470012664794922, + "learning_rate": 3.841427401191428e-05, + "loss": 0.8331, + "step": 39850 + }, + { + "epoch": 0.6398176535738936, + "grad_norm": 0.6879901885986328, + "learning_rate": 3.840895352630084e-05, + "loss": 0.8672, + "step": 39860 + }, + { + "epoch": 0.6399781697940576, + "grad_norm": 0.6118249297142029, + "learning_rate": 3.840363218795666e-05, + "loss": 0.768, + "step": 39870 + }, + { + "epoch": 0.6401386860142217, + "grad_norm": 0.5959610939025879, + "learning_rate": 3.8398309997220135e-05, + "loss": 0.8211, + "step": 39880 + }, + { + "epoch": 0.6402992022343857, + "grad_norm": 0.5730933547019958, + "learning_rate": 3.839298695442973e-05, + "loss": 0.7851, + "step": 39890 + }, + { + "epoch": 0.6404597184545499, + "grad_norm": 2.9221153259277344, + "learning_rate": 3.838766305992399e-05, + "loss": 0.7635, + "step": 39900 + }, + { + "epoch": 0.6406202346747139, + "grad_norm": 0.6306505799293518, + "learning_rate": 3.8382338314041433e-05, + "loss": 0.7919, + "step": 39910 + }, + { + "epoch": 0.640780750894878, + "grad_norm": 0.8358721733093262, + "learning_rate": 3.837701271712071e-05, + "loss": 0.869, + "step": 39920 + }, + { + "epoch": 0.640941267115042, + "grad_norm": 0.6419198513031006, + "learning_rate": 3.837168626950049e-05, + "loss": 0.65, + "step": 39930 + }, + { + "epoch": 0.6411017833352061, + "grad_norm": 0.5764515995979309, + "learning_rate": 3.836635897151952e-05, + "loss": 0.7535, + "step": 39940 + }, + { + "epoch": 0.6412622995553701, + "grad_norm": 0.5320503115653992, + "learning_rate": 3.836103082351656e-05, + "loss": 0.7516, + "step": 39950 + }, + { + "epoch": 0.6414228157755342, + "grad_norm": 1.1317205429077148, + "learning_rate": 3.835570182583047e-05, + "loss": 0.808, + "step": 39960 + }, + { + "epoch": 0.6415833319956982, + "grad_norm": 1.0593405961990356, + "learning_rate": 3.835037197880013e-05, + "loss": 0.7931, + "step": 39970 + }, + { + "epoch": 0.6417438482158622, + "grad_norm": 0.8761583566665649, + "learning_rate": 3.8345041282764485e-05, + "loss": 0.8584, + "step": 39980 + }, + { + "epoch": 0.6419043644360263, + "grad_norm": 0.6453713178634644, + "learning_rate": 3.8339709738062544e-05, + "loss": 0.7129, + "step": 39990 + }, + { + "epoch": 0.6420648806561903, + "grad_norm": 0.8191493153572083, + "learning_rate": 3.833437734503337e-05, + "loss": 0.8221, + "step": 40000 + }, + { + "epoch": 0.6420648806561903, + "eval_loss": 0.7868782877922058, + "eval_runtime": 1832.8985, + "eval_samples_per_second": 14.311, + "eval_steps_per_second": 1.789, + "step": 40000 + }, + { + "epoch": 0.6422253968763544, + "grad_norm": 1.0147860050201416, + "learning_rate": 3.832904410401605e-05, + "loss": 0.7878, + "step": 40010 + }, + { + "epoch": 0.6423859130965184, + "grad_norm": 0.9309044480323792, + "learning_rate": 3.8323710015349756e-05, + "loss": 0.7946, + "step": 40020 + }, + { + "epoch": 0.6425464293166825, + "grad_norm": 0.9423816800117493, + "learning_rate": 3.831837507937372e-05, + "loss": 0.8158, + "step": 40030 + }, + { + "epoch": 0.6427069455368465, + "grad_norm": 0.78261399269104, + "learning_rate": 3.8313039296427193e-05, + "loss": 0.8736, + "step": 40040 + }, + { + "epoch": 0.6428674617570106, + "grad_norm": 0.5020898580551147, + "learning_rate": 3.830770266684951e-05, + "loss": 0.9396, + "step": 40050 + }, + { + "epoch": 0.6430279779771746, + "grad_norm": 0.4810607135295868, + "learning_rate": 3.830236519098004e-05, + "loss": 0.745, + "step": 40060 + }, + { + "epoch": 0.6431884941973386, + "grad_norm": 0.672624945640564, + "learning_rate": 3.829702686915823e-05, + "loss": 0.6258, + "step": 40070 + }, + { + "epoch": 0.6433490104175027, + "grad_norm": 0.7638553977012634, + "learning_rate": 3.8291687701723546e-05, + "loss": 0.8545, + "step": 40080 + }, + { + "epoch": 0.6435095266376667, + "grad_norm": 1.1648316383361816, + "learning_rate": 3.8286347689015544e-05, + "loss": 0.8382, + "step": 40090 + }, + { + "epoch": 0.6436700428578308, + "grad_norm": 0.6437062621116638, + "learning_rate": 3.828100683137381e-05, + "loss": 0.8364, + "step": 40100 + }, + { + "epoch": 0.6438305590779948, + "grad_norm": 0.6238693594932556, + "learning_rate": 3.8275665129137994e-05, + "loss": 0.7572, + "step": 40110 + }, + { + "epoch": 0.6439910752981589, + "grad_norm": 0.6579787135124207, + "learning_rate": 3.82703225826478e-05, + "loss": 0.8101, + "step": 40120 + }, + { + "epoch": 0.6441515915183229, + "grad_norm": 1.0210704803466797, + "learning_rate": 3.8264979192242973e-05, + "loss": 0.6092, + "step": 40130 + }, + { + "epoch": 0.644312107738487, + "grad_norm": 0.8782562017440796, + "learning_rate": 3.8259634958263323e-05, + "loss": 0.7318, + "step": 40140 + }, + { + "epoch": 0.644472623958651, + "grad_norm": 0.779369592666626, + "learning_rate": 3.825428988104872e-05, + "loss": 0.8235, + "step": 40150 + }, + { + "epoch": 0.6446331401788151, + "grad_norm": 0.7976564764976501, + "learning_rate": 3.8248943960939075e-05, + "loss": 0.8427, + "step": 40160 + }, + { + "epoch": 0.6447936563989791, + "grad_norm": 2.4022927284240723, + "learning_rate": 3.8243597198274366e-05, + "loss": 0.7698, + "step": 40170 + }, + { + "epoch": 0.6449541726191431, + "grad_norm": 0.6958513855934143, + "learning_rate": 3.82382495933946e-05, + "loss": 0.7902, + "step": 40180 + }, + { + "epoch": 0.6451146888393072, + "grad_norm": 1.5660901069641113, + "learning_rate": 3.8232901146639855e-05, + "loss": 0.7818, + "step": 40190 + }, + { + "epoch": 0.6452752050594712, + "grad_norm": 0.5135864019393921, + "learning_rate": 3.8227551858350275e-05, + "loss": 0.7675, + "step": 40200 + }, + { + "epoch": 0.6454357212796353, + "grad_norm": 0.6985871195793152, + "learning_rate": 3.8222201728866035e-05, + "loss": 0.8032, + "step": 40210 + }, + { + "epoch": 0.6455962374997993, + "grad_norm": 0.6190316677093506, + "learning_rate": 3.821685075852737e-05, + "loss": 0.782, + "step": 40220 + }, + { + "epoch": 0.6457567537199634, + "grad_norm": 0.6312642693519592, + "learning_rate": 3.821149894767456e-05, + "loss": 0.7609, + "step": 40230 + }, + { + "epoch": 0.6459172699401274, + "grad_norm": 1.006691813468933, + "learning_rate": 3.820614629664798e-05, + "loss": 0.7992, + "step": 40240 + }, + { + "epoch": 0.6460777861602915, + "grad_norm": 0.6068981289863586, + "learning_rate": 3.8200792805788e-05, + "loss": 0.81, + "step": 40250 + }, + { + "epoch": 0.6462383023804555, + "grad_norm": 0.9753682613372803, + "learning_rate": 3.8195438475435074e-05, + "loss": 0.7648, + "step": 40260 + }, + { + "epoch": 0.6463988186006195, + "grad_norm": 0.9492834210395813, + "learning_rate": 3.819008330592971e-05, + "loss": 0.7238, + "step": 40270 + }, + { + "epoch": 0.6465593348207836, + "grad_norm": 0.7385066747665405, + "learning_rate": 3.8184727297612474e-05, + "loss": 0.7893, + "step": 40280 + }, + { + "epoch": 0.6467198510409476, + "grad_norm": 0.7286997437477112, + "learning_rate": 3.817937045082396e-05, + "loss": 0.8529, + "step": 40290 + }, + { + "epoch": 0.6468803672611118, + "grad_norm": 0.8086140751838684, + "learning_rate": 3.817401276590484e-05, + "loss": 0.7506, + "step": 40300 + }, + { + "epoch": 0.6470408834812758, + "grad_norm": 0.6796326041221619, + "learning_rate": 3.8168654243195844e-05, + "loss": 0.7388, + "step": 40310 + }, + { + "epoch": 0.6472013997014399, + "grad_norm": 0.9214574098587036, + "learning_rate": 3.8163294883037714e-05, + "loss": 0.8929, + "step": 40320 + }, + { + "epoch": 0.6473619159216039, + "grad_norm": 0.4827450215816498, + "learning_rate": 3.81579346857713e-05, + "loss": 0.7155, + "step": 40330 + }, + { + "epoch": 0.647522432141768, + "grad_norm": 1.4209054708480835, + "learning_rate": 3.8152573651737464e-05, + "loss": 0.7572, + "step": 40340 + }, + { + "epoch": 0.647682948361932, + "grad_norm": 0.43666872382164, + "learning_rate": 3.8147211781277146e-05, + "loss": 0.7548, + "step": 40350 + }, + { + "epoch": 0.6478434645820961, + "grad_norm": 0.5615044236183167, + "learning_rate": 3.814184907473133e-05, + "loss": 0.847, + "step": 40360 + }, + { + "epoch": 0.6480039808022601, + "grad_norm": 0.7500429749488831, + "learning_rate": 3.8136485532441044e-05, + "loss": 0.8001, + "step": 40370 + }, + { + "epoch": 0.6481644970224241, + "grad_norm": 0.9535032510757446, + "learning_rate": 3.813112115474737e-05, + "loss": 0.741, + "step": 40380 + }, + { + "epoch": 0.6483250132425882, + "grad_norm": 0.7076505422592163, + "learning_rate": 3.812575594199147e-05, + "loss": 0.731, + "step": 40390 + }, + { + "epoch": 0.6484855294627522, + "grad_norm": 0.6944332718849182, + "learning_rate": 3.812038989451454e-05, + "loss": 0.7696, + "step": 40400 + }, + { + "epoch": 0.6486460456829163, + "grad_norm": 0.7085893750190735, + "learning_rate": 3.8115023012657816e-05, + "loss": 0.747, + "step": 40410 + }, + { + "epoch": 0.6488065619030803, + "grad_norm": 1.1754518747329712, + "learning_rate": 3.81096552967626e-05, + "loss": 0.757, + "step": 40420 + }, + { + "epoch": 0.6489670781232444, + "grad_norm": 0.8813371658325195, + "learning_rate": 3.810428674717026e-05, + "loss": 0.6882, + "step": 40430 + }, + { + "epoch": 0.6491275943434084, + "grad_norm": 0.759650468826294, + "learning_rate": 3.80989173642222e-05, + "loss": 0.7777, + "step": 40440 + }, + { + "epoch": 0.6492881105635725, + "grad_norm": 0.6346726417541504, + "learning_rate": 3.809354714825987e-05, + "loss": 0.7891, + "step": 40450 + }, + { + "epoch": 0.6494486267837365, + "grad_norm": 0.762059211730957, + "learning_rate": 3.8088176099624797e-05, + "loss": 0.7926, + "step": 40460 + }, + { + "epoch": 0.6496091430039005, + "grad_norm": 0.6740433573722839, + "learning_rate": 3.808280421865854e-05, + "loss": 0.7346, + "step": 40470 + }, + { + "epoch": 0.6497696592240646, + "grad_norm": 0.6266574263572693, + "learning_rate": 3.807743150570272e-05, + "loss": 0.7555, + "step": 40480 + }, + { + "epoch": 0.6499301754442286, + "grad_norm": 0.8313930034637451, + "learning_rate": 3.8072057961099025e-05, + "loss": 0.6297, + "step": 40490 + }, + { + "epoch": 0.6500906916643927, + "grad_norm": 0.6239114999771118, + "learning_rate": 3.8066683585189164e-05, + "loss": 0.779, + "step": 40500 + }, + { + "epoch": 0.6502512078845567, + "grad_norm": 0.9158800840377808, + "learning_rate": 3.8061308378314916e-05, + "loss": 0.7579, + "step": 40510 + }, + { + "epoch": 0.6504117241047208, + "grad_norm": 1.043367624282837, + "learning_rate": 3.805593234081812e-05, + "loss": 0.8699, + "step": 40520 + }, + { + "epoch": 0.6505722403248848, + "grad_norm": 0.7188341617584229, + "learning_rate": 3.805055547304066e-05, + "loss": 0.8097, + "step": 40530 + }, + { + "epoch": 0.6507327565450489, + "grad_norm": 1.0061707496643066, + "learning_rate": 3.804517777532447e-05, + "loss": 0.8479, + "step": 40540 + }, + { + "epoch": 0.6508932727652129, + "grad_norm": 0.6614341735839844, + "learning_rate": 3.803979924801154e-05, + "loss": 0.7281, + "step": 40550 + }, + { + "epoch": 0.651053788985377, + "grad_norm": 0.7299546003341675, + "learning_rate": 3.8034419891443916e-05, + "loss": 0.7955, + "step": 40560 + }, + { + "epoch": 0.651214305205541, + "grad_norm": 0.7591082453727722, + "learning_rate": 3.802903970596369e-05, + "loss": 0.782, + "step": 40570 + }, + { + "epoch": 0.651374821425705, + "grad_norm": 0.8950862288475037, + "learning_rate": 3.8023658691913e-05, + "loss": 0.8312, + "step": 40580 + }, + { + "epoch": 0.6515353376458691, + "grad_norm": 0.5887387990951538, + "learning_rate": 3.801827684963408e-05, + "loss": 0.7439, + "step": 40590 + }, + { + "epoch": 0.6516958538660331, + "grad_norm": 0.7629324793815613, + "learning_rate": 3.801289417946915e-05, + "loss": 0.7682, + "step": 40600 + }, + { + "epoch": 0.6518563700861972, + "grad_norm": 1.02439546585083, + "learning_rate": 3.800751068176053e-05, + "loss": 0.7271, + "step": 40610 + }, + { + "epoch": 0.6520168863063612, + "grad_norm": 1.004233956336975, + "learning_rate": 3.8002126356850574e-05, + "loss": 0.7591, + "step": 40620 + }, + { + "epoch": 0.6521774025265253, + "grad_norm": 0.7367459535598755, + "learning_rate": 3.79967412050817e-05, + "loss": 0.7182, + "step": 40630 + }, + { + "epoch": 0.6523379187466893, + "grad_norm": 0.6075099110603333, + "learning_rate": 3.7991355226796365e-05, + "loss": 0.7225, + "step": 40640 + }, + { + "epoch": 0.6524984349668534, + "grad_norm": 1.5331544876098633, + "learning_rate": 3.798596842233709e-05, + "loss": 0.6889, + "step": 40650 + }, + { + "epoch": 0.6526589511870174, + "grad_norm": 0.7296358346939087, + "learning_rate": 3.798058079204645e-05, + "loss": 0.8432, + "step": 40660 + }, + { + "epoch": 0.6528194674071814, + "grad_norm": 0.7966289520263672, + "learning_rate": 3.7975192336267045e-05, + "loss": 0.7752, + "step": 40670 + }, + { + "epoch": 0.6529799836273456, + "grad_norm": 0.8460220694541931, + "learning_rate": 3.796980305534158e-05, + "loss": 0.7868, + "step": 40680 + }, + { + "epoch": 0.6531404998475095, + "grad_norm": 0.7572133541107178, + "learning_rate": 3.7964412949612755e-05, + "loss": 0.74, + "step": 40690 + }, + { + "epoch": 0.6533010160676737, + "grad_norm": 0.5592225790023804, + "learning_rate": 3.795902201942336e-05, + "loss": 0.8485, + "step": 40700 + }, + { + "epoch": 0.6534615322878377, + "grad_norm": 1.0133415460586548, + "learning_rate": 3.7953630265116234e-05, + "loss": 0.7587, + "step": 40710 + }, + { + "epoch": 0.6536220485080018, + "grad_norm": 0.8999005556106567, + "learning_rate": 3.794823768703425e-05, + "loss": 0.9334, + "step": 40720 + }, + { + "epoch": 0.6537825647281658, + "grad_norm": 0.7803480625152588, + "learning_rate": 3.794284428552034e-05, + "loss": 0.7557, + "step": 40730 + }, + { + "epoch": 0.6539430809483299, + "grad_norm": 0.8019670844078064, + "learning_rate": 3.793745006091751e-05, + "loss": 0.7104, + "step": 40740 + }, + { + "epoch": 0.6541035971684939, + "grad_norm": 0.8985278010368347, + "learning_rate": 3.793205501356879e-05, + "loss": 0.8876, + "step": 40750 + }, + { + "epoch": 0.654264113388658, + "grad_norm": 0.6373836994171143, + "learning_rate": 3.7926659143817275e-05, + "loss": 0.7249, + "step": 40760 + }, + { + "epoch": 0.654424629608822, + "grad_norm": 1.3457505702972412, + "learning_rate": 3.79212624520061e-05, + "loss": 0.7798, + "step": 40770 + }, + { + "epoch": 0.654585145828986, + "grad_norm": 0.9450454115867615, + "learning_rate": 3.7915864938478484e-05, + "loss": 0.8199, + "step": 40780 + }, + { + "epoch": 0.6547456620491501, + "grad_norm": 0.6977336406707764, + "learning_rate": 3.791046660357766e-05, + "loss": 0.7823, + "step": 40790 + }, + { + "epoch": 0.6549061782693141, + "grad_norm": 0.9262348413467407, + "learning_rate": 3.7905067447646945e-05, + "loss": 0.8131, + "step": 40800 + }, + { + "epoch": 0.6550666944894782, + "grad_norm": 0.6051598191261292, + "learning_rate": 3.789966747102968e-05, + "loss": 0.6459, + "step": 40810 + }, + { + "epoch": 0.6552272107096422, + "grad_norm": 0.7569774389266968, + "learning_rate": 3.789426667406928e-05, + "loss": 0.7267, + "step": 40820 + }, + { + "epoch": 0.6553877269298063, + "grad_norm": 0.9116839170455933, + "learning_rate": 3.788886505710919e-05, + "loss": 0.9267, + "step": 40830 + }, + { + "epoch": 0.6555482431499703, + "grad_norm": 0.45495134592056274, + "learning_rate": 3.788346262049295e-05, + "loss": 0.8039, + "step": 40840 + }, + { + "epoch": 0.6557087593701344, + "grad_norm": 1.0835769176483154, + "learning_rate": 3.787805936456409e-05, + "loss": 0.772, + "step": 40850 + }, + { + "epoch": 0.6558692755902984, + "grad_norm": 0.7858428359031677, + "learning_rate": 3.7872655289666245e-05, + "loss": 0.8926, + "step": 40860 + }, + { + "epoch": 0.6560297918104624, + "grad_norm": 0.9090468883514404, + "learning_rate": 3.786725039614309e-05, + "loss": 0.9134, + "step": 40870 + }, + { + "epoch": 0.6561903080306265, + "grad_norm": 0.5642369389533997, + "learning_rate": 3.786184468433832e-05, + "loss": 0.768, + "step": 40880 + }, + { + "epoch": 0.6563508242507905, + "grad_norm": 0.6035107970237732, + "learning_rate": 3.785643815459573e-05, + "loss": 0.7893, + "step": 40890 + }, + { + "epoch": 0.6565113404709546, + "grad_norm": 1.3630729913711548, + "learning_rate": 3.785103080725912e-05, + "loss": 0.7062, + "step": 40900 + }, + { + "epoch": 0.6566718566911186, + "grad_norm": 1.3622866868972778, + "learning_rate": 3.784562264267239e-05, + "loss": 0.6836, + "step": 40910 + }, + { + "epoch": 0.6568323729112827, + "grad_norm": 1.3482273817062378, + "learning_rate": 3.7840213661179444e-05, + "loss": 0.8242, + "step": 40920 + }, + { + "epoch": 0.6569928891314467, + "grad_norm": 0.7719997763633728, + "learning_rate": 3.783480386312429e-05, + "loss": 0.8174, + "step": 40930 + }, + { + "epoch": 0.6571534053516108, + "grad_norm": 1.0773439407348633, + "learning_rate": 3.782939324885093e-05, + "loss": 0.8371, + "step": 40940 + }, + { + "epoch": 0.6573139215717748, + "grad_norm": 0.9004764556884766, + "learning_rate": 3.7823981818703467e-05, + "loss": 0.8745, + "step": 40950 + }, + { + "epoch": 0.6574744377919389, + "grad_norm": 0.737823486328125, + "learning_rate": 3.781856957302603e-05, + "loss": 0.7523, + "step": 40960 + }, + { + "epoch": 0.6576349540121029, + "grad_norm": 0.7818324565887451, + "learning_rate": 3.7813156512162794e-05, + "loss": 0.8852, + "step": 40970 + }, + { + "epoch": 0.6577954702322669, + "grad_norm": 0.7855982184410095, + "learning_rate": 3.7807742636458024e-05, + "loss": 0.8574, + "step": 40980 + }, + { + "epoch": 0.657955986452431, + "grad_norm": 0.8089808821678162, + "learning_rate": 3.780232794625599e-05, + "loss": 0.8949, + "step": 40990 + }, + { + "epoch": 0.658116502672595, + "grad_norm": 0.6120286583900452, + "learning_rate": 3.7796912441901036e-05, + "loss": 0.7253, + "step": 41000 + }, + { + "epoch": 0.6582770188927591, + "grad_norm": 1.038078784942627, + "learning_rate": 3.779149612373756e-05, + "loss": 0.7914, + "step": 41010 + }, + { + "epoch": 0.6584375351129231, + "grad_norm": 0.6817883849143982, + "learning_rate": 3.778607899211002e-05, + "loss": 0.8259, + "step": 41020 + }, + { + "epoch": 0.6585980513330872, + "grad_norm": 0.8258918523788452, + "learning_rate": 3.77806610473629e-05, + "loss": 0.7596, + "step": 41030 + }, + { + "epoch": 0.6587585675532512, + "grad_norm": 0.7596319913864136, + "learning_rate": 3.777524228984074e-05, + "loss": 0.8435, + "step": 41040 + }, + { + "epoch": 0.6589190837734153, + "grad_norm": 0.68320232629776, + "learning_rate": 3.776982271988817e-05, + "loss": 0.8514, + "step": 41050 + }, + { + "epoch": 0.6590795999935793, + "grad_norm": 0.6151543259620667, + "learning_rate": 3.776440233784981e-05, + "loss": 0.9222, + "step": 41060 + }, + { + "epoch": 0.6592401162137433, + "grad_norm": 0.6481803059577942, + "learning_rate": 3.7758981144070394e-05, + "loss": 0.7354, + "step": 41070 + }, + { + "epoch": 0.6594006324339075, + "grad_norm": 1.064741611480713, + "learning_rate": 3.775355913889466e-05, + "loss": 0.7815, + "step": 41080 + }, + { + "epoch": 0.6595611486540714, + "grad_norm": 0.8615776896476746, + "learning_rate": 3.7748136322667413e-05, + "loss": 0.7485, + "step": 41090 + }, + { + "epoch": 0.6597216648742356, + "grad_norm": 0.6764557361602783, + "learning_rate": 3.774271269573354e-05, + "loss": 0.7776, + "step": 41100 + }, + { + "epoch": 0.6598821810943996, + "grad_norm": 1.1117810010910034, + "learning_rate": 3.773728825843792e-05, + "loss": 0.925, + "step": 41110 + }, + { + "epoch": 0.6600426973145637, + "grad_norm": 0.6739598512649536, + "learning_rate": 3.773186301112552e-05, + "loss": 0.784, + "step": 41120 + }, + { + "epoch": 0.6602032135347277, + "grad_norm": 0.7962377071380615, + "learning_rate": 3.7726436954141365e-05, + "loss": 0.7274, + "step": 41130 + }, + { + "epoch": 0.6603637297548918, + "grad_norm": 0.7212782502174377, + "learning_rate": 3.772101008783052e-05, + "loss": 0.738, + "step": 41140 + }, + { + "epoch": 0.6605242459750558, + "grad_norm": 0.7357504963874817, + "learning_rate": 3.77155824125381e-05, + "loss": 0.7666, + "step": 41150 + }, + { + "epoch": 0.6606847621952199, + "grad_norm": 0.7649624347686768, + "learning_rate": 3.771015392860927e-05, + "loss": 0.8022, + "step": 41160 + }, + { + "epoch": 0.6608452784153839, + "grad_norm": 0.8350104093551636, + "learning_rate": 3.770472463638924e-05, + "loss": 0.6817, + "step": 41170 + }, + { + "epoch": 0.6610057946355479, + "grad_norm": 0.5137888193130493, + "learning_rate": 3.769929453622331e-05, + "loss": 0.8816, + "step": 41180 + }, + { + "epoch": 0.661166310855712, + "grad_norm": 0.5479916930198669, + "learning_rate": 3.769386362845678e-05, + "loss": 0.7354, + "step": 41190 + }, + { + "epoch": 0.661326827075876, + "grad_norm": 0.8390747904777527, + "learning_rate": 3.768843191343502e-05, + "loss": 0.5958, + "step": 41200 + }, + { + "epoch": 0.6614873432960401, + "grad_norm": 0.5682607889175415, + "learning_rate": 3.768299939150347e-05, + "loss": 0.7471, + "step": 41210 + }, + { + "epoch": 0.6616478595162041, + "grad_norm": 0.5527790188789368, + "learning_rate": 3.7677566063007596e-05, + "loss": 0.7679, + "step": 41220 + }, + { + "epoch": 0.6618083757363682, + "grad_norm": 0.5778656005859375, + "learning_rate": 3.767213192829294e-05, + "loss": 0.7063, + "step": 41230 + }, + { + "epoch": 0.6619688919565322, + "grad_norm": 0.538327693939209, + "learning_rate": 3.766669698770506e-05, + "loss": 0.8591, + "step": 41240 + }, + { + "epoch": 0.6621294081766963, + "grad_norm": 0.5548145771026611, + "learning_rate": 3.766126124158961e-05, + "loss": 0.8178, + "step": 41250 + }, + { + "epoch": 0.6622899243968603, + "grad_norm": 1.1938802003860474, + "learning_rate": 3.765582469029224e-05, + "loss": 0.7636, + "step": 41260 + }, + { + "epoch": 0.6624504406170244, + "grad_norm": 0.6409090757369995, + "learning_rate": 3.765038733415872e-05, + "loss": 0.7808, + "step": 41270 + }, + { + "epoch": 0.6626109568371884, + "grad_norm": 0.5131124258041382, + "learning_rate": 3.764494917353481e-05, + "loss": 0.7571, + "step": 41280 + }, + { + "epoch": 0.6627714730573524, + "grad_norm": 0.6379016637802124, + "learning_rate": 3.7639510208766345e-05, + "loss": 0.7635, + "step": 41290 + }, + { + "epoch": 0.6629319892775165, + "grad_norm": 0.8976826071739197, + "learning_rate": 3.763407044019923e-05, + "loss": 0.8813, + "step": 41300 + }, + { + "epoch": 0.6630925054976805, + "grad_norm": 0.7030189633369446, + "learning_rate": 3.7628629868179376e-05, + "loss": 0.9023, + "step": 41310 + }, + { + "epoch": 0.6632530217178446, + "grad_norm": 0.6968205571174622, + "learning_rate": 3.76231884930528e-05, + "loss": 0.8622, + "step": 41320 + }, + { + "epoch": 0.6634135379380086, + "grad_norm": 0.5944613814353943, + "learning_rate": 3.761774631516551e-05, + "loss": 0.8395, + "step": 41330 + }, + { + "epoch": 0.6635740541581727, + "grad_norm": 1.26247239112854, + "learning_rate": 3.761230333486362e-05, + "loss": 0.6842, + "step": 41340 + }, + { + "epoch": 0.6637345703783367, + "grad_norm": 1.1889090538024902, + "learning_rate": 3.760685955249327e-05, + "loss": 0.7958, + "step": 41350 + }, + { + "epoch": 0.6638950865985008, + "grad_norm": 0.7408890128135681, + "learning_rate": 3.760141496840064e-05, + "loss": 0.8036, + "step": 41360 + }, + { + "epoch": 0.6640556028186648, + "grad_norm": 0.6957759261131287, + "learning_rate": 3.759596958293198e-05, + "loss": 0.8226, + "step": 41370 + }, + { + "epoch": 0.6642161190388288, + "grad_norm": 0.9536712765693665, + "learning_rate": 3.7590523396433586e-05, + "loss": 0.8506, + "step": 41380 + }, + { + "epoch": 0.6643766352589929, + "grad_norm": 1.1573764085769653, + "learning_rate": 3.758507640925181e-05, + "loss": 0.7208, + "step": 41390 + }, + { + "epoch": 0.6645371514791569, + "grad_norm": 1.2143135070800781, + "learning_rate": 3.757962862173303e-05, + "loss": 0.8423, + "step": 41400 + }, + { + "epoch": 0.664697667699321, + "grad_norm": 0.6740461587905884, + "learning_rate": 3.757418003422371e-05, + "loss": 0.9092, + "step": 41410 + }, + { + "epoch": 0.664858183919485, + "grad_norm": 0.850688636302948, + "learning_rate": 3.756873064707034e-05, + "loss": 0.7777, + "step": 41420 + }, + { + "epoch": 0.6650187001396491, + "grad_norm": 0.6841651201248169, + "learning_rate": 3.756328046061948e-05, + "loss": 0.7198, + "step": 41430 + }, + { + "epoch": 0.6651792163598131, + "grad_norm": 0.8649442791938782, + "learning_rate": 3.755782947521772e-05, + "loss": 0.8136, + "step": 41440 + }, + { + "epoch": 0.6653397325799772, + "grad_norm": 1.1988195180892944, + "learning_rate": 3.75523776912117e-05, + "loss": 0.7376, + "step": 41450 + }, + { + "epoch": 0.6655002488001412, + "grad_norm": 0.6005812287330627, + "learning_rate": 3.754692510894815e-05, + "loss": 0.9102, + "step": 41460 + }, + { + "epoch": 0.6656607650203054, + "grad_norm": 0.8080295920372009, + "learning_rate": 3.7541471728773806e-05, + "loss": 0.822, + "step": 41470 + }, + { + "epoch": 0.6658212812404694, + "grad_norm": 0.7101423144340515, + "learning_rate": 3.753601755103545e-05, + "loss": 0.7216, + "step": 41480 + }, + { + "epoch": 0.6659817974606334, + "grad_norm": 0.6599165201187134, + "learning_rate": 3.753056257607998e-05, + "loss": 0.7622, + "step": 41490 + }, + { + "epoch": 0.6661423136807975, + "grad_norm": 0.5923312306404114, + "learning_rate": 3.752510680425427e-05, + "loss": 0.8095, + "step": 41500 + }, + { + "epoch": 0.6663028299009615, + "grad_norm": 0.762224018573761, + "learning_rate": 3.751965023590528e-05, + "loss": 0.8289, + "step": 41510 + }, + { + "epoch": 0.6664633461211256, + "grad_norm": 0.7710729837417603, + "learning_rate": 3.7514192871380014e-05, + "loss": 0.7938, + "step": 41520 + }, + { + "epoch": 0.6666238623412896, + "grad_norm": 0.615508496761322, + "learning_rate": 3.750873471102554e-05, + "loss": 0.7847, + "step": 41530 + }, + { + "epoch": 0.6667843785614537, + "grad_norm": 0.5509522557258606, + "learning_rate": 3.750327575518895e-05, + "loss": 0.7093, + "step": 41540 + }, + { + "epoch": 0.6669448947816177, + "grad_norm": 0.7156981229782104, + "learning_rate": 3.749781600421742e-05, + "loss": 0.8749, + "step": 41550 + }, + { + "epoch": 0.6671054110017818, + "grad_norm": 0.759962797164917, + "learning_rate": 3.749235545845813e-05, + "loss": 0.7348, + "step": 41560 + }, + { + "epoch": 0.6672659272219458, + "grad_norm": 0.688001811504364, + "learning_rate": 3.748689411825836e-05, + "loss": 0.7264, + "step": 41570 + }, + { + "epoch": 0.6674264434421098, + "grad_norm": 0.5750413537025452, + "learning_rate": 3.748143198396543e-05, + "loss": 0.788, + "step": 41580 + }, + { + "epoch": 0.6675869596622739, + "grad_norm": 1.3421992063522339, + "learning_rate": 3.747596905592667e-05, + "loss": 0.7658, + "step": 41590 + }, + { + "epoch": 0.6677474758824379, + "grad_norm": 0.6974620223045349, + "learning_rate": 3.747050533448951e-05, + "loss": 0.7574, + "step": 41600 + }, + { + "epoch": 0.667907992102602, + "grad_norm": 0.6416829228401184, + "learning_rate": 3.7465040820001406e-05, + "loss": 0.7748, + "step": 41610 + }, + { + "epoch": 0.668068508322766, + "grad_norm": 0.6274019479751587, + "learning_rate": 3.745957551280986e-05, + "loss": 0.6899, + "step": 41620 + }, + { + "epoch": 0.6682290245429301, + "grad_norm": 0.7445369958877563, + "learning_rate": 3.745410941326245e-05, + "loss": 0.7344, + "step": 41630 + }, + { + "epoch": 0.6683895407630941, + "grad_norm": 0.8492850661277771, + "learning_rate": 3.7448642521706777e-05, + "loss": 0.7846, + "step": 41640 + }, + { + "epoch": 0.6685500569832582, + "grad_norm": 2.0703442096710205, + "learning_rate": 3.744317483849051e-05, + "loss": 0.8148, + "step": 41650 + }, + { + "epoch": 0.6687105732034222, + "grad_norm": 0.7874878644943237, + "learning_rate": 3.743770636396135e-05, + "loss": 0.8214, + "step": 41660 + }, + { + "epoch": 0.6688710894235863, + "grad_norm": 0.8231170773506165, + "learning_rate": 3.7432237098467063e-05, + "loss": 0.7482, + "step": 41670 + }, + { + "epoch": 0.6690316056437503, + "grad_norm": 0.814921498298645, + "learning_rate": 3.7426767042355485e-05, + "loss": 0.758, + "step": 41680 + }, + { + "epoch": 0.6691921218639143, + "grad_norm": 0.4989892840385437, + "learning_rate": 3.7421296195974445e-05, + "loss": 0.7743, + "step": 41690 + }, + { + "epoch": 0.6693526380840784, + "grad_norm": 0.7531447410583496, + "learning_rate": 3.741582455967188e-05, + "loss": 0.8375, + "step": 41700 + }, + { + "epoch": 0.6695131543042424, + "grad_norm": 0.6536967754364014, + "learning_rate": 3.741035213379573e-05, + "loss": 0.7491, + "step": 41710 + }, + { + "epoch": 0.6696736705244065, + "grad_norm": 0.6599394679069519, + "learning_rate": 3.740487891869404e-05, + "loss": 0.7578, + "step": 41720 + }, + { + "epoch": 0.6698341867445705, + "grad_norm": 0.7899986505508423, + "learning_rate": 3.739940491471486e-05, + "loss": 0.8128, + "step": 41730 + }, + { + "epoch": 0.6699947029647346, + "grad_norm": 0.72526615858078, + "learning_rate": 3.739393012220631e-05, + "loss": 0.7706, + "step": 41740 + }, + { + "epoch": 0.6701552191848986, + "grad_norm": 0.7301754951477051, + "learning_rate": 3.7388454541516536e-05, + "loss": 0.9249, + "step": 41750 + }, + { + "epoch": 0.6703157354050627, + "grad_norm": 0.9249571561813354, + "learning_rate": 3.738297817299378e-05, + "loss": 0.8442, + "step": 41760 + }, + { + "epoch": 0.6704762516252267, + "grad_norm": 0.8312222361564636, + "learning_rate": 3.7377501016986285e-05, + "loss": 0.7189, + "step": 41770 + }, + { + "epoch": 0.6706367678453907, + "grad_norm": 0.6871547102928162, + "learning_rate": 3.737202307384237e-05, + "loss": 0.7833, + "step": 41780 + }, + { + "epoch": 0.6707972840655548, + "grad_norm": 0.6077675223350525, + "learning_rate": 3.736654434391041e-05, + "loss": 0.8424, + "step": 41790 + }, + { + "epoch": 0.6709578002857188, + "grad_norm": 0.9311270117759705, + "learning_rate": 3.7361064827538815e-05, + "loss": 0.8949, + "step": 41800 + }, + { + "epoch": 0.6711183165058829, + "grad_norm": 1.552382230758667, + "learning_rate": 3.7355584525076044e-05, + "loss": 0.7826, + "step": 41810 + }, + { + "epoch": 0.6712788327260469, + "grad_norm": 0.6227427124977112, + "learning_rate": 3.735010343687062e-05, + "loss": 0.7562, + "step": 41820 + }, + { + "epoch": 0.671439348946211, + "grad_norm": 1.0476288795471191, + "learning_rate": 3.734462156327111e-05, + "loss": 0.7431, + "step": 41830 + }, + { + "epoch": 0.671599865166375, + "grad_norm": 0.695776641368866, + "learning_rate": 3.733913890462611e-05, + "loss": 0.8256, + "step": 41840 + }, + { + "epoch": 0.6717603813865392, + "grad_norm": 0.7345027327537537, + "learning_rate": 3.733365546128431e-05, + "loss": 0.807, + "step": 41850 + }, + { + "epoch": 0.6719208976067031, + "grad_norm": 0.8022846579551697, + "learning_rate": 3.7328171233594414e-05, + "loss": 0.8391, + "step": 41860 + }, + { + "epoch": 0.6720814138268673, + "grad_norm": 0.5609604120254517, + "learning_rate": 3.7322686221905186e-05, + "loss": 0.7267, + "step": 41870 + }, + { + "epoch": 0.6722419300470313, + "grad_norm": 0.530646800994873, + "learning_rate": 3.7317200426565436e-05, + "loss": 0.8709, + "step": 41880 + }, + { + "epoch": 0.6724024462671953, + "grad_norm": 0.8401095271110535, + "learning_rate": 3.731171384792404e-05, + "loss": 0.7665, + "step": 41890 + }, + { + "epoch": 0.6725629624873594, + "grad_norm": 0.9697273373603821, + "learning_rate": 3.7306226486329906e-05, + "loss": 0.7391, + "step": 41900 + }, + { + "epoch": 0.6727234787075234, + "grad_norm": 0.9195349216461182, + "learning_rate": 3.7300738342131994e-05, + "loss": 0.7568, + "step": 41910 + }, + { + "epoch": 0.6728839949276875, + "grad_norm": 0.7025376558303833, + "learning_rate": 3.729524941567932e-05, + "loss": 0.8072, + "step": 41920 + }, + { + "epoch": 0.6730445111478515, + "grad_norm": 0.6777015328407288, + "learning_rate": 3.7289759707320945e-05, + "loss": 0.8288, + "step": 41930 + }, + { + "epoch": 0.6732050273680156, + "grad_norm": 0.723305881023407, + "learning_rate": 3.7284269217405995e-05, + "loss": 0.6273, + "step": 41940 + }, + { + "epoch": 0.6733655435881796, + "grad_norm": 1.2102113962173462, + "learning_rate": 3.727877794628362e-05, + "loss": 0.7485, + "step": 41950 + }, + { + "epoch": 0.6735260598083437, + "grad_norm": 1.15302312374115, + "learning_rate": 3.7273285894303044e-05, + "loss": 0.8932, + "step": 41960 + }, + { + "epoch": 0.6736865760285077, + "grad_norm": 0.9203145503997803, + "learning_rate": 3.7267793061813513e-05, + "loss": 0.8339, + "step": 41970 + }, + { + "epoch": 0.6738470922486717, + "grad_norm": 0.657259464263916, + "learning_rate": 3.726229944916435e-05, + "loss": 0.8048, + "step": 41980 + }, + { + "epoch": 0.6740076084688358, + "grad_norm": 0.892598569393158, + "learning_rate": 3.725680505670492e-05, + "loss": 0.7673, + "step": 41990 + }, + { + "epoch": 0.6741681246889998, + "grad_norm": 0.7693490386009216, + "learning_rate": 3.725130988478462e-05, + "loss": 0.8486, + "step": 42000 + }, + { + "epoch": 0.6743286409091639, + "grad_norm": 0.9398339986801147, + "learning_rate": 3.724581393375292e-05, + "loss": 0.7785, + "step": 42010 + }, + { + "epoch": 0.6744891571293279, + "grad_norm": 0.6997805237770081, + "learning_rate": 3.724031720395933e-05, + "loss": 0.8534, + "step": 42020 + }, + { + "epoch": 0.674649673349492, + "grad_norm": 0.5494412183761597, + "learning_rate": 3.723481969575341e-05, + "loss": 0.7799, + "step": 42030 + }, + { + "epoch": 0.674810189569656, + "grad_norm": 0.8530223369598389, + "learning_rate": 3.722932140948477e-05, + "loss": 0.8172, + "step": 42040 + }, + { + "epoch": 0.6749707057898201, + "grad_norm": 0.5037136673927307, + "learning_rate": 3.7223822345503067e-05, + "loss": 0.78, + "step": 42050 + }, + { + "epoch": 0.6751312220099841, + "grad_norm": 0.6362020969390869, + "learning_rate": 3.721832250415801e-05, + "loss": 0.8216, + "step": 42060 + }, + { + "epoch": 0.6752917382301482, + "grad_norm": 0.6908214688301086, + "learning_rate": 3.7212821885799356e-05, + "loss": 0.7784, + "step": 42070 + }, + { + "epoch": 0.6754522544503122, + "grad_norm": 1.0261778831481934, + "learning_rate": 3.7207320490776914e-05, + "loss": 0.8271, + "step": 42080 + }, + { + "epoch": 0.6756127706704762, + "grad_norm": 0.9311137199401855, + "learning_rate": 3.7201818319440535e-05, + "loss": 0.7797, + "step": 42090 + }, + { + "epoch": 0.6757732868906403, + "grad_norm": 0.6587698459625244, + "learning_rate": 3.7196315372140127e-05, + "loss": 0.8645, + "step": 42100 + }, + { + "epoch": 0.6759338031108043, + "grad_norm": 0.7474828958511353, + "learning_rate": 3.7190811649225655e-05, + "loss": 0.7571, + "step": 42110 + }, + { + "epoch": 0.6760943193309684, + "grad_norm": 1.0505611896514893, + "learning_rate": 3.718530715104711e-05, + "loss": 0.8541, + "step": 42120 + }, + { + "epoch": 0.6762548355511324, + "grad_norm": 0.41064414381980896, + "learning_rate": 3.7179801877954544e-05, + "loss": 0.7548, + "step": 42130 + }, + { + "epoch": 0.6764153517712965, + "grad_norm": 0.5265612602233887, + "learning_rate": 3.717429583029808e-05, + "loss": 0.6331, + "step": 42140 + }, + { + "epoch": 0.6765758679914605, + "grad_norm": 0.5266940593719482, + "learning_rate": 3.716878900842785e-05, + "loss": 0.704, + "step": 42150 + }, + { + "epoch": 0.6767363842116246, + "grad_norm": 0.5937430262565613, + "learning_rate": 3.7163281412694054e-05, + "loss": 0.8334, + "step": 42160 + }, + { + "epoch": 0.6768969004317886, + "grad_norm": 0.7128585577011108, + "learning_rate": 3.715777304344696e-05, + "loss": 0.7784, + "step": 42170 + }, + { + "epoch": 0.6770574166519526, + "grad_norm": 0.6202144026756287, + "learning_rate": 3.715226390103686e-05, + "loss": 0.7547, + "step": 42180 + }, + { + "epoch": 0.6772179328721167, + "grad_norm": 0.655834972858429, + "learning_rate": 3.71467539858141e-05, + "loss": 0.6654, + "step": 42190 + }, + { + "epoch": 0.6773784490922807, + "grad_norm": 0.7887683510780334, + "learning_rate": 3.714124329812908e-05, + "loss": 0.6992, + "step": 42200 + }, + { + "epoch": 0.6775389653124448, + "grad_norm": 0.7633284330368042, + "learning_rate": 3.7135731838332254e-05, + "loss": 0.7671, + "step": 42210 + }, + { + "epoch": 0.6776994815326088, + "grad_norm": 1.0573636293411255, + "learning_rate": 3.713021960677411e-05, + "loss": 0.777, + "step": 42220 + }, + { + "epoch": 0.677859997752773, + "grad_norm": 1.2773797512054443, + "learning_rate": 3.712470660380519e-05, + "loss": 0.87, + "step": 42230 + }, + { + "epoch": 0.678020513972937, + "grad_norm": 0.6210776567459106, + "learning_rate": 3.71191928297761e-05, + "loss": 0.6892, + "step": 42240 + }, + { + "epoch": 0.678181030193101, + "grad_norm": 0.696132242679596, + "learning_rate": 3.711367828503748e-05, + "loss": 0.823, + "step": 42250 + }, + { + "epoch": 0.678341546413265, + "grad_norm": 0.8913618922233582, + "learning_rate": 3.710816296994001e-05, + "loss": 0.7789, + "step": 42260 + }, + { + "epoch": 0.6785020626334292, + "grad_norm": 0.6351945996284485, + "learning_rate": 3.710264688483446e-05, + "loss": 0.6515, + "step": 42270 + }, + { + "epoch": 0.6786625788535932, + "grad_norm": 0.784048318862915, + "learning_rate": 3.709713003007159e-05, + "loss": 0.8012, + "step": 42280 + }, + { + "epoch": 0.6788230950737572, + "grad_norm": 0.6119341254234314, + "learning_rate": 3.709161240600226e-05, + "loss": 0.7071, + "step": 42290 + }, + { + "epoch": 0.6789836112939213, + "grad_norm": 1.164770245552063, + "learning_rate": 3.708609401297735e-05, + "loss": 0.9115, + "step": 42300 + }, + { + "epoch": 0.6791441275140853, + "grad_norm": 0.8517923355102539, + "learning_rate": 3.70805748513478e-05, + "loss": 0.7207, + "step": 42310 + }, + { + "epoch": 0.6793046437342494, + "grad_norm": 0.8242171406745911, + "learning_rate": 3.707505492146459e-05, + "loss": 0.732, + "step": 42320 + }, + { + "epoch": 0.6794651599544134, + "grad_norm": 1.054598093032837, + "learning_rate": 3.706953422367877e-05, + "loss": 0.8262, + "step": 42330 + }, + { + "epoch": 0.6796256761745775, + "grad_norm": 0.6744922995567322, + "learning_rate": 3.7064012758341404e-05, + "loss": 0.7628, + "step": 42340 + }, + { + "epoch": 0.6797861923947415, + "grad_norm": 0.5545477867126465, + "learning_rate": 3.705849052580364e-05, + "loss": 0.711, + "step": 42350 + }, + { + "epoch": 0.6799467086149056, + "grad_norm": 0.709716260433197, + "learning_rate": 3.705296752641666e-05, + "loss": 0.8251, + "step": 42360 + }, + { + "epoch": 0.6801072248350696, + "grad_norm": 0.784170389175415, + "learning_rate": 3.704744376053168e-05, + "loss": 0.7671, + "step": 42370 + }, + { + "epoch": 0.6802677410552336, + "grad_norm": 0.8451164364814758, + "learning_rate": 3.7041919228499986e-05, + "loss": 0.8228, + "step": 42380 + }, + { + "epoch": 0.6804282572753977, + "grad_norm": 0.9520201086997986, + "learning_rate": 3.703639393067292e-05, + "loss": 0.6673, + "step": 42390 + }, + { + "epoch": 0.6805887734955617, + "grad_norm": 0.7257710695266724, + "learning_rate": 3.703086786740184e-05, + "loss": 0.7895, + "step": 42400 + }, + { + "epoch": 0.6807492897157258, + "grad_norm": 0.5596155524253845, + "learning_rate": 3.7025341039038175e-05, + "loss": 0.853, + "step": 42410 + }, + { + "epoch": 0.6809098059358898, + "grad_norm": 0.7753205299377441, + "learning_rate": 3.701981344593341e-05, + "loss": 0.9063, + "step": 42420 + }, + { + "epoch": 0.6810703221560539, + "grad_norm": 0.9193815588951111, + "learning_rate": 3.701428508843905e-05, + "loss": 0.696, + "step": 42430 + }, + { + "epoch": 0.6812308383762179, + "grad_norm": 0.7019696235656738, + "learning_rate": 3.700875596690668e-05, + "loss": 0.7712, + "step": 42440 + }, + { + "epoch": 0.681391354596382, + "grad_norm": 0.8948472142219543, + "learning_rate": 3.700322608168791e-05, + "loss": 0.7726, + "step": 42450 + }, + { + "epoch": 0.681551870816546, + "grad_norm": 0.768733024597168, + "learning_rate": 3.699769543313442e-05, + "loss": 0.8955, + "step": 42460 + }, + { + "epoch": 0.6817123870367101, + "grad_norm": 0.4080570638179779, + "learning_rate": 3.699216402159791e-05, + "loss": 0.7636, + "step": 42470 + }, + { + "epoch": 0.6818729032568741, + "grad_norm": 0.7301815748214722, + "learning_rate": 3.6986631847430166e-05, + "loss": 0.716, + "step": 42480 + }, + { + "epoch": 0.6820334194770381, + "grad_norm": 0.7430961728096008, + "learning_rate": 3.6981098910982985e-05, + "loss": 0.7388, + "step": 42490 + }, + { + "epoch": 0.6821939356972022, + "grad_norm": 0.6572335362434387, + "learning_rate": 3.6975565212608234e-05, + "loss": 0.8811, + "step": 42500 + }, + { + "epoch": 0.6823544519173662, + "grad_norm": 0.8641484379768372, + "learning_rate": 3.6970030752657826e-05, + "loss": 0.8038, + "step": 42510 + }, + { + "epoch": 0.6825149681375303, + "grad_norm": 0.6794440150260925, + "learning_rate": 3.6964495531483726e-05, + "loss": 0.8334, + "step": 42520 + }, + { + "epoch": 0.6826754843576943, + "grad_norm": 1.0686402320861816, + "learning_rate": 3.695895954943793e-05, + "loss": 0.89, + "step": 42530 + }, + { + "epoch": 0.6828360005778584, + "grad_norm": 0.8536579608917236, + "learning_rate": 3.695342280687249e-05, + "loss": 0.7674, + "step": 42540 + }, + { + "epoch": 0.6829965167980224, + "grad_norm": 1.0050410032272339, + "learning_rate": 3.694788530413953e-05, + "loss": 0.8433, + "step": 42550 + }, + { + "epoch": 0.6831570330181865, + "grad_norm": 0.8322805166244507, + "learning_rate": 3.6942347041591183e-05, + "loss": 0.7819, + "step": 42560 + }, + { + "epoch": 0.6833175492383505, + "grad_norm": 0.5173788666725159, + "learning_rate": 3.6936808019579665e-05, + "loss": 0.7786, + "step": 42570 + }, + { + "epoch": 0.6834780654585145, + "grad_norm": 0.518917441368103, + "learning_rate": 3.6931268238457226e-05, + "loss": 0.7485, + "step": 42580 + }, + { + "epoch": 0.6836385816786786, + "grad_norm": 2.40338397026062, + "learning_rate": 3.692572769857614e-05, + "loss": 0.7966, + "step": 42590 + }, + { + "epoch": 0.6837990978988426, + "grad_norm": 0.7858580350875854, + "learning_rate": 3.692018640028878e-05, + "loss": 0.8025, + "step": 42600 + }, + { + "epoch": 0.6839596141190067, + "grad_norm": 0.5699024200439453, + "learning_rate": 3.691464434394752e-05, + "loss": 0.8399, + "step": 42610 + }, + { + "epoch": 0.6841201303391707, + "grad_norm": 0.7538118958473206, + "learning_rate": 3.6909101529904826e-05, + "loss": 0.7566, + "step": 42620 + }, + { + "epoch": 0.6842806465593348, + "grad_norm": 1.1350153684616089, + "learning_rate": 3.690355795851316e-05, + "loss": 0.7411, + "step": 42630 + }, + { + "epoch": 0.6844411627794988, + "grad_norm": 0.7242403030395508, + "learning_rate": 3.689801363012508e-05, + "loss": 0.7327, + "step": 42640 + }, + { + "epoch": 0.684601678999663, + "grad_norm": 0.9121872186660767, + "learning_rate": 3.689246854509317e-05, + "loss": 0.729, + "step": 42650 + }, + { + "epoch": 0.684762195219827, + "grad_norm": 1.0762670040130615, + "learning_rate": 3.688692270377006e-05, + "loss": 0.8113, + "step": 42660 + }, + { + "epoch": 0.6849227114399911, + "grad_norm": 0.5812260508537292, + "learning_rate": 3.688137610650843e-05, + "loss": 0.819, + "step": 42670 + }, + { + "epoch": 0.6850832276601551, + "grad_norm": 0.9428741931915283, + "learning_rate": 3.687582875366103e-05, + "loss": 0.9977, + "step": 42680 + }, + { + "epoch": 0.6852437438803191, + "grad_norm": 0.5356224775314331, + "learning_rate": 3.687028064558061e-05, + "loss": 0.7011, + "step": 42690 + }, + { + "epoch": 0.6854042601004832, + "grad_norm": 0.8672323822975159, + "learning_rate": 3.6864731782620026e-05, + "loss": 0.625, + "step": 42700 + }, + { + "epoch": 0.6855647763206472, + "grad_norm": 0.8467515707015991, + "learning_rate": 3.6859182165132134e-05, + "loss": 0.7661, + "step": 42710 + }, + { + "epoch": 0.6857252925408113, + "grad_norm": 0.572134256362915, + "learning_rate": 3.685363179346986e-05, + "loss": 0.7538, + "step": 42720 + }, + { + "epoch": 0.6858858087609753, + "grad_norm": 0.8416128754615784, + "learning_rate": 3.684808066798618e-05, + "loss": 0.804, + "step": 42730 + }, + { + "epoch": 0.6860463249811394, + "grad_norm": 0.8524983525276184, + "learning_rate": 3.684252878903411e-05, + "loss": 0.807, + "step": 42740 + }, + { + "epoch": 0.6862068412013034, + "grad_norm": 0.6853179335594177, + "learning_rate": 3.683697615696672e-05, + "loss": 0.7597, + "step": 42750 + }, + { + "epoch": 0.6863673574214675, + "grad_norm": 1.0148143768310547, + "learning_rate": 3.6831422772137116e-05, + "loss": 0.6698, + "step": 42760 + }, + { + "epoch": 0.6865278736416315, + "grad_norm": 0.9075413942337036, + "learning_rate": 3.6825868634898474e-05, + "loss": 0.7561, + "step": 42770 + }, + { + "epoch": 0.6866883898617956, + "grad_norm": 0.9871705174446106, + "learning_rate": 3.6820313745604e-05, + "loss": 0.7997, + "step": 42780 + }, + { + "epoch": 0.6868489060819596, + "grad_norm": 0.6650115847587585, + "learning_rate": 3.681475810460694e-05, + "loss": 0.7053, + "step": 42790 + }, + { + "epoch": 0.6870094223021236, + "grad_norm": 1.0327600240707397, + "learning_rate": 3.6809201712260616e-05, + "loss": 0.8156, + "step": 42800 + }, + { + "epoch": 0.6871699385222877, + "grad_norm": 0.9539392590522766, + "learning_rate": 3.6803644568918374e-05, + "loss": 0.7101, + "step": 42810 + }, + { + "epoch": 0.6873304547424517, + "grad_norm": 0.7840538620948792, + "learning_rate": 3.679808667493362e-05, + "loss": 0.7572, + "step": 42820 + }, + { + "epoch": 0.6874909709626158, + "grad_norm": 0.6205702424049377, + "learning_rate": 3.67925280306598e-05, + "loss": 0.6763, + "step": 42830 + }, + { + "epoch": 0.6876514871827798, + "grad_norm": 0.73989337682724, + "learning_rate": 3.678696863645041e-05, + "loss": 0.8373, + "step": 42840 + }, + { + "epoch": 0.6878120034029439, + "grad_norm": 0.6355834603309631, + "learning_rate": 3.6781408492658996e-05, + "loss": 0.6619, + "step": 42850 + }, + { + "epoch": 0.6879725196231079, + "grad_norm": 0.6531293392181396, + "learning_rate": 3.677584759963916e-05, + "loss": 0.7961, + "step": 42860 + }, + { + "epoch": 0.688133035843272, + "grad_norm": 1.7402806282043457, + "learning_rate": 3.677028595774453e-05, + "loss": 0.8414, + "step": 42870 + }, + { + "epoch": 0.688293552063436, + "grad_norm": 0.6391335129737854, + "learning_rate": 3.67647235673288e-05, + "loss": 0.8832, + "step": 42880 + }, + { + "epoch": 0.6884540682836, + "grad_norm": 0.7296987771987915, + "learning_rate": 3.67591604287457e-05, + "loss": 0.7796, + "step": 42890 + }, + { + "epoch": 0.6886145845037641, + "grad_norm": 1.008298635482788, + "learning_rate": 3.675359654234901e-05, + "loss": 0.7992, + "step": 42900 + }, + { + "epoch": 0.6887751007239281, + "grad_norm": 1.1158758401870728, + "learning_rate": 3.674803190849257e-05, + "loss": 0.852, + "step": 42910 + }, + { + "epoch": 0.6889356169440922, + "grad_norm": 0.944038987159729, + "learning_rate": 3.674246652753026e-05, + "loss": 0.7437, + "step": 42920 + }, + { + "epoch": 0.6890961331642562, + "grad_norm": 0.6797846555709839, + "learning_rate": 3.6736900399816e-05, + "loss": 0.8267, + "step": 42930 + }, + { + "epoch": 0.6892566493844203, + "grad_norm": 1.3830797672271729, + "learning_rate": 3.673133352570376e-05, + "loss": 0.8915, + "step": 42940 + }, + { + "epoch": 0.6894171656045843, + "grad_norm": 0.5845041275024414, + "learning_rate": 3.672576590554756e-05, + "loss": 0.8968, + "step": 42950 + }, + { + "epoch": 0.6895776818247484, + "grad_norm": 0.8024300932884216, + "learning_rate": 3.6720197539701475e-05, + "loss": 0.7839, + "step": 42960 + }, + { + "epoch": 0.6897381980449124, + "grad_norm": 0.5669068098068237, + "learning_rate": 3.671462842851962e-05, + "loss": 0.6945, + "step": 42970 + }, + { + "epoch": 0.6898987142650765, + "grad_norm": 0.7046951055526733, + "learning_rate": 3.670905857235615e-05, + "loss": 0.7799, + "step": 42980 + }, + { + "epoch": 0.6900592304852405, + "grad_norm": 0.691458523273468, + "learning_rate": 3.670348797156528e-05, + "loss": 0.8695, + "step": 42990 + }, + { + "epoch": 0.6902197467054045, + "grad_norm": 0.5629019737243652, + "learning_rate": 3.669791662650127e-05, + "loss": 0.7335, + "step": 43000 + }, + { + "epoch": 0.6903802629255686, + "grad_norm": 0.6809003949165344, + "learning_rate": 3.669234453751842e-05, + "loss": 0.7947, + "step": 43010 + }, + { + "epoch": 0.6905407791457326, + "grad_norm": 0.9610717296600342, + "learning_rate": 3.668677170497109e-05, + "loss": 0.8243, + "step": 43020 + }, + { + "epoch": 0.6907012953658968, + "grad_norm": 0.5890305042266846, + "learning_rate": 3.6681198129213675e-05, + "loss": 0.7646, + "step": 43030 + }, + { + "epoch": 0.6908618115860607, + "grad_norm": 0.4605785608291626, + "learning_rate": 3.667562381060061e-05, + "loss": 0.7735, + "step": 43040 + }, + { + "epoch": 0.6910223278062249, + "grad_norm": 0.6447873115539551, + "learning_rate": 3.6670048749486415e-05, + "loss": 0.8376, + "step": 43050 + }, + { + "epoch": 0.6911828440263889, + "grad_norm": 1.2049788236618042, + "learning_rate": 3.6664472946225606e-05, + "loss": 0.7436, + "step": 43060 + }, + { + "epoch": 0.691343360246553, + "grad_norm": 0.9653052687644958, + "learning_rate": 3.665889640117278e-05, + "loss": 0.7123, + "step": 43070 + }, + { + "epoch": 0.691503876466717, + "grad_norm": 0.7077568769454956, + "learning_rate": 3.665331911468258e-05, + "loss": 0.8296, + "step": 43080 + }, + { + "epoch": 0.691664392686881, + "grad_norm": 1.1818325519561768, + "learning_rate": 3.664774108710967e-05, + "loss": 0.7542, + "step": 43090 + }, + { + "epoch": 0.6918249089070451, + "grad_norm": 0.8007374405860901, + "learning_rate": 3.6642162318808804e-05, + "loss": 0.8279, + "step": 43100 + }, + { + "epoch": 0.6919854251272091, + "grad_norm": 0.6590679883956909, + "learning_rate": 3.663658281013475e-05, + "loss": 0.8421, + "step": 43110 + }, + { + "epoch": 0.6921459413473732, + "grad_norm": 0.6987830996513367, + "learning_rate": 3.663100256144231e-05, + "loss": 0.7859, + "step": 43120 + }, + { + "epoch": 0.6923064575675372, + "grad_norm": 0.8858531713485718, + "learning_rate": 3.6625421573086386e-05, + "loss": 0.7694, + "step": 43130 + }, + { + "epoch": 0.6924669737877013, + "grad_norm": 0.7926640510559082, + "learning_rate": 3.661983984542189e-05, + "loss": 0.8696, + "step": 43140 + }, + { + "epoch": 0.6926274900078653, + "grad_norm": 1.069199562072754, + "learning_rate": 3.6614257378803775e-05, + "loss": 0.9007, + "step": 43150 + }, + { + "epoch": 0.6927880062280294, + "grad_norm": 0.5886120200157166, + "learning_rate": 3.660867417358706e-05, + "loss": 0.751, + "step": 43160 + }, + { + "epoch": 0.6929485224481934, + "grad_norm": 0.8097028136253357, + "learning_rate": 3.6603090230126805e-05, + "loss": 0.8453, + "step": 43170 + }, + { + "epoch": 0.6931090386683575, + "grad_norm": 1.4847923517227173, + "learning_rate": 3.6597505548778115e-05, + "loss": 0.7324, + "step": 43180 + }, + { + "epoch": 0.6932695548885215, + "grad_norm": 0.7982926368713379, + "learning_rate": 3.659192012989614e-05, + "loss": 0.6627, + "step": 43190 + }, + { + "epoch": 0.6934300711086855, + "grad_norm": 0.7061456441879272, + "learning_rate": 3.6586333973836086e-05, + "loss": 0.7429, + "step": 43200 + }, + { + "epoch": 0.6935905873288496, + "grad_norm": 0.6830614805221558, + "learning_rate": 3.658074708095319e-05, + "loss": 0.7105, + "step": 43210 + }, + { + "epoch": 0.6937511035490136, + "grad_norm": 0.5413295030593872, + "learning_rate": 3.657515945160276e-05, + "loss": 0.7918, + "step": 43220 + }, + { + "epoch": 0.6939116197691777, + "grad_norm": 0.6871911287307739, + "learning_rate": 3.656957108614012e-05, + "loss": 0.8278, + "step": 43230 + }, + { + "epoch": 0.6940721359893417, + "grad_norm": 0.657788097858429, + "learning_rate": 3.656398198492067e-05, + "loss": 0.7821, + "step": 43240 + }, + { + "epoch": 0.6942326522095058, + "grad_norm": 0.7797564268112183, + "learning_rate": 3.655839214829983e-05, + "loss": 0.8473, + "step": 43250 + }, + { + "epoch": 0.6943931684296698, + "grad_norm": 1.1260461807250977, + "learning_rate": 3.6552801576633104e-05, + "loss": 0.8703, + "step": 43260 + }, + { + "epoch": 0.6945536846498339, + "grad_norm": 0.6186652779579163, + "learning_rate": 3.6547210270276e-05, + "loss": 0.7802, + "step": 43270 + }, + { + "epoch": 0.6947142008699979, + "grad_norm": 0.7044996023178101, + "learning_rate": 3.65416182295841e-05, + "loss": 0.7351, + "step": 43280 + }, + { + "epoch": 0.6948747170901619, + "grad_norm": 0.8156829476356506, + "learning_rate": 3.653602545491302e-05, + "loss": 0.8597, + "step": 43290 + }, + { + "epoch": 0.695035233310326, + "grad_norm": 0.5647484660148621, + "learning_rate": 3.653043194661844e-05, + "loss": 0.6874, + "step": 43300 + }, + { + "epoch": 0.69519574953049, + "grad_norm": 0.73240727186203, + "learning_rate": 3.652483770505605e-05, + "loss": 0.8001, + "step": 43310 + }, + { + "epoch": 0.6953562657506541, + "grad_norm": 0.6308151483535767, + "learning_rate": 3.6519242730581634e-05, + "loss": 0.6429, + "step": 43320 + }, + { + "epoch": 0.6955167819708181, + "grad_norm": 0.7331646084785461, + "learning_rate": 3.651364702355099e-05, + "loss": 0.8088, + "step": 43330 + }, + { + "epoch": 0.6956772981909822, + "grad_norm": 0.8113948702812195, + "learning_rate": 3.650805058431997e-05, + "loss": 0.7542, + "step": 43340 + }, + { + "epoch": 0.6958378144111462, + "grad_norm": 0.5390267968177795, + "learning_rate": 3.650245341324449e-05, + "loss": 0.7971, + "step": 43350 + }, + { + "epoch": 0.6959983306313103, + "grad_norm": 0.7305426001548767, + "learning_rate": 3.649685551068048e-05, + "loss": 0.7273, + "step": 43360 + }, + { + "epoch": 0.6961588468514743, + "grad_norm": 0.8106812238693237, + "learning_rate": 3.649125687698393e-05, + "loss": 0.735, + "step": 43370 + }, + { + "epoch": 0.6963193630716384, + "grad_norm": 0.7844286561012268, + "learning_rate": 3.6485657512510896e-05, + "loss": 0.6875, + "step": 43380 + }, + { + "epoch": 0.6964798792918024, + "grad_norm": 0.6762911677360535, + "learning_rate": 3.6480057417617464e-05, + "loss": 0.9228, + "step": 43390 + }, + { + "epoch": 0.6966403955119664, + "grad_norm": 0.7364296913146973, + "learning_rate": 3.6474456592659755e-05, + "loss": 0.6959, + "step": 43400 + }, + { + "epoch": 0.6968009117321305, + "grad_norm": 0.812059223651886, + "learning_rate": 3.646885503799396e-05, + "loss": 0.7972, + "step": 43410 + }, + { + "epoch": 0.6969614279522945, + "grad_norm": 1.3563131093978882, + "learning_rate": 3.64632527539763e-05, + "loss": 0.7542, + "step": 43420 + }, + { + "epoch": 0.6971219441724587, + "grad_norm": 0.7469868659973145, + "learning_rate": 3.645764974096303e-05, + "loss": 0.81, + "step": 43430 + }, + { + "epoch": 0.6972824603926226, + "grad_norm": 0.9263250231742859, + "learning_rate": 3.6452045999310503e-05, + "loss": 0.8345, + "step": 43440 + }, + { + "epoch": 0.6974429766127868, + "grad_norm": 0.5551093220710754, + "learning_rate": 3.644644152937506e-05, + "loss": 0.6625, + "step": 43450 + }, + { + "epoch": 0.6976034928329508, + "grad_norm": 1.053181529045105, + "learning_rate": 3.644083633151313e-05, + "loss": 0.8062, + "step": 43460 + }, + { + "epoch": 0.6977640090531149, + "grad_norm": 1.3398433923721313, + "learning_rate": 3.643523040608115e-05, + "loss": 0.8395, + "step": 43470 + }, + { + "epoch": 0.6979245252732789, + "grad_norm": 0.6285298466682434, + "learning_rate": 3.6429623753435634e-05, + "loss": 0.7544, + "step": 43480 + }, + { + "epoch": 0.6980850414934429, + "grad_norm": 0.7205914258956909, + "learning_rate": 3.642401637393314e-05, + "loss": 0.7972, + "step": 43490 + }, + { + "epoch": 0.698245557713607, + "grad_norm": 0.5267080664634705, + "learning_rate": 3.641840826793025e-05, + "loss": 0.8382, + "step": 43500 + }, + { + "epoch": 0.698406073933771, + "grad_norm": 0.9526007175445557, + "learning_rate": 3.641279943578362e-05, + "loss": 0.8261, + "step": 43510 + }, + { + "epoch": 0.6985665901539351, + "grad_norm": 0.8123343586921692, + "learning_rate": 3.6407189877849926e-05, + "loss": 0.7392, + "step": 43520 + }, + { + "epoch": 0.6987271063740991, + "grad_norm": 0.6491408348083496, + "learning_rate": 3.640157959448591e-05, + "loss": 0.7804, + "step": 43530 + }, + { + "epoch": 0.6988876225942632, + "grad_norm": 0.7734830379486084, + "learning_rate": 3.6395968586048355e-05, + "loss": 0.7848, + "step": 43540 + }, + { + "epoch": 0.6990481388144272, + "grad_norm": 1.097369909286499, + "learning_rate": 3.639035685289408e-05, + "loss": 0.7794, + "step": 43550 + }, + { + "epoch": 0.6992086550345913, + "grad_norm": 0.8758425712585449, + "learning_rate": 3.6384744395379975e-05, + "loss": 0.7939, + "step": 43560 + }, + { + "epoch": 0.6993691712547553, + "grad_norm": 1.132480502128601, + "learning_rate": 3.637913121386294e-05, + "loss": 0.8144, + "step": 43570 + }, + { + "epoch": 0.6995296874749194, + "grad_norm": 0.6235433220863342, + "learning_rate": 3.6373517308699944e-05, + "loss": 0.7915, + "step": 43580 + }, + { + "epoch": 0.6996902036950834, + "grad_norm": 0.9620994329452515, + "learning_rate": 3.6367902680248006e-05, + "loss": 0.7851, + "step": 43590 + }, + { + "epoch": 0.6998507199152474, + "grad_norm": 0.7152814269065857, + "learning_rate": 3.636228732886418e-05, + "loss": 0.7839, + "step": 43600 + }, + { + "epoch": 0.7000112361354115, + "grad_norm": 0.9048435091972351, + "learning_rate": 3.635667125490558e-05, + "loss": 0.7015, + "step": 43610 + }, + { + "epoch": 0.7001717523555755, + "grad_norm": 0.6923835277557373, + "learning_rate": 3.635105445872933e-05, + "loss": 0.8063, + "step": 43620 + }, + { + "epoch": 0.7003322685757396, + "grad_norm": 1.1245408058166504, + "learning_rate": 3.634543694069265e-05, + "loss": 0.7646, + "step": 43630 + }, + { + "epoch": 0.7004927847959036, + "grad_norm": 0.5021297335624695, + "learning_rate": 3.6339818701152776e-05, + "loss": 0.7927, + "step": 43640 + }, + { + "epoch": 0.7006533010160677, + "grad_norm": 0.822234570980072, + "learning_rate": 3.633419974046698e-05, + "loss": 0.7658, + "step": 43650 + }, + { + "epoch": 0.7008138172362317, + "grad_norm": 1.2825554609298706, + "learning_rate": 3.6328580058992614e-05, + "loss": 0.793, + "step": 43660 + }, + { + "epoch": 0.7009743334563958, + "grad_norm": 0.9623109102249146, + "learning_rate": 3.632295965708704e-05, + "loss": 0.8044, + "step": 43670 + }, + { + "epoch": 0.7011348496765598, + "grad_norm": 0.8197557926177979, + "learning_rate": 3.6317338535107694e-05, + "loss": 0.7533, + "step": 43680 + }, + { + "epoch": 0.7012953658967238, + "grad_norm": 0.7755063772201538, + "learning_rate": 3.631171669341205e-05, + "loss": 0.6749, + "step": 43690 + }, + { + "epoch": 0.7014558821168879, + "grad_norm": 0.6034262180328369, + "learning_rate": 3.6306094132357604e-05, + "loss": 0.8982, + "step": 43700 + }, + { + "epoch": 0.7016163983370519, + "grad_norm": 0.5717037916183472, + "learning_rate": 3.630047085230194e-05, + "loss": 0.781, + "step": 43710 + }, + { + "epoch": 0.701776914557216, + "grad_norm": 1.0962406396865845, + "learning_rate": 3.629484685360265e-05, + "loss": 0.7227, + "step": 43720 + }, + { + "epoch": 0.70193743077738, + "grad_norm": 0.8763008713722229, + "learning_rate": 3.628922213661741e-05, + "loss": 0.815, + "step": 43730 + }, + { + "epoch": 0.7020979469975441, + "grad_norm": 0.7078784108161926, + "learning_rate": 3.628359670170389e-05, + "loss": 0.7507, + "step": 43740 + }, + { + "epoch": 0.7022584632177081, + "grad_norm": 0.8353548049926758, + "learning_rate": 3.627797054921985e-05, + "loss": 0.778, + "step": 43750 + }, + { + "epoch": 0.7024189794378722, + "grad_norm": 0.8975707292556763, + "learning_rate": 3.627234367952308e-05, + "loss": 0.8093, + "step": 43760 + }, + { + "epoch": 0.7025794956580362, + "grad_norm": 0.695889413356781, + "learning_rate": 3.626671609297141e-05, + "loss": 0.7766, + "step": 43770 + }, + { + "epoch": 0.7027400118782003, + "grad_norm": 0.7152813076972961, + "learning_rate": 3.626108778992273e-05, + "loss": 0.7982, + "step": 43780 + }, + { + "epoch": 0.7029005280983643, + "grad_norm": 0.9640992283821106, + "learning_rate": 3.6255458770734954e-05, + "loss": 0.8718, + "step": 43790 + }, + { + "epoch": 0.7030610443185283, + "grad_norm": 0.9205359816551208, + "learning_rate": 3.624982903576607e-05, + "loss": 0.8639, + "step": 43800 + }, + { + "epoch": 0.7032215605386924, + "grad_norm": 0.46441060304641724, + "learning_rate": 3.624419858537409e-05, + "loss": 0.784, + "step": 43810 + }, + { + "epoch": 0.7033820767588564, + "grad_norm": 0.7291011214256287, + "learning_rate": 3.623856741991707e-05, + "loss": 0.7274, + "step": 43820 + }, + { + "epoch": 0.7035425929790206, + "grad_norm": 0.733214795589447, + "learning_rate": 3.623293553975314e-05, + "loss": 0.8008, + "step": 43830 + }, + { + "epoch": 0.7037031091991846, + "grad_norm": 1.011303186416626, + "learning_rate": 3.6227302945240424e-05, + "loss": 0.7711, + "step": 43840 + }, + { + "epoch": 0.7038636254193487, + "grad_norm": 0.9116636514663696, + "learning_rate": 3.6221669636737144e-05, + "loss": 0.9078, + "step": 43850 + }, + { + "epoch": 0.7040241416395127, + "grad_norm": 0.6578760147094727, + "learning_rate": 3.621603561460154e-05, + "loss": 0.7561, + "step": 43860 + }, + { + "epoch": 0.7041846578596768, + "grad_norm": 0.5608623027801514, + "learning_rate": 3.621040087919191e-05, + "loss": 0.6979, + "step": 43870 + }, + { + "epoch": 0.7043451740798408, + "grad_norm": 0.7999644875526428, + "learning_rate": 3.620476543086657e-05, + "loss": 0.7954, + "step": 43880 + }, + { + "epoch": 0.7045056903000048, + "grad_norm": 0.8399812579154968, + "learning_rate": 3.619912926998392e-05, + "loss": 0.7888, + "step": 43890 + }, + { + "epoch": 0.7046662065201689, + "grad_norm": 0.6370208263397217, + "learning_rate": 3.6193492396902376e-05, + "loss": 0.7834, + "step": 43900 + }, + { + "epoch": 0.7048267227403329, + "grad_norm": 0.8231136202812195, + "learning_rate": 3.618785481198042e-05, + "loss": 0.6271, + "step": 43910 + }, + { + "epoch": 0.704987238960497, + "grad_norm": 0.5534765720367432, + "learning_rate": 3.6182216515576566e-05, + "loss": 0.825, + "step": 43920 + }, + { + "epoch": 0.705147755180661, + "grad_norm": 0.9078421592712402, + "learning_rate": 3.617657750804937e-05, + "loss": 0.8212, + "step": 43930 + }, + { + "epoch": 0.7053082714008251, + "grad_norm": 0.9620141983032227, + "learning_rate": 3.6170937789757444e-05, + "loss": 0.7347, + "step": 43940 + }, + { + "epoch": 0.7054687876209891, + "grad_norm": 1.2561590671539307, + "learning_rate": 3.6165297361059445e-05, + "loss": 0.8043, + "step": 43950 + }, + { + "epoch": 0.7056293038411532, + "grad_norm": 0.9557247161865234, + "learning_rate": 3.615965622231406e-05, + "loss": 0.8494, + "step": 43960 + }, + { + "epoch": 0.7057898200613172, + "grad_norm": 0.6169530749320984, + "learning_rate": 3.615401437388005e-05, + "loss": 0.7623, + "step": 43970 + }, + { + "epoch": 0.7059503362814813, + "grad_norm": 1.1671388149261475, + "learning_rate": 3.614837181611619e-05, + "loss": 0.8694, + "step": 43980 + }, + { + "epoch": 0.7061108525016453, + "grad_norm": 0.7505227327346802, + "learning_rate": 3.614272854938132e-05, + "loss": 0.685, + "step": 43990 + }, + { + "epoch": 0.7062713687218093, + "grad_norm": 0.6642532348632812, + "learning_rate": 3.613708457403431e-05, + "loss": 0.8215, + "step": 44000 + }, + { + "epoch": 0.7062713687218093, + "eval_loss": 0.7845669984817505, + "eval_runtime": 1833.4901, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 1.788, + "step": 44000 + }, + { + "epoch": 0.7064318849419734, + "grad_norm": 0.6884881854057312, + "learning_rate": 3.6131439890434086e-05, + "loss": 0.71, + "step": 44010 + }, + { + "epoch": 0.7065924011621374, + "grad_norm": 0.6804848313331604, + "learning_rate": 3.6125794498939624e-05, + "loss": 0.7749, + "step": 44020 + }, + { + "epoch": 0.7067529173823015, + "grad_norm": 1.019946813583374, + "learning_rate": 3.6120148399909934e-05, + "loss": 0.8105, + "step": 44030 + }, + { + "epoch": 0.7069134336024655, + "grad_norm": 0.8240928649902344, + "learning_rate": 3.611450159370407e-05, + "loss": 0.7841, + "step": 44040 + }, + { + "epoch": 0.7070739498226296, + "grad_norm": 0.6895486116409302, + "learning_rate": 3.6108854080681145e-05, + "loss": 0.7882, + "step": 44050 + }, + { + "epoch": 0.7072344660427936, + "grad_norm": 0.8385114669799805, + "learning_rate": 3.610320586120031e-05, + "loss": 0.7015, + "step": 44060 + }, + { + "epoch": 0.7073949822629577, + "grad_norm": 0.6656649112701416, + "learning_rate": 3.609755693562074e-05, + "loss": 0.7636, + "step": 44070 + }, + { + "epoch": 0.7075554984831217, + "grad_norm": 0.6288572549819946, + "learning_rate": 3.609190730430169e-05, + "loss": 0.8862, + "step": 44080 + }, + { + "epoch": 0.7077160147032857, + "grad_norm": 1.1931781768798828, + "learning_rate": 3.6086256967602436e-05, + "loss": 0.6554, + "step": 44090 + }, + { + "epoch": 0.7078765309234498, + "grad_norm": 1.1362107992172241, + "learning_rate": 3.6080605925882304e-05, + "loss": 0.743, + "step": 44100 + }, + { + "epoch": 0.7080370471436138, + "grad_norm": 0.6836768388748169, + "learning_rate": 3.607495417950068e-05, + "loss": 0.7663, + "step": 44110 + }, + { + "epoch": 0.7081975633637779, + "grad_norm": 0.5628271102905273, + "learning_rate": 3.606930172881697e-05, + "loss": 0.7419, + "step": 44120 + }, + { + "epoch": 0.7083580795839419, + "grad_norm": 0.7562851309776306, + "learning_rate": 3.606364857419064e-05, + "loss": 0.8184, + "step": 44130 + }, + { + "epoch": 0.708518595804106, + "grad_norm": 0.5170414447784424, + "learning_rate": 3.6057994715981205e-05, + "loss": 0.6881, + "step": 44140 + }, + { + "epoch": 0.70867911202427, + "grad_norm": 0.7055572867393494, + "learning_rate": 3.60523401545482e-05, + "loss": 0.8374, + "step": 44150 + }, + { + "epoch": 0.7088396282444341, + "grad_norm": 0.5118683576583862, + "learning_rate": 3.604668489025123e-05, + "loss": 0.8089, + "step": 44160 + }, + { + "epoch": 0.7090001444645981, + "grad_norm": 0.8183616995811462, + "learning_rate": 3.604102892344994e-05, + "loss": 0.6818, + "step": 44170 + }, + { + "epoch": 0.7091606606847622, + "grad_norm": 0.5520790815353394, + "learning_rate": 3.6035372254504026e-05, + "loss": 0.7102, + "step": 44180 + }, + { + "epoch": 0.7093211769049262, + "grad_norm": 0.4939606487751007, + "learning_rate": 3.6029714883773194e-05, + "loss": 0.782, + "step": 44190 + }, + { + "epoch": 0.7094816931250902, + "grad_norm": 0.8521398305892944, + "learning_rate": 3.602405681161724e-05, + "loss": 0.7961, + "step": 44200 + }, + { + "epoch": 0.7096422093452543, + "grad_norm": 0.500464141368866, + "learning_rate": 3.601839803839597e-05, + "loss": 0.8569, + "step": 44210 + }, + { + "epoch": 0.7098027255654183, + "grad_norm": 0.8329895734786987, + "learning_rate": 3.601273856446927e-05, + "loss": 0.7185, + "step": 44220 + }, + { + "epoch": 0.7099632417855825, + "grad_norm": 1.2889325618743896, + "learning_rate": 3.600707839019703e-05, + "loss": 0.7161, + "step": 44230 + }, + { + "epoch": 0.7101237580057465, + "grad_norm": 0.7647642493247986, + "learning_rate": 3.6001417515939204e-05, + "loss": 0.7597, + "step": 44240 + }, + { + "epoch": 0.7102842742259106, + "grad_norm": 1.259054183959961, + "learning_rate": 3.5995755942055796e-05, + "loss": 0.8537, + "step": 44250 + }, + { + "epoch": 0.7104447904460746, + "grad_norm": 1.086751937866211, + "learning_rate": 3.599009366890686e-05, + "loss": 0.7549, + "step": 44260 + }, + { + "epoch": 0.7106053066662387, + "grad_norm": 0.5986170172691345, + "learning_rate": 3.5984430696852463e-05, + "loss": 0.7495, + "step": 44270 + }, + { + "epoch": 0.7107658228864027, + "grad_norm": 0.5054409503936768, + "learning_rate": 3.597876702625275e-05, + "loss": 0.8779, + "step": 44280 + }, + { + "epoch": 0.7109263391065668, + "grad_norm": 0.8353496193885803, + "learning_rate": 3.597310265746789e-05, + "loss": 0.7431, + "step": 44290 + }, + { + "epoch": 0.7110868553267308, + "grad_norm": 0.7375242710113525, + "learning_rate": 3.59674375908581e-05, + "loss": 0.8084, + "step": 44300 + }, + { + "epoch": 0.7112473715468948, + "grad_norm": 0.4850204586982727, + "learning_rate": 3.5961771826783655e-05, + "loss": 0.8249, + "step": 44310 + }, + { + "epoch": 0.7114078877670589, + "grad_norm": 0.9473116397857666, + "learning_rate": 3.595610536560487e-05, + "loss": 0.7806, + "step": 44320 + }, + { + "epoch": 0.7115684039872229, + "grad_norm": 1.6844922304153442, + "learning_rate": 3.595043820768208e-05, + "loss": 0.8425, + "step": 44330 + }, + { + "epoch": 0.711728920207387, + "grad_norm": 0.6386390328407288, + "learning_rate": 3.5944770353375694e-05, + "loss": 0.6816, + "step": 44340 + }, + { + "epoch": 0.711889436427551, + "grad_norm": 0.7214829921722412, + "learning_rate": 3.593910180304616e-05, + "loss": 0.822, + "step": 44350 + }, + { + "epoch": 0.7120499526477151, + "grad_norm": 0.47224798798561096, + "learning_rate": 3.593343255705395e-05, + "loss": 0.7658, + "step": 44360 + }, + { + "epoch": 0.7122104688678791, + "grad_norm": 0.5720266699790955, + "learning_rate": 3.59277626157596e-05, + "loss": 0.8024, + "step": 44370 + }, + { + "epoch": 0.7123709850880432, + "grad_norm": 0.7102388739585876, + "learning_rate": 3.5922091979523694e-05, + "loss": 0.8205, + "step": 44380 + }, + { + "epoch": 0.7125315013082072, + "grad_norm": 0.6223841905593872, + "learning_rate": 3.5916420648706846e-05, + "loss": 0.6986, + "step": 44390 + }, + { + "epoch": 0.7126920175283712, + "grad_norm": 0.5537089705467224, + "learning_rate": 3.591074862366971e-05, + "loss": 0.7635, + "step": 44400 + }, + { + "epoch": 0.7128525337485353, + "grad_norm": 1.3559428453445435, + "learning_rate": 3.5905075904773e-05, + "loss": 0.8222, + "step": 44410 + }, + { + "epoch": 0.7130130499686993, + "grad_norm": 0.664853036403656, + "learning_rate": 3.589940249237748e-05, + "loss": 0.8614, + "step": 44420 + }, + { + "epoch": 0.7131735661888634, + "grad_norm": 1.0176308155059814, + "learning_rate": 3.5893728386843926e-05, + "loss": 0.9079, + "step": 44430 + }, + { + "epoch": 0.7133340824090274, + "grad_norm": 0.6553974747657776, + "learning_rate": 3.588805358853319e-05, + "loss": 0.7313, + "step": 44440 + }, + { + "epoch": 0.7134945986291915, + "grad_norm": 0.7526280283927917, + "learning_rate": 3.588237809780615e-05, + "loss": 0.8394, + "step": 44450 + }, + { + "epoch": 0.7136551148493555, + "grad_norm": 0.8980331420898438, + "learning_rate": 3.587670191502373e-05, + "loss": 0.7803, + "step": 44460 + }, + { + "epoch": 0.7138156310695196, + "grad_norm": 0.5748066902160645, + "learning_rate": 3.5871025040546916e-05, + "loss": 0.7764, + "step": 44470 + }, + { + "epoch": 0.7139761472896836, + "grad_norm": 1.3569819927215576, + "learning_rate": 3.586534747473671e-05, + "loss": 0.8216, + "step": 44480 + }, + { + "epoch": 0.7141366635098477, + "grad_norm": 0.757868230342865, + "learning_rate": 3.585966921795418e-05, + "loss": 0.8354, + "step": 44490 + }, + { + "epoch": 0.7142971797300117, + "grad_norm": 0.802298903465271, + "learning_rate": 3.5853990270560435e-05, + "loss": 0.7227, + "step": 44500 + }, + { + "epoch": 0.7144576959501757, + "grad_norm": 0.6602879166603088, + "learning_rate": 3.5848310632916606e-05, + "loss": 0.9287, + "step": 44510 + }, + { + "epoch": 0.7146182121703398, + "grad_norm": 0.811231255531311, + "learning_rate": 3.5842630305383895e-05, + "loss": 0.7215, + "step": 44520 + }, + { + "epoch": 0.7147787283905038, + "grad_norm": 0.45653659105300903, + "learning_rate": 3.583694928832354e-05, + "loss": 0.7328, + "step": 44530 + }, + { + "epoch": 0.7149392446106679, + "grad_norm": 1.1013833284378052, + "learning_rate": 3.5831267582096817e-05, + "loss": 0.7914, + "step": 44540 + }, + { + "epoch": 0.7150997608308319, + "grad_norm": 1.4801708459854126, + "learning_rate": 3.582558518706505e-05, + "loss": 0.7501, + "step": 44550 + }, + { + "epoch": 0.715260277050996, + "grad_norm": 1.3563860654830933, + "learning_rate": 3.58199021035896e-05, + "loss": 0.7774, + "step": 44560 + }, + { + "epoch": 0.71542079327116, + "grad_norm": 0.6566212773323059, + "learning_rate": 3.581421833203188e-05, + "loss": 0.8551, + "step": 44570 + }, + { + "epoch": 0.7155813094913241, + "grad_norm": 0.8924524784088135, + "learning_rate": 3.580853387275335e-05, + "loss": 0.877, + "step": 44580 + }, + { + "epoch": 0.7157418257114881, + "grad_norm": 0.8735685348510742, + "learning_rate": 3.580284872611551e-05, + "loss": 0.7523, + "step": 44590 + }, + { + "epoch": 0.7159023419316521, + "grad_norm": 0.7229984998703003, + "learning_rate": 3.5797162892479896e-05, + "loss": 0.8234, + "step": 44600 + }, + { + "epoch": 0.7160628581518163, + "grad_norm": 0.7178834676742554, + "learning_rate": 3.5791476372208095e-05, + "loss": 0.7941, + "step": 44610 + }, + { + "epoch": 0.7162233743719802, + "grad_norm": 0.9617845416069031, + "learning_rate": 3.578578916566173e-05, + "loss": 0.7383, + "step": 44620 + }, + { + "epoch": 0.7163838905921444, + "grad_norm": 0.8587314486503601, + "learning_rate": 3.5780101273202484e-05, + "loss": 0.8161, + "step": 44630 + }, + { + "epoch": 0.7165444068123084, + "grad_norm": 0.5831043720245361, + "learning_rate": 3.577441269519207e-05, + "loss": 0.7617, + "step": 44640 + }, + { + "epoch": 0.7167049230324725, + "grad_norm": 0.663982629776001, + "learning_rate": 3.576872343199225e-05, + "loss": 0.7952, + "step": 44650 + }, + { + "epoch": 0.7168654392526365, + "grad_norm": 0.7513864636421204, + "learning_rate": 3.5763033483964824e-05, + "loss": 0.8913, + "step": 44660 + }, + { + "epoch": 0.7170259554728006, + "grad_norm": 0.9475144147872925, + "learning_rate": 3.575734285147164e-05, + "loss": 0.7246, + "step": 44670 + }, + { + "epoch": 0.7171864716929646, + "grad_norm": 0.7599744200706482, + "learning_rate": 3.575165153487459e-05, + "loss": 0.7636, + "step": 44680 + }, + { + "epoch": 0.7173469879131287, + "grad_norm": 0.7859033346176147, + "learning_rate": 3.574595953453561e-05, + "loss": 0.8513, + "step": 44690 + }, + { + "epoch": 0.7175075041332927, + "grad_norm": 0.7288567423820496, + "learning_rate": 3.574026685081668e-05, + "loss": 0.8087, + "step": 44700 + }, + { + "epoch": 0.7176680203534567, + "grad_norm": 0.5980200171470642, + "learning_rate": 3.573457348407981e-05, + "loss": 0.7496, + "step": 44710 + }, + { + "epoch": 0.7178285365736208, + "grad_norm": 0.8097037672996521, + "learning_rate": 3.5728879434687075e-05, + "loss": 0.7631, + "step": 44720 + }, + { + "epoch": 0.7179890527937848, + "grad_norm": 0.6062846183776855, + "learning_rate": 3.572318470300058e-05, + "loss": 0.7813, + "step": 44730 + }, + { + "epoch": 0.7181495690139489, + "grad_norm": 0.4658814072608948, + "learning_rate": 3.571748928938248e-05, + "loss": 0.7339, + "step": 44740 + }, + { + "epoch": 0.7183100852341129, + "grad_norm": 0.8584542870521545, + "learning_rate": 3.5711793194194954e-05, + "loss": 0.8365, + "step": 44750 + }, + { + "epoch": 0.718470601454277, + "grad_norm": 1.141358494758606, + "learning_rate": 3.5706096417800264e-05, + "loss": 0.686, + "step": 44760 + }, + { + "epoch": 0.718631117674441, + "grad_norm": 1.232380986213684, + "learning_rate": 3.5700398960560674e-05, + "loss": 0.712, + "step": 44770 + }, + { + "epoch": 0.7187916338946051, + "grad_norm": 0.6840028762817383, + "learning_rate": 3.5694700822838524e-05, + "loss": 0.8438, + "step": 44780 + }, + { + "epoch": 0.7189521501147691, + "grad_norm": 0.8056495189666748, + "learning_rate": 3.568900200499616e-05, + "loss": 0.8522, + "step": 44790 + }, + { + "epoch": 0.7191126663349331, + "grad_norm": 0.9448619484901428, + "learning_rate": 3.5683302507396015e-05, + "loss": 0.7867, + "step": 44800 + }, + { + "epoch": 0.7192731825550972, + "grad_norm": 0.7989571690559387, + "learning_rate": 3.567760233040054e-05, + "loss": 0.8053, + "step": 44810 + }, + { + "epoch": 0.7194336987752612, + "grad_norm": 0.7017776370048523, + "learning_rate": 3.567190147437222e-05, + "loss": 0.8971, + "step": 44820 + }, + { + "epoch": 0.7195942149954253, + "grad_norm": 0.5696350336074829, + "learning_rate": 3.56661999396736e-05, + "loss": 0.7105, + "step": 44830 + }, + { + "epoch": 0.7197547312155893, + "grad_norm": 0.6704114079475403, + "learning_rate": 3.566049772666728e-05, + "loss": 0.7281, + "step": 44840 + }, + { + "epoch": 0.7199152474357534, + "grad_norm": 0.8023644089698792, + "learning_rate": 3.565479483571587e-05, + "loss": 0.7413, + "step": 44850 + }, + { + "epoch": 0.7200757636559174, + "grad_norm": 0.7270790934562683, + "learning_rate": 3.564909126718204e-05, + "loss": 0.7814, + "step": 44860 + }, + { + "epoch": 0.7202362798760815, + "grad_norm": 0.9614642262458801, + "learning_rate": 3.564338702142852e-05, + "loss": 0.7168, + "step": 44870 + }, + { + "epoch": 0.7203967960962455, + "grad_norm": 0.799365222454071, + "learning_rate": 3.563768209881805e-05, + "loss": 0.8045, + "step": 44880 + }, + { + "epoch": 0.7205573123164096, + "grad_norm": 1.0544037818908691, + "learning_rate": 3.563197649971344e-05, + "loss": 0.9063, + "step": 44890 + }, + { + "epoch": 0.7207178285365736, + "grad_norm": 0.7176162004470825, + "learning_rate": 3.562627022447752e-05, + "loss": 0.7492, + "step": 44900 + }, + { + "epoch": 0.7208783447567376, + "grad_norm": 0.5714158415794373, + "learning_rate": 3.562056327347319e-05, + "loss": 0.8066, + "step": 44910 + }, + { + "epoch": 0.7210388609769017, + "grad_norm": 0.9730111956596375, + "learning_rate": 3.561485564706337e-05, + "loss": 0.6632, + "step": 44920 + }, + { + "epoch": 0.7211993771970657, + "grad_norm": 0.8979157209396362, + "learning_rate": 3.560914734561103e-05, + "loss": 0.7317, + "step": 44930 + }, + { + "epoch": 0.7213598934172298, + "grad_norm": 0.607295572757721, + "learning_rate": 3.560343836947919e-05, + "loss": 0.8167, + "step": 44940 + }, + { + "epoch": 0.7215204096373938, + "grad_norm": 0.7278205156326294, + "learning_rate": 3.559772871903091e-05, + "loss": 0.8398, + "step": 44950 + }, + { + "epoch": 0.7216809258575579, + "grad_norm": 0.7503724098205566, + "learning_rate": 3.5592018394629286e-05, + "loss": 0.816, + "step": 44960 + }, + { + "epoch": 0.7218414420777219, + "grad_norm": 0.8528366684913635, + "learning_rate": 3.5586307396637456e-05, + "loss": 0.7515, + "step": 44970 + }, + { + "epoch": 0.722001958297886, + "grad_norm": 0.7533091306686401, + "learning_rate": 3.5580595725418615e-05, + "loss": 0.7019, + "step": 44980 + }, + { + "epoch": 0.72216247451805, + "grad_norm": 0.7151532769203186, + "learning_rate": 3.5574883381335985e-05, + "loss": 0.7441, + "step": 44990 + }, + { + "epoch": 0.722322990738214, + "grad_norm": 0.7146628499031067, + "learning_rate": 3.5569170364752834e-05, + "loss": 0.8179, + "step": 45000 + }, + { + "epoch": 0.7224835069583782, + "grad_norm": 0.5116981863975525, + "learning_rate": 3.55634566760325e-05, + "loss": 0.6834, + "step": 45010 + }, + { + "epoch": 0.7226440231785421, + "grad_norm": 1.48153817653656, + "learning_rate": 3.555774231553831e-05, + "loss": 0.8244, + "step": 45020 + }, + { + "epoch": 0.7228045393987063, + "grad_norm": 0.6187269687652588, + "learning_rate": 3.555202728363367e-05, + "loss": 0.8049, + "step": 45030 + }, + { + "epoch": 0.7229650556188703, + "grad_norm": 0.5793522596359253, + "learning_rate": 3.554631158068205e-05, + "loss": 0.7422, + "step": 45040 + }, + { + "epoch": 0.7231255718390344, + "grad_norm": 0.8465526103973389, + "learning_rate": 3.5540595207046887e-05, + "loss": 0.8089, + "step": 45050 + }, + { + "epoch": 0.7232860880591984, + "grad_norm": 1.0236217975616455, + "learning_rate": 3.553487816309175e-05, + "loss": 0.8334, + "step": 45060 + }, + { + "epoch": 0.7234466042793625, + "grad_norm": 0.5327222347259521, + "learning_rate": 3.5529160449180206e-05, + "loss": 0.8061, + "step": 45070 + }, + { + "epoch": 0.7236071204995265, + "grad_norm": 0.7696072459220886, + "learning_rate": 3.552344206567584e-05, + "loss": 0.779, + "step": 45080 + }, + { + "epoch": 0.7237676367196906, + "grad_norm": 0.9290697574615479, + "learning_rate": 3.5517723012942335e-05, + "loss": 0.7775, + "step": 45090 + }, + { + "epoch": 0.7239281529398546, + "grad_norm": 0.7551490664482117, + "learning_rate": 3.551200329134337e-05, + "loss": 0.772, + "step": 45100 + }, + { + "epoch": 0.7240886691600186, + "grad_norm": 0.6667298078536987, + "learning_rate": 3.55062829012427e-05, + "loss": 0.7094, + "step": 45110 + }, + { + "epoch": 0.7242491853801827, + "grad_norm": 0.654815673828125, + "learning_rate": 3.550056184300411e-05, + "loss": 0.7899, + "step": 45120 + }, + { + "epoch": 0.7244097016003467, + "grad_norm": 0.7387762069702148, + "learning_rate": 3.5494840116991405e-05, + "loss": 0.6838, + "step": 45130 + }, + { + "epoch": 0.7245702178205108, + "grad_norm": 0.957543671131134, + "learning_rate": 3.5489117723568475e-05, + "loss": 0.8373, + "step": 45140 + }, + { + "epoch": 0.7247307340406748, + "grad_norm": 0.7133802771568298, + "learning_rate": 3.5483394663099224e-05, + "loss": 0.7503, + "step": 45150 + }, + { + "epoch": 0.7248912502608389, + "grad_norm": 0.793603777885437, + "learning_rate": 3.547767093594761e-05, + "loss": 0.7943, + "step": 45160 + }, + { + "epoch": 0.7250517664810029, + "grad_norm": 0.6490007042884827, + "learning_rate": 3.547194654247761e-05, + "loss": 0.76, + "step": 45170 + }, + { + "epoch": 0.725212282701167, + "grad_norm": 0.578920841217041, + "learning_rate": 3.5466221483053274e-05, + "loss": 0.8218, + "step": 45180 + }, + { + "epoch": 0.725372798921331, + "grad_norm": 0.6700495481491089, + "learning_rate": 3.546049575803868e-05, + "loss": 0.7911, + "step": 45190 + }, + { + "epoch": 0.725533315141495, + "grad_norm": 0.8877912759780884, + "learning_rate": 3.545476936779796e-05, + "loss": 0.7877, + "step": 45200 + }, + { + "epoch": 0.7256938313616591, + "grad_norm": 0.7654439210891724, + "learning_rate": 3.544904231269527e-05, + "loss": 0.8185, + "step": 45210 + }, + { + "epoch": 0.7258543475818231, + "grad_norm": 0.7333383560180664, + "learning_rate": 3.5443314593094826e-05, + "loss": 0.8052, + "step": 45220 + }, + { + "epoch": 0.7260148638019872, + "grad_norm": 0.6632410883903503, + "learning_rate": 3.5437586209360865e-05, + "loss": 0.7455, + "step": 45230 + }, + { + "epoch": 0.7261753800221512, + "grad_norm": 1.0582056045532227, + "learning_rate": 3.5431857161857674e-05, + "loss": 0.7199, + "step": 45240 + }, + { + "epoch": 0.7263358962423153, + "grad_norm": 0.8363789916038513, + "learning_rate": 3.5426127450949604e-05, + "loss": 0.7973, + "step": 45250 + }, + { + "epoch": 0.7264964124624793, + "grad_norm": 1.1137714385986328, + "learning_rate": 3.542039707700103e-05, + "loss": 0.8101, + "step": 45260 + }, + { + "epoch": 0.7266569286826434, + "grad_norm": 1.041069746017456, + "learning_rate": 3.541466604037635e-05, + "loss": 0.8036, + "step": 45270 + }, + { + "epoch": 0.7268174449028074, + "grad_norm": 0.8871268630027771, + "learning_rate": 3.5408934341440044e-05, + "loss": 0.7463, + "step": 45280 + }, + { + "epoch": 0.7269779611229715, + "grad_norm": 0.7312914729118347, + "learning_rate": 3.540320198055661e-05, + "loss": 0.7819, + "step": 45290 + }, + { + "epoch": 0.7271384773431355, + "grad_norm": 1.12020742893219, + "learning_rate": 3.539746895809059e-05, + "loss": 0.9211, + "step": 45300 + }, + { + "epoch": 0.7272989935632995, + "grad_norm": 0.8918442130088806, + "learning_rate": 3.539173527440657e-05, + "loss": 0.7833, + "step": 45310 + }, + { + "epoch": 0.7274595097834636, + "grad_norm": 0.7914934158325195, + "learning_rate": 3.538600092986918e-05, + "loss": 0.7073, + "step": 45320 + }, + { + "epoch": 0.7276200260036276, + "grad_norm": 0.5760055780410767, + "learning_rate": 3.538026592484309e-05, + "loss": 0.8327, + "step": 45330 + }, + { + "epoch": 0.7277805422237917, + "grad_norm": 0.8478487133979797, + "learning_rate": 3.537453025969301e-05, + "loss": 0.708, + "step": 45340 + }, + { + "epoch": 0.7279410584439557, + "grad_norm": 0.6928296685218811, + "learning_rate": 3.5368793934783704e-05, + "loss": 0.8565, + "step": 45350 + }, + { + "epoch": 0.7281015746641198, + "grad_norm": 0.89241623878479, + "learning_rate": 3.5363056950479956e-05, + "loss": 0.7209, + "step": 45360 + }, + { + "epoch": 0.7282620908842838, + "grad_norm": 1.2079685926437378, + "learning_rate": 3.535731930714662e-05, + "loss": 0.8459, + "step": 45370 + }, + { + "epoch": 0.728422607104448, + "grad_norm": 0.9382962584495544, + "learning_rate": 3.535158100514856e-05, + "loss": 0.8027, + "step": 45380 + }, + { + "epoch": 0.728583123324612, + "grad_norm": 1.8930928707122803, + "learning_rate": 3.534584204485071e-05, + "loss": 0.7071, + "step": 45390 + }, + { + "epoch": 0.728743639544776, + "grad_norm": 0.662469744682312, + "learning_rate": 3.534010242661802e-05, + "loss": 0.8569, + "step": 45400 + }, + { + "epoch": 0.72890415576494, + "grad_norm": 0.5306475758552551, + "learning_rate": 3.533436215081551e-05, + "loss": 0.7414, + "step": 45410 + }, + { + "epoch": 0.729064671985104, + "grad_norm": 0.6982967853546143, + "learning_rate": 3.532862121780823e-05, + "loss": 0.7028, + "step": 45420 + }, + { + "epoch": 0.7292251882052682, + "grad_norm": 0.587578296661377, + "learning_rate": 3.532287962796126e-05, + "loss": 0.6621, + "step": 45430 + }, + { + "epoch": 0.7293857044254322, + "grad_norm": 0.7080175280570984, + "learning_rate": 3.5317137381639736e-05, + "loss": 0.7746, + "step": 45440 + }, + { + "epoch": 0.7295462206455963, + "grad_norm": 0.6761866807937622, + "learning_rate": 3.5311394479208835e-05, + "loss": 0.7865, + "step": 45450 + }, + { + "epoch": 0.7297067368657603, + "grad_norm": 0.4305269420146942, + "learning_rate": 3.530565092103376e-05, + "loss": 0.7085, + "step": 45460 + }, + { + "epoch": 0.7298672530859244, + "grad_norm": 0.6998249888420105, + "learning_rate": 3.529990670747977e-05, + "loss": 0.7226, + "step": 45470 + }, + { + "epoch": 0.7300277693060884, + "grad_norm": 0.682244598865509, + "learning_rate": 3.5294161838912176e-05, + "loss": 0.8242, + "step": 45480 + }, + { + "epoch": 0.7301882855262525, + "grad_norm": 0.9088906049728394, + "learning_rate": 3.5288416315696304e-05, + "loss": 0.7843, + "step": 45490 + }, + { + "epoch": 0.7303488017464165, + "grad_norm": 1.0565990209579468, + "learning_rate": 3.528267013819754e-05, + "loss": 0.669, + "step": 45500 + }, + { + "epoch": 0.7305093179665805, + "grad_norm": 0.677294909954071, + "learning_rate": 3.527692330678132e-05, + "loss": 0.7861, + "step": 45510 + }, + { + "epoch": 0.7306698341867446, + "grad_norm": 0.5881915092468262, + "learning_rate": 3.5271175821813084e-05, + "loss": 0.8837, + "step": 45520 + }, + { + "epoch": 0.7308303504069086, + "grad_norm": 0.9773523807525635, + "learning_rate": 3.526542768365836e-05, + "loss": 0.8885, + "step": 45530 + }, + { + "epoch": 0.7309908666270727, + "grad_norm": 1.3537858724594116, + "learning_rate": 3.525967889268269e-05, + "loss": 0.7642, + "step": 45540 + }, + { + "epoch": 0.7311513828472367, + "grad_norm": 0.6391019821166992, + "learning_rate": 3.525392944925165e-05, + "loss": 0.8403, + "step": 45550 + }, + { + "epoch": 0.7313118990674008, + "grad_norm": 0.5086960792541504, + "learning_rate": 3.524817935373089e-05, + "loss": 0.7936, + "step": 45560 + }, + { + "epoch": 0.7314724152875648, + "grad_norm": 0.7177423238754272, + "learning_rate": 3.524242860648607e-05, + "loss": 0.6727, + "step": 45570 + }, + { + "epoch": 0.7316329315077289, + "grad_norm": 0.7113295197486877, + "learning_rate": 3.523667720788291e-05, + "loss": 0.8917, + "step": 45580 + }, + { + "epoch": 0.7317934477278929, + "grad_norm": 0.7589520215988159, + "learning_rate": 3.523092515828715e-05, + "loss": 0.7792, + "step": 45590 + }, + { + "epoch": 0.7319539639480569, + "grad_norm": 0.7181091904640198, + "learning_rate": 3.522517245806462e-05, + "loss": 0.8675, + "step": 45600 + }, + { + "epoch": 0.732114480168221, + "grad_norm": 0.7202557921409607, + "learning_rate": 3.521941910758113e-05, + "loss": 0.7726, + "step": 45610 + }, + { + "epoch": 0.732274996388385, + "grad_norm": 0.6506466865539551, + "learning_rate": 3.521366510720255e-05, + "loss": 0.7585, + "step": 45620 + }, + { + "epoch": 0.7324355126085491, + "grad_norm": 1.5681684017181396, + "learning_rate": 3.520791045729484e-05, + "loss": 0.764, + "step": 45630 + }, + { + "epoch": 0.7325960288287131, + "grad_norm": 1.2547909021377563, + "learning_rate": 3.520215515822393e-05, + "loss": 0.6244, + "step": 45640 + }, + { + "epoch": 0.7327565450488772, + "grad_norm": 0.63919597864151, + "learning_rate": 3.519639921035583e-05, + "loss": 0.7926, + "step": 45650 + }, + { + "epoch": 0.7329170612690412, + "grad_norm": 0.7390354871749878, + "learning_rate": 3.51906426140566e-05, + "loss": 0.7603, + "step": 45660 + }, + { + "epoch": 0.7330775774892053, + "grad_norm": 0.8434482216835022, + "learning_rate": 3.51848853696923e-05, + "loss": 0.6863, + "step": 45670 + }, + { + "epoch": 0.7332380937093693, + "grad_norm": 1.0801622867584229, + "learning_rate": 3.517912747762907e-05, + "loss": 0.6665, + "step": 45680 + }, + { + "epoch": 0.7333986099295334, + "grad_norm": 0.7999425530433655, + "learning_rate": 3.517336893823308e-05, + "loss": 0.7154, + "step": 45690 + }, + { + "epoch": 0.7335591261496974, + "grad_norm": 1.0176345109939575, + "learning_rate": 3.516760975187054e-05, + "loss": 0.689, + "step": 45700 + }, + { + "epoch": 0.7337196423698614, + "grad_norm": 0.7539014220237732, + "learning_rate": 3.516184991890769e-05, + "loss": 0.8252, + "step": 45710 + }, + { + "epoch": 0.7338801585900255, + "grad_norm": 0.7340081930160522, + "learning_rate": 3.515608943971083e-05, + "loss": 0.8339, + "step": 45720 + }, + { + "epoch": 0.7340406748101895, + "grad_norm": 0.7325131893157959, + "learning_rate": 3.51503283146463e-05, + "loss": 0.8739, + "step": 45730 + }, + { + "epoch": 0.7342011910303536, + "grad_norm": 0.7341185808181763, + "learning_rate": 3.514456654408046e-05, + "loss": 0.7058, + "step": 45740 + }, + { + "epoch": 0.7343617072505176, + "grad_norm": 1.1263372898101807, + "learning_rate": 3.513880412837973e-05, + "loss": 0.9261, + "step": 45750 + }, + { + "epoch": 0.7345222234706817, + "grad_norm": 0.9125267863273621, + "learning_rate": 3.513304106791057e-05, + "loss": 0.7444, + "step": 45760 + }, + { + "epoch": 0.7346827396908457, + "grad_norm": 0.45151498913764954, + "learning_rate": 3.512727736303947e-05, + "loss": 0.7429, + "step": 45770 + }, + { + "epoch": 0.7348432559110099, + "grad_norm": 1.1225664615631104, + "learning_rate": 3.512151301413297e-05, + "loss": 0.7781, + "step": 45780 + }, + { + "epoch": 0.7350037721311738, + "grad_norm": 0.8200727105140686, + "learning_rate": 3.511574802155766e-05, + "loss": 0.815, + "step": 45790 + }, + { + "epoch": 0.735164288351338, + "grad_norm": 0.941150426864624, + "learning_rate": 3.5109982385680134e-05, + "loss": 0.7098, + "step": 45800 + }, + { + "epoch": 0.735324804571502, + "grad_norm": 0.8179227709770203, + "learning_rate": 3.510421610686707e-05, + "loss": 0.7396, + "step": 45810 + }, + { + "epoch": 0.735485320791666, + "grad_norm": 0.8163920044898987, + "learning_rate": 3.509844918548518e-05, + "loss": 0.8574, + "step": 45820 + }, + { + "epoch": 0.7356458370118301, + "grad_norm": 1.6106030941009521, + "learning_rate": 3.509268162190118e-05, + "loss": 0.8071, + "step": 45830 + }, + { + "epoch": 0.7358063532319941, + "grad_norm": 0.7064700722694397, + "learning_rate": 3.508691341648188e-05, + "loss": 0.6401, + "step": 45840 + }, + { + "epoch": 0.7359668694521582, + "grad_norm": 1.0426011085510254, + "learning_rate": 3.508114456959408e-05, + "loss": 0.8862, + "step": 45850 + }, + { + "epoch": 0.7361273856723222, + "grad_norm": 1.0215387344360352, + "learning_rate": 3.507537508160466e-05, + "loss": 0.709, + "step": 45860 + }, + { + "epoch": 0.7362879018924863, + "grad_norm": 1.0258136987686157, + "learning_rate": 3.506960495288052e-05, + "loss": 0.7938, + "step": 45870 + }, + { + "epoch": 0.7364484181126503, + "grad_norm": 0.5823265910148621, + "learning_rate": 3.506383418378861e-05, + "loss": 0.8321, + "step": 45880 + }, + { + "epoch": 0.7366089343328144, + "grad_norm": 0.6245133876800537, + "learning_rate": 3.505806277469591e-05, + "loss": 0.7855, + "step": 45890 + }, + { + "epoch": 0.7367694505529784, + "grad_norm": 0.6822571158409119, + "learning_rate": 3.505229072596946e-05, + "loss": 0.7557, + "step": 45900 + }, + { + "epoch": 0.7369299667731424, + "grad_norm": 0.7979654669761658, + "learning_rate": 3.504651803797631e-05, + "loss": 0.7873, + "step": 45910 + }, + { + "epoch": 0.7370904829933065, + "grad_norm": 0.637847900390625, + "learning_rate": 3.504074471108358e-05, + "loss": 0.7504, + "step": 45920 + }, + { + "epoch": 0.7372509992134705, + "grad_norm": 0.7962189316749573, + "learning_rate": 3.503497074565843e-05, + "loss": 0.8195, + "step": 45930 + }, + { + "epoch": 0.7374115154336346, + "grad_norm": 0.8234226107597351, + "learning_rate": 3.502919614206804e-05, + "loss": 0.76, + "step": 45940 + }, + { + "epoch": 0.7375720316537986, + "grad_norm": 0.9526236653327942, + "learning_rate": 3.502342090067963e-05, + "loss": 0.8203, + "step": 45950 + }, + { + "epoch": 0.7377325478739627, + "grad_norm": 1.0076135396957397, + "learning_rate": 3.501764502186049e-05, + "loss": 0.6886, + "step": 45960 + }, + { + "epoch": 0.7378930640941267, + "grad_norm": 0.7763625979423523, + "learning_rate": 3.5011868505977925e-05, + "loss": 0.9005, + "step": 45970 + }, + { + "epoch": 0.7380535803142908, + "grad_norm": 0.6784461140632629, + "learning_rate": 3.500609135339928e-05, + "loss": 0.7986, + "step": 45980 + }, + { + "epoch": 0.7382140965344548, + "grad_norm": 0.5210177898406982, + "learning_rate": 3.500031356449196e-05, + "loss": 0.6078, + "step": 45990 + }, + { + "epoch": 0.7383746127546189, + "grad_norm": 0.6026228070259094, + "learning_rate": 3.4994535139623394e-05, + "loss": 0.741, + "step": 46000 + }, + { + "epoch": 0.7385351289747829, + "grad_norm": 0.9030646085739136, + "learning_rate": 3.4988756079161056e-05, + "loss": 0.6834, + "step": 46010 + }, + { + "epoch": 0.7386956451949469, + "grad_norm": 0.6532267332077026, + "learning_rate": 3.498297638347246e-05, + "loss": 0.676, + "step": 46020 + }, + { + "epoch": 0.738856161415111, + "grad_norm": 0.9453523755073547, + "learning_rate": 3.497719605292514e-05, + "loss": 0.7211, + "step": 46030 + }, + { + "epoch": 0.739016677635275, + "grad_norm": 0.8047298789024353, + "learning_rate": 3.497141508788673e-05, + "loss": 0.6526, + "step": 46040 + }, + { + "epoch": 0.7391771938554391, + "grad_norm": 1.264747142791748, + "learning_rate": 3.4965633488724844e-05, + "loss": 0.7908, + "step": 46050 + }, + { + "epoch": 0.7393377100756031, + "grad_norm": 0.7942028641700745, + "learning_rate": 3.4959851255807165e-05, + "loss": 0.7414, + "step": 46060 + }, + { + "epoch": 0.7394982262957672, + "grad_norm": 0.9146605134010315, + "learning_rate": 3.4954068389501396e-05, + "loss": 0.7914, + "step": 46070 + }, + { + "epoch": 0.7396587425159312, + "grad_norm": 1.3214107751846313, + "learning_rate": 3.494828489017531e-05, + "loss": 0.7944, + "step": 46080 + }, + { + "epoch": 0.7398192587360953, + "grad_norm": 0.5482259392738342, + "learning_rate": 3.494250075819669e-05, + "loss": 0.858, + "step": 46090 + }, + { + "epoch": 0.7399797749562593, + "grad_norm": 0.9854785203933716, + "learning_rate": 3.4936715993933385e-05, + "loss": 0.7314, + "step": 46100 + }, + { + "epoch": 0.7401402911764233, + "grad_norm": 0.5981090068817139, + "learning_rate": 3.4930930597753255e-05, + "loss": 0.7414, + "step": 46110 + }, + { + "epoch": 0.7403008073965874, + "grad_norm": 1.2913228273391724, + "learning_rate": 3.4925144570024235e-05, + "loss": 0.8071, + "step": 46120 + }, + { + "epoch": 0.7404613236167514, + "grad_norm": 0.6664310097694397, + "learning_rate": 3.4919357911114275e-05, + "loss": 0.5402, + "step": 46130 + }, + { + "epoch": 0.7406218398369155, + "grad_norm": 1.0477807521820068, + "learning_rate": 3.491357062139137e-05, + "loss": 0.7724, + "step": 46140 + }, + { + "epoch": 0.7407823560570795, + "grad_norm": 0.9953406453132629, + "learning_rate": 3.490778270122356e-05, + "loss": 0.7543, + "step": 46150 + }, + { + "epoch": 0.7409428722772436, + "grad_norm": 0.5993740558624268, + "learning_rate": 3.490199415097892e-05, + "loss": 0.7056, + "step": 46160 + }, + { + "epoch": 0.7411033884974076, + "grad_norm": 0.7767809629440308, + "learning_rate": 3.489620497102558e-05, + "loss": 0.7934, + "step": 46170 + }, + { + "epoch": 0.7412639047175718, + "grad_norm": 0.8357779383659363, + "learning_rate": 3.489041516173168e-05, + "loss": 0.7296, + "step": 46180 + }, + { + "epoch": 0.7414244209377358, + "grad_norm": 0.7672458291053772, + "learning_rate": 3.488462472346543e-05, + "loss": 0.8138, + "step": 46190 + }, + { + "epoch": 0.7415849371578999, + "grad_norm": 0.8750352263450623, + "learning_rate": 3.4878833656595064e-05, + "loss": 0.8281, + "step": 46200 + }, + { + "epoch": 0.7417454533780639, + "grad_norm": 0.6871256232261658, + "learning_rate": 3.487304196148886e-05, + "loss": 0.8014, + "step": 46210 + }, + { + "epoch": 0.7419059695982279, + "grad_norm": 0.775514543056488, + "learning_rate": 3.4867249638515145e-05, + "loss": 0.7535, + "step": 46220 + }, + { + "epoch": 0.742066485818392, + "grad_norm": 1.303187608718872, + "learning_rate": 3.486145668804225e-05, + "loss": 0.9207, + "step": 46230 + }, + { + "epoch": 0.742227002038556, + "grad_norm": 0.785500168800354, + "learning_rate": 3.48556631104386e-05, + "loss": 0.8364, + "step": 46240 + }, + { + "epoch": 0.7423875182587201, + "grad_norm": 0.682597815990448, + "learning_rate": 3.4849868906072633e-05, + "loss": 0.8279, + "step": 46250 + }, + { + "epoch": 0.7425480344788841, + "grad_norm": 0.6680241227149963, + "learning_rate": 3.4844074075312806e-05, + "loss": 0.7078, + "step": 46260 + }, + { + "epoch": 0.7427085506990482, + "grad_norm": 0.8856092095375061, + "learning_rate": 3.4838278618527644e-05, + "loss": 0.9061, + "step": 46270 + }, + { + "epoch": 0.7428690669192122, + "grad_norm": 0.5470900535583496, + "learning_rate": 3.483248253608571e-05, + "loss": 0.8476, + "step": 46280 + }, + { + "epoch": 0.7430295831393763, + "grad_norm": 0.7936294078826904, + "learning_rate": 3.48266858283556e-05, + "loss": 0.8021, + "step": 46290 + }, + { + "epoch": 0.7431900993595403, + "grad_norm": 0.9083913564682007, + "learning_rate": 3.482088849570594e-05, + "loss": 0.7027, + "step": 46300 + }, + { + "epoch": 0.7433506155797043, + "grad_norm": 1.2436202764511108, + "learning_rate": 3.4815090538505415e-05, + "loss": 0.769, + "step": 46310 + }, + { + "epoch": 0.7435111317998684, + "grad_norm": 0.7077643275260925, + "learning_rate": 3.4809291957122756e-05, + "loss": 0.6837, + "step": 46320 + }, + { + "epoch": 0.7436716480200324, + "grad_norm": 0.6772229075431824, + "learning_rate": 3.480349275192669e-05, + "loss": 0.7047, + "step": 46330 + }, + { + "epoch": 0.7438321642401965, + "grad_norm": 1.3686575889587402, + "learning_rate": 3.479769292328603e-05, + "loss": 0.747, + "step": 46340 + }, + { + "epoch": 0.7439926804603605, + "grad_norm": 0.7352926135063171, + "learning_rate": 3.47918924715696e-05, + "loss": 0.7885, + "step": 46350 + }, + { + "epoch": 0.7441531966805246, + "grad_norm": 0.8482518792152405, + "learning_rate": 3.478609139714628e-05, + "loss": 0.7137, + "step": 46360 + }, + { + "epoch": 0.7443137129006886, + "grad_norm": 0.7089337706565857, + "learning_rate": 3.4780289700385e-05, + "loss": 0.8582, + "step": 46370 + }, + { + "epoch": 0.7444742291208527, + "grad_norm": 1.3654853105545044, + "learning_rate": 3.477448738165469e-05, + "loss": 0.9459, + "step": 46380 + }, + { + "epoch": 0.7446347453410167, + "grad_norm": 1.0225071907043457, + "learning_rate": 3.476868444132435e-05, + "loss": 0.6388, + "step": 46390 + }, + { + "epoch": 0.7447952615611808, + "grad_norm": 0.593392550945282, + "learning_rate": 3.476288087976302e-05, + "loss": 0.8466, + "step": 46400 + }, + { + "epoch": 0.7449557777813448, + "grad_norm": 1.065180778503418, + "learning_rate": 3.4757076697339776e-05, + "loss": 0.7409, + "step": 46410 + }, + { + "epoch": 0.7451162940015088, + "grad_norm": 1.2445522546768188, + "learning_rate": 3.475127189442371e-05, + "loss": 0.8317, + "step": 46420 + }, + { + "epoch": 0.7452768102216729, + "grad_norm": 0.6922425031661987, + "learning_rate": 3.4745466471383994e-05, + "loss": 0.7752, + "step": 46430 + }, + { + "epoch": 0.7454373264418369, + "grad_norm": 0.9538314938545227, + "learning_rate": 3.473966042858981e-05, + "loss": 0.8426, + "step": 46440 + }, + { + "epoch": 0.745597842662001, + "grad_norm": 0.4190889298915863, + "learning_rate": 3.473385376641039e-05, + "loss": 0.6197, + "step": 46450 + }, + { + "epoch": 0.745758358882165, + "grad_norm": 1.274301528930664, + "learning_rate": 3.472804648521499e-05, + "loss": 0.7873, + "step": 46460 + }, + { + "epoch": 0.7459188751023291, + "grad_norm": 0.8003995418548584, + "learning_rate": 3.4722238585372945e-05, + "loss": 0.833, + "step": 46470 + }, + { + "epoch": 0.7460793913224931, + "grad_norm": 0.7970043420791626, + "learning_rate": 3.471643006725358e-05, + "loss": 0.8242, + "step": 46480 + }, + { + "epoch": 0.7462399075426572, + "grad_norm": 0.780644953250885, + "learning_rate": 3.4710620931226295e-05, + "loss": 0.791, + "step": 46490 + }, + { + "epoch": 0.7464004237628212, + "grad_norm": 0.69204181432724, + "learning_rate": 3.470481117766052e-05, + "loss": 0.7807, + "step": 46500 + }, + { + "epoch": 0.7465609399829852, + "grad_norm": 0.591472327709198, + "learning_rate": 3.4699000806925715e-05, + "loss": 0.8082, + "step": 46510 + }, + { + "epoch": 0.7467214562031493, + "grad_norm": 1.3708049058914185, + "learning_rate": 3.469318981939138e-05, + "loss": 0.8039, + "step": 46520 + }, + { + "epoch": 0.7468819724233133, + "grad_norm": 0.8735969066619873, + "learning_rate": 3.468737821542707e-05, + "loss": 0.7551, + "step": 46530 + }, + { + "epoch": 0.7470424886434774, + "grad_norm": 0.620796263217926, + "learning_rate": 3.468156599540236e-05, + "loss": 0.8865, + "step": 46540 + }, + { + "epoch": 0.7472030048636414, + "grad_norm": 0.5172329545021057, + "learning_rate": 3.467575315968688e-05, + "loss": 0.7783, + "step": 46550 + }, + { + "epoch": 0.7473635210838055, + "grad_norm": 0.7837884426116943, + "learning_rate": 3.466993970865029e-05, + "loss": 0.8748, + "step": 46560 + }, + { + "epoch": 0.7475240373039695, + "grad_norm": 0.6768105626106262, + "learning_rate": 3.4664125642662295e-05, + "loss": 0.7636, + "step": 46570 + }, + { + "epoch": 0.7476845535241337, + "grad_norm": 0.9330618381500244, + "learning_rate": 3.465831096209263e-05, + "loss": 0.9333, + "step": 46580 + }, + { + "epoch": 0.7478450697442977, + "grad_norm": 0.7827755808830261, + "learning_rate": 3.4652495667311076e-05, + "loss": 0.7072, + "step": 46590 + }, + { + "epoch": 0.7480055859644618, + "grad_norm": 0.7209663391113281, + "learning_rate": 3.464667975868745e-05, + "loss": 0.8475, + "step": 46600 + }, + { + "epoch": 0.7481661021846258, + "grad_norm": 0.8247103095054626, + "learning_rate": 3.464086323659161e-05, + "loss": 0.8401, + "step": 46610 + }, + { + "epoch": 0.7483266184047898, + "grad_norm": 0.8554466366767883, + "learning_rate": 3.463504610139346e-05, + "loss": 0.6963, + "step": 46620 + }, + { + "epoch": 0.7484871346249539, + "grad_norm": 0.8670833706855774, + "learning_rate": 3.4629228353462926e-05, + "loss": 0.8112, + "step": 46630 + }, + { + "epoch": 0.7486476508451179, + "grad_norm": 0.5264651775360107, + "learning_rate": 3.4623409993169986e-05, + "loss": 0.6618, + "step": 46640 + }, + { + "epoch": 0.748808167065282, + "grad_norm": 0.740392804145813, + "learning_rate": 3.4617591020884655e-05, + "loss": 0.8436, + "step": 46650 + }, + { + "epoch": 0.748968683285446, + "grad_norm": 0.6391090154647827, + "learning_rate": 3.4611771436976986e-05, + "loss": 0.7313, + "step": 46660 + }, + { + "epoch": 0.7491291995056101, + "grad_norm": 0.49133387207984924, + "learning_rate": 3.460595124181707e-05, + "loss": 0.6799, + "step": 46670 + }, + { + "epoch": 0.7492897157257741, + "grad_norm": 0.7975960969924927, + "learning_rate": 3.4600130435775036e-05, + "loss": 0.7554, + "step": 46680 + }, + { + "epoch": 0.7494502319459382, + "grad_norm": 0.71964031457901, + "learning_rate": 3.459430901922105e-05, + "loss": 0.7753, + "step": 46690 + }, + { + "epoch": 0.7496107481661022, + "grad_norm": 0.9347183704376221, + "learning_rate": 3.458848699252533e-05, + "loss": 0.8981, + "step": 46700 + }, + { + "epoch": 0.7497712643862662, + "grad_norm": 0.8880789279937744, + "learning_rate": 3.4582664356058104e-05, + "loss": 0.715, + "step": 46710 + }, + { + "epoch": 0.7499317806064303, + "grad_norm": 0.7372109293937683, + "learning_rate": 3.457684111018968e-05, + "loss": 0.7826, + "step": 46720 + }, + { + "epoch": 0.7500922968265943, + "grad_norm": 0.6302301287651062, + "learning_rate": 3.457101725529037e-05, + "loss": 0.7026, + "step": 46730 + }, + { + "epoch": 0.7502528130467584, + "grad_norm": 1.1651113033294678, + "learning_rate": 3.4565192791730536e-05, + "loss": 0.7759, + "step": 46740 + }, + { + "epoch": 0.7504133292669224, + "grad_norm": 1.8180187940597534, + "learning_rate": 3.455936771988058e-05, + "loss": 0.7677, + "step": 46750 + }, + { + "epoch": 0.7505738454870865, + "grad_norm": 0.5760422945022583, + "learning_rate": 3.455354204011095e-05, + "loss": 0.8369, + "step": 46760 + }, + { + "epoch": 0.7507343617072505, + "grad_norm": 0.9635288715362549, + "learning_rate": 3.454771575279212e-05, + "loss": 0.759, + "step": 46770 + }, + { + "epoch": 0.7508948779274146, + "grad_norm": 0.7615636587142944, + "learning_rate": 3.4541888858294605e-05, + "loss": 0.8246, + "step": 46780 + }, + { + "epoch": 0.7510553941475786, + "grad_norm": 0.6771033406257629, + "learning_rate": 3.4536061356988964e-05, + "loss": 0.7842, + "step": 46790 + }, + { + "epoch": 0.7512159103677427, + "grad_norm": 0.8168887495994568, + "learning_rate": 3.4530233249245795e-05, + "loss": 0.8158, + "step": 46800 + }, + { + "epoch": 0.7513764265879067, + "grad_norm": 1.118881344795227, + "learning_rate": 3.4524404535435725e-05, + "loss": 0.8251, + "step": 46810 + }, + { + "epoch": 0.7515369428080707, + "grad_norm": 1.382838249206543, + "learning_rate": 3.451857521592943e-05, + "loss": 0.6661, + "step": 46820 + }, + { + "epoch": 0.7516974590282348, + "grad_norm": 1.117309331893921, + "learning_rate": 3.451274529109761e-05, + "loss": 0.7533, + "step": 46830 + }, + { + "epoch": 0.7518579752483988, + "grad_norm": 0.7794817090034485, + "learning_rate": 3.4506914761311036e-05, + "loss": 0.8202, + "step": 46840 + }, + { + "epoch": 0.7520184914685629, + "grad_norm": 0.6798915863037109, + "learning_rate": 3.450108362694048e-05, + "loss": 0.7958, + "step": 46850 + }, + { + "epoch": 0.7521790076887269, + "grad_norm": 0.5663411617279053, + "learning_rate": 3.449525188835677e-05, + "loss": 0.6325, + "step": 46860 + }, + { + "epoch": 0.752339523908891, + "grad_norm": 1.09853994846344, + "learning_rate": 3.4489419545930774e-05, + "loss": 0.8181, + "step": 46870 + }, + { + "epoch": 0.752500040129055, + "grad_norm": 0.5723960995674133, + "learning_rate": 3.448358660003339e-05, + "loss": 0.7409, + "step": 46880 + }, + { + "epoch": 0.7526605563492191, + "grad_norm": 0.9123439192771912, + "learning_rate": 3.4477753051035567e-05, + "loss": 0.7841, + "step": 46890 + }, + { + "epoch": 0.7528210725693831, + "grad_norm": 0.6417880654335022, + "learning_rate": 3.447191889930827e-05, + "loss": 0.8229, + "step": 46900 + }, + { + "epoch": 0.7529815887895471, + "grad_norm": 0.8627910017967224, + "learning_rate": 3.446608414522253e-05, + "loss": 0.817, + "step": 46910 + }, + { + "epoch": 0.7531421050097112, + "grad_norm": 0.6413512825965881, + "learning_rate": 3.4460248789149396e-05, + "loss": 0.7399, + "step": 46920 + }, + { + "epoch": 0.7533026212298752, + "grad_norm": 0.7903092503547668, + "learning_rate": 3.445441283145996e-05, + "loss": 0.7551, + "step": 46930 + }, + { + "epoch": 0.7534631374500393, + "grad_norm": 0.9058539271354675, + "learning_rate": 3.444857627252537e-05, + "loss": 0.896, + "step": 46940 + }, + { + "epoch": 0.7536236536702033, + "grad_norm": 0.7881035804748535, + "learning_rate": 3.444273911271679e-05, + "loss": 0.8308, + "step": 46950 + }, + { + "epoch": 0.7537841698903675, + "grad_norm": 1.2780065536499023, + "learning_rate": 3.443690135240541e-05, + "loss": 0.9231, + "step": 46960 + }, + { + "epoch": 0.7539446861105314, + "grad_norm": 0.7312056422233582, + "learning_rate": 3.4431062991962506e-05, + "loss": 0.6801, + "step": 46970 + }, + { + "epoch": 0.7541052023306956, + "grad_norm": 0.8339315056800842, + "learning_rate": 3.442522403175934e-05, + "loss": 0.8191, + "step": 46980 + }, + { + "epoch": 0.7542657185508596, + "grad_norm": 0.6344066262245178, + "learning_rate": 3.441938447216725e-05, + "loss": 0.729, + "step": 46990 + }, + { + "epoch": 0.7544262347710237, + "grad_norm": 0.8734412789344788, + "learning_rate": 3.44135443135576e-05, + "loss": 0.8237, + "step": 47000 + }, + { + "epoch": 0.7545867509911877, + "grad_norm": 0.5487950444221497, + "learning_rate": 3.440770355630178e-05, + "loss": 0.7837, + "step": 47010 + }, + { + "epoch": 0.7547472672113517, + "grad_norm": 0.5525424480438232, + "learning_rate": 3.4401862200771226e-05, + "loss": 0.7693, + "step": 47020 + }, + { + "epoch": 0.7549077834315158, + "grad_norm": 0.5308713316917419, + "learning_rate": 3.439602024733743e-05, + "loss": 0.8235, + "step": 47030 + }, + { + "epoch": 0.7550682996516798, + "grad_norm": 0.5107971429824829, + "learning_rate": 3.439017769637189e-05, + "loss": 0.6793, + "step": 47040 + }, + { + "epoch": 0.7552288158718439, + "grad_norm": 0.8562702536582947, + "learning_rate": 3.4384334548246156e-05, + "loss": 0.7931, + "step": 47050 + }, + { + "epoch": 0.7553893320920079, + "grad_norm": 0.9141082167625427, + "learning_rate": 3.4378490803331836e-05, + "loss": 0.9288, + "step": 47060 + }, + { + "epoch": 0.755549848312172, + "grad_norm": 0.5264933705329895, + "learning_rate": 3.4372646462000544e-05, + "loss": 0.6829, + "step": 47070 + }, + { + "epoch": 0.755710364532336, + "grad_norm": 0.9945857524871826, + "learning_rate": 3.436680152462395e-05, + "loss": 0.7343, + "step": 47080 + }, + { + "epoch": 0.7558708807525001, + "grad_norm": 1.2583001852035522, + "learning_rate": 3.436095599157375e-05, + "loss": 0.8002, + "step": 47090 + }, + { + "epoch": 0.7560313969726641, + "grad_norm": 0.6638344526290894, + "learning_rate": 3.4355109863221695e-05, + "loss": 0.7014, + "step": 47100 + }, + { + "epoch": 0.7561919131928281, + "grad_norm": 0.7283402681350708, + "learning_rate": 3.4349263139939566e-05, + "loss": 0.7813, + "step": 47110 + }, + { + "epoch": 0.7563524294129922, + "grad_norm": 0.5065928101539612, + "learning_rate": 3.4343415822099165e-05, + "loss": 0.6863, + "step": 47120 + }, + { + "epoch": 0.7565129456331562, + "grad_norm": 0.6920192241668701, + "learning_rate": 3.4337567910072376e-05, + "loss": 0.8297, + "step": 47130 + }, + { + "epoch": 0.7566734618533203, + "grad_norm": 0.7680574059486389, + "learning_rate": 3.433171940423106e-05, + "loss": 0.8249, + "step": 47140 + }, + { + "epoch": 0.7568339780734843, + "grad_norm": 0.8951802849769592, + "learning_rate": 3.432587030494717e-05, + "loss": 0.8049, + "step": 47150 + }, + { + "epoch": 0.7569944942936484, + "grad_norm": 0.7420780062675476, + "learning_rate": 3.4320020612592664e-05, + "loss": 0.7482, + "step": 47160 + }, + { + "epoch": 0.7571550105138124, + "grad_norm": 0.48718616366386414, + "learning_rate": 3.431417032753954e-05, + "loss": 0.7663, + "step": 47170 + }, + { + "epoch": 0.7573155267339765, + "grad_norm": 1.1934170722961426, + "learning_rate": 3.430831945015986e-05, + "loss": 0.8336, + "step": 47180 + }, + { + "epoch": 0.7574760429541405, + "grad_norm": 1.1655116081237793, + "learning_rate": 3.43024679808257e-05, + "loss": 0.7571, + "step": 47190 + }, + { + "epoch": 0.7576365591743046, + "grad_norm": 0.9448001384735107, + "learning_rate": 3.429661591990917e-05, + "loss": 0.7507, + "step": 47200 + }, + { + "epoch": 0.7577970753944686, + "grad_norm": 0.6156874895095825, + "learning_rate": 3.429076326778243e-05, + "loss": 0.7743, + "step": 47210 + }, + { + "epoch": 0.7579575916146326, + "grad_norm": 1.0309243202209473, + "learning_rate": 3.428491002481769e-05, + "loss": 0.7714, + "step": 47220 + }, + { + "epoch": 0.7581181078347967, + "grad_norm": 0.6443856954574585, + "learning_rate": 3.427905619138716e-05, + "loss": 0.7416, + "step": 47230 + }, + { + "epoch": 0.7582786240549607, + "grad_norm": 0.6925715804100037, + "learning_rate": 3.427320176786312e-05, + "loss": 0.7398, + "step": 47240 + }, + { + "epoch": 0.7584391402751248, + "grad_norm": 0.7475680708885193, + "learning_rate": 3.426734675461788e-05, + "loss": 0.7203, + "step": 47250 + }, + { + "epoch": 0.7585996564952888, + "grad_norm": 0.9528889656066895, + "learning_rate": 3.426149115202378e-05, + "loss": 0.7963, + "step": 47260 + }, + { + "epoch": 0.7587601727154529, + "grad_norm": 0.8842716217041016, + "learning_rate": 3.42556349604532e-05, + "loss": 0.8826, + "step": 47270 + }, + { + "epoch": 0.7589206889356169, + "grad_norm": 1.7770744562149048, + "learning_rate": 3.424977818027856e-05, + "loss": 0.752, + "step": 47280 + }, + { + "epoch": 0.759081205155781, + "grad_norm": 0.5097578167915344, + "learning_rate": 3.424392081187231e-05, + "loss": 0.8719, + "step": 47290 + }, + { + "epoch": 0.759241721375945, + "grad_norm": 0.6701672077178955, + "learning_rate": 3.423806285560696e-05, + "loss": 0.6617, + "step": 47300 + }, + { + "epoch": 0.7594022375961091, + "grad_norm": 0.8934614062309265, + "learning_rate": 3.4232204311855046e-05, + "loss": 0.8474, + "step": 47310 + }, + { + "epoch": 0.7595627538162731, + "grad_norm": 0.8113291263580322, + "learning_rate": 3.422634518098911e-05, + "loss": 0.6734, + "step": 47320 + }, + { + "epoch": 0.7597232700364371, + "grad_norm": 0.8437072038650513, + "learning_rate": 3.422048546338178e-05, + "loss": 0.8289, + "step": 47330 + }, + { + "epoch": 0.7598837862566012, + "grad_norm": 0.929185152053833, + "learning_rate": 3.4214625159405696e-05, + "loss": 0.8995, + "step": 47340 + }, + { + "epoch": 0.7600443024767652, + "grad_norm": 0.7989711165428162, + "learning_rate": 3.420876426943353e-05, + "loss": 0.8066, + "step": 47350 + }, + { + "epoch": 0.7602048186969294, + "grad_norm": 1.0777970552444458, + "learning_rate": 3.420290279383801e-05, + "loss": 0.7043, + "step": 47360 + }, + { + "epoch": 0.7603653349170933, + "grad_norm": 0.5642830729484558, + "learning_rate": 3.419704073299189e-05, + "loss": 0.722, + "step": 47370 + }, + { + "epoch": 0.7605258511372575, + "grad_norm": 0.5777187943458557, + "learning_rate": 3.419117808726795e-05, + "loss": 0.9146, + "step": 47380 + }, + { + "epoch": 0.7606863673574215, + "grad_norm": 0.9973240494728088, + "learning_rate": 3.418531485703904e-05, + "loss": 0.843, + "step": 47390 + }, + { + "epoch": 0.7608468835775856, + "grad_norm": 0.7608653903007507, + "learning_rate": 3.417945104267802e-05, + "loss": 0.8007, + "step": 47400 + }, + { + "epoch": 0.7610073997977496, + "grad_norm": 0.789920449256897, + "learning_rate": 3.417358664455779e-05, + "loss": 0.7232, + "step": 47410 + }, + { + "epoch": 0.7611679160179136, + "grad_norm": 1.0600416660308838, + "learning_rate": 3.4167721663051285e-05, + "loss": 0.8468, + "step": 47420 + }, + { + "epoch": 0.7613284322380777, + "grad_norm": 1.3420401811599731, + "learning_rate": 3.4161856098531496e-05, + "loss": 0.7584, + "step": 47430 + }, + { + "epoch": 0.7614889484582417, + "grad_norm": 0.7567830681800842, + "learning_rate": 3.4155989951371434e-05, + "loss": 0.7416, + "step": 47440 + }, + { + "epoch": 0.7616494646784058, + "grad_norm": 1.0696054697036743, + "learning_rate": 3.415012322194415e-05, + "loss": 0.6926, + "step": 47450 + }, + { + "epoch": 0.7618099808985698, + "grad_norm": 0.8253053426742554, + "learning_rate": 3.4144255910622734e-05, + "loss": 0.7476, + "step": 47460 + }, + { + "epoch": 0.7619704971187339, + "grad_norm": 0.8364560604095459, + "learning_rate": 3.413838801778032e-05, + "loss": 0.7486, + "step": 47470 + }, + { + "epoch": 0.7621310133388979, + "grad_norm": 0.5578338503837585, + "learning_rate": 3.413251954379006e-05, + "loss": 0.8421, + "step": 47480 + }, + { + "epoch": 0.762291529559062, + "grad_norm": 0.7376497983932495, + "learning_rate": 3.412665048902516e-05, + "loss": 0.766, + "step": 47490 + }, + { + "epoch": 0.762452045779226, + "grad_norm": 0.511900007724762, + "learning_rate": 3.412078085385886e-05, + "loss": 0.8833, + "step": 47500 + }, + { + "epoch": 0.7626125619993901, + "grad_norm": 0.8520833253860474, + "learning_rate": 3.411491063866443e-05, + "loss": 0.6791, + "step": 47510 + }, + { + "epoch": 0.7627730782195541, + "grad_norm": 1.0527558326721191, + "learning_rate": 3.410903984381518e-05, + "loss": 0.7416, + "step": 47520 + }, + { + "epoch": 0.7629335944397181, + "grad_norm": 1.104543685913086, + "learning_rate": 3.410316846968446e-05, + "loss": 0.7548, + "step": 47530 + }, + { + "epoch": 0.7630941106598822, + "grad_norm": 0.7047179341316223, + "learning_rate": 3.409729651664566e-05, + "loss": 0.7395, + "step": 47540 + }, + { + "epoch": 0.7632546268800462, + "grad_norm": 1.2407176494598389, + "learning_rate": 3.409142398507219e-05, + "loss": 0.724, + "step": 47550 + }, + { + "epoch": 0.7634151431002103, + "grad_norm": 0.6105513572692871, + "learning_rate": 3.408555087533752e-05, + "loss": 0.8059, + "step": 47560 + }, + { + "epoch": 0.7635756593203743, + "grad_norm": 0.8185921311378479, + "learning_rate": 3.407967718781514e-05, + "loss": 0.8653, + "step": 47570 + }, + { + "epoch": 0.7637361755405384, + "grad_norm": 0.5622501373291016, + "learning_rate": 3.4073802922878586e-05, + "loss": 0.8159, + "step": 47580 + }, + { + "epoch": 0.7638966917607024, + "grad_norm": 0.7776587009429932, + "learning_rate": 3.406792808090142e-05, + "loss": 0.8158, + "step": 47590 + }, + { + "epoch": 0.7640572079808665, + "grad_norm": 0.5370931029319763, + "learning_rate": 3.4062052662257246e-05, + "loss": 0.8575, + "step": 47600 + }, + { + "epoch": 0.7642177242010305, + "grad_norm": 0.6818130612373352, + "learning_rate": 3.405617666731972e-05, + "loss": 0.7397, + "step": 47610 + }, + { + "epoch": 0.7643782404211945, + "grad_norm": 0.6477590203285217, + "learning_rate": 3.405030009646252e-05, + "loss": 0.7463, + "step": 47620 + }, + { + "epoch": 0.7645387566413586, + "grad_norm": 0.6784939169883728, + "learning_rate": 3.404442295005934e-05, + "loss": 0.8242, + "step": 47630 + }, + { + "epoch": 0.7646992728615226, + "grad_norm": 0.4738917946815491, + "learning_rate": 3.4038545228483945e-05, + "loss": 0.8061, + "step": 47640 + }, + { + "epoch": 0.7648597890816867, + "grad_norm": 0.8201245069503784, + "learning_rate": 3.403266693211013e-05, + "loss": 0.7692, + "step": 47650 + }, + { + "epoch": 0.7650203053018507, + "grad_norm": 1.248845100402832, + "learning_rate": 3.402678806131172e-05, + "loss": 0.7772, + "step": 47660 + }, + { + "epoch": 0.7651808215220148, + "grad_norm": 0.6773941516876221, + "learning_rate": 3.402090861646256e-05, + "loss": 0.698, + "step": 47670 + }, + { + "epoch": 0.7653413377421788, + "grad_norm": 0.6296435594558716, + "learning_rate": 3.4015028597936565e-05, + "loss": 0.7773, + "step": 47680 + }, + { + "epoch": 0.7655018539623429, + "grad_norm": 0.8133717179298401, + "learning_rate": 3.4009148006107674e-05, + "loss": 0.785, + "step": 47690 + }, + { + "epoch": 0.7656623701825069, + "grad_norm": 0.6503700613975525, + "learning_rate": 3.400326684134984e-05, + "loss": 0.8582, + "step": 47700 + }, + { + "epoch": 0.765822886402671, + "grad_norm": 0.7359870672225952, + "learning_rate": 3.3997385104037065e-05, + "loss": 0.869, + "step": 47710 + }, + { + "epoch": 0.765983402622835, + "grad_norm": 0.9801903963088989, + "learning_rate": 3.399150279454342e-05, + "loss": 0.7861, + "step": 47720 + }, + { + "epoch": 0.766143918842999, + "grad_norm": 0.6348630785942078, + "learning_rate": 3.398561991324297e-05, + "loss": 0.739, + "step": 47730 + }, + { + "epoch": 0.7663044350631631, + "grad_norm": 0.9088622331619263, + "learning_rate": 3.397973646050984e-05, + "loss": 0.7218, + "step": 47740 + }, + { + "epoch": 0.7664649512833271, + "grad_norm": 0.6575883626937866, + "learning_rate": 3.3973852436718165e-05, + "loss": 0.8651, + "step": 47750 + }, + { + "epoch": 0.7666254675034913, + "grad_norm": 0.610304057598114, + "learning_rate": 3.3967967842242156e-05, + "loss": 0.7839, + "step": 47760 + }, + { + "epoch": 0.7667859837236553, + "grad_norm": 1.184758186340332, + "learning_rate": 3.3962082677456025e-05, + "loss": 0.7082, + "step": 47770 + }, + { + "epoch": 0.7669464999438194, + "grad_norm": 0.4262809157371521, + "learning_rate": 3.3956196942734044e-05, + "loss": 0.6568, + "step": 47780 + }, + { + "epoch": 0.7671070161639834, + "grad_norm": 0.670619547367096, + "learning_rate": 3.3950310638450494e-05, + "loss": 0.6822, + "step": 47790 + }, + { + "epoch": 0.7672675323841475, + "grad_norm": 1.316801905632019, + "learning_rate": 3.394442376497973e-05, + "loss": 0.7746, + "step": 47800 + }, + { + "epoch": 0.7674280486043115, + "grad_norm": 0.7800815105438232, + "learning_rate": 3.3938536322696106e-05, + "loss": 0.7763, + "step": 47810 + }, + { + "epoch": 0.7675885648244755, + "grad_norm": 0.6589600443840027, + "learning_rate": 3.393264831197404e-05, + "loss": 0.7018, + "step": 47820 + }, + { + "epoch": 0.7677490810446396, + "grad_norm": 0.8333020806312561, + "learning_rate": 3.392675973318796e-05, + "loss": 0.8367, + "step": 47830 + }, + { + "epoch": 0.7679095972648036, + "grad_norm": 0.8035185933113098, + "learning_rate": 3.3920870586712374e-05, + "loss": 0.9729, + "step": 47840 + }, + { + "epoch": 0.7680701134849677, + "grad_norm": 0.6918268799781799, + "learning_rate": 3.3914980872921766e-05, + "loss": 0.703, + "step": 47850 + }, + { + "epoch": 0.7682306297051317, + "grad_norm": 0.5614519715309143, + "learning_rate": 3.39090905921907e-05, + "loss": 0.7367, + "step": 47860 + }, + { + "epoch": 0.7683911459252958, + "grad_norm": 1.1502766609191895, + "learning_rate": 3.3903199744893765e-05, + "loss": 0.7605, + "step": 47870 + }, + { + "epoch": 0.7685516621454598, + "grad_norm": 0.686087965965271, + "learning_rate": 3.389730833140559e-05, + "loss": 0.6743, + "step": 47880 + }, + { + "epoch": 0.7687121783656239, + "grad_norm": 0.7218456268310547, + "learning_rate": 3.389141635210082e-05, + "loss": 0.7185, + "step": 47890 + }, + { + "epoch": 0.7688726945857879, + "grad_norm": 0.6282814145088196, + "learning_rate": 3.388552380735416e-05, + "loss": 0.7476, + "step": 47900 + }, + { + "epoch": 0.769033210805952, + "grad_norm": 0.8465125560760498, + "learning_rate": 3.3879630697540335e-05, + "loss": 0.934, + "step": 47910 + }, + { + "epoch": 0.769193727026116, + "grad_norm": 0.6591414213180542, + "learning_rate": 3.3873737023034114e-05, + "loss": 0.7864, + "step": 47920 + }, + { + "epoch": 0.76935424324628, + "grad_norm": 0.7375032305717468, + "learning_rate": 3.3867842784210305e-05, + "loss": 0.8614, + "step": 47930 + }, + { + "epoch": 0.7695147594664441, + "grad_norm": 1.0226800441741943, + "learning_rate": 3.386194798144375e-05, + "loss": 0.8013, + "step": 47940 + }, + { + "epoch": 0.7696752756866081, + "grad_norm": 0.7681062817573547, + "learning_rate": 3.3856052615109314e-05, + "loss": 0.8021, + "step": 47950 + }, + { + "epoch": 0.7698357919067722, + "grad_norm": 0.5256933569908142, + "learning_rate": 3.3850156685581904e-05, + "loss": 0.7612, + "step": 47960 + }, + { + "epoch": 0.7699963081269362, + "grad_norm": 0.7660645246505737, + "learning_rate": 3.384426019323649e-05, + "loss": 0.7206, + "step": 47970 + }, + { + "epoch": 0.7701568243471003, + "grad_norm": 0.5864084959030151, + "learning_rate": 3.383836313844802e-05, + "loss": 0.6764, + "step": 47980 + }, + { + "epoch": 0.7703173405672643, + "grad_norm": 0.7991495132446289, + "learning_rate": 3.383246552159154e-05, + "loss": 0.7104, + "step": 47990 + }, + { + "epoch": 0.7704778567874284, + "grad_norm": 1.1140830516815186, + "learning_rate": 3.382656734304209e-05, + "loss": 0.8399, + "step": 48000 + }, + { + "epoch": 0.7704778567874284, + "eval_loss": 0.7818954586982727, + "eval_runtime": 1833.3343, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 1.789, + "step": 48000 + }, + { + "epoch": 0.7706383730075924, + "grad_norm": 0.9191313982009888, + "learning_rate": 3.382066860317477e-05, + "loss": 0.8289, + "step": 48010 + }, + { + "epoch": 0.7707988892277564, + "grad_norm": 0.8463922142982483, + "learning_rate": 3.3814769302364696e-05, + "loss": 0.7918, + "step": 48020 + }, + { + "epoch": 0.7709594054479205, + "grad_norm": 0.9990621209144592, + "learning_rate": 3.380886944098703e-05, + "loss": 0.8387, + "step": 48030 + }, + { + "epoch": 0.7711199216680845, + "grad_norm": 1.1503041982650757, + "learning_rate": 3.380296901941697e-05, + "loss": 0.7028, + "step": 48040 + }, + { + "epoch": 0.7712804378882486, + "grad_norm": 0.6125820279121399, + "learning_rate": 3.3797068038029754e-05, + "loss": 0.6937, + "step": 48050 + }, + { + "epoch": 0.7714409541084126, + "grad_norm": 0.7416352033615112, + "learning_rate": 3.3791166497200645e-05, + "loss": 0.7629, + "step": 48060 + }, + { + "epoch": 0.7716014703285767, + "grad_norm": 0.7862654328346252, + "learning_rate": 3.3785264397304935e-05, + "loss": 0.7137, + "step": 48070 + }, + { + "epoch": 0.7717619865487407, + "grad_norm": 1.5761116743087769, + "learning_rate": 3.377936173871798e-05, + "loss": 0.7378, + "step": 48080 + }, + { + "epoch": 0.7719225027689048, + "grad_norm": 0.8357574939727783, + "learning_rate": 3.377345852181515e-05, + "loss": 0.7774, + "step": 48090 + }, + { + "epoch": 0.7720830189890688, + "grad_norm": 0.6209170818328857, + "learning_rate": 3.3767554746971853e-05, + "loss": 0.7401, + "step": 48100 + }, + { + "epoch": 0.772243535209233, + "grad_norm": 0.6413715481758118, + "learning_rate": 3.376165041456353e-05, + "loss": 0.7458, + "step": 48110 + }, + { + "epoch": 0.7724040514293969, + "grad_norm": 0.7396160960197449, + "learning_rate": 3.375574552496567e-05, + "loss": 0.806, + "step": 48120 + }, + { + "epoch": 0.7725645676495609, + "grad_norm": 1.0319231748580933, + "learning_rate": 3.3749840078553784e-05, + "loss": 0.8667, + "step": 48130 + }, + { + "epoch": 0.772725083869725, + "grad_norm": 0.7439280152320862, + "learning_rate": 3.374393407570342e-05, + "loss": 0.7771, + "step": 48140 + }, + { + "epoch": 0.772885600089889, + "grad_norm": 0.5338926911354065, + "learning_rate": 3.3738027516790185e-05, + "loss": 0.6751, + "step": 48150 + }, + { + "epoch": 0.7730461163100532, + "grad_norm": 0.7542116045951843, + "learning_rate": 3.373212040218968e-05, + "loss": 0.771, + "step": 48160 + }, + { + "epoch": 0.7732066325302172, + "grad_norm": 0.6880031228065491, + "learning_rate": 3.372621273227756e-05, + "loss": 0.8039, + "step": 48170 + }, + { + "epoch": 0.7733671487503813, + "grad_norm": 0.5812457203865051, + "learning_rate": 3.3720304507429535e-05, + "loss": 0.9254, + "step": 48180 + }, + { + "epoch": 0.7735276649705453, + "grad_norm": 0.6513689756393433, + "learning_rate": 3.371439572802133e-05, + "loss": 0.7727, + "step": 48190 + }, + { + "epoch": 0.7736881811907094, + "grad_norm": 0.7621978521347046, + "learning_rate": 3.3708486394428704e-05, + "loss": 0.7582, + "step": 48200 + }, + { + "epoch": 0.7738486974108734, + "grad_norm": 0.5978759527206421, + "learning_rate": 3.370257650702745e-05, + "loss": 0.8167, + "step": 48210 + }, + { + "epoch": 0.7740092136310374, + "grad_norm": 0.7161044478416443, + "learning_rate": 3.369666606619341e-05, + "loss": 0.8326, + "step": 48220 + }, + { + "epoch": 0.7741697298512015, + "grad_norm": 0.6701865792274475, + "learning_rate": 3.369075507230246e-05, + "loss": 0.8091, + "step": 48230 + }, + { + "epoch": 0.7743302460713655, + "grad_norm": 0.5719074606895447, + "learning_rate": 3.368484352573049e-05, + "loss": 0.8916, + "step": 48240 + }, + { + "epoch": 0.7744907622915296, + "grad_norm": 0.4965462386608124, + "learning_rate": 3.367893142685346e-05, + "loss": 0.714, + "step": 48250 + }, + { + "epoch": 0.7746512785116936, + "grad_norm": 0.6505371928215027, + "learning_rate": 3.3673018776047314e-05, + "loss": 0.7981, + "step": 48260 + }, + { + "epoch": 0.7748117947318577, + "grad_norm": 0.7749561667442322, + "learning_rate": 3.3667105573688085e-05, + "loss": 0.7757, + "step": 48270 + }, + { + "epoch": 0.7749723109520217, + "grad_norm": 0.986030101776123, + "learning_rate": 3.366119182015182e-05, + "loss": 0.7077, + "step": 48280 + }, + { + "epoch": 0.7751328271721858, + "grad_norm": 0.6506640911102295, + "learning_rate": 3.3655277515814584e-05, + "loss": 0.8677, + "step": 48290 + }, + { + "epoch": 0.7752933433923498, + "grad_norm": 1.2741812467575073, + "learning_rate": 3.36493626610525e-05, + "loss": 0.8281, + "step": 48300 + }, + { + "epoch": 0.7754538596125139, + "grad_norm": 0.8368878960609436, + "learning_rate": 3.3643447256241716e-05, + "loss": 0.6695, + "step": 48310 + }, + { + "epoch": 0.7756143758326779, + "grad_norm": 0.6210781335830688, + "learning_rate": 3.363753130175842e-05, + "loss": 0.8667, + "step": 48320 + }, + { + "epoch": 0.7757748920528419, + "grad_norm": 0.6357489824295044, + "learning_rate": 3.3631614797978826e-05, + "loss": 0.822, + "step": 48330 + }, + { + "epoch": 0.775935408273006, + "grad_norm": 0.9714656472206116, + "learning_rate": 3.362569774527921e-05, + "loss": 0.7924, + "step": 48340 + }, + { + "epoch": 0.77609592449317, + "grad_norm": 1.1117593050003052, + "learning_rate": 3.361978014403583e-05, + "loss": 0.765, + "step": 48350 + }, + { + "epoch": 0.7762564407133341, + "grad_norm": 0.6227678656578064, + "learning_rate": 3.361386199462502e-05, + "loss": 0.8868, + "step": 48360 + }, + { + "epoch": 0.7764169569334981, + "grad_norm": 0.6003962755203247, + "learning_rate": 3.360794329742316e-05, + "loss": 0.8793, + "step": 48370 + }, + { + "epoch": 0.7765774731536622, + "grad_norm": 0.9226481914520264, + "learning_rate": 3.360202405280662e-05, + "loss": 0.6716, + "step": 48380 + }, + { + "epoch": 0.7767379893738262, + "grad_norm": 0.936410129070282, + "learning_rate": 3.3596104261151844e-05, + "loss": 0.7517, + "step": 48390 + }, + { + "epoch": 0.7768985055939903, + "grad_norm": 1.2976566553115845, + "learning_rate": 3.35901839228353e-05, + "loss": 0.9184, + "step": 48400 + }, + { + "epoch": 0.7770590218141543, + "grad_norm": 0.5343047380447388, + "learning_rate": 3.358426303823346e-05, + "loss": 0.6704, + "step": 48410 + }, + { + "epoch": 0.7772195380343183, + "grad_norm": 0.6263543367385864, + "learning_rate": 3.3578341607722886e-05, + "loss": 0.647, + "step": 48420 + }, + { + "epoch": 0.7773800542544824, + "grad_norm": 0.599520742893219, + "learning_rate": 3.357241963168013e-05, + "loss": 0.7914, + "step": 48430 + }, + { + "epoch": 0.7775405704746464, + "grad_norm": 0.7442195415496826, + "learning_rate": 3.3566497110481804e-05, + "loss": 0.772, + "step": 48440 + }, + { + "epoch": 0.7777010866948105, + "grad_norm": 1.007751703262329, + "learning_rate": 3.356057404450454e-05, + "loss": 0.7683, + "step": 48450 + }, + { + "epoch": 0.7778616029149745, + "grad_norm": 0.5933817625045776, + "learning_rate": 3.3554650434125015e-05, + "loss": 0.8279, + "step": 48460 + }, + { + "epoch": 0.7780221191351386, + "grad_norm": 0.8827105760574341, + "learning_rate": 3.3548726279719936e-05, + "loss": 0.8182, + "step": 48470 + }, + { + "epoch": 0.7781826353553026, + "grad_norm": 0.8385125994682312, + "learning_rate": 3.3542801581666025e-05, + "loss": 0.6918, + "step": 48480 + }, + { + "epoch": 0.7783431515754667, + "grad_norm": 1.428069829940796, + "learning_rate": 3.353687634034009e-05, + "loss": 0.7816, + "step": 48490 + }, + { + "epoch": 0.7785036677956307, + "grad_norm": 0.7750338912010193, + "learning_rate": 3.3530950556118924e-05, + "loss": 0.7844, + "step": 48500 + }, + { + "epoch": 0.7786641840157948, + "grad_norm": 0.9843417406082153, + "learning_rate": 3.3525024229379376e-05, + "loss": 0.7934, + "step": 48510 + }, + { + "epoch": 0.7788247002359588, + "grad_norm": 1.0834964513778687, + "learning_rate": 3.351909736049832e-05, + "loss": 0.785, + "step": 48520 + }, + { + "epoch": 0.7789852164561228, + "grad_norm": 0.6438849568367004, + "learning_rate": 3.351316994985267e-05, + "loss": 0.8262, + "step": 48530 + }, + { + "epoch": 0.779145732676287, + "grad_norm": 1.382948398590088, + "learning_rate": 3.350724199781938e-05, + "loss": 0.7889, + "step": 48540 + }, + { + "epoch": 0.779306248896451, + "grad_norm": 0.648024320602417, + "learning_rate": 3.3501313504775434e-05, + "loss": 0.8194, + "step": 48550 + }, + { + "epoch": 0.7794667651166151, + "grad_norm": 1.3443875312805176, + "learning_rate": 3.349538447109785e-05, + "loss": 0.9589, + "step": 48560 + }, + { + "epoch": 0.779627281336779, + "grad_norm": 1.2242995500564575, + "learning_rate": 3.348945489716368e-05, + "loss": 0.8295, + "step": 48570 + }, + { + "epoch": 0.7797877975569432, + "grad_norm": 0.6267728805541992, + "learning_rate": 3.348352478334999e-05, + "loss": 0.7023, + "step": 48580 + }, + { + "epoch": 0.7799483137771072, + "grad_norm": 1.1494019031524658, + "learning_rate": 3.3477594130033926e-05, + "loss": 0.7977, + "step": 48590 + }, + { + "epoch": 0.7801088299972713, + "grad_norm": 0.5921040177345276, + "learning_rate": 3.347166293759264e-05, + "loss": 0.8832, + "step": 48600 + }, + { + "epoch": 0.7802693462174353, + "grad_norm": 0.7601976990699768, + "learning_rate": 3.346573120640331e-05, + "loss": 0.7803, + "step": 48610 + }, + { + "epoch": 0.7804298624375993, + "grad_norm": 0.6137930750846863, + "learning_rate": 3.345979893684317e-05, + "loss": 0.7074, + "step": 48620 + }, + { + "epoch": 0.7805903786577634, + "grad_norm": 0.5665217041969299, + "learning_rate": 3.345386612928946e-05, + "loss": 0.8104, + "step": 48630 + }, + { + "epoch": 0.7807508948779274, + "grad_norm": 0.7306960225105286, + "learning_rate": 3.344793278411948e-05, + "loss": 0.8018, + "step": 48640 + }, + { + "epoch": 0.7809114110980915, + "grad_norm": 1.26497220993042, + "learning_rate": 3.3441998901710575e-05, + "loss": 0.8353, + "step": 48650 + }, + { + "epoch": 0.7810719273182555, + "grad_norm": 0.7014787793159485, + "learning_rate": 3.343606448244008e-05, + "loss": 0.8479, + "step": 48660 + }, + { + "epoch": 0.7812324435384196, + "grad_norm": 0.8438863158226013, + "learning_rate": 3.343012952668541e-05, + "loss": 0.7309, + "step": 48670 + }, + { + "epoch": 0.7813929597585836, + "grad_norm": 1.0472692251205444, + "learning_rate": 3.342419403482397e-05, + "loss": 0.6615, + "step": 48680 + }, + { + "epoch": 0.7815534759787477, + "grad_norm": 0.5033907294273376, + "learning_rate": 3.341825800723323e-05, + "loss": 0.639, + "step": 48690 + }, + { + "epoch": 0.7817139921989117, + "grad_norm": 0.8818095922470093, + "learning_rate": 3.34123214442907e-05, + "loss": 0.8898, + "step": 48700 + }, + { + "epoch": 0.7818745084190758, + "grad_norm": 0.9343175292015076, + "learning_rate": 3.3406384346373906e-05, + "loss": 0.6903, + "step": 48710 + }, + { + "epoch": 0.7820350246392398, + "grad_norm": 0.7714505791664124, + "learning_rate": 3.34004467138604e-05, + "loss": 0.7799, + "step": 48720 + }, + { + "epoch": 0.7821955408594038, + "grad_norm": 0.7340734004974365, + "learning_rate": 3.339450854712779e-05, + "loss": 0.7685, + "step": 48730 + }, + { + "epoch": 0.7823560570795679, + "grad_norm": 1.09711492061615, + "learning_rate": 3.3388569846553705e-05, + "loss": 0.7673, + "step": 48740 + }, + { + "epoch": 0.7825165732997319, + "grad_norm": 0.7810465693473816, + "learning_rate": 3.338263061251582e-05, + "loss": 0.8452, + "step": 48750 + }, + { + "epoch": 0.782677089519896, + "grad_norm": 0.46729686856269836, + "learning_rate": 3.3376690845391825e-05, + "loss": 0.86, + "step": 48760 + }, + { + "epoch": 0.78283760574006, + "grad_norm": 0.6776688694953918, + "learning_rate": 3.337075054555945e-05, + "loss": 0.6717, + "step": 48770 + }, + { + "epoch": 0.7829981219602241, + "grad_norm": 1.0231400728225708, + "learning_rate": 3.3364809713396485e-05, + "loss": 0.8158, + "step": 48780 + }, + { + "epoch": 0.7831586381803881, + "grad_norm": 0.6587131023406982, + "learning_rate": 3.335886834928071e-05, + "loss": 0.7753, + "step": 48790 + }, + { + "epoch": 0.7833191544005522, + "grad_norm": 2.879744529724121, + "learning_rate": 3.335292645358998e-05, + "loss": 0.737, + "step": 48800 + }, + { + "epoch": 0.7834796706207162, + "grad_norm": 0.7281743884086609, + "learning_rate": 3.3346984026702145e-05, + "loss": 0.7316, + "step": 48810 + }, + { + "epoch": 0.7836401868408803, + "grad_norm": 0.8483946323394775, + "learning_rate": 3.334104106899512e-05, + "loss": 0.6952, + "step": 48820 + }, + { + "epoch": 0.7838007030610443, + "grad_norm": 0.8171595335006714, + "learning_rate": 3.3335097580846844e-05, + "loss": 0.826, + "step": 48830 + }, + { + "epoch": 0.7839612192812083, + "grad_norm": 0.9978684186935425, + "learning_rate": 3.3329153562635276e-05, + "loss": 0.729, + "step": 48840 + }, + { + "epoch": 0.7841217355013724, + "grad_norm": 0.885200560092926, + "learning_rate": 3.332320901473843e-05, + "loss": 0.7205, + "step": 48850 + }, + { + "epoch": 0.7842822517215364, + "grad_norm": 0.9627339243888855, + "learning_rate": 3.3317263937534344e-05, + "loss": 0.818, + "step": 48860 + }, + { + "epoch": 0.7844427679417005, + "grad_norm": 0.8299519419670105, + "learning_rate": 3.3311318331401095e-05, + "loss": 0.829, + "step": 48870 + }, + { + "epoch": 0.7846032841618645, + "grad_norm": 0.6129348278045654, + "learning_rate": 3.330537219671678e-05, + "loss": 0.7367, + "step": 48880 + }, + { + "epoch": 0.7847638003820286, + "grad_norm": 0.33320364356040955, + "learning_rate": 3.3299425533859534e-05, + "loss": 0.7406, + "step": 48890 + }, + { + "epoch": 0.7849243166021926, + "grad_norm": 0.825352668762207, + "learning_rate": 3.329347834320755e-05, + "loss": 0.5889, + "step": 48900 + }, + { + "epoch": 0.7850848328223567, + "grad_norm": 0.7550395727157593, + "learning_rate": 3.328753062513901e-05, + "loss": 0.7279, + "step": 48910 + }, + { + "epoch": 0.7852453490425207, + "grad_norm": 0.5624242424964905, + "learning_rate": 3.328158238003216e-05, + "loss": 0.7366, + "step": 48920 + }, + { + "epoch": 0.7854058652626847, + "grad_norm": 0.857441246509552, + "learning_rate": 3.32756336082653e-05, + "loss": 0.8647, + "step": 48930 + }, + { + "epoch": 0.7855663814828489, + "grad_norm": 0.7675848603248596, + "learning_rate": 3.32696843102167e-05, + "loss": 0.847, + "step": 48940 + }, + { + "epoch": 0.7857268977030128, + "grad_norm": 0.6878885626792908, + "learning_rate": 3.326373448626471e-05, + "loss": 0.7857, + "step": 48950 + }, + { + "epoch": 0.785887413923177, + "grad_norm": 0.836056113243103, + "learning_rate": 3.3257784136787725e-05, + "loss": 0.8272, + "step": 48960 + }, + { + "epoch": 0.786047930143341, + "grad_norm": 0.7798150181770325, + "learning_rate": 3.325183326216413e-05, + "loss": 0.7938, + "step": 48970 + }, + { + "epoch": 0.7862084463635051, + "grad_norm": 1.21633780002594, + "learning_rate": 3.324588186277237e-05, + "loss": 0.7953, + "step": 48980 + }, + { + "epoch": 0.7863689625836691, + "grad_norm": 0.7611923217773438, + "learning_rate": 3.323992993899093e-05, + "loss": 0.7669, + "step": 48990 + }, + { + "epoch": 0.7865294788038332, + "grad_norm": 0.9425803422927856, + "learning_rate": 3.32339774911983e-05, + "loss": 0.7408, + "step": 49000 + }, + { + "epoch": 0.7866899950239972, + "grad_norm": 0.9062397480010986, + "learning_rate": 3.3228024519773033e-05, + "loss": 0.75, + "step": 49010 + }, + { + "epoch": 0.7868505112441613, + "grad_norm": 1.0354200601577759, + "learning_rate": 3.3222071025093706e-05, + "loss": 0.7607, + "step": 49020 + }, + { + "epoch": 0.7870110274643253, + "grad_norm": 0.8675691485404968, + "learning_rate": 3.3216117007538914e-05, + "loss": 0.8599, + "step": 49030 + }, + { + "epoch": 0.7871715436844893, + "grad_norm": 0.5059550404548645, + "learning_rate": 3.321016246748731e-05, + "loss": 0.8298, + "step": 49040 + }, + { + "epoch": 0.7873320599046534, + "grad_norm": 0.6259164810180664, + "learning_rate": 3.320420740531755e-05, + "loss": 0.8329, + "step": 49050 + }, + { + "epoch": 0.7874925761248174, + "grad_norm": 1.067480206489563, + "learning_rate": 3.319825182140837e-05, + "loss": 0.8985, + "step": 49060 + }, + { + "epoch": 0.7876530923449815, + "grad_norm": 1.212033987045288, + "learning_rate": 3.3192295716138476e-05, + "loss": 0.6749, + "step": 49070 + }, + { + "epoch": 0.7878136085651455, + "grad_norm": 0.7801744341850281, + "learning_rate": 3.318633908988667e-05, + "loss": 0.8021, + "step": 49080 + }, + { + "epoch": 0.7879741247853096, + "grad_norm": 2.7633697986602783, + "learning_rate": 3.3180381943031744e-05, + "loss": 0.8147, + "step": 49090 + }, + { + "epoch": 0.7881346410054736, + "grad_norm": 1.2810304164886475, + "learning_rate": 3.3174424275952544e-05, + "loss": 0.8384, + "step": 49100 + }, + { + "epoch": 0.7882951572256377, + "grad_norm": 0.709423840045929, + "learning_rate": 3.3168466089027926e-05, + "loss": 0.7769, + "step": 49110 + }, + { + "epoch": 0.7884556734458017, + "grad_norm": 0.7753222584724426, + "learning_rate": 3.316250738263682e-05, + "loss": 0.8285, + "step": 49120 + }, + { + "epoch": 0.7886161896659657, + "grad_norm": 0.9772328734397888, + "learning_rate": 3.315654815715815e-05, + "loss": 0.8153, + "step": 49130 + }, + { + "epoch": 0.7887767058861298, + "grad_norm": 0.5231125354766846, + "learning_rate": 3.3150588412970894e-05, + "loss": 0.8902, + "step": 49140 + }, + { + "epoch": 0.7889372221062938, + "grad_norm": 0.7537075877189636, + "learning_rate": 3.314462815045405e-05, + "loss": 0.9189, + "step": 49150 + }, + { + "epoch": 0.7890977383264579, + "grad_norm": 0.7542588114738464, + "learning_rate": 3.313866736998666e-05, + "loss": 0.7498, + "step": 49160 + }, + { + "epoch": 0.7892582545466219, + "grad_norm": 0.9031575918197632, + "learning_rate": 3.31327060719478e-05, + "loss": 0.7282, + "step": 49170 + }, + { + "epoch": 0.789418770766786, + "grad_norm": 0.674235999584198, + "learning_rate": 3.312674425671656e-05, + "loss": 0.7928, + "step": 49180 + }, + { + "epoch": 0.78957928698695, + "grad_norm": 0.5677818059921265, + "learning_rate": 3.3120781924672087e-05, + "loss": 0.6653, + "step": 49190 + }, + { + "epoch": 0.7897398032071141, + "grad_norm": 0.655698835849762, + "learning_rate": 3.311481907619355e-05, + "loss": 0.8058, + "step": 49200 + }, + { + "epoch": 0.7899003194272781, + "grad_norm": 0.6876958012580872, + "learning_rate": 3.310885571166015e-05, + "loss": 0.8793, + "step": 49210 + }, + { + "epoch": 0.7900608356474422, + "grad_norm": 0.8955802321434021, + "learning_rate": 3.310289183145112e-05, + "loss": 0.7763, + "step": 49220 + }, + { + "epoch": 0.7902213518676062, + "grad_norm": 0.9151268005371094, + "learning_rate": 3.3096927435945725e-05, + "loss": 0.6935, + "step": 49230 + }, + { + "epoch": 0.7903818680877702, + "grad_norm": 1.269410490989685, + "learning_rate": 3.309096252552328e-05, + "loss": 0.685, + "step": 49240 + }, + { + "epoch": 0.7905423843079343, + "grad_norm": 0.6994181871414185, + "learning_rate": 3.3084997100563097e-05, + "loss": 0.776, + "step": 49250 + }, + { + "epoch": 0.7907029005280983, + "grad_norm": 1.1905794143676758, + "learning_rate": 3.307903116144455e-05, + "loss": 0.7573, + "step": 49260 + }, + { + "epoch": 0.7908634167482624, + "grad_norm": 0.8029124736785889, + "learning_rate": 3.3073064708547047e-05, + "loss": 0.7442, + "step": 49270 + }, + { + "epoch": 0.7910239329684264, + "grad_norm": 0.9430586099624634, + "learning_rate": 3.306709774225002e-05, + "loss": 0.7549, + "step": 49280 + }, + { + "epoch": 0.7911844491885905, + "grad_norm": 0.7225251197814941, + "learning_rate": 3.306113026293291e-05, + "loss": 0.7614, + "step": 49290 + }, + { + "epoch": 0.7913449654087545, + "grad_norm": 1.1334139108657837, + "learning_rate": 3.305516227097523e-05, + "loss": 0.6389, + "step": 49300 + }, + { + "epoch": 0.7915054816289187, + "grad_norm": 0.8750320076942444, + "learning_rate": 3.304919376675652e-05, + "loss": 0.6914, + "step": 49310 + }, + { + "epoch": 0.7916659978490826, + "grad_norm": 0.7175473570823669, + "learning_rate": 3.304322475065633e-05, + "loss": 0.7772, + "step": 49320 + }, + { + "epoch": 0.7918265140692466, + "grad_norm": 1.176600694656372, + "learning_rate": 3.303725522305424e-05, + "loss": 0.7632, + "step": 49330 + }, + { + "epoch": 0.7919870302894108, + "grad_norm": 0.44781047105789185, + "learning_rate": 3.303128518432991e-05, + "loss": 0.7483, + "step": 49340 + }, + { + "epoch": 0.7921475465095748, + "grad_norm": 1.24093759059906, + "learning_rate": 3.3025314634862966e-05, + "loss": 0.7166, + "step": 49350 + }, + { + "epoch": 0.7923080627297389, + "grad_norm": 0.692228376865387, + "learning_rate": 3.3019343575033126e-05, + "loss": 0.605, + "step": 49360 + }, + { + "epoch": 0.7924685789499029, + "grad_norm": 0.8563541173934937, + "learning_rate": 3.30133720052201e-05, + "loss": 0.7657, + "step": 49370 + }, + { + "epoch": 0.792629095170067, + "grad_norm": 0.8620800971984863, + "learning_rate": 3.3007399925803646e-05, + "loss": 0.7789, + "step": 49380 + }, + { + "epoch": 0.792789611390231, + "grad_norm": 0.5879426598548889, + "learning_rate": 3.300142733716355e-05, + "loss": 0.8397, + "step": 49390 + }, + { + "epoch": 0.7929501276103951, + "grad_norm": 0.6811913251876831, + "learning_rate": 3.2995454239679655e-05, + "loss": 0.7093, + "step": 49400 + }, + { + "epoch": 0.7931106438305591, + "grad_norm": 0.7622894644737244, + "learning_rate": 3.298948063373178e-05, + "loss": 0.7527, + "step": 49410 + }, + { + "epoch": 0.7932711600507232, + "grad_norm": 0.7467905282974243, + "learning_rate": 3.2983506519699846e-05, + "loss": 0.825, + "step": 49420 + }, + { + "epoch": 0.7934316762708872, + "grad_norm": 1.0118968486785889, + "learning_rate": 3.297753189796375e-05, + "loss": 0.6144, + "step": 49430 + }, + { + "epoch": 0.7935921924910512, + "grad_norm": 1.2320278882980347, + "learning_rate": 3.297155676890345e-05, + "loss": 0.8781, + "step": 49440 + }, + { + "epoch": 0.7937527087112153, + "grad_norm": 0.8042446970939636, + "learning_rate": 3.2965581132898924e-05, + "loss": 0.6792, + "step": 49450 + }, + { + "epoch": 0.7939132249313793, + "grad_norm": 0.9557898640632629, + "learning_rate": 3.295960499033019e-05, + "loss": 0.7794, + "step": 49460 + }, + { + "epoch": 0.7940737411515434, + "grad_norm": 0.7610619068145752, + "learning_rate": 3.29536283415773e-05, + "loss": 0.7547, + "step": 49470 + }, + { + "epoch": 0.7942342573717074, + "grad_norm": 0.7949659824371338, + "learning_rate": 3.294765118702033e-05, + "loss": 0.8565, + "step": 49480 + }, + { + "epoch": 0.7943947735918715, + "grad_norm": 1.2041778564453125, + "learning_rate": 3.294167352703939e-05, + "loss": 0.9644, + "step": 49490 + }, + { + "epoch": 0.7945552898120355, + "grad_norm": 0.6526481509208679, + "learning_rate": 3.293569536201463e-05, + "loss": 0.775, + "step": 49500 + }, + { + "epoch": 0.7947158060321996, + "grad_norm": 2.1473042964935303, + "learning_rate": 3.292971669232622e-05, + "loss": 0.6686, + "step": 49510 + }, + { + "epoch": 0.7948763222523636, + "grad_norm": 1.46681809425354, + "learning_rate": 3.292373751835438e-05, + "loss": 0.7907, + "step": 49520 + }, + { + "epoch": 0.7950368384725276, + "grad_norm": 0.7248761653900146, + "learning_rate": 3.291775784047933e-05, + "loss": 0.8656, + "step": 49530 + }, + { + "epoch": 0.7951973546926917, + "grad_norm": 0.9483925700187683, + "learning_rate": 3.291177765908136e-05, + "loss": 0.7526, + "step": 49540 + }, + { + "epoch": 0.7953578709128557, + "grad_norm": 1.2585062980651855, + "learning_rate": 3.290579697454076e-05, + "loss": 0.8099, + "step": 49550 + }, + { + "epoch": 0.7955183871330198, + "grad_norm": 0.7309524416923523, + "learning_rate": 3.289981578723789e-05, + "loss": 0.6967, + "step": 49560 + }, + { + "epoch": 0.7956789033531838, + "grad_norm": 1.0352766513824463, + "learning_rate": 3.2893834097553096e-05, + "loss": 0.7208, + "step": 49570 + }, + { + "epoch": 0.7958394195733479, + "grad_norm": 0.7852630615234375, + "learning_rate": 3.288785190586678e-05, + "loss": 0.812, + "step": 49580 + }, + { + "epoch": 0.7959999357935119, + "grad_norm": 0.766288697719574, + "learning_rate": 3.288186921255939e-05, + "loss": 0.7497, + "step": 49590 + }, + { + "epoch": 0.796160452013676, + "grad_norm": 0.8681581020355225, + "learning_rate": 3.287588601801137e-05, + "loss": 0.8465, + "step": 49600 + }, + { + "epoch": 0.79632096823384, + "grad_norm": 0.6631078124046326, + "learning_rate": 3.286990232260323e-05, + "loss": 0.8098, + "step": 49610 + }, + { + "epoch": 0.7964814844540041, + "grad_norm": 0.6838750243186951, + "learning_rate": 3.28639181267155e-05, + "loss": 0.8467, + "step": 49620 + }, + { + "epoch": 0.7966420006741681, + "grad_norm": 0.5785457491874695, + "learning_rate": 3.285793343072872e-05, + "loss": 0.8606, + "step": 49630 + }, + { + "epoch": 0.7968025168943321, + "grad_norm": 0.7843804955482483, + "learning_rate": 3.28519482350235e-05, + "loss": 0.7222, + "step": 49640 + }, + { + "epoch": 0.7969630331144962, + "grad_norm": 0.5576416850090027, + "learning_rate": 3.284596253998047e-05, + "loss": 0.8248, + "step": 49650 + }, + { + "epoch": 0.7971235493346602, + "grad_norm": 0.9994361400604248, + "learning_rate": 3.283997634598026e-05, + "loss": 0.7181, + "step": 49660 + }, + { + "epoch": 0.7972840655548243, + "grad_norm": 0.713901162147522, + "learning_rate": 3.2833989653403575e-05, + "loss": 0.8138, + "step": 49670 + }, + { + "epoch": 0.7974445817749883, + "grad_norm": 0.8924233317375183, + "learning_rate": 3.282800246263114e-05, + "loss": 0.789, + "step": 49680 + }, + { + "epoch": 0.7976050979951524, + "grad_norm": 0.6702607870101929, + "learning_rate": 3.2822014774043674e-05, + "loss": 0.7055, + "step": 49690 + }, + { + "epoch": 0.7977656142153164, + "grad_norm": 1.0954357385635376, + "learning_rate": 3.2816026588021985e-05, + "loss": 0.8596, + "step": 49700 + }, + { + "epoch": 0.7979261304354806, + "grad_norm": 1.011792540550232, + "learning_rate": 3.281003790494689e-05, + "loss": 0.7527, + "step": 49710 + }, + { + "epoch": 0.7980866466556445, + "grad_norm": 0.6294679641723633, + "learning_rate": 3.2804048725199215e-05, + "loss": 0.8143, + "step": 49720 + }, + { + "epoch": 0.7982471628758085, + "grad_norm": 0.6909133791923523, + "learning_rate": 3.279805904915985e-05, + "loss": 0.6789, + "step": 49730 + }, + { + "epoch": 0.7984076790959727, + "grad_norm": 0.6403861045837402, + "learning_rate": 3.2792068877209695e-05, + "loss": 0.8984, + "step": 49740 + }, + { + "epoch": 0.7985681953161367, + "grad_norm": 0.9736300110816956, + "learning_rate": 3.27860782097297e-05, + "loss": 0.8365, + "step": 49750 + }, + { + "epoch": 0.7987287115363008, + "grad_norm": 0.9431784152984619, + "learning_rate": 3.2780087047100825e-05, + "loss": 0.7904, + "step": 49760 + }, + { + "epoch": 0.7988892277564648, + "grad_norm": 0.90958172082901, + "learning_rate": 3.2774095389704085e-05, + "loss": 0.7358, + "step": 49770 + }, + { + "epoch": 0.7990497439766289, + "grad_norm": 0.8742929100990295, + "learning_rate": 3.27681032379205e-05, + "loss": 0.8961, + "step": 49780 + }, + { + "epoch": 0.7992102601967929, + "grad_norm": 0.7339177131652832, + "learning_rate": 3.2762110592131145e-05, + "loss": 0.8055, + "step": 49790 + }, + { + "epoch": 0.799370776416957, + "grad_norm": 0.6489745378494263, + "learning_rate": 3.275611745271713e-05, + "loss": 0.6855, + "step": 49800 + }, + { + "epoch": 0.799531292637121, + "grad_norm": 0.9870871901512146, + "learning_rate": 3.2750123820059555e-05, + "loss": 0.756, + "step": 49810 + }, + { + "epoch": 0.7996918088572851, + "grad_norm": 0.47257184982299805, + "learning_rate": 3.274412969453959e-05, + "loss": 0.8373, + "step": 49820 + }, + { + "epoch": 0.7998523250774491, + "grad_norm": 0.9797194600105286, + "learning_rate": 3.273813507653843e-05, + "loss": 0.7767, + "step": 49830 + }, + { + "epoch": 0.8000128412976131, + "grad_norm": 0.8997868299484253, + "learning_rate": 3.273213996643731e-05, + "loss": 0.7676, + "step": 49840 + }, + { + "epoch": 0.8001733575177772, + "grad_norm": 1.0165690183639526, + "learning_rate": 3.272614436461746e-05, + "loss": 0.6784, + "step": 49850 + }, + { + "epoch": 0.8003338737379412, + "grad_norm": 0.8312771320343018, + "learning_rate": 3.272014827146018e-05, + "loss": 0.6294, + "step": 49860 + }, + { + "epoch": 0.8004943899581053, + "grad_norm": 1.2306082248687744, + "learning_rate": 3.2714151687346784e-05, + "loss": 0.6946, + "step": 49870 + }, + { + "epoch": 0.8006549061782693, + "grad_norm": 0.7512449622154236, + "learning_rate": 3.270815461265861e-05, + "loss": 0.7476, + "step": 49880 + }, + { + "epoch": 0.8008154223984334, + "grad_norm": 0.8751206994056702, + "learning_rate": 3.270215704777705e-05, + "loss": 0.6678, + "step": 49890 + }, + { + "epoch": 0.8009759386185974, + "grad_norm": 0.8564692735671997, + "learning_rate": 3.269615899308352e-05, + "loss": 0.6845, + "step": 49900 + }, + { + "epoch": 0.8011364548387615, + "grad_norm": 0.8914405107498169, + "learning_rate": 3.269016044895944e-05, + "loss": 0.8694, + "step": 49910 + }, + { + "epoch": 0.8012969710589255, + "grad_norm": 1.3982877731323242, + "learning_rate": 3.268416141578629e-05, + "loss": 0.832, + "step": 49920 + }, + { + "epoch": 0.8014574872790895, + "grad_norm": 1.2443989515304565, + "learning_rate": 3.267816189394558e-05, + "loss": 0.9046, + "step": 49930 + }, + { + "epoch": 0.8016180034992536, + "grad_norm": 1.2671538591384888, + "learning_rate": 3.267216188381884e-05, + "loss": 0.9234, + "step": 49940 + }, + { + "epoch": 0.8017785197194176, + "grad_norm": 0.6819776892662048, + "learning_rate": 3.266616138578763e-05, + "loss": 0.6314, + "step": 49950 + }, + { + "epoch": 0.8019390359395817, + "grad_norm": 0.8981294631958008, + "learning_rate": 3.266016040023356e-05, + "loss": 0.7862, + "step": 49960 + }, + { + "epoch": 0.8020995521597457, + "grad_norm": 0.7878396511077881, + "learning_rate": 3.265415892753823e-05, + "loss": 0.8189, + "step": 49970 + }, + { + "epoch": 0.8022600683799098, + "grad_norm": 0.6790988445281982, + "learning_rate": 3.264815696808334e-05, + "loss": 0.7709, + "step": 49980 + }, + { + "epoch": 0.8024205846000738, + "grad_norm": 0.5871779322624207, + "learning_rate": 3.264215452225055e-05, + "loss": 0.8191, + "step": 49990 + }, + { + "epoch": 0.8025811008202379, + "grad_norm": 0.8104393482208252, + "learning_rate": 3.263615159042158e-05, + "loss": 0.6862, + "step": 50000 + }, + { + "epoch": 0.8027416170404019, + "grad_norm": 0.8035295009613037, + "learning_rate": 3.26301481729782e-05, + "loss": 0.7046, + "step": 50010 + }, + { + "epoch": 0.802902133260566, + "grad_norm": 1.1499499082565308, + "learning_rate": 3.262414427030218e-05, + "loss": 0.7602, + "step": 50020 + }, + { + "epoch": 0.80306264948073, + "grad_norm": 0.7172354459762573, + "learning_rate": 3.2618139882775324e-05, + "loss": 0.769, + "step": 50030 + }, + { + "epoch": 0.803223165700894, + "grad_norm": 0.4611945152282715, + "learning_rate": 3.2612135010779485e-05, + "loss": 0.7803, + "step": 50040 + }, + { + "epoch": 0.8033836819210581, + "grad_norm": 0.6208071708679199, + "learning_rate": 3.260612965469654e-05, + "loss": 0.9087, + "step": 50050 + }, + { + "epoch": 0.8035441981412221, + "grad_norm": 0.5292310118675232, + "learning_rate": 3.260012381490839e-05, + "loss": 0.8074, + "step": 50060 + }, + { + "epoch": 0.8037047143613862, + "grad_norm": 0.6459535956382751, + "learning_rate": 3.2594117491796985e-05, + "loss": 0.9184, + "step": 50070 + }, + { + "epoch": 0.8038652305815502, + "grad_norm": 0.7298881411552429, + "learning_rate": 3.258811068574427e-05, + "loss": 0.7386, + "step": 50080 + }, + { + "epoch": 0.8040257468017143, + "grad_norm": 0.741271436214447, + "learning_rate": 3.258210339713226e-05, + "loss": 0.7559, + "step": 50090 + }, + { + "epoch": 0.8041862630218783, + "grad_norm": 0.7022154331207275, + "learning_rate": 3.257609562634297e-05, + "loss": 0.7442, + "step": 50100 + }, + { + "epoch": 0.8043467792420425, + "grad_norm": 0.6415762901306152, + "learning_rate": 3.2570087373758465e-05, + "loss": 0.7286, + "step": 50110 + }, + { + "epoch": 0.8045072954622065, + "grad_norm": 1.0236841440200806, + "learning_rate": 3.256407863976084e-05, + "loss": 0.8117, + "step": 50120 + }, + { + "epoch": 0.8046678116823704, + "grad_norm": 0.6728070974349976, + "learning_rate": 3.2558069424732214e-05, + "loss": 0.8061, + "step": 50130 + }, + { + "epoch": 0.8048283279025346, + "grad_norm": 0.7487862706184387, + "learning_rate": 3.2552059729054726e-05, + "loss": 0.7685, + "step": 50140 + }, + { + "epoch": 0.8049888441226986, + "grad_norm": 1.0607112646102905, + "learning_rate": 3.254604955311057e-05, + "loss": 0.7755, + "step": 50150 + }, + { + "epoch": 0.8051493603428627, + "grad_norm": 0.8897578120231628, + "learning_rate": 3.254003889728195e-05, + "loss": 0.7872, + "step": 50160 + }, + { + "epoch": 0.8053098765630267, + "grad_norm": 0.7848422527313232, + "learning_rate": 3.2534027761951115e-05, + "loss": 0.7735, + "step": 50170 + }, + { + "epoch": 0.8054703927831908, + "grad_norm": 1.4168310165405273, + "learning_rate": 3.252801614750033e-05, + "loss": 0.7334, + "step": 50180 + }, + { + "epoch": 0.8056309090033548, + "grad_norm": 0.6005171537399292, + "learning_rate": 3.2522004054311904e-05, + "loss": 0.7466, + "step": 50190 + }, + { + "epoch": 0.8057914252235189, + "grad_norm": 1.1018625497817993, + "learning_rate": 3.251599148276817e-05, + "loss": 0.7442, + "step": 50200 + }, + { + "epoch": 0.8059519414436829, + "grad_norm": 0.6615254282951355, + "learning_rate": 3.2509978433251496e-05, + "loss": 0.7511, + "step": 50210 + }, + { + "epoch": 0.806112457663847, + "grad_norm": 0.655268669128418, + "learning_rate": 3.2503964906144274e-05, + "loss": 0.7287, + "step": 50220 + }, + { + "epoch": 0.806272973884011, + "grad_norm": 0.9221110343933105, + "learning_rate": 3.249795090182892e-05, + "loss": 0.7929, + "step": 50230 + }, + { + "epoch": 0.806433490104175, + "grad_norm": 0.7853561043739319, + "learning_rate": 3.249193642068791e-05, + "loss": 0.824, + "step": 50240 + }, + { + "epoch": 0.8065940063243391, + "grad_norm": 0.9273967146873474, + "learning_rate": 3.24859214631037e-05, + "loss": 0.7971, + "step": 50250 + }, + { + "epoch": 0.8067545225445031, + "grad_norm": 0.8964365720748901, + "learning_rate": 3.2479906029458834e-05, + "loss": 0.745, + "step": 50260 + }, + { + "epoch": 0.8069150387646672, + "grad_norm": 0.8761554956436157, + "learning_rate": 3.247389012013585e-05, + "loss": 0.7472, + "step": 50270 + }, + { + "epoch": 0.8070755549848312, + "grad_norm": 2.570111036300659, + "learning_rate": 3.246787373551731e-05, + "loss": 0.6907, + "step": 50280 + }, + { + "epoch": 0.8072360712049953, + "grad_norm": 0.633938729763031, + "learning_rate": 3.2461856875985836e-05, + "loss": 0.7053, + "step": 50290 + }, + { + "epoch": 0.8073965874251593, + "grad_norm": 0.8151563405990601, + "learning_rate": 3.245583954192407e-05, + "loss": 0.758, + "step": 50300 + }, + { + "epoch": 0.8075571036453234, + "grad_norm": 0.5890304446220398, + "learning_rate": 3.244982173371466e-05, + "loss": 0.7335, + "step": 50310 + }, + { + "epoch": 0.8077176198654874, + "grad_norm": 1.1307809352874756, + "learning_rate": 3.244380345174032e-05, + "loss": 0.784, + "step": 50320 + }, + { + "epoch": 0.8078781360856515, + "grad_norm": 0.9896979331970215, + "learning_rate": 3.2437784696383766e-05, + "loss": 0.8391, + "step": 50330 + }, + { + "epoch": 0.8080386523058155, + "grad_norm": 1.642120361328125, + "learning_rate": 3.2431765468027765e-05, + "loss": 0.7232, + "step": 50340 + }, + { + "epoch": 0.8081991685259795, + "grad_norm": 0.819875180721283, + "learning_rate": 3.2425745767055094e-05, + "loss": 0.8078, + "step": 50350 + }, + { + "epoch": 0.8083596847461436, + "grad_norm": 0.8612938523292542, + "learning_rate": 3.2419725593848584e-05, + "loss": 0.7725, + "step": 50360 + }, + { + "epoch": 0.8085202009663076, + "grad_norm": 0.4634527266025543, + "learning_rate": 3.241370494879107e-05, + "loss": 0.7323, + "step": 50370 + }, + { + "epoch": 0.8086807171864717, + "grad_norm": 0.9854872226715088, + "learning_rate": 3.240768383226545e-05, + "loss": 0.858, + "step": 50380 + }, + { + "epoch": 0.8088412334066357, + "grad_norm": 0.7229140996932983, + "learning_rate": 3.24016622446546e-05, + "loss": 0.852, + "step": 50390 + }, + { + "epoch": 0.8090017496267998, + "grad_norm": 0.9027138352394104, + "learning_rate": 3.2395640186341476e-05, + "loss": 0.8249, + "step": 50400 + }, + { + "epoch": 0.8091622658469638, + "grad_norm": 0.737303614616394, + "learning_rate": 3.238961765770905e-05, + "loss": 0.7441, + "step": 50410 + }, + { + "epoch": 0.8093227820671279, + "grad_norm": 0.5994957685470581, + "learning_rate": 3.238359465914031e-05, + "loss": 0.7887, + "step": 50420 + }, + { + "epoch": 0.8094832982872919, + "grad_norm": 0.6626923084259033, + "learning_rate": 3.237757119101828e-05, + "loss": 0.8445, + "step": 50430 + }, + { + "epoch": 0.8096438145074559, + "grad_norm": 0.6264437437057495, + "learning_rate": 3.2371547253726045e-05, + "loss": 0.7787, + "step": 50440 + }, + { + "epoch": 0.80980433072762, + "grad_norm": 0.7160683870315552, + "learning_rate": 3.2365522847646654e-05, + "loss": 0.7045, + "step": 50450 + }, + { + "epoch": 0.809964846947784, + "grad_norm": 0.8715370893478394, + "learning_rate": 3.235949797316325e-05, + "loss": 0.8146, + "step": 50460 + }, + { + "epoch": 0.8101253631679481, + "grad_norm": 0.594992995262146, + "learning_rate": 3.235347263065897e-05, + "loss": 0.7588, + "step": 50470 + }, + { + "epoch": 0.8102858793881121, + "grad_norm": 0.559975266456604, + "learning_rate": 3.234744682051698e-05, + "loss": 0.7848, + "step": 50480 + }, + { + "epoch": 0.8104463956082762, + "grad_norm": 0.6466308832168579, + "learning_rate": 3.234142054312051e-05, + "loss": 0.719, + "step": 50490 + }, + { + "epoch": 0.8106069118284402, + "grad_norm": 0.8475767374038696, + "learning_rate": 3.2335393798852774e-05, + "loss": 0.7555, + "step": 50500 + }, + { + "epoch": 0.8107674280486044, + "grad_norm": 0.7874159216880798, + "learning_rate": 3.232936658809705e-05, + "loss": 0.7569, + "step": 50510 + }, + { + "epoch": 0.8109279442687684, + "grad_norm": 0.8940522074699402, + "learning_rate": 3.232333891123664e-05, + "loss": 0.7515, + "step": 50520 + }, + { + "epoch": 0.8110884604889325, + "grad_norm": 0.7174528241157532, + "learning_rate": 3.2317310768654844e-05, + "loss": 0.7523, + "step": 50530 + }, + { + "epoch": 0.8112489767090965, + "grad_norm": 0.870205819606781, + "learning_rate": 3.231128216073504e-05, + "loss": 0.7629, + "step": 50540 + }, + { + "epoch": 0.8114094929292605, + "grad_norm": 0.8651663064956665, + "learning_rate": 3.23052530878606e-05, + "loss": 0.7675, + "step": 50550 + }, + { + "epoch": 0.8115700091494246, + "grad_norm": 0.670037567615509, + "learning_rate": 3.2299223550414945e-05, + "loss": 0.7349, + "step": 50560 + }, + { + "epoch": 0.8117305253695886, + "grad_norm": 0.6718480587005615, + "learning_rate": 3.229319354878151e-05, + "loss": 0.6933, + "step": 50570 + }, + { + "epoch": 0.8118910415897527, + "grad_norm": 0.776530385017395, + "learning_rate": 3.228716308334378e-05, + "loss": 0.7361, + "step": 50580 + }, + { + "epoch": 0.8120515578099167, + "grad_norm": 0.7283920049667358, + "learning_rate": 3.2281132154485236e-05, + "loss": 0.7454, + "step": 50590 + }, + { + "epoch": 0.8122120740300808, + "grad_norm": 1.4104617834091187, + "learning_rate": 3.227510076258943e-05, + "loss": 0.8328, + "step": 50600 + }, + { + "epoch": 0.8123725902502448, + "grad_norm": 0.7341061234474182, + "learning_rate": 3.2269068908039924e-05, + "loss": 0.8337, + "step": 50610 + }, + { + "epoch": 0.8125331064704089, + "grad_norm": 0.7077570557594299, + "learning_rate": 3.226303659122029e-05, + "loss": 0.7534, + "step": 50620 + }, + { + "epoch": 0.8126936226905729, + "grad_norm": 0.7406942844390869, + "learning_rate": 3.225700381251416e-05, + "loss": 0.8475, + "step": 50630 + }, + { + "epoch": 0.8128541389107369, + "grad_norm": 0.8373804688453674, + "learning_rate": 3.225097057230518e-05, + "loss": 0.6085, + "step": 50640 + }, + { + "epoch": 0.813014655130901, + "grad_norm": 0.6087937355041504, + "learning_rate": 3.2244936870977036e-05, + "loss": 0.8409, + "step": 50650 + }, + { + "epoch": 0.813175171351065, + "grad_norm": 0.6097262501716614, + "learning_rate": 3.223890270891342e-05, + "loss": 0.691, + "step": 50660 + }, + { + "epoch": 0.8133356875712291, + "grad_norm": 0.6145855784416199, + "learning_rate": 3.223286808649809e-05, + "loss": 0.8165, + "step": 50670 + }, + { + "epoch": 0.8134962037913931, + "grad_norm": 0.6010522246360779, + "learning_rate": 3.2226833004114796e-05, + "loss": 0.7435, + "step": 50680 + }, + { + "epoch": 0.8136567200115572, + "grad_norm": 0.6444766521453857, + "learning_rate": 3.222079746214735e-05, + "loss": 0.7822, + "step": 50690 + }, + { + "epoch": 0.8138172362317212, + "grad_norm": 0.7466980814933777, + "learning_rate": 3.221476146097956e-05, + "loss": 0.8175, + "step": 50700 + }, + { + "epoch": 0.8139777524518853, + "grad_norm": 0.6455379724502563, + "learning_rate": 3.220872500099529e-05, + "loss": 0.8145, + "step": 50710 + }, + { + "epoch": 0.8141382686720493, + "grad_norm": 1.0604822635650635, + "learning_rate": 3.220268808257841e-05, + "loss": 0.8209, + "step": 50720 + }, + { + "epoch": 0.8142987848922134, + "grad_norm": 1.0054367780685425, + "learning_rate": 3.2196650706112855e-05, + "loss": 0.8075, + "step": 50730 + }, + { + "epoch": 0.8144593011123774, + "grad_norm": 0.6066417098045349, + "learning_rate": 3.219061287198256e-05, + "loss": 0.8534, + "step": 50740 + }, + { + "epoch": 0.8146198173325414, + "grad_norm": 0.8890721797943115, + "learning_rate": 3.218457458057148e-05, + "loss": 0.7633, + "step": 50750 + }, + { + "epoch": 0.8147803335527055, + "grad_norm": 0.8088901042938232, + "learning_rate": 3.2178535832263626e-05, + "loss": 0.7394, + "step": 50760 + }, + { + "epoch": 0.8149408497728695, + "grad_norm": 0.7342813014984131, + "learning_rate": 3.217249662744304e-05, + "loss": 0.7532, + "step": 50770 + }, + { + "epoch": 0.8151013659930336, + "grad_norm": 0.4916336238384247, + "learning_rate": 3.2166456966493755e-05, + "loss": 0.8893, + "step": 50780 + }, + { + "epoch": 0.8152618822131976, + "grad_norm": 1.660528540611267, + "learning_rate": 3.216041684979987e-05, + "loss": 0.8051, + "step": 50790 + }, + { + "epoch": 0.8154223984333617, + "grad_norm": 0.6898794174194336, + "learning_rate": 3.2154376277745504e-05, + "loss": 0.8006, + "step": 50800 + }, + { + "epoch": 0.8155829146535257, + "grad_norm": 0.5132544636726379, + "learning_rate": 3.21483352507148e-05, + "loss": 0.7728, + "step": 50810 + }, + { + "epoch": 0.8157434308736898, + "grad_norm": 1.1991245746612549, + "learning_rate": 3.214229376909192e-05, + "loss": 1.0009, + "step": 50820 + }, + { + "epoch": 0.8159039470938538, + "grad_norm": 0.6765375137329102, + "learning_rate": 3.213625183326109e-05, + "loss": 0.9167, + "step": 50830 + }, + { + "epoch": 0.8160644633140178, + "grad_norm": 0.6382625102996826, + "learning_rate": 3.213020944360653e-05, + "loss": 0.7377, + "step": 50840 + }, + { + "epoch": 0.8162249795341819, + "grad_norm": 0.6266745328903198, + "learning_rate": 3.212416660051249e-05, + "loss": 0.7678, + "step": 50850 + }, + { + "epoch": 0.8163854957543459, + "grad_norm": 0.9448476433753967, + "learning_rate": 3.2118123304363274e-05, + "loss": 0.7239, + "step": 50860 + }, + { + "epoch": 0.81654601197451, + "grad_norm": 0.7105902433395386, + "learning_rate": 3.21120795555432e-05, + "loss": 0.6573, + "step": 50870 + }, + { + "epoch": 0.816706528194674, + "grad_norm": 1.036645531654358, + "learning_rate": 3.210603535443659e-05, + "loss": 0.7209, + "step": 50880 + }, + { + "epoch": 0.8168670444148382, + "grad_norm": 0.9393647313117981, + "learning_rate": 3.209999070142786e-05, + "loss": 0.7875, + "step": 50890 + }, + { + "epoch": 0.8170275606350021, + "grad_norm": 0.6943752765655518, + "learning_rate": 3.2093945596901385e-05, + "loss": 0.7075, + "step": 50900 + }, + { + "epoch": 0.8171880768551663, + "grad_norm": 0.7303508520126343, + "learning_rate": 3.208790004124161e-05, + "loss": 0.7209, + "step": 50910 + }, + { + "epoch": 0.8173485930753303, + "grad_norm": 1.14373779296875, + "learning_rate": 3.208185403483299e-05, + "loss": 0.8139, + "step": 50920 + }, + { + "epoch": 0.8175091092954944, + "grad_norm": 0.6121991276741028, + "learning_rate": 3.207580757806003e-05, + "loss": 0.7331, + "step": 50930 + }, + { + "epoch": 0.8176696255156584, + "grad_norm": 0.5250168442726135, + "learning_rate": 3.206976067130723e-05, + "loss": 0.9737, + "step": 50940 + }, + { + "epoch": 0.8178301417358224, + "grad_norm": 0.6325916647911072, + "learning_rate": 3.206371331495915e-05, + "loss": 0.7334, + "step": 50950 + }, + { + "epoch": 0.8179906579559865, + "grad_norm": 0.5058959722518921, + "learning_rate": 3.2057665509400364e-05, + "loss": 0.7736, + "step": 50960 + }, + { + "epoch": 0.8181511741761505, + "grad_norm": 0.7221342325210571, + "learning_rate": 3.205161725501547e-05, + "loss": 0.8551, + "step": 50970 + }, + { + "epoch": 0.8183116903963146, + "grad_norm": 1.6827733516693115, + "learning_rate": 3.204556855218911e-05, + "loss": 0.7322, + "step": 50980 + }, + { + "epoch": 0.8184722066164786, + "grad_norm": 0.8795449733734131, + "learning_rate": 3.203951940130594e-05, + "loss": 0.6689, + "step": 50990 + }, + { + "epoch": 0.8186327228366427, + "grad_norm": 0.6283458471298218, + "learning_rate": 3.203346980275067e-05, + "loss": 0.7962, + "step": 51000 + }, + { + "epoch": 0.8187932390568067, + "grad_norm": 0.7434853911399841, + "learning_rate": 3.202741975690798e-05, + "loss": 0.7448, + "step": 51010 + }, + { + "epoch": 0.8189537552769708, + "grad_norm": 0.5576426386833191, + "learning_rate": 3.2021369264162654e-05, + "loss": 0.6719, + "step": 51020 + }, + { + "epoch": 0.8191142714971348, + "grad_norm": 1.4899184703826904, + "learning_rate": 3.2015318324899444e-05, + "loss": 0.75, + "step": 51030 + }, + { + "epoch": 0.8192747877172988, + "grad_norm": 0.7968209385871887, + "learning_rate": 3.2009266939503174e-05, + "loss": 0.7002, + "step": 51040 + }, + { + "epoch": 0.8194353039374629, + "grad_norm": 1.2451251745224, + "learning_rate": 3.200321510835867e-05, + "loss": 0.8102, + "step": 51050 + }, + { + "epoch": 0.8195958201576269, + "grad_norm": 1.5527254343032837, + "learning_rate": 3.199716283185078e-05, + "loss": 0.8621, + "step": 51060 + }, + { + "epoch": 0.819756336377791, + "grad_norm": 0.7013638615608215, + "learning_rate": 3.199111011036441e-05, + "loss": 0.7969, + "step": 51070 + }, + { + "epoch": 0.819916852597955, + "grad_norm": 0.7005305290222168, + "learning_rate": 3.198505694428447e-05, + "loss": 0.757, + "step": 51080 + }, + { + "epoch": 0.8200773688181191, + "grad_norm": 0.784109354019165, + "learning_rate": 3.19790033339959e-05, + "loss": 0.8087, + "step": 51090 + }, + { + "epoch": 0.8202378850382831, + "grad_norm": 0.9406647086143494, + "learning_rate": 3.197294927988369e-05, + "loss": 0.8031, + "step": 51100 + }, + { + "epoch": 0.8203984012584472, + "grad_norm": 0.9272148609161377, + "learning_rate": 3.196689478233282e-05, + "loss": 0.7297, + "step": 51110 + }, + { + "epoch": 0.8205589174786112, + "grad_norm": 0.6848446130752563, + "learning_rate": 3.1960839841728347e-05, + "loss": 0.691, + "step": 51120 + }, + { + "epoch": 0.8207194336987753, + "grad_norm": 0.6427187323570251, + "learning_rate": 3.1954784458455306e-05, + "loss": 0.7681, + "step": 51130 + }, + { + "epoch": 0.8208799499189393, + "grad_norm": 0.5888258218765259, + "learning_rate": 3.194872863289881e-05, + "loss": 0.8939, + "step": 51140 + }, + { + "epoch": 0.8210404661391033, + "grad_norm": 1.0835438966751099, + "learning_rate": 3.194267236544394e-05, + "loss": 0.923, + "step": 51150 + }, + { + "epoch": 0.8212009823592674, + "grad_norm": 0.9158841967582703, + "learning_rate": 3.193661565647586e-05, + "loss": 0.7937, + "step": 51160 + }, + { + "epoch": 0.8213614985794314, + "grad_norm": 0.7903165221214294, + "learning_rate": 3.193055850637974e-05, + "loss": 0.7572, + "step": 51170 + }, + { + "epoch": 0.8215220147995955, + "grad_norm": 0.6007195115089417, + "learning_rate": 3.192450091554078e-05, + "loss": 0.6484, + "step": 51180 + }, + { + "epoch": 0.8216825310197595, + "grad_norm": 0.7190322875976562, + "learning_rate": 3.1918442884344204e-05, + "loss": 0.6447, + "step": 51190 + }, + { + "epoch": 0.8218430472399236, + "grad_norm": 0.6688763499259949, + "learning_rate": 3.1912384413175264e-05, + "loss": 0.7149, + "step": 51200 + }, + { + "epoch": 0.8220035634600876, + "grad_norm": 0.8013874292373657, + "learning_rate": 3.190632550241925e-05, + "loss": 0.7017, + "step": 51210 + }, + { + "epoch": 0.8221640796802517, + "grad_norm": 0.6203571557998657, + "learning_rate": 3.190026615246147e-05, + "loss": 0.809, + "step": 51220 + }, + { + "epoch": 0.8223245959004157, + "grad_norm": 0.6456263661384583, + "learning_rate": 3.189420636368726e-05, + "loss": 0.787, + "step": 51230 + }, + { + "epoch": 0.8224851121205797, + "grad_norm": 0.6965777277946472, + "learning_rate": 3.1888146136481995e-05, + "loss": 0.8035, + "step": 51240 + }, + { + "epoch": 0.8226456283407438, + "grad_norm": 0.9090338349342346, + "learning_rate": 3.188208547123106e-05, + "loss": 0.7733, + "step": 51250 + }, + { + "epoch": 0.8228061445609078, + "grad_norm": 1.1492468118667603, + "learning_rate": 3.187602436831989e-05, + "loss": 0.8204, + "step": 51260 + }, + { + "epoch": 0.822966660781072, + "grad_norm": 0.6570440530776978, + "learning_rate": 3.186996282813392e-05, + "loss": 0.7785, + "step": 51270 + }, + { + "epoch": 0.8231271770012359, + "grad_norm": 0.7745352983474731, + "learning_rate": 3.186390085105864e-05, + "loss": 0.663, + "step": 51280 + }, + { + "epoch": 0.8232876932214, + "grad_norm": 0.7883049249649048, + "learning_rate": 3.185783843747956e-05, + "loss": 0.6798, + "step": 51290 + }, + { + "epoch": 0.823448209441564, + "grad_norm": 1.9919511079788208, + "learning_rate": 3.185177558778219e-05, + "loss": 0.7815, + "step": 51300 + }, + { + "epoch": 0.8236087256617282, + "grad_norm": 0.6226492524147034, + "learning_rate": 3.1845712302352116e-05, + "loss": 0.9529, + "step": 51310 + }, + { + "epoch": 0.8237692418818922, + "grad_norm": 0.7133690714836121, + "learning_rate": 3.1839648581574926e-05, + "loss": 0.7397, + "step": 51320 + }, + { + "epoch": 0.8239297581020563, + "grad_norm": 0.49226683378219604, + "learning_rate": 3.183358442583623e-05, + "loss": 0.7733, + "step": 51330 + }, + { + "epoch": 0.8240902743222203, + "grad_norm": 0.8983396291732788, + "learning_rate": 3.1827519835521666e-05, + "loss": 0.915, + "step": 51340 + }, + { + "epoch": 0.8242507905423843, + "grad_norm": 0.6730813980102539, + "learning_rate": 3.182145481101692e-05, + "loss": 0.7202, + "step": 51350 + }, + { + "epoch": 0.8244113067625484, + "grad_norm": 0.7853294014930725, + "learning_rate": 3.181538935270769e-05, + "loss": 0.7599, + "step": 51360 + }, + { + "epoch": 0.8245718229827124, + "grad_norm": 0.7878832817077637, + "learning_rate": 3.180932346097969e-05, + "loss": 0.7648, + "step": 51370 + }, + { + "epoch": 0.8247323392028765, + "grad_norm": 1.1561143398284912, + "learning_rate": 3.180325713621869e-05, + "loss": 0.7749, + "step": 51380 + }, + { + "epoch": 0.8248928554230405, + "grad_norm": 0.7162206172943115, + "learning_rate": 3.179719037881047e-05, + "loss": 0.7951, + "step": 51390 + }, + { + "epoch": 0.8250533716432046, + "grad_norm": 0.7769566774368286, + "learning_rate": 3.179112318914083e-05, + "loss": 0.6767, + "step": 51400 + }, + { + "epoch": 0.8252138878633686, + "grad_norm": 0.7295281291007996, + "learning_rate": 3.178505556759561e-05, + "loss": 0.7923, + "step": 51410 + }, + { + "epoch": 0.8253744040835327, + "grad_norm": 0.815035879611969, + "learning_rate": 3.17789875145607e-05, + "loss": 0.9085, + "step": 51420 + }, + { + "epoch": 0.8255349203036967, + "grad_norm": 0.8998827934265137, + "learning_rate": 3.1772919030421955e-05, + "loss": 0.701, + "step": 51430 + }, + { + "epoch": 0.8256954365238607, + "grad_norm": 0.45619437098503113, + "learning_rate": 3.176685011556532e-05, + "loss": 0.701, + "step": 51440 + }, + { + "epoch": 0.8258559527440248, + "grad_norm": 1.0730310678482056, + "learning_rate": 3.176078077037674e-05, + "loss": 0.7951, + "step": 51450 + }, + { + "epoch": 0.8260164689641888, + "grad_norm": 0.6860649585723877, + "learning_rate": 3.175471099524217e-05, + "loss": 0.7788, + "step": 51460 + }, + { + "epoch": 0.8261769851843529, + "grad_norm": 0.9177849888801575, + "learning_rate": 3.174864079054763e-05, + "loss": 0.8228, + "step": 51470 + }, + { + "epoch": 0.8263375014045169, + "grad_norm": 0.6845583319664001, + "learning_rate": 3.1742570156679155e-05, + "loss": 0.7612, + "step": 51480 + }, + { + "epoch": 0.826498017624681, + "grad_norm": 1.2593958377838135, + "learning_rate": 3.173649909402279e-05, + "loss": 0.7547, + "step": 51490 + }, + { + "epoch": 0.826658533844845, + "grad_norm": 1.0179110765457153, + "learning_rate": 3.173042760296462e-05, + "loss": 0.7102, + "step": 51500 + }, + { + "epoch": 0.8268190500650091, + "grad_norm": 0.7475149035453796, + "learning_rate": 3.1724355683890764e-05, + "loss": 0.6479, + "step": 51510 + }, + { + "epoch": 0.8269795662851731, + "grad_norm": 0.6423097252845764, + "learning_rate": 3.171828333718735e-05, + "loss": 0.816, + "step": 51520 + }, + { + "epoch": 0.8271400825053372, + "grad_norm": 0.7709639072418213, + "learning_rate": 3.171221056324055e-05, + "loss": 0.7711, + "step": 51530 + }, + { + "epoch": 0.8273005987255012, + "grad_norm": 0.8602519631385803, + "learning_rate": 3.170613736243655e-05, + "loss": 0.7882, + "step": 51540 + }, + { + "epoch": 0.8274611149456652, + "grad_norm": 0.9428362846374512, + "learning_rate": 3.1700063735161576e-05, + "loss": 0.8036, + "step": 51550 + }, + { + "epoch": 0.8276216311658293, + "grad_norm": 0.7058117389678955, + "learning_rate": 3.169398968180188e-05, + "loss": 0.7533, + "step": 51560 + }, + { + "epoch": 0.8277821473859933, + "grad_norm": 0.7675348520278931, + "learning_rate": 3.1687915202743726e-05, + "loss": 0.7622, + "step": 51570 + }, + { + "epoch": 0.8279426636061574, + "grad_norm": 0.6978825926780701, + "learning_rate": 3.1681840298373415e-05, + "loss": 0.745, + "step": 51580 + }, + { + "epoch": 0.8281031798263214, + "grad_norm": 0.8322034478187561, + "learning_rate": 3.1675764969077284e-05, + "loss": 0.7532, + "step": 51590 + }, + { + "epoch": 0.8282636960464855, + "grad_norm": 0.8932547569274902, + "learning_rate": 3.166968921524168e-05, + "loss": 0.724, + "step": 51600 + }, + { + "epoch": 0.8284242122666495, + "grad_norm": 0.6266891360282898, + "learning_rate": 3.166361303725299e-05, + "loss": 0.7568, + "step": 51610 + }, + { + "epoch": 0.8285847284868136, + "grad_norm": 0.6329978108406067, + "learning_rate": 3.1657536435497624e-05, + "loss": 0.7672, + "step": 51620 + }, + { + "epoch": 0.8287452447069776, + "grad_norm": 0.6184289455413818, + "learning_rate": 3.165145941036202e-05, + "loss": 0.694, + "step": 51630 + }, + { + "epoch": 0.8289057609271416, + "grad_norm": 0.6278343796730042, + "learning_rate": 3.1645381962232635e-05, + "loss": 0.8276, + "step": 51640 + }, + { + "epoch": 0.8290662771473057, + "grad_norm": 0.7803795337677002, + "learning_rate": 3.1639304091495955e-05, + "loss": 0.7544, + "step": 51650 + }, + { + "epoch": 0.8292267933674697, + "grad_norm": 0.7440910339355469, + "learning_rate": 3.163322579853851e-05, + "loss": 0.7976, + "step": 51660 + }, + { + "epoch": 0.8293873095876338, + "grad_norm": 0.799324095249176, + "learning_rate": 3.162714708374684e-05, + "loss": 0.7645, + "step": 51670 + }, + { + "epoch": 0.8295478258077978, + "grad_norm": 0.7008814215660095, + "learning_rate": 3.162106794750751e-05, + "loss": 0.7372, + "step": 51680 + }, + { + "epoch": 0.829708342027962, + "grad_norm": 0.8159811496734619, + "learning_rate": 3.1614988390207115e-05, + "loss": 0.7435, + "step": 51690 + }, + { + "epoch": 0.829868858248126, + "grad_norm": 0.7628427147865295, + "learning_rate": 3.16089084122323e-05, + "loss": 0.7663, + "step": 51700 + }, + { + "epoch": 0.8300293744682901, + "grad_norm": 0.7554438710212708, + "learning_rate": 3.160282801396968e-05, + "loss": 0.7844, + "step": 51710 + }, + { + "epoch": 0.8301898906884541, + "grad_norm": 0.5375316739082336, + "learning_rate": 3.159674719580597e-05, + "loss": 0.9245, + "step": 51720 + }, + { + "epoch": 0.8303504069086182, + "grad_norm": 1.2882580757141113, + "learning_rate": 3.1590665958127844e-05, + "loss": 0.8146, + "step": 51730 + }, + { + "epoch": 0.8305109231287822, + "grad_norm": 0.6969280242919922, + "learning_rate": 3.158458430132205e-05, + "loss": 0.8434, + "step": 51740 + }, + { + "epoch": 0.8306714393489462, + "grad_norm": 0.7893766164779663, + "learning_rate": 3.1578502225775346e-05, + "loss": 0.7646, + "step": 51750 + }, + { + "epoch": 0.8308319555691103, + "grad_norm": 0.8418654799461365, + "learning_rate": 3.157241973187451e-05, + "loss": 0.7648, + "step": 51760 + }, + { + "epoch": 0.8309924717892743, + "grad_norm": 0.8913121223449707, + "learning_rate": 3.156633682000636e-05, + "loss": 0.6282, + "step": 51770 + }, + { + "epoch": 0.8311529880094384, + "grad_norm": 1.2166492938995361, + "learning_rate": 3.1560253490557723e-05, + "loss": 0.7584, + "step": 51780 + }, + { + "epoch": 0.8313135042296024, + "grad_norm": 0.9976508021354675, + "learning_rate": 3.155416974391546e-05, + "loss": 0.8042, + "step": 51790 + }, + { + "epoch": 0.8314740204497665, + "grad_norm": 1.576456069946289, + "learning_rate": 3.154808558046649e-05, + "loss": 0.7182, + "step": 51800 + }, + { + "epoch": 0.8316345366699305, + "grad_norm": 0.8552323579788208, + "learning_rate": 3.15420010005977e-05, + "loss": 0.7153, + "step": 51810 + }, + { + "epoch": 0.8317950528900946, + "grad_norm": 0.7519588470458984, + "learning_rate": 3.153591600469604e-05, + "loss": 0.6518, + "step": 51820 + }, + { + "epoch": 0.8319555691102586, + "grad_norm": 0.8069236278533936, + "learning_rate": 3.1529830593148495e-05, + "loss": 0.8986, + "step": 51830 + }, + { + "epoch": 0.8321160853304227, + "grad_norm": 0.6765708327293396, + "learning_rate": 3.1523744766342046e-05, + "loss": 0.7802, + "step": 51840 + }, + { + "epoch": 0.8322766015505867, + "grad_norm": 0.4457422196865082, + "learning_rate": 3.151765852466372e-05, + "loss": 0.7508, + "step": 51850 + }, + { + "epoch": 0.8324371177707507, + "grad_norm": 0.6583738923072815, + "learning_rate": 3.151157186850057e-05, + "loss": 0.7298, + "step": 51860 + }, + { + "epoch": 0.8325976339909148, + "grad_norm": 0.8298267722129822, + "learning_rate": 3.1505484798239656e-05, + "loss": 0.6357, + "step": 51870 + }, + { + "epoch": 0.8327581502110788, + "grad_norm": 0.6041926741600037, + "learning_rate": 3.14993973142681e-05, + "loss": 0.7303, + "step": 51880 + }, + { + "epoch": 0.8329186664312429, + "grad_norm": 0.5583828687667847, + "learning_rate": 3.1493309416973026e-05, + "loss": 0.7921, + "step": 51890 + }, + { + "epoch": 0.8330791826514069, + "grad_norm": 0.5586530566215515, + "learning_rate": 3.148722110674158e-05, + "loss": 0.7918, + "step": 51900 + }, + { + "epoch": 0.833239698871571, + "grad_norm": 0.9106624126434326, + "learning_rate": 3.148113238396094e-05, + "loss": 0.8003, + "step": 51910 + }, + { + "epoch": 0.833400215091735, + "grad_norm": 1.0130101442337036, + "learning_rate": 3.147504324901833e-05, + "loss": 0.9176, + "step": 51920 + }, + { + "epoch": 0.8335607313118991, + "grad_norm": 0.8647226691246033, + "learning_rate": 3.1468953702300974e-05, + "loss": 0.7126, + "step": 51930 + }, + { + "epoch": 0.8337212475320631, + "grad_norm": 0.6546942591667175, + "learning_rate": 3.146286374419613e-05, + "loss": 0.7693, + "step": 51940 + }, + { + "epoch": 0.8338817637522271, + "grad_norm": 0.5069435238838196, + "learning_rate": 3.145677337509109e-05, + "loss": 0.7329, + "step": 51950 + }, + { + "epoch": 0.8340422799723912, + "grad_norm": 0.7006145119667053, + "learning_rate": 3.145068259537316e-05, + "loss": 0.6616, + "step": 51960 + }, + { + "epoch": 0.8342027961925552, + "grad_norm": 0.6145896911621094, + "learning_rate": 3.1444591405429666e-05, + "loss": 0.7422, + "step": 51970 + }, + { + "epoch": 0.8343633124127193, + "grad_norm": 0.6949965953826904, + "learning_rate": 3.1438499805648006e-05, + "loss": 0.7302, + "step": 51980 + }, + { + "epoch": 0.8345238286328833, + "grad_norm": 0.7092311382293701, + "learning_rate": 3.143240779641553e-05, + "loss": 0.8647, + "step": 51990 + }, + { + "epoch": 0.8346843448530474, + "grad_norm": 1.0651274919509888, + "learning_rate": 3.1426315378119684e-05, + "loss": 0.8106, + "step": 52000 + }, + { + "epoch": 0.8346843448530474, + "eval_loss": 0.7796915769577026, + "eval_runtime": 1833.3949, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 1.788, + "step": 52000 + }, + { + "epoch": 0.8348448610732114, + "grad_norm": 0.7881428599357605, + "learning_rate": 3.142022255114789e-05, + "loss": 0.7531, + "step": 52010 + }, + { + "epoch": 0.8350053772933755, + "grad_norm": 0.7138424515724182, + "learning_rate": 3.141412931588763e-05, + "loss": 0.7821, + "step": 52020 + }, + { + "epoch": 0.8351658935135395, + "grad_norm": 1.2252860069274902, + "learning_rate": 3.140803567272639e-05, + "loss": 0.8884, + "step": 52030 + }, + { + "epoch": 0.8353264097337036, + "grad_norm": 0.688848614692688, + "learning_rate": 3.1401941622051686e-05, + "loss": 0.826, + "step": 52040 + }, + { + "epoch": 0.8354869259538676, + "grad_norm": 0.6673890948295593, + "learning_rate": 3.139584716425108e-05, + "loss": 0.7188, + "step": 52050 + }, + { + "epoch": 0.8356474421740316, + "grad_norm": 0.920684278011322, + "learning_rate": 3.1389752299712137e-05, + "loss": 0.6815, + "step": 52060 + }, + { + "epoch": 0.8358079583941957, + "grad_norm": 0.6656520366668701, + "learning_rate": 3.138365702882244e-05, + "loss": 0.7274, + "step": 52070 + }, + { + "epoch": 0.8359684746143597, + "grad_norm": 1.0643335580825806, + "learning_rate": 3.137756135196964e-05, + "loss": 0.9404, + "step": 52080 + }, + { + "epoch": 0.8361289908345239, + "grad_norm": 0.6181660294532776, + "learning_rate": 3.137146526954136e-05, + "loss": 0.6393, + "step": 52090 + }, + { + "epoch": 0.8362895070546879, + "grad_norm": 0.6213283538818359, + "learning_rate": 3.136536878192529e-05, + "loss": 0.6901, + "step": 52100 + }, + { + "epoch": 0.836450023274852, + "grad_norm": 0.8121861815452576, + "learning_rate": 3.1359271889509126e-05, + "loss": 0.723, + "step": 52110 + }, + { + "epoch": 0.836610539495016, + "grad_norm": 1.0398539304733276, + "learning_rate": 3.135317459268059e-05, + "loss": 0.9302, + "step": 52120 + }, + { + "epoch": 0.8367710557151801, + "grad_norm": 1.0615894794464111, + "learning_rate": 3.1347076891827435e-05, + "loss": 0.781, + "step": 52130 + }, + { + "epoch": 0.8369315719353441, + "grad_norm": 0.6286749839782715, + "learning_rate": 3.134097878733746e-05, + "loss": 0.7559, + "step": 52140 + }, + { + "epoch": 0.8370920881555081, + "grad_norm": 0.605238676071167, + "learning_rate": 3.1334880279598436e-05, + "loss": 0.8901, + "step": 52150 + }, + { + "epoch": 0.8372526043756722, + "grad_norm": 0.930112361907959, + "learning_rate": 3.132878136899822e-05, + "loss": 0.765, + "step": 52160 + }, + { + "epoch": 0.8374131205958362, + "grad_norm": 0.8334870934486389, + "learning_rate": 3.132268205592465e-05, + "loss": 0.853, + "step": 52170 + }, + { + "epoch": 0.8375736368160003, + "grad_norm": 1.1702371835708618, + "learning_rate": 3.1316582340765605e-05, + "loss": 0.825, + "step": 52180 + }, + { + "epoch": 0.8377341530361643, + "grad_norm": 0.7909703850746155, + "learning_rate": 3.131048222390901e-05, + "loss": 0.7408, + "step": 52190 + }, + { + "epoch": 0.8378946692563284, + "grad_norm": 0.7308981418609619, + "learning_rate": 3.1304381705742776e-05, + "loss": 0.9318, + "step": 52200 + }, + { + "epoch": 0.8380551854764924, + "grad_norm": 0.9484549760818481, + "learning_rate": 3.1298280786654863e-05, + "loss": 0.8695, + "step": 52210 + }, + { + "epoch": 0.8382157016966565, + "grad_norm": 0.7925643920898438, + "learning_rate": 3.1292179467033267e-05, + "loss": 0.8416, + "step": 52220 + }, + { + "epoch": 0.8383762179168205, + "grad_norm": 0.684715211391449, + "learning_rate": 3.1286077747265994e-05, + "loss": 0.7392, + "step": 52230 + }, + { + "epoch": 0.8385367341369846, + "grad_norm": 1.2995930910110474, + "learning_rate": 3.127997562774106e-05, + "loss": 0.7615, + "step": 52240 + }, + { + "epoch": 0.8386972503571486, + "grad_norm": 0.6776323318481445, + "learning_rate": 3.1273873108846544e-05, + "loss": 0.7878, + "step": 52250 + }, + { + "epoch": 0.8388577665773126, + "grad_norm": 1.0817039012908936, + "learning_rate": 3.126777019097052e-05, + "loss": 0.8689, + "step": 52260 + }, + { + "epoch": 0.8390182827974767, + "grad_norm": 0.7110409736633301, + "learning_rate": 3.1261666874501094e-05, + "loss": 0.7714, + "step": 52270 + }, + { + "epoch": 0.8391787990176407, + "grad_norm": 0.8052693009376526, + "learning_rate": 3.1255563159826415e-05, + "loss": 0.882, + "step": 52280 + }, + { + "epoch": 0.8393393152378048, + "grad_norm": 0.7086915969848633, + "learning_rate": 3.124945904733463e-05, + "loss": 0.7437, + "step": 52290 + }, + { + "epoch": 0.8394998314579688, + "grad_norm": 1.0792137384414673, + "learning_rate": 3.124335453741394e-05, + "loss": 0.9277, + "step": 52300 + }, + { + "epoch": 0.8396603476781329, + "grad_norm": 0.9411969780921936, + "learning_rate": 3.123724963045253e-05, + "loss": 0.8533, + "step": 52310 + }, + { + "epoch": 0.8398208638982969, + "grad_norm": 0.869421124458313, + "learning_rate": 3.123114432683866e-05, + "loss": 0.7247, + "step": 52320 + }, + { + "epoch": 0.839981380118461, + "grad_norm": 0.44697239995002747, + "learning_rate": 3.122503862696059e-05, + "loss": 0.7969, + "step": 52330 + }, + { + "epoch": 0.840141896338625, + "grad_norm": 1.0396506786346436, + "learning_rate": 3.12189325312066e-05, + "loss": 0.6923, + "step": 52340 + }, + { + "epoch": 0.840302412558789, + "grad_norm": 1.1021019220352173, + "learning_rate": 3.121282603996499e-05, + "loss": 0.5852, + "step": 52350 + }, + { + "epoch": 0.8404629287789531, + "grad_norm": 0.8246138691902161, + "learning_rate": 3.120671915362413e-05, + "loss": 0.7659, + "step": 52360 + }, + { + "epoch": 0.8406234449991171, + "grad_norm": 0.5514949560165405, + "learning_rate": 3.120061187257234e-05, + "loss": 0.7098, + "step": 52370 + }, + { + "epoch": 0.8407839612192812, + "grad_norm": 1.3018441200256348, + "learning_rate": 3.1194504197198044e-05, + "loss": 0.77, + "step": 52380 + }, + { + "epoch": 0.8409444774394452, + "grad_norm": 0.6040323972702026, + "learning_rate": 3.118839612788963e-05, + "loss": 0.8242, + "step": 52390 + }, + { + "epoch": 0.8411049936596093, + "grad_norm": 0.5248334407806396, + "learning_rate": 3.118228766503555e-05, + "loss": 0.6986, + "step": 52400 + }, + { + "epoch": 0.8412655098797733, + "grad_norm": 0.6588628888130188, + "learning_rate": 3.1176178809024256e-05, + "loss": 0.7876, + "step": 52410 + }, + { + "epoch": 0.8414260260999374, + "grad_norm": 0.735351026058197, + "learning_rate": 3.117006956024425e-05, + "loss": 0.7604, + "step": 52420 + }, + { + "epoch": 0.8415865423201014, + "grad_norm": 0.6558626294136047, + "learning_rate": 3.1163959919084026e-05, + "loss": 0.7343, + "step": 52430 + }, + { + "epoch": 0.8417470585402655, + "grad_norm": 0.6408172845840454, + "learning_rate": 3.115784988593213e-05, + "loss": 0.9082, + "step": 52440 + }, + { + "epoch": 0.8419075747604295, + "grad_norm": 0.9055995941162109, + "learning_rate": 3.115173946117713e-05, + "loss": 0.7988, + "step": 52450 + }, + { + "epoch": 0.8420680909805935, + "grad_norm": 0.7418809533119202, + "learning_rate": 3.11456286452076e-05, + "loss": 0.8157, + "step": 52460 + }, + { + "epoch": 0.8422286072007577, + "grad_norm": 0.44372910261154175, + "learning_rate": 3.113951743841217e-05, + "loss": 0.6845, + "step": 52470 + }, + { + "epoch": 0.8423891234209216, + "grad_norm": 0.710764467716217, + "learning_rate": 3.113340584117945e-05, + "loss": 0.8404, + "step": 52480 + }, + { + "epoch": 0.8425496396410858, + "grad_norm": 0.7935421466827393, + "learning_rate": 3.112729385389813e-05, + "loss": 0.7674, + "step": 52490 + }, + { + "epoch": 0.8427101558612498, + "grad_norm": 0.69051593542099, + "learning_rate": 3.112118147695688e-05, + "loss": 0.8061, + "step": 52500 + }, + { + "epoch": 0.8428706720814139, + "grad_norm": 0.8647956848144531, + "learning_rate": 3.111506871074442e-05, + "loss": 0.8072, + "step": 52510 + }, + { + "epoch": 0.8430311883015779, + "grad_norm": 0.5754925608634949, + "learning_rate": 3.110895555564948e-05, + "loss": 0.7711, + "step": 52520 + }, + { + "epoch": 0.843191704521742, + "grad_norm": 0.7919754385948181, + "learning_rate": 3.110284201206082e-05, + "loss": 0.7985, + "step": 52530 + }, + { + "epoch": 0.843352220741906, + "grad_norm": 0.7461055517196655, + "learning_rate": 3.1096728080367225e-05, + "loss": 0.7955, + "step": 52540 + }, + { + "epoch": 0.84351273696207, + "grad_norm": 0.9612576961517334, + "learning_rate": 3.109061376095751e-05, + "loss": 0.6629, + "step": 52550 + }, + { + "epoch": 0.8436732531822341, + "grad_norm": 0.7846565246582031, + "learning_rate": 3.108449905422051e-05, + "loss": 0.7977, + "step": 52560 + }, + { + "epoch": 0.8438337694023981, + "grad_norm": 0.8809017539024353, + "learning_rate": 3.107838396054509e-05, + "loss": 0.7214, + "step": 52570 + }, + { + "epoch": 0.8439942856225622, + "grad_norm": 0.7183091044425964, + "learning_rate": 3.107226848032012e-05, + "loss": 0.7897, + "step": 52580 + }, + { + "epoch": 0.8441548018427262, + "grad_norm": 0.607062816619873, + "learning_rate": 3.106615261393451e-05, + "loss": 0.8171, + "step": 52590 + }, + { + "epoch": 0.8443153180628903, + "grad_norm": 0.7164509296417236, + "learning_rate": 3.10600363617772e-05, + "loss": 0.8952, + "step": 52600 + }, + { + "epoch": 0.8444758342830543, + "grad_norm": 0.6982039213180542, + "learning_rate": 3.1053919724237154e-05, + "loss": 0.801, + "step": 52610 + }, + { + "epoch": 0.8446363505032184, + "grad_norm": 2.780975341796875, + "learning_rate": 3.104780270170334e-05, + "loss": 0.8291, + "step": 52620 + }, + { + "epoch": 0.8447968667233824, + "grad_norm": 0.7463472485542297, + "learning_rate": 3.104168529456477e-05, + "loss": 0.7454, + "step": 52630 + }, + { + "epoch": 0.8449573829435465, + "grad_norm": 0.7050913572311401, + "learning_rate": 3.1035567503210484e-05, + "loss": 0.8042, + "step": 52640 + }, + { + "epoch": 0.8451178991637105, + "grad_norm": 0.6496828198432922, + "learning_rate": 3.102944932802953e-05, + "loss": 0.7964, + "step": 52650 + }, + { + "epoch": 0.8452784153838745, + "grad_norm": 0.5909246802330017, + "learning_rate": 3.102333076941098e-05, + "loss": 0.7722, + "step": 52660 + }, + { + "epoch": 0.8454389316040386, + "grad_norm": 0.6441418528556824, + "learning_rate": 3.1017211827743956e-05, + "loss": 0.7352, + "step": 52670 + }, + { + "epoch": 0.8455994478242026, + "grad_norm": 0.9058145880699158, + "learning_rate": 3.101109250341756e-05, + "loss": 0.8999, + "step": 52680 + }, + { + "epoch": 0.8457599640443667, + "grad_norm": 0.819124162197113, + "learning_rate": 3.100497279682098e-05, + "loss": 0.7236, + "step": 52690 + }, + { + "epoch": 0.8459204802645307, + "grad_norm": 0.8230063915252686, + "learning_rate": 3.099885270834337e-05, + "loss": 0.7997, + "step": 52700 + }, + { + "epoch": 0.8460809964846948, + "grad_norm": 0.8300336599349976, + "learning_rate": 3.0992732238373936e-05, + "loss": 0.9055, + "step": 52710 + }, + { + "epoch": 0.8462415127048588, + "grad_norm": 0.727199137210846, + "learning_rate": 3.098661138730191e-05, + "loss": 0.7564, + "step": 52720 + }, + { + "epoch": 0.8464020289250229, + "grad_norm": 0.8800899982452393, + "learning_rate": 3.0980490155516535e-05, + "loss": 0.9086, + "step": 52730 + }, + { + "epoch": 0.8465625451451869, + "grad_norm": 1.0006943941116333, + "learning_rate": 3.097436854340708e-05, + "loss": 0.712, + "step": 52740 + }, + { + "epoch": 0.8467230613653509, + "grad_norm": 0.9080299139022827, + "learning_rate": 3.096824655136286e-05, + "loss": 0.7539, + "step": 52750 + }, + { + "epoch": 0.846883577585515, + "grad_norm": 1.4558351039886475, + "learning_rate": 3.096212417977319e-05, + "loss": 0.7537, + "step": 52760 + }, + { + "epoch": 0.847044093805679, + "grad_norm": 0.8315474987030029, + "learning_rate": 3.0956001429027415e-05, + "loss": 0.7337, + "step": 52770 + }, + { + "epoch": 0.8472046100258431, + "grad_norm": 0.7302047610282898, + "learning_rate": 3.094987829951491e-05, + "loss": 0.7679, + "step": 52780 + }, + { + "epoch": 0.8473651262460071, + "grad_norm": 0.5686960220336914, + "learning_rate": 3.094375479162506e-05, + "loss": 0.8248, + "step": 52790 + }, + { + "epoch": 0.8475256424661712, + "grad_norm": 0.5952631831169128, + "learning_rate": 3.0937630905747295e-05, + "loss": 0.7282, + "step": 52800 + }, + { + "epoch": 0.8476861586863352, + "grad_norm": 0.8354450464248657, + "learning_rate": 3.093150664227105e-05, + "loss": 0.7412, + "step": 52810 + }, + { + "epoch": 0.8478466749064993, + "grad_norm": 0.9509097933769226, + "learning_rate": 3.0925382001585806e-05, + "loss": 0.8058, + "step": 52820 + }, + { + "epoch": 0.8480071911266633, + "grad_norm": 0.8049838542938232, + "learning_rate": 3.091925698408105e-05, + "loss": 0.8163, + "step": 52830 + }, + { + "epoch": 0.8481677073468274, + "grad_norm": 0.7500231266021729, + "learning_rate": 3.091313159014628e-05, + "loss": 0.7621, + "step": 52840 + }, + { + "epoch": 0.8483282235669914, + "grad_norm": 0.853039026260376, + "learning_rate": 3.090700582017104e-05, + "loss": 0.7904, + "step": 52850 + }, + { + "epoch": 0.8484887397871554, + "grad_norm": 0.7488430142402649, + "learning_rate": 3.0900879674544916e-05, + "loss": 0.8265, + "step": 52860 + }, + { + "epoch": 0.8486492560073196, + "grad_norm": 0.8654149174690247, + "learning_rate": 3.0894753153657475e-05, + "loss": 0.6667, + "step": 52870 + }, + { + "epoch": 0.8488097722274835, + "grad_norm": 0.8572497367858887, + "learning_rate": 3.0888626257898336e-05, + "loss": 0.6419, + "step": 52880 + }, + { + "epoch": 0.8489702884476477, + "grad_norm": 0.6955173015594482, + "learning_rate": 3.088249898765712e-05, + "loss": 0.8025, + "step": 52890 + }, + { + "epoch": 0.8491308046678117, + "grad_norm": 0.6116199493408203, + "learning_rate": 3.08763713433235e-05, + "loss": 0.7267, + "step": 52900 + }, + { + "epoch": 0.8492913208879758, + "grad_norm": 1.459314227104187, + "learning_rate": 3.087024332528715e-05, + "loss": 0.7375, + "step": 52910 + }, + { + "epoch": 0.8494518371081398, + "grad_norm": 0.594618022441864, + "learning_rate": 3.086411493393779e-05, + "loss": 0.7512, + "step": 52920 + }, + { + "epoch": 0.8496123533283039, + "grad_norm": 0.588488757610321, + "learning_rate": 3.0857986169665134e-05, + "loss": 0.7296, + "step": 52930 + }, + { + "epoch": 0.8497728695484679, + "grad_norm": 0.8129919767379761, + "learning_rate": 3.085185703285893e-05, + "loss": 0.7821, + "step": 52940 + }, + { + "epoch": 0.8499333857686319, + "grad_norm": 0.8383474349975586, + "learning_rate": 3.084572752390898e-05, + "loss": 0.705, + "step": 52950 + }, + { + "epoch": 0.850093901988796, + "grad_norm": 0.9841850399971008, + "learning_rate": 3.083959764320507e-05, + "loss": 0.7577, + "step": 52960 + }, + { + "epoch": 0.85025441820896, + "grad_norm": 1.0148524045944214, + "learning_rate": 3.083346739113702e-05, + "loss": 0.6635, + "step": 52970 + }, + { + "epoch": 0.8504149344291241, + "grad_norm": 0.8356283903121948, + "learning_rate": 3.082733676809468e-05, + "loss": 0.7902, + "step": 52980 + }, + { + "epoch": 0.8505754506492881, + "grad_norm": 0.6466485857963562, + "learning_rate": 3.082120577446793e-05, + "loss": 0.7237, + "step": 52990 + }, + { + "epoch": 0.8507359668694522, + "grad_norm": 0.8251693844795227, + "learning_rate": 3.081507441064665e-05, + "loss": 0.8009, + "step": 53000 + }, + { + "epoch": 0.8508964830896162, + "grad_norm": 0.8461582660675049, + "learning_rate": 3.080894267702078e-05, + "loss": 0.87, + "step": 53010 + }, + { + "epoch": 0.8510569993097803, + "grad_norm": 0.6124935150146484, + "learning_rate": 3.0802810573980245e-05, + "loss": 0.6768, + "step": 53020 + }, + { + "epoch": 0.8512175155299443, + "grad_norm": 0.8439136147499084, + "learning_rate": 3.0796678101915014e-05, + "loss": 0.7673, + "step": 53030 + }, + { + "epoch": 0.8513780317501084, + "grad_norm": 0.549156665802002, + "learning_rate": 3.0790545261215094e-05, + "loss": 0.7682, + "step": 53040 + }, + { + "epoch": 0.8515385479702724, + "grad_norm": 1.2077950239181519, + "learning_rate": 3.0784412052270467e-05, + "loss": 0.7417, + "step": 53050 + }, + { + "epoch": 0.8516990641904364, + "grad_norm": 0.9773049354553223, + "learning_rate": 3.077827847547119e-05, + "loss": 0.8799, + "step": 53060 + }, + { + "epoch": 0.8518595804106005, + "grad_norm": 1.0705960988998413, + "learning_rate": 3.077214453120731e-05, + "loss": 0.7786, + "step": 53070 + }, + { + "epoch": 0.8520200966307645, + "grad_norm": 0.8591026663780212, + "learning_rate": 3.076601021986893e-05, + "loss": 0.834, + "step": 53080 + }, + { + "epoch": 0.8521806128509286, + "grad_norm": 1.0769667625427246, + "learning_rate": 3.075987554184613e-05, + "loss": 0.8041, + "step": 53090 + }, + { + "epoch": 0.8523411290710926, + "grad_norm": 0.8775037527084351, + "learning_rate": 3.075374049752906e-05, + "loss": 0.7852, + "step": 53100 + }, + { + "epoch": 0.8525016452912567, + "grad_norm": 0.6657942533493042, + "learning_rate": 3.0747605087307864e-05, + "loss": 0.8896, + "step": 53110 + }, + { + "epoch": 0.8526621615114207, + "grad_norm": 0.800563633441925, + "learning_rate": 3.074146931157272e-05, + "loss": 0.7763, + "step": 53120 + }, + { + "epoch": 0.8528226777315848, + "grad_norm": 0.7074289321899414, + "learning_rate": 3.073533317071382e-05, + "loss": 0.7586, + "step": 53130 + }, + { + "epoch": 0.8529831939517488, + "grad_norm": 0.7974056005477905, + "learning_rate": 3.072919666512141e-05, + "loss": 0.7539, + "step": 53140 + }, + { + "epoch": 0.8531437101719128, + "grad_norm": 1.1326624155044556, + "learning_rate": 3.072305979518571e-05, + "loss": 0.7143, + "step": 53150 + }, + { + "epoch": 0.8533042263920769, + "grad_norm": 0.6495110392570496, + "learning_rate": 3.071692256129699e-05, + "loss": 0.6095, + "step": 53160 + }, + { + "epoch": 0.8534647426122409, + "grad_norm": 0.7514623403549194, + "learning_rate": 3.0710784963845554e-05, + "loss": 0.6629, + "step": 53170 + }, + { + "epoch": 0.853625258832405, + "grad_norm": 0.6954189538955688, + "learning_rate": 3.0704647003221706e-05, + "loss": 0.7779, + "step": 53180 + }, + { + "epoch": 0.853785775052569, + "grad_norm": 0.7292186617851257, + "learning_rate": 3.0698508679815815e-05, + "loss": 0.8437, + "step": 53190 + }, + { + "epoch": 0.8539462912727331, + "grad_norm": 0.9100925326347351, + "learning_rate": 3.06923699940182e-05, + "loss": 0.7046, + "step": 53200 + }, + { + "epoch": 0.8541068074928971, + "grad_norm": 0.7705385088920593, + "learning_rate": 3.0686230946219265e-05, + "loss": 0.7844, + "step": 53210 + }, + { + "epoch": 0.8542673237130612, + "grad_norm": 0.6410653591156006, + "learning_rate": 3.068009153680942e-05, + "loss": 0.762, + "step": 53220 + }, + { + "epoch": 0.8544278399332252, + "grad_norm": 0.6189106702804565, + "learning_rate": 3.0673951766179096e-05, + "loss": 0.7548, + "step": 53230 + }, + { + "epoch": 0.8545883561533894, + "grad_norm": 1.6340656280517578, + "learning_rate": 3.066781163471873e-05, + "loss": 0.8293, + "step": 53240 + }, + { + "epoch": 0.8547488723735533, + "grad_norm": 1.1787419319152832, + "learning_rate": 3.066167114281883e-05, + "loss": 0.7301, + "step": 53250 + }, + { + "epoch": 0.8549093885937173, + "grad_norm": 0.7757689952850342, + "learning_rate": 3.065553029086987e-05, + "loss": 0.8308, + "step": 53260 + }, + { + "epoch": 0.8550699048138815, + "grad_norm": 0.7723706364631653, + "learning_rate": 3.064938907926237e-05, + "loss": 0.8249, + "step": 53270 + }, + { + "epoch": 0.8552304210340455, + "grad_norm": 0.6498652696609497, + "learning_rate": 3.0643247508386885e-05, + "loss": 0.7829, + "step": 53280 + }, + { + "epoch": 0.8553909372542096, + "grad_norm": 0.7636462450027466, + "learning_rate": 3.063710557863398e-05, + "loss": 0.7177, + "step": 53290 + }, + { + "epoch": 0.8555514534743736, + "grad_norm": 0.7560423612594604, + "learning_rate": 3.063096329039425e-05, + "loss": 0.6935, + "step": 53300 + }, + { + "epoch": 0.8557119696945377, + "grad_norm": 0.7072769403457642, + "learning_rate": 3.0624820644058307e-05, + "loss": 0.793, + "step": 53310 + }, + { + "epoch": 0.8558724859147017, + "grad_norm": 0.8678737282752991, + "learning_rate": 3.061867764001678e-05, + "loss": 0.7358, + "step": 53320 + }, + { + "epoch": 0.8560330021348658, + "grad_norm": 1.2410509586334229, + "learning_rate": 3.061253427866034e-05, + "loss": 0.7553, + "step": 53330 + }, + { + "epoch": 0.8561935183550298, + "grad_norm": 0.5725073218345642, + "learning_rate": 3.0606390560379656e-05, + "loss": 0.7094, + "step": 53340 + }, + { + "epoch": 0.8563540345751939, + "grad_norm": 0.5895586013793945, + "learning_rate": 3.060024648556544e-05, + "loss": 0.7065, + "step": 53350 + }, + { + "epoch": 0.8565145507953579, + "grad_norm": 0.8385864496231079, + "learning_rate": 3.059410205460842e-05, + "loss": 0.7582, + "step": 53360 + }, + { + "epoch": 0.8566750670155219, + "grad_norm": 0.5255938172340393, + "learning_rate": 3.0587957267899345e-05, + "loss": 0.7289, + "step": 53370 + }, + { + "epoch": 0.856835583235686, + "grad_norm": 0.9647319316864014, + "learning_rate": 3.058181212582898e-05, + "loss": 0.7317, + "step": 53380 + }, + { + "epoch": 0.85699609945585, + "grad_norm": 0.6229328513145447, + "learning_rate": 3.057566662878813e-05, + "loss": 0.8002, + "step": 53390 + }, + { + "epoch": 0.8571566156760141, + "grad_norm": 0.6331156492233276, + "learning_rate": 3.0569520777167604e-05, + "loss": 0.6758, + "step": 53400 + }, + { + "epoch": 0.8573171318961781, + "grad_norm": 1.1075937747955322, + "learning_rate": 3.056337457135825e-05, + "loss": 0.7153, + "step": 53410 + }, + { + "epoch": 0.8574776481163422, + "grad_norm": 0.7713668346405029, + "learning_rate": 3.0557228011750925e-05, + "loss": 0.759, + "step": 53420 + }, + { + "epoch": 0.8576381643365062, + "grad_norm": 0.6581687927246094, + "learning_rate": 3.0551081098736524e-05, + "loss": 0.6978, + "step": 53430 + }, + { + "epoch": 0.8577986805566703, + "grad_norm": 1.3812344074249268, + "learning_rate": 3.054493383270594e-05, + "loss": 0.7866, + "step": 53440 + }, + { + "epoch": 0.8579591967768343, + "grad_norm": 0.6271838545799255, + "learning_rate": 3.0538786214050106e-05, + "loss": 0.8802, + "step": 53450 + }, + { + "epoch": 0.8581197129969983, + "grad_norm": 1.13473641872406, + "learning_rate": 3.053263824315998e-05, + "loss": 0.6806, + "step": 53460 + }, + { + "epoch": 0.8582802292171624, + "grad_norm": 1.0041555166244507, + "learning_rate": 3.052648992042654e-05, + "loss": 0.7406, + "step": 53470 + }, + { + "epoch": 0.8584407454373264, + "grad_norm": 0.7967252135276794, + "learning_rate": 3.052034124624077e-05, + "loss": 0.8292, + "step": 53480 + }, + { + "epoch": 0.8586012616574905, + "grad_norm": 1.0279408693313599, + "learning_rate": 3.05141922209937e-05, + "loss": 0.7306, + "step": 53490 + }, + { + "epoch": 0.8587617778776545, + "grad_norm": 1.3488457202911377, + "learning_rate": 3.0508042845076374e-05, + "loss": 0.7821, + "step": 53500 + }, + { + "epoch": 0.8589222940978186, + "grad_norm": 0.6364864110946655, + "learning_rate": 3.0501893118879855e-05, + "loss": 0.7612, + "step": 53510 + }, + { + "epoch": 0.8590828103179826, + "grad_norm": 0.9023767709732056, + "learning_rate": 3.0495743042795216e-05, + "loss": 0.8821, + "step": 53520 + }, + { + "epoch": 0.8592433265381467, + "grad_norm": 0.8290342688560486, + "learning_rate": 3.048959261721358e-05, + "loss": 0.921, + "step": 53530 + }, + { + "epoch": 0.8594038427583107, + "grad_norm": 1.1543998718261719, + "learning_rate": 3.0483441842526084e-05, + "loss": 0.961, + "step": 53540 + }, + { + "epoch": 0.8595643589784748, + "grad_norm": 0.9074872732162476, + "learning_rate": 3.0477290719123857e-05, + "loss": 0.8081, + "step": 53550 + }, + { + "epoch": 0.8597248751986388, + "grad_norm": 0.9197531342506409, + "learning_rate": 3.0471139247398094e-05, + "loss": 0.7905, + "step": 53560 + }, + { + "epoch": 0.8598853914188028, + "grad_norm": 0.885453999042511, + "learning_rate": 3.0464987427739995e-05, + "loss": 0.7346, + "step": 53570 + }, + { + "epoch": 0.8600459076389669, + "grad_norm": 0.7627508640289307, + "learning_rate": 3.0458835260540765e-05, + "loss": 0.7643, + "step": 53580 + }, + { + "epoch": 0.8602064238591309, + "grad_norm": 0.915335476398468, + "learning_rate": 3.0452682746191654e-05, + "loss": 0.6986, + "step": 53590 + }, + { + "epoch": 0.860366940079295, + "grad_norm": 0.6940934658050537, + "learning_rate": 3.0446529885083924e-05, + "loss": 0.7907, + "step": 53600 + }, + { + "epoch": 0.860527456299459, + "grad_norm": 0.6832582354545593, + "learning_rate": 3.0440376677608857e-05, + "loss": 0.7566, + "step": 53610 + }, + { + "epoch": 0.8606879725196231, + "grad_norm": 1.4703067541122437, + "learning_rate": 3.0434223124157768e-05, + "loss": 0.756, + "step": 53620 + }, + { + "epoch": 0.8608484887397871, + "grad_norm": 0.7108792066574097, + "learning_rate": 3.042806922512199e-05, + "loss": 0.8147, + "step": 53630 + }, + { + "epoch": 0.8610090049599513, + "grad_norm": 0.5071561932563782, + "learning_rate": 3.0421914980892863e-05, + "loss": 0.7998, + "step": 53640 + }, + { + "epoch": 0.8611695211801152, + "grad_norm": 1.2200303077697754, + "learning_rate": 3.0415760391861763e-05, + "loss": 0.8548, + "step": 53650 + }, + { + "epoch": 0.8613300374002792, + "grad_norm": 1.1981121301651, + "learning_rate": 3.0409605458420094e-05, + "loss": 0.6816, + "step": 53660 + }, + { + "epoch": 0.8614905536204434, + "grad_norm": 0.9166679978370667, + "learning_rate": 3.040345018095927e-05, + "loss": 0.8132, + "step": 53670 + }, + { + "epoch": 0.8616510698406074, + "grad_norm": 0.7152298092842102, + "learning_rate": 3.0397294559870728e-05, + "loss": 0.7247, + "step": 53680 + }, + { + "epoch": 0.8618115860607715, + "grad_norm": 1.1705211400985718, + "learning_rate": 3.0391138595545926e-05, + "loss": 0.6756, + "step": 53690 + }, + { + "epoch": 0.8619721022809355, + "grad_norm": 1.0384658575057983, + "learning_rate": 3.0384982288376364e-05, + "loss": 0.7564, + "step": 53700 + }, + { + "epoch": 0.8621326185010996, + "grad_norm": 0.9779655337333679, + "learning_rate": 3.0378825638753522e-05, + "loss": 0.8018, + "step": 53710 + }, + { + "epoch": 0.8622931347212636, + "grad_norm": 0.7041295170783997, + "learning_rate": 3.0372668647068947e-05, + "loss": 0.8863, + "step": 53720 + }, + { + "epoch": 0.8624536509414277, + "grad_norm": 0.9196216464042664, + "learning_rate": 3.036651131371418e-05, + "loss": 0.7281, + "step": 53730 + }, + { + "epoch": 0.8626141671615917, + "grad_norm": 0.936858057975769, + "learning_rate": 3.036035363908079e-05, + "loss": 0.7926, + "step": 53740 + }, + { + "epoch": 0.8627746833817558, + "grad_norm": 0.7776564359664917, + "learning_rate": 3.035419562356037e-05, + "loss": 0.8142, + "step": 53750 + }, + { + "epoch": 0.8629351996019198, + "grad_norm": 0.7520744800567627, + "learning_rate": 3.0348037267544538e-05, + "loss": 0.8128, + "step": 53760 + }, + { + "epoch": 0.8630957158220838, + "grad_norm": 0.9471583962440491, + "learning_rate": 3.0341878571424925e-05, + "loss": 0.7524, + "step": 53770 + }, + { + "epoch": 0.8632562320422479, + "grad_norm": 0.45883938670158386, + "learning_rate": 3.0335719535593187e-05, + "loss": 0.7214, + "step": 53780 + }, + { + "epoch": 0.8634167482624119, + "grad_norm": 0.8966663479804993, + "learning_rate": 3.0329560160441002e-05, + "loss": 0.7925, + "step": 53790 + }, + { + "epoch": 0.863577264482576, + "grad_norm": 0.7578988671302795, + "learning_rate": 3.0323400446360073e-05, + "loss": 0.8353, + "step": 53800 + }, + { + "epoch": 0.86373778070274, + "grad_norm": 0.5249755382537842, + "learning_rate": 3.0317240393742118e-05, + "loss": 0.7805, + "step": 53810 + }, + { + "epoch": 0.8638982969229041, + "grad_norm": 0.6136336922645569, + "learning_rate": 3.0311080002978893e-05, + "loss": 0.8406, + "step": 53820 + }, + { + "epoch": 0.8640588131430681, + "grad_norm": 0.6970844864845276, + "learning_rate": 3.030491927446214e-05, + "loss": 0.784, + "step": 53830 + }, + { + "epoch": 0.8642193293632322, + "grad_norm": 0.7213741540908813, + "learning_rate": 3.0298758208583666e-05, + "loss": 0.7442, + "step": 53840 + }, + { + "epoch": 0.8643798455833962, + "grad_norm": 0.7757663130760193, + "learning_rate": 3.0292596805735274e-05, + "loss": 0.7815, + "step": 53850 + }, + { + "epoch": 0.8645403618035602, + "grad_norm": 0.7883428335189819, + "learning_rate": 3.0286435066308783e-05, + "loss": 0.8539, + "step": 53860 + }, + { + "epoch": 0.8647008780237243, + "grad_norm": 0.6660919189453125, + "learning_rate": 3.0280272990696046e-05, + "loss": 0.7415, + "step": 53870 + }, + { + "epoch": 0.8648613942438883, + "grad_norm": 0.744312047958374, + "learning_rate": 3.027411057928895e-05, + "loss": 0.7532, + "step": 53880 + }, + { + "epoch": 0.8650219104640524, + "grad_norm": 0.8144683241844177, + "learning_rate": 3.026794783247937e-05, + "loss": 0.7197, + "step": 53890 + }, + { + "epoch": 0.8651824266842164, + "grad_norm": 0.7662053108215332, + "learning_rate": 3.0261784750659228e-05, + "loss": 0.6529, + "step": 53900 + }, + { + "epoch": 0.8653429429043805, + "grad_norm": 0.6334777474403381, + "learning_rate": 3.0255621334220457e-05, + "loss": 0.7898, + "step": 53910 + }, + { + "epoch": 0.8655034591245445, + "grad_norm": 0.6865820288658142, + "learning_rate": 3.0249457583555026e-05, + "loss": 0.8395, + "step": 53920 + }, + { + "epoch": 0.8656639753447086, + "grad_norm": 2.7929654121398926, + "learning_rate": 3.0243293499054892e-05, + "loss": 0.7751, + "step": 53930 + }, + { + "epoch": 0.8658244915648726, + "grad_norm": 1.061813235282898, + "learning_rate": 3.023712908111207e-05, + "loss": 0.8005, + "step": 53940 + }, + { + "epoch": 0.8659850077850367, + "grad_norm": 0.7034225463867188, + "learning_rate": 3.0230964330118583e-05, + "loss": 0.7751, + "step": 53950 + }, + { + "epoch": 0.8661455240052007, + "grad_norm": 0.9611375331878662, + "learning_rate": 3.0224799246466462e-05, + "loss": 0.7029, + "step": 53960 + }, + { + "epoch": 0.8663060402253647, + "grad_norm": 0.9513174295425415, + "learning_rate": 3.0218633830547778e-05, + "loss": 0.8047, + "step": 53970 + }, + { + "epoch": 0.8664665564455288, + "grad_norm": 0.7694123983383179, + "learning_rate": 3.021246808275462e-05, + "loss": 0.8608, + "step": 53980 + }, + { + "epoch": 0.8666270726656928, + "grad_norm": 0.8766696453094482, + "learning_rate": 3.020630200347907e-05, + "loss": 0.6444, + "step": 53990 + }, + { + "epoch": 0.8667875888858569, + "grad_norm": 0.9450390338897705, + "learning_rate": 3.0200135593113278e-05, + "loss": 0.7698, + "step": 54000 + }, + { + "epoch": 0.8669481051060209, + "grad_norm": 1.0339562892913818, + "learning_rate": 3.0193968852049386e-05, + "loss": 0.9418, + "step": 54010 + }, + { + "epoch": 0.867108621326185, + "grad_norm": 0.6756321787834167, + "learning_rate": 3.018780178067956e-05, + "loss": 0.8568, + "step": 54020 + }, + { + "epoch": 0.867269137546349, + "grad_norm": 0.8085231781005859, + "learning_rate": 3.0181634379395983e-05, + "loss": 0.7291, + "step": 54030 + }, + { + "epoch": 0.8674296537665132, + "grad_norm": 0.5480183362960815, + "learning_rate": 3.0175466648590888e-05, + "loss": 0.8317, + "step": 54040 + }, + { + "epoch": 0.8675901699866772, + "grad_norm": 0.8104788661003113, + "learning_rate": 3.016929858865648e-05, + "loss": 0.7831, + "step": 54050 + }, + { + "epoch": 0.8677506862068411, + "grad_norm": 0.6970775723457336, + "learning_rate": 3.016313019998502e-05, + "loss": 0.7981, + "step": 54060 + }, + { + "epoch": 0.8679112024270053, + "grad_norm": 0.8168606162071228, + "learning_rate": 3.0156961482968794e-05, + "loss": 0.7483, + "step": 54070 + }, + { + "epoch": 0.8680717186471693, + "grad_norm": 0.717464804649353, + "learning_rate": 3.0150792438000074e-05, + "loss": 0.7551, + "step": 54080 + }, + { + "epoch": 0.8682322348673334, + "grad_norm": 0.9511051177978516, + "learning_rate": 3.0144623065471194e-05, + "loss": 0.8352, + "step": 54090 + }, + { + "epoch": 0.8683927510874974, + "grad_norm": 0.9647249579429626, + "learning_rate": 3.0138453365774476e-05, + "loss": 0.8631, + "step": 54100 + }, + { + "epoch": 0.8685532673076615, + "grad_norm": 0.7932609915733337, + "learning_rate": 3.0132283339302293e-05, + "loss": 0.8402, + "step": 54110 + }, + { + "epoch": 0.8687137835278255, + "grad_norm": 0.8156803846359253, + "learning_rate": 3.0126112986447004e-05, + "loss": 0.7795, + "step": 54120 + }, + { + "epoch": 0.8688742997479896, + "grad_norm": 1.3748396635055542, + "learning_rate": 3.0119942307601022e-05, + "loss": 0.8675, + "step": 54130 + }, + { + "epoch": 0.8690348159681536, + "grad_norm": 0.8398205637931824, + "learning_rate": 3.011377130315676e-05, + "loss": 0.7282, + "step": 54140 + }, + { + "epoch": 0.8691953321883177, + "grad_norm": 0.6313886642456055, + "learning_rate": 3.0107599973506655e-05, + "loss": 0.8687, + "step": 54150 + }, + { + "epoch": 0.8693558484084817, + "grad_norm": 0.6912403702735901, + "learning_rate": 3.010142831904318e-05, + "loss": 0.8313, + "step": 54160 + }, + { + "epoch": 0.8695163646286457, + "grad_norm": 0.7366620898246765, + "learning_rate": 3.0095256340158795e-05, + "loss": 0.8863, + "step": 54170 + }, + { + "epoch": 0.8696768808488098, + "grad_norm": 1.2344517707824707, + "learning_rate": 3.0089084037246017e-05, + "loss": 0.8391, + "step": 54180 + }, + { + "epoch": 0.8698373970689738, + "grad_norm": 0.685845136642456, + "learning_rate": 3.008291141069736e-05, + "loss": 0.8939, + "step": 54190 + }, + { + "epoch": 0.8699979132891379, + "grad_norm": 0.7630943059921265, + "learning_rate": 3.0076738460905386e-05, + "loss": 0.7058, + "step": 54200 + }, + { + "epoch": 0.8701584295093019, + "grad_norm": 0.6717374920845032, + "learning_rate": 3.0070565188262628e-05, + "loss": 0.7769, + "step": 54210 + }, + { + "epoch": 0.870318945729466, + "grad_norm": 1.0499471426010132, + "learning_rate": 3.006439159316169e-05, + "loss": 0.7703, + "step": 54220 + }, + { + "epoch": 0.87047946194963, + "grad_norm": 1.2943527698516846, + "learning_rate": 3.0058217675995172e-05, + "loss": 0.7996, + "step": 54230 + }, + { + "epoch": 0.8706399781697941, + "grad_norm": 0.9096190929412842, + "learning_rate": 3.0052043437155697e-05, + "loss": 0.8604, + "step": 54240 + }, + { + "epoch": 0.8708004943899581, + "grad_norm": 0.5921663641929626, + "learning_rate": 3.004586887703591e-05, + "loss": 0.7081, + "step": 54250 + }, + { + "epoch": 0.8709610106101221, + "grad_norm": 0.817924976348877, + "learning_rate": 3.0039693996028483e-05, + "loss": 0.7739, + "step": 54260 + }, + { + "epoch": 0.8711215268302862, + "grad_norm": 0.5429435968399048, + "learning_rate": 3.0033518794526095e-05, + "loss": 0.7763, + "step": 54270 + }, + { + "epoch": 0.8712820430504502, + "grad_norm": 0.9112148880958557, + "learning_rate": 3.0027343272921453e-05, + "loss": 0.8481, + "step": 54280 + }, + { + "epoch": 0.8714425592706143, + "grad_norm": 1.253695011138916, + "learning_rate": 3.00211674316073e-05, + "loss": 0.7267, + "step": 54290 + }, + { + "epoch": 0.8716030754907783, + "grad_norm": 0.7258495688438416, + "learning_rate": 3.0014991270976355e-05, + "loss": 0.8235, + "step": 54300 + }, + { + "epoch": 0.8717635917109424, + "grad_norm": 0.6978042721748352, + "learning_rate": 3.0008814791421404e-05, + "loss": 0.8098, + "step": 54310 + }, + { + "epoch": 0.8719241079311064, + "grad_norm": 0.8165655732154846, + "learning_rate": 3.0002637993335237e-05, + "loss": 0.7624, + "step": 54320 + }, + { + "epoch": 0.8720846241512705, + "grad_norm": 0.7742758393287659, + "learning_rate": 2.999646087711065e-05, + "loss": 0.7599, + "step": 54330 + }, + { + "epoch": 0.8722451403714345, + "grad_norm": 0.9067382216453552, + "learning_rate": 2.9990283443140475e-05, + "loss": 0.8171, + "step": 54340 + }, + { + "epoch": 0.8724056565915986, + "grad_norm": 0.5571860671043396, + "learning_rate": 2.998410569181757e-05, + "loss": 0.6478, + "step": 54350 + }, + { + "epoch": 0.8725661728117626, + "grad_norm": 0.8357293009757996, + "learning_rate": 2.997792762353479e-05, + "loss": 0.7032, + "step": 54360 + }, + { + "epoch": 0.8727266890319266, + "grad_norm": 0.782564103603363, + "learning_rate": 2.9971749238685032e-05, + "loss": 0.6255, + "step": 54370 + }, + { + "epoch": 0.8728872052520907, + "grad_norm": 1.0792485475540161, + "learning_rate": 2.99655705376612e-05, + "loss": 0.7934, + "step": 54380 + }, + { + "epoch": 0.8730477214722547, + "grad_norm": 2.3469326496124268, + "learning_rate": 2.9959391520856238e-05, + "loss": 0.7281, + "step": 54390 + }, + { + "epoch": 0.8732082376924188, + "grad_norm": 0.7498217225074768, + "learning_rate": 2.9953212188663076e-05, + "loss": 0.7406, + "step": 54400 + }, + { + "epoch": 0.8733687539125828, + "grad_norm": 0.5835211873054504, + "learning_rate": 2.9947032541474696e-05, + "loss": 0.7371, + "step": 54410 + }, + { + "epoch": 0.873529270132747, + "grad_norm": 1.420947790145874, + "learning_rate": 2.9940852579684076e-05, + "loss": 0.7177, + "step": 54420 + }, + { + "epoch": 0.873689786352911, + "grad_norm": 1.082679033279419, + "learning_rate": 2.9934672303684236e-05, + "loss": 0.6564, + "step": 54430 + }, + { + "epoch": 0.873850302573075, + "grad_norm": 0.7364447712898254, + "learning_rate": 2.99284917138682e-05, + "loss": 0.7925, + "step": 54440 + }, + { + "epoch": 0.874010818793239, + "grad_norm": 1.051896095275879, + "learning_rate": 2.9922310810629023e-05, + "loss": 0.7798, + "step": 54450 + }, + { + "epoch": 0.874171335013403, + "grad_norm": 0.6389321684837341, + "learning_rate": 2.991612959435976e-05, + "loss": 0.7022, + "step": 54460 + }, + { + "epoch": 0.8743318512335672, + "grad_norm": 1.379365086555481, + "learning_rate": 2.9909948065453513e-05, + "loss": 0.7232, + "step": 54470 + }, + { + "epoch": 0.8744923674537312, + "grad_norm": 1.099120855331421, + "learning_rate": 2.9903766224303398e-05, + "loss": 0.7696, + "step": 54480 + }, + { + "epoch": 0.8746528836738953, + "grad_norm": 1.2715474367141724, + "learning_rate": 2.989758407130252e-05, + "loss": 0.9463, + "step": 54490 + }, + { + "epoch": 0.8748133998940593, + "grad_norm": 1.6890895366668701, + "learning_rate": 2.9891401606844045e-05, + "loss": 0.7857, + "step": 54500 + }, + { + "epoch": 0.8749739161142234, + "grad_norm": 0.6474032402038574, + "learning_rate": 2.9885218831321142e-05, + "loss": 0.72, + "step": 54510 + }, + { + "epoch": 0.8751344323343874, + "grad_norm": 0.8824485540390015, + "learning_rate": 2.9879035745126988e-05, + "loss": 0.6996, + "step": 54520 + }, + { + "epoch": 0.8752949485545515, + "grad_norm": 0.8529213666915894, + "learning_rate": 2.98728523486548e-05, + "loss": 0.7532, + "step": 54530 + }, + { + "epoch": 0.8754554647747155, + "grad_norm": 0.5974135994911194, + "learning_rate": 2.986666864229781e-05, + "loss": 0.8062, + "step": 54540 + }, + { + "epoch": 0.8756159809948796, + "grad_norm": 0.6701919436454773, + "learning_rate": 2.9860484626449247e-05, + "loss": 0.8234, + "step": 54550 + }, + { + "epoch": 0.8757764972150436, + "grad_norm": 0.612483024597168, + "learning_rate": 2.9854300301502397e-05, + "loss": 0.7054, + "step": 54560 + }, + { + "epoch": 0.8759370134352076, + "grad_norm": 0.9075747728347778, + "learning_rate": 2.984811566785054e-05, + "loss": 0.7909, + "step": 54570 + }, + { + "epoch": 0.8760975296553717, + "grad_norm": 0.8759452104568481, + "learning_rate": 2.984193072588698e-05, + "loss": 0.7487, + "step": 54580 + }, + { + "epoch": 0.8762580458755357, + "grad_norm": 0.8577710390090942, + "learning_rate": 2.9835745476005038e-05, + "loss": 0.8202, + "step": 54590 + }, + { + "epoch": 0.8764185620956998, + "grad_norm": 0.6678754687309265, + "learning_rate": 2.982955991859808e-05, + "loss": 0.769, + "step": 54600 + }, + { + "epoch": 0.8765790783158638, + "grad_norm": 2.7914321422576904, + "learning_rate": 2.982337405405945e-05, + "loss": 0.8057, + "step": 54610 + }, + { + "epoch": 0.8767395945360279, + "grad_norm": 0.8774573802947998, + "learning_rate": 2.9817187882782543e-05, + "loss": 0.7363, + "step": 54620 + }, + { + "epoch": 0.8769001107561919, + "grad_norm": 1.4167429208755493, + "learning_rate": 2.9811001405160764e-05, + "loss": 0.7653, + "step": 54630 + }, + { + "epoch": 0.877060626976356, + "grad_norm": 0.6862244009971619, + "learning_rate": 2.980481462158753e-05, + "loss": 0.8188, + "step": 54640 + }, + { + "epoch": 0.87722114319652, + "grad_norm": 0.637752115726471, + "learning_rate": 2.979862753245628e-05, + "loss": 0.7246, + "step": 54650 + }, + { + "epoch": 0.877381659416684, + "grad_norm": 0.7585854530334473, + "learning_rate": 2.97924401381605e-05, + "loss": 0.82, + "step": 54660 + }, + { + "epoch": 0.8775421756368481, + "grad_norm": 0.9364258050918579, + "learning_rate": 2.9786252439093642e-05, + "loss": 0.7701, + "step": 54670 + }, + { + "epoch": 0.8777026918570121, + "grad_norm": 0.7340432405471802, + "learning_rate": 2.9780064435649226e-05, + "loss": 0.7951, + "step": 54680 + }, + { + "epoch": 0.8778632080771762, + "grad_norm": 0.8069435358047485, + "learning_rate": 2.977387612822077e-05, + "loss": 0.693, + "step": 54690 + }, + { + "epoch": 0.8780237242973402, + "grad_norm": 0.8634735345840454, + "learning_rate": 2.9767687517201808e-05, + "loss": 0.8159, + "step": 54700 + }, + { + "epoch": 0.8781842405175043, + "grad_norm": 0.619355320930481, + "learning_rate": 2.9761498602985903e-05, + "loss": 0.7045, + "step": 54710 + }, + { + "epoch": 0.8783447567376683, + "grad_norm": 0.867229700088501, + "learning_rate": 2.975530938596664e-05, + "loss": 0.7075, + "step": 54720 + }, + { + "epoch": 0.8785052729578324, + "grad_norm": 0.8175614476203918, + "learning_rate": 2.9749119866537607e-05, + "loss": 0.8059, + "step": 54730 + }, + { + "epoch": 0.8786657891779964, + "grad_norm": 1.1296448707580566, + "learning_rate": 2.974293004509242e-05, + "loss": 0.6794, + "step": 54740 + }, + { + "epoch": 0.8788263053981605, + "grad_norm": 0.6046215295791626, + "learning_rate": 2.9736739922024724e-05, + "loss": 0.8434, + "step": 54750 + }, + { + "epoch": 0.8789868216183245, + "grad_norm": 0.7857196927070618, + "learning_rate": 2.9730549497728178e-05, + "loss": 0.8668, + "step": 54760 + }, + { + "epoch": 0.8791473378384885, + "grad_norm": 1.238686442375183, + "learning_rate": 2.972435877259644e-05, + "loss": 0.7482, + "step": 54770 + }, + { + "epoch": 0.8793078540586526, + "grad_norm": 2.098641872406006, + "learning_rate": 2.9718167747023218e-05, + "loss": 0.8289, + "step": 54780 + }, + { + "epoch": 0.8794683702788166, + "grad_norm": 1.1863516569137573, + "learning_rate": 2.9711976421402222e-05, + "loss": 0.7305, + "step": 54790 + }, + { + "epoch": 0.8796288864989807, + "grad_norm": 0.6331625580787659, + "learning_rate": 2.9705784796127177e-05, + "loss": 0.7462, + "step": 54800 + }, + { + "epoch": 0.8797894027191447, + "grad_norm": 0.5815594792366028, + "learning_rate": 2.9699592871591842e-05, + "loss": 0.7189, + "step": 54810 + }, + { + "epoch": 0.8799499189393089, + "grad_norm": 0.5539968609809875, + "learning_rate": 2.9693400648189984e-05, + "loss": 0.7718, + "step": 54820 + }, + { + "epoch": 0.8801104351594728, + "grad_norm": 0.556820809841156, + "learning_rate": 2.9687208126315392e-05, + "loss": 0.7232, + "step": 54830 + }, + { + "epoch": 0.880270951379637, + "grad_norm": 0.6116346716880798, + "learning_rate": 2.9681015306361877e-05, + "loss": 0.7061, + "step": 54840 + }, + { + "epoch": 0.880431467599801, + "grad_norm": 0.6012527346611023, + "learning_rate": 2.967482218872326e-05, + "loss": 0.6688, + "step": 54850 + }, + { + "epoch": 0.8805919838199651, + "grad_norm": 0.5815205574035645, + "learning_rate": 2.966862877379339e-05, + "loss": 0.7378, + "step": 54860 + }, + { + "epoch": 0.8807525000401291, + "grad_norm": 1.1368191242218018, + "learning_rate": 2.9662435061966133e-05, + "loss": 0.7288, + "step": 54870 + }, + { + "epoch": 0.8809130162602931, + "grad_norm": 0.6661084890365601, + "learning_rate": 2.9656241053635374e-05, + "loss": 0.8143, + "step": 54880 + }, + { + "epoch": 0.8810735324804572, + "grad_norm": 0.9112212657928467, + "learning_rate": 2.965004674919501e-05, + "loss": 0.7383, + "step": 54890 + }, + { + "epoch": 0.8812340487006212, + "grad_norm": 3.0297467708587646, + "learning_rate": 2.9643852149038965e-05, + "loss": 0.7929, + "step": 54900 + }, + { + "epoch": 0.8813945649207853, + "grad_norm": 1.03333580493927, + "learning_rate": 2.9637657253561185e-05, + "loss": 0.85, + "step": 54910 + }, + { + "epoch": 0.8815550811409493, + "grad_norm": 0.6853891611099243, + "learning_rate": 2.963146206315562e-05, + "loss": 0.8758, + "step": 54920 + }, + { + "epoch": 0.8817155973611134, + "grad_norm": 0.934198796749115, + "learning_rate": 2.962526657821625e-05, + "loss": 0.7563, + "step": 54930 + }, + { + "epoch": 0.8818761135812774, + "grad_norm": 0.9415134191513062, + "learning_rate": 2.9619070799137077e-05, + "loss": 0.8617, + "step": 54940 + }, + { + "epoch": 0.8820366298014415, + "grad_norm": 0.8805731534957886, + "learning_rate": 2.9612874726312112e-05, + "loss": 0.8052, + "step": 54950 + }, + { + "epoch": 0.8821971460216055, + "grad_norm": 0.5368466973304749, + "learning_rate": 2.9606678360135386e-05, + "loss": 0.8421, + "step": 54960 + }, + { + "epoch": 0.8823576622417695, + "grad_norm": 0.8102704882621765, + "learning_rate": 2.960048170100096e-05, + "loss": 0.847, + "step": 54970 + }, + { + "epoch": 0.8825181784619336, + "grad_norm": 0.870502769947052, + "learning_rate": 2.9594284749302897e-05, + "loss": 0.7108, + "step": 54980 + }, + { + "epoch": 0.8826786946820976, + "grad_norm": 1.2249772548675537, + "learning_rate": 2.9588087505435286e-05, + "loss": 0.7511, + "step": 54990 + }, + { + "epoch": 0.8828392109022617, + "grad_norm": 0.7090245485305786, + "learning_rate": 2.9581889969792242e-05, + "loss": 0.689, + "step": 55000 + }, + { + "epoch": 0.8829997271224257, + "grad_norm": 1.1764757633209229, + "learning_rate": 2.957569214276789e-05, + "loss": 0.7237, + "step": 55010 + }, + { + "epoch": 0.8831602433425898, + "grad_norm": 0.7465420961380005, + "learning_rate": 2.9569494024756377e-05, + "loss": 0.6603, + "step": 55020 + }, + { + "epoch": 0.8833207595627538, + "grad_norm": 0.7763564586639404, + "learning_rate": 2.956329561615186e-05, + "loss": 0.7783, + "step": 55030 + }, + { + "epoch": 0.8834812757829179, + "grad_norm": 1.8095778226852417, + "learning_rate": 2.9557096917348524e-05, + "loss": 0.8062, + "step": 55040 + }, + { + "epoch": 0.8836417920030819, + "grad_norm": 1.4012575149536133, + "learning_rate": 2.955089792874057e-05, + "loss": 0.7049, + "step": 55050 + }, + { + "epoch": 0.883802308223246, + "grad_norm": 1.4094905853271484, + "learning_rate": 2.9544698650722226e-05, + "loss": 0.7588, + "step": 55060 + }, + { + "epoch": 0.88396282444341, + "grad_norm": 0.9833947420120239, + "learning_rate": 2.953849908368772e-05, + "loss": 0.7066, + "step": 55070 + }, + { + "epoch": 0.884123340663574, + "grad_norm": 0.636940062046051, + "learning_rate": 2.9532299228031307e-05, + "loss": 0.7029, + "step": 55080 + }, + { + "epoch": 0.8842838568837381, + "grad_norm": 0.6080500483512878, + "learning_rate": 2.9526099084147267e-05, + "loss": 0.829, + "step": 55090 + }, + { + "epoch": 0.8844443731039021, + "grad_norm": 0.7981950640678406, + "learning_rate": 2.9519898652429895e-05, + "loss": 0.828, + "step": 55100 + }, + { + "epoch": 0.8846048893240662, + "grad_norm": 0.8864955902099609, + "learning_rate": 2.9513697933273492e-05, + "loss": 0.7377, + "step": 55110 + }, + { + "epoch": 0.8847654055442302, + "grad_norm": 0.825844407081604, + "learning_rate": 2.9507496927072398e-05, + "loss": 0.7705, + "step": 55120 + }, + { + "epoch": 0.8849259217643943, + "grad_norm": 0.6086444854736328, + "learning_rate": 2.9501295634220955e-05, + "loss": 0.7154, + "step": 55130 + }, + { + "epoch": 0.8850864379845583, + "grad_norm": 0.9117220640182495, + "learning_rate": 2.9495094055113525e-05, + "loss": 0.7135, + "step": 55140 + }, + { + "epoch": 0.8852469542047224, + "grad_norm": 1.0303317308425903, + "learning_rate": 2.94888921901445e-05, + "loss": 0.8409, + "step": 55150 + }, + { + "epoch": 0.8854074704248864, + "grad_norm": 1.9159226417541504, + "learning_rate": 2.9482690039708282e-05, + "loss": 0.8544, + "step": 55160 + }, + { + "epoch": 0.8855679866450504, + "grad_norm": 0.6345512866973877, + "learning_rate": 2.9476487604199283e-05, + "loss": 0.7722, + "step": 55170 + }, + { + "epoch": 0.8857285028652145, + "grad_norm": 1.742567539215088, + "learning_rate": 2.9470284884011944e-05, + "loss": 0.7453, + "step": 55180 + }, + { + "epoch": 0.8858890190853785, + "grad_norm": 0.9668522477149963, + "learning_rate": 2.9464081879540733e-05, + "loss": 0.8138, + "step": 55190 + }, + { + "epoch": 0.8860495353055426, + "grad_norm": 2.637559652328491, + "learning_rate": 2.945787859118011e-05, + "loss": 0.7346, + "step": 55200 + }, + { + "epoch": 0.8862100515257066, + "grad_norm": 0.6865595579147339, + "learning_rate": 2.9451675019324575e-05, + "loss": 0.7829, + "step": 55210 + }, + { + "epoch": 0.8863705677458708, + "grad_norm": 0.9243323802947998, + "learning_rate": 2.9445471164368643e-05, + "loss": 0.7441, + "step": 55220 + }, + { + "epoch": 0.8865310839660347, + "grad_norm": 0.48731958866119385, + "learning_rate": 2.9439267026706828e-05, + "loss": 0.7088, + "step": 55230 + }, + { + "epoch": 0.8866916001861989, + "grad_norm": 0.9612644910812378, + "learning_rate": 2.9433062606733692e-05, + "loss": 0.7641, + "step": 55240 + }, + { + "epoch": 0.8868521164063629, + "grad_norm": 0.5129079222679138, + "learning_rate": 2.942685790484379e-05, + "loss": 0.6259, + "step": 55250 + }, + { + "epoch": 0.887012632626527, + "grad_norm": 0.731831967830658, + "learning_rate": 2.9420652921431713e-05, + "loss": 0.8532, + "step": 55260 + }, + { + "epoch": 0.887173148846691, + "grad_norm": 1.0540353059768677, + "learning_rate": 2.941444765689205e-05, + "loss": 0.7413, + "step": 55270 + }, + { + "epoch": 0.887333665066855, + "grad_norm": 0.6840178370475769, + "learning_rate": 2.9408242111619426e-05, + "loss": 0.697, + "step": 55280 + }, + { + "epoch": 0.8874941812870191, + "grad_norm": 0.4916870594024658, + "learning_rate": 2.940203628600849e-05, + "loss": 0.8774, + "step": 55290 + }, + { + "epoch": 0.8876546975071831, + "grad_norm": 0.6857337355613708, + "learning_rate": 2.939583018045387e-05, + "loss": 0.7068, + "step": 55300 + }, + { + "epoch": 0.8878152137273472, + "grad_norm": 0.8205177783966064, + "learning_rate": 2.9389623795350253e-05, + "loss": 0.7707, + "step": 55310 + }, + { + "epoch": 0.8879757299475112, + "grad_norm": 0.7818800210952759, + "learning_rate": 2.9383417131092332e-05, + "loss": 0.6625, + "step": 55320 + }, + { + "epoch": 0.8881362461676753, + "grad_norm": 0.7578741908073425, + "learning_rate": 2.937721018807481e-05, + "loss": 0.7089, + "step": 55330 + }, + { + "epoch": 0.8882967623878393, + "grad_norm": 0.6506258845329285, + "learning_rate": 2.9371002966692407e-05, + "loss": 0.7908, + "step": 55340 + }, + { + "epoch": 0.8884572786080034, + "grad_norm": 0.48025983572006226, + "learning_rate": 2.936479546733987e-05, + "loss": 0.6931, + "step": 55350 + }, + { + "epoch": 0.8886177948281674, + "grad_norm": 0.552994966506958, + "learning_rate": 2.9358587690411966e-05, + "loss": 0.7809, + "step": 55360 + }, + { + "epoch": 0.8887783110483314, + "grad_norm": 0.6862017512321472, + "learning_rate": 2.9352379636303462e-05, + "loss": 0.7885, + "step": 55370 + }, + { + "epoch": 0.8889388272684955, + "grad_norm": 0.9933024644851685, + "learning_rate": 2.9346171305409165e-05, + "loss": 0.7933, + "step": 55380 + }, + { + "epoch": 0.8890993434886595, + "grad_norm": 0.6007983088493347, + "learning_rate": 2.9339962698123874e-05, + "loss": 0.8021, + "step": 55390 + }, + { + "epoch": 0.8892598597088236, + "grad_norm": 0.6155750751495361, + "learning_rate": 2.9333753814842434e-05, + "loss": 0.7736, + "step": 55400 + }, + { + "epoch": 0.8894203759289876, + "grad_norm": 0.5435142517089844, + "learning_rate": 2.932754465595969e-05, + "loss": 0.6022, + "step": 55410 + }, + { + "epoch": 0.8895808921491517, + "grad_norm": 0.7675585746765137, + "learning_rate": 2.9321335221870498e-05, + "loss": 0.8063, + "step": 55420 + }, + { + "epoch": 0.8897414083693157, + "grad_norm": 0.8157670497894287, + "learning_rate": 2.9315125512969755e-05, + "loss": 0.879, + "step": 55430 + }, + { + "epoch": 0.8899019245894798, + "grad_norm": 1.2049884796142578, + "learning_rate": 2.930891552965236e-05, + "loss": 0.7852, + "step": 55440 + }, + { + "epoch": 0.8900624408096438, + "grad_norm": 0.5972486734390259, + "learning_rate": 2.9302705272313225e-05, + "loss": 0.7901, + "step": 55450 + }, + { + "epoch": 0.8902229570298079, + "grad_norm": 1.2322165966033936, + "learning_rate": 2.9296494741347284e-05, + "loss": 0.8876, + "step": 55460 + }, + { + "epoch": 0.8903834732499719, + "grad_norm": 0.6592298746109009, + "learning_rate": 2.9290283937149504e-05, + "loss": 0.7824, + "step": 55470 + }, + { + "epoch": 0.8905439894701359, + "grad_norm": 0.6766608357429504, + "learning_rate": 2.9284072860114842e-05, + "loss": 0.7286, + "step": 55480 + }, + { + "epoch": 0.8907045056903, + "grad_norm": 0.6338342428207397, + "learning_rate": 2.9277861510638288e-05, + "loss": 0.7809, + "step": 55490 + }, + { + "epoch": 0.890865021910464, + "grad_norm": 0.6962128281593323, + "learning_rate": 2.9271649889114862e-05, + "loss": 0.8036, + "step": 55500 + }, + { + "epoch": 0.8910255381306281, + "grad_norm": 1.3017244338989258, + "learning_rate": 2.926543799593956e-05, + "loss": 0.8323, + "step": 55510 + }, + { + "epoch": 0.8911860543507921, + "grad_norm": 0.667212188243866, + "learning_rate": 2.925922583150744e-05, + "loss": 0.7506, + "step": 55520 + }, + { + "epoch": 0.8913465705709562, + "grad_norm": 0.6036951541900635, + "learning_rate": 2.9253013396213568e-05, + "loss": 0.8056, + "step": 55530 + }, + { + "epoch": 0.8915070867911202, + "grad_norm": 0.8571463823318481, + "learning_rate": 2.9246800690452997e-05, + "loss": 0.7308, + "step": 55540 + }, + { + "epoch": 0.8916676030112843, + "grad_norm": 1.376457929611206, + "learning_rate": 2.9240587714620827e-05, + "loss": 0.7924, + "step": 55550 + }, + { + "epoch": 0.8918281192314483, + "grad_norm": 0.74739009141922, + "learning_rate": 2.923437446911217e-05, + "loss": 0.9089, + "step": 55560 + }, + { + "epoch": 0.8919886354516123, + "grad_norm": 1.1385629177093506, + "learning_rate": 2.922816095432216e-05, + "loss": 0.6223, + "step": 55570 + }, + { + "epoch": 0.8921491516717764, + "grad_norm": 1.1640018224716187, + "learning_rate": 2.9221947170645915e-05, + "loss": 1.0198, + "step": 55580 + }, + { + "epoch": 0.8923096678919404, + "grad_norm": 0.7718788981437683, + "learning_rate": 2.9215733118478617e-05, + "loss": 0.8379, + "step": 55590 + }, + { + "epoch": 0.8924701841121045, + "grad_norm": 0.9593631625175476, + "learning_rate": 2.920951879821544e-05, + "loss": 0.79, + "step": 55600 + }, + { + "epoch": 0.8926307003322685, + "grad_norm": 0.7285793423652649, + "learning_rate": 2.9203304210251568e-05, + "loss": 0.6654, + "step": 55610 + }, + { + "epoch": 0.8927912165524327, + "grad_norm": 0.6157375574111938, + "learning_rate": 2.919708935498222e-05, + "loss": 0.8657, + "step": 55620 + }, + { + "epoch": 0.8929517327725967, + "grad_norm": 0.9659951329231262, + "learning_rate": 2.9190874232802622e-05, + "loss": 0.7965, + "step": 55630 + }, + { + "epoch": 0.8931122489927608, + "grad_norm": 0.693231463432312, + "learning_rate": 2.918465884410803e-05, + "loss": 0.7186, + "step": 55640 + }, + { + "epoch": 0.8932727652129248, + "grad_norm": 1.0421477556228638, + "learning_rate": 2.9178443189293692e-05, + "loss": 0.7994, + "step": 55650 + }, + { + "epoch": 0.8934332814330889, + "grad_norm": 0.904953122138977, + "learning_rate": 2.9172227268754886e-05, + "loss": 0.7435, + "step": 55660 + }, + { + "epoch": 0.8935937976532529, + "grad_norm": 0.7911776304244995, + "learning_rate": 2.9166011082886922e-05, + "loss": 0.657, + "step": 55670 + }, + { + "epoch": 0.8937543138734169, + "grad_norm": 0.8534031510353088, + "learning_rate": 2.91597946320851e-05, + "loss": 0.6876, + "step": 55680 + }, + { + "epoch": 0.893914830093581, + "grad_norm": 0.6149693131446838, + "learning_rate": 2.9153577916744764e-05, + "loss": 0.6486, + "step": 55690 + }, + { + "epoch": 0.894075346313745, + "grad_norm": 0.945487916469574, + "learning_rate": 2.9147360937261243e-05, + "loss": 0.7445, + "step": 55700 + }, + { + "epoch": 0.8942358625339091, + "grad_norm": 0.7507523894309998, + "learning_rate": 2.914114369402991e-05, + "loss": 0.7586, + "step": 55710 + }, + { + "epoch": 0.8943963787540731, + "grad_norm": 0.6620413661003113, + "learning_rate": 2.9134926187446147e-05, + "loss": 0.7192, + "step": 55720 + }, + { + "epoch": 0.8945568949742372, + "grad_norm": 0.7737084031105042, + "learning_rate": 2.9128708417905344e-05, + "loss": 0.7819, + "step": 55730 + }, + { + "epoch": 0.8947174111944012, + "grad_norm": 0.9162411689758301, + "learning_rate": 2.9122490385802918e-05, + "loss": 0.7193, + "step": 55740 + }, + { + "epoch": 0.8948779274145653, + "grad_norm": 1.3730254173278809, + "learning_rate": 2.9116272091534308e-05, + "loss": 0.7294, + "step": 55750 + }, + { + "epoch": 0.8950384436347293, + "grad_norm": 1.0364972352981567, + "learning_rate": 2.9110053535494947e-05, + "loss": 0.7664, + "step": 55760 + }, + { + "epoch": 0.8951989598548933, + "grad_norm": 0.728260338306427, + "learning_rate": 2.9103834718080303e-05, + "loss": 0.6897, + "step": 55770 + }, + { + "epoch": 0.8953594760750574, + "grad_norm": 0.9908638596534729, + "learning_rate": 2.9097615639685867e-05, + "loss": 0.8109, + "step": 55780 + }, + { + "epoch": 0.8955199922952214, + "grad_norm": 0.8533222079277039, + "learning_rate": 2.9091396300707118e-05, + "loss": 0.7485, + "step": 55790 + }, + { + "epoch": 0.8956805085153855, + "grad_norm": 0.8203845620155334, + "learning_rate": 2.908517670153958e-05, + "loss": 0.6447, + "step": 55800 + }, + { + "epoch": 0.8958410247355495, + "grad_norm": 1.025429606437683, + "learning_rate": 2.9078956842578787e-05, + "loss": 0.7533, + "step": 55810 + }, + { + "epoch": 0.8960015409557136, + "grad_norm": 0.569598913192749, + "learning_rate": 2.9072736724220274e-05, + "loss": 0.8224, + "step": 55820 + }, + { + "epoch": 0.8961620571758776, + "grad_norm": 0.5767167210578918, + "learning_rate": 2.9066516346859606e-05, + "loss": 0.8225, + "step": 55830 + }, + { + "epoch": 0.8963225733960417, + "grad_norm": 0.7147471308708191, + "learning_rate": 2.906029571089237e-05, + "loss": 0.662, + "step": 55840 + }, + { + "epoch": 0.8964830896162057, + "grad_norm": 0.6568450927734375, + "learning_rate": 2.905407481671416e-05, + "loss": 0.6996, + "step": 55850 + }, + { + "epoch": 0.8966436058363698, + "grad_norm": 0.6515069603919983, + "learning_rate": 2.9047853664720586e-05, + "loss": 0.8954, + "step": 55860 + }, + { + "epoch": 0.8968041220565338, + "grad_norm": 0.8269568085670471, + "learning_rate": 2.9041632255307278e-05, + "loss": 0.6861, + "step": 55870 + }, + { + "epoch": 0.8969646382766978, + "grad_norm": 0.9762548208236694, + "learning_rate": 2.9035410588869882e-05, + "loss": 0.7806, + "step": 55880 + }, + { + "epoch": 0.8971251544968619, + "grad_norm": 0.5158499479293823, + "learning_rate": 2.902918866580406e-05, + "loss": 0.8141, + "step": 55890 + }, + { + "epoch": 0.8972856707170259, + "grad_norm": 0.8623389601707458, + "learning_rate": 2.9022966486505476e-05, + "loss": 0.7936, + "step": 55900 + }, + { + "epoch": 0.89744618693719, + "grad_norm": 0.7569959759712219, + "learning_rate": 2.9016744051369848e-05, + "loss": 0.8257, + "step": 55910 + }, + { + "epoch": 0.897606703157354, + "grad_norm": 0.698162317276001, + "learning_rate": 2.901052136079287e-05, + "loss": 0.7984, + "step": 55920 + }, + { + "epoch": 0.8977672193775181, + "grad_norm": 0.7354707717895508, + "learning_rate": 2.9004298415170272e-05, + "loss": 0.6941, + "step": 55930 + }, + { + "epoch": 0.8979277355976821, + "grad_norm": 0.6649985909461975, + "learning_rate": 2.8998075214897796e-05, + "loss": 0.8809, + "step": 55940 + }, + { + "epoch": 0.8980882518178462, + "grad_norm": 0.7123598456382751, + "learning_rate": 2.899185176037121e-05, + "loss": 0.7772, + "step": 55950 + }, + { + "epoch": 0.8982487680380102, + "grad_norm": 0.833511471748352, + "learning_rate": 2.8985628051986273e-05, + "loss": 0.8007, + "step": 55960 + }, + { + "epoch": 0.8984092842581742, + "grad_norm": 0.5682579278945923, + "learning_rate": 2.897940409013879e-05, + "loss": 0.7437, + "step": 55970 + }, + { + "epoch": 0.8985698004783383, + "grad_norm": 0.707038164138794, + "learning_rate": 2.897317987522456e-05, + "loss": 0.6655, + "step": 55980 + }, + { + "epoch": 0.8987303166985023, + "grad_norm": 0.8858893513679504, + "learning_rate": 2.8966955407639413e-05, + "loss": 0.853, + "step": 55990 + }, + { + "epoch": 0.8988908329186664, + "grad_norm": 0.834848940372467, + "learning_rate": 2.896073068777919e-05, + "loss": 0.8345, + "step": 56000 + }, + { + "epoch": 0.8988908329186664, + "eval_loss": 0.7777735590934753, + "eval_runtime": 1833.3451, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 1.789, + "step": 56000 + }, + { + "epoch": 0.8990513491388304, + "grad_norm": 0.5536409616470337, + "learning_rate": 2.8954505716039738e-05, + "loss": 0.7571, + "step": 56010 + }, + { + "epoch": 0.8992118653589946, + "grad_norm": 0.6410654187202454, + "learning_rate": 2.8948280492816932e-05, + "loss": 0.8234, + "step": 56020 + }, + { + "epoch": 0.8993723815791586, + "grad_norm": 0.6657175421714783, + "learning_rate": 2.8942055018506662e-05, + "loss": 0.7672, + "step": 56030 + }, + { + "epoch": 0.8995328977993227, + "grad_norm": 0.7635987401008606, + "learning_rate": 2.893582929350483e-05, + "loss": 0.8557, + "step": 56040 + }, + { + "epoch": 0.8996934140194867, + "grad_norm": 1.0316094160079956, + "learning_rate": 2.8929603318207354e-05, + "loss": 0.7938, + "step": 56050 + }, + { + "epoch": 0.8998539302396508, + "grad_norm": 0.6131860017776489, + "learning_rate": 2.892337709301018e-05, + "loss": 0.8695, + "step": 56060 + }, + { + "epoch": 0.9000144464598148, + "grad_norm": 0.8752529621124268, + "learning_rate": 2.891715061830924e-05, + "loss": 0.7832, + "step": 56070 + }, + { + "epoch": 0.9001749626799788, + "grad_norm": 1.1470189094543457, + "learning_rate": 2.8910923894500512e-05, + "loss": 0.7139, + "step": 56080 + }, + { + "epoch": 0.9003354789001429, + "grad_norm": 0.6997299194335938, + "learning_rate": 2.8904696921979985e-05, + "loss": 0.7491, + "step": 56090 + }, + { + "epoch": 0.9004959951203069, + "grad_norm": 0.5490342974662781, + "learning_rate": 2.8898469701143655e-05, + "loss": 0.6859, + "step": 56100 + }, + { + "epoch": 0.900656511340471, + "grad_norm": 0.7116551995277405, + "learning_rate": 2.8892242232387524e-05, + "loss": 0.7668, + "step": 56110 + }, + { + "epoch": 0.900817027560635, + "grad_norm": 0.8648437261581421, + "learning_rate": 2.888601451610764e-05, + "loss": 0.7574, + "step": 56120 + }, + { + "epoch": 0.9009775437807991, + "grad_norm": 0.8268005847930908, + "learning_rate": 2.887978655270004e-05, + "loss": 0.8298, + "step": 56130 + }, + { + "epoch": 0.9011380600009631, + "grad_norm": 0.6222602725028992, + "learning_rate": 2.8873558342560787e-05, + "loss": 0.8125, + "step": 56140 + }, + { + "epoch": 0.9012985762211272, + "grad_norm": 0.7946728467941284, + "learning_rate": 2.8867329886085953e-05, + "loss": 0.6801, + "step": 56150 + }, + { + "epoch": 0.9014590924412912, + "grad_norm": 1.127439022064209, + "learning_rate": 2.8861101183671647e-05, + "loss": 0.7739, + "step": 56160 + }, + { + "epoch": 0.9016196086614552, + "grad_norm": 0.9533993005752563, + "learning_rate": 2.885487223571396e-05, + "loss": 0.6832, + "step": 56170 + }, + { + "epoch": 0.9017801248816193, + "grad_norm": 0.7524852752685547, + "learning_rate": 2.8848643042609025e-05, + "loss": 0.7294, + "step": 56180 + }, + { + "epoch": 0.9019406411017833, + "grad_norm": 0.8052418231964111, + "learning_rate": 2.884241360475299e-05, + "loss": 0.7819, + "step": 56190 + }, + { + "epoch": 0.9021011573219474, + "grad_norm": 1.0951186418533325, + "learning_rate": 2.8836183922541994e-05, + "loss": 0.7234, + "step": 56200 + }, + { + "epoch": 0.9022616735421114, + "grad_norm": 0.8550351858139038, + "learning_rate": 2.8829953996372216e-05, + "loss": 0.7762, + "step": 56210 + }, + { + "epoch": 0.9024221897622755, + "grad_norm": 0.5600941181182861, + "learning_rate": 2.8823723826639853e-05, + "loss": 0.718, + "step": 56220 + }, + { + "epoch": 0.9025827059824395, + "grad_norm": 0.8505851626396179, + "learning_rate": 2.881749341374109e-05, + "loss": 0.6437, + "step": 56230 + }, + { + "epoch": 0.9027432222026036, + "grad_norm": 0.9829393029212952, + "learning_rate": 2.8811262758072156e-05, + "loss": 0.8192, + "step": 56240 + }, + { + "epoch": 0.9029037384227676, + "grad_norm": 0.7253702878952026, + "learning_rate": 2.8805031860029273e-05, + "loss": 0.7003, + "step": 56250 + }, + { + "epoch": 0.9030642546429317, + "grad_norm": 1.3885780572891235, + "learning_rate": 2.8798800720008708e-05, + "loss": 0.9196, + "step": 56260 + }, + { + "epoch": 0.9032247708630957, + "grad_norm": 0.6217785477638245, + "learning_rate": 2.8792569338406706e-05, + "loss": 0.7301, + "step": 56270 + }, + { + "epoch": 0.9033852870832597, + "grad_norm": 0.5908799171447754, + "learning_rate": 2.8786337715619553e-05, + "loss": 0.7498, + "step": 56280 + }, + { + "epoch": 0.9035458033034238, + "grad_norm": 1.1797338724136353, + "learning_rate": 2.878010585204355e-05, + "loss": 0.755, + "step": 56290 + }, + { + "epoch": 0.9037063195235878, + "grad_norm": 0.7818145155906677, + "learning_rate": 2.8773873748074998e-05, + "loss": 0.7824, + "step": 56300 + }, + { + "epoch": 0.9038668357437519, + "grad_norm": 0.7029056549072266, + "learning_rate": 2.8767641404110228e-05, + "loss": 0.8638, + "step": 56310 + }, + { + "epoch": 0.9040273519639159, + "grad_norm": 0.9903162121772766, + "learning_rate": 2.8761408820545577e-05, + "loss": 0.8409, + "step": 56320 + }, + { + "epoch": 0.90418786818408, + "grad_norm": 5.019413471221924, + "learning_rate": 2.87551759977774e-05, + "loss": 0.6855, + "step": 56330 + }, + { + "epoch": 0.904348384404244, + "grad_norm": 0.7054320573806763, + "learning_rate": 2.8748942936202078e-05, + "loss": 0.7679, + "step": 56340 + }, + { + "epoch": 0.9045089006244081, + "grad_norm": 0.7310061454772949, + "learning_rate": 2.8742709636215987e-05, + "loss": 0.8299, + "step": 56350 + }, + { + "epoch": 0.9046694168445721, + "grad_norm": 0.6319513916969299, + "learning_rate": 2.8736476098215526e-05, + "loss": 0.7095, + "step": 56360 + }, + { + "epoch": 0.9048299330647362, + "grad_norm": 1.5512813329696655, + "learning_rate": 2.8730242322597116e-05, + "loss": 0.7994, + "step": 56370 + }, + { + "epoch": 0.9049904492849002, + "grad_norm": 0.7145676016807556, + "learning_rate": 2.8724008309757194e-05, + "loss": 0.755, + "step": 56380 + }, + { + "epoch": 0.9051509655050642, + "grad_norm": 0.6277914047241211, + "learning_rate": 2.87177740600922e-05, + "loss": 0.6331, + "step": 56390 + }, + { + "epoch": 0.9053114817252284, + "grad_norm": 1.1780239343643188, + "learning_rate": 2.8711539573998593e-05, + "loss": 0.8713, + "step": 56400 + }, + { + "epoch": 0.9054719979453923, + "grad_norm": 0.7152280211448669, + "learning_rate": 2.8705304851872862e-05, + "loss": 0.8717, + "step": 56410 + }, + { + "epoch": 0.9056325141655565, + "grad_norm": 0.6898215413093567, + "learning_rate": 2.869906989411148e-05, + "loss": 0.8723, + "step": 56420 + }, + { + "epoch": 0.9057930303857205, + "grad_norm": 0.8440951704978943, + "learning_rate": 2.8692834701110967e-05, + "loss": 0.8261, + "step": 56430 + }, + { + "epoch": 0.9059535466058846, + "grad_norm": 0.8978618383407593, + "learning_rate": 2.8686599273267843e-05, + "loss": 0.834, + "step": 56440 + }, + { + "epoch": 0.9061140628260486, + "grad_norm": 0.9133883714675903, + "learning_rate": 2.8680363610978644e-05, + "loss": 0.9404, + "step": 56450 + }, + { + "epoch": 0.9062745790462127, + "grad_norm": 0.5826568007469177, + "learning_rate": 2.867412771463992e-05, + "loss": 0.6996, + "step": 56460 + }, + { + "epoch": 0.9064350952663767, + "grad_norm": 0.9803723096847534, + "learning_rate": 2.8667891584648245e-05, + "loss": 0.8119, + "step": 56470 + }, + { + "epoch": 0.9065956114865407, + "grad_norm": 0.8318573832511902, + "learning_rate": 2.8661655221400186e-05, + "loss": 0.8753, + "step": 56480 + }, + { + "epoch": 0.9067561277067048, + "grad_norm": 0.6159113049507141, + "learning_rate": 2.8655418625292345e-05, + "loss": 0.8627, + "step": 56490 + }, + { + "epoch": 0.9069166439268688, + "grad_norm": 1.029930830001831, + "learning_rate": 2.8649181796721343e-05, + "loss": 0.8189, + "step": 56500 + }, + { + "epoch": 0.9070771601470329, + "grad_norm": 0.6334224343299866, + "learning_rate": 2.8642944736083788e-05, + "loss": 0.5946, + "step": 56510 + }, + { + "epoch": 0.9072376763671969, + "grad_norm": 0.5290175676345825, + "learning_rate": 2.8636707443776335e-05, + "loss": 0.8649, + "step": 56520 + }, + { + "epoch": 0.907398192587361, + "grad_norm": 0.771543025970459, + "learning_rate": 2.8630469920195637e-05, + "loss": 0.6682, + "step": 56530 + }, + { + "epoch": 0.907558708807525, + "grad_norm": 1.1950316429138184, + "learning_rate": 2.862423216573835e-05, + "loss": 0.6475, + "step": 56540 + }, + { + "epoch": 0.9077192250276891, + "grad_norm": 0.8002382516860962, + "learning_rate": 2.8617994180801178e-05, + "loss": 0.8057, + "step": 56550 + }, + { + "epoch": 0.9078797412478531, + "grad_norm": 0.9087185859680176, + "learning_rate": 2.8611755965780806e-05, + "loss": 0.7487, + "step": 56560 + }, + { + "epoch": 0.9080402574680172, + "grad_norm": 0.6821175217628479, + "learning_rate": 2.860551752107396e-05, + "loss": 0.6482, + "step": 56570 + }, + { + "epoch": 0.9082007736881812, + "grad_norm": 0.6318082213401794, + "learning_rate": 2.8599278847077354e-05, + "loss": 0.7603, + "step": 56580 + }, + { + "epoch": 0.9083612899083452, + "grad_norm": 0.8191330432891846, + "learning_rate": 2.8593039944187744e-05, + "loss": 0.6575, + "step": 56590 + }, + { + "epoch": 0.9085218061285093, + "grad_norm": 1.0066817998886108, + "learning_rate": 2.8586800812801878e-05, + "loss": 0.6712, + "step": 56600 + }, + { + "epoch": 0.9086823223486733, + "grad_norm": 0.8875594735145569, + "learning_rate": 2.8580561453316535e-05, + "loss": 0.8307, + "step": 56610 + }, + { + "epoch": 0.9088428385688374, + "grad_norm": 0.6277624368667603, + "learning_rate": 2.8574321866128505e-05, + "loss": 0.78, + "step": 56620 + }, + { + "epoch": 0.9090033547890014, + "grad_norm": 0.7498376369476318, + "learning_rate": 2.8568082051634582e-05, + "loss": 0.9486, + "step": 56630 + }, + { + "epoch": 0.9091638710091655, + "grad_norm": 0.9656326770782471, + "learning_rate": 2.8561842010231575e-05, + "loss": 0.7716, + "step": 56640 + }, + { + "epoch": 0.9093243872293295, + "grad_norm": 1.007704734802246, + "learning_rate": 2.8555601742316328e-05, + "loss": 0.6817, + "step": 56650 + }, + { + "epoch": 0.9094849034494936, + "grad_norm": 0.9831529259681702, + "learning_rate": 2.8549361248285684e-05, + "loss": 0.688, + "step": 56660 + }, + { + "epoch": 0.9096454196696576, + "grad_norm": 1.0240861177444458, + "learning_rate": 2.8543120528536486e-05, + "loss": 0.7171, + "step": 56670 + }, + { + "epoch": 0.9098059358898216, + "grad_norm": 0.49688148498535156, + "learning_rate": 2.8536879583465625e-05, + "loss": 0.7914, + "step": 56680 + }, + { + "epoch": 0.9099664521099857, + "grad_norm": 0.671505331993103, + "learning_rate": 2.8530638413469983e-05, + "loss": 0.7582, + "step": 56690 + }, + { + "epoch": 0.9101269683301497, + "grad_norm": 0.7423988580703735, + "learning_rate": 2.852439701894646e-05, + "loss": 0.7667, + "step": 56700 + }, + { + "epoch": 0.9102874845503138, + "grad_norm": 0.5864954590797424, + "learning_rate": 2.851815540029197e-05, + "loss": 0.8485, + "step": 56710 + }, + { + "epoch": 0.9104480007704778, + "grad_norm": 1.0479648113250732, + "learning_rate": 2.851191355790345e-05, + "loss": 0.699, + "step": 56720 + }, + { + "epoch": 0.9106085169906419, + "grad_norm": 0.5915074944496155, + "learning_rate": 2.8505671492177836e-05, + "loss": 0.8165, + "step": 56730 + }, + { + "epoch": 0.9107690332108059, + "grad_norm": 0.9762322902679443, + "learning_rate": 2.8499429203512096e-05, + "loss": 0.7669, + "step": 56740 + }, + { + "epoch": 0.91092954943097, + "grad_norm": 0.7891535758972168, + "learning_rate": 2.8493186692303198e-05, + "loss": 0.7046, + "step": 56750 + }, + { + "epoch": 0.911090065651134, + "grad_norm": 0.7709766030311584, + "learning_rate": 2.8486943958948125e-05, + "loss": 0.8596, + "step": 56760 + }, + { + "epoch": 0.9112505818712981, + "grad_norm": 0.8818676471710205, + "learning_rate": 2.8480701003843886e-05, + "loss": 0.6588, + "step": 56770 + }, + { + "epoch": 0.9114110980914621, + "grad_norm": 0.7705491781234741, + "learning_rate": 2.847445782738749e-05, + "loss": 0.8562, + "step": 56780 + }, + { + "epoch": 0.9115716143116261, + "grad_norm": 0.9090352654457092, + "learning_rate": 2.846821442997597e-05, + "loss": 0.8752, + "step": 56790 + }, + { + "epoch": 0.9117321305317903, + "grad_norm": 0.6803156137466431, + "learning_rate": 2.8461970812006367e-05, + "loss": 0.6968, + "step": 56800 + }, + { + "epoch": 0.9118926467519543, + "grad_norm": 1.3752328157424927, + "learning_rate": 2.8455726973875745e-05, + "loss": 0.7513, + "step": 56810 + }, + { + "epoch": 0.9120531629721184, + "grad_norm": 0.5819945335388184, + "learning_rate": 2.8449482915981168e-05, + "loss": 0.7111, + "step": 56820 + }, + { + "epoch": 0.9122136791922824, + "grad_norm": 1.1309839487075806, + "learning_rate": 2.8443238638719722e-05, + "loss": 0.7121, + "step": 56830 + }, + { + "epoch": 0.9123741954124465, + "grad_norm": 0.7039775848388672, + "learning_rate": 2.8436994142488515e-05, + "loss": 0.8335, + "step": 56840 + }, + { + "epoch": 0.9125347116326105, + "grad_norm": 0.7486555576324463, + "learning_rate": 2.8430749427684644e-05, + "loss": 0.8184, + "step": 56850 + }, + { + "epoch": 0.9126952278527746, + "grad_norm": 3.6372079849243164, + "learning_rate": 2.8424504494705252e-05, + "loss": 0.8667, + "step": 56860 + }, + { + "epoch": 0.9128557440729386, + "grad_norm": 0.6321544051170349, + "learning_rate": 2.8418259343947472e-05, + "loss": 0.7249, + "step": 56870 + }, + { + "epoch": 0.9130162602931026, + "grad_norm": 0.7725563645362854, + "learning_rate": 2.841201397580846e-05, + "loss": 0.7426, + "step": 56880 + }, + { + "epoch": 0.9131767765132667, + "grad_norm": 0.9156036376953125, + "learning_rate": 2.840576839068539e-05, + "loss": 0.7019, + "step": 56890 + }, + { + "epoch": 0.9133372927334307, + "grad_norm": 0.9304108023643494, + "learning_rate": 2.8399522588975437e-05, + "loss": 0.7076, + "step": 56900 + }, + { + "epoch": 0.9134978089535948, + "grad_norm": 1.8999937772750854, + "learning_rate": 2.83932765710758e-05, + "loss": 0.8803, + "step": 56910 + }, + { + "epoch": 0.9136583251737588, + "grad_norm": 1.9012089967727661, + "learning_rate": 2.8387030337383685e-05, + "loss": 0.8223, + "step": 56920 + }, + { + "epoch": 0.9138188413939229, + "grad_norm": 0.8312927484512329, + "learning_rate": 2.8380783888296325e-05, + "loss": 0.7829, + "step": 56930 + }, + { + "epoch": 0.9139793576140869, + "grad_norm": 0.9134458899497986, + "learning_rate": 2.8374537224210955e-05, + "loss": 0.7459, + "step": 56940 + }, + { + "epoch": 0.914139873834251, + "grad_norm": 1.052334189414978, + "learning_rate": 2.8368290345524824e-05, + "loss": 0.7804, + "step": 56950 + }, + { + "epoch": 0.914300390054415, + "grad_norm": 0.5940123796463013, + "learning_rate": 2.8362043252635195e-05, + "loss": 0.7283, + "step": 56960 + }, + { + "epoch": 0.9144609062745791, + "grad_norm": 0.7095311880111694, + "learning_rate": 2.835579594593935e-05, + "loss": 0.6431, + "step": 56970 + }, + { + "epoch": 0.9146214224947431, + "grad_norm": 0.8936702609062195, + "learning_rate": 2.8349548425834577e-05, + "loss": 0.748, + "step": 56980 + }, + { + "epoch": 0.9147819387149071, + "grad_norm": 0.9321175813674927, + "learning_rate": 2.8343300692718185e-05, + "loss": 0.7502, + "step": 56990 + }, + { + "epoch": 0.9149424549350712, + "grad_norm": 0.7213349342346191, + "learning_rate": 2.8337052746987496e-05, + "loss": 0.7099, + "step": 57000 + }, + { + "epoch": 0.9151029711552352, + "grad_norm": 0.690405547618866, + "learning_rate": 2.8330804589039835e-05, + "loss": 0.7786, + "step": 57010 + }, + { + "epoch": 0.9152634873753993, + "grad_norm": 0.9312558174133301, + "learning_rate": 2.8324556219272547e-05, + "loss": 0.8152, + "step": 57020 + }, + { + "epoch": 0.9154240035955633, + "grad_norm": 1.1304988861083984, + "learning_rate": 2.831830763808301e-05, + "loss": 0.8582, + "step": 57030 + }, + { + "epoch": 0.9155845198157274, + "grad_norm": 1.1650792360305786, + "learning_rate": 2.8312058845868573e-05, + "loss": 0.7303, + "step": 57040 + }, + { + "epoch": 0.9157450360358914, + "grad_norm": 0.749189555644989, + "learning_rate": 2.830580984302663e-05, + "loss": 0.8261, + "step": 57050 + }, + { + "epoch": 0.9159055522560555, + "grad_norm": 0.7578228712081909, + "learning_rate": 2.8299560629954598e-05, + "loss": 0.7139, + "step": 57060 + }, + { + "epoch": 0.9160660684762195, + "grad_norm": 1.0088754892349243, + "learning_rate": 2.8293311207049868e-05, + "loss": 0.9274, + "step": 57070 + }, + { + "epoch": 0.9162265846963835, + "grad_norm": 0.8250893950462341, + "learning_rate": 2.828706157470987e-05, + "loss": 0.7577, + "step": 57080 + }, + { + "epoch": 0.9163871009165476, + "grad_norm": 0.8738901615142822, + "learning_rate": 2.8280811733332058e-05, + "loss": 0.8583, + "step": 57090 + }, + { + "epoch": 0.9165476171367116, + "grad_norm": 0.8532384037971497, + "learning_rate": 2.827456168331387e-05, + "loss": 0.9076, + "step": 57100 + }, + { + "epoch": 0.9167081333568757, + "grad_norm": 0.8090844750404358, + "learning_rate": 2.826831142505278e-05, + "loss": 0.8298, + "step": 57110 + }, + { + "epoch": 0.9168686495770397, + "grad_norm": 0.7316603064537048, + "learning_rate": 2.826206095894627e-05, + "loss": 0.7184, + "step": 57120 + }, + { + "epoch": 0.9170291657972038, + "grad_norm": 1.2177759408950806, + "learning_rate": 2.8255810285391826e-05, + "loss": 0.7122, + "step": 57130 + }, + { + "epoch": 0.9171896820173678, + "grad_norm": 0.7554214596748352, + "learning_rate": 2.8249559404786958e-05, + "loss": 0.7961, + "step": 57140 + }, + { + "epoch": 0.9173501982375319, + "grad_norm": 0.7129135131835938, + "learning_rate": 2.824330831752919e-05, + "loss": 0.7617, + "step": 57150 + }, + { + "epoch": 0.9175107144576959, + "grad_norm": 1.0325312614440918, + "learning_rate": 2.823705702401604e-05, + "loss": 0.7353, + "step": 57160 + }, + { + "epoch": 0.91767123067786, + "grad_norm": 0.9266979694366455, + "learning_rate": 2.8230805524645067e-05, + "loss": 0.7799, + "step": 57170 + }, + { + "epoch": 0.917831746898024, + "grad_norm": 1.7301783561706543, + "learning_rate": 2.822455381981382e-05, + "loss": 0.7513, + "step": 57180 + }, + { + "epoch": 0.917992263118188, + "grad_norm": 0.8308454155921936, + "learning_rate": 2.8218301909919882e-05, + "loss": 0.7041, + "step": 57190 + }, + { + "epoch": 0.9181527793383522, + "grad_norm": 0.7874858975410461, + "learning_rate": 2.821204979536084e-05, + "loss": 0.7494, + "step": 57200 + }, + { + "epoch": 0.9183132955585162, + "grad_norm": 1.2444182634353638, + "learning_rate": 2.8205797476534267e-05, + "loss": 0.9642, + "step": 57210 + }, + { + "epoch": 0.9184738117786803, + "grad_norm": 0.8110929727554321, + "learning_rate": 2.8199544953837802e-05, + "loss": 0.7915, + "step": 57220 + }, + { + "epoch": 0.9186343279988443, + "grad_norm": 1.0258969068527222, + "learning_rate": 2.8193292227669054e-05, + "loss": 0.7058, + "step": 57230 + }, + { + "epoch": 0.9187948442190084, + "grad_norm": 0.639419674873352, + "learning_rate": 2.8187039298425662e-05, + "loss": 0.7582, + "step": 57240 + }, + { + "epoch": 0.9189553604391724, + "grad_norm": 0.5791950821876526, + "learning_rate": 2.818078616650528e-05, + "loss": 0.6759, + "step": 57250 + }, + { + "epoch": 0.9191158766593365, + "grad_norm": 0.966127872467041, + "learning_rate": 2.8174532832305565e-05, + "loss": 0.9327, + "step": 57260 + }, + { + "epoch": 0.9192763928795005, + "grad_norm": 0.7794151306152344, + "learning_rate": 2.816827929622419e-05, + "loss": 0.8656, + "step": 57270 + }, + { + "epoch": 0.9194369090996645, + "grad_norm": 1.0582048892974854, + "learning_rate": 2.816202555865886e-05, + "loss": 0.7713, + "step": 57280 + }, + { + "epoch": 0.9195974253198286, + "grad_norm": 0.6157635450363159, + "learning_rate": 2.8155771620007253e-05, + "loss": 0.7797, + "step": 57290 + }, + { + "epoch": 0.9197579415399926, + "grad_norm": 0.7400408387184143, + "learning_rate": 2.8149517480667096e-05, + "loss": 0.7787, + "step": 57300 + }, + { + "epoch": 0.9199184577601567, + "grad_norm": 0.6221532225608826, + "learning_rate": 2.814326314103612e-05, + "loss": 0.7385, + "step": 57310 + }, + { + "epoch": 0.9200789739803207, + "grad_norm": 0.5604380965232849, + "learning_rate": 2.813700860151205e-05, + "loss": 0.8332, + "step": 57320 + }, + { + "epoch": 0.9202394902004848, + "grad_norm": 0.7163998484611511, + "learning_rate": 2.8130753862492643e-05, + "loss": 0.8045, + "step": 57330 + }, + { + "epoch": 0.9204000064206488, + "grad_norm": 0.7410334944725037, + "learning_rate": 2.8124498924375674e-05, + "loss": 0.8941, + "step": 57340 + }, + { + "epoch": 0.9205605226408129, + "grad_norm": 0.6527199149131775, + "learning_rate": 2.8118243787558907e-05, + "loss": 0.7394, + "step": 57350 + }, + { + "epoch": 0.9207210388609769, + "grad_norm": 0.6318860054016113, + "learning_rate": 2.8111988452440137e-05, + "loss": 0.7604, + "step": 57360 + }, + { + "epoch": 0.920881555081141, + "grad_norm": 0.6576940417289734, + "learning_rate": 2.8105732919417173e-05, + "loss": 0.8062, + "step": 57370 + }, + { + "epoch": 0.921042071301305, + "grad_norm": 0.8522058129310608, + "learning_rate": 2.8099477188887817e-05, + "loss": 0.9001, + "step": 57380 + }, + { + "epoch": 0.921202587521469, + "grad_norm": 0.6206981539726257, + "learning_rate": 2.8093221261249902e-05, + "loss": 0.8776, + "step": 57390 + }, + { + "epoch": 0.9213631037416331, + "grad_norm": 0.6644395589828491, + "learning_rate": 2.808696513690128e-05, + "loss": 0.7774, + "step": 57400 + }, + { + "epoch": 0.9215236199617971, + "grad_norm": 0.8752009868621826, + "learning_rate": 2.808070881623979e-05, + "loss": 0.6415, + "step": 57410 + }, + { + "epoch": 0.9216841361819612, + "grad_norm": 0.6490957736968994, + "learning_rate": 2.8074452299663295e-05, + "loss": 0.7357, + "step": 57420 + }, + { + "epoch": 0.9218446524021252, + "grad_norm": 0.7498231530189514, + "learning_rate": 2.806819558756969e-05, + "loss": 0.7762, + "step": 57430 + }, + { + "epoch": 0.9220051686222893, + "grad_norm": 1.2352020740509033, + "learning_rate": 2.8061938680356843e-05, + "loss": 0.6963, + "step": 57440 + }, + { + "epoch": 0.9221656848424533, + "grad_norm": 0.5411141514778137, + "learning_rate": 2.805568157842267e-05, + "loss": 0.6829, + "step": 57450 + }, + { + "epoch": 0.9223262010626174, + "grad_norm": 1.1032679080963135, + "learning_rate": 2.8049424282165088e-05, + "loss": 0.7727, + "step": 57460 + }, + { + "epoch": 0.9224867172827814, + "grad_norm": 0.711172878742218, + "learning_rate": 2.8043166791982023e-05, + "loss": 0.7321, + "step": 57470 + }, + { + "epoch": 0.9226472335029454, + "grad_norm": 0.830485463142395, + "learning_rate": 2.8036909108271404e-05, + "loss": 0.7511, + "step": 57480 + }, + { + "epoch": 0.9228077497231095, + "grad_norm": 0.9287384152412415, + "learning_rate": 2.803065123143119e-05, + "loss": 0.8238, + "step": 57490 + }, + { + "epoch": 0.9229682659432735, + "grad_norm": 1.1162699460983276, + "learning_rate": 2.8024393161859346e-05, + "loss": 0.7558, + "step": 57500 + }, + { + "epoch": 0.9231287821634376, + "grad_norm": 0.8261299729347229, + "learning_rate": 2.8018134899953856e-05, + "loss": 0.7442, + "step": 57510 + }, + { + "epoch": 0.9232892983836016, + "grad_norm": 0.7463170289993286, + "learning_rate": 2.8011876446112693e-05, + "loss": 0.7673, + "step": 57520 + }, + { + "epoch": 0.9234498146037657, + "grad_norm": 0.7107521295547485, + "learning_rate": 2.800561780073387e-05, + "loss": 0.7215, + "step": 57530 + }, + { + "epoch": 0.9236103308239297, + "grad_norm": 0.9304255843162537, + "learning_rate": 2.7999358964215395e-05, + "loss": 0.7716, + "step": 57540 + }, + { + "epoch": 0.9237708470440938, + "grad_norm": 0.7371270060539246, + "learning_rate": 2.799309993695529e-05, + "loss": 0.6407, + "step": 57550 + }, + { + "epoch": 0.9239313632642578, + "grad_norm": 0.5849297046661377, + "learning_rate": 2.7986840719351605e-05, + "loss": 0.7156, + "step": 57560 + }, + { + "epoch": 0.924091879484422, + "grad_norm": 0.625359833240509, + "learning_rate": 2.7980581311802378e-05, + "loss": 0.7806, + "step": 57570 + }, + { + "epoch": 0.924252395704586, + "grad_norm": 0.7230536341667175, + "learning_rate": 2.797432171470567e-05, + "loss": 0.78, + "step": 57580 + }, + { + "epoch": 0.92441291192475, + "grad_norm": 1.003304123878479, + "learning_rate": 2.7968061928459567e-05, + "loss": 0.8333, + "step": 57590 + }, + { + "epoch": 0.9245734281449141, + "grad_norm": 0.9822371006011963, + "learning_rate": 2.7961801953462136e-05, + "loss": 0.8984, + "step": 57600 + }, + { + "epoch": 0.924733944365078, + "grad_norm": 1.3291583061218262, + "learning_rate": 2.795554179011149e-05, + "loss": 0.6971, + "step": 57610 + }, + { + "epoch": 0.9248944605852422, + "grad_norm": 0.563075602054596, + "learning_rate": 2.7949281438805735e-05, + "loss": 0.7352, + "step": 57620 + }, + { + "epoch": 0.9250549768054062, + "grad_norm": 0.6680590510368347, + "learning_rate": 2.794302089994299e-05, + "loss": 0.7899, + "step": 57630 + }, + { + "epoch": 0.9252154930255703, + "grad_norm": 0.8655178546905518, + "learning_rate": 2.7936760173921384e-05, + "loss": 0.8564, + "step": 57640 + }, + { + "epoch": 0.9253760092457343, + "grad_norm": 1.2971359491348267, + "learning_rate": 2.7930499261139075e-05, + "loss": 0.7385, + "step": 57650 + }, + { + "epoch": 0.9255365254658984, + "grad_norm": 0.897357165813446, + "learning_rate": 2.7924238161994214e-05, + "loss": 0.8264, + "step": 57660 + }, + { + "epoch": 0.9256970416860624, + "grad_norm": 0.6270431280136108, + "learning_rate": 2.7917976876884967e-05, + "loss": 0.7783, + "step": 57670 + }, + { + "epoch": 0.9258575579062264, + "grad_norm": 1.036059021949768, + "learning_rate": 2.7911715406209514e-05, + "loss": 0.8079, + "step": 57680 + }, + { + "epoch": 0.9260180741263905, + "grad_norm": 0.8624611496925354, + "learning_rate": 2.7905453750366055e-05, + "loss": 0.741, + "step": 57690 + }, + { + "epoch": 0.9261785903465545, + "grad_norm": 0.6472511291503906, + "learning_rate": 2.7899191909752787e-05, + "loss": 0.853, + "step": 57700 + }, + { + "epoch": 0.9263391065667186, + "grad_norm": 0.5408697724342346, + "learning_rate": 2.7892929884767938e-05, + "loss": 0.7375, + "step": 57710 + }, + { + "epoch": 0.9264996227868826, + "grad_norm": 1.4958999156951904, + "learning_rate": 2.7886667675809718e-05, + "loss": 0.8165, + "step": 57720 + }, + { + "epoch": 0.9266601390070467, + "grad_norm": 0.666499137878418, + "learning_rate": 2.7880405283276385e-05, + "loss": 0.8436, + "step": 57730 + }, + { + "epoch": 0.9268206552272107, + "grad_norm": 1.2657876014709473, + "learning_rate": 2.7874142707566177e-05, + "loss": 0.7345, + "step": 57740 + }, + { + "epoch": 0.9269811714473748, + "grad_norm": 0.831595242023468, + "learning_rate": 2.786787994907737e-05, + "loss": 0.8324, + "step": 57750 + }, + { + "epoch": 0.9271416876675388, + "grad_norm": 0.902538537979126, + "learning_rate": 2.7861617008208218e-05, + "loss": 0.9468, + "step": 57760 + }, + { + "epoch": 0.9273022038877029, + "grad_norm": 0.8239491581916809, + "learning_rate": 2.7855353885357027e-05, + "loss": 0.7379, + "step": 57770 + }, + { + "epoch": 0.9274627201078669, + "grad_norm": 0.7199010252952576, + "learning_rate": 2.7849090580922094e-05, + "loss": 0.7618, + "step": 57780 + }, + { + "epoch": 0.9276232363280309, + "grad_norm": 0.921436607837677, + "learning_rate": 2.7842827095301714e-05, + "loss": 0.8215, + "step": 57790 + }, + { + "epoch": 0.927783752548195, + "grad_norm": 0.5713641047477722, + "learning_rate": 2.783656342889422e-05, + "loss": 0.7664, + "step": 57800 + }, + { + "epoch": 0.927944268768359, + "grad_norm": 0.6487100124359131, + "learning_rate": 2.7830299582097935e-05, + "loss": 0.6456, + "step": 57810 + }, + { + "epoch": 0.9281047849885231, + "grad_norm": 0.5034416317939758, + "learning_rate": 2.782403555531121e-05, + "loss": 0.7589, + "step": 57820 + }, + { + "epoch": 0.9282653012086871, + "grad_norm": 0.7584102153778076, + "learning_rate": 2.7817771348932408e-05, + "loss": 0.7342, + "step": 57830 + }, + { + "epoch": 0.9284258174288512, + "grad_norm": 0.7458183765411377, + "learning_rate": 2.7811506963359884e-05, + "loss": 0.7463, + "step": 57840 + }, + { + "epoch": 0.9285863336490152, + "grad_norm": 0.6377924084663391, + "learning_rate": 2.7805242398992015e-05, + "loss": 0.769, + "step": 57850 + }, + { + "epoch": 0.9287468498691793, + "grad_norm": 0.6374291181564331, + "learning_rate": 2.779897765622719e-05, + "loss": 0.7587, + "step": 57860 + }, + { + "epoch": 0.9289073660893433, + "grad_norm": 0.6665219664573669, + "learning_rate": 2.7792712735463833e-05, + "loss": 0.7, + "step": 57870 + }, + { + "epoch": 0.9290678823095073, + "grad_norm": 0.7987855076789856, + "learning_rate": 2.778644763710032e-05, + "loss": 0.7384, + "step": 57880 + }, + { + "epoch": 0.9292283985296714, + "grad_norm": 0.6854622960090637, + "learning_rate": 2.7780182361535095e-05, + "loss": 0.8392, + "step": 57890 + }, + { + "epoch": 0.9293889147498354, + "grad_norm": 0.6531542539596558, + "learning_rate": 2.7773916909166597e-05, + "loss": 0.72, + "step": 57900 + }, + { + "epoch": 0.9295494309699995, + "grad_norm": 0.6737208366394043, + "learning_rate": 2.776765128039326e-05, + "loss": 0.7162, + "step": 57910 + }, + { + "epoch": 0.9297099471901635, + "grad_norm": 1.5237468481063843, + "learning_rate": 2.7761385475613544e-05, + "loss": 0.8322, + "step": 57920 + }, + { + "epoch": 0.9298704634103276, + "grad_norm": 0.7985415458679199, + "learning_rate": 2.7755119495225927e-05, + "loss": 0.7794, + "step": 57930 + }, + { + "epoch": 0.9300309796304916, + "grad_norm": 0.9351372718811035, + "learning_rate": 2.774885333962887e-05, + "loss": 0.7053, + "step": 57940 + }, + { + "epoch": 0.9301914958506557, + "grad_norm": 1.1048953533172607, + "learning_rate": 2.7742587009220884e-05, + "loss": 0.7739, + "step": 57950 + }, + { + "epoch": 0.9303520120708197, + "grad_norm": 0.6859215497970581, + "learning_rate": 2.7736320504400458e-05, + "loss": 0.8024, + "step": 57960 + }, + { + "epoch": 0.9305125282909839, + "grad_norm": 0.540195643901825, + "learning_rate": 2.773005382556611e-05, + "loss": 0.654, + "step": 57970 + }, + { + "epoch": 0.9306730445111479, + "grad_norm": 0.4628857672214508, + "learning_rate": 2.772378697311636e-05, + "loss": 0.7377, + "step": 57980 + }, + { + "epoch": 0.9308335607313118, + "grad_norm": 0.667374849319458, + "learning_rate": 2.771751994744975e-05, + "loss": 0.7859, + "step": 57990 + }, + { + "epoch": 0.930994076951476, + "grad_norm": 0.6924329400062561, + "learning_rate": 2.7711252748964817e-05, + "loss": 0.6955, + "step": 58000 + }, + { + "epoch": 0.93115459317164, + "grad_norm": 0.8478180170059204, + "learning_rate": 2.770498537806012e-05, + "loss": 0.738, + "step": 58010 + }, + { + "epoch": 0.9313151093918041, + "grad_norm": 0.8361011743545532, + "learning_rate": 2.769871783513423e-05, + "loss": 0.794, + "step": 58020 + }, + { + "epoch": 0.9314756256119681, + "grad_norm": 0.986714243888855, + "learning_rate": 2.7692450120585734e-05, + "loss": 0.7166, + "step": 58030 + }, + { + "epoch": 0.9316361418321322, + "grad_norm": 0.8459975123405457, + "learning_rate": 2.7686182234813207e-05, + "loss": 0.7166, + "step": 58040 + }, + { + "epoch": 0.9317966580522962, + "grad_norm": 0.9578590989112854, + "learning_rate": 2.7679914178215255e-05, + "loss": 0.8468, + "step": 58050 + }, + { + "epoch": 0.9319571742724603, + "grad_norm": 0.6879661083221436, + "learning_rate": 2.7673645951190497e-05, + "loss": 0.8317, + "step": 58060 + }, + { + "epoch": 0.9321176904926243, + "grad_norm": 0.7557054162025452, + "learning_rate": 2.7667377554137546e-05, + "loss": 0.8505, + "step": 58070 + }, + { + "epoch": 0.9322782067127884, + "grad_norm": 0.7247436046600342, + "learning_rate": 2.7661108987455035e-05, + "loss": 0.8515, + "step": 58080 + }, + { + "epoch": 0.9324387229329524, + "grad_norm": 0.8282655477523804, + "learning_rate": 2.765484025154162e-05, + "loss": 0.6816, + "step": 58090 + }, + { + "epoch": 0.9325992391531164, + "grad_norm": 1.0702141523361206, + "learning_rate": 2.7648571346795938e-05, + "loss": 0.8273, + "step": 58100 + }, + { + "epoch": 0.9327597553732805, + "grad_norm": 0.6238583922386169, + "learning_rate": 2.7642302273616667e-05, + "loss": 0.6706, + "step": 58110 + }, + { + "epoch": 0.9329202715934445, + "grad_norm": 0.6821714639663696, + "learning_rate": 2.763603303240248e-05, + "loss": 0.859, + "step": 58120 + }, + { + "epoch": 0.9330807878136086, + "grad_norm": 0.7496328949928284, + "learning_rate": 2.7629763623552062e-05, + "loss": 0.7171, + "step": 58130 + }, + { + "epoch": 0.9332413040337726, + "grad_norm": 1.0851547718048096, + "learning_rate": 2.7623494047464122e-05, + "loss": 0.7876, + "step": 58140 + }, + { + "epoch": 0.9334018202539367, + "grad_norm": 0.713047444820404, + "learning_rate": 2.7617224304537346e-05, + "loss": 0.8119, + "step": 58150 + }, + { + "epoch": 0.9335623364741007, + "grad_norm": 0.6391628980636597, + "learning_rate": 2.7610954395170473e-05, + "loss": 0.794, + "step": 58160 + }, + { + "epoch": 0.9337228526942648, + "grad_norm": 0.992779552936554, + "learning_rate": 2.7604684319762225e-05, + "loss": 0.7236, + "step": 58170 + }, + { + "epoch": 0.9338833689144288, + "grad_norm": 0.8766685128211975, + "learning_rate": 2.7598414078711345e-05, + "loss": 0.8252, + "step": 58180 + }, + { + "epoch": 0.9340438851345928, + "grad_norm": 0.9305059313774109, + "learning_rate": 2.7592143672416577e-05, + "loss": 0.811, + "step": 58190 + }, + { + "epoch": 0.9342044013547569, + "grad_norm": 0.8005132675170898, + "learning_rate": 2.7585873101276683e-05, + "loss": 0.6844, + "step": 58200 + }, + { + "epoch": 0.9343649175749209, + "grad_norm": 0.5202292799949646, + "learning_rate": 2.7579602365690444e-05, + "loss": 0.7434, + "step": 58210 + }, + { + "epoch": 0.934525433795085, + "grad_norm": 0.7573438882827759, + "learning_rate": 2.757333146605663e-05, + "loss": 0.7968, + "step": 58220 + }, + { + "epoch": 0.934685950015249, + "grad_norm": 0.9386662244796753, + "learning_rate": 2.7567060402774036e-05, + "loss": 0.7526, + "step": 58230 + }, + { + "epoch": 0.9348464662354131, + "grad_norm": 0.8225018382072449, + "learning_rate": 2.7560789176241477e-05, + "loss": 0.8317, + "step": 58240 + }, + { + "epoch": 0.9350069824555771, + "grad_norm": 0.632011890411377, + "learning_rate": 2.7554517786857753e-05, + "loss": 0.7462, + "step": 58250 + }, + { + "epoch": 0.9351674986757412, + "grad_norm": 0.6982889771461487, + "learning_rate": 2.7548246235021686e-05, + "loss": 0.6759, + "step": 58260 + }, + { + "epoch": 0.9353280148959052, + "grad_norm": 1.1567754745483398, + "learning_rate": 2.7541974521132125e-05, + "loss": 0.7364, + "step": 58270 + }, + { + "epoch": 0.9354885311160693, + "grad_norm": 0.8478853702545166, + "learning_rate": 2.7535702645587896e-05, + "loss": 0.8176, + "step": 58280 + }, + { + "epoch": 0.9356490473362333, + "grad_norm": 0.9484835267066956, + "learning_rate": 2.7529430608787858e-05, + "loss": 0.604, + "step": 58290 + }, + { + "epoch": 0.9358095635563973, + "grad_norm": 1.148431420326233, + "learning_rate": 2.752315841113089e-05, + "loss": 0.7345, + "step": 58300 + }, + { + "epoch": 0.9359700797765614, + "grad_norm": 0.9187483191490173, + "learning_rate": 2.7516886053015856e-05, + "loss": 0.7834, + "step": 58310 + }, + { + "epoch": 0.9361305959967254, + "grad_norm": 0.6966903805732727, + "learning_rate": 2.7510613534841634e-05, + "loss": 0.7327, + "step": 58320 + }, + { + "epoch": 0.9362911122168895, + "grad_norm": 0.6945207118988037, + "learning_rate": 2.7504340857007126e-05, + "loss": 0.8718, + "step": 58330 + }, + { + "epoch": 0.9364516284370535, + "grad_norm": 0.6128920912742615, + "learning_rate": 2.749806801991125e-05, + "loss": 0.732, + "step": 58340 + }, + { + "epoch": 0.9366121446572176, + "grad_norm": 0.8530048727989197, + "learning_rate": 2.7491795023952905e-05, + "loss": 0.8421, + "step": 58350 + }, + { + "epoch": 0.9367726608773816, + "grad_norm": 0.6773085594177246, + "learning_rate": 2.7485521869531017e-05, + "loss": 0.9221, + "step": 58360 + }, + { + "epoch": 0.9369331770975458, + "grad_norm": 0.5135380029678345, + "learning_rate": 2.7479248557044534e-05, + "loss": 0.745, + "step": 58370 + }, + { + "epoch": 0.9370936933177098, + "grad_norm": 0.7625150680541992, + "learning_rate": 2.7472975086892388e-05, + "loss": 0.7631, + "step": 58380 + }, + { + "epoch": 0.9372542095378738, + "grad_norm": 0.8384085893630981, + "learning_rate": 2.7466701459473543e-05, + "loss": 0.7683, + "step": 58390 + }, + { + "epoch": 0.9374147257580379, + "grad_norm": 1.2299038171768188, + "learning_rate": 2.7460427675186967e-05, + "loss": 0.8058, + "step": 58400 + }, + { + "epoch": 0.9375752419782019, + "grad_norm": 0.8132359981536865, + "learning_rate": 2.7454153734431627e-05, + "loss": 0.5921, + "step": 58410 + }, + { + "epoch": 0.937735758198366, + "grad_norm": 0.8303583860397339, + "learning_rate": 2.7447879637606512e-05, + "loss": 0.7341, + "step": 58420 + }, + { + "epoch": 0.93789627441853, + "grad_norm": 0.6264082193374634, + "learning_rate": 2.7441605385110626e-05, + "loss": 0.7652, + "step": 58430 + }, + { + "epoch": 0.9380567906386941, + "grad_norm": 0.647875964641571, + "learning_rate": 2.743533097734296e-05, + "loss": 0.9154, + "step": 58440 + }, + { + "epoch": 0.9382173068588581, + "grad_norm": 0.7785263061523438, + "learning_rate": 2.7429056414702548e-05, + "loss": 0.7279, + "step": 58450 + }, + { + "epoch": 0.9383778230790222, + "grad_norm": 0.6018912196159363, + "learning_rate": 2.7422781697588403e-05, + "loss": 0.8015, + "step": 58460 + }, + { + "epoch": 0.9385383392991862, + "grad_norm": 0.8063865900039673, + "learning_rate": 2.7416506826399557e-05, + "loss": 0.813, + "step": 58470 + }, + { + "epoch": 0.9386988555193503, + "grad_norm": 0.6715295910835266, + "learning_rate": 2.741023180153506e-05, + "loss": 0.7861, + "step": 58480 + }, + { + "epoch": 0.9388593717395143, + "grad_norm": 0.5578140020370483, + "learning_rate": 2.7403956623393972e-05, + "loss": 0.7066, + "step": 58490 + }, + { + "epoch": 0.9390198879596783, + "grad_norm": 0.5427664518356323, + "learning_rate": 2.7397681292375355e-05, + "loss": 0.7193, + "step": 58500 + }, + { + "epoch": 0.9391804041798424, + "grad_norm": 0.7786543369293213, + "learning_rate": 2.7391405808878274e-05, + "loss": 0.6622, + "step": 58510 + }, + { + "epoch": 0.9393409204000064, + "grad_norm": 0.642900824546814, + "learning_rate": 2.7385130173301825e-05, + "loss": 0.5673, + "step": 58520 + }, + { + "epoch": 0.9395014366201705, + "grad_norm": 0.5713398456573486, + "learning_rate": 2.7378854386045095e-05, + "loss": 0.7367, + "step": 58530 + }, + { + "epoch": 0.9396619528403345, + "grad_norm": 0.6109932065010071, + "learning_rate": 2.737257844750718e-05, + "loss": 0.7932, + "step": 58540 + }, + { + "epoch": 0.9398224690604986, + "grad_norm": 0.7414340972900391, + "learning_rate": 2.736630235808721e-05, + "loss": 0.8066, + "step": 58550 + }, + { + "epoch": 0.9399829852806626, + "grad_norm": 0.9734876155853271, + "learning_rate": 2.73600261181843e-05, + "loss": 0.7557, + "step": 58560 + }, + { + "epoch": 0.9401435015008267, + "grad_norm": 0.8577888607978821, + "learning_rate": 2.7353749728197575e-05, + "loss": 0.8309, + "step": 58570 + }, + { + "epoch": 0.9403040177209907, + "grad_norm": 1.077553153038025, + "learning_rate": 2.7347473188526185e-05, + "loss": 0.7838, + "step": 58580 + }, + { + "epoch": 0.9404645339411547, + "grad_norm": 0.6681370735168457, + "learning_rate": 2.7341196499569283e-05, + "loss": 0.7169, + "step": 58590 + }, + { + "epoch": 0.9406250501613188, + "grad_norm": 0.6793078184127808, + "learning_rate": 2.733491966172602e-05, + "loss": 0.8215, + "step": 58600 + }, + { + "epoch": 0.9407855663814828, + "grad_norm": 0.8315091729164124, + "learning_rate": 2.732864267539557e-05, + "loss": 0.7573, + "step": 58610 + }, + { + "epoch": 0.9409460826016469, + "grad_norm": 0.8223089575767517, + "learning_rate": 2.732236554097712e-05, + "loss": 0.7285, + "step": 58620 + }, + { + "epoch": 0.9411065988218109, + "grad_norm": 0.6541125774383545, + "learning_rate": 2.7316088258869848e-05, + "loss": 0.7112, + "step": 58630 + }, + { + "epoch": 0.941267115041975, + "grad_norm": 0.677705705165863, + "learning_rate": 2.7309810829472953e-05, + "loss": 0.6974, + "step": 58640 + }, + { + "epoch": 0.941427631262139, + "grad_norm": 0.6513223052024841, + "learning_rate": 2.7303533253185653e-05, + "loss": 0.804, + "step": 58650 + }, + { + "epoch": 0.9415881474823031, + "grad_norm": 0.6748706698417664, + "learning_rate": 2.7297255530407157e-05, + "loss": 0.9627, + "step": 58660 + }, + { + "epoch": 0.9417486637024671, + "grad_norm": 0.9451727867126465, + "learning_rate": 2.7290977661536693e-05, + "loss": 0.7946, + "step": 58670 + }, + { + "epoch": 0.9419091799226312, + "grad_norm": 0.6820887327194214, + "learning_rate": 2.72846996469735e-05, + "loss": 0.6728, + "step": 58680 + }, + { + "epoch": 0.9420696961427952, + "grad_norm": 2.2869961261749268, + "learning_rate": 2.7278421487116814e-05, + "loss": 0.7434, + "step": 58690 + }, + { + "epoch": 0.9422302123629592, + "grad_norm": 0.9331046938896179, + "learning_rate": 2.727214318236589e-05, + "loss": 0.6626, + "step": 58700 + }, + { + "epoch": 0.9423907285831233, + "grad_norm": 0.6224152445793152, + "learning_rate": 2.7265864733120007e-05, + "loss": 0.7342, + "step": 58710 + }, + { + "epoch": 0.9425512448032873, + "grad_norm": 0.5694825053215027, + "learning_rate": 2.7259586139778414e-05, + "loss": 0.7396, + "step": 58720 + }, + { + "epoch": 0.9427117610234514, + "grad_norm": 0.837989866733551, + "learning_rate": 2.7253307402740414e-05, + "loss": 0.8014, + "step": 58730 + }, + { + "epoch": 0.9428722772436154, + "grad_norm": 0.5460785031318665, + "learning_rate": 2.724702852240528e-05, + "loss": 0.6615, + "step": 58740 + }, + { + "epoch": 0.9430327934637796, + "grad_norm": 0.9060196876525879, + "learning_rate": 2.7240749499172324e-05, + "loss": 0.766, + "step": 58750 + }, + { + "epoch": 0.9431933096839435, + "grad_norm": 0.7561401128768921, + "learning_rate": 2.723447033344086e-05, + "loss": 0.833, + "step": 58760 + }, + { + "epoch": 0.9433538259041077, + "grad_norm": 0.7350110411643982, + "learning_rate": 2.722819102561019e-05, + "loss": 0.8518, + "step": 58770 + }, + { + "epoch": 0.9435143421242717, + "grad_norm": 0.7625289559364319, + "learning_rate": 2.722191157607965e-05, + "loss": 0.7504, + "step": 58780 + }, + { + "epoch": 0.9436748583444357, + "grad_norm": 0.6036867499351501, + "learning_rate": 2.7215631985248574e-05, + "loss": 0.7247, + "step": 58790 + }, + { + "epoch": 0.9438353745645998, + "grad_norm": 0.8425914645195007, + "learning_rate": 2.7209352253516313e-05, + "loss": 0.8479, + "step": 58800 + }, + { + "epoch": 0.9439958907847638, + "grad_norm": 0.7830104231834412, + "learning_rate": 2.7203072381282212e-05, + "loss": 0.6502, + "step": 58810 + }, + { + "epoch": 0.9441564070049279, + "grad_norm": 0.6690815091133118, + "learning_rate": 2.7196792368945632e-05, + "loss": 0.8648, + "step": 58820 + }, + { + "epoch": 0.9443169232250919, + "grad_norm": 0.5235433578491211, + "learning_rate": 2.719051221690596e-05, + "loss": 0.7093, + "step": 58830 + }, + { + "epoch": 0.944477439445256, + "grad_norm": 0.9814441800117493, + "learning_rate": 2.718423192556257e-05, + "loss": 0.7296, + "step": 58840 + }, + { + "epoch": 0.94463795566542, + "grad_norm": 0.6031317114830017, + "learning_rate": 2.7177951495314848e-05, + "loss": 0.7045, + "step": 58850 + }, + { + "epoch": 0.9447984718855841, + "grad_norm": 0.7851951718330383, + "learning_rate": 2.7171670926562186e-05, + "loss": 0.843, + "step": 58860 + }, + { + "epoch": 0.9449589881057481, + "grad_norm": 0.720137357711792, + "learning_rate": 2.716539021970401e-05, + "loss": 0.6567, + "step": 58870 + }, + { + "epoch": 0.9451195043259122, + "grad_norm": 0.7244751453399658, + "learning_rate": 2.7159109375139725e-05, + "loss": 0.6872, + "step": 58880 + }, + { + "epoch": 0.9452800205460762, + "grad_norm": 0.9327145218849182, + "learning_rate": 2.7152828393268746e-05, + "loss": 0.8184, + "step": 58890 + }, + { + "epoch": 0.9454405367662402, + "grad_norm": 0.5973581075668335, + "learning_rate": 2.714654727449053e-05, + "loss": 0.9227, + "step": 58900 + }, + { + "epoch": 0.9456010529864043, + "grad_norm": 0.9315463900566101, + "learning_rate": 2.7140266019204502e-05, + "loss": 0.7675, + "step": 58910 + }, + { + "epoch": 0.9457615692065683, + "grad_norm": 0.5362744927406311, + "learning_rate": 2.713398462781011e-05, + "loss": 0.6694, + "step": 58920 + }, + { + "epoch": 0.9459220854267324, + "grad_norm": 0.7111684679985046, + "learning_rate": 2.7127703100706836e-05, + "loss": 0.8573, + "step": 58930 + }, + { + "epoch": 0.9460826016468964, + "grad_norm": 0.7133885025978088, + "learning_rate": 2.712142143829412e-05, + "loss": 0.6953, + "step": 58940 + }, + { + "epoch": 0.9462431178670605, + "grad_norm": 0.8020309209823608, + "learning_rate": 2.7115139640971455e-05, + "loss": 0.7744, + "step": 58950 + }, + { + "epoch": 0.9464036340872245, + "grad_norm": 1.0803743600845337, + "learning_rate": 2.7108857709138324e-05, + "loss": 0.8468, + "step": 58960 + }, + { + "epoch": 0.9465641503073886, + "grad_norm": 0.8828917145729065, + "learning_rate": 2.7102575643194223e-05, + "loss": 0.7005, + "step": 58970 + }, + { + "epoch": 0.9467246665275526, + "grad_norm": 0.6930515170097351, + "learning_rate": 2.7096293443538646e-05, + "loss": 0.812, + "step": 58980 + }, + { + "epoch": 0.9468851827477166, + "grad_norm": 1.0064244270324707, + "learning_rate": 2.709001111057112e-05, + "loss": 0.74, + "step": 58990 + }, + { + "epoch": 0.9470456989678807, + "grad_norm": 0.7619084119796753, + "learning_rate": 2.708372864469114e-05, + "loss": 0.7496, + "step": 59000 + }, + { + "epoch": 0.9472062151880447, + "grad_norm": 0.6047338843345642, + "learning_rate": 2.707744604629826e-05, + "loss": 0.7056, + "step": 59010 + }, + { + "epoch": 0.9473667314082088, + "grad_norm": 0.8663108348846436, + "learning_rate": 2.7071163315792004e-05, + "loss": 0.8854, + "step": 59020 + }, + { + "epoch": 0.9475272476283728, + "grad_norm": 0.9109717607498169, + "learning_rate": 2.706488045357191e-05, + "loss": 0.7445, + "step": 59030 + }, + { + "epoch": 0.9476877638485369, + "grad_norm": 0.6433135867118835, + "learning_rate": 2.7058597460037543e-05, + "loss": 0.6857, + "step": 59040 + }, + { + "epoch": 0.9478482800687009, + "grad_norm": 1.6459589004516602, + "learning_rate": 2.7052314335588464e-05, + "loss": 0.9619, + "step": 59050 + }, + { + "epoch": 0.948008796288865, + "grad_norm": 1.0225632190704346, + "learning_rate": 2.7046031080624234e-05, + "loss": 0.8096, + "step": 59060 + }, + { + "epoch": 0.948169312509029, + "grad_norm": 0.6955157518386841, + "learning_rate": 2.703974769554443e-05, + "loss": 0.7363, + "step": 59070 + }, + { + "epoch": 0.9483298287291931, + "grad_norm": 0.675747275352478, + "learning_rate": 2.703346418074866e-05, + "loss": 0.8427, + "step": 59080 + }, + { + "epoch": 0.9484903449493571, + "grad_norm": 0.8007250428199768, + "learning_rate": 2.7027180536636497e-05, + "loss": 0.7827, + "step": 59090 + }, + { + "epoch": 0.9486508611695211, + "grad_norm": 0.6245532631874084, + "learning_rate": 2.7020896763607544e-05, + "loss": 0.8332, + "step": 59100 + }, + { + "epoch": 0.9488113773896852, + "grad_norm": 0.7189433574676514, + "learning_rate": 2.7014612862061423e-05, + "loss": 0.8217, + "step": 59110 + }, + { + "epoch": 0.9489718936098492, + "grad_norm": 0.6980908513069153, + "learning_rate": 2.7008328832397756e-05, + "loss": 0.7599, + "step": 59120 + }, + { + "epoch": 0.9491324098300133, + "grad_norm": 0.7242516875267029, + "learning_rate": 2.700204467501615e-05, + "loss": 0.821, + "step": 59130 + }, + { + "epoch": 0.9492929260501773, + "grad_norm": 0.924083411693573, + "learning_rate": 2.699576039031626e-05, + "loss": 0.7782, + "step": 59140 + }, + { + "epoch": 0.9494534422703415, + "grad_norm": 1.1039535999298096, + "learning_rate": 2.6989475978697725e-05, + "loss": 0.7905, + "step": 59150 + }, + { + "epoch": 0.9496139584905054, + "grad_norm": 1.010832667350769, + "learning_rate": 2.6983191440560198e-05, + "loss": 0.8516, + "step": 59160 + }, + { + "epoch": 0.9497744747106696, + "grad_norm": 0.7899096608161926, + "learning_rate": 2.6976906776303323e-05, + "loss": 0.6853, + "step": 59170 + }, + { + "epoch": 0.9499349909308336, + "grad_norm": 0.5442420244216919, + "learning_rate": 2.6970621986326793e-05, + "loss": 0.7958, + "step": 59180 + }, + { + "epoch": 0.9500955071509976, + "grad_norm": 0.5542633533477783, + "learning_rate": 2.6964337071030265e-05, + "loss": 0.8547, + "step": 59190 + }, + { + "epoch": 0.9502560233711617, + "grad_norm": 0.7794419527053833, + "learning_rate": 2.6958052030813423e-05, + "loss": 0.912, + "step": 59200 + }, + { + "epoch": 0.9504165395913257, + "grad_norm": 1.0626438856124878, + "learning_rate": 2.6951766866075978e-05, + "loss": 0.821, + "step": 59210 + }, + { + "epoch": 0.9505770558114898, + "grad_norm": 0.6170475482940674, + "learning_rate": 2.6945481577217608e-05, + "loss": 0.8642, + "step": 59220 + }, + { + "epoch": 0.9507375720316538, + "grad_norm": 0.8354384899139404, + "learning_rate": 2.6939196164638025e-05, + "loss": 0.7014, + "step": 59230 + }, + { + "epoch": 0.9508980882518179, + "grad_norm": 1.3752765655517578, + "learning_rate": 2.6932910628736956e-05, + "loss": 0.8123, + "step": 59240 + }, + { + "epoch": 0.9510586044719819, + "grad_norm": 1.0413154363632202, + "learning_rate": 2.692662496991411e-05, + "loss": 0.8001, + "step": 59250 + }, + { + "epoch": 0.951219120692146, + "grad_norm": 0.8082032203674316, + "learning_rate": 2.6920339188569226e-05, + "loss": 0.721, + "step": 59260 + }, + { + "epoch": 0.95137963691231, + "grad_norm": 0.7100674510002136, + "learning_rate": 2.691405328510205e-05, + "loss": 0.7914, + "step": 59270 + }, + { + "epoch": 0.9515401531324741, + "grad_norm": 0.8855526447296143, + "learning_rate": 2.6907767259912304e-05, + "loss": 0.826, + "step": 59280 + }, + { + "epoch": 0.9517006693526381, + "grad_norm": 0.6687140464782715, + "learning_rate": 2.6901481113399763e-05, + "loss": 0.7234, + "step": 59290 + }, + { + "epoch": 0.9518611855728021, + "grad_norm": 0.5666716694831848, + "learning_rate": 2.6895194845964188e-05, + "loss": 0.7503, + "step": 59300 + }, + { + "epoch": 0.9520217017929662, + "grad_norm": 1.260688304901123, + "learning_rate": 2.6888908458005342e-05, + "loss": 0.7284, + "step": 59310 + }, + { + "epoch": 0.9521822180131302, + "grad_norm": 0.8654762506484985, + "learning_rate": 2.6882621949923005e-05, + "loss": 0.7896, + "step": 59320 + }, + { + "epoch": 0.9523427342332943, + "grad_norm": 0.788006067276001, + "learning_rate": 2.6876335322116965e-05, + "loss": 0.6867, + "step": 59330 + }, + { + "epoch": 0.9525032504534583, + "grad_norm": 0.8149182796478271, + "learning_rate": 2.6870048574987005e-05, + "loss": 0.8183, + "step": 59340 + }, + { + "epoch": 0.9526637666736224, + "grad_norm": 1.147788405418396, + "learning_rate": 2.6863761708932935e-05, + "loss": 0.7466, + "step": 59350 + }, + { + "epoch": 0.9528242828937864, + "grad_norm": 1.0332143306732178, + "learning_rate": 2.6857474724354554e-05, + "loss": 0.8002, + "step": 59360 + }, + { + "epoch": 0.9529847991139505, + "grad_norm": 1.137015700340271, + "learning_rate": 2.6851187621651686e-05, + "loss": 0.7462, + "step": 59370 + }, + { + "epoch": 0.9531453153341145, + "grad_norm": 0.7773959636688232, + "learning_rate": 2.684490040122415e-05, + "loss": 0.7282, + "step": 59380 + }, + { + "epoch": 0.9533058315542785, + "grad_norm": 0.6804434657096863, + "learning_rate": 2.6838613063471783e-05, + "loss": 0.6932, + "step": 59390 + }, + { + "epoch": 0.9534663477744426, + "grad_norm": 1.1179404258728027, + "learning_rate": 2.6832325608794413e-05, + "loss": 0.9234, + "step": 59400 + }, + { + "epoch": 0.9536268639946066, + "grad_norm": 0.6339488625526428, + "learning_rate": 2.6826038037591882e-05, + "loss": 0.8187, + "step": 59410 + }, + { + "epoch": 0.9537873802147707, + "grad_norm": 0.8113591074943542, + "learning_rate": 2.6819750350264055e-05, + "loss": 0.7673, + "step": 59420 + }, + { + "epoch": 0.9539478964349347, + "grad_norm": 1.0652827024459839, + "learning_rate": 2.6813462547210795e-05, + "loss": 0.7841, + "step": 59430 + }, + { + "epoch": 0.9541084126550988, + "grad_norm": 0.7310013175010681, + "learning_rate": 2.680717462883195e-05, + "loss": 0.7006, + "step": 59440 + }, + { + "epoch": 0.9542689288752628, + "grad_norm": 0.9872377514839172, + "learning_rate": 2.680088659552741e-05, + "loss": 0.8536, + "step": 59450 + }, + { + "epoch": 0.9544294450954269, + "grad_norm": 1.3366156816482544, + "learning_rate": 2.6794598447697056e-05, + "loss": 0.7256, + "step": 59460 + }, + { + "epoch": 0.9545899613155909, + "grad_norm": 1.0851799249649048, + "learning_rate": 2.6788310185740766e-05, + "loss": 0.7896, + "step": 59470 + }, + { + "epoch": 0.954750477535755, + "grad_norm": 0.7360928654670715, + "learning_rate": 2.678202181005845e-05, + "loss": 0.8553, + "step": 59480 + }, + { + "epoch": 0.954910993755919, + "grad_norm": 1.1360528469085693, + "learning_rate": 2.6775733321050006e-05, + "loss": 0.7815, + "step": 59490 + }, + { + "epoch": 0.955071509976083, + "grad_norm": 1.2506129741668701, + "learning_rate": 2.6769444719115345e-05, + "loss": 0.8093, + "step": 59500 + }, + { + "epoch": 0.9552320261962471, + "grad_norm": 0.8034750819206238, + "learning_rate": 2.6763156004654383e-05, + "loss": 0.8567, + "step": 59510 + }, + { + "epoch": 0.9553925424164111, + "grad_norm": 1.0276650190353394, + "learning_rate": 2.6756867178067052e-05, + "loss": 0.8464, + "step": 59520 + }, + { + "epoch": 0.9555530586365752, + "grad_norm": 0.6802396774291992, + "learning_rate": 2.6750578239753276e-05, + "loss": 0.8168, + "step": 59530 + }, + { + "epoch": 0.9557135748567392, + "grad_norm": 1.4285078048706055, + "learning_rate": 2.6744289190112998e-05, + "loss": 0.6913, + "step": 59540 + }, + { + "epoch": 0.9558740910769034, + "grad_norm": 0.715206503868103, + "learning_rate": 2.6738000029546173e-05, + "loss": 0.7633, + "step": 59550 + }, + { + "epoch": 0.9560346072970674, + "grad_norm": 0.729849100112915, + "learning_rate": 2.6731710758452743e-05, + "loss": 0.6467, + "step": 59560 + }, + { + "epoch": 0.9561951235172315, + "grad_norm": 0.8148371577262878, + "learning_rate": 2.6725421377232666e-05, + "loss": 0.7273, + "step": 59570 + }, + { + "epoch": 0.9563556397373955, + "grad_norm": 0.5766688585281372, + "learning_rate": 2.6719131886285932e-05, + "loss": 0.6985, + "step": 59580 + }, + { + "epoch": 0.9565161559575596, + "grad_norm": 0.8466945886611938, + "learning_rate": 2.6712842286012483e-05, + "loss": 0.7383, + "step": 59590 + }, + { + "epoch": 0.9566766721777236, + "grad_norm": 0.9874039888381958, + "learning_rate": 2.670655257681233e-05, + "loss": 0.721, + "step": 59600 + }, + { + "epoch": 0.9568371883978876, + "grad_norm": 0.39166587591171265, + "learning_rate": 2.6700262759085447e-05, + "loss": 0.7688, + "step": 59610 + }, + { + "epoch": 0.9569977046180517, + "grad_norm": 1.12564218044281, + "learning_rate": 2.669397283323183e-05, + "loss": 0.8442, + "step": 59620 + }, + { + "epoch": 0.9571582208382157, + "grad_norm": 0.7577055096626282, + "learning_rate": 2.6687682799651482e-05, + "loss": 0.8109, + "step": 59630 + }, + { + "epoch": 0.9573187370583798, + "grad_norm": 0.800405740737915, + "learning_rate": 2.6681392658744414e-05, + "loss": 0.7736, + "step": 59640 + }, + { + "epoch": 0.9574792532785438, + "grad_norm": 0.9424332976341248, + "learning_rate": 2.6675102410910645e-05, + "loss": 0.7269, + "step": 59650 + }, + { + "epoch": 0.9576397694987079, + "grad_norm": 0.63348788022995, + "learning_rate": 2.666881205655019e-05, + "loss": 0.6428, + "step": 59660 + }, + { + "epoch": 0.9578002857188719, + "grad_norm": 0.6939621567726135, + "learning_rate": 2.6662521596063084e-05, + "loss": 0.722, + "step": 59670 + }, + { + "epoch": 0.957960801939036, + "grad_norm": 1.0738266706466675, + "learning_rate": 2.6656231029849365e-05, + "loss": 0.7021, + "step": 59680 + }, + { + "epoch": 0.9581213181592, + "grad_norm": 0.9232262969017029, + "learning_rate": 2.6649940358309072e-05, + "loss": 0.8307, + "step": 59690 + }, + { + "epoch": 0.958281834379364, + "grad_norm": 0.9420933127403259, + "learning_rate": 2.664364958184226e-05, + "loss": 0.6662, + "step": 59700 + }, + { + "epoch": 0.9584423505995281, + "grad_norm": 0.7416630387306213, + "learning_rate": 2.663735870084898e-05, + "loss": 0.7505, + "step": 59710 + }, + { + "epoch": 0.9586028668196921, + "grad_norm": 0.6413959264755249, + "learning_rate": 2.663106771572929e-05, + "loss": 0.7332, + "step": 59720 + }, + { + "epoch": 0.9587633830398562, + "grad_norm": 0.9045729041099548, + "learning_rate": 2.6624776626883274e-05, + "loss": 0.802, + "step": 59730 + }, + { + "epoch": 0.9589238992600202, + "grad_norm": 0.6747221350669861, + "learning_rate": 2.6618485434711005e-05, + "loss": 0.7755, + "step": 59740 + }, + { + "epoch": 0.9590844154801843, + "grad_norm": 0.7996554374694824, + "learning_rate": 2.6612194139612557e-05, + "loss": 0.807, + "step": 59750 + }, + { + "epoch": 0.9592449317003483, + "grad_norm": 1.347529649734497, + "learning_rate": 2.6605902741988022e-05, + "loss": 0.8698, + "step": 59760 + }, + { + "epoch": 0.9594054479205124, + "grad_norm": 0.8312614560127258, + "learning_rate": 2.65996112422375e-05, + "loss": 0.6858, + "step": 59770 + }, + { + "epoch": 0.9595659641406764, + "grad_norm": 0.851327657699585, + "learning_rate": 2.6593319640761096e-05, + "loss": 0.8521, + "step": 59780 + }, + { + "epoch": 0.9597264803608405, + "grad_norm": 0.7620516419410706, + "learning_rate": 2.6587027937958904e-05, + "loss": 0.797, + "step": 59790 + }, + { + "epoch": 0.9598869965810045, + "grad_norm": 2.088733196258545, + "learning_rate": 2.658073613423106e-05, + "loss": 0.8851, + "step": 59800 + }, + { + "epoch": 0.9600475128011685, + "grad_norm": 0.7275897860527039, + "learning_rate": 2.6574444229977673e-05, + "loss": 0.7656, + "step": 59810 + }, + { + "epoch": 0.9602080290213326, + "grad_norm": 0.9954098463058472, + "learning_rate": 2.656815222559887e-05, + "loss": 0.7426, + "step": 59820 + }, + { + "epoch": 0.9603685452414966, + "grad_norm": 0.9445508122444153, + "learning_rate": 2.6561860121494795e-05, + "loss": 0.7638, + "step": 59830 + }, + { + "epoch": 0.9605290614616607, + "grad_norm": 0.6769991517066956, + "learning_rate": 2.655556791806558e-05, + "loss": 0.8497, + "step": 59840 + }, + { + "epoch": 0.9606895776818247, + "grad_norm": 1.0129542350769043, + "learning_rate": 2.654927561571138e-05, + "loss": 0.717, + "step": 59850 + }, + { + "epoch": 0.9608500939019888, + "grad_norm": 0.5805830359458923, + "learning_rate": 2.6542983214832346e-05, + "loss": 0.7967, + "step": 59860 + }, + { + "epoch": 0.9610106101221528, + "grad_norm": 0.7377362847328186, + "learning_rate": 2.6536690715828627e-05, + "loss": 0.8187, + "step": 59870 + }, + { + "epoch": 0.9611711263423169, + "grad_norm": 0.8817998766899109, + "learning_rate": 2.6530398119100402e-05, + "loss": 0.7446, + "step": 59880 + }, + { + "epoch": 0.9613316425624809, + "grad_norm": 0.882331907749176, + "learning_rate": 2.6524105425047848e-05, + "loss": 0.646, + "step": 59890 + }, + { + "epoch": 0.9614921587826449, + "grad_norm": 1.0604057312011719, + "learning_rate": 2.6517812634071122e-05, + "loss": 0.8522, + "step": 59900 + }, + { + "epoch": 0.961652675002809, + "grad_norm": 0.9991618990898132, + "learning_rate": 2.6511519746570422e-05, + "loss": 0.7304, + "step": 59910 + }, + { + "epoch": 0.961813191222973, + "grad_norm": 0.9187946319580078, + "learning_rate": 2.650522676294594e-05, + "loss": 0.8389, + "step": 59920 + }, + { + "epoch": 0.9619737074431371, + "grad_norm": 0.6849340200424194, + "learning_rate": 2.6498933683597877e-05, + "loss": 0.8153, + "step": 59930 + }, + { + "epoch": 0.9621342236633011, + "grad_norm": 0.9318202137947083, + "learning_rate": 2.6492640508926418e-05, + "loss": 0.8036, + "step": 59940 + }, + { + "epoch": 0.9622947398834653, + "grad_norm": 1.051112413406372, + "learning_rate": 2.6486347239331795e-05, + "loss": 0.7114, + "step": 59950 + }, + { + "epoch": 0.9624552561036293, + "grad_norm": 0.7949503064155579, + "learning_rate": 2.6480053875214207e-05, + "loss": 0.7716, + "step": 59960 + }, + { + "epoch": 0.9626157723237934, + "grad_norm": 0.8394770622253418, + "learning_rate": 2.6473760416973874e-05, + "loss": 0.7166, + "step": 59970 + }, + { + "epoch": 0.9627762885439574, + "grad_norm": 1.0139338970184326, + "learning_rate": 2.6467466865011036e-05, + "loss": 0.7797, + "step": 59980 + }, + { + "epoch": 0.9629368047641215, + "grad_norm": 0.8879220485687256, + "learning_rate": 2.6461173219725916e-05, + "loss": 0.7915, + "step": 59990 + }, + { + "epoch": 0.9630973209842855, + "grad_norm": 0.9709604382514954, + "learning_rate": 2.645487948151875e-05, + "loss": 0.6993, + "step": 60000 + }, + { + "epoch": 0.9630973209842855, + "eval_loss": 0.77556973695755, + "eval_runtime": 1834.0246, + "eval_samples_per_second": 14.302, + "eval_steps_per_second": 1.788, + "step": 60000 + }, + { + "epoch": 0.9632578372044495, + "grad_norm": 1.0913053750991821, + "learning_rate": 2.644858565078979e-05, + "loss": 0.7704, + "step": 60010 + }, + { + "epoch": 0.9634183534246136, + "grad_norm": 0.9580832719802856, + "learning_rate": 2.6442291727939293e-05, + "loss": 0.7117, + "step": 60020 + }, + { + "epoch": 0.9635788696447776, + "grad_norm": 0.45983999967575073, + "learning_rate": 2.6435997713367507e-05, + "loss": 0.7456, + "step": 60030 + }, + { + "epoch": 0.9637393858649417, + "grad_norm": 1.0389342308044434, + "learning_rate": 2.6429703607474686e-05, + "loss": 0.7885, + "step": 60040 + }, + { + "epoch": 0.9638999020851057, + "grad_norm": 0.7478015422821045, + "learning_rate": 2.6423409410661115e-05, + "loss": 0.8106, + "step": 60050 + }, + { + "epoch": 0.9640604183052698, + "grad_norm": 0.8744599223136902, + "learning_rate": 2.6417115123327058e-05, + "loss": 0.7622, + "step": 60060 + }, + { + "epoch": 0.9642209345254338, + "grad_norm": 0.6579895615577698, + "learning_rate": 2.6410820745872794e-05, + "loss": 0.7369, + "step": 60070 + }, + { + "epoch": 0.9643814507455979, + "grad_norm": 0.8957390189170837, + "learning_rate": 2.6404526278698616e-05, + "loss": 0.8441, + "step": 60080 + }, + { + "epoch": 0.9645419669657619, + "grad_norm": 0.5973559617996216, + "learning_rate": 2.639823172220481e-05, + "loss": 0.8157, + "step": 60090 + }, + { + "epoch": 0.9647024831859259, + "grad_norm": 1.1153000593185425, + "learning_rate": 2.6391937076791667e-05, + "loss": 0.8157, + "step": 60100 + }, + { + "epoch": 0.96486299940609, + "grad_norm": 0.8028897047042847, + "learning_rate": 2.63856423428595e-05, + "loss": 0.9116, + "step": 60110 + }, + { + "epoch": 0.965023515626254, + "grad_norm": 0.863567590713501, + "learning_rate": 2.637934752080861e-05, + "loss": 0.7152, + "step": 60120 + }, + { + "epoch": 0.9651840318464181, + "grad_norm": 0.5518215298652649, + "learning_rate": 2.6373052611039318e-05, + "loss": 0.7931, + "step": 60130 + }, + { + "epoch": 0.9653445480665821, + "grad_norm": 1.011912226676941, + "learning_rate": 2.6366757613951942e-05, + "loss": 0.778, + "step": 60140 + }, + { + "epoch": 0.9655050642867462, + "grad_norm": 0.7571361064910889, + "learning_rate": 2.6360462529946796e-05, + "loss": 0.7253, + "step": 60150 + }, + { + "epoch": 0.9656655805069102, + "grad_norm": 0.8548415303230286, + "learning_rate": 2.635416735942422e-05, + "loss": 0.7773, + "step": 60160 + }, + { + "epoch": 0.9658260967270743, + "grad_norm": 0.8055107593536377, + "learning_rate": 2.6347872102784554e-05, + "loss": 0.714, + "step": 60170 + }, + { + "epoch": 0.9659866129472383, + "grad_norm": 0.9362832903862, + "learning_rate": 2.6341576760428128e-05, + "loss": 0.8582, + "step": 60180 + }, + { + "epoch": 0.9661471291674024, + "grad_norm": 0.6315274238586426, + "learning_rate": 2.633528133275529e-05, + "loss": 0.8263, + "step": 60190 + }, + { + "epoch": 0.9663076453875664, + "grad_norm": 0.62116539478302, + "learning_rate": 2.63289858201664e-05, + "loss": 0.7203, + "step": 60200 + }, + { + "epoch": 0.9664681616077304, + "grad_norm": 0.8392134308815002, + "learning_rate": 2.6322690223061818e-05, + "loss": 0.725, + "step": 60210 + }, + { + "epoch": 0.9666286778278945, + "grad_norm": 0.5587195754051208, + "learning_rate": 2.63163945418419e-05, + "loss": 0.781, + "step": 60220 + }, + { + "epoch": 0.9667891940480585, + "grad_norm": 0.9241369962692261, + "learning_rate": 2.631009877690701e-05, + "loss": 0.8473, + "step": 60230 + }, + { + "epoch": 0.9669497102682226, + "grad_norm": 1.0545467138290405, + "learning_rate": 2.6303802928657534e-05, + "loss": 0.7374, + "step": 60240 + }, + { + "epoch": 0.9671102264883866, + "grad_norm": 0.8684262633323669, + "learning_rate": 2.6297506997493836e-05, + "loss": 0.8193, + "step": 60250 + }, + { + "epoch": 0.9672707427085507, + "grad_norm": 0.9722834825515747, + "learning_rate": 2.629121098381631e-05, + "loss": 0.827, + "step": 60260 + }, + { + "epoch": 0.9674312589287147, + "grad_norm": 0.7070519924163818, + "learning_rate": 2.6284914888025352e-05, + "loss": 0.7549, + "step": 60270 + }, + { + "epoch": 0.9675917751488788, + "grad_norm": 0.5835386514663696, + "learning_rate": 2.6278618710521342e-05, + "loss": 0.8232, + "step": 60280 + }, + { + "epoch": 0.9677522913690428, + "grad_norm": 0.9244447946548462, + "learning_rate": 2.6272322451704683e-05, + "loss": 0.7615, + "step": 60290 + }, + { + "epoch": 0.9679128075892068, + "grad_norm": 0.4864223599433899, + "learning_rate": 2.626602611197579e-05, + "loss": 0.7546, + "step": 60300 + }, + { + "epoch": 0.968073323809371, + "grad_norm": 0.6972334980964661, + "learning_rate": 2.625972969173506e-05, + "loss": 0.6294, + "step": 60310 + }, + { + "epoch": 0.9682338400295349, + "grad_norm": 1.0926626920700073, + "learning_rate": 2.6253433191382926e-05, + "loss": 0.8835, + "step": 60320 + }, + { + "epoch": 0.968394356249699, + "grad_norm": 0.6913195252418518, + "learning_rate": 2.6247136611319795e-05, + "loss": 0.7882, + "step": 60330 + }, + { + "epoch": 0.968554872469863, + "grad_norm": 0.5717893838882446, + "learning_rate": 2.6240839951946094e-05, + "loss": 0.7246, + "step": 60340 + }, + { + "epoch": 0.9687153886900272, + "grad_norm": 1.2917007207870483, + "learning_rate": 2.6234543213662254e-05, + "loss": 0.7637, + "step": 60350 + }, + { + "epoch": 0.9688759049101912, + "grad_norm": 0.574692964553833, + "learning_rate": 2.6228246396868715e-05, + "loss": 0.6055, + "step": 60360 + }, + { + "epoch": 0.9690364211303553, + "grad_norm": 0.9209277033805847, + "learning_rate": 2.6221949501965915e-05, + "loss": 0.7935, + "step": 60370 + }, + { + "epoch": 0.9691969373505193, + "grad_norm": 0.7723389863967896, + "learning_rate": 2.621565252935429e-05, + "loss": 0.7422, + "step": 60380 + }, + { + "epoch": 0.9693574535706834, + "grad_norm": 0.5609288215637207, + "learning_rate": 2.620935547943431e-05, + "loss": 0.7851, + "step": 60390 + }, + { + "epoch": 0.9695179697908474, + "grad_norm": 0.97829270362854, + "learning_rate": 2.6203058352606414e-05, + "loss": 0.7819, + "step": 60400 + }, + { + "epoch": 0.9696784860110114, + "grad_norm": 0.6875663995742798, + "learning_rate": 2.6196761149271064e-05, + "loss": 0.7233, + "step": 60410 + }, + { + "epoch": 0.9698390022311755, + "grad_norm": 0.7307199239730835, + "learning_rate": 2.619046386982874e-05, + "loss": 0.8473, + "step": 60420 + }, + { + "epoch": 0.9699995184513395, + "grad_norm": 1.1742761135101318, + "learning_rate": 2.618416651467989e-05, + "loss": 0.8729, + "step": 60430 + }, + { + "epoch": 0.9701600346715036, + "grad_norm": 0.7963273525238037, + "learning_rate": 2.617786908422501e-05, + "loss": 0.8844, + "step": 60440 + }, + { + "epoch": 0.9703205508916676, + "grad_norm": 0.7029579877853394, + "learning_rate": 2.617157157886457e-05, + "loss": 0.8605, + "step": 60450 + }, + { + "epoch": 0.9704810671118317, + "grad_norm": 0.7180235981941223, + "learning_rate": 2.6165273998999045e-05, + "loss": 0.8037, + "step": 60460 + }, + { + "epoch": 0.9706415833319957, + "grad_norm": 0.568340003490448, + "learning_rate": 2.6158976345028936e-05, + "loss": 0.8986, + "step": 60470 + }, + { + "epoch": 0.9708020995521598, + "grad_norm": 0.982994556427002, + "learning_rate": 2.615267861735473e-05, + "loss": 0.8758, + "step": 60480 + }, + { + "epoch": 0.9709626157723238, + "grad_norm": 1.0042883157730103, + "learning_rate": 2.6146380816376937e-05, + "loss": 0.7196, + "step": 60490 + }, + { + "epoch": 0.9711231319924878, + "grad_norm": 0.8547118306159973, + "learning_rate": 2.6140082942496052e-05, + "loss": 0.7947, + "step": 60500 + }, + { + "epoch": 0.9712836482126519, + "grad_norm": 0.8033013343811035, + "learning_rate": 2.6133784996112576e-05, + "loss": 0.7232, + "step": 60510 + }, + { + "epoch": 0.9714441644328159, + "grad_norm": 0.7997279763221741, + "learning_rate": 2.6127486977627042e-05, + "loss": 0.7931, + "step": 60520 + }, + { + "epoch": 0.97160468065298, + "grad_norm": 0.7311005592346191, + "learning_rate": 2.6121188887439946e-05, + "loss": 0.7579, + "step": 60530 + }, + { + "epoch": 0.971765196873144, + "grad_norm": 0.6208533048629761, + "learning_rate": 2.6114890725951812e-05, + "loss": 0.8693, + "step": 60540 + }, + { + "epoch": 0.9719257130933081, + "grad_norm": 0.7545430660247803, + "learning_rate": 2.6108592493563182e-05, + "loss": 0.747, + "step": 60550 + }, + { + "epoch": 0.9720862293134721, + "grad_norm": 1.9426474571228027, + "learning_rate": 2.610229419067457e-05, + "loss": 0.7336, + "step": 60560 + }, + { + "epoch": 0.9722467455336362, + "grad_norm": 0.7205753326416016, + "learning_rate": 2.609599581768652e-05, + "loss": 0.7231, + "step": 60570 + }, + { + "epoch": 0.9724072617538002, + "grad_norm": 0.5729808807373047, + "learning_rate": 2.6089697374999572e-05, + "loss": 0.8722, + "step": 60580 + }, + { + "epoch": 0.9725677779739643, + "grad_norm": 0.94708651304245, + "learning_rate": 2.6083398863014256e-05, + "loss": 0.6982, + "step": 60590 + }, + { + "epoch": 0.9727282941941283, + "grad_norm": 0.5146676301956177, + "learning_rate": 2.6077100282131138e-05, + "loss": 0.7637, + "step": 60600 + }, + { + "epoch": 0.9728888104142923, + "grad_norm": 0.7400230765342712, + "learning_rate": 2.6070801632750763e-05, + "loss": 0.7639, + "step": 60610 + }, + { + "epoch": 0.9730493266344564, + "grad_norm": 1.3652548789978027, + "learning_rate": 2.6064502915273685e-05, + "loss": 0.7467, + "step": 60620 + }, + { + "epoch": 0.9732098428546204, + "grad_norm": 1.2947537899017334, + "learning_rate": 2.6058204130100478e-05, + "loss": 0.7306, + "step": 60630 + }, + { + "epoch": 0.9733703590747845, + "grad_norm": 1.220522165298462, + "learning_rate": 2.60519052776317e-05, + "loss": 0.8268, + "step": 60640 + }, + { + "epoch": 0.9735308752949485, + "grad_norm": 0.6106850504875183, + "learning_rate": 2.604560635826791e-05, + "loss": 0.7421, + "step": 60650 + }, + { + "epoch": 0.9736913915151126, + "grad_norm": 0.8952733874320984, + "learning_rate": 2.60393073724097e-05, + "loss": 0.8256, + "step": 60660 + }, + { + "epoch": 0.9738519077352766, + "grad_norm": 0.9124263525009155, + "learning_rate": 2.603300832045764e-05, + "loss": 0.8081, + "step": 60670 + }, + { + "epoch": 0.9740124239554407, + "grad_norm": 0.6789494752883911, + "learning_rate": 2.6026709202812316e-05, + "loss": 0.7822, + "step": 60680 + }, + { + "epoch": 0.9741729401756047, + "grad_norm": 0.6091787219047546, + "learning_rate": 2.602041001987431e-05, + "loss": 0.7953, + "step": 60690 + }, + { + "epoch": 0.9743334563957687, + "grad_norm": 0.7077558636665344, + "learning_rate": 2.601411077204422e-05, + "loss": 0.7294, + "step": 60700 + }, + { + "epoch": 0.9744939726159328, + "grad_norm": 0.8926966786384583, + "learning_rate": 2.600781145972263e-05, + "loss": 0.8605, + "step": 60710 + }, + { + "epoch": 0.9746544888360968, + "grad_norm": 0.8788101673126221, + "learning_rate": 2.6001512083310154e-05, + "loss": 0.7235, + "step": 60720 + }, + { + "epoch": 0.974815005056261, + "grad_norm": 0.8496079444885254, + "learning_rate": 2.5995212643207378e-05, + "loss": 0.7746, + "step": 60730 + }, + { + "epoch": 0.974975521276425, + "grad_norm": 0.6960690021514893, + "learning_rate": 2.598891313981493e-05, + "loss": 0.7429, + "step": 60740 + }, + { + "epoch": 0.9751360374965891, + "grad_norm": 0.85723876953125, + "learning_rate": 2.5982613573533405e-05, + "loss": 0.7, + "step": 60750 + }, + { + "epoch": 0.9752965537167531, + "grad_norm": 0.8274732828140259, + "learning_rate": 2.5976313944763425e-05, + "loss": 0.815, + "step": 60760 + }, + { + "epoch": 0.9754570699369172, + "grad_norm": 1.0237884521484375, + "learning_rate": 2.5970014253905612e-05, + "loss": 0.7597, + "step": 60770 + }, + { + "epoch": 0.9756175861570812, + "grad_norm": 0.6604880690574646, + "learning_rate": 2.5963714501360582e-05, + "loss": 0.7679, + "step": 60780 + }, + { + "epoch": 0.9757781023772453, + "grad_norm": 1.044718861579895, + "learning_rate": 2.5957414687528964e-05, + "loss": 0.7194, + "step": 60790 + }, + { + "epoch": 0.9759386185974093, + "grad_norm": 0.5120238661766052, + "learning_rate": 2.5951114812811394e-05, + "loss": 0.8878, + "step": 60800 + }, + { + "epoch": 0.9760991348175733, + "grad_norm": 0.4864431917667389, + "learning_rate": 2.5944814877608503e-05, + "loss": 0.8088, + "step": 60810 + }, + { + "epoch": 0.9762596510377374, + "grad_norm": 0.7100830078125, + "learning_rate": 2.593851488232093e-05, + "loss": 0.6703, + "step": 60820 + }, + { + "epoch": 0.9764201672579014, + "grad_norm": 0.8133230209350586, + "learning_rate": 2.593221482734933e-05, + "loss": 0.6611, + "step": 60830 + }, + { + "epoch": 0.9765806834780655, + "grad_norm": 0.9041920900344849, + "learning_rate": 2.5925914713094328e-05, + "loss": 0.7349, + "step": 60840 + }, + { + "epoch": 0.9767411996982295, + "grad_norm": 1.076271653175354, + "learning_rate": 2.5919614539956582e-05, + "loss": 0.7284, + "step": 60850 + }, + { + "epoch": 0.9769017159183936, + "grad_norm": 0.9232913255691528, + "learning_rate": 2.591331430833676e-05, + "loss": 0.8161, + "step": 60860 + }, + { + "epoch": 0.9770622321385576, + "grad_norm": 0.7834044694900513, + "learning_rate": 2.59070140186355e-05, + "loss": 0.8255, + "step": 60870 + }, + { + "epoch": 0.9772227483587217, + "grad_norm": 0.6481605768203735, + "learning_rate": 2.5900713671253475e-05, + "loss": 0.7852, + "step": 60880 + }, + { + "epoch": 0.9773832645788857, + "grad_norm": 0.6914343237876892, + "learning_rate": 2.589441326659135e-05, + "loss": 0.7505, + "step": 60890 + }, + { + "epoch": 0.9775437807990497, + "grad_norm": 1.0412356853485107, + "learning_rate": 2.5888112805049786e-05, + "loss": 0.8259, + "step": 60900 + }, + { + "epoch": 0.9777042970192138, + "grad_norm": 1.3312807083129883, + "learning_rate": 2.588181228702946e-05, + "loss": 0.9043, + "step": 60910 + }, + { + "epoch": 0.9778648132393778, + "grad_norm": 0.7504780888557434, + "learning_rate": 2.587551171293105e-05, + "loss": 0.6892, + "step": 60920 + }, + { + "epoch": 0.9780253294595419, + "grad_norm": 0.7742280960083008, + "learning_rate": 2.586921108315523e-05, + "loss": 0.75, + "step": 60930 + }, + { + "epoch": 0.9781858456797059, + "grad_norm": 0.6440693736076355, + "learning_rate": 2.586291039810269e-05, + "loss": 0.7889, + "step": 60940 + }, + { + "epoch": 0.97834636189987, + "grad_norm": 0.9837389588356018, + "learning_rate": 2.5856609658174118e-05, + "loss": 0.8186, + "step": 60950 + }, + { + "epoch": 0.978506878120034, + "grad_norm": 0.7449393272399902, + "learning_rate": 2.5850308863770194e-05, + "loss": 0.7422, + "step": 60960 + }, + { + "epoch": 0.9786673943401981, + "grad_norm": 0.8946384787559509, + "learning_rate": 2.5844008015291615e-05, + "loss": 0.7563, + "step": 60970 + }, + { + "epoch": 0.9788279105603621, + "grad_norm": 0.6071855425834656, + "learning_rate": 2.5837707113139086e-05, + "loss": 0.6791, + "step": 60980 + }, + { + "epoch": 0.9789884267805262, + "grad_norm": 1.1619019508361816, + "learning_rate": 2.58314061577133e-05, + "loss": 0.8548, + "step": 60990 + }, + { + "epoch": 0.9791489430006902, + "grad_norm": 0.9511868953704834, + "learning_rate": 2.5825105149414957e-05, + "loss": 0.8569, + "step": 61000 + }, + { + "epoch": 0.9793094592208542, + "grad_norm": 0.6751697063446045, + "learning_rate": 2.5818804088644778e-05, + "loss": 0.6938, + "step": 61010 + }, + { + "epoch": 0.9794699754410183, + "grad_norm": 1.0623066425323486, + "learning_rate": 2.581250297580346e-05, + "loss": 0.7956, + "step": 61020 + }, + { + "epoch": 0.9796304916611823, + "grad_norm": 0.7446132898330688, + "learning_rate": 2.5806201811291724e-05, + "loss": 0.852, + "step": 61030 + }, + { + "epoch": 0.9797910078813464, + "grad_norm": 0.7426188588142395, + "learning_rate": 2.5799900595510278e-05, + "loss": 0.7281, + "step": 61040 + }, + { + "epoch": 0.9799515241015104, + "grad_norm": 0.6688826680183411, + "learning_rate": 2.5793599328859858e-05, + "loss": 0.8432, + "step": 61050 + }, + { + "epoch": 0.9801120403216745, + "grad_norm": 0.9357270002365112, + "learning_rate": 2.5787298011741172e-05, + "loss": 0.8288, + "step": 61060 + }, + { + "epoch": 0.9802725565418385, + "grad_norm": 0.7055511474609375, + "learning_rate": 2.5780996644554954e-05, + "loss": 0.6428, + "step": 61070 + }, + { + "epoch": 0.9804330727620026, + "grad_norm": 1.1512131690979004, + "learning_rate": 2.5774695227701943e-05, + "loss": 0.6774, + "step": 61080 + }, + { + "epoch": 0.9805935889821666, + "grad_norm": 0.844656765460968, + "learning_rate": 2.5768393761582854e-05, + "loss": 0.7764, + "step": 61090 + }, + { + "epoch": 0.9807541052023308, + "grad_norm": 1.1814428567886353, + "learning_rate": 2.576209224659843e-05, + "loss": 0.6764, + "step": 61100 + }, + { + "epoch": 0.9809146214224947, + "grad_norm": 0.8805698752403259, + "learning_rate": 2.5755790683149424e-05, + "loss": 0.8216, + "step": 61110 + }, + { + "epoch": 0.9810751376426587, + "grad_norm": 0.7099401950836182, + "learning_rate": 2.5749489071636552e-05, + "loss": 0.7206, + "step": 61120 + }, + { + "epoch": 0.9812356538628229, + "grad_norm": 0.8148156404495239, + "learning_rate": 2.574318741246058e-05, + "loss": 0.8037, + "step": 61130 + }, + { + "epoch": 0.9813961700829869, + "grad_norm": 1.103562593460083, + "learning_rate": 2.5736885706022253e-05, + "loss": 0.8744, + "step": 61140 + }, + { + "epoch": 0.981556686303151, + "grad_norm": 0.9196653962135315, + "learning_rate": 2.5730583952722314e-05, + "loss": 0.7238, + "step": 61150 + }, + { + "epoch": 0.981717202523315, + "grad_norm": 0.7625278234481812, + "learning_rate": 2.5724282152961527e-05, + "loss": 0.8461, + "step": 61160 + }, + { + "epoch": 0.9818777187434791, + "grad_norm": 1.9114056825637817, + "learning_rate": 2.5717980307140647e-05, + "loss": 0.7483, + "step": 61170 + }, + { + "epoch": 0.9820382349636431, + "grad_norm": 0.6560629606246948, + "learning_rate": 2.5711678415660432e-05, + "loss": 0.7545, + "step": 61180 + }, + { + "epoch": 0.9821987511838072, + "grad_norm": 0.7365866303443909, + "learning_rate": 2.570537647892165e-05, + "loss": 0.8879, + "step": 61190 + }, + { + "epoch": 0.9823592674039712, + "grad_norm": 0.854896068572998, + "learning_rate": 2.569907449732506e-05, + "loss": 0.8055, + "step": 61200 + }, + { + "epoch": 0.9825197836241352, + "grad_norm": 0.8881088495254517, + "learning_rate": 2.569277247127144e-05, + "loss": 0.7995, + "step": 61210 + }, + { + "epoch": 0.9826802998442993, + "grad_norm": 0.7296180725097656, + "learning_rate": 2.5686470401161556e-05, + "loss": 0.769, + "step": 61220 + }, + { + "epoch": 0.9828408160644633, + "grad_norm": 0.8214325308799744, + "learning_rate": 2.568016828739618e-05, + "loss": 0.7715, + "step": 61230 + }, + { + "epoch": 0.9830013322846274, + "grad_norm": 1.0055397748947144, + "learning_rate": 2.567386613037609e-05, + "loss": 0.6439, + "step": 61240 + }, + { + "epoch": 0.9831618485047914, + "grad_norm": 0.7140029072761536, + "learning_rate": 2.566756393050207e-05, + "loss": 0.8348, + "step": 61250 + }, + { + "epoch": 0.9833223647249555, + "grad_norm": 0.8573659658432007, + "learning_rate": 2.5661261688174914e-05, + "loss": 0.7163, + "step": 61260 + }, + { + "epoch": 0.9834828809451195, + "grad_norm": 1.076686978340149, + "learning_rate": 2.5654959403795387e-05, + "loss": 0.7213, + "step": 61270 + }, + { + "epoch": 0.9836433971652836, + "grad_norm": 0.6705933809280396, + "learning_rate": 2.5648657077764282e-05, + "loss": 0.8158, + "step": 61280 + }, + { + "epoch": 0.9838039133854476, + "grad_norm": 0.7687873840332031, + "learning_rate": 2.56423547104824e-05, + "loss": 0.6898, + "step": 61290 + }, + { + "epoch": 0.9839644296056117, + "grad_norm": 1.415767788887024, + "learning_rate": 2.5636052302350534e-05, + "loss": 0.7997, + "step": 61300 + }, + { + "epoch": 0.9841249458257757, + "grad_norm": 0.8901718854904175, + "learning_rate": 2.5629749853769465e-05, + "loss": 0.7525, + "step": 61310 + }, + { + "epoch": 0.9842854620459397, + "grad_norm": 1.1936841011047363, + "learning_rate": 2.562344736514e-05, + "loss": 0.8371, + "step": 61320 + }, + { + "epoch": 0.9844459782661038, + "grad_norm": 0.7422024607658386, + "learning_rate": 2.5617144836862954e-05, + "loss": 0.6777, + "step": 61330 + }, + { + "epoch": 0.9846064944862678, + "grad_norm": 1.0367074012756348, + "learning_rate": 2.5610842269339113e-05, + "loss": 0.671, + "step": 61340 + }, + { + "epoch": 0.9847670107064319, + "grad_norm": 1.3954631090164185, + "learning_rate": 2.5604539662969285e-05, + "loss": 0.8189, + "step": 61350 + }, + { + "epoch": 0.9849275269265959, + "grad_norm": 0.6614015698432922, + "learning_rate": 2.559823701815429e-05, + "loss": 0.7912, + "step": 61360 + }, + { + "epoch": 0.98508804314676, + "grad_norm": 0.8437753319740295, + "learning_rate": 2.5591934335294927e-05, + "loss": 0.7231, + "step": 61370 + }, + { + "epoch": 0.985248559366924, + "grad_norm": 0.7511743903160095, + "learning_rate": 2.558563161479201e-05, + "loss": 0.722, + "step": 61380 + }, + { + "epoch": 0.9854090755870881, + "grad_norm": 0.865856945514679, + "learning_rate": 2.5579328857046368e-05, + "loss": 0.8935, + "step": 61390 + }, + { + "epoch": 0.9855695918072521, + "grad_norm": 0.8034844398498535, + "learning_rate": 2.557302606245881e-05, + "loss": 0.7472, + "step": 61400 + }, + { + "epoch": 0.9857301080274161, + "grad_norm": 0.9027842879295349, + "learning_rate": 2.556672323143015e-05, + "loss": 0.6139, + "step": 61410 + }, + { + "epoch": 0.9858906242475802, + "grad_norm": 0.6911764740943909, + "learning_rate": 2.5560420364361232e-05, + "loss": 0.8149, + "step": 61420 + }, + { + "epoch": 0.9860511404677442, + "grad_norm": 0.764900803565979, + "learning_rate": 2.555411746165286e-05, + "loss": 0.6187, + "step": 61430 + }, + { + "epoch": 0.9862116566879083, + "grad_norm": 0.7302017211914062, + "learning_rate": 2.5547814523705866e-05, + "loss": 0.8327, + "step": 61440 + }, + { + "epoch": 0.9863721729080723, + "grad_norm": 1.32390296459198, + "learning_rate": 2.5541511550921094e-05, + "loss": 0.7, + "step": 61450 + }, + { + "epoch": 0.9865326891282364, + "grad_norm": 1.1472100019454956, + "learning_rate": 2.5535208543699357e-05, + "loss": 0.7956, + "step": 61460 + }, + { + "epoch": 0.9866932053484004, + "grad_norm": 0.6641342639923096, + "learning_rate": 2.55289055024415e-05, + "loss": 0.8088, + "step": 61470 + }, + { + "epoch": 0.9868537215685645, + "grad_norm": 0.6761956214904785, + "learning_rate": 2.5522602427548358e-05, + "loss": 0.7344, + "step": 61480 + }, + { + "epoch": 0.9870142377887285, + "grad_norm": 0.6641927361488342, + "learning_rate": 2.5516299319420765e-05, + "loss": 0.764, + "step": 61490 + }, + { + "epoch": 0.9871747540088927, + "grad_norm": 0.6453723907470703, + "learning_rate": 2.5509996178459573e-05, + "loss": 0.8111, + "step": 61500 + }, + { + "epoch": 0.9873352702290566, + "grad_norm": 0.8840134739875793, + "learning_rate": 2.550369300506561e-05, + "loss": 0.7351, + "step": 61510 + }, + { + "epoch": 0.9874957864492206, + "grad_norm": 0.6411915421485901, + "learning_rate": 2.5497389799639737e-05, + "loss": 0.748, + "step": 61520 + }, + { + "epoch": 0.9876563026693848, + "grad_norm": 0.828525185585022, + "learning_rate": 2.549108656258279e-05, + "loss": 0.682, + "step": 61530 + }, + { + "epoch": 0.9878168188895488, + "grad_norm": 0.7479402422904968, + "learning_rate": 2.548478329429561e-05, + "loss": 0.8675, + "step": 61540 + }, + { + "epoch": 0.9879773351097129, + "grad_norm": 0.8817206025123596, + "learning_rate": 2.5478479995179067e-05, + "loss": 0.7641, + "step": 61550 + }, + { + "epoch": 0.9881378513298769, + "grad_norm": 1.1491237878799438, + "learning_rate": 2.5472176665634002e-05, + "loss": 0.7423, + "step": 61560 + }, + { + "epoch": 0.988298367550041, + "grad_norm": 0.6479136943817139, + "learning_rate": 2.5465873306061273e-05, + "loss": 0.7859, + "step": 61570 + }, + { + "epoch": 0.988458883770205, + "grad_norm": 0.6299335956573486, + "learning_rate": 2.5459569916861735e-05, + "loss": 0.8179, + "step": 61580 + }, + { + "epoch": 0.9886193999903691, + "grad_norm": 0.8500528335571289, + "learning_rate": 2.5453266498436253e-05, + "loss": 0.8173, + "step": 61590 + }, + { + "epoch": 0.9887799162105331, + "grad_norm": 0.5474291443824768, + "learning_rate": 2.5446963051185673e-05, + "loss": 0.7465, + "step": 61600 + }, + { + "epoch": 0.9889404324306971, + "grad_norm": 1.0020054578781128, + "learning_rate": 2.5440659575510878e-05, + "loss": 0.5304, + "step": 61610 + }, + { + "epoch": 0.9891009486508612, + "grad_norm": 0.6921834945678711, + "learning_rate": 2.5434356071812708e-05, + "loss": 0.6923, + "step": 61620 + }, + { + "epoch": 0.9892614648710252, + "grad_norm": 1.1277168989181519, + "learning_rate": 2.542805254049205e-05, + "loss": 0.7948, + "step": 61630 + }, + { + "epoch": 0.9894219810911893, + "grad_norm": 0.7148985862731934, + "learning_rate": 2.542174898194976e-05, + "loss": 0.715, + "step": 61640 + }, + { + "epoch": 0.9895824973113533, + "grad_norm": 0.8172020316123962, + "learning_rate": 2.541544539658671e-05, + "loss": 0.6847, + "step": 61650 + }, + { + "epoch": 0.9897430135315174, + "grad_norm": 0.31988513469696045, + "learning_rate": 2.540914178480377e-05, + "loss": 0.7086, + "step": 61660 + }, + { + "epoch": 0.9899035297516814, + "grad_norm": 1.0430936813354492, + "learning_rate": 2.5402838147001823e-05, + "loss": 0.7954, + "step": 61670 + }, + { + "epoch": 0.9900640459718455, + "grad_norm": 0.7153648734092712, + "learning_rate": 2.5396534483581724e-05, + "loss": 0.7801, + "step": 61680 + }, + { + "epoch": 0.9902245621920095, + "grad_norm": 0.8469419479370117, + "learning_rate": 2.539023079494436e-05, + "loss": 0.7685, + "step": 61690 + }, + { + "epoch": 0.9903850784121736, + "grad_norm": 0.6128125786781311, + "learning_rate": 2.538392708149061e-05, + "loss": 0.7876, + "step": 61700 + }, + { + "epoch": 0.9905455946323376, + "grad_norm": 0.6766359806060791, + "learning_rate": 2.5377623343621353e-05, + "loss": 0.6877, + "step": 61710 + }, + { + "epoch": 0.9907061108525016, + "grad_norm": 0.6741725206375122, + "learning_rate": 2.5371319581737462e-05, + "loss": 0.7045, + "step": 61720 + }, + { + "epoch": 0.9908666270726657, + "grad_norm": 0.7176598906517029, + "learning_rate": 2.5365015796239834e-05, + "loss": 0.6739, + "step": 61730 + }, + { + "epoch": 0.9910271432928297, + "grad_norm": 1.1009854078292847, + "learning_rate": 2.535871198752934e-05, + "loss": 0.6771, + "step": 61740 + }, + { + "epoch": 0.9911876595129938, + "grad_norm": 0.6897593140602112, + "learning_rate": 2.5352408156006864e-05, + "loss": 0.8036, + "step": 61750 + }, + { + "epoch": 0.9913481757331578, + "grad_norm": 0.6889849305152893, + "learning_rate": 2.5346104302073303e-05, + "loss": 0.743, + "step": 61760 + }, + { + "epoch": 0.9915086919533219, + "grad_norm": 0.6723377704620361, + "learning_rate": 2.533980042612954e-05, + "loss": 0.7335, + "step": 61770 + }, + { + "epoch": 0.9916692081734859, + "grad_norm": 1.1377779245376587, + "learning_rate": 2.5333496528576457e-05, + "loss": 0.7371, + "step": 61780 + }, + { + "epoch": 0.99182972439365, + "grad_norm": 0.9173325896263123, + "learning_rate": 2.5327192609814965e-05, + "loss": 0.8269, + "step": 61790 + }, + { + "epoch": 0.991990240613814, + "grad_norm": 0.6982620358467102, + "learning_rate": 2.5320888670245933e-05, + "loss": 0.7885, + "step": 61800 + }, + { + "epoch": 0.992150756833978, + "grad_norm": 0.8787770867347717, + "learning_rate": 2.531458471027027e-05, + "loss": 0.6944, + "step": 61810 + }, + { + "epoch": 0.9923112730541421, + "grad_norm": 0.6523706912994385, + "learning_rate": 2.5308280730288857e-05, + "loss": 0.774, + "step": 61820 + }, + { + "epoch": 0.9924717892743061, + "grad_norm": 0.6824976205825806, + "learning_rate": 2.5301976730702608e-05, + "loss": 0.7031, + "step": 61830 + }, + { + "epoch": 0.9926323054944702, + "grad_norm": 0.7293452620506287, + "learning_rate": 2.529567271191241e-05, + "loss": 0.8336, + "step": 61840 + }, + { + "epoch": 0.9927928217146342, + "grad_norm": 0.783382773399353, + "learning_rate": 2.528936867431916e-05, + "loss": 0.7475, + "step": 61850 + }, + { + "epoch": 0.9929533379347983, + "grad_norm": 1.0549143552780151, + "learning_rate": 2.528306461832376e-05, + "loss": 0.6847, + "step": 61860 + }, + { + "epoch": 0.9931138541549623, + "grad_norm": 0.5675854086875916, + "learning_rate": 2.5276760544327115e-05, + "loss": 0.7782, + "step": 61870 + }, + { + "epoch": 0.9932743703751264, + "grad_norm": 1.3447155952453613, + "learning_rate": 2.5270456452730123e-05, + "loss": 0.8383, + "step": 61880 + }, + { + "epoch": 0.9934348865952904, + "grad_norm": 0.5075446963310242, + "learning_rate": 2.526415234393369e-05, + "loss": 0.7596, + "step": 61890 + }, + { + "epoch": 0.9935954028154546, + "grad_norm": 0.7674878835678101, + "learning_rate": 2.5257848218338716e-05, + "loss": 0.8507, + "step": 61900 + }, + { + "epoch": 0.9937559190356186, + "grad_norm": 0.8186535835266113, + "learning_rate": 2.5251544076346106e-05, + "loss": 0.7612, + "step": 61910 + }, + { + "epoch": 0.9939164352557825, + "grad_norm": 0.79648357629776, + "learning_rate": 2.5245239918356777e-05, + "loss": 0.7608, + "step": 61920 + }, + { + "epoch": 0.9940769514759467, + "grad_norm": 1.0331196784973145, + "learning_rate": 2.5238935744771624e-05, + "loss": 0.8686, + "step": 61930 + }, + { + "epoch": 0.9942374676961107, + "grad_norm": 0.8341041207313538, + "learning_rate": 2.523263155599156e-05, + "loss": 0.6974, + "step": 61940 + }, + { + "epoch": 0.9943979839162748, + "grad_norm": 0.9249868988990784, + "learning_rate": 2.5226327352417494e-05, + "loss": 0.9202, + "step": 61950 + }, + { + "epoch": 0.9945585001364388, + "grad_norm": 0.8019543886184692, + "learning_rate": 2.5220023134450338e-05, + "loss": 0.7669, + "step": 61960 + }, + { + "epoch": 0.9947190163566029, + "grad_norm": 0.9412922263145447, + "learning_rate": 2.5213718902491e-05, + "loss": 0.927, + "step": 61970 + }, + { + "epoch": 0.9948795325767669, + "grad_norm": 0.7114933729171753, + "learning_rate": 2.5207414656940397e-05, + "loss": 0.8865, + "step": 61980 + }, + { + "epoch": 0.995040048796931, + "grad_norm": 0.8611968159675598, + "learning_rate": 2.520111039819944e-05, + "loss": 0.7375, + "step": 61990 + }, + { + "epoch": 0.995200565017095, + "grad_norm": 1.012342095375061, + "learning_rate": 2.5194806126669042e-05, + "loss": 0.7317, + "step": 62000 + }, + { + "epoch": 0.995361081237259, + "grad_norm": 0.8486974835395813, + "learning_rate": 2.518850184275012e-05, + "loss": 0.6833, + "step": 62010 + }, + { + "epoch": 0.9955215974574231, + "grad_norm": 0.8259174227714539, + "learning_rate": 2.518219754684358e-05, + "loss": 0.7894, + "step": 62020 + }, + { + "epoch": 0.9956821136775871, + "grad_norm": 0.6556829214096069, + "learning_rate": 2.5175893239350347e-05, + "loss": 0.751, + "step": 62030 + }, + { + "epoch": 0.9958426298977512, + "grad_norm": 1.0675313472747803, + "learning_rate": 2.516958892067134e-05, + "loss": 0.8757, + "step": 62040 + }, + { + "epoch": 0.9960031461179152, + "grad_norm": 1.436835765838623, + "learning_rate": 2.516328459120747e-05, + "loss": 0.7657, + "step": 62050 + }, + { + "epoch": 0.9961636623380793, + "grad_norm": 1.3497246503829956, + "learning_rate": 2.5156980251359658e-05, + "loss": 0.7667, + "step": 62060 + }, + { + "epoch": 0.9963241785582433, + "grad_norm": 1.2800496816635132, + "learning_rate": 2.5150675901528826e-05, + "loss": 0.7154, + "step": 62070 + }, + { + "epoch": 0.9964846947784074, + "grad_norm": 0.5809334516525269, + "learning_rate": 2.5144371542115886e-05, + "loss": 0.8094, + "step": 62080 + }, + { + "epoch": 0.9966452109985714, + "grad_norm": 0.7445762157440186, + "learning_rate": 2.513806717352177e-05, + "loss": 0.8335, + "step": 62090 + }, + { + "epoch": 0.9968057272187355, + "grad_norm": 0.6278963088989258, + "learning_rate": 2.5131762796147384e-05, + "loss": 0.9561, + "step": 62100 + }, + { + "epoch": 0.9969662434388995, + "grad_norm": 0.9766242504119873, + "learning_rate": 2.5125458410393664e-05, + "loss": 0.7546, + "step": 62110 + }, + { + "epoch": 0.9971267596590635, + "grad_norm": 0.6552825570106506, + "learning_rate": 2.5119154016661527e-05, + "loss": 0.7152, + "step": 62120 + }, + { + "epoch": 0.9972872758792276, + "grad_norm": 0.657993495464325, + "learning_rate": 2.5112849615351884e-05, + "loss": 0.8564, + "step": 62130 + }, + { + "epoch": 0.9974477920993916, + "grad_norm": 0.7543416619300842, + "learning_rate": 2.5106545206865677e-05, + "loss": 0.7531, + "step": 62140 + }, + { + "epoch": 0.9976083083195557, + "grad_norm": 0.7566736936569214, + "learning_rate": 2.5100240791603823e-05, + "loss": 0.7214, + "step": 62150 + }, + { + "epoch": 0.9977688245397197, + "grad_norm": 0.9442691206932068, + "learning_rate": 2.5093936369967236e-05, + "loss": 0.786, + "step": 62160 + }, + { + "epoch": 0.9979293407598838, + "grad_norm": 0.6915245056152344, + "learning_rate": 2.5087631942356848e-05, + "loss": 0.64, + "step": 62170 + }, + { + "epoch": 0.9980898569800478, + "grad_norm": 0.6654157042503357, + "learning_rate": 2.508132750917358e-05, + "loss": 0.7984, + "step": 62180 + }, + { + "epoch": 0.9982503732002119, + "grad_norm": 1.0593775510787964, + "learning_rate": 2.5075023070818364e-05, + "loss": 0.7062, + "step": 62190 + }, + { + "epoch": 0.9984108894203759, + "grad_norm": 0.7368036508560181, + "learning_rate": 2.5068718627692124e-05, + "loss": 0.6776, + "step": 62200 + }, + { + "epoch": 0.9985714056405399, + "grad_norm": 0.9566357731819153, + "learning_rate": 2.5062414180195774e-05, + "loss": 0.8204, + "step": 62210 + }, + { + "epoch": 0.998731921860704, + "grad_norm": 0.5641929507255554, + "learning_rate": 2.5056109728730254e-05, + "loss": 0.7153, + "step": 62220 + }, + { + "epoch": 0.998892438080868, + "grad_norm": 1.0154519081115723, + "learning_rate": 2.504980527369649e-05, + "loss": 0.8158, + "step": 62230 + }, + { + "epoch": 0.9990529543010321, + "grad_norm": 0.5738666653633118, + "learning_rate": 2.5043500815495395e-05, + "loss": 0.7782, + "step": 62240 + }, + { + "epoch": 0.9992134705211961, + "grad_norm": 0.7303397059440613, + "learning_rate": 2.5037196354527904e-05, + "loss": 0.8718, + "step": 62250 + }, + { + "epoch": 0.9993739867413602, + "grad_norm": 0.8640841841697693, + "learning_rate": 2.503089189119495e-05, + "loss": 0.7182, + "step": 62260 + }, + { + "epoch": 0.9995345029615242, + "grad_norm": 1.1369208097457886, + "learning_rate": 2.5024587425897445e-05, + "loss": 0.7847, + "step": 62270 + }, + { + "epoch": 0.9996950191816883, + "grad_norm": 1.1639087200164795, + "learning_rate": 2.5018282959036325e-05, + "loss": 0.6521, + "step": 62280 + }, + { + "epoch": 0.9998555354018523, + "grad_norm": 0.9179198741912842, + "learning_rate": 2.501197849101252e-05, + "loss": 0.8833, + "step": 62290 + }, + { + "epoch": 1.0000160516220165, + "grad_norm": 0.7948868870735168, + "learning_rate": 2.5005674022226953e-05, + "loss": 0.8203, + "step": 62300 + }, + { + "epoch": 1.0001765678421803, + "grad_norm": 1.5485901832580566, + "learning_rate": 2.4999369553080553e-05, + "loss": 0.7137, + "step": 62310 + }, + { + "epoch": 1.0003370840623445, + "grad_norm": 0.6874681115150452, + "learning_rate": 2.4993065083974237e-05, + "loss": 0.8009, + "step": 62320 + }, + { + "epoch": 1.0004976002825086, + "grad_norm": 0.832650899887085, + "learning_rate": 2.4986760615308948e-05, + "loss": 0.7142, + "step": 62330 + }, + { + "epoch": 1.0006581165026727, + "grad_norm": 0.820235013961792, + "learning_rate": 2.4980456147485613e-05, + "loss": 0.71, + "step": 62340 + }, + { + "epoch": 1.0008186327228366, + "grad_norm": 0.5072542428970337, + "learning_rate": 2.4974151680905143e-05, + "loss": 0.8049, + "step": 62350 + }, + { + "epoch": 1.0009791489430007, + "grad_norm": 0.7368881702423096, + "learning_rate": 2.4967847215968472e-05, + "loss": 0.7062, + "step": 62360 + }, + { + "epoch": 1.0011396651631648, + "grad_norm": 0.7163023352622986, + "learning_rate": 2.4961542753076532e-05, + "loss": 0.7145, + "step": 62370 + }, + { + "epoch": 1.0013001813833289, + "grad_norm": 0.5187686085700989, + "learning_rate": 2.4955238292630244e-05, + "loss": 0.7962, + "step": 62380 + }, + { + "epoch": 1.0014606976034928, + "grad_norm": 0.8290539383888245, + "learning_rate": 2.4948933835030542e-05, + "loss": 0.8317, + "step": 62390 + }, + { + "epoch": 1.0016212138236569, + "grad_norm": 0.8891013264656067, + "learning_rate": 2.494262938067836e-05, + "loss": 0.6305, + "step": 62400 + }, + { + "epoch": 1.001781730043821, + "grad_norm": 0.6979957222938538, + "learning_rate": 2.4936324929974596e-05, + "loss": 0.7356, + "step": 62410 + }, + { + "epoch": 1.0019422462639849, + "grad_norm": 0.702718198299408, + "learning_rate": 2.49300204833202e-05, + "loss": 0.7518, + "step": 62420 + }, + { + "epoch": 1.002102762484149, + "grad_norm": 1.065593957901001, + "learning_rate": 2.4923716041116084e-05, + "loss": 0.7809, + "step": 62430 + }, + { + "epoch": 1.002263278704313, + "grad_norm": 0.7310143113136292, + "learning_rate": 2.4917411603763184e-05, + "loss": 0.6587, + "step": 62440 + }, + { + "epoch": 1.0024237949244772, + "grad_norm": 0.5744591951370239, + "learning_rate": 2.4911107171662416e-05, + "loss": 0.7626, + "step": 62450 + }, + { + "epoch": 1.002584311144641, + "grad_norm": 0.7083208560943604, + "learning_rate": 2.4904802745214723e-05, + "loss": 0.6765, + "step": 62460 + }, + { + "epoch": 1.0027448273648052, + "grad_norm": 1.3949624300003052, + "learning_rate": 2.4898498324821008e-05, + "loss": 0.7649, + "step": 62470 + }, + { + "epoch": 1.0029053435849693, + "grad_norm": 0.7268627882003784, + "learning_rate": 2.4892193910882202e-05, + "loss": 0.7471, + "step": 62480 + }, + { + "epoch": 1.0030658598051334, + "grad_norm": 0.8661142587661743, + "learning_rate": 2.488588950379923e-05, + "loss": 0.5818, + "step": 62490 + }, + { + "epoch": 1.0032263760252973, + "grad_norm": 1.037631630897522, + "learning_rate": 2.487958510397302e-05, + "loss": 0.6812, + "step": 62500 + }, + { + "epoch": 1.0033868922454614, + "grad_norm": 0.6242178082466125, + "learning_rate": 2.487328071180449e-05, + "loss": 0.7685, + "step": 62510 + }, + { + "epoch": 1.0035474084656255, + "grad_norm": 0.7790234684944153, + "learning_rate": 2.4866976327694566e-05, + "loss": 0.6944, + "step": 62520 + }, + { + "epoch": 1.0037079246857894, + "grad_norm": 0.842107892036438, + "learning_rate": 2.4860671952044167e-05, + "loss": 0.7653, + "step": 62530 + }, + { + "epoch": 1.0038684409059535, + "grad_norm": 0.5960937142372131, + "learning_rate": 2.485436758525421e-05, + "loss": 0.6885, + "step": 62540 + }, + { + "epoch": 1.0040289571261176, + "grad_norm": 0.582139253616333, + "learning_rate": 2.4848063227725617e-05, + "loss": 0.7877, + "step": 62550 + }, + { + "epoch": 1.0041894733462817, + "grad_norm": 0.9721177220344543, + "learning_rate": 2.4841758879859314e-05, + "loss": 0.7406, + "step": 62560 + }, + { + "epoch": 1.0043499895664456, + "grad_norm": 0.5814354419708252, + "learning_rate": 2.4835454542056218e-05, + "loss": 0.6658, + "step": 62570 + }, + { + "epoch": 1.0045105057866097, + "grad_norm": 0.6528146862983704, + "learning_rate": 2.4829150214717244e-05, + "loss": 0.679, + "step": 62580 + }, + { + "epoch": 1.0046710220067738, + "grad_norm": 1.4199190139770508, + "learning_rate": 2.4822845898243328e-05, + "loss": 0.7364, + "step": 62590 + }, + { + "epoch": 1.004831538226938, + "grad_norm": 1.1513116359710693, + "learning_rate": 2.4816541593035357e-05, + "loss": 0.7788, + "step": 62600 + }, + { + "epoch": 1.0049920544471018, + "grad_norm": 1.4795360565185547, + "learning_rate": 2.4810237299494268e-05, + "loss": 0.6289, + "step": 62610 + }, + { + "epoch": 1.005152570667266, + "grad_norm": 0.609406054019928, + "learning_rate": 2.4803933018020973e-05, + "loss": 0.7999, + "step": 62620 + }, + { + "epoch": 1.00531308688743, + "grad_norm": 0.975148618221283, + "learning_rate": 2.4797628749016386e-05, + "loss": 0.6823, + "step": 62630 + }, + { + "epoch": 1.005473603107594, + "grad_norm": 0.7932100892066956, + "learning_rate": 2.4791324492881425e-05, + "loss": 0.6581, + "step": 62640 + }, + { + "epoch": 1.005634119327758, + "grad_norm": 0.7091104984283447, + "learning_rate": 2.478502025001701e-05, + "loss": 0.6591, + "step": 62650 + }, + { + "epoch": 1.0057946355479221, + "grad_norm": 1.1706968545913696, + "learning_rate": 2.4778716020824035e-05, + "loss": 0.7401, + "step": 62660 + }, + { + "epoch": 1.0059551517680863, + "grad_norm": 0.5197381973266602, + "learning_rate": 2.477241180570342e-05, + "loss": 0.7167, + "step": 62670 + }, + { + "epoch": 1.0061156679882501, + "grad_norm": 1.39340341091156, + "learning_rate": 2.4766107605056084e-05, + "loss": 0.7287, + "step": 62680 + }, + { + "epoch": 1.0062761842084142, + "grad_norm": 0.8275733590126038, + "learning_rate": 2.4759803419282926e-05, + "loss": 0.6976, + "step": 62690 + }, + { + "epoch": 1.0064367004285784, + "grad_norm": 0.5733929872512817, + "learning_rate": 2.4753499248784866e-05, + "loss": 0.6032, + "step": 62700 + }, + { + "epoch": 1.0065972166487425, + "grad_norm": 0.645094633102417, + "learning_rate": 2.4747195093962814e-05, + "loss": 0.7322, + "step": 62710 + }, + { + "epoch": 1.0067577328689064, + "grad_norm": 0.706322431564331, + "learning_rate": 2.4740890955217657e-05, + "loss": 0.7799, + "step": 62720 + }, + { + "epoch": 1.0069182490890705, + "grad_norm": 0.8258772492408752, + "learning_rate": 2.4734586832950317e-05, + "loss": 0.6831, + "step": 62730 + }, + { + "epoch": 1.0070787653092346, + "grad_norm": 0.7408973574638367, + "learning_rate": 2.47282827275617e-05, + "loss": 0.7028, + "step": 62740 + }, + { + "epoch": 1.0072392815293985, + "grad_norm": 1.0494389533996582, + "learning_rate": 2.4721978639452702e-05, + "loss": 0.7771, + "step": 62750 + }, + { + "epoch": 1.0073997977495626, + "grad_norm": 0.9338209629058838, + "learning_rate": 2.4715674569024232e-05, + "loss": 0.7465, + "step": 62760 + }, + { + "epoch": 1.0075603139697267, + "grad_norm": 0.5227081179618835, + "learning_rate": 2.4709370516677198e-05, + "loss": 0.7129, + "step": 62770 + }, + { + "epoch": 1.0077208301898908, + "grad_norm": 0.9578135013580322, + "learning_rate": 2.4703066482812488e-05, + "loss": 0.6466, + "step": 62780 + }, + { + "epoch": 1.0078813464100547, + "grad_norm": 0.5590094923973083, + "learning_rate": 2.4696762467831004e-05, + "loss": 0.7704, + "step": 62790 + }, + { + "epoch": 1.0080418626302188, + "grad_norm": 0.4651164412498474, + "learning_rate": 2.469045847213365e-05, + "loss": 0.7509, + "step": 62800 + }, + { + "epoch": 1.0082023788503829, + "grad_norm": 0.7231650352478027, + "learning_rate": 2.4684154496121316e-05, + "loss": 0.6538, + "step": 62810 + }, + { + "epoch": 1.0083628950705468, + "grad_norm": 0.8276961445808411, + "learning_rate": 2.4677850540194906e-05, + "loss": 0.745, + "step": 62820 + }, + { + "epoch": 1.0085234112907109, + "grad_norm": 0.761307418346405, + "learning_rate": 2.4671546604755315e-05, + "loss": 0.7539, + "step": 62830 + }, + { + "epoch": 1.008683927510875, + "grad_norm": 0.7179023027420044, + "learning_rate": 2.4665242690203432e-05, + "loss": 0.7271, + "step": 62840 + }, + { + "epoch": 1.008844443731039, + "grad_norm": 0.901297926902771, + "learning_rate": 2.4658938796940146e-05, + "loss": 0.7883, + "step": 62850 + }, + { + "epoch": 1.009004959951203, + "grad_norm": 0.9424343109130859, + "learning_rate": 2.4652634925366346e-05, + "loss": 0.7012, + "step": 62860 + }, + { + "epoch": 1.009165476171367, + "grad_norm": 1.1154533624649048, + "learning_rate": 2.464633107588293e-05, + "loss": 0.7339, + "step": 62870 + }, + { + "epoch": 1.0093259923915312, + "grad_norm": 0.5224239230155945, + "learning_rate": 2.464002724889078e-05, + "loss": 0.7846, + "step": 62880 + }, + { + "epoch": 1.0094865086116953, + "grad_norm": 0.8332353830337524, + "learning_rate": 2.4633723444790784e-05, + "loss": 0.6739, + "step": 62890 + }, + { + "epoch": 1.0096470248318592, + "grad_norm": 1.0475610494613647, + "learning_rate": 2.4627419663983838e-05, + "loss": 0.7613, + "step": 62900 + }, + { + "epoch": 1.0098075410520233, + "grad_norm": 1.0258163213729858, + "learning_rate": 2.4621115906870805e-05, + "loss": 0.7292, + "step": 62910 + }, + { + "epoch": 1.0099680572721874, + "grad_norm": 0.7909448146820068, + "learning_rate": 2.4614812173852574e-05, + "loss": 0.6526, + "step": 62920 + }, + { + "epoch": 1.0101285734923513, + "grad_norm": 0.7191421985626221, + "learning_rate": 2.4608508465330033e-05, + "loss": 0.7766, + "step": 62930 + }, + { + "epoch": 1.0102890897125154, + "grad_norm": 0.8641693592071533, + "learning_rate": 2.460220478170405e-05, + "loss": 0.7812, + "step": 62940 + }, + { + "epoch": 1.0104496059326795, + "grad_norm": 1.148854374885559, + "learning_rate": 2.459590112337551e-05, + "loss": 0.8138, + "step": 62950 + }, + { + "epoch": 1.0106101221528436, + "grad_norm": 0.7935690879821777, + "learning_rate": 2.45895974907453e-05, + "loss": 0.7515, + "step": 62960 + }, + { + "epoch": 1.0107706383730075, + "grad_norm": 1.0722383260726929, + "learning_rate": 2.4583293884214268e-05, + "loss": 0.7532, + "step": 62970 + }, + { + "epoch": 1.0109311545931716, + "grad_norm": 1.0489897727966309, + "learning_rate": 2.45769903041833e-05, + "loss": 0.7284, + "step": 62980 + }, + { + "epoch": 1.0110916708133357, + "grad_norm": 0.995561420917511, + "learning_rate": 2.4570686751053266e-05, + "loss": 0.7158, + "step": 62990 + }, + { + "epoch": 1.0112521870334998, + "grad_norm": 0.8077309131622314, + "learning_rate": 2.4564383225225032e-05, + "loss": 0.7231, + "step": 63000 + }, + { + "epoch": 1.0114127032536637, + "grad_norm": 0.6598693132400513, + "learning_rate": 2.455807972709947e-05, + "loss": 0.7097, + "step": 63010 + }, + { + "epoch": 1.0115732194738278, + "grad_norm": 0.5976759195327759, + "learning_rate": 2.455177625707745e-05, + "loss": 0.7333, + "step": 63020 + }, + { + "epoch": 1.011733735693992, + "grad_norm": 0.9819822311401367, + "learning_rate": 2.454547281555983e-05, + "loss": 0.7407, + "step": 63030 + }, + { + "epoch": 1.0118942519141558, + "grad_norm": 0.7947660684585571, + "learning_rate": 2.453916940294746e-05, + "loss": 0.7474, + "step": 63040 + }, + { + "epoch": 1.01205476813432, + "grad_norm": 0.7896876335144043, + "learning_rate": 2.4532866019641214e-05, + "loss": 0.6738, + "step": 63050 + }, + { + "epoch": 1.012215284354484, + "grad_norm": 0.8658848404884338, + "learning_rate": 2.4526562666041948e-05, + "loss": 0.6883, + "step": 63060 + }, + { + "epoch": 1.0123758005746482, + "grad_norm": 1.764670729637146, + "learning_rate": 2.452025934255052e-05, + "loss": 0.6586, + "step": 63070 + }, + { + "epoch": 1.012536316794812, + "grad_norm": 0.938639223575592, + "learning_rate": 2.451395604956778e-05, + "loss": 0.7968, + "step": 63080 + }, + { + "epoch": 1.0126968330149761, + "grad_norm": 0.8413400053977966, + "learning_rate": 2.4507652787494586e-05, + "loss": 0.7392, + "step": 63090 + }, + { + "epoch": 1.0128573492351403, + "grad_norm": 0.6985334753990173, + "learning_rate": 2.450134955673178e-05, + "loss": 0.6468, + "step": 63100 + }, + { + "epoch": 1.0130178654553044, + "grad_norm": 1.7337173223495483, + "learning_rate": 2.449504635768022e-05, + "loss": 0.7262, + "step": 63110 + }, + { + "epoch": 1.0131783816754683, + "grad_norm": 0.7898188233375549, + "learning_rate": 2.4488743190740743e-05, + "loss": 0.7672, + "step": 63120 + }, + { + "epoch": 1.0133388978956324, + "grad_norm": 0.8581573367118835, + "learning_rate": 2.44824400563142e-05, + "loss": 0.7478, + "step": 63130 + }, + { + "epoch": 1.0134994141157965, + "grad_norm": 0.6887175440788269, + "learning_rate": 2.4476136954801436e-05, + "loss": 0.8531, + "step": 63140 + }, + { + "epoch": 1.0136599303359604, + "grad_norm": 1.0442135334014893, + "learning_rate": 2.4469833886603292e-05, + "loss": 0.6221, + "step": 63150 + }, + { + "epoch": 1.0138204465561245, + "grad_norm": 0.6384783387184143, + "learning_rate": 2.446353085212059e-05, + "loss": 0.7141, + "step": 63160 + }, + { + "epoch": 1.0139809627762886, + "grad_norm": 1.1071946620941162, + "learning_rate": 2.4457227851754178e-05, + "loss": 0.6923, + "step": 63170 + }, + { + "epoch": 1.0141414789964527, + "grad_norm": 1.200080156326294, + "learning_rate": 2.4450924885904887e-05, + "loss": 0.7216, + "step": 63180 + }, + { + "epoch": 1.0143019952166166, + "grad_norm": 0.8525433540344238, + "learning_rate": 2.4444621954973558e-05, + "loss": 0.7592, + "step": 63190 + }, + { + "epoch": 1.0144625114367807, + "grad_norm": 0.8853884339332581, + "learning_rate": 2.4438319059361006e-05, + "loss": 0.6223, + "step": 63200 + }, + { + "epoch": 1.0146230276569448, + "grad_norm": 0.8751156330108643, + "learning_rate": 2.4432016199468076e-05, + "loss": 0.7093, + "step": 63210 + }, + { + "epoch": 1.0147835438771087, + "grad_norm": 0.6166747212409973, + "learning_rate": 2.4425713375695576e-05, + "loss": 0.7699, + "step": 63220 + }, + { + "epoch": 1.0149440600972728, + "grad_norm": 1.211146593093872, + "learning_rate": 2.441941058844433e-05, + "loss": 0.6416, + "step": 63230 + }, + { + "epoch": 1.015104576317437, + "grad_norm": 1.1309776306152344, + "learning_rate": 2.4413107838115165e-05, + "loss": 0.675, + "step": 63240 + }, + { + "epoch": 1.015265092537601, + "grad_norm": 0.6790115237236023, + "learning_rate": 2.4406805125108896e-05, + "loss": 0.7397, + "step": 63250 + }, + { + "epoch": 1.0154256087577649, + "grad_norm": 1.2443382740020752, + "learning_rate": 2.440050244982634e-05, + "loss": 0.714, + "step": 63260 + }, + { + "epoch": 1.015586124977929, + "grad_norm": 0.8129385113716125, + "learning_rate": 2.439419981266832e-05, + "loss": 0.7244, + "step": 63270 + }, + { + "epoch": 1.015746641198093, + "grad_norm": 0.6374788880348206, + "learning_rate": 2.4387897214035623e-05, + "loss": 0.7265, + "step": 63280 + }, + { + "epoch": 1.0159071574182572, + "grad_norm": 0.9452635049819946, + "learning_rate": 2.4381594654329073e-05, + "loss": 0.7487, + "step": 63290 + }, + { + "epoch": 1.016067673638421, + "grad_norm": 0.5486201643943787, + "learning_rate": 2.4375292133949473e-05, + "loss": 0.6068, + "step": 63300 + }, + { + "epoch": 1.0162281898585852, + "grad_norm": 0.8494896292686462, + "learning_rate": 2.4368989653297627e-05, + "loss": 0.6813, + "step": 63310 + }, + { + "epoch": 1.0163887060787493, + "grad_norm": 0.6621972322463989, + "learning_rate": 2.4362687212774335e-05, + "loss": 0.6926, + "step": 63320 + }, + { + "epoch": 1.0165492222989132, + "grad_norm": 1.2329038381576538, + "learning_rate": 2.4356384812780398e-05, + "loss": 0.717, + "step": 63330 + }, + { + "epoch": 1.0167097385190773, + "grad_norm": 0.935461163520813, + "learning_rate": 2.4350082453716617e-05, + "loss": 0.8002, + "step": 63340 + }, + { + "epoch": 1.0168702547392414, + "grad_norm": 0.9068610072135925, + "learning_rate": 2.4343780135983768e-05, + "loss": 0.7795, + "step": 63350 + }, + { + "epoch": 1.0170307709594055, + "grad_norm": 1.0861425399780273, + "learning_rate": 2.433747785998265e-05, + "loss": 0.7654, + "step": 63360 + }, + { + "epoch": 1.0171912871795694, + "grad_norm": 0.6472353339195251, + "learning_rate": 2.433117562611405e-05, + "loss": 0.7185, + "step": 63370 + }, + { + "epoch": 1.0173518033997335, + "grad_norm": 0.5575109124183655, + "learning_rate": 2.4324873434778757e-05, + "loss": 0.6892, + "step": 63380 + }, + { + "epoch": 1.0175123196198976, + "grad_norm": 1.1362025737762451, + "learning_rate": 2.4318571286377552e-05, + "loss": 0.6837, + "step": 63390 + }, + { + "epoch": 1.0176728358400617, + "grad_norm": 0.8302701115608215, + "learning_rate": 2.4312269181311214e-05, + "loss": 0.7761, + "step": 63400 + }, + { + "epoch": 1.0178333520602256, + "grad_norm": 0.8848928809165955, + "learning_rate": 2.430596711998052e-05, + "loss": 0.8717, + "step": 63410 + }, + { + "epoch": 1.0179938682803897, + "grad_norm": 1.2020705938339233, + "learning_rate": 2.429966510278624e-05, + "loss": 0.6327, + "step": 63420 + }, + { + "epoch": 1.0181543845005538, + "grad_norm": 1.7517600059509277, + "learning_rate": 2.429336313012915e-05, + "loss": 0.6783, + "step": 63430 + }, + { + "epoch": 1.0183149007207177, + "grad_norm": 0.9589860439300537, + "learning_rate": 2.4287061202410015e-05, + "loss": 0.8113, + "step": 63440 + }, + { + "epoch": 1.0184754169408818, + "grad_norm": 0.757745623588562, + "learning_rate": 2.4280759320029604e-05, + "loss": 0.7341, + "step": 63450 + }, + { + "epoch": 1.018635933161046, + "grad_norm": 0.7785635590553284, + "learning_rate": 2.4274457483388694e-05, + "loss": 0.6425, + "step": 63460 + }, + { + "epoch": 1.01879644938121, + "grad_norm": 0.8263211846351624, + "learning_rate": 2.4268155692888014e-05, + "loss": 0.62, + "step": 63470 + }, + { + "epoch": 1.018956965601374, + "grad_norm": 0.8695405125617981, + "learning_rate": 2.4261853948928338e-05, + "loss": 0.7013, + "step": 63480 + }, + { + "epoch": 1.019117481821538, + "grad_norm": 0.7331264615058899, + "learning_rate": 2.4255552251910418e-05, + "loss": 0.7907, + "step": 63490 + }, + { + "epoch": 1.0192779980417022, + "grad_norm": 0.9513654708862305, + "learning_rate": 2.4249250602235006e-05, + "loss": 0.7597, + "step": 63500 + }, + { + "epoch": 1.0194385142618663, + "grad_norm": 0.9103023409843445, + "learning_rate": 2.4242949000302847e-05, + "loss": 0.868, + "step": 63510 + }, + { + "epoch": 1.0195990304820302, + "grad_norm": 0.8035452961921692, + "learning_rate": 2.4236647446514702e-05, + "loss": 0.6936, + "step": 63520 + }, + { + "epoch": 1.0197595467021943, + "grad_norm": 1.1235511302947998, + "learning_rate": 2.423034594127129e-05, + "loss": 0.7748, + "step": 63530 + }, + { + "epoch": 1.0199200629223584, + "grad_norm": 0.8113754987716675, + "learning_rate": 2.4224044484973353e-05, + "loss": 0.7157, + "step": 63540 + }, + { + "epoch": 1.0200805791425223, + "grad_norm": 0.8841383457183838, + "learning_rate": 2.4217743078021636e-05, + "loss": 0.7368, + "step": 63550 + }, + { + "epoch": 1.0202410953626864, + "grad_norm": 0.8939146995544434, + "learning_rate": 2.4211441720816865e-05, + "loss": 0.7372, + "step": 63560 + }, + { + "epoch": 1.0204016115828505, + "grad_norm": 1.1779539585113525, + "learning_rate": 2.4205140413759775e-05, + "loss": 0.7289, + "step": 63570 + }, + { + "epoch": 1.0205621278030146, + "grad_norm": 0.7881899476051331, + "learning_rate": 2.41988391572511e-05, + "loss": 0.6863, + "step": 63580 + }, + { + "epoch": 1.0207226440231785, + "grad_norm": 1.0086829662322998, + "learning_rate": 2.419253795169154e-05, + "loss": 0.7602, + "step": 63590 + }, + { + "epoch": 1.0208831602433426, + "grad_norm": 0.6705745458602905, + "learning_rate": 2.4186236797481826e-05, + "loss": 0.6345, + "step": 63600 + }, + { + "epoch": 1.0210436764635067, + "grad_norm": 0.6613556742668152, + "learning_rate": 2.4179935695022678e-05, + "loss": 0.6372, + "step": 63610 + }, + { + "epoch": 1.0212041926836708, + "grad_norm": 0.8033743500709534, + "learning_rate": 2.4173634644714806e-05, + "loss": 0.5847, + "step": 63620 + }, + { + "epoch": 1.0213647089038347, + "grad_norm": 0.6804102659225464, + "learning_rate": 2.416733364695892e-05, + "loss": 0.6074, + "step": 63630 + }, + { + "epoch": 1.0215252251239988, + "grad_norm": 1.0094879865646362, + "learning_rate": 2.4161032702155726e-05, + "loss": 0.721, + "step": 63640 + }, + { + "epoch": 1.021685741344163, + "grad_norm": 0.9911817312240601, + "learning_rate": 2.415473181070594e-05, + "loss": 0.8128, + "step": 63650 + }, + { + "epoch": 1.0218462575643268, + "grad_norm": 0.7195400595664978, + "learning_rate": 2.4148430973010236e-05, + "loss": 0.6986, + "step": 63660 + }, + { + "epoch": 1.022006773784491, + "grad_norm": 0.9183965921401978, + "learning_rate": 2.414213018946933e-05, + "loss": 0.7835, + "step": 63670 + }, + { + "epoch": 1.022167290004655, + "grad_norm": 0.8005900979042053, + "learning_rate": 2.41358294604839e-05, + "loss": 0.7171, + "step": 63680 + }, + { + "epoch": 1.0223278062248191, + "grad_norm": 1.0225363969802856, + "learning_rate": 2.412952878645465e-05, + "loss": 0.5785, + "step": 63690 + }, + { + "epoch": 1.022488322444983, + "grad_norm": 1.0614513158798218, + "learning_rate": 2.4123228167782258e-05, + "loss": 0.6906, + "step": 63700 + }, + { + "epoch": 1.022648838665147, + "grad_norm": 0.9634148478507996, + "learning_rate": 2.4116927604867417e-05, + "loss": 0.7512, + "step": 63710 + }, + { + "epoch": 1.0228093548853112, + "grad_norm": 1.2051678895950317, + "learning_rate": 2.411062709811079e-05, + "loss": 0.7791, + "step": 63720 + }, + { + "epoch": 1.022969871105475, + "grad_norm": 0.8143066763877869, + "learning_rate": 2.410432664791306e-05, + "loss": 0.739, + "step": 63730 + }, + { + "epoch": 1.0231303873256392, + "grad_norm": 0.9495546221733093, + "learning_rate": 2.4098026254674892e-05, + "loss": 0.7845, + "step": 63740 + }, + { + "epoch": 1.0232909035458033, + "grad_norm": 0.9331105351448059, + "learning_rate": 2.4091725918796967e-05, + "loss": 0.6099, + "step": 63750 + }, + { + "epoch": 1.0234514197659674, + "grad_norm": 0.7203542590141296, + "learning_rate": 2.4085425640679943e-05, + "loss": 0.8219, + "step": 63760 + }, + { + "epoch": 1.0236119359861313, + "grad_norm": 0.8178290128707886, + "learning_rate": 2.4079125420724487e-05, + "loss": 0.7449, + "step": 63770 + }, + { + "epoch": 1.0237724522062954, + "grad_norm": 0.9844710230827332, + "learning_rate": 2.4072825259331242e-05, + "loss": 0.7696, + "step": 63780 + }, + { + "epoch": 1.0239329684264595, + "grad_norm": 0.8785572648048401, + "learning_rate": 2.4066525156900865e-05, + "loss": 0.6985, + "step": 63790 + }, + { + "epoch": 1.0240934846466236, + "grad_norm": 0.9817780256271362, + "learning_rate": 2.406022511383401e-05, + "loss": 0.6358, + "step": 63800 + }, + { + "epoch": 1.0242540008667875, + "grad_norm": 0.7712971568107605, + "learning_rate": 2.4053925130531323e-05, + "loss": 0.7621, + "step": 63810 + }, + { + "epoch": 1.0244145170869516, + "grad_norm": 0.7290503978729248, + "learning_rate": 2.4047625207393444e-05, + "loss": 0.6418, + "step": 63820 + }, + { + "epoch": 1.0245750333071157, + "grad_norm": 1.4703694581985474, + "learning_rate": 2.4041325344821026e-05, + "loss": 0.77, + "step": 63830 + }, + { + "epoch": 1.0247355495272796, + "grad_norm": 1.0263617038726807, + "learning_rate": 2.4035025543214675e-05, + "loss": 0.7215, + "step": 63840 + }, + { + "epoch": 1.0248960657474437, + "grad_norm": 0.9203011989593506, + "learning_rate": 2.4028725802975038e-05, + "loss": 0.7427, + "step": 63850 + }, + { + "epoch": 1.0250565819676078, + "grad_norm": 0.634994626045227, + "learning_rate": 2.4022426124502737e-05, + "loss": 0.7221, + "step": 63860 + }, + { + "epoch": 1.025217098187772, + "grad_norm": 0.7125700116157532, + "learning_rate": 2.4016126508198396e-05, + "loss": 0.7707, + "step": 63870 + }, + { + "epoch": 1.0253776144079358, + "grad_norm": 0.7932704091072083, + "learning_rate": 2.4009826954462637e-05, + "loss": 0.7355, + "step": 63880 + }, + { + "epoch": 1.0255381306281, + "grad_norm": 0.8120912313461304, + "learning_rate": 2.400352746369607e-05, + "loss": 0.6519, + "step": 63890 + }, + { + "epoch": 1.025698646848264, + "grad_norm": 0.6495726108551025, + "learning_rate": 2.3997228036299317e-05, + "loss": 0.5959, + "step": 63900 + }, + { + "epoch": 1.0258591630684282, + "grad_norm": 0.5983802080154419, + "learning_rate": 2.3990928672672968e-05, + "loss": 0.6824, + "step": 63910 + }, + { + "epoch": 1.026019679288592, + "grad_norm": 1.281423568725586, + "learning_rate": 2.398462937321763e-05, + "loss": 0.8552, + "step": 63920 + }, + { + "epoch": 1.0261801955087562, + "grad_norm": 1.109545350074768, + "learning_rate": 2.3978330138333907e-05, + "loss": 0.7335, + "step": 63930 + }, + { + "epoch": 1.0263407117289203, + "grad_norm": 1.0521645545959473, + "learning_rate": 2.397203096842239e-05, + "loss": 0.7694, + "step": 63940 + }, + { + "epoch": 1.0265012279490842, + "grad_norm": 0.8022415041923523, + "learning_rate": 2.3965731863883666e-05, + "loss": 0.7573, + "step": 63950 + }, + { + "epoch": 1.0266617441692483, + "grad_norm": 0.7661333680152893, + "learning_rate": 2.3959432825118342e-05, + "loss": 0.6748, + "step": 63960 + }, + { + "epoch": 1.0268222603894124, + "grad_norm": 1.186143398284912, + "learning_rate": 2.395313385252697e-05, + "loss": 0.8, + "step": 63970 + }, + { + "epoch": 1.0269827766095765, + "grad_norm": 1.1599571704864502, + "learning_rate": 2.3946834946510135e-05, + "loss": 0.6512, + "step": 63980 + }, + { + "epoch": 1.0271432928297404, + "grad_norm": 0.7636520266532898, + "learning_rate": 2.3940536107468424e-05, + "loss": 0.7604, + "step": 63990 + }, + { + "epoch": 1.0273038090499045, + "grad_norm": 0.7878389358520508, + "learning_rate": 2.3934237335802393e-05, + "loss": 0.5918, + "step": 64000 + }, + { + "epoch": 1.0273038090499045, + "eval_loss": 0.7761463522911072, + "eval_runtime": 1833.7732, + "eval_samples_per_second": 14.304, + "eval_steps_per_second": 1.788, + "step": 64000 + }, + { + "epoch": 1.0274643252700686, + "grad_norm": 0.7294833660125732, + "learning_rate": 2.3927938631912612e-05, + "loss": 0.6537, + "step": 64010 + }, + { + "epoch": 1.0276248414902325, + "grad_norm": 0.9238069653511047, + "learning_rate": 2.392163999619965e-05, + "loss": 0.7504, + "step": 64020 + }, + { + "epoch": 1.0277853577103966, + "grad_norm": 0.8415144681930542, + "learning_rate": 2.3915341429064045e-05, + "loss": 0.6367, + "step": 64030 + }, + { + "epoch": 1.0279458739305607, + "grad_norm": 0.9644629955291748, + "learning_rate": 2.3909042930906362e-05, + "loss": 0.6863, + "step": 64040 + }, + { + "epoch": 1.0281063901507248, + "grad_norm": 1.499157190322876, + "learning_rate": 2.3902744502127146e-05, + "loss": 0.7283, + "step": 64050 + }, + { + "epoch": 1.0282669063708887, + "grad_norm": 0.8438952565193176, + "learning_rate": 2.3896446143126938e-05, + "loss": 0.6881, + "step": 64060 + }, + { + "epoch": 1.0284274225910528, + "grad_norm": 0.6472133994102478, + "learning_rate": 2.389014785430628e-05, + "loss": 0.6338, + "step": 64070 + }, + { + "epoch": 1.028587938811217, + "grad_norm": 0.709945559501648, + "learning_rate": 2.388384963606571e-05, + "loss": 0.7049, + "step": 64080 + }, + { + "epoch": 1.028748455031381, + "grad_norm": 0.5620659589767456, + "learning_rate": 2.3877551488805745e-05, + "loss": 0.6857, + "step": 64090 + }, + { + "epoch": 1.028908971251545, + "grad_norm": 0.9244040846824646, + "learning_rate": 2.387125341292692e-05, + "loss": 0.6799, + "step": 64100 + }, + { + "epoch": 1.029069487471709, + "grad_norm": 0.8980721235275269, + "learning_rate": 2.386495540882975e-05, + "loss": 0.6409, + "step": 64110 + }, + { + "epoch": 1.0292300036918731, + "grad_norm": 0.996030867099762, + "learning_rate": 2.3858657476914754e-05, + "loss": 0.7536, + "step": 64120 + }, + { + "epoch": 1.029390519912037, + "grad_norm": 0.7668693661689758, + "learning_rate": 2.385235961758244e-05, + "loss": 0.6244, + "step": 64130 + }, + { + "epoch": 1.0295510361322011, + "grad_norm": 0.698319673538208, + "learning_rate": 2.3846061831233325e-05, + "loss": 0.6721, + "step": 64140 + }, + { + "epoch": 1.0297115523523652, + "grad_norm": 0.9531891345977783, + "learning_rate": 2.383976411826791e-05, + "loss": 0.661, + "step": 64150 + }, + { + "epoch": 1.0298720685725293, + "grad_norm": 0.8400694727897644, + "learning_rate": 2.383346647908668e-05, + "loss": 0.7987, + "step": 64160 + }, + { + "epoch": 1.0300325847926932, + "grad_norm": 1.1302194595336914, + "learning_rate": 2.382716891409013e-05, + "loss": 0.7173, + "step": 64170 + }, + { + "epoch": 1.0301931010128573, + "grad_norm": 1.5380958318710327, + "learning_rate": 2.382087142367876e-05, + "loss": 0.7044, + "step": 64180 + }, + { + "epoch": 1.0303536172330214, + "grad_norm": 0.9517553448677063, + "learning_rate": 2.381457400825304e-05, + "loss": 0.6615, + "step": 64190 + }, + { + "epoch": 1.0305141334531855, + "grad_norm": 0.6704196929931641, + "learning_rate": 2.380827666821346e-05, + "loss": 0.7541, + "step": 64200 + }, + { + "epoch": 1.0306746496733494, + "grad_norm": 0.781816840171814, + "learning_rate": 2.38019794039605e-05, + "loss": 0.661, + "step": 64210 + }, + { + "epoch": 1.0308351658935135, + "grad_norm": 0.6788597106933594, + "learning_rate": 2.3795682215894604e-05, + "loss": 0.5395, + "step": 64220 + }, + { + "epoch": 1.0309956821136776, + "grad_norm": 1.4368489980697632, + "learning_rate": 2.3789385104416254e-05, + "loss": 0.7743, + "step": 64230 + }, + { + "epoch": 1.0311561983338415, + "grad_norm": 0.9181525111198425, + "learning_rate": 2.3783088069925904e-05, + "loss": 0.742, + "step": 64240 + }, + { + "epoch": 1.0313167145540056, + "grad_norm": 0.717510998249054, + "learning_rate": 2.377679111282401e-05, + "loss": 0.6386, + "step": 64250 + }, + { + "epoch": 1.0314772307741698, + "grad_norm": 0.8025223612785339, + "learning_rate": 2.3770494233511025e-05, + "loss": 0.6433, + "step": 64260 + }, + { + "epoch": 1.0316377469943339, + "grad_norm": 0.9484972953796387, + "learning_rate": 2.3764197432387392e-05, + "loss": 0.7182, + "step": 64270 + }, + { + "epoch": 1.0317982632144977, + "grad_norm": 0.8325333595275879, + "learning_rate": 2.3757900709853547e-05, + "loss": 0.7321, + "step": 64280 + }, + { + "epoch": 1.0319587794346619, + "grad_norm": 0.9445354342460632, + "learning_rate": 2.3751604066309926e-05, + "loss": 0.7271, + "step": 64290 + }, + { + "epoch": 1.032119295654826, + "grad_norm": 0.9322941899299622, + "learning_rate": 2.3745307502156958e-05, + "loss": 0.7251, + "step": 64300 + }, + { + "epoch": 1.03227981187499, + "grad_norm": 0.6221054792404175, + "learning_rate": 2.3739011017795065e-05, + "loss": 0.6468, + "step": 64310 + }, + { + "epoch": 1.032440328095154, + "grad_norm": 0.9800136685371399, + "learning_rate": 2.3732714613624673e-05, + "loss": 0.7029, + "step": 64320 + }, + { + "epoch": 1.032600844315318, + "grad_norm": 0.9821366667747498, + "learning_rate": 2.37264182900462e-05, + "loss": 0.7406, + "step": 64330 + }, + { + "epoch": 1.0327613605354822, + "grad_norm": 1.2906792163848877, + "learning_rate": 2.3720122047460042e-05, + "loss": 0.7045, + "step": 64340 + }, + { + "epoch": 1.032921876755646, + "grad_norm": 0.9577538967132568, + "learning_rate": 2.3713825886266607e-05, + "loss": 0.7275, + "step": 64350 + }, + { + "epoch": 1.0330823929758102, + "grad_norm": 0.6587565541267395, + "learning_rate": 2.3707529806866298e-05, + "loss": 0.6592, + "step": 64360 + }, + { + "epoch": 1.0332429091959743, + "grad_norm": 0.7201858162879944, + "learning_rate": 2.370123380965951e-05, + "loss": 0.6985, + "step": 64370 + }, + { + "epoch": 1.0334034254161384, + "grad_norm": 1.0430965423583984, + "learning_rate": 2.3694937895046627e-05, + "loss": 0.7751, + "step": 64380 + }, + { + "epoch": 1.0335639416363023, + "grad_norm": 0.8358522057533264, + "learning_rate": 2.3688642063428043e-05, + "loss": 0.723, + "step": 64390 + }, + { + "epoch": 1.0337244578564664, + "grad_norm": 0.9772236943244934, + "learning_rate": 2.368234631520412e-05, + "loss": 0.5892, + "step": 64400 + }, + { + "epoch": 1.0338849740766305, + "grad_norm": 0.7582921981811523, + "learning_rate": 2.3676050650775232e-05, + "loss": 0.782, + "step": 64410 + }, + { + "epoch": 1.0340454902967946, + "grad_norm": 0.8933585286140442, + "learning_rate": 2.3669755070541756e-05, + "loss": 0.7423, + "step": 64420 + }, + { + "epoch": 1.0342060065169585, + "grad_norm": 0.5856038928031921, + "learning_rate": 2.366345957490405e-05, + "loss": 0.6914, + "step": 64430 + }, + { + "epoch": 1.0343665227371226, + "grad_norm": 0.7375427484512329, + "learning_rate": 2.365716416426247e-05, + "loss": 0.667, + "step": 64440 + }, + { + "epoch": 1.0345270389572867, + "grad_norm": 1.0260412693023682, + "learning_rate": 2.3650868839017366e-05, + "loss": 0.6631, + "step": 64450 + }, + { + "epoch": 1.0346875551774506, + "grad_norm": 0.8026610016822815, + "learning_rate": 2.3644573599569094e-05, + "loss": 0.8688, + "step": 64460 + }, + { + "epoch": 1.0348480713976147, + "grad_norm": 0.6269752383232117, + "learning_rate": 2.3638278446317977e-05, + "loss": 0.7334, + "step": 64470 + }, + { + "epoch": 1.0350085876177788, + "grad_norm": 1.049219012260437, + "learning_rate": 2.3631983379664362e-05, + "loss": 0.721, + "step": 64480 + }, + { + "epoch": 1.035169103837943, + "grad_norm": 0.8788737058639526, + "learning_rate": 2.362568840000857e-05, + "loss": 0.7949, + "step": 64490 + }, + { + "epoch": 1.0353296200581068, + "grad_norm": 0.9931491017341614, + "learning_rate": 2.361939350775093e-05, + "loss": 0.6798, + "step": 64500 + }, + { + "epoch": 1.035490136278271, + "grad_norm": 0.7669209241867065, + "learning_rate": 2.3613098703291758e-05, + "loss": 0.654, + "step": 64510 + }, + { + "epoch": 1.035650652498435, + "grad_norm": 1.0186619758605957, + "learning_rate": 2.360680398703138e-05, + "loss": 0.6814, + "step": 64520 + }, + { + "epoch": 1.035811168718599, + "grad_norm": 0.9089034199714661, + "learning_rate": 2.3600509359370082e-05, + "loss": 0.7898, + "step": 64530 + }, + { + "epoch": 1.035971684938763, + "grad_norm": 1.0021523237228394, + "learning_rate": 2.3594214820708173e-05, + "loss": 0.7844, + "step": 64540 + }, + { + "epoch": 1.0361322011589271, + "grad_norm": 0.6015139222145081, + "learning_rate": 2.3587920371445953e-05, + "loss": 0.6466, + "step": 64550 + }, + { + "epoch": 1.0362927173790912, + "grad_norm": 1.1288173198699951, + "learning_rate": 2.358162601198371e-05, + "loss": 0.796, + "step": 64560 + }, + { + "epoch": 1.0364532335992551, + "grad_norm": 1.113667368888855, + "learning_rate": 2.3575331742721723e-05, + "loss": 0.7484, + "step": 64570 + }, + { + "epoch": 1.0366137498194192, + "grad_norm": 1.6339006423950195, + "learning_rate": 2.3569037564060285e-05, + "loss": 0.7282, + "step": 64580 + }, + { + "epoch": 1.0367742660395833, + "grad_norm": 2.083587408065796, + "learning_rate": 2.3562743476399652e-05, + "loss": 0.7209, + "step": 64590 + }, + { + "epoch": 1.0369347822597474, + "grad_norm": 0.8366955518722534, + "learning_rate": 2.35564494801401e-05, + "loss": 0.8131, + "step": 64600 + }, + { + "epoch": 1.0370952984799113, + "grad_norm": 0.6465186476707458, + "learning_rate": 2.3550155575681888e-05, + "loss": 0.692, + "step": 64610 + }, + { + "epoch": 1.0372558147000754, + "grad_norm": 1.0082842111587524, + "learning_rate": 2.354386176342527e-05, + "loss": 0.7743, + "step": 64620 + }, + { + "epoch": 1.0374163309202395, + "grad_norm": 0.6565262675285339, + "learning_rate": 2.3537568043770494e-05, + "loss": 0.7154, + "step": 64630 + }, + { + "epoch": 1.0375768471404034, + "grad_norm": 0.762374222278595, + "learning_rate": 2.3531274417117816e-05, + "loss": 0.6607, + "step": 64640 + }, + { + "epoch": 1.0377373633605675, + "grad_norm": 1.1991475820541382, + "learning_rate": 2.352498088386746e-05, + "loss": 0.7725, + "step": 64650 + }, + { + "epoch": 1.0378978795807317, + "grad_norm": 0.7669653296470642, + "learning_rate": 2.3518687444419658e-05, + "loss": 0.7076, + "step": 64660 + }, + { + "epoch": 1.0380583958008958, + "grad_norm": 2.021963119506836, + "learning_rate": 2.351239409917464e-05, + "loss": 0.7302, + "step": 64670 + }, + { + "epoch": 1.0382189120210596, + "grad_norm": 0.8704972267150879, + "learning_rate": 2.350610084853263e-05, + "loss": 0.7522, + "step": 64680 + }, + { + "epoch": 1.0383794282412238, + "grad_norm": 0.818389892578125, + "learning_rate": 2.3499807692893836e-05, + "loss": 0.6293, + "step": 64690 + }, + { + "epoch": 1.0385399444613879, + "grad_norm": 0.7037115693092346, + "learning_rate": 2.3493514632658466e-05, + "loss": 0.7378, + "step": 64700 + }, + { + "epoch": 1.038700460681552, + "grad_norm": 0.977931797504425, + "learning_rate": 2.348722166822674e-05, + "loss": 0.7387, + "step": 64710 + }, + { + "epoch": 1.0388609769017159, + "grad_norm": 0.7418214678764343, + "learning_rate": 2.3480928799998818e-05, + "loss": 0.8217, + "step": 64720 + }, + { + "epoch": 1.03902149312188, + "grad_norm": 0.6956214904785156, + "learning_rate": 2.3474636028374915e-05, + "loss": 0.7127, + "step": 64730 + }, + { + "epoch": 1.039182009342044, + "grad_norm": 0.9053347706794739, + "learning_rate": 2.3468343353755204e-05, + "loss": 0.6359, + "step": 64740 + }, + { + "epoch": 1.039342525562208, + "grad_norm": 1.1292824745178223, + "learning_rate": 2.3462050776539864e-05, + "loss": 0.755, + "step": 64750 + }, + { + "epoch": 1.039503041782372, + "grad_norm": 0.8009498715400696, + "learning_rate": 2.345575829712907e-05, + "loss": 0.8159, + "step": 64760 + }, + { + "epoch": 1.0396635580025362, + "grad_norm": 0.689756453037262, + "learning_rate": 2.3449465915922998e-05, + "loss": 0.6031, + "step": 64770 + }, + { + "epoch": 1.0398240742227003, + "grad_norm": 1.1143076419830322, + "learning_rate": 2.3443173633321777e-05, + "loss": 0.7917, + "step": 64780 + }, + { + "epoch": 1.0399845904428642, + "grad_norm": 0.6928494572639465, + "learning_rate": 2.3436881449725578e-05, + "loss": 0.6235, + "step": 64790 + }, + { + "epoch": 1.0401451066630283, + "grad_norm": 0.7418351173400879, + "learning_rate": 2.3430589365534543e-05, + "loss": 0.6881, + "step": 64800 + }, + { + "epoch": 1.0403056228831924, + "grad_norm": 1.2108638286590576, + "learning_rate": 2.3424297381148814e-05, + "loss": 0.6983, + "step": 64810 + }, + { + "epoch": 1.0404661391033563, + "grad_norm": 0.9472651481628418, + "learning_rate": 2.3418005496968527e-05, + "loss": 0.6981, + "step": 64820 + }, + { + "epoch": 1.0406266553235204, + "grad_norm": 1.0164469480514526, + "learning_rate": 2.341171371339381e-05, + "loss": 0.7835, + "step": 64830 + }, + { + "epoch": 1.0407871715436845, + "grad_norm": 1.202379822731018, + "learning_rate": 2.340542203082477e-05, + "loss": 0.7479, + "step": 64840 + }, + { + "epoch": 1.0409476877638486, + "grad_norm": 1.6326676607131958, + "learning_rate": 2.339913044966153e-05, + "loss": 0.7602, + "step": 64850 + }, + { + "epoch": 1.0411082039840125, + "grad_norm": 0.7627531290054321, + "learning_rate": 2.3392838970304195e-05, + "loss": 0.9055, + "step": 64860 + }, + { + "epoch": 1.0412687202041766, + "grad_norm": 0.8781518936157227, + "learning_rate": 2.338654759315287e-05, + "loss": 0.6994, + "step": 64870 + }, + { + "epoch": 1.0414292364243407, + "grad_norm": 0.7715114951133728, + "learning_rate": 2.338025631860765e-05, + "loss": 0.7093, + "step": 64880 + }, + { + "epoch": 1.0415897526445048, + "grad_norm": 0.7305839657783508, + "learning_rate": 2.337396514706862e-05, + "loss": 0.7201, + "step": 64890 + }, + { + "epoch": 1.0417502688646687, + "grad_norm": 0.9402493238449097, + "learning_rate": 2.336767407893587e-05, + "loss": 0.6677, + "step": 64900 + }, + { + "epoch": 1.0419107850848328, + "grad_norm": 1.0262900590896606, + "learning_rate": 2.336138311460946e-05, + "loss": 0.6488, + "step": 64910 + }, + { + "epoch": 1.042071301304997, + "grad_norm": 0.6997658610343933, + "learning_rate": 2.3355092254489468e-05, + "loss": 0.7038, + "step": 64920 + }, + { + "epoch": 1.042231817525161, + "grad_norm": 1.61270272731781, + "learning_rate": 2.334880149897595e-05, + "loss": 0.8098, + "step": 64930 + }, + { + "epoch": 1.042392333745325, + "grad_norm": 0.8104999661445618, + "learning_rate": 2.334251084846897e-05, + "loss": 0.642, + "step": 64940 + }, + { + "epoch": 1.042552849965489, + "grad_norm": 1.0074458122253418, + "learning_rate": 2.3336220303368574e-05, + "loss": 0.6472, + "step": 64950 + }, + { + "epoch": 1.0427133661856531, + "grad_norm": 1.8467671871185303, + "learning_rate": 2.3329929864074794e-05, + "loss": 0.7674, + "step": 64960 + }, + { + "epoch": 1.042873882405817, + "grad_norm": 1.6789298057556152, + "learning_rate": 2.3323639530987673e-05, + "loss": 0.6027, + "step": 64970 + }, + { + "epoch": 1.0430343986259811, + "grad_norm": 0.7829398512840271, + "learning_rate": 2.3317349304507242e-05, + "loss": 0.766, + "step": 64980 + }, + { + "epoch": 1.0431949148461452, + "grad_norm": 0.8356953859329224, + "learning_rate": 2.3311059185033516e-05, + "loss": 0.5798, + "step": 64990 + }, + { + "epoch": 1.0433554310663093, + "grad_norm": 0.7655699849128723, + "learning_rate": 2.3304769172966512e-05, + "loss": 0.7429, + "step": 65000 + }, + { + "epoch": 1.0435159472864732, + "grad_norm": 1.2532542943954468, + "learning_rate": 2.329847926870624e-05, + "loss": 0.5689, + "step": 65010 + }, + { + "epoch": 1.0436764635066373, + "grad_norm": 0.7872891426086426, + "learning_rate": 2.3292189472652713e-05, + "loss": 0.644, + "step": 65020 + }, + { + "epoch": 1.0438369797268015, + "grad_norm": 0.7478994727134705, + "learning_rate": 2.32858997852059e-05, + "loss": 0.6864, + "step": 65030 + }, + { + "epoch": 1.0439974959469653, + "grad_norm": 1.0572717189788818, + "learning_rate": 2.3279610206765795e-05, + "loss": 0.7934, + "step": 65040 + }, + { + "epoch": 1.0441580121671294, + "grad_norm": 0.8312227725982666, + "learning_rate": 2.3273320737732386e-05, + "loss": 0.688, + "step": 65050 + }, + { + "epoch": 1.0443185283872936, + "grad_norm": 1.1063807010650635, + "learning_rate": 2.3267031378505645e-05, + "loss": 0.6594, + "step": 65060 + }, + { + "epoch": 1.0444790446074577, + "grad_norm": 1.1581261157989502, + "learning_rate": 2.3260742129485534e-05, + "loss": 0.662, + "step": 65070 + }, + { + "epoch": 1.0446395608276215, + "grad_norm": 0.9377768039703369, + "learning_rate": 2.3254452991072033e-05, + "loss": 0.7455, + "step": 65080 + }, + { + "epoch": 1.0448000770477857, + "grad_norm": 1.7264974117279053, + "learning_rate": 2.324816396366506e-05, + "loss": 0.7559, + "step": 65090 + }, + { + "epoch": 1.0449605932679498, + "grad_norm": 0.4920092225074768, + "learning_rate": 2.3241875047664574e-05, + "loss": 0.6525, + "step": 65100 + }, + { + "epoch": 1.0451211094881139, + "grad_norm": 1.072605013847351, + "learning_rate": 2.323558624347052e-05, + "loss": 0.6924, + "step": 65110 + }, + { + "epoch": 1.0452816257082778, + "grad_norm": 0.6083507537841797, + "learning_rate": 2.3229297551482826e-05, + "loss": 0.6161, + "step": 65120 + }, + { + "epoch": 1.0454421419284419, + "grad_norm": 1.1088587045669556, + "learning_rate": 2.3223008972101414e-05, + "loss": 0.7292, + "step": 65130 + }, + { + "epoch": 1.045602658148606, + "grad_norm": 2.2435402870178223, + "learning_rate": 2.3216720505726207e-05, + "loss": 0.7886, + "step": 65140 + }, + { + "epoch": 1.0457631743687699, + "grad_norm": 0.9248068332672119, + "learning_rate": 2.32104321527571e-05, + "loss": 0.6694, + "step": 65150 + }, + { + "epoch": 1.045923690588934, + "grad_norm": 0.7627692222595215, + "learning_rate": 2.320414391359401e-05, + "loss": 0.6666, + "step": 65160 + }, + { + "epoch": 1.046084206809098, + "grad_norm": 0.6808854341506958, + "learning_rate": 2.3197855788636823e-05, + "loss": 0.597, + "step": 65170 + }, + { + "epoch": 1.0462447230292622, + "grad_norm": 0.9462350606918335, + "learning_rate": 2.3191567778285432e-05, + "loss": 0.6692, + "step": 65180 + }, + { + "epoch": 1.046405239249426, + "grad_norm": 0.7131199240684509, + "learning_rate": 2.3185279882939713e-05, + "loss": 0.8067, + "step": 65190 + }, + { + "epoch": 1.0465657554695902, + "grad_norm": 0.6434757709503174, + "learning_rate": 2.317899210299955e-05, + "loss": 0.7215, + "step": 65200 + }, + { + "epoch": 1.0467262716897543, + "grad_norm": 1.031802773475647, + "learning_rate": 2.3172704438864802e-05, + "loss": 0.7676, + "step": 65210 + }, + { + "epoch": 1.0468867879099184, + "grad_norm": 1.1878803968429565, + "learning_rate": 2.3166416890935315e-05, + "loss": 0.6378, + "step": 65220 + }, + { + "epoch": 1.0470473041300823, + "grad_norm": 1.3288923501968384, + "learning_rate": 2.3160129459610958e-05, + "loss": 0.6516, + "step": 65230 + }, + { + "epoch": 1.0472078203502464, + "grad_norm": 0.9084649682044983, + "learning_rate": 2.3153842145291565e-05, + "loss": 0.7068, + "step": 65240 + }, + { + "epoch": 1.0473683365704105, + "grad_norm": 0.820231556892395, + "learning_rate": 2.3147554948376976e-05, + "loss": 0.7372, + "step": 65250 + }, + { + "epoch": 1.0475288527905744, + "grad_norm": 0.7650024890899658, + "learning_rate": 2.314126786926702e-05, + "loss": 0.8064, + "step": 65260 + }, + { + "epoch": 1.0476893690107385, + "grad_norm": 1.5871937274932861, + "learning_rate": 2.3134980908361524e-05, + "loss": 0.7285, + "step": 65270 + }, + { + "epoch": 1.0478498852309026, + "grad_norm": 0.9007390141487122, + "learning_rate": 2.312869406606029e-05, + "loss": 0.7903, + "step": 65280 + }, + { + "epoch": 1.0480104014510667, + "grad_norm": 0.9612421989440918, + "learning_rate": 2.312240734276313e-05, + "loss": 0.7297, + "step": 65290 + }, + { + "epoch": 1.0481709176712306, + "grad_norm": 0.8115471601486206, + "learning_rate": 2.3116120738869844e-05, + "loss": 0.6497, + "step": 65300 + }, + { + "epoch": 1.0483314338913947, + "grad_norm": 1.1912378072738647, + "learning_rate": 2.310983425478022e-05, + "loss": 0.7519, + "step": 65310 + }, + { + "epoch": 1.0484919501115588, + "grad_norm": 0.844772458076477, + "learning_rate": 2.3103547890894046e-05, + "loss": 0.5759, + "step": 65320 + }, + { + "epoch": 1.0486524663317227, + "grad_norm": 1.1023168563842773, + "learning_rate": 2.30972616476111e-05, + "loss": 0.7662, + "step": 65330 + }, + { + "epoch": 1.0488129825518868, + "grad_norm": 0.7176132202148438, + "learning_rate": 2.3090975525331136e-05, + "loss": 0.7624, + "step": 65340 + }, + { + "epoch": 1.048973498772051, + "grad_norm": 0.9463656544685364, + "learning_rate": 2.308468952445393e-05, + "loss": 0.5968, + "step": 65350 + }, + { + "epoch": 1.049134014992215, + "grad_norm": 1.295184850692749, + "learning_rate": 2.3078403645379225e-05, + "loss": 0.6955, + "step": 65360 + }, + { + "epoch": 1.049294531212379, + "grad_norm": 0.9219323992729187, + "learning_rate": 2.3072117888506766e-05, + "loss": 0.8374, + "step": 65370 + }, + { + "epoch": 1.049455047432543, + "grad_norm": 0.6294381022453308, + "learning_rate": 2.3065832254236298e-05, + "loss": 0.6852, + "step": 65380 + }, + { + "epoch": 1.0496155636527071, + "grad_norm": 0.7084701657295227, + "learning_rate": 2.3059546742967554e-05, + "loss": 0.7417, + "step": 65390 + }, + { + "epoch": 1.0497760798728712, + "grad_norm": 0.6879215240478516, + "learning_rate": 2.305326135510024e-05, + "loss": 0.7747, + "step": 65400 + }, + { + "epoch": 1.0499365960930351, + "grad_norm": 0.762206494808197, + "learning_rate": 2.3046976091034076e-05, + "loss": 0.6928, + "step": 65410 + }, + { + "epoch": 1.0500971123131992, + "grad_norm": 0.7862225770950317, + "learning_rate": 2.304069095116877e-05, + "loss": 0.7555, + "step": 65420 + }, + { + "epoch": 1.0502576285333634, + "grad_norm": 0.9301307201385498, + "learning_rate": 2.3034405935904018e-05, + "loss": 0.7073, + "step": 65430 + }, + { + "epoch": 1.0504181447535272, + "grad_norm": 0.923402726650238, + "learning_rate": 2.3028121045639516e-05, + "loss": 0.5901, + "step": 65440 + }, + { + "epoch": 1.0505786609736913, + "grad_norm": 1.1368800401687622, + "learning_rate": 2.3021836280774948e-05, + "loss": 0.8176, + "step": 65450 + }, + { + "epoch": 1.0507391771938555, + "grad_norm": 0.8824573755264282, + "learning_rate": 2.3015551641709972e-05, + "loss": 0.8657, + "step": 65460 + }, + { + "epoch": 1.0508996934140196, + "grad_norm": 0.683870255947113, + "learning_rate": 2.3009267128844265e-05, + "loss": 0.7701, + "step": 65470 + }, + { + "epoch": 1.0510602096341835, + "grad_norm": 1.0236494541168213, + "learning_rate": 2.3002982742577485e-05, + "loss": 0.6029, + "step": 65480 + }, + { + "epoch": 1.0512207258543476, + "grad_norm": 1.2663569450378418, + "learning_rate": 2.299669848330928e-05, + "loss": 0.7052, + "step": 65490 + }, + { + "epoch": 1.0513812420745117, + "grad_norm": 0.6509541273117065, + "learning_rate": 2.299041435143929e-05, + "loss": 0.743, + "step": 65500 + }, + { + "epoch": 1.0515417582946758, + "grad_norm": 1.6937333345413208, + "learning_rate": 2.2984130347367156e-05, + "loss": 0.746, + "step": 65510 + }, + { + "epoch": 1.0517022745148397, + "grad_norm": 0.8326194286346436, + "learning_rate": 2.2977846471492506e-05, + "loss": 0.6863, + "step": 65520 + }, + { + "epoch": 1.0518627907350038, + "grad_norm": 0.8695763349533081, + "learning_rate": 2.297156272421495e-05, + "loss": 0.7663, + "step": 65530 + }, + { + "epoch": 1.0520233069551679, + "grad_norm": 0.5166404247283936, + "learning_rate": 2.2965279105934092e-05, + "loss": 0.7487, + "step": 65540 + }, + { + "epoch": 1.0521838231753318, + "grad_norm": 0.8285050988197327, + "learning_rate": 2.2958995617049544e-05, + "loss": 0.6384, + "step": 65550 + }, + { + "epoch": 1.0523443393954959, + "grad_norm": 1.0948231220245361, + "learning_rate": 2.2952712257960893e-05, + "loss": 0.7979, + "step": 65560 + }, + { + "epoch": 1.05250485561566, + "grad_norm": 1.6139721870422363, + "learning_rate": 2.2946429029067728e-05, + "loss": 0.7498, + "step": 65570 + }, + { + "epoch": 1.052665371835824, + "grad_norm": 0.9037678241729736, + "learning_rate": 2.2940145930769625e-05, + "loss": 0.6343, + "step": 65580 + }, + { + "epoch": 1.052825888055988, + "grad_norm": 0.763114869594574, + "learning_rate": 2.293386296346615e-05, + "loss": 0.8105, + "step": 65590 + }, + { + "epoch": 1.052986404276152, + "grad_norm": 2.865628480911255, + "learning_rate": 2.2927580127556862e-05, + "loss": 0.6484, + "step": 65600 + }, + { + "epoch": 1.0531469204963162, + "grad_norm": 0.811920166015625, + "learning_rate": 2.2921297423441313e-05, + "loss": 0.7338, + "step": 65610 + }, + { + "epoch": 1.0533074367164803, + "grad_norm": 0.9069992303848267, + "learning_rate": 2.2915014851519045e-05, + "loss": 0.7008, + "step": 65620 + }, + { + "epoch": 1.0534679529366442, + "grad_norm": 1.1766313314437866, + "learning_rate": 2.2908732412189597e-05, + "loss": 0.744, + "step": 65630 + }, + { + "epoch": 1.0536284691568083, + "grad_norm": 0.7836554050445557, + "learning_rate": 2.2902450105852508e-05, + "loss": 0.7607, + "step": 65640 + }, + { + "epoch": 1.0537889853769724, + "grad_norm": 0.9365366697311401, + "learning_rate": 2.2896167932907264e-05, + "loss": 0.7976, + "step": 65650 + }, + { + "epoch": 1.0539495015971363, + "grad_norm": 0.9193509817123413, + "learning_rate": 2.2889885893753392e-05, + "loss": 0.6815, + "step": 65660 + }, + { + "epoch": 1.0541100178173004, + "grad_norm": 0.8108930587768555, + "learning_rate": 2.2883603988790396e-05, + "loss": 0.7729, + "step": 65670 + }, + { + "epoch": 1.0542705340374645, + "grad_norm": 0.8858295679092407, + "learning_rate": 2.287732221841776e-05, + "loss": 0.782, + "step": 65680 + }, + { + "epoch": 1.0544310502576286, + "grad_norm": 0.7651798129081726, + "learning_rate": 2.2871040583034974e-05, + "loss": 0.7832, + "step": 65690 + }, + { + "epoch": 1.0545915664777925, + "grad_norm": 1.0666636228561401, + "learning_rate": 2.2864759083041524e-05, + "loss": 0.8551, + "step": 65700 + }, + { + "epoch": 1.0547520826979566, + "grad_norm": 0.9500322341918945, + "learning_rate": 2.285847771883685e-05, + "loss": 0.7302, + "step": 65710 + }, + { + "epoch": 1.0549125989181207, + "grad_norm": 0.9122289419174194, + "learning_rate": 2.2852196490820424e-05, + "loss": 0.8112, + "step": 65720 + }, + { + "epoch": 1.0550731151382848, + "grad_norm": 0.9362168312072754, + "learning_rate": 2.2845915399391697e-05, + "loss": 0.7526, + "step": 65730 + }, + { + "epoch": 1.0552336313584487, + "grad_norm": 0.9219255447387695, + "learning_rate": 2.2839634444950107e-05, + "loss": 0.7339, + "step": 65740 + }, + { + "epoch": 1.0553941475786128, + "grad_norm": 0.8376188278198242, + "learning_rate": 2.2833353627895084e-05, + "loss": 0.6874, + "step": 65750 + }, + { + "epoch": 1.055554663798777, + "grad_norm": 0.6994627714157104, + "learning_rate": 2.2827072948626062e-05, + "loss": 0.5896, + "step": 65760 + }, + { + "epoch": 1.0557151800189408, + "grad_norm": 1.5859477519989014, + "learning_rate": 2.282079240754244e-05, + "loss": 0.7629, + "step": 65770 + }, + { + "epoch": 1.055875696239105, + "grad_norm": 0.6780286431312561, + "learning_rate": 2.281451200504363e-05, + "loss": 0.6429, + "step": 65780 + }, + { + "epoch": 1.056036212459269, + "grad_norm": 1.8153568506240845, + "learning_rate": 2.2808231741529028e-05, + "loss": 0.6197, + "step": 65790 + }, + { + "epoch": 1.0561967286794331, + "grad_norm": 0.9741023778915405, + "learning_rate": 2.2801951617398025e-05, + "loss": 0.6501, + "step": 65800 + }, + { + "epoch": 1.056357244899597, + "grad_norm": 0.7636476159095764, + "learning_rate": 2.2795671633049995e-05, + "loss": 0.8395, + "step": 65810 + }, + { + "epoch": 1.0565177611197611, + "grad_norm": 1.0793708562850952, + "learning_rate": 2.2789391788884313e-05, + "loss": 0.6471, + "step": 65820 + }, + { + "epoch": 1.0566782773399253, + "grad_norm": 2.4685473442077637, + "learning_rate": 2.278311208530034e-05, + "loss": 0.6248, + "step": 65830 + }, + { + "epoch": 1.0568387935600891, + "grad_norm": 0.7828016877174377, + "learning_rate": 2.277683252269743e-05, + "loss": 0.6428, + "step": 65840 + }, + { + "epoch": 1.0569993097802532, + "grad_norm": 0.9585057497024536, + "learning_rate": 2.2770553101474916e-05, + "loss": 0.6435, + "step": 65850 + }, + { + "epoch": 1.0571598260004174, + "grad_norm": 0.8037406802177429, + "learning_rate": 2.276427382203214e-05, + "loss": 0.624, + "step": 65860 + }, + { + "epoch": 1.0573203422205815, + "grad_norm": 0.8645564317703247, + "learning_rate": 2.2757994684768426e-05, + "loss": 0.8486, + "step": 65870 + }, + { + "epoch": 1.0574808584407454, + "grad_norm": 0.9275181293487549, + "learning_rate": 2.275171569008309e-05, + "loss": 0.6577, + "step": 65880 + }, + { + "epoch": 1.0576413746609095, + "grad_norm": 1.2224957942962646, + "learning_rate": 2.274543683837545e-05, + "loss": 0.7825, + "step": 65890 + }, + { + "epoch": 1.0578018908810736, + "grad_norm": 0.9930294156074524, + "learning_rate": 2.2739158130044785e-05, + "loss": 0.7371, + "step": 65900 + }, + { + "epoch": 1.0579624071012377, + "grad_norm": 1.1029202938079834, + "learning_rate": 2.2732879565490396e-05, + "loss": 0.7556, + "step": 65910 + }, + { + "epoch": 1.0581229233214016, + "grad_norm": 0.836176335811615, + "learning_rate": 2.2726601145111556e-05, + "loss": 0.8039, + "step": 65920 + }, + { + "epoch": 1.0582834395415657, + "grad_norm": 0.8186068534851074, + "learning_rate": 2.2720322869307542e-05, + "loss": 0.9579, + "step": 65930 + }, + { + "epoch": 1.0584439557617298, + "grad_norm": 1.1225507259368896, + "learning_rate": 2.271404473847761e-05, + "loss": 0.7338, + "step": 65940 + }, + { + "epoch": 1.0586044719818937, + "grad_norm": 0.957334578037262, + "learning_rate": 2.270776675302103e-05, + "loss": 0.7028, + "step": 65950 + }, + { + "epoch": 1.0587649882020578, + "grad_norm": 0.7995381355285645, + "learning_rate": 2.270148891333702e-05, + "loss": 0.7019, + "step": 65960 + }, + { + "epoch": 1.0589255044222219, + "grad_norm": 0.955154299736023, + "learning_rate": 2.2695211219824822e-05, + "loss": 0.812, + "step": 65970 + }, + { + "epoch": 1.059086020642386, + "grad_norm": 0.9090430736541748, + "learning_rate": 2.268893367288366e-05, + "loss": 0.6996, + "step": 65980 + }, + { + "epoch": 1.0592465368625499, + "grad_norm": 0.6972959637641907, + "learning_rate": 2.2682656272912757e-05, + "loss": 0.7021, + "step": 65990 + }, + { + "epoch": 1.059407053082714, + "grad_norm": 1.4118844270706177, + "learning_rate": 2.2676379020311313e-05, + "loss": 0.6241, + "step": 66000 + }, + { + "epoch": 1.059567569302878, + "grad_norm": 0.8085902333259583, + "learning_rate": 2.2670101915478533e-05, + "loss": 0.7665, + "step": 66010 + }, + { + "epoch": 1.0597280855230422, + "grad_norm": 0.7768388986587524, + "learning_rate": 2.266382495881359e-05, + "loss": 0.7701, + "step": 66020 + }, + { + "epoch": 1.059888601743206, + "grad_norm": 0.697803795337677, + "learning_rate": 2.265754815071566e-05, + "loss": 0.7815, + "step": 66030 + }, + { + "epoch": 1.0600491179633702, + "grad_norm": 0.5575347542762756, + "learning_rate": 2.2651271491583925e-05, + "loss": 0.75, + "step": 66040 + }, + { + "epoch": 1.0602096341835343, + "grad_norm": 0.7928617596626282, + "learning_rate": 2.2644994981817537e-05, + "loss": 0.6305, + "step": 66050 + }, + { + "epoch": 1.0603701504036982, + "grad_norm": 0.8434029221534729, + "learning_rate": 2.2638718621815645e-05, + "loss": 0.7683, + "step": 66060 + }, + { + "epoch": 1.0605306666238623, + "grad_norm": 0.6975804567337036, + "learning_rate": 2.2632442411977386e-05, + "loss": 0.6886, + "step": 66070 + }, + { + "epoch": 1.0606911828440264, + "grad_norm": 0.6145359873771667, + "learning_rate": 2.2626166352701908e-05, + "loss": 0.7144, + "step": 66080 + }, + { + "epoch": 1.0608516990641905, + "grad_norm": 2.160057783126831, + "learning_rate": 2.2619890444388307e-05, + "loss": 0.6567, + "step": 66090 + }, + { + "epoch": 1.0610122152843544, + "grad_norm": 0.9008125066757202, + "learning_rate": 2.26136146874357e-05, + "loss": 0.7344, + "step": 66100 + }, + { + "epoch": 1.0611727315045185, + "grad_norm": 0.9384028911590576, + "learning_rate": 2.2607339082243195e-05, + "loss": 0.6774, + "step": 66110 + }, + { + "epoch": 1.0613332477246826, + "grad_norm": 1.3914092779159546, + "learning_rate": 2.2601063629209878e-05, + "loss": 0.6887, + "step": 66120 + }, + { + "epoch": 1.0614937639448465, + "grad_norm": 0.7154830694198608, + "learning_rate": 2.2594788328734835e-05, + "loss": 0.6839, + "step": 66130 + }, + { + "epoch": 1.0616542801650106, + "grad_norm": 0.9289195537567139, + "learning_rate": 2.2588513181217147e-05, + "loss": 0.7036, + "step": 66140 + }, + { + "epoch": 1.0618147963851747, + "grad_norm": 0.8683678507804871, + "learning_rate": 2.2582238187055862e-05, + "loss": 0.7827, + "step": 66150 + }, + { + "epoch": 1.0619753126053388, + "grad_norm": 0.6117386221885681, + "learning_rate": 2.2575963346650034e-05, + "loss": 0.7525, + "step": 66160 + }, + { + "epoch": 1.0621358288255027, + "grad_norm": 0.8188052773475647, + "learning_rate": 2.256968866039871e-05, + "loss": 0.7388, + "step": 66170 + }, + { + "epoch": 1.0622963450456668, + "grad_norm": 0.5594343543052673, + "learning_rate": 2.256341412870092e-05, + "loss": 0.6379, + "step": 66180 + }, + { + "epoch": 1.062456861265831, + "grad_norm": 1.1705880165100098, + "learning_rate": 2.2557139751955692e-05, + "loss": 0.7402, + "step": 66190 + }, + { + "epoch": 1.062617377485995, + "grad_norm": 0.7293546795845032, + "learning_rate": 2.2550865530562043e-05, + "loss": 0.6345, + "step": 66200 + }, + { + "epoch": 1.062777893706159, + "grad_norm": 0.6503583192825317, + "learning_rate": 2.2544591464918966e-05, + "loss": 0.6739, + "step": 66210 + }, + { + "epoch": 1.062938409926323, + "grad_norm": 0.7571455240249634, + "learning_rate": 2.253831755542546e-05, + "loss": 0.828, + "step": 66220 + }, + { + "epoch": 1.0630989261464872, + "grad_norm": 0.928023636341095, + "learning_rate": 2.2532043802480506e-05, + "loss": 0.6973, + "step": 66230 + }, + { + "epoch": 1.0632594423666513, + "grad_norm": 0.6677547693252563, + "learning_rate": 2.2525770206483083e-05, + "loss": 0.6493, + "step": 66240 + }, + { + "epoch": 1.0634199585868152, + "grad_norm": 1.0671976804733276, + "learning_rate": 2.2519496767832154e-05, + "loss": 0.7659, + "step": 66250 + }, + { + "epoch": 1.0635804748069793, + "grad_norm": 0.9871487617492676, + "learning_rate": 2.251322348692668e-05, + "loss": 0.7199, + "step": 66260 + }, + { + "epoch": 1.0637409910271434, + "grad_norm": 0.6893566250801086, + "learning_rate": 2.250695036416559e-05, + "loss": 0.6424, + "step": 66270 + }, + { + "epoch": 1.0639015072473073, + "grad_norm": 0.9618403315544128, + "learning_rate": 2.250067739994782e-05, + "loss": 0.6592, + "step": 66280 + }, + { + "epoch": 1.0640620234674714, + "grad_norm": 1.258641004562378, + "learning_rate": 2.2494404594672298e-05, + "loss": 0.7466, + "step": 66290 + }, + { + "epoch": 1.0642225396876355, + "grad_norm": 0.9447195529937744, + "learning_rate": 2.248813194873794e-05, + "loss": 0.6723, + "step": 66300 + }, + { + "epoch": 1.0643830559077996, + "grad_norm": 0.7491387128829956, + "learning_rate": 2.2481859462543642e-05, + "loss": 0.6962, + "step": 66310 + }, + { + "epoch": 1.0645435721279635, + "grad_norm": 0.9160307049751282, + "learning_rate": 2.2475587136488307e-05, + "loss": 0.6847, + "step": 66320 + }, + { + "epoch": 1.0647040883481276, + "grad_norm": 1.0648550987243652, + "learning_rate": 2.2469314970970822e-05, + "loss": 0.7979, + "step": 66330 + }, + { + "epoch": 1.0648646045682917, + "grad_norm": 0.5858681201934814, + "learning_rate": 2.246304296639004e-05, + "loss": 0.617, + "step": 66340 + }, + { + "epoch": 1.0650251207884556, + "grad_norm": 0.6956511735916138, + "learning_rate": 2.2456771123144837e-05, + "loss": 0.5912, + "step": 66350 + }, + { + "epoch": 1.0651856370086197, + "grad_norm": 1.4047924280166626, + "learning_rate": 2.245049944163406e-05, + "loss": 0.6942, + "step": 66360 + }, + { + "epoch": 1.0653461532287838, + "grad_norm": 1.0349241495132446, + "learning_rate": 2.244422792225656e-05, + "loss": 0.709, + "step": 66370 + }, + { + "epoch": 1.065506669448948, + "grad_norm": 0.5492111444473267, + "learning_rate": 2.2437956565411156e-05, + "loss": 0.7173, + "step": 66380 + }, + { + "epoch": 1.0656671856691118, + "grad_norm": 1.200865387916565, + "learning_rate": 2.243168537149669e-05, + "loss": 0.7434, + "step": 66390 + }, + { + "epoch": 1.065827701889276, + "grad_norm": 0.6436598300933838, + "learning_rate": 2.2425414340911956e-05, + "loss": 0.7196, + "step": 66400 + }, + { + "epoch": 1.06598821810944, + "grad_norm": 0.6759573817253113, + "learning_rate": 2.241914347405575e-05, + "loss": 0.6769, + "step": 66410 + }, + { + "epoch": 1.066148734329604, + "grad_norm": 0.8549280166625977, + "learning_rate": 2.2412872771326874e-05, + "loss": 0.603, + "step": 66420 + }, + { + "epoch": 1.066309250549768, + "grad_norm": 0.7214244604110718, + "learning_rate": 2.2406602233124106e-05, + "loss": 0.64, + "step": 66430 + }, + { + "epoch": 1.066469766769932, + "grad_norm": 1.1000852584838867, + "learning_rate": 2.2400331859846212e-05, + "loss": 0.7089, + "step": 66440 + }, + { + "epoch": 1.0666302829900962, + "grad_norm": 0.8087923526763916, + "learning_rate": 2.2394061651891958e-05, + "loss": 0.7411, + "step": 66450 + }, + { + "epoch": 1.06679079921026, + "grad_norm": 0.6646246314048767, + "learning_rate": 2.2387791609660087e-05, + "loss": 0.6839, + "step": 66460 + }, + { + "epoch": 1.0669513154304242, + "grad_norm": 0.6018154621124268, + "learning_rate": 2.2381521733549336e-05, + "loss": 0.7658, + "step": 66470 + }, + { + "epoch": 1.0671118316505883, + "grad_norm": 0.7520949244499207, + "learning_rate": 2.2375252023958433e-05, + "loss": 0.7982, + "step": 66480 + }, + { + "epoch": 1.0672723478707524, + "grad_norm": 0.7492460012435913, + "learning_rate": 2.2368982481286095e-05, + "loss": 0.7169, + "step": 66490 + }, + { + "epoch": 1.0674328640909163, + "grad_norm": 0.6631795167922974, + "learning_rate": 2.236271310593103e-05, + "loss": 0.722, + "step": 66500 + }, + { + "epoch": 1.0675933803110804, + "grad_norm": 1.136340856552124, + "learning_rate": 2.2356443898291938e-05, + "loss": 0.6525, + "step": 66510 + }, + { + "epoch": 1.0677538965312445, + "grad_norm": 0.9355343580245972, + "learning_rate": 2.2350174858767488e-05, + "loss": 0.7803, + "step": 66520 + }, + { + "epoch": 1.0679144127514086, + "grad_norm": 0.5091009140014648, + "learning_rate": 2.2343905987756367e-05, + "loss": 0.707, + "step": 66530 + }, + { + "epoch": 1.0680749289715725, + "grad_norm": 1.0544302463531494, + "learning_rate": 2.2337637285657236e-05, + "loss": 0.7729, + "step": 66540 + }, + { + "epoch": 1.0682354451917366, + "grad_norm": 0.8909885883331299, + "learning_rate": 2.233136875286875e-05, + "loss": 0.711, + "step": 66550 + }, + { + "epoch": 1.0683959614119007, + "grad_norm": 1.4654353857040405, + "learning_rate": 2.232510038978954e-05, + "loss": 0.6446, + "step": 66560 + }, + { + "epoch": 1.0685564776320646, + "grad_norm": 1.5475660562515259, + "learning_rate": 2.231883219681826e-05, + "loss": 0.8469, + "step": 66570 + }, + { + "epoch": 1.0687169938522287, + "grad_norm": 1.1174192428588867, + "learning_rate": 2.2312564174353508e-05, + "loss": 0.7839, + "step": 66580 + }, + { + "epoch": 1.0688775100723928, + "grad_norm": 1.0825672149658203, + "learning_rate": 2.2306296322793895e-05, + "loss": 0.7036, + "step": 66590 + }, + { + "epoch": 1.069038026292557, + "grad_norm": 1.1920233964920044, + "learning_rate": 2.2300028642538028e-05, + "loss": 0.7427, + "step": 66600 + }, + { + "epoch": 1.0691985425127208, + "grad_norm": 0.6897644400596619, + "learning_rate": 2.2293761133984495e-05, + "loss": 0.7993, + "step": 66610 + }, + { + "epoch": 1.069359058732885, + "grad_norm": 0.6569563150405884, + "learning_rate": 2.2287493797531868e-05, + "loss": 0.6225, + "step": 66620 + }, + { + "epoch": 1.069519574953049, + "grad_norm": 0.8182573914527893, + "learning_rate": 2.228122663357871e-05, + "loss": 0.7137, + "step": 66630 + }, + { + "epoch": 1.069680091173213, + "grad_norm": 0.7309070229530334, + "learning_rate": 2.2274959642523595e-05, + "loss": 0.7753, + "step": 66640 + }, + { + "epoch": 1.069840607393377, + "grad_norm": 1.1207212209701538, + "learning_rate": 2.2268692824765045e-05, + "loss": 0.7834, + "step": 66650 + }, + { + "epoch": 1.0700011236135412, + "grad_norm": 1.2129693031311035, + "learning_rate": 2.2262426180701594e-05, + "loss": 0.5813, + "step": 66660 + }, + { + "epoch": 1.0701616398337053, + "grad_norm": 0.984807014465332, + "learning_rate": 2.2256159710731777e-05, + "loss": 0.7916, + "step": 66670 + }, + { + "epoch": 1.0703221560538692, + "grad_norm": 0.7523483037948608, + "learning_rate": 2.224989341525409e-05, + "loss": 0.6464, + "step": 66680 + }, + { + "epoch": 1.0704826722740333, + "grad_norm": 0.6911218762397766, + "learning_rate": 2.2243627294667047e-05, + "loss": 0.717, + "step": 66690 + }, + { + "epoch": 1.0706431884941974, + "grad_norm": 0.8721676468849182, + "learning_rate": 2.223736134936914e-05, + "loss": 0.6349, + "step": 66700 + }, + { + "epoch": 1.0708037047143615, + "grad_norm": 0.9807147979736328, + "learning_rate": 2.223109557975882e-05, + "loss": 0.8078, + "step": 66710 + }, + { + "epoch": 1.0709642209345254, + "grad_norm": 0.668390691280365, + "learning_rate": 2.2224829986234573e-05, + "loss": 0.7579, + "step": 66720 + }, + { + "epoch": 1.0711247371546895, + "grad_norm": 0.6912745833396912, + "learning_rate": 2.221856456919485e-05, + "loss": 0.7512, + "step": 66730 + }, + { + "epoch": 1.0712852533748536, + "grad_norm": 1.0741431713104248, + "learning_rate": 2.2212299329038095e-05, + "loss": 0.722, + "step": 66740 + }, + { + "epoch": 1.0714457695950177, + "grad_norm": 0.9849450588226318, + "learning_rate": 2.220603426616274e-05, + "loss": 0.9389, + "step": 66750 + }, + { + "epoch": 1.0716062858151816, + "grad_norm": 0.8085958957672119, + "learning_rate": 2.2199769380967212e-05, + "loss": 0.625, + "step": 66760 + }, + { + "epoch": 1.0717668020353457, + "grad_norm": 0.9421380162239075, + "learning_rate": 2.219350467384991e-05, + "loss": 0.7823, + "step": 66770 + }, + { + "epoch": 1.0719273182555098, + "grad_norm": 0.9910733699798584, + "learning_rate": 2.2187240145209243e-05, + "loss": 0.6398, + "step": 66780 + }, + { + "epoch": 1.0720878344756737, + "grad_norm": 0.831845760345459, + "learning_rate": 2.218097579544359e-05, + "loss": 0.7593, + "step": 66790 + }, + { + "epoch": 1.0722483506958378, + "grad_norm": 1.0753939151763916, + "learning_rate": 2.217471162495133e-05, + "loss": 0.813, + "step": 66800 + }, + { + "epoch": 1.072408866916002, + "grad_norm": 0.749413788318634, + "learning_rate": 2.2168447634130828e-05, + "loss": 0.6452, + "step": 66810 + }, + { + "epoch": 1.072569383136166, + "grad_norm": 1.7326152324676514, + "learning_rate": 2.2162183823380443e-05, + "loss": 0.755, + "step": 66820 + }, + { + "epoch": 1.07272989935633, + "grad_norm": 0.8579797148704529, + "learning_rate": 2.2155920193098505e-05, + "loss": 0.7591, + "step": 66830 + }, + { + "epoch": 1.072890415576494, + "grad_norm": 0.9159342646598816, + "learning_rate": 2.214965674368335e-05, + "loss": 0.6139, + "step": 66840 + }, + { + "epoch": 1.0730509317966581, + "grad_norm": 0.7506638169288635, + "learning_rate": 2.2143393475533292e-05, + "loss": 0.7287, + "step": 66850 + }, + { + "epoch": 1.073211448016822, + "grad_norm": 1.5763161182403564, + "learning_rate": 2.2137130389046645e-05, + "loss": 0.7754, + "step": 66860 + }, + { + "epoch": 1.073371964236986, + "grad_norm": 0.8270688652992249, + "learning_rate": 2.2130867484621703e-05, + "loss": 0.709, + "step": 66870 + }, + { + "epoch": 1.0735324804571502, + "grad_norm": 0.9305542707443237, + "learning_rate": 2.212460476265675e-05, + "loss": 0.6852, + "step": 66880 + }, + { + "epoch": 1.0736929966773143, + "grad_norm": 0.8980093002319336, + "learning_rate": 2.2118342223550063e-05, + "loss": 0.7398, + "step": 66890 + }, + { + "epoch": 1.0738535128974782, + "grad_norm": 1.5806884765625, + "learning_rate": 2.211207986769989e-05, + "loss": 0.8025, + "step": 66900 + }, + { + "epoch": 1.0740140291176423, + "grad_norm": 0.9030564427375793, + "learning_rate": 2.2105817695504487e-05, + "loss": 0.7194, + "step": 66910 + }, + { + "epoch": 1.0741745453378064, + "grad_norm": 1.2594871520996094, + "learning_rate": 2.209955570736209e-05, + "loss": 0.7323, + "step": 66920 + }, + { + "epoch": 1.0743350615579703, + "grad_norm": 0.8325933218002319, + "learning_rate": 2.209329390367093e-05, + "loss": 0.7106, + "step": 66930 + }, + { + "epoch": 1.0744955777781344, + "grad_norm": 0.7410846948623657, + "learning_rate": 2.2087032284829212e-05, + "loss": 0.6776, + "step": 66940 + }, + { + "epoch": 1.0746560939982985, + "grad_norm": 0.9311900734901428, + "learning_rate": 2.2080770851235157e-05, + "loss": 0.6745, + "step": 66950 + }, + { + "epoch": 1.0748166102184626, + "grad_norm": 1.0455831289291382, + "learning_rate": 2.207450960328693e-05, + "loss": 0.7058, + "step": 66960 + }, + { + "epoch": 1.0749771264386265, + "grad_norm": 1.4300631284713745, + "learning_rate": 2.2068248541382722e-05, + "loss": 0.7583, + "step": 66970 + }, + { + "epoch": 1.0751376426587906, + "grad_norm": 1.442512035369873, + "learning_rate": 2.20619876659207e-05, + "loss": 0.7373, + "step": 66980 + }, + { + "epoch": 1.0752981588789547, + "grad_norm": 0.8807119727134705, + "learning_rate": 2.2055726977299016e-05, + "loss": 0.6514, + "step": 66990 + }, + { + "epoch": 1.0754586750991189, + "grad_norm": 1.6455304622650146, + "learning_rate": 2.2049466475915815e-05, + "loss": 0.6975, + "step": 67000 + }, + { + "epoch": 1.0756191913192827, + "grad_norm": 1.4758754968643188, + "learning_rate": 2.204320616216924e-05, + "loss": 0.6772, + "step": 67010 + }, + { + "epoch": 1.0757797075394469, + "grad_norm": 1.1476176977157593, + "learning_rate": 2.203694603645739e-05, + "loss": 0.6309, + "step": 67020 + }, + { + "epoch": 1.075940223759611, + "grad_norm": 0.6431595683097839, + "learning_rate": 2.2030686099178377e-05, + "loss": 0.676, + "step": 67030 + }, + { + "epoch": 1.076100739979775, + "grad_norm": 0.6870743036270142, + "learning_rate": 2.2024426350730302e-05, + "loss": 0.6972, + "step": 67040 + }, + { + "epoch": 1.076261256199939, + "grad_norm": 0.7722061276435852, + "learning_rate": 2.2018166791511244e-05, + "loss": 0.7159, + "step": 67050 + }, + { + "epoch": 1.076421772420103, + "grad_norm": 1.0389045476913452, + "learning_rate": 2.201190742191928e-05, + "loss": 0.7279, + "step": 67060 + }, + { + "epoch": 1.0765822886402672, + "grad_norm": 0.9301592707633972, + "learning_rate": 2.2005648242352467e-05, + "loss": 0.74, + "step": 67070 + }, + { + "epoch": 1.076742804860431, + "grad_norm": 1.1551215648651123, + "learning_rate": 2.199938925320885e-05, + "loss": 0.8144, + "step": 67080 + }, + { + "epoch": 1.0769033210805952, + "grad_norm": 1.0953574180603027, + "learning_rate": 2.1993130454886464e-05, + "loss": 0.6742, + "step": 67090 + }, + { + "epoch": 1.0770638373007593, + "grad_norm": 1.3062835931777954, + "learning_rate": 2.1986871847783332e-05, + "loss": 0.707, + "step": 67100 + }, + { + "epoch": 1.0772243535209234, + "grad_norm": 0.7451522946357727, + "learning_rate": 2.1980613432297464e-05, + "loss": 0.7269, + "step": 67110 + }, + { + "epoch": 1.0773848697410873, + "grad_norm": 0.7895632982254028, + "learning_rate": 2.197435520882686e-05, + "loss": 0.6455, + "step": 67120 + }, + { + "epoch": 1.0775453859612514, + "grad_norm": 0.800298273563385, + "learning_rate": 2.196809717776951e-05, + "loss": 0.7148, + "step": 67130 + }, + { + "epoch": 1.0777059021814155, + "grad_norm": 1.4482486248016357, + "learning_rate": 2.1961839339523383e-05, + "loss": 0.6232, + "step": 67140 + }, + { + "epoch": 1.0778664184015794, + "grad_norm": 0.9283769726753235, + "learning_rate": 2.1955581694486435e-05, + "loss": 0.7888, + "step": 67150 + }, + { + "epoch": 1.0780269346217435, + "grad_norm": 1.0072656869888306, + "learning_rate": 2.1949324243056625e-05, + "loss": 0.6725, + "step": 67160 + }, + { + "epoch": 1.0781874508419076, + "grad_norm": 1.4948641061782837, + "learning_rate": 2.194306698563189e-05, + "loss": 0.658, + "step": 67170 + }, + { + "epoch": 1.0783479670620717, + "grad_norm": 1.1070482730865479, + "learning_rate": 2.193680992261015e-05, + "loss": 0.6634, + "step": 67180 + }, + { + "epoch": 1.0785084832822356, + "grad_norm": 0.9983013272285461, + "learning_rate": 2.193055305438932e-05, + "loss": 0.7877, + "step": 67190 + }, + { + "epoch": 1.0786689995023997, + "grad_norm": 1.0571945905685425, + "learning_rate": 2.1924296381367313e-05, + "loss": 0.8225, + "step": 67200 + }, + { + "epoch": 1.0788295157225638, + "grad_norm": 0.7772236466407776, + "learning_rate": 2.191803990394199e-05, + "loss": 0.6294, + "step": 67210 + }, + { + "epoch": 1.078990031942728, + "grad_norm": 0.6706082820892334, + "learning_rate": 2.191178362251124e-05, + "loss": 0.6808, + "step": 67220 + }, + { + "epoch": 1.0791505481628918, + "grad_norm": 1.3500447273254395, + "learning_rate": 2.1905527537472928e-05, + "loss": 0.595, + "step": 67230 + }, + { + "epoch": 1.079311064383056, + "grad_norm": 1.0389295816421509, + "learning_rate": 2.1899271649224902e-05, + "loss": 0.601, + "step": 67240 + }, + { + "epoch": 1.07947158060322, + "grad_norm": 1.0490080118179321, + "learning_rate": 2.1893015958164994e-05, + "loss": 0.8317, + "step": 67250 + }, + { + "epoch": 1.079632096823384, + "grad_norm": 1.3552037477493286, + "learning_rate": 2.1886760464691053e-05, + "loss": 0.7226, + "step": 67260 + }, + { + "epoch": 1.079792613043548, + "grad_norm": 0.7500867247581482, + "learning_rate": 2.1880505169200857e-05, + "loss": 0.6796, + "step": 67270 + }, + { + "epoch": 1.0799531292637121, + "grad_norm": 1.320414662361145, + "learning_rate": 2.1874250072092226e-05, + "loss": 0.715, + "step": 67280 + }, + { + "epoch": 1.0801136454838762, + "grad_norm": 0.6891539692878723, + "learning_rate": 2.1867995173762942e-05, + "loss": 0.7027, + "step": 67290 + }, + { + "epoch": 1.0802741617040401, + "grad_norm": 1.0571331977844238, + "learning_rate": 2.1861740474610786e-05, + "loss": 0.6973, + "step": 67300 + }, + { + "epoch": 1.0804346779242042, + "grad_norm": 0.8246678113937378, + "learning_rate": 2.1855485975033514e-05, + "loss": 0.7317, + "step": 67310 + }, + { + "epoch": 1.0805951941443683, + "grad_norm": 0.8003681898117065, + "learning_rate": 2.1849231675428886e-05, + "loss": 0.6898, + "step": 67320 + }, + { + "epoch": 1.0807557103645324, + "grad_norm": 0.9270354509353638, + "learning_rate": 2.184297757619462e-05, + "loss": 0.7668, + "step": 67330 + }, + { + "epoch": 1.0809162265846963, + "grad_norm": 0.8461979627609253, + "learning_rate": 2.1836723677728452e-05, + "loss": 0.7648, + "step": 67340 + }, + { + "epoch": 1.0810767428048604, + "grad_norm": 1.2744561433792114, + "learning_rate": 2.1830469980428093e-05, + "loss": 0.6857, + "step": 67350 + }, + { + "epoch": 1.0812372590250245, + "grad_norm": 0.7242479920387268, + "learning_rate": 2.1824216484691233e-05, + "loss": 0.7449, + "step": 67360 + }, + { + "epoch": 1.0813977752451884, + "grad_norm": 1.0268276929855347, + "learning_rate": 2.181796319091557e-05, + "loss": 0.7151, + "step": 67370 + }, + { + "epoch": 1.0815582914653525, + "grad_norm": 0.8491235971450806, + "learning_rate": 2.181171009949878e-05, + "loss": 0.8194, + "step": 67380 + }, + { + "epoch": 1.0817188076855166, + "grad_norm": 0.8822036385536194, + "learning_rate": 2.1805457210838504e-05, + "loss": 0.8039, + "step": 67390 + }, + { + "epoch": 1.0818793239056808, + "grad_norm": 1.0126286745071411, + "learning_rate": 2.1799204525332406e-05, + "loss": 0.7765, + "step": 67400 + }, + { + "epoch": 1.0820398401258446, + "grad_norm": 0.7828128933906555, + "learning_rate": 2.179295204337811e-05, + "loss": 0.6931, + "step": 67410 + }, + { + "epoch": 1.0822003563460088, + "grad_norm": 0.9041615724563599, + "learning_rate": 2.1786699765373235e-05, + "loss": 0.6165, + "step": 67420 + }, + { + "epoch": 1.0823608725661729, + "grad_norm": 0.8660089373588562, + "learning_rate": 2.17804476917154e-05, + "loss": 0.6724, + "step": 67430 + }, + { + "epoch": 1.0825213887863367, + "grad_norm": 0.8449252247810364, + "learning_rate": 2.1774195822802193e-05, + "loss": 0.648, + "step": 67440 + }, + { + "epoch": 1.0826819050065009, + "grad_norm": 1.14505136013031, + "learning_rate": 2.1767944159031207e-05, + "loss": 0.6798, + "step": 67450 + }, + { + "epoch": 1.082842421226665, + "grad_norm": 0.9322762489318848, + "learning_rate": 2.176169270079999e-05, + "loss": 0.6293, + "step": 67460 + }, + { + "epoch": 1.083002937446829, + "grad_norm": 0.9366891384124756, + "learning_rate": 2.1755441448506115e-05, + "loss": 0.6259, + "step": 67470 + }, + { + "epoch": 1.083163453666993, + "grad_norm": 0.8441817760467529, + "learning_rate": 2.174919040254712e-05, + "loss": 0.7799, + "step": 67480 + }, + { + "epoch": 1.083323969887157, + "grad_norm": 0.7903669476509094, + "learning_rate": 2.1742939563320537e-05, + "loss": 0.9113, + "step": 67490 + }, + { + "epoch": 1.0834844861073212, + "grad_norm": 0.747532844543457, + "learning_rate": 2.1736688931223874e-05, + "loss": 0.7236, + "step": 67500 + }, + { + "epoch": 1.0836450023274853, + "grad_norm": 0.7549463510513306, + "learning_rate": 2.1730438506654657e-05, + "loss": 0.7587, + "step": 67510 + }, + { + "epoch": 1.0838055185476492, + "grad_norm": 1.569077730178833, + "learning_rate": 2.172418829001035e-05, + "loss": 0.7104, + "step": 67520 + }, + { + "epoch": 1.0839660347678133, + "grad_norm": 0.8434920907020569, + "learning_rate": 2.1717938281688438e-05, + "loss": 0.8995, + "step": 67530 + }, + { + "epoch": 1.0841265509879774, + "grad_norm": 0.7968530058860779, + "learning_rate": 2.171168848208639e-05, + "loss": 0.751, + "step": 67540 + }, + { + "epoch": 1.0842870672081415, + "grad_norm": 0.7855919003486633, + "learning_rate": 2.1705438891601654e-05, + "loss": 0.7617, + "step": 67550 + }, + { + "epoch": 1.0844475834283054, + "grad_norm": 0.78795325756073, + "learning_rate": 2.1699189510631668e-05, + "loss": 0.7035, + "step": 67560 + }, + { + "epoch": 1.0846080996484695, + "grad_norm": 1.3267039060592651, + "learning_rate": 2.1692940339573865e-05, + "loss": 0.7514, + "step": 67570 + }, + { + "epoch": 1.0847686158686336, + "grad_norm": 0.8105947375297546, + "learning_rate": 2.1686691378825635e-05, + "loss": 0.7452, + "step": 67580 + }, + { + "epoch": 1.0849291320887975, + "grad_norm": 1.362783670425415, + "learning_rate": 2.1680442628784386e-05, + "loss": 0.8342, + "step": 67590 + }, + { + "epoch": 1.0850896483089616, + "grad_norm": 0.9968134164810181, + "learning_rate": 2.1674194089847502e-05, + "loss": 0.7251, + "step": 67600 + }, + { + "epoch": 1.0852501645291257, + "grad_norm": 0.9256383180618286, + "learning_rate": 2.1667945762412353e-05, + "loss": 0.668, + "step": 67610 + }, + { + "epoch": 1.0854106807492898, + "grad_norm": 1.0639899969100952, + "learning_rate": 2.16616976468763e-05, + "loss": 0.7345, + "step": 67620 + }, + { + "epoch": 1.0855711969694537, + "grad_norm": 0.7554033398628235, + "learning_rate": 2.165544974363669e-05, + "loss": 0.5846, + "step": 67630 + }, + { + "epoch": 1.0857317131896178, + "grad_norm": 1.3083711862564087, + "learning_rate": 2.1649202053090834e-05, + "loss": 0.6986, + "step": 67640 + }, + { + "epoch": 1.085892229409782, + "grad_norm": 1.070611834526062, + "learning_rate": 2.164295457563606e-05, + "loss": 0.6508, + "step": 67650 + }, + { + "epoch": 1.0860527456299458, + "grad_norm": 0.9127103686332703, + "learning_rate": 2.1636707311669674e-05, + "loss": 0.7607, + "step": 67660 + }, + { + "epoch": 1.08621326185011, + "grad_norm": 0.9819462299346924, + "learning_rate": 2.163046026158896e-05, + "loss": 0.7178, + "step": 67670 + }, + { + "epoch": 1.086373778070274, + "grad_norm": 0.6226595044136047, + "learning_rate": 2.16242134257912e-05, + "loss": 0.815, + "step": 67680 + }, + { + "epoch": 1.0865342942904381, + "grad_norm": 1.5813443660736084, + "learning_rate": 2.161796680467365e-05, + "loss": 0.8738, + "step": 67690 + }, + { + "epoch": 1.086694810510602, + "grad_norm": 1.0229099988937378, + "learning_rate": 2.1611720398633563e-05, + "loss": 0.755, + "step": 67700 + }, + { + "epoch": 1.0868553267307661, + "grad_norm": 0.9456957578659058, + "learning_rate": 2.160547420806817e-05, + "loss": 0.6945, + "step": 67710 + }, + { + "epoch": 1.0870158429509302, + "grad_norm": 0.9677833914756775, + "learning_rate": 2.1599228233374695e-05, + "loss": 0.6328, + "step": 67720 + }, + { + "epoch": 1.0871763591710943, + "grad_norm": 0.931078314781189, + "learning_rate": 2.1592982474950343e-05, + "loss": 0.7139, + "step": 67730 + }, + { + "epoch": 1.0873368753912582, + "grad_norm": 2.2912704944610596, + "learning_rate": 2.1586736933192308e-05, + "loss": 0.5821, + "step": 67740 + }, + { + "epoch": 1.0874973916114223, + "grad_norm": 1.5758126974105835, + "learning_rate": 2.158049160849777e-05, + "loss": 0.7419, + "step": 67750 + }, + { + "epoch": 1.0876579078315864, + "grad_norm": 0.8146055340766907, + "learning_rate": 2.1574246501263897e-05, + "loss": 0.6491, + "step": 67760 + }, + { + "epoch": 1.0878184240517503, + "grad_norm": 1.1073096990585327, + "learning_rate": 2.156800161188784e-05, + "loss": 0.809, + "step": 67770 + }, + { + "epoch": 1.0879789402719144, + "grad_norm": 0.545136570930481, + "learning_rate": 2.1561756940766732e-05, + "loss": 0.6712, + "step": 67780 + }, + { + "epoch": 1.0881394564920785, + "grad_norm": 1.5402919054031372, + "learning_rate": 2.1555512488297703e-05, + "loss": 0.6968, + "step": 67790 + }, + { + "epoch": 1.0882999727122427, + "grad_norm": 0.9783905148506165, + "learning_rate": 2.1549268254877865e-05, + "loss": 0.7503, + "step": 67800 + }, + { + "epoch": 1.0884604889324065, + "grad_norm": 0.8765348792076111, + "learning_rate": 2.1543024240904308e-05, + "loss": 0.7076, + "step": 67810 + }, + { + "epoch": 1.0886210051525707, + "grad_norm": 1.2118886709213257, + "learning_rate": 2.1536780446774134e-05, + "loss": 0.6785, + "step": 67820 + }, + { + "epoch": 1.0887815213727348, + "grad_norm": 1.0218993425369263, + "learning_rate": 2.153053687288438e-05, + "loss": 0.7567, + "step": 67830 + }, + { + "epoch": 1.0889420375928989, + "grad_norm": 0.9643524885177612, + "learning_rate": 2.1524293519632122e-05, + "loss": 0.7344, + "step": 67840 + }, + { + "epoch": 1.0891025538130628, + "grad_norm": 0.9462507367134094, + "learning_rate": 2.1518050387414394e-05, + "loss": 0.7158, + "step": 67850 + }, + { + "epoch": 1.0892630700332269, + "grad_norm": 0.8961391448974609, + "learning_rate": 2.1511807476628227e-05, + "loss": 0.7474, + "step": 67860 + }, + { + "epoch": 1.089423586253391, + "grad_norm": 0.9341515302658081, + "learning_rate": 2.150556478767063e-05, + "loss": 0.7325, + "step": 67870 + }, + { + "epoch": 1.0895841024735549, + "grad_norm": 0.7509249448776245, + "learning_rate": 2.149932232093861e-05, + "loss": 0.7131, + "step": 67880 + }, + { + "epoch": 1.089744618693719, + "grad_norm": 0.712286114692688, + "learning_rate": 2.1493080076829135e-05, + "loss": 0.6715, + "step": 67890 + }, + { + "epoch": 1.089905134913883, + "grad_norm": 0.8513141870498657, + "learning_rate": 2.1486838055739182e-05, + "loss": 0.6931, + "step": 67900 + }, + { + "epoch": 1.0900656511340472, + "grad_norm": 0.668315589427948, + "learning_rate": 2.148059625806571e-05, + "loss": 0.7602, + "step": 67910 + }, + { + "epoch": 1.090226167354211, + "grad_norm": 1.2097899913787842, + "learning_rate": 2.1474354684205662e-05, + "loss": 0.7053, + "step": 67920 + }, + { + "epoch": 1.0903866835743752, + "grad_norm": 1.0090320110321045, + "learning_rate": 2.146811333455596e-05, + "loss": 0.7128, + "step": 67930 + }, + { + "epoch": 1.0905471997945393, + "grad_norm": 1.0403146743774414, + "learning_rate": 2.1461872209513527e-05, + "loss": 0.7411, + "step": 67940 + }, + { + "epoch": 1.0907077160147032, + "grad_norm": 0.7620077729225159, + "learning_rate": 2.145563130947525e-05, + "loss": 0.6782, + "step": 67950 + }, + { + "epoch": 1.0908682322348673, + "grad_norm": 0.9604838490486145, + "learning_rate": 2.1449390634838015e-05, + "loss": 0.8136, + "step": 67960 + }, + { + "epoch": 1.0910287484550314, + "grad_norm": 1.8983228206634521, + "learning_rate": 2.1443150185998696e-05, + "loss": 0.6262, + "step": 67970 + }, + { + "epoch": 1.0911892646751955, + "grad_norm": 1.3547165393829346, + "learning_rate": 2.1436909963354148e-05, + "loss": 0.6914, + "step": 67980 + }, + { + "epoch": 1.0913497808953594, + "grad_norm": 1.1965245008468628, + "learning_rate": 2.1430669967301214e-05, + "loss": 0.7793, + "step": 67990 + }, + { + "epoch": 1.0915102971155235, + "grad_norm": 0.7715591788291931, + "learning_rate": 2.142443019823672e-05, + "loss": 0.6664, + "step": 68000 + }, + { + "epoch": 1.0915102971155235, + "eval_loss": 0.7768766283988953, + "eval_runtime": 1833.8464, + "eval_samples_per_second": 14.304, + "eval_steps_per_second": 1.788, + "step": 68000 + }, + { + "epoch": 1.0916708133356876, + "grad_norm": 1.0044825077056885, + "learning_rate": 2.1418190656557484e-05, + "loss": 0.6698, + "step": 68010 + }, + { + "epoch": 1.0918313295558517, + "grad_norm": 0.8742111921310425, + "learning_rate": 2.1411951342660298e-05, + "loss": 0.7269, + "step": 68020 + }, + { + "epoch": 1.0919918457760156, + "grad_norm": 1.2794451713562012, + "learning_rate": 2.1405712256941946e-05, + "loss": 0.5578, + "step": 68030 + }, + { + "epoch": 1.0921523619961797, + "grad_norm": 1.1462247371673584, + "learning_rate": 2.1399473399799196e-05, + "loss": 0.8198, + "step": 68040 + }, + { + "epoch": 1.0923128782163438, + "grad_norm": 0.7301763296127319, + "learning_rate": 2.1393234771628807e-05, + "loss": 0.6793, + "step": 68050 + }, + { + "epoch": 1.0924733944365077, + "grad_norm": 0.6635832190513611, + "learning_rate": 2.138699637282751e-05, + "loss": 0.6257, + "step": 68060 + }, + { + "epoch": 1.0926339106566718, + "grad_norm": 0.7985203266143799, + "learning_rate": 2.1380758203792047e-05, + "loss": 0.7203, + "step": 68070 + }, + { + "epoch": 1.092794426876836, + "grad_norm": 0.8811420798301697, + "learning_rate": 2.1374520264919114e-05, + "loss": 0.6407, + "step": 68080 + }, + { + "epoch": 1.092954943097, + "grad_norm": 0.9972993731498718, + "learning_rate": 2.1368282556605415e-05, + "loss": 0.7026, + "step": 68090 + }, + { + "epoch": 1.093115459317164, + "grad_norm": 0.7171999216079712, + "learning_rate": 2.1362045079247626e-05, + "loss": 0.6062, + "step": 68100 + }, + { + "epoch": 1.093275975537328, + "grad_norm": 1.155646800994873, + "learning_rate": 2.135580783324242e-05, + "loss": 0.8177, + "step": 68110 + }, + { + "epoch": 1.0934364917574921, + "grad_norm": 1.0108293294906616, + "learning_rate": 2.1349570818986444e-05, + "loss": 0.7447, + "step": 68120 + }, + { + "epoch": 1.0935970079776562, + "grad_norm": 0.8665661215782166, + "learning_rate": 2.134333403687635e-05, + "loss": 0.5923, + "step": 68130 + }, + { + "epoch": 1.0937575241978201, + "grad_norm": 0.847226083278656, + "learning_rate": 2.133709748730874e-05, + "loss": 0.681, + "step": 68140 + }, + { + "epoch": 1.0939180404179842, + "grad_norm": 0.7993077039718628, + "learning_rate": 2.1330861170680228e-05, + "loss": 0.8008, + "step": 68150 + }, + { + "epoch": 1.0940785566381483, + "grad_norm": 0.7775295376777649, + "learning_rate": 2.132462508738741e-05, + "loss": 0.7483, + "step": 68160 + }, + { + "epoch": 1.0942390728583122, + "grad_norm": 0.920616865158081, + "learning_rate": 2.1318389237826867e-05, + "loss": 0.6752, + "step": 68170 + }, + { + "epoch": 1.0943995890784763, + "grad_norm": 1.0175888538360596, + "learning_rate": 2.1312153622395155e-05, + "loss": 0.6389, + "step": 68180 + }, + { + "epoch": 1.0945601052986405, + "grad_norm": 0.869664192199707, + "learning_rate": 2.1305918241488844e-05, + "loss": 0.743, + "step": 68190 + }, + { + "epoch": 1.0947206215188046, + "grad_norm": 0.9353387355804443, + "learning_rate": 2.1299683095504434e-05, + "loss": 0.7054, + "step": 68200 + }, + { + "epoch": 1.0948811377389684, + "grad_norm": 1.1033191680908203, + "learning_rate": 2.1293448184838462e-05, + "loss": 0.7466, + "step": 68210 + }, + { + "epoch": 1.0950416539591326, + "grad_norm": 1.0891528129577637, + "learning_rate": 2.128721350988743e-05, + "loss": 0.6636, + "step": 68220 + }, + { + "epoch": 1.0952021701792967, + "grad_norm": 0.825838029384613, + "learning_rate": 2.128097907104783e-05, + "loss": 0.7338, + "step": 68230 + }, + { + "epoch": 1.0953626863994606, + "grad_norm": 0.8047500252723694, + "learning_rate": 2.127474486871613e-05, + "loss": 0.6568, + "step": 68240 + }, + { + "epoch": 1.0955232026196247, + "grad_norm": 1.1127718687057495, + "learning_rate": 2.126851090328879e-05, + "loss": 0.6463, + "step": 68250 + }, + { + "epoch": 1.0956837188397888, + "grad_norm": 0.7215779423713684, + "learning_rate": 2.126227717516227e-05, + "loss": 0.7505, + "step": 68260 + }, + { + "epoch": 1.0958442350599529, + "grad_norm": 0.9954982399940491, + "learning_rate": 2.125604368473297e-05, + "loss": 0.6976, + "step": 68270 + }, + { + "epoch": 1.0960047512801168, + "grad_norm": 0.8242918848991394, + "learning_rate": 2.1249810432397318e-05, + "loss": 0.7621, + "step": 68280 + }, + { + "epoch": 1.0961652675002809, + "grad_norm": 0.757996141910553, + "learning_rate": 2.124357741855171e-05, + "loss": 0.7782, + "step": 68290 + }, + { + "epoch": 1.096325783720445, + "grad_norm": 0.7532035708427429, + "learning_rate": 2.123734464359253e-05, + "loss": 0.6523, + "step": 68300 + }, + { + "epoch": 1.096486299940609, + "grad_norm": 1.233432412147522, + "learning_rate": 2.1231112107916145e-05, + "loss": 0.6948, + "step": 68310 + }, + { + "epoch": 1.096646816160773, + "grad_norm": 0.8001562356948853, + "learning_rate": 2.122487981191892e-05, + "loss": 0.828, + "step": 68320 + }, + { + "epoch": 1.096807332380937, + "grad_norm": 1.2547094821929932, + "learning_rate": 2.121864775599717e-05, + "loss": 0.7003, + "step": 68330 + }, + { + "epoch": 1.0969678486011012, + "grad_norm": 0.7306742072105408, + "learning_rate": 2.1212415940547236e-05, + "loss": 0.7303, + "step": 68340 + }, + { + "epoch": 1.0971283648212653, + "grad_norm": 1.8987648487091064, + "learning_rate": 2.120618436596541e-05, + "loss": 0.9694, + "step": 68350 + }, + { + "epoch": 1.0972888810414292, + "grad_norm": 1.1589889526367188, + "learning_rate": 2.1199953032647992e-05, + "loss": 0.7181, + "step": 68360 + }, + { + "epoch": 1.0974493972615933, + "grad_norm": 1.0372117757797241, + "learning_rate": 2.119372194099126e-05, + "loss": 0.7035, + "step": 68370 + }, + { + "epoch": 1.0976099134817574, + "grad_norm": 1.1686285734176636, + "learning_rate": 2.1187491091391472e-05, + "loss": 0.6869, + "step": 68380 + }, + { + "epoch": 1.0977704297019213, + "grad_norm": 1.8110508918762207, + "learning_rate": 2.118126048424487e-05, + "loss": 0.8121, + "step": 68390 + }, + { + "epoch": 1.0979309459220854, + "grad_norm": 1.6801395416259766, + "learning_rate": 2.117503011994769e-05, + "loss": 0.6518, + "step": 68400 + }, + { + "epoch": 1.0980914621422495, + "grad_norm": 0.770034909248352, + "learning_rate": 2.1168799998896143e-05, + "loss": 0.5706, + "step": 68410 + }, + { + "epoch": 1.0982519783624136, + "grad_norm": 1.3845000267028809, + "learning_rate": 2.116257012148643e-05, + "loss": 0.7426, + "step": 68420 + }, + { + "epoch": 1.0984124945825775, + "grad_norm": 1.289565086364746, + "learning_rate": 2.1156340488114732e-05, + "loss": 0.6799, + "step": 68430 + }, + { + "epoch": 1.0985730108027416, + "grad_norm": 1.8447660207748413, + "learning_rate": 2.115011109917723e-05, + "loss": 0.6723, + "step": 68440 + }, + { + "epoch": 1.0987335270229057, + "grad_norm": 0.601520836353302, + "learning_rate": 2.1143881955070056e-05, + "loss": 0.7633, + "step": 68450 + }, + { + "epoch": 1.0988940432430696, + "grad_norm": 1.043160080909729, + "learning_rate": 2.1137653056189354e-05, + "loss": 0.8415, + "step": 68460 + }, + { + "epoch": 1.0990545594632337, + "grad_norm": 0.7790906429290771, + "learning_rate": 2.113142440293125e-05, + "loss": 0.71, + "step": 68470 + }, + { + "epoch": 1.0992150756833978, + "grad_norm": 0.8653931617736816, + "learning_rate": 2.1125195995691846e-05, + "loss": 0.7556, + "step": 68480 + }, + { + "epoch": 1.099375591903562, + "grad_norm": 1.4586610794067383, + "learning_rate": 2.111896783486724e-05, + "loss": 0.801, + "step": 68490 + }, + { + "epoch": 1.0995361081237258, + "grad_norm": 0.871461033821106, + "learning_rate": 2.1112739920853504e-05, + "loss": 0.5678, + "step": 68500 + }, + { + "epoch": 1.09969662434389, + "grad_norm": 1.0507992506027222, + "learning_rate": 2.1106512254046683e-05, + "loss": 0.8477, + "step": 68510 + }, + { + "epoch": 1.099857140564054, + "grad_norm": 0.8119199275970459, + "learning_rate": 2.110028483484283e-05, + "loss": 0.6977, + "step": 68520 + }, + { + "epoch": 1.1000176567842181, + "grad_norm": 1.3355234861373901, + "learning_rate": 2.1094057663637976e-05, + "loss": 0.6497, + "step": 68530 + }, + { + "epoch": 1.100178173004382, + "grad_norm": 0.9324609041213989, + "learning_rate": 2.1087830740828128e-05, + "loss": 0.821, + "step": 68540 + }, + { + "epoch": 1.1003386892245461, + "grad_norm": 1.5952492952346802, + "learning_rate": 2.1081604066809282e-05, + "loss": 0.5717, + "step": 68550 + }, + { + "epoch": 1.1004992054447102, + "grad_norm": 0.9200400710105896, + "learning_rate": 2.1075377641977416e-05, + "loss": 0.7228, + "step": 68560 + }, + { + "epoch": 1.1006597216648741, + "grad_norm": 0.7142199277877808, + "learning_rate": 2.106915146672851e-05, + "loss": 0.7403, + "step": 68570 + }, + { + "epoch": 1.1008202378850382, + "grad_norm": 0.7741956114768982, + "learning_rate": 2.1062925541458493e-05, + "loss": 0.6596, + "step": 68580 + }, + { + "epoch": 1.1009807541052024, + "grad_norm": 0.8583527207374573, + "learning_rate": 2.10566998665633e-05, + "loss": 0.7205, + "step": 68590 + }, + { + "epoch": 1.1011412703253665, + "grad_norm": 1.0759841203689575, + "learning_rate": 2.1050474442438847e-05, + "loss": 0.7831, + "step": 68600 + }, + { + "epoch": 1.1013017865455303, + "grad_norm": 0.8792186975479126, + "learning_rate": 2.104424926948104e-05, + "loss": 0.6683, + "step": 68610 + }, + { + "epoch": 1.1014623027656945, + "grad_norm": 0.8426170349121094, + "learning_rate": 2.103802434808576e-05, + "loss": 0.6955, + "step": 68620 + }, + { + "epoch": 1.1016228189858586, + "grad_norm": 0.8554047346115112, + "learning_rate": 2.1031799678648887e-05, + "loss": 0.7304, + "step": 68630 + }, + { + "epoch": 1.1017833352060227, + "grad_norm": 0.8066012859344482, + "learning_rate": 2.1025575261566253e-05, + "loss": 0.6308, + "step": 68640 + }, + { + "epoch": 1.1019438514261866, + "grad_norm": 1.1444085836410522, + "learning_rate": 2.101935109723371e-05, + "loss": 0.7277, + "step": 68650 + }, + { + "epoch": 1.1021043676463507, + "grad_norm": 0.6307269334793091, + "learning_rate": 2.1013127186047078e-05, + "loss": 0.6455, + "step": 68660 + }, + { + "epoch": 1.1022648838665148, + "grad_norm": 0.802040696144104, + "learning_rate": 2.1006903528402146e-05, + "loss": 0.7916, + "step": 68670 + }, + { + "epoch": 1.1024254000866787, + "grad_norm": 0.7707061171531677, + "learning_rate": 2.1000680124694718e-05, + "loss": 0.673, + "step": 68680 + }, + { + "epoch": 1.1025859163068428, + "grad_norm": 1.4700356721878052, + "learning_rate": 2.0994456975320563e-05, + "loss": 0.6242, + "step": 68690 + }, + { + "epoch": 1.1027464325270069, + "grad_norm": 0.7093893885612488, + "learning_rate": 2.098823408067543e-05, + "loss": 0.6901, + "step": 68700 + }, + { + "epoch": 1.102906948747171, + "grad_norm": 0.6678190231323242, + "learning_rate": 2.098201144115506e-05, + "loss": 0.6902, + "step": 68710 + }, + { + "epoch": 1.1030674649673349, + "grad_norm": 1.2830443382263184, + "learning_rate": 2.097578905715518e-05, + "loss": 0.6223, + "step": 68720 + }, + { + "epoch": 1.103227981187499, + "grad_norm": 1.273393154144287, + "learning_rate": 2.09695669290715e-05, + "loss": 0.6807, + "step": 68730 + }, + { + "epoch": 1.103388497407663, + "grad_norm": 0.5713159441947937, + "learning_rate": 2.0963345057299705e-05, + "loss": 0.779, + "step": 68740 + }, + { + "epoch": 1.103549013627827, + "grad_norm": 0.8259296417236328, + "learning_rate": 2.095712344223548e-05, + "loss": 0.707, + "step": 68750 + }, + { + "epoch": 1.103709529847991, + "grad_norm": 1.3664090633392334, + "learning_rate": 2.0950902084274464e-05, + "loss": 0.6414, + "step": 68760 + }, + { + "epoch": 1.1038700460681552, + "grad_norm": 1.1689563989639282, + "learning_rate": 2.0944680983812314e-05, + "loss": 0.7397, + "step": 68770 + }, + { + "epoch": 1.1040305622883193, + "grad_norm": 0.7239499688148499, + "learning_rate": 2.0938460141244648e-05, + "loss": 0.6119, + "step": 68780 + }, + { + "epoch": 1.1041910785084832, + "grad_norm": 2.3550024032592773, + "learning_rate": 2.0932239556967083e-05, + "loss": 0.8468, + "step": 68790 + }, + { + "epoch": 1.1043515947286473, + "grad_norm": 0.8912999033927917, + "learning_rate": 2.09260192313752e-05, + "loss": 0.7154, + "step": 68800 + }, + { + "epoch": 1.1045121109488114, + "grad_norm": 1.5183225870132446, + "learning_rate": 2.0919799164864585e-05, + "loss": 0.7142, + "step": 68810 + }, + { + "epoch": 1.1046726271689755, + "grad_norm": 0.6252288818359375, + "learning_rate": 2.0913579357830805e-05, + "loss": 0.6552, + "step": 68820 + }, + { + "epoch": 1.1048331433891394, + "grad_norm": 0.7746715545654297, + "learning_rate": 2.0907359810669383e-05, + "loss": 0.6697, + "step": 68830 + }, + { + "epoch": 1.1049936596093035, + "grad_norm": 0.9670315980911255, + "learning_rate": 2.0901140523775852e-05, + "loss": 0.5564, + "step": 68840 + }, + { + "epoch": 1.1051541758294676, + "grad_norm": 0.6984583139419556, + "learning_rate": 2.0894921497545733e-05, + "loss": 0.6114, + "step": 68850 + }, + { + "epoch": 1.1053146920496317, + "grad_norm": 0.7399809956550598, + "learning_rate": 2.0888702732374506e-05, + "loss": 0.7494, + "step": 68860 + }, + { + "epoch": 1.1054752082697956, + "grad_norm": 1.3291066884994507, + "learning_rate": 2.0882484228657657e-05, + "loss": 0.6983, + "step": 68870 + }, + { + "epoch": 1.1056357244899597, + "grad_norm": 0.8275732398033142, + "learning_rate": 2.0876265986790656e-05, + "loss": 0.6235, + "step": 68880 + }, + { + "epoch": 1.1057962407101238, + "grad_norm": 0.8016736507415771, + "learning_rate": 2.087004800716892e-05, + "loss": 0.8496, + "step": 68890 + }, + { + "epoch": 1.1059567569302877, + "grad_norm": 0.6588501334190369, + "learning_rate": 2.0863830290187894e-05, + "loss": 0.6974, + "step": 68900 + }, + { + "epoch": 1.1061172731504518, + "grad_norm": 0.907826840877533, + "learning_rate": 2.0857612836242982e-05, + "loss": 0.7321, + "step": 68910 + }, + { + "epoch": 1.106277789370616, + "grad_norm": 1.0248390436172485, + "learning_rate": 2.0851395645729582e-05, + "loss": 0.62, + "step": 68920 + }, + { + "epoch": 1.10643830559078, + "grad_norm": 1.144977331161499, + "learning_rate": 2.084517871904307e-05, + "loss": 0.8728, + "step": 68930 + }, + { + "epoch": 1.106598821810944, + "grad_norm": 0.9303678274154663, + "learning_rate": 2.0838962056578807e-05, + "loss": 0.6882, + "step": 68940 + }, + { + "epoch": 1.106759338031108, + "grad_norm": 1.444917917251587, + "learning_rate": 2.083274565873213e-05, + "loss": 0.6837, + "step": 68950 + }, + { + "epoch": 1.1069198542512722, + "grad_norm": 0.729026734828949, + "learning_rate": 2.082652952589837e-05, + "loss": 0.7154, + "step": 68960 + }, + { + "epoch": 1.107080370471436, + "grad_norm": 0.7368753552436829, + "learning_rate": 2.0820313658472842e-05, + "loss": 0.8083, + "step": 68970 + }, + { + "epoch": 1.1072408866916001, + "grad_norm": 1.0095508098602295, + "learning_rate": 2.0814098056850828e-05, + "loss": 0.694, + "step": 68980 + }, + { + "epoch": 1.1074014029117643, + "grad_norm": 0.6127718091011047, + "learning_rate": 2.080788272142761e-05, + "loss": 0.723, + "step": 68990 + }, + { + "epoch": 1.1075619191319284, + "grad_norm": 1.1369646787643433, + "learning_rate": 2.080166765259845e-05, + "loss": 0.7485, + "step": 69000 + }, + { + "epoch": 1.1077224353520922, + "grad_norm": 0.7714826464653015, + "learning_rate": 2.0795452850758577e-05, + "loss": 0.6701, + "step": 69010 + }, + { + "epoch": 1.1078829515722564, + "grad_norm": 0.5569642782211304, + "learning_rate": 2.078923831630323e-05, + "loss": 0.594, + "step": 69020 + }, + { + "epoch": 1.1080434677924205, + "grad_norm": 1.642685890197754, + "learning_rate": 2.078302404962761e-05, + "loss": 0.7195, + "step": 69030 + }, + { + "epoch": 1.1082039840125846, + "grad_norm": 0.9469515681266785, + "learning_rate": 2.0776810051126907e-05, + "loss": 0.7804, + "step": 69040 + }, + { + "epoch": 1.1083645002327485, + "grad_norm": 1.2768466472625732, + "learning_rate": 2.07705963211963e-05, + "loss": 0.5709, + "step": 69050 + }, + { + "epoch": 1.1085250164529126, + "grad_norm": 0.858999490737915, + "learning_rate": 2.076438286023094e-05, + "loss": 0.6789, + "step": 69060 + }, + { + "epoch": 1.1086855326730767, + "grad_norm": 0.9078071713447571, + "learning_rate": 2.075816966862598e-05, + "loss": 0.5811, + "step": 69070 + }, + { + "epoch": 1.1088460488932406, + "grad_norm": 2.0016472339630127, + "learning_rate": 2.075195674677653e-05, + "loss": 0.822, + "step": 69080 + }, + { + "epoch": 1.1090065651134047, + "grad_norm": 1.1035739183425903, + "learning_rate": 2.0745744095077684e-05, + "loss": 0.7314, + "step": 69090 + }, + { + "epoch": 1.1091670813335688, + "grad_norm": 1.3359339237213135, + "learning_rate": 2.0739531713924554e-05, + "loss": 0.59, + "step": 69100 + }, + { + "epoch": 1.109327597553733, + "grad_norm": 0.6590023636817932, + "learning_rate": 2.0733319603712195e-05, + "loss": 0.769, + "step": 69110 + }, + { + "epoch": 1.1094881137738968, + "grad_norm": 0.9555895924568176, + "learning_rate": 2.0727107764835667e-05, + "loss": 0.7145, + "step": 69120 + }, + { + "epoch": 1.1096486299940609, + "grad_norm": 0.8854460716247559, + "learning_rate": 2.0720896197690018e-05, + "loss": 0.7826, + "step": 69130 + }, + { + "epoch": 1.109809146214225, + "grad_norm": 1.3088352680206299, + "learning_rate": 2.0714684902670245e-05, + "loss": 0.7663, + "step": 69140 + }, + { + "epoch": 1.109969662434389, + "grad_norm": 0.7963261604309082, + "learning_rate": 2.070847388017136e-05, + "loss": 0.8101, + "step": 69150 + }, + { + "epoch": 1.110130178654553, + "grad_norm": 0.6888970136642456, + "learning_rate": 2.0702263130588346e-05, + "loss": 0.7326, + "step": 69160 + }, + { + "epoch": 1.110290694874717, + "grad_norm": 1.376157283782959, + "learning_rate": 2.069605265431617e-05, + "loss": 0.6736, + "step": 69170 + }, + { + "epoch": 1.1104512110948812, + "grad_norm": 1.264936923980713, + "learning_rate": 2.0689842451749787e-05, + "loss": 0.7764, + "step": 69180 + }, + { + "epoch": 1.110611727315045, + "grad_norm": 0.9382315874099731, + "learning_rate": 2.0683632523284135e-05, + "loss": 0.8309, + "step": 69190 + }, + { + "epoch": 1.1107722435352092, + "grad_norm": 1.0777426958084106, + "learning_rate": 2.0677422869314108e-05, + "loss": 0.6597, + "step": 69200 + }, + { + "epoch": 1.1109327597553733, + "grad_norm": 0.9216499328613281, + "learning_rate": 2.067121349023462e-05, + "loss": 0.7012, + "step": 69210 + }, + { + "epoch": 1.1110932759755374, + "grad_norm": 0.7044641375541687, + "learning_rate": 2.066500438644054e-05, + "loss": 0.7748, + "step": 69220 + }, + { + "epoch": 1.1112537921957013, + "grad_norm": 0.7732403874397278, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.786, + "step": 69230 + }, + { + "epoch": 1.1114143084158654, + "grad_norm": 0.8281802535057068, + "learning_rate": 2.0652587006288065e-05, + "loss": 0.7509, + "step": 69240 + }, + { + "epoch": 1.1115748246360295, + "grad_norm": 1.1165003776550293, + "learning_rate": 2.0646378730719344e-05, + "loss": 0.7063, + "step": 69250 + }, + { + "epoch": 1.1117353408561934, + "grad_norm": 1.2063367366790771, + "learning_rate": 2.0640170732015377e-05, + "loss": 0.6659, + "step": 69260 + }, + { + "epoch": 1.1118958570763575, + "grad_norm": 1.7235946655273438, + "learning_rate": 2.0633963010570962e-05, + "loss": 0.682, + "step": 69270 + }, + { + "epoch": 1.1120563732965216, + "grad_norm": 0.9076159000396729, + "learning_rate": 2.0627755566780878e-05, + "loss": 0.7645, + "step": 69280 + }, + { + "epoch": 1.1122168895166857, + "grad_norm": 1.0548783540725708, + "learning_rate": 2.0621548401039872e-05, + "loss": 0.7762, + "step": 69290 + }, + { + "epoch": 1.1123774057368496, + "grad_norm": 0.9500921964645386, + "learning_rate": 2.061534151374269e-05, + "loss": 0.7002, + "step": 69300 + }, + { + "epoch": 1.1125379219570137, + "grad_norm": 0.8893677592277527, + "learning_rate": 2.0609134905284056e-05, + "loss": 0.7068, + "step": 69310 + }, + { + "epoch": 1.1126984381771778, + "grad_norm": 0.8698769807815552, + "learning_rate": 2.0602928576058666e-05, + "loss": 0.7994, + "step": 69320 + }, + { + "epoch": 1.112858954397342, + "grad_norm": 0.5990699529647827, + "learning_rate": 2.0596722526461208e-05, + "loss": 0.7437, + "step": 69330 + }, + { + "epoch": 1.1130194706175058, + "grad_norm": 0.7893218398094177, + "learning_rate": 2.0590516756886357e-05, + "loss": 0.6793, + "step": 69340 + }, + { + "epoch": 1.11317998683767, + "grad_norm": 0.9129883646965027, + "learning_rate": 2.0584311267728757e-05, + "loss": 0.6693, + "step": 69350 + }, + { + "epoch": 1.113340503057834, + "grad_norm": 0.773200273513794, + "learning_rate": 2.0578106059383042e-05, + "loss": 0.9286, + "step": 69360 + }, + { + "epoch": 1.113501019277998, + "grad_norm": 0.6676891446113586, + "learning_rate": 2.057190113224383e-05, + "loss": 0.669, + "step": 69370 + }, + { + "epoch": 1.113661535498162, + "grad_norm": 1.1670299768447876, + "learning_rate": 2.0565696486705725e-05, + "loss": 0.6518, + "step": 69380 + }, + { + "epoch": 1.1138220517183262, + "grad_norm": 2.7660391330718994, + "learning_rate": 2.0559492123163283e-05, + "loss": 0.6836, + "step": 69390 + }, + { + "epoch": 1.1139825679384903, + "grad_norm": 0.8590372800827026, + "learning_rate": 2.0553288042011085e-05, + "loss": 0.6692, + "step": 69400 + }, + { + "epoch": 1.1141430841586542, + "grad_norm": 1.3292815685272217, + "learning_rate": 2.0547084243643662e-05, + "loss": 0.7408, + "step": 69410 + }, + { + "epoch": 1.1143036003788183, + "grad_norm": 1.3933672904968262, + "learning_rate": 2.054088072845555e-05, + "loss": 0.733, + "step": 69420 + }, + { + "epoch": 1.1144641165989824, + "grad_norm": 0.9685540795326233, + "learning_rate": 2.053467749684125e-05, + "loss": 0.5927, + "step": 69430 + }, + { + "epoch": 1.1146246328191465, + "grad_norm": 0.7435873746871948, + "learning_rate": 2.0528474549195262e-05, + "loss": 0.7883, + "step": 69440 + }, + { + "epoch": 1.1147851490393104, + "grad_norm": 1.3046386241912842, + "learning_rate": 2.0522271885912036e-05, + "loss": 0.7647, + "step": 69450 + }, + { + "epoch": 1.1149456652594745, + "grad_norm": 0.8985583782196045, + "learning_rate": 2.0516069507386037e-05, + "loss": 0.7887, + "step": 69460 + }, + { + "epoch": 1.1151061814796386, + "grad_norm": 1.0494903326034546, + "learning_rate": 2.05098674140117e-05, + "loss": 0.7098, + "step": 69470 + }, + { + "epoch": 1.1152666976998025, + "grad_norm": 0.8415878415107727, + "learning_rate": 2.050366560618344e-05, + "loss": 0.6794, + "step": 69480 + }, + { + "epoch": 1.1154272139199666, + "grad_norm": 0.6146888136863708, + "learning_rate": 2.0497464084295658e-05, + "loss": 0.6164, + "step": 69490 + }, + { + "epoch": 1.1155877301401307, + "grad_norm": 1.3842048645019531, + "learning_rate": 2.0491262848742745e-05, + "loss": 0.7208, + "step": 69500 + }, + { + "epoch": 1.1157482463602948, + "grad_norm": 0.75701904296875, + "learning_rate": 2.048506189991904e-05, + "loss": 0.715, + "step": 69510 + }, + { + "epoch": 1.1159087625804587, + "grad_norm": 1.630330204963684, + "learning_rate": 2.0478861238218897e-05, + "loss": 0.5458, + "step": 69520 + }, + { + "epoch": 1.1160692788006228, + "grad_norm": 1.48627769947052, + "learning_rate": 2.0472660864036644e-05, + "loss": 0.6858, + "step": 69530 + }, + { + "epoch": 1.116229795020787, + "grad_norm": 0.7753258347511292, + "learning_rate": 2.0466460777766586e-05, + "loss": 0.6284, + "step": 69540 + }, + { + "epoch": 1.1163903112409508, + "grad_norm": 0.9214838743209839, + "learning_rate": 2.0460260979803016e-05, + "loss": 0.6069, + "step": 69550 + }, + { + "epoch": 1.116550827461115, + "grad_norm": 0.8224284052848816, + "learning_rate": 2.0454061470540203e-05, + "loss": 0.7525, + "step": 69560 + }, + { + "epoch": 1.116711343681279, + "grad_norm": 0.7702061533927917, + "learning_rate": 2.0447862250372396e-05, + "loss": 0.6824, + "step": 69570 + }, + { + "epoch": 1.116871859901443, + "grad_norm": 1.1357429027557373, + "learning_rate": 2.044166331969383e-05, + "loss": 0.774, + "step": 69580 + }, + { + "epoch": 1.117032376121607, + "grad_norm": 1.1858059167861938, + "learning_rate": 2.0435464678898726e-05, + "loss": 0.5874, + "step": 69590 + }, + { + "epoch": 1.117192892341771, + "grad_norm": 0.9193571209907532, + "learning_rate": 2.0429266328381275e-05, + "loss": 0.7648, + "step": 69600 + }, + { + "epoch": 1.1173534085619352, + "grad_norm": 0.9807444214820862, + "learning_rate": 2.042306826853566e-05, + "loss": 0.725, + "step": 69610 + }, + { + "epoch": 1.1175139247820993, + "grad_norm": 0.8438572883605957, + "learning_rate": 2.0416870499756034e-05, + "loss": 0.7573, + "step": 69620 + }, + { + "epoch": 1.1176744410022632, + "grad_norm": 1.840366244316101, + "learning_rate": 2.041067302243655e-05, + "loss": 0.7192, + "step": 69630 + }, + { + "epoch": 1.1178349572224273, + "grad_norm": 0.6595022082328796, + "learning_rate": 2.0404475836971323e-05, + "loss": 0.661, + "step": 69640 + }, + { + "epoch": 1.1179954734425914, + "grad_norm": 0.8055015206336975, + "learning_rate": 2.0398278943754458e-05, + "loss": 0.7445, + "step": 69650 + }, + { + "epoch": 1.1181559896627555, + "grad_norm": 0.6955201029777527, + "learning_rate": 2.0392082343180045e-05, + "loss": 0.6612, + "step": 69660 + }, + { + "epoch": 1.1183165058829194, + "grad_norm": 1.2687081098556519, + "learning_rate": 2.038588603564215e-05, + "loss": 0.7176, + "step": 69670 + }, + { + "epoch": 1.1184770221030835, + "grad_norm": 0.9785808324813843, + "learning_rate": 2.0379690021534817e-05, + "loss": 0.7797, + "step": 69680 + }, + { + "epoch": 1.1186375383232476, + "grad_norm": 0.7270053029060364, + "learning_rate": 2.037349430125209e-05, + "loss": 0.6198, + "step": 69690 + }, + { + "epoch": 1.1187980545434115, + "grad_norm": 1.174198865890503, + "learning_rate": 2.0367298875187965e-05, + "loss": 0.6845, + "step": 69700 + }, + { + "epoch": 1.1189585707635756, + "grad_norm": 1.3177013397216797, + "learning_rate": 2.0361103743736438e-05, + "loss": 0.7837, + "step": 69710 + }, + { + "epoch": 1.1191190869837397, + "grad_norm": 0.8952268362045288, + "learning_rate": 2.0354908907291483e-05, + "loss": 0.6761, + "step": 69720 + }, + { + "epoch": 1.1192796032039039, + "grad_norm": 1.0071132183074951, + "learning_rate": 2.034871436624706e-05, + "loss": 0.6348, + "step": 69730 + }, + { + "epoch": 1.1194401194240677, + "grad_norm": 1.069226622581482, + "learning_rate": 2.0342520120997103e-05, + "loss": 0.8257, + "step": 69740 + }, + { + "epoch": 1.1196006356442318, + "grad_norm": 0.6568140387535095, + "learning_rate": 2.033632617193554e-05, + "loss": 0.7326, + "step": 69750 + }, + { + "epoch": 1.119761151864396, + "grad_norm": 1.1707936525344849, + "learning_rate": 2.0330132519456248e-05, + "loss": 0.7139, + "step": 69760 + }, + { + "epoch": 1.1199216680845598, + "grad_norm": 0.7658221125602722, + "learning_rate": 2.032393916395312e-05, + "loss": 0.7115, + "step": 69770 + }, + { + "epoch": 1.120082184304724, + "grad_norm": 0.9459195733070374, + "learning_rate": 2.0317746105820015e-05, + "loss": 0.7242, + "step": 69780 + }, + { + "epoch": 1.120242700524888, + "grad_norm": 0.8330639600753784, + "learning_rate": 2.0311553345450775e-05, + "loss": 0.8289, + "step": 69790 + }, + { + "epoch": 1.1204032167450522, + "grad_norm": 0.8658886551856995, + "learning_rate": 2.0305360883239226e-05, + "loss": 0.6316, + "step": 69800 + }, + { + "epoch": 1.120563732965216, + "grad_norm": 1.2123558521270752, + "learning_rate": 2.0299168719579174e-05, + "loss": 0.6574, + "step": 69810 + }, + { + "epoch": 1.1207242491853802, + "grad_norm": 1.1088024377822876, + "learning_rate": 2.02929768548644e-05, + "loss": 0.7187, + "step": 69820 + }, + { + "epoch": 1.1208847654055443, + "grad_norm": 0.9116347432136536, + "learning_rate": 2.0286785289488664e-05, + "loss": 0.6406, + "step": 69830 + }, + { + "epoch": 1.1210452816257084, + "grad_norm": 0.9112764000892639, + "learning_rate": 2.028059402384572e-05, + "loss": 0.8218, + "step": 69840 + }, + { + "epoch": 1.1212057978458723, + "grad_norm": 0.9144654273986816, + "learning_rate": 2.0274403058329296e-05, + "loss": 0.5767, + "step": 69850 + }, + { + "epoch": 1.1213663140660364, + "grad_norm": 0.9670818448066711, + "learning_rate": 2.02682123933331e-05, + "loss": 0.7262, + "step": 69860 + }, + { + "epoch": 1.1215268302862005, + "grad_norm": 0.6744566559791565, + "learning_rate": 2.0262022029250826e-05, + "loss": 0.7848, + "step": 69870 + }, + { + "epoch": 1.1216873465063644, + "grad_norm": 0.745139479637146, + "learning_rate": 2.0255831966476148e-05, + "loss": 0.7207, + "step": 69880 + }, + { + "epoch": 1.1218478627265285, + "grad_norm": 0.8722692131996155, + "learning_rate": 2.0249642205402706e-05, + "loss": 0.6959, + "step": 69890 + }, + { + "epoch": 1.1220083789466926, + "grad_norm": 0.7329268455505371, + "learning_rate": 2.0243452746424135e-05, + "loss": 0.7437, + "step": 69900 + }, + { + "epoch": 1.1221688951668567, + "grad_norm": 0.9158763289451599, + "learning_rate": 2.023726358993406e-05, + "loss": 0.7723, + "step": 69910 + }, + { + "epoch": 1.1223294113870206, + "grad_norm": 1.05130136013031, + "learning_rate": 2.0231074736326055e-05, + "loss": 0.8246, + "step": 69920 + }, + { + "epoch": 1.1224899276071847, + "grad_norm": 0.9330126047134399, + "learning_rate": 2.0224886185993712e-05, + "loss": 0.6303, + "step": 69930 + }, + { + "epoch": 1.1226504438273488, + "grad_norm": 1.6907577514648438, + "learning_rate": 2.0218697939330582e-05, + "loss": 0.6276, + "step": 69940 + }, + { + "epoch": 1.122810960047513, + "grad_norm": 0.9110316634178162, + "learning_rate": 2.0212509996730196e-05, + "loss": 0.7121, + "step": 69950 + }, + { + "epoch": 1.1229714762676768, + "grad_norm": 0.993692934513092, + "learning_rate": 2.0206322358586076e-05, + "loss": 0.7313, + "step": 69960 + }, + { + "epoch": 1.123131992487841, + "grad_norm": 1.1709916591644287, + "learning_rate": 2.0200135025291715e-05, + "loss": 0.7265, + "step": 69970 + }, + { + "epoch": 1.123292508708005, + "grad_norm": 0.68226557970047, + "learning_rate": 2.0193947997240598e-05, + "loss": 0.7738, + "step": 69980 + }, + { + "epoch": 1.123453024928169, + "grad_norm": 0.923901379108429, + "learning_rate": 2.018776127482618e-05, + "loss": 0.72, + "step": 69990 + }, + { + "epoch": 1.123613541148333, + "grad_norm": 1.2417497634887695, + "learning_rate": 2.0181574858441904e-05, + "loss": 0.6415, + "step": 70000 + }, + { + "epoch": 1.1237740573684971, + "grad_norm": 1.6825902462005615, + "learning_rate": 2.017538874848118e-05, + "loss": 0.7063, + "step": 70010 + }, + { + "epoch": 1.1239345735886612, + "grad_norm": 0.7422142624855042, + "learning_rate": 2.016920294533741e-05, + "loss": 0.7021, + "step": 70020 + }, + { + "epoch": 1.124095089808825, + "grad_norm": 0.7932807207107544, + "learning_rate": 2.0163017449403986e-05, + "loss": 0.7011, + "step": 70030 + }, + { + "epoch": 1.1242556060289892, + "grad_norm": 0.9918656945228577, + "learning_rate": 2.0156832261074254e-05, + "loss": 0.7155, + "step": 70040 + }, + { + "epoch": 1.1244161222491533, + "grad_norm": 0.9513747692108154, + "learning_rate": 2.015064738074157e-05, + "loss": 0.7169, + "step": 70050 + }, + { + "epoch": 1.1245766384693172, + "grad_norm": 1.2416037321090698, + "learning_rate": 2.014446280879926e-05, + "loss": 0.7008, + "step": 70060 + }, + { + "epoch": 1.1247371546894813, + "grad_norm": 0.8709542751312256, + "learning_rate": 2.0138278545640603e-05, + "loss": 0.7221, + "step": 70070 + }, + { + "epoch": 1.1248976709096454, + "grad_norm": 0.875582218170166, + "learning_rate": 2.0132094591658894e-05, + "loss": 0.7674, + "step": 70080 + }, + { + "epoch": 1.1250581871298095, + "grad_norm": 1.1155900955200195, + "learning_rate": 2.0125910947247398e-05, + "loss": 0.6731, + "step": 70090 + }, + { + "epoch": 1.1252187033499734, + "grad_norm": 1.0152891874313354, + "learning_rate": 2.011972761279936e-05, + "loss": 0.6501, + "step": 70100 + }, + { + "epoch": 1.1253792195701375, + "grad_norm": 1.0427809953689575, + "learning_rate": 2.0113544588708e-05, + "loss": 0.6977, + "step": 70110 + }, + { + "epoch": 1.1255397357903016, + "grad_norm": 0.9612036943435669, + "learning_rate": 2.010736187536653e-05, + "loss": 0.6937, + "step": 70120 + }, + { + "epoch": 1.1257002520104658, + "grad_norm": 0.8397372961044312, + "learning_rate": 2.0101179473168124e-05, + "loss": 0.6523, + "step": 70130 + }, + { + "epoch": 1.1258607682306296, + "grad_norm": 1.0032190084457397, + "learning_rate": 2.009499738250595e-05, + "loss": 0.7224, + "step": 70140 + }, + { + "epoch": 1.1260212844507937, + "grad_norm": 0.7088655233383179, + "learning_rate": 2.008881560377315e-05, + "loss": 0.6971, + "step": 70150 + }, + { + "epoch": 1.1261818006709579, + "grad_norm": 1.2887359857559204, + "learning_rate": 2.008263413736285e-05, + "loss": 0.7257, + "step": 70160 + }, + { + "epoch": 1.126342316891122, + "grad_norm": 0.9088945984840393, + "learning_rate": 2.0076452983668163e-05, + "loss": 0.6509, + "step": 70170 + }, + { + "epoch": 1.1265028331112859, + "grad_norm": 1.1984392404556274, + "learning_rate": 2.0070272143082164e-05, + "loss": 0.6419, + "step": 70180 + }, + { + "epoch": 1.12666334933145, + "grad_norm": 1.1401828527450562, + "learning_rate": 2.0064091615997926e-05, + "loss": 0.7125, + "step": 70190 + }, + { + "epoch": 1.126823865551614, + "grad_norm": 1.0058754682540894, + "learning_rate": 2.0057911402808487e-05, + "loss": 0.6194, + "step": 70200 + }, + { + "epoch": 1.126984381771778, + "grad_norm": 0.7116989493370056, + "learning_rate": 2.0051731503906878e-05, + "loss": 0.757, + "step": 70210 + }, + { + "epoch": 1.127144897991942, + "grad_norm": 0.8172330260276794, + "learning_rate": 2.0045551919686103e-05, + "loss": 0.6566, + "step": 70220 + }, + { + "epoch": 1.1273054142121062, + "grad_norm": 0.8288361430168152, + "learning_rate": 2.0039372650539145e-05, + "loss": 0.7513, + "step": 70230 + }, + { + "epoch": 1.1274659304322703, + "grad_norm": 0.8053216338157654, + "learning_rate": 2.0033193696858967e-05, + "loss": 0.7108, + "step": 70240 + }, + { + "epoch": 1.1276264466524342, + "grad_norm": 1.6882612705230713, + "learning_rate": 2.0027015059038526e-05, + "loss": 0.7699, + "step": 70250 + }, + { + "epoch": 1.1277869628725983, + "grad_norm": 1.0694619417190552, + "learning_rate": 2.002083673747073e-05, + "loss": 0.5577, + "step": 70260 + }, + { + "epoch": 1.1279474790927624, + "grad_norm": 0.9332296848297119, + "learning_rate": 2.0014658732548498e-05, + "loss": 0.6955, + "step": 70270 + }, + { + "epoch": 1.1281079953129263, + "grad_norm": 0.9988384246826172, + "learning_rate": 2.0008481044664703e-05, + "loss": 0.74, + "step": 70280 + }, + { + "epoch": 1.1282685115330904, + "grad_norm": 0.7518426179885864, + "learning_rate": 2.000230367421222e-05, + "loss": 0.7217, + "step": 70290 + }, + { + "epoch": 1.1284290277532545, + "grad_norm": 0.9460627436637878, + "learning_rate": 1.999612662158389e-05, + "loss": 0.6608, + "step": 70300 + }, + { + "epoch": 1.1285895439734186, + "grad_norm": 1.067423701286316, + "learning_rate": 1.9989949887172548e-05, + "loss": 0.6831, + "step": 70310 + }, + { + "epoch": 1.1287500601935825, + "grad_norm": 0.670609176158905, + "learning_rate": 1.9983773471370975e-05, + "loss": 0.6196, + "step": 70320 + }, + { + "epoch": 1.1289105764137466, + "grad_norm": 1.5692217350006104, + "learning_rate": 1.9977597374571967e-05, + "loss": 0.5724, + "step": 70330 + }, + { + "epoch": 1.1290710926339107, + "grad_norm": 0.5693376660346985, + "learning_rate": 1.9971421597168288e-05, + "loss": 0.702, + "step": 70340 + }, + { + "epoch": 1.1292316088540746, + "grad_norm": 0.7924991846084595, + "learning_rate": 1.9965246139552682e-05, + "loss": 0.7143, + "step": 70350 + }, + { + "epoch": 1.1293921250742387, + "grad_norm": 1.1956628561019897, + "learning_rate": 1.9959071002117868e-05, + "loss": 0.8178, + "step": 70360 + }, + { + "epoch": 1.1295526412944028, + "grad_norm": 1.0798544883728027, + "learning_rate": 1.9952896185256564e-05, + "loss": 0.6594, + "step": 70370 + }, + { + "epoch": 1.129713157514567, + "grad_norm": 0.5931274890899658, + "learning_rate": 1.9946721689361427e-05, + "loss": 0.7846, + "step": 70380 + }, + { + "epoch": 1.1298736737347308, + "grad_norm": 1.0613905191421509, + "learning_rate": 1.994054751482513e-05, + "loss": 0.7181, + "step": 70390 + }, + { + "epoch": 1.130034189954895, + "grad_norm": 0.8071320652961731, + "learning_rate": 1.9934373662040318e-05, + "loss": 0.8015, + "step": 70400 + }, + { + "epoch": 1.130194706175059, + "grad_norm": 0.7961511611938477, + "learning_rate": 1.9928200131399608e-05, + "loss": 0.6528, + "step": 70410 + }, + { + "epoch": 1.1303552223952231, + "grad_norm": 0.8728100657463074, + "learning_rate": 1.99220269232956e-05, + "loss": 0.7378, + "step": 70420 + }, + { + "epoch": 1.130515738615387, + "grad_norm": 1.237910509109497, + "learning_rate": 1.9915854038120875e-05, + "loss": 0.7059, + "step": 70430 + }, + { + "epoch": 1.1306762548355511, + "grad_norm": 1.3388936519622803, + "learning_rate": 1.9909681476268007e-05, + "loss": 0.7344, + "step": 70440 + }, + { + "epoch": 1.1308367710557152, + "grad_norm": 1.688174843788147, + "learning_rate": 1.9903509238129505e-05, + "loss": 0.8495, + "step": 70450 + }, + { + "epoch": 1.1309972872758793, + "grad_norm": 1.080735683441162, + "learning_rate": 1.98973373240979e-05, + "loss": 0.6224, + "step": 70460 + }, + { + "epoch": 1.1311578034960432, + "grad_norm": 0.9347265958786011, + "learning_rate": 1.9891165734565696e-05, + "loss": 0.7244, + "step": 70470 + }, + { + "epoch": 1.1313183197162073, + "grad_norm": 0.6701546907424927, + "learning_rate": 1.9884994469925365e-05, + "loss": 0.5454, + "step": 70480 + }, + { + "epoch": 1.1314788359363714, + "grad_norm": 0.6894344687461853, + "learning_rate": 1.987882353056936e-05, + "loss": 0.7004, + "step": 70490 + }, + { + "epoch": 1.1316393521565353, + "grad_norm": 0.8248112201690674, + "learning_rate": 1.987265291689013e-05, + "loss": 0.6499, + "step": 70500 + }, + { + "epoch": 1.1317998683766994, + "grad_norm": 0.850373387336731, + "learning_rate": 1.986648262928007e-05, + "loss": 0.6748, + "step": 70510 + }, + { + "epoch": 1.1319603845968635, + "grad_norm": 0.6206574440002441, + "learning_rate": 1.986031266813159e-05, + "loss": 0.7143, + "step": 70520 + }, + { + "epoch": 1.1321209008170277, + "grad_norm": 0.8973442316055298, + "learning_rate": 1.9854143033837057e-05, + "loss": 0.8156, + "step": 70530 + }, + { + "epoch": 1.1322814170371915, + "grad_norm": 0.9699316024780273, + "learning_rate": 1.984797372678882e-05, + "loss": 0.672, + "step": 70540 + }, + { + "epoch": 1.1324419332573556, + "grad_norm": 1.3928933143615723, + "learning_rate": 1.9841804747379217e-05, + "loss": 0.7456, + "step": 70550 + }, + { + "epoch": 1.1326024494775198, + "grad_norm": 0.7020546197891235, + "learning_rate": 1.983563609600056e-05, + "loss": 0.8301, + "step": 70560 + }, + { + "epoch": 1.1327629656976836, + "grad_norm": 1.3384013175964355, + "learning_rate": 1.982946777304513e-05, + "loss": 0.8311, + "step": 70570 + }, + { + "epoch": 1.1329234819178478, + "grad_norm": 0.9081563949584961, + "learning_rate": 1.9823299778905204e-05, + "loss": 0.7161, + "step": 70580 + }, + { + "epoch": 1.1330839981380119, + "grad_norm": 0.8770254850387573, + "learning_rate": 1.9817132113973028e-05, + "loss": 0.6956, + "step": 70590 + }, + { + "epoch": 1.133244514358176, + "grad_norm": 1.1546571254730225, + "learning_rate": 1.981096477864083e-05, + "loss": 0.6435, + "step": 70600 + }, + { + "epoch": 1.1334050305783399, + "grad_norm": 1.0428659915924072, + "learning_rate": 1.980479777330081e-05, + "loss": 0.6021, + "step": 70610 + }, + { + "epoch": 1.133565546798504, + "grad_norm": 2.399736166000366, + "learning_rate": 1.9798631098345176e-05, + "loss": 0.6888, + "step": 70620 + }, + { + "epoch": 1.133726063018668, + "grad_norm": 0.8813310861587524, + "learning_rate": 1.979246475416606e-05, + "loss": 0.7386, + "step": 70630 + }, + { + "epoch": 1.133886579238832, + "grad_norm": 0.8504694700241089, + "learning_rate": 1.978629874115562e-05, + "loss": 0.6653, + "step": 70640 + }, + { + "epoch": 1.134047095458996, + "grad_norm": 1.0603289604187012, + "learning_rate": 1.978013305970598e-05, + "loss": 0.7588, + "step": 70650 + }, + { + "epoch": 1.1342076116791602, + "grad_norm": 0.9304746985435486, + "learning_rate": 1.9773967710209242e-05, + "loss": 0.8224, + "step": 70660 + }, + { + "epoch": 1.1343681278993243, + "grad_norm": 0.6838676333427429, + "learning_rate": 1.976780269305748e-05, + "loss": 0.7124, + "step": 70670 + }, + { + "epoch": 1.1345286441194884, + "grad_norm": 1.3431798219680786, + "learning_rate": 1.9761638008642773e-05, + "loss": 0.7541, + "step": 70680 + }, + { + "epoch": 1.1346891603396523, + "grad_norm": 0.9994063973426819, + "learning_rate": 1.975547365735713e-05, + "loss": 0.7105, + "step": 70690 + }, + { + "epoch": 1.1348496765598164, + "grad_norm": 1.2165395021438599, + "learning_rate": 1.9749309639592578e-05, + "loss": 0.7915, + "step": 70700 + }, + { + "epoch": 1.1350101927799805, + "grad_norm": 0.8720370531082153, + "learning_rate": 1.9743145955741116e-05, + "loss": 0.6582, + "step": 70710 + }, + { + "epoch": 1.1351707090001444, + "grad_norm": 0.9259694814682007, + "learning_rate": 1.9736982606194716e-05, + "loss": 0.7192, + "step": 70720 + }, + { + "epoch": 1.1353312252203085, + "grad_norm": 0.6086847186088562, + "learning_rate": 1.973081959134533e-05, + "loss": 0.7744, + "step": 70730 + }, + { + "epoch": 1.1354917414404726, + "grad_norm": 1.4512066841125488, + "learning_rate": 1.9724656911584892e-05, + "loss": 0.677, + "step": 70740 + }, + { + "epoch": 1.1356522576606367, + "grad_norm": 1.558113932609558, + "learning_rate": 1.9718494567305322e-05, + "loss": 0.6174, + "step": 70750 + }, + { + "epoch": 1.1358127738808006, + "grad_norm": 1.002219557762146, + "learning_rate": 1.971233255889849e-05, + "loss": 0.704, + "step": 70760 + }, + { + "epoch": 1.1359732901009647, + "grad_norm": 0.9040502905845642, + "learning_rate": 1.970617088675627e-05, + "loss": 0.8215, + "step": 70770 + }, + { + "epoch": 1.1361338063211288, + "grad_norm": 0.8541040420532227, + "learning_rate": 1.970000955127051e-05, + "loss": 0.6086, + "step": 70780 + }, + { + "epoch": 1.1362943225412927, + "grad_norm": 1.2319010496139526, + "learning_rate": 1.9693848552833037e-05, + "loss": 0.7419, + "step": 70790 + }, + { + "epoch": 1.1364548387614568, + "grad_norm": 1.1142972707748413, + "learning_rate": 1.968768789183565e-05, + "loss": 0.6384, + "step": 70800 + }, + { + "epoch": 1.136615354981621, + "grad_norm": 1.0654093027114868, + "learning_rate": 1.9681527568670136e-05, + "loss": 0.5834, + "step": 70810 + }, + { + "epoch": 1.136775871201785, + "grad_norm": 1.061918020248413, + "learning_rate": 1.967536758372825e-05, + "loss": 0.7311, + "step": 70820 + }, + { + "epoch": 1.136936387421949, + "grad_norm": 1.106339931488037, + "learning_rate": 1.966920793740173e-05, + "loss": 0.7316, + "step": 70830 + }, + { + "epoch": 1.137096903642113, + "grad_norm": 0.8493595123291016, + "learning_rate": 1.9663048630082306e-05, + "loss": 0.8127, + "step": 70840 + }, + { + "epoch": 1.1372574198622771, + "grad_norm": 0.7471984624862671, + "learning_rate": 1.9656889662161655e-05, + "loss": 0.8422, + "step": 70850 + }, + { + "epoch": 1.137417936082441, + "grad_norm": 0.9937965869903564, + "learning_rate": 1.9650731034031466e-05, + "loss": 0.6254, + "step": 70860 + }, + { + "epoch": 1.1375784523026051, + "grad_norm": 1.0726306438446045, + "learning_rate": 1.964457274608339e-05, + "loss": 0.8266, + "step": 70870 + }, + { + "epoch": 1.1377389685227692, + "grad_norm": 0.9452500343322754, + "learning_rate": 1.9638414798709047e-05, + "loss": 0.6819, + "step": 70880 + }, + { + "epoch": 1.1378994847429333, + "grad_norm": 0.966280460357666, + "learning_rate": 1.963225719230005e-05, + "loss": 0.7661, + "step": 70890 + }, + { + "epoch": 1.1380600009630972, + "grad_norm": 0.702876627445221, + "learning_rate": 1.9626099927247996e-05, + "loss": 0.7367, + "step": 70900 + }, + { + "epoch": 1.1382205171832613, + "grad_norm": 0.7832751274108887, + "learning_rate": 1.9619943003944445e-05, + "loss": 0.6749, + "step": 70910 + }, + { + "epoch": 1.1383810334034254, + "grad_norm": 0.9030916094779968, + "learning_rate": 1.9613786422780938e-05, + "loss": 0.695, + "step": 70920 + }, + { + "epoch": 1.1385415496235896, + "grad_norm": 1.00206458568573, + "learning_rate": 1.9607630184149012e-05, + "loss": 0.6842, + "step": 70930 + }, + { + "epoch": 1.1387020658437534, + "grad_norm": 1.0285418033599854, + "learning_rate": 1.9601474288440147e-05, + "loss": 0.7666, + "step": 70940 + }, + { + "epoch": 1.1388625820639176, + "grad_norm": 1.1267606019973755, + "learning_rate": 1.9595318736045832e-05, + "loss": 0.6907, + "step": 70950 + }, + { + "epoch": 1.1390230982840817, + "grad_norm": 0.8670206665992737, + "learning_rate": 1.9589163527357516e-05, + "loss": 0.8171, + "step": 70960 + }, + { + "epoch": 1.1391836145042458, + "grad_norm": 0.8324903845787048, + "learning_rate": 1.958300866276665e-05, + "loss": 0.7553, + "step": 70970 + }, + { + "epoch": 1.1393441307244097, + "grad_norm": 1.1977921724319458, + "learning_rate": 1.9576854142664634e-05, + "loss": 0.7108, + "step": 70980 + }, + { + "epoch": 1.1395046469445738, + "grad_norm": 0.9818487763404846, + "learning_rate": 1.9570699967442868e-05, + "loss": 0.6507, + "step": 70990 + }, + { + "epoch": 1.1396651631647379, + "grad_norm": 0.8643619418144226, + "learning_rate": 1.9564546137492724e-05, + "loss": 0.7907, + "step": 71000 + }, + { + "epoch": 1.1398256793849018, + "grad_norm": 0.7563008666038513, + "learning_rate": 1.955839265320553e-05, + "loss": 0.8211, + "step": 71010 + }, + { + "epoch": 1.1399861956050659, + "grad_norm": 1.1062687635421753, + "learning_rate": 1.9552239514972632e-05, + "loss": 0.7631, + "step": 71020 + }, + { + "epoch": 1.14014671182523, + "grad_norm": 1.1599358320236206, + "learning_rate": 1.954608672318532e-05, + "loss": 0.7387, + "step": 71030 + }, + { + "epoch": 1.140307228045394, + "grad_norm": 0.9910722374916077, + "learning_rate": 1.9539934278234882e-05, + "loss": 0.7083, + "step": 71040 + }, + { + "epoch": 1.140467744265558, + "grad_norm": 0.649465024471283, + "learning_rate": 1.9533782180512577e-05, + "loss": 0.6501, + "step": 71050 + }, + { + "epoch": 1.140628260485722, + "grad_norm": 0.8590642809867859, + "learning_rate": 1.9527630430409656e-05, + "loss": 0.7666, + "step": 71060 + }, + { + "epoch": 1.1407887767058862, + "grad_norm": 0.576342761516571, + "learning_rate": 1.9521479028317303e-05, + "loss": 0.6719, + "step": 71070 + }, + { + "epoch": 1.14094929292605, + "grad_norm": 0.6611829400062561, + "learning_rate": 1.951532797462673e-05, + "loss": 0.6752, + "step": 71080 + }, + { + "epoch": 1.1411098091462142, + "grad_norm": 1.115727186203003, + "learning_rate": 1.950917726972911e-05, + "loss": 0.7601, + "step": 71090 + }, + { + "epoch": 1.1412703253663783, + "grad_norm": 0.7542492747306824, + "learning_rate": 1.950302691401558e-05, + "loss": 0.7664, + "step": 71100 + }, + { + "epoch": 1.1414308415865424, + "grad_norm": 0.8406704664230347, + "learning_rate": 1.9496876907877283e-05, + "loss": 0.5797, + "step": 71110 + }, + { + "epoch": 1.1415913578067063, + "grad_norm": 1.0737298727035522, + "learning_rate": 1.9490727251705314e-05, + "loss": 0.7523, + "step": 71120 + }, + { + "epoch": 1.1417518740268704, + "grad_norm": 0.6645668148994446, + "learning_rate": 1.9484577945890752e-05, + "loss": 0.7792, + "step": 71130 + }, + { + "epoch": 1.1419123902470345, + "grad_norm": 1.0429867506027222, + "learning_rate": 1.9478428990824664e-05, + "loss": 0.8069, + "step": 71140 + }, + { + "epoch": 1.1420729064671984, + "grad_norm": 1.1837589740753174, + "learning_rate": 1.947228038689808e-05, + "loss": 0.751, + "step": 71150 + }, + { + "epoch": 1.1422334226873625, + "grad_norm": 0.8777702450752258, + "learning_rate": 1.9466132134502023e-05, + "loss": 0.7247, + "step": 71160 + }, + { + "epoch": 1.1423939389075266, + "grad_norm": 0.87794029712677, + "learning_rate": 1.945998423402748e-05, + "loss": 0.7558, + "step": 71170 + }, + { + "epoch": 1.1425544551276907, + "grad_norm": 1.215549349784851, + "learning_rate": 1.945383668586543e-05, + "loss": 0.6747, + "step": 71180 + }, + { + "epoch": 1.1427149713478548, + "grad_norm": 0.8850443959236145, + "learning_rate": 1.9447689490406804e-05, + "loss": 0.6344, + "step": 71190 + }, + { + "epoch": 1.1428754875680187, + "grad_norm": 0.9976768493652344, + "learning_rate": 1.9441542648042538e-05, + "loss": 0.6279, + "step": 71200 + }, + { + "epoch": 1.1430360037881828, + "grad_norm": 0.906110405921936, + "learning_rate": 1.9435396159163538e-05, + "loss": 0.6918, + "step": 71210 + }, + { + "epoch": 1.143196520008347, + "grad_norm": 0.779587984085083, + "learning_rate": 1.9429250024160677e-05, + "loss": 0.664, + "step": 71220 + }, + { + "epoch": 1.1433570362285108, + "grad_norm": 0.8997251987457275, + "learning_rate": 1.9423104243424822e-05, + "loss": 0.7672, + "step": 71230 + }, + { + "epoch": 1.143517552448675, + "grad_norm": 0.7450826168060303, + "learning_rate": 1.94169588173468e-05, + "loss": 0.6525, + "step": 71240 + }, + { + "epoch": 1.143678068668839, + "grad_norm": 1.047090768814087, + "learning_rate": 1.9410813746317443e-05, + "loss": 0.5831, + "step": 71250 + }, + { + "epoch": 1.1438385848890031, + "grad_norm": 0.6676836609840393, + "learning_rate": 1.9404669030727512e-05, + "loss": 0.7144, + "step": 71260 + }, + { + "epoch": 1.143999101109167, + "grad_norm": 1.7187491655349731, + "learning_rate": 1.939852467096779e-05, + "loss": 0.7436, + "step": 71270 + }, + { + "epoch": 1.1441596173293311, + "grad_norm": 0.6760104298591614, + "learning_rate": 1.9392380667429026e-05, + "loss": 0.6982, + "step": 71280 + }, + { + "epoch": 1.1443201335494952, + "grad_norm": 1.1960405111312866, + "learning_rate": 1.9386237020501933e-05, + "loss": 0.758, + "step": 71290 + }, + { + "epoch": 1.1444806497696591, + "grad_norm": 1.0171759128570557, + "learning_rate": 1.938009373057722e-05, + "loss": 0.5949, + "step": 71300 + }, + { + "epoch": 1.1446411659898232, + "grad_norm": 0.9004735350608826, + "learning_rate": 1.9373950798045567e-05, + "loss": 0.7245, + "step": 71310 + }, + { + "epoch": 1.1448016822099873, + "grad_norm": 0.8586950302124023, + "learning_rate": 1.9367808223297614e-05, + "loss": 0.6878, + "step": 71320 + }, + { + "epoch": 1.1449621984301515, + "grad_norm": 1.2301682233810425, + "learning_rate": 1.9361666006723997e-05, + "loss": 0.816, + "step": 71330 + }, + { + "epoch": 1.1451227146503153, + "grad_norm": 1.139563798904419, + "learning_rate": 1.9355524148715333e-05, + "loss": 0.7835, + "step": 71340 + }, + { + "epoch": 1.1452832308704795, + "grad_norm": 1.1925175189971924, + "learning_rate": 1.93493826496622e-05, + "loss": 0.743, + "step": 71350 + }, + { + "epoch": 1.1454437470906436, + "grad_norm": 1.0963187217712402, + "learning_rate": 1.9343241509955166e-05, + "loss": 0.7003, + "step": 71360 + }, + { + "epoch": 1.1456042633108074, + "grad_norm": 1.7554556131362915, + "learning_rate": 1.933710072998478e-05, + "loss": 0.6748, + "step": 71370 + }, + { + "epoch": 1.1457647795309716, + "grad_norm": 1.1086435317993164, + "learning_rate": 1.933096031014154e-05, + "loss": 0.7838, + "step": 71380 + }, + { + "epoch": 1.1459252957511357, + "grad_norm": 0.8300561308860779, + "learning_rate": 1.932482025081595e-05, + "loss": 0.6985, + "step": 71390 + }, + { + "epoch": 1.1460858119712998, + "grad_norm": 0.7752721309661865, + "learning_rate": 1.9318680552398483e-05, + "loss": 0.6459, + "step": 71400 + }, + { + "epoch": 1.1462463281914637, + "grad_norm": 1.1652508974075317, + "learning_rate": 1.9312541215279584e-05, + "loss": 0.7309, + "step": 71410 + }, + { + "epoch": 1.1464068444116278, + "grad_norm": 1.0570368766784668, + "learning_rate": 1.9306402239849682e-05, + "loss": 0.8218, + "step": 71420 + }, + { + "epoch": 1.1465673606317919, + "grad_norm": 0.726588249206543, + "learning_rate": 1.930026362649919e-05, + "loss": 0.7012, + "step": 71430 + }, + { + "epoch": 1.146727876851956, + "grad_norm": 0.7533114552497864, + "learning_rate": 1.9294125375618465e-05, + "loss": 0.7204, + "step": 71440 + }, + { + "epoch": 1.1468883930721199, + "grad_norm": 0.6213864088058472, + "learning_rate": 1.928798748759788e-05, + "loss": 0.6938, + "step": 71450 + }, + { + "epoch": 1.147048909292284, + "grad_norm": 0.9942591190338135, + "learning_rate": 1.9281849962827765e-05, + "loss": 0.7421, + "step": 71460 + }, + { + "epoch": 1.147209425512448, + "grad_norm": 0.7456579804420471, + "learning_rate": 1.9275712801698438e-05, + "loss": 0.7121, + "step": 71470 + }, + { + "epoch": 1.1473699417326122, + "grad_norm": 2.875626802444458, + "learning_rate": 1.926957600460017e-05, + "loss": 0.8534, + "step": 71480 + }, + { + "epoch": 1.147530457952776, + "grad_norm": 0.9574546813964844, + "learning_rate": 1.9263439571923235e-05, + "loss": 0.7695, + "step": 71490 + }, + { + "epoch": 1.1476909741729402, + "grad_norm": 1.300195336341858, + "learning_rate": 1.9257303504057876e-05, + "loss": 0.7704, + "step": 71500 + }, + { + "epoch": 1.1478514903931043, + "grad_norm": 1.0273488759994507, + "learning_rate": 1.92511678013943e-05, + "loss": 0.636, + "step": 71510 + }, + { + "epoch": 1.1480120066132682, + "grad_norm": 1.0343494415283203, + "learning_rate": 1.9245032464322717e-05, + "loss": 0.6713, + "step": 71520 + }, + { + "epoch": 1.1481725228334323, + "grad_norm": 2.079263687133789, + "learning_rate": 1.9238897493233287e-05, + "loss": 0.7398, + "step": 71530 + }, + { + "epoch": 1.1483330390535964, + "grad_norm": 1.3810460567474365, + "learning_rate": 1.923276288851616e-05, + "loss": 0.6516, + "step": 71540 + }, + { + "epoch": 1.1484935552737605, + "grad_norm": 1.023990273475647, + "learning_rate": 1.922662865056147e-05, + "loss": 0.7288, + "step": 71550 + }, + { + "epoch": 1.1486540714939244, + "grad_norm": 1.1363574266433716, + "learning_rate": 1.9220494779759314e-05, + "loss": 0.7516, + "step": 71560 + }, + { + "epoch": 1.1488145877140885, + "grad_norm": 0.7809047102928162, + "learning_rate": 1.9214361276499764e-05, + "loss": 0.7258, + "step": 71570 + }, + { + "epoch": 1.1489751039342526, + "grad_norm": 0.7896170020103455, + "learning_rate": 1.9208228141172874e-05, + "loss": 0.5634, + "step": 71580 + }, + { + "epoch": 1.1491356201544165, + "grad_norm": 0.9319188594818115, + "learning_rate": 1.9202095374168683e-05, + "loss": 0.7691, + "step": 71590 + }, + { + "epoch": 1.1492961363745806, + "grad_norm": 1.0369364023208618, + "learning_rate": 1.9195962975877192e-05, + "loss": 0.7351, + "step": 71600 + }, + { + "epoch": 1.1494566525947447, + "grad_norm": 1.3604875802993774, + "learning_rate": 1.9189830946688395e-05, + "loss": 0.7084, + "step": 71610 + }, + { + "epoch": 1.1496171688149088, + "grad_norm": 1.266930103302002, + "learning_rate": 1.9183699286992253e-05, + "loss": 0.6738, + "step": 71620 + }, + { + "epoch": 1.1497776850350727, + "grad_norm": 0.9695551991462708, + "learning_rate": 1.9177567997178692e-05, + "loss": 0.6612, + "step": 71630 + }, + { + "epoch": 1.1499382012552368, + "grad_norm": 0.9691539406776428, + "learning_rate": 1.917143707763763e-05, + "loss": 0.7389, + "step": 71640 + }, + { + "epoch": 1.150098717475401, + "grad_norm": 0.9642812013626099, + "learning_rate": 1.916530652875896e-05, + "loss": 0.7212, + "step": 71650 + }, + { + "epoch": 1.1502592336955648, + "grad_norm": 0.7747491002082825, + "learning_rate": 1.915917635093255e-05, + "loss": 0.7975, + "step": 71660 + }, + { + "epoch": 1.150419749915729, + "grad_norm": 0.8886061906814575, + "learning_rate": 1.9153046544548236e-05, + "loss": 0.6137, + "step": 71670 + }, + { + "epoch": 1.150580266135893, + "grad_norm": 0.8798766732215881, + "learning_rate": 1.914691710999586e-05, + "loss": 0.7935, + "step": 71680 + }, + { + "epoch": 1.1507407823560571, + "grad_norm": 0.9865604043006897, + "learning_rate": 1.914078804766519e-05, + "loss": 0.6688, + "step": 71690 + }, + { + "epoch": 1.150901298576221, + "grad_norm": 0.8811134696006775, + "learning_rate": 1.913465935794601e-05, + "loss": 0.7245, + "step": 71700 + }, + { + "epoch": 1.1510618147963851, + "grad_norm": 1.6649597883224487, + "learning_rate": 1.9128531041228068e-05, + "loss": 0.683, + "step": 71710 + }, + { + "epoch": 1.1512223310165492, + "grad_norm": 1.1390745639801025, + "learning_rate": 1.912240309790109e-05, + "loss": 0.7248, + "step": 71720 + }, + { + "epoch": 1.1513828472367134, + "grad_norm": 0.9565975666046143, + "learning_rate": 1.9116275528354775e-05, + "loss": 0.6735, + "step": 71730 + }, + { + "epoch": 1.1515433634568772, + "grad_norm": 0.7092466354370117, + "learning_rate": 1.9110148332978812e-05, + "loss": 0.6447, + "step": 71740 + }, + { + "epoch": 1.1517038796770414, + "grad_norm": 1.2143418788909912, + "learning_rate": 1.910402151216284e-05, + "loss": 0.6575, + "step": 71750 + }, + { + "epoch": 1.1518643958972055, + "grad_norm": 1.111075520515442, + "learning_rate": 1.9097895066296488e-05, + "loss": 0.6723, + "step": 71760 + }, + { + "epoch": 1.1520249121173696, + "grad_norm": 0.6475362181663513, + "learning_rate": 1.9091768995769373e-05, + "loss": 0.7148, + "step": 71770 + }, + { + "epoch": 1.1521854283375335, + "grad_norm": 1.0530136823654175, + "learning_rate": 1.9085643300971077e-05, + "loss": 0.6597, + "step": 71780 + }, + { + "epoch": 1.1523459445576976, + "grad_norm": 1.0637295246124268, + "learning_rate": 1.9079517982291145e-05, + "loss": 0.7024, + "step": 71790 + }, + { + "epoch": 1.1525064607778617, + "grad_norm": 0.965395450592041, + "learning_rate": 1.9073393040119123e-05, + "loss": 0.6194, + "step": 71800 + }, + { + "epoch": 1.1526669769980256, + "grad_norm": 1.1840509176254272, + "learning_rate": 1.906726847484452e-05, + "loss": 0.6867, + "step": 71810 + }, + { + "epoch": 1.1528274932181897, + "grad_norm": 1.1022340059280396, + "learning_rate": 1.9061144286856818e-05, + "loss": 0.7248, + "step": 71820 + }, + { + "epoch": 1.1529880094383538, + "grad_norm": 0.8653661012649536, + "learning_rate": 1.905502047654548e-05, + "loss": 0.6579, + "step": 71830 + }, + { + "epoch": 1.1531485256585179, + "grad_norm": 0.7986868023872375, + "learning_rate": 1.904889704429995e-05, + "loss": 0.6866, + "step": 71840 + }, + { + "epoch": 1.1533090418786818, + "grad_norm": 1.1121407747268677, + "learning_rate": 1.9042773990509635e-05, + "loss": 0.6006, + "step": 71850 + }, + { + "epoch": 1.1534695580988459, + "grad_norm": 1.16482675075531, + "learning_rate": 1.903665131556393e-05, + "loss": 0.6442, + "step": 71860 + }, + { + "epoch": 1.15363007431901, + "grad_norm": 1.1554648876190186, + "learning_rate": 1.903052901985221e-05, + "loss": 0.6486, + "step": 71870 + }, + { + "epoch": 1.1537905905391739, + "grad_norm": 0.7552081346511841, + "learning_rate": 1.90244071037638e-05, + "loss": 0.7621, + "step": 71880 + }, + { + "epoch": 1.153951106759338, + "grad_norm": 0.7338752746582031, + "learning_rate": 1.9018285567688016e-05, + "loss": 0.6855, + "step": 71890 + }, + { + "epoch": 1.154111622979502, + "grad_norm": 0.808019757270813, + "learning_rate": 1.9012164412014165e-05, + "loss": 0.7627, + "step": 71900 + }, + { + "epoch": 1.1542721391996662, + "grad_norm": 0.7085984945297241, + "learning_rate": 1.9006043637131517e-05, + "loss": 0.8172, + "step": 71910 + }, + { + "epoch": 1.15443265541983, + "grad_norm": 1.251456379890442, + "learning_rate": 1.8999923243429307e-05, + "loss": 0.6864, + "step": 71920 + }, + { + "epoch": 1.1545931716399942, + "grad_norm": 0.8291187882423401, + "learning_rate": 1.899380323129678e-05, + "loss": 0.5584, + "step": 71930 + }, + { + "epoch": 1.1547536878601583, + "grad_norm": 1.1680234670639038, + "learning_rate": 1.8987683601123096e-05, + "loss": 0.7068, + "step": 71940 + }, + { + "epoch": 1.1549142040803222, + "grad_norm": 0.6182203888893127, + "learning_rate": 1.898156435329745e-05, + "loss": 0.8049, + "step": 71950 + }, + { + "epoch": 1.1550747203004863, + "grad_norm": 0.8402630090713501, + "learning_rate": 1.8975445488208985e-05, + "loss": 0.7046, + "step": 71960 + }, + { + "epoch": 1.1552352365206504, + "grad_norm": 0.8137336373329163, + "learning_rate": 1.896932700624683e-05, + "loss": 0.6641, + "step": 71970 + }, + { + "epoch": 1.1553957527408145, + "grad_norm": 0.9721313118934631, + "learning_rate": 1.8963208907800077e-05, + "loss": 0.6193, + "step": 71980 + }, + { + "epoch": 1.1555562689609786, + "grad_norm": 0.9628494381904602, + "learning_rate": 1.8957091193257815e-05, + "loss": 0.8076, + "step": 71990 + }, + { + "epoch": 1.1557167851811425, + "grad_norm": 0.8052477836608887, + "learning_rate": 1.8950973863009074e-05, + "loss": 0.7618, + "step": 72000 + }, + { + "epoch": 1.1557167851811425, + "eval_loss": 0.7751402854919434, + "eval_runtime": 1833.3514, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 1.789, + "step": 72000 + }, + { + "epoch": 1.1558773014013066, + "grad_norm": 0.6631579995155334, + "learning_rate": 1.8944856917442895e-05, + "loss": 0.6497, + "step": 72010 + }, + { + "epoch": 1.1560378176214707, + "grad_norm": 0.987432062625885, + "learning_rate": 1.8938740356948274e-05, + "loss": 0.6636, + "step": 72020 + }, + { + "epoch": 1.1561983338416346, + "grad_norm": 0.57856684923172, + "learning_rate": 1.893262418191419e-05, + "loss": 0.7085, + "step": 72030 + }, + { + "epoch": 1.1563588500617987, + "grad_norm": 0.8651732206344604, + "learning_rate": 1.89265083927296e-05, + "loss": 0.696, + "step": 72040 + }, + { + "epoch": 1.1565193662819628, + "grad_norm": 1.1658809185028076, + "learning_rate": 1.8920392989783424e-05, + "loss": 0.6557, + "step": 72050 + }, + { + "epoch": 1.156679882502127, + "grad_norm": 0.8316484689712524, + "learning_rate": 1.8914277973464575e-05, + "loss": 0.726, + "step": 72060 + }, + { + "epoch": 1.1568403987222908, + "grad_norm": 0.8382241725921631, + "learning_rate": 1.8908163344161923e-05, + "loss": 0.7858, + "step": 72070 + }, + { + "epoch": 1.157000914942455, + "grad_norm": 0.8847562670707703, + "learning_rate": 1.8902049102264328e-05, + "loss": 0.6674, + "step": 72080 + }, + { + "epoch": 1.157161431162619, + "grad_norm": 0.6590490341186523, + "learning_rate": 1.889593524816062e-05, + "loss": 0.7047, + "step": 72090 + }, + { + "epoch": 1.157321947382783, + "grad_norm": 0.58794766664505, + "learning_rate": 1.8889821782239603e-05, + "loss": 0.5673, + "step": 72100 + }, + { + "epoch": 1.157482463602947, + "grad_norm": 1.1875535249710083, + "learning_rate": 1.8883708704890057e-05, + "loss": 0.6931, + "step": 72110 + }, + { + "epoch": 1.1576429798231112, + "grad_norm": 1.3847163915634155, + "learning_rate": 1.8877596016500743e-05, + "loss": 0.6943, + "step": 72120 + }, + { + "epoch": 1.1578034960432753, + "grad_norm": 1.0220947265625, + "learning_rate": 1.887148371746038e-05, + "loss": 0.6192, + "step": 72130 + }, + { + "epoch": 1.1579640122634391, + "grad_norm": 0.6907919645309448, + "learning_rate": 1.8865371808157684e-05, + "loss": 0.6112, + "step": 72140 + }, + { + "epoch": 1.1581245284836033, + "grad_norm": 1.4200078248977661, + "learning_rate": 1.8859260288981334e-05, + "loss": 0.7706, + "step": 72150 + }, + { + "epoch": 1.1582850447037674, + "grad_norm": 0.8221948742866516, + "learning_rate": 1.8853149160319987e-05, + "loss": 0.6972, + "step": 72160 + }, + { + "epoch": 1.1584455609239313, + "grad_norm": 0.7863196134567261, + "learning_rate": 1.8847038422562276e-05, + "loss": 0.751, + "step": 72170 + }, + { + "epoch": 1.1586060771440954, + "grad_norm": 0.9310802221298218, + "learning_rate": 1.8840928076096818e-05, + "loss": 0.69, + "step": 72180 + }, + { + "epoch": 1.1587665933642595, + "grad_norm": 1.4520796537399292, + "learning_rate": 1.883481812131217e-05, + "loss": 0.7156, + "step": 72190 + }, + { + "epoch": 1.1589271095844236, + "grad_norm": 1.11784029006958, + "learning_rate": 1.8828708558596904e-05, + "loss": 0.7987, + "step": 72200 + }, + { + "epoch": 1.1590876258045875, + "grad_norm": 1.3468621969223022, + "learning_rate": 1.8822599388339557e-05, + "loss": 0.7161, + "step": 72210 + }, + { + "epoch": 1.1592481420247516, + "grad_norm": 0.9562967419624329, + "learning_rate": 1.8816490610928627e-05, + "loss": 0.8148, + "step": 72220 + }, + { + "epoch": 1.1594086582449157, + "grad_norm": 1.1938873529434204, + "learning_rate": 1.88103822267526e-05, + "loss": 0.761, + "step": 72230 + }, + { + "epoch": 1.1595691744650798, + "grad_norm": 1.3397197723388672, + "learning_rate": 1.880427423619995e-05, + "loss": 0.7482, + "step": 72240 + }, + { + "epoch": 1.1597296906852437, + "grad_norm": 0.8901255130767822, + "learning_rate": 1.8798166639659078e-05, + "loss": 0.7077, + "step": 72250 + }, + { + "epoch": 1.1598902069054078, + "grad_norm": 0.7943786978721619, + "learning_rate": 1.8792059437518406e-05, + "loss": 0.5765, + "step": 72260 + }, + { + "epoch": 1.160050723125572, + "grad_norm": 1.218157410621643, + "learning_rate": 1.878595263016632e-05, + "loss": 0.7189, + "step": 72270 + }, + { + "epoch": 1.160211239345736, + "grad_norm": 1.343795895576477, + "learning_rate": 1.877984621799117e-05, + "loss": 0.6825, + "step": 72280 + }, + { + "epoch": 1.1603717555658999, + "grad_norm": 1.3540464639663696, + "learning_rate": 1.8773740201381296e-05, + "loss": 0.6945, + "step": 72290 + }, + { + "epoch": 1.160532271786064, + "grad_norm": 0.8709456920623779, + "learning_rate": 1.876763458072501e-05, + "loss": 0.6887, + "step": 72300 + }, + { + "epoch": 1.160692788006228, + "grad_norm": 0.5058837532997131, + "learning_rate": 1.8761529356410572e-05, + "loss": 0.7175, + "step": 72310 + }, + { + "epoch": 1.160853304226392, + "grad_norm": 0.8650254011154175, + "learning_rate": 1.8755424528826253e-05, + "loss": 0.6473, + "step": 72320 + }, + { + "epoch": 1.161013820446556, + "grad_norm": 1.1646678447723389, + "learning_rate": 1.8749320098360283e-05, + "loss": 0.6658, + "step": 72330 + }, + { + "epoch": 1.1611743366667202, + "grad_norm": 1.0136480331420898, + "learning_rate": 1.8743216065400865e-05, + "loss": 0.834, + "step": 72340 + }, + { + "epoch": 1.1613348528868843, + "grad_norm": 0.9330699443817139, + "learning_rate": 1.8737112430336183e-05, + "loss": 0.7235, + "step": 72350 + }, + { + "epoch": 1.1614953691070482, + "grad_norm": 1.0948784351348877, + "learning_rate": 1.873100919355439e-05, + "loss": 0.697, + "step": 72360 + }, + { + "epoch": 1.1616558853272123, + "grad_norm": 1.113861083984375, + "learning_rate": 1.872490635544362e-05, + "loss": 0.7337, + "step": 72370 + }, + { + "epoch": 1.1618164015473764, + "grad_norm": 0.8061099648475647, + "learning_rate": 1.8718803916391975e-05, + "loss": 0.7451, + "step": 72380 + }, + { + "epoch": 1.1619769177675403, + "grad_norm": 1.2426093816757202, + "learning_rate": 1.871270187678753e-05, + "loss": 0.72, + "step": 72390 + }, + { + "epoch": 1.1621374339877044, + "grad_norm": 1.0154246091842651, + "learning_rate": 1.8706600237018343e-05, + "loss": 0.671, + "step": 72400 + }, + { + "epoch": 1.1622979502078685, + "grad_norm": 1.1205822229385376, + "learning_rate": 1.870049899747245e-05, + "loss": 0.5733, + "step": 72410 + }, + { + "epoch": 1.1624584664280326, + "grad_norm": 0.758883535861969, + "learning_rate": 1.869439815853784e-05, + "loss": 0.7955, + "step": 72420 + }, + { + "epoch": 1.1626189826481965, + "grad_norm": 1.1362919807434082, + "learning_rate": 1.86882977206025e-05, + "loss": 0.7079, + "step": 72430 + }, + { + "epoch": 1.1627794988683606, + "grad_norm": 0.7741249203681946, + "learning_rate": 1.8682197684054376e-05, + "loss": 0.662, + "step": 72440 + }, + { + "epoch": 1.1629400150885247, + "grad_norm": 0.7099503874778748, + "learning_rate": 1.86760980492814e-05, + "loss": 0.7202, + "step": 72450 + }, + { + "epoch": 1.1631005313086886, + "grad_norm": 0.9904049634933472, + "learning_rate": 1.8669998816671465e-05, + "loss": 0.7027, + "step": 72460 + }, + { + "epoch": 1.1632610475288527, + "grad_norm": 0.8287633657455444, + "learning_rate": 1.8663899986612454e-05, + "loss": 0.8512, + "step": 72470 + }, + { + "epoch": 1.1634215637490168, + "grad_norm": 1.045393943786621, + "learning_rate": 1.8657801559492217e-05, + "loss": 0.652, + "step": 72480 + }, + { + "epoch": 1.163582079969181, + "grad_norm": 0.9356820583343506, + "learning_rate": 1.8651703535698582e-05, + "loss": 0.6299, + "step": 72490 + }, + { + "epoch": 1.163742596189345, + "grad_norm": 0.916735827922821, + "learning_rate": 1.8645605915619336e-05, + "loss": 0.6834, + "step": 72500 + }, + { + "epoch": 1.163903112409509, + "grad_norm": 0.9225800633430481, + "learning_rate": 1.8639508699642254e-05, + "loss": 0.6863, + "step": 72510 + }, + { + "epoch": 1.164063628629673, + "grad_norm": 1.5118008852005005, + "learning_rate": 1.863341188815509e-05, + "loss": 0.6788, + "step": 72520 + }, + { + "epoch": 1.1642241448498372, + "grad_norm": 0.9146981239318848, + "learning_rate": 1.8627315481545557e-05, + "loss": 0.7168, + "step": 72530 + }, + { + "epoch": 1.164384661070001, + "grad_norm": 1.0375807285308838, + "learning_rate": 1.8621219480201358e-05, + "loss": 0.5731, + "step": 72540 + }, + { + "epoch": 1.1645451772901652, + "grad_norm": 0.9670180678367615, + "learning_rate": 1.861512388451017e-05, + "loss": 0.706, + "step": 72550 + }, + { + "epoch": 1.1647056935103293, + "grad_norm": 1.068947196006775, + "learning_rate": 1.860902869485962e-05, + "loss": 0.6588, + "step": 72560 + }, + { + "epoch": 1.1648662097304934, + "grad_norm": 0.6734647154808044, + "learning_rate": 1.8602933911637334e-05, + "loss": 0.7032, + "step": 72570 + }, + { + "epoch": 1.1650267259506573, + "grad_norm": 0.8932271599769592, + "learning_rate": 1.8596839535230898e-05, + "loss": 0.8613, + "step": 72580 + }, + { + "epoch": 1.1651872421708214, + "grad_norm": 0.857671320438385, + "learning_rate": 1.859074556602789e-05, + "loss": 0.7586, + "step": 72590 + }, + { + "epoch": 1.1653477583909855, + "grad_norm": 0.6600189208984375, + "learning_rate": 1.858465200441584e-05, + "loss": 0.6404, + "step": 72600 + }, + { + "epoch": 1.1655082746111494, + "grad_norm": 0.9258981347084045, + "learning_rate": 1.8578558850782275e-05, + "loss": 0.6534, + "step": 72610 + }, + { + "epoch": 1.1656687908313135, + "grad_norm": 0.8577353954315186, + "learning_rate": 1.8572466105514687e-05, + "loss": 0.7374, + "step": 72620 + }, + { + "epoch": 1.1658293070514776, + "grad_norm": 0.792249858379364, + "learning_rate": 1.8566373769000513e-05, + "loss": 0.6903, + "step": 72630 + }, + { + "epoch": 1.1659898232716417, + "grad_norm": 2.5749900341033936, + "learning_rate": 1.856028184162721e-05, + "loss": 0.62, + "step": 72640 + }, + { + "epoch": 1.1661503394918056, + "grad_norm": 1.0027236938476562, + "learning_rate": 1.8554190323782185e-05, + "loss": 0.7767, + "step": 72650 + }, + { + "epoch": 1.1663108557119697, + "grad_norm": 1.0750994682312012, + "learning_rate": 1.854809921585282e-05, + "loss": 0.7965, + "step": 72660 + }, + { + "epoch": 1.1664713719321338, + "grad_norm": 1.0542598962783813, + "learning_rate": 1.8542008518226478e-05, + "loss": 0.6763, + "step": 72670 + }, + { + "epoch": 1.1666318881522977, + "grad_norm": 0.9422388076782227, + "learning_rate": 1.8535918231290494e-05, + "loss": 0.6405, + "step": 72680 + }, + { + "epoch": 1.1667924043724618, + "grad_norm": 0.6950238347053528, + "learning_rate": 1.8529828355432165e-05, + "loss": 0.5929, + "step": 72690 + }, + { + "epoch": 1.166952920592626, + "grad_norm": 1.0578755140304565, + "learning_rate": 1.8523738891038776e-05, + "loss": 0.6557, + "step": 72700 + }, + { + "epoch": 1.16711343681279, + "grad_norm": 0.6764551401138306, + "learning_rate": 1.851764983849758e-05, + "loss": 0.7519, + "step": 72710 + }, + { + "epoch": 1.167273953032954, + "grad_norm": 0.8577466011047363, + "learning_rate": 1.8511561198195814e-05, + "loss": 0.7747, + "step": 72720 + }, + { + "epoch": 1.167434469253118, + "grad_norm": 1.0672751665115356, + "learning_rate": 1.8505472970520672e-05, + "loss": 0.6653, + "step": 72730 + }, + { + "epoch": 1.167594985473282, + "grad_norm": 0.9460707902908325, + "learning_rate": 1.8499385155859328e-05, + "loss": 0.6578, + "step": 72740 + }, + { + "epoch": 1.1677555016934462, + "grad_norm": 0.9063759446144104, + "learning_rate": 1.849329775459893e-05, + "loss": 0.7375, + "step": 72750 + }, + { + "epoch": 1.16791601791361, + "grad_norm": 1.128399133682251, + "learning_rate": 1.8487210767126606e-05, + "loss": 0.6998, + "step": 72760 + }, + { + "epoch": 1.1680765341337742, + "grad_norm": 1.2583822011947632, + "learning_rate": 1.8481124193829454e-05, + "loss": 0.7408, + "step": 72770 + }, + { + "epoch": 1.1682370503539383, + "grad_norm": 0.6704979538917542, + "learning_rate": 1.8475038035094532e-05, + "loss": 0.5833, + "step": 72780 + }, + { + "epoch": 1.1683975665741024, + "grad_norm": 1.180687427520752, + "learning_rate": 1.84689522913089e-05, + "loss": 0.6979, + "step": 72790 + }, + { + "epoch": 1.1685580827942663, + "grad_norm": 0.9550046920776367, + "learning_rate": 1.846286696285958e-05, + "loss": 0.6727, + "step": 72800 + }, + { + "epoch": 1.1687185990144304, + "grad_norm": 0.7026744484901428, + "learning_rate": 1.845678205013354e-05, + "loss": 0.7439, + "step": 72810 + }, + { + "epoch": 1.1688791152345945, + "grad_norm": 1.0622204542160034, + "learning_rate": 1.8450697553517753e-05, + "loss": 0.7646, + "step": 72820 + }, + { + "epoch": 1.1690396314547584, + "grad_norm": 1.3594337701797485, + "learning_rate": 1.844461347339917e-05, + "loss": 0.69, + "step": 72830 + }, + { + "epoch": 1.1692001476749225, + "grad_norm": 0.8066992163658142, + "learning_rate": 1.8438529810164685e-05, + "loss": 0.6297, + "step": 72840 + }, + { + "epoch": 1.1693606638950866, + "grad_norm": 0.7693268656730652, + "learning_rate": 1.8432446564201198e-05, + "loss": 0.6889, + "step": 72850 + }, + { + "epoch": 1.1695211801152507, + "grad_norm": 1.259100317955017, + "learning_rate": 1.842636373589557e-05, + "loss": 0.6499, + "step": 72860 + }, + { + "epoch": 1.1696816963354146, + "grad_norm": 0.9102617502212524, + "learning_rate": 1.8420281325634615e-05, + "loss": 0.6936, + "step": 72870 + }, + { + "epoch": 1.1698422125555787, + "grad_norm": 1.4500812292099, + "learning_rate": 1.841419933380515e-05, + "loss": 0.7056, + "step": 72880 + }, + { + "epoch": 1.1700027287757429, + "grad_norm": 1.1426959037780762, + "learning_rate": 1.840811776079395e-05, + "loss": 0.8081, + "step": 72890 + }, + { + "epoch": 1.1701632449959067, + "grad_norm": 1.1485896110534668, + "learning_rate": 1.840203660698777e-05, + "loss": 0.7553, + "step": 72900 + }, + { + "epoch": 1.1703237612160708, + "grad_norm": 0.862895667552948, + "learning_rate": 1.8395955872773335e-05, + "loss": 0.6834, + "step": 72910 + }, + { + "epoch": 1.170484277436235, + "grad_norm": 0.7416561245918274, + "learning_rate": 1.8389875558537347e-05, + "loss": 0.753, + "step": 72920 + }, + { + "epoch": 1.170644793656399, + "grad_norm": 0.9340986013412476, + "learning_rate": 1.8383795664666487e-05, + "loss": 0.8745, + "step": 72930 + }, + { + "epoch": 1.170805309876563, + "grad_norm": 0.5074810981750488, + "learning_rate": 1.837771619154738e-05, + "loss": 0.6882, + "step": 72940 + }, + { + "epoch": 1.170965826096727, + "grad_norm": 0.861598014831543, + "learning_rate": 1.837163713956665e-05, + "loss": 0.7095, + "step": 72950 + }, + { + "epoch": 1.1711263423168912, + "grad_norm": 0.7950922250747681, + "learning_rate": 1.83655585091109e-05, + "loss": 0.662, + "step": 72960 + }, + { + "epoch": 1.171286858537055, + "grad_norm": 0.8643640279769897, + "learning_rate": 1.8359480300566678e-05, + "loss": 0.6061, + "step": 72970 + }, + { + "epoch": 1.1714473747572192, + "grad_norm": 0.9038317799568176, + "learning_rate": 1.8353402514320542e-05, + "loss": 0.669, + "step": 72980 + }, + { + "epoch": 1.1716078909773833, + "grad_norm": 0.9431499242782593, + "learning_rate": 1.8347325150758998e-05, + "loss": 0.704, + "step": 72990 + }, + { + "epoch": 1.1717684071975474, + "grad_norm": 2.117246389389038, + "learning_rate": 1.834124821026852e-05, + "loss": 0.6512, + "step": 73000 + }, + { + "epoch": 1.1719289234177113, + "grad_norm": 1.1064168214797974, + "learning_rate": 1.8335171693235577e-05, + "loss": 0.6533, + "step": 73010 + }, + { + "epoch": 1.1720894396378754, + "grad_norm": 1.0594170093536377, + "learning_rate": 1.832909560004659e-05, + "loss": 0.698, + "step": 73020 + }, + { + "epoch": 1.1722499558580395, + "grad_norm": 0.8000068664550781, + "learning_rate": 1.8323019931087976e-05, + "loss": 0.7661, + "step": 73030 + }, + { + "epoch": 1.1724104720782036, + "grad_norm": 0.7184832096099854, + "learning_rate": 1.8316944686746102e-05, + "loss": 0.8095, + "step": 73040 + }, + { + "epoch": 1.1725709882983675, + "grad_norm": 0.9345791339874268, + "learning_rate": 1.831086986740732e-05, + "loss": 0.6732, + "step": 73050 + }, + { + "epoch": 1.1727315045185316, + "grad_norm": 0.6665158271789551, + "learning_rate": 1.8304795473457954e-05, + "loss": 0.8031, + "step": 73060 + }, + { + "epoch": 1.1728920207386957, + "grad_norm": 3.190340518951416, + "learning_rate": 1.8298721505284293e-05, + "loss": 0.6428, + "step": 73070 + }, + { + "epoch": 1.1730525369588598, + "grad_norm": 1.2481249570846558, + "learning_rate": 1.829264796327262e-05, + "loss": 0.7072, + "step": 73080 + }, + { + "epoch": 1.1732130531790237, + "grad_norm": 1.01716148853302, + "learning_rate": 1.828657484780916e-05, + "loss": 0.7317, + "step": 73090 + }, + { + "epoch": 1.1733735693991878, + "grad_norm": 1.1746770143508911, + "learning_rate": 1.828050215928014e-05, + "loss": 0.8213, + "step": 73100 + }, + { + "epoch": 1.173534085619352, + "grad_norm": 1.0242974758148193, + "learning_rate": 1.8274429898071753e-05, + "loss": 0.7857, + "step": 73110 + }, + { + "epoch": 1.1736946018395158, + "grad_norm": 1.022411823272705, + "learning_rate": 1.826835806457014e-05, + "loss": 0.7122, + "step": 73120 + }, + { + "epoch": 1.17385511805968, + "grad_norm": 0.8557770848274231, + "learning_rate": 1.8262286659161444e-05, + "loss": 0.6791, + "step": 73130 + }, + { + "epoch": 1.174015634279844, + "grad_norm": 1.0214494466781616, + "learning_rate": 1.8256215682231765e-05, + "loss": 0.6564, + "step": 73140 + }, + { + "epoch": 1.1741761505000081, + "grad_norm": 1.1182818412780762, + "learning_rate": 1.825014513416719e-05, + "loss": 0.8074, + "step": 73150 + }, + { + "epoch": 1.174336666720172, + "grad_norm": 0.7961527705192566, + "learning_rate": 1.824407501535377e-05, + "loss": 0.7623, + "step": 73160 + }, + { + "epoch": 1.1744971829403361, + "grad_norm": 1.058042287826538, + "learning_rate": 1.8238005326177522e-05, + "loss": 0.7751, + "step": 73170 + }, + { + "epoch": 1.1746576991605002, + "grad_norm": 0.6202409863471985, + "learning_rate": 1.823193606702446e-05, + "loss": 0.649, + "step": 73180 + }, + { + "epoch": 1.174818215380664, + "grad_norm": 0.5096656680107117, + "learning_rate": 1.8225867238280523e-05, + "loss": 0.7343, + "step": 73190 + }, + { + "epoch": 1.1749787316008282, + "grad_norm": 0.5657974481582642, + "learning_rate": 1.8219798840331672e-05, + "loss": 0.6539, + "step": 73200 + }, + { + "epoch": 1.1751392478209923, + "grad_norm": 0.8201744556427002, + "learning_rate": 1.8213730873563822e-05, + "loss": 0.7549, + "step": 73210 + }, + { + "epoch": 1.1752997640411564, + "grad_norm": 0.8798885941505432, + "learning_rate": 1.8207663338362853e-05, + "loss": 0.8359, + "step": 73220 + }, + { + "epoch": 1.1754602802613203, + "grad_norm": 1.2061954736709595, + "learning_rate": 1.8201596235114633e-05, + "loss": 0.6726, + "step": 73230 + }, + { + "epoch": 1.1756207964814844, + "grad_norm": 0.7767678499221802, + "learning_rate": 1.8195529564205e-05, + "loss": 0.7537, + "step": 73240 + }, + { + "epoch": 1.1757813127016485, + "grad_norm": 1.0875033140182495, + "learning_rate": 1.8189463326019734e-05, + "loss": 0.8307, + "step": 73250 + }, + { + "epoch": 1.1759418289218124, + "grad_norm": 0.9450353980064392, + "learning_rate": 1.8183397520944627e-05, + "loss": 0.6488, + "step": 73260 + }, + { + "epoch": 1.1761023451419765, + "grad_norm": 0.5843946933746338, + "learning_rate": 1.8177332149365435e-05, + "loss": 0.8152, + "step": 73270 + }, + { + "epoch": 1.1762628613621406, + "grad_norm": 0.8743764758110046, + "learning_rate": 1.8171267211667868e-05, + "loss": 0.8627, + "step": 73280 + }, + { + "epoch": 1.1764233775823048, + "grad_norm": 1.361961841583252, + "learning_rate": 1.8165202708237626e-05, + "loss": 0.7528, + "step": 73290 + }, + { + "epoch": 1.1765838938024689, + "grad_norm": 0.5496252179145813, + "learning_rate": 1.8159138639460382e-05, + "loss": 0.6594, + "step": 73300 + }, + { + "epoch": 1.1767444100226327, + "grad_norm": 1.0252143144607544, + "learning_rate": 1.815307500572176e-05, + "loss": 0.7424, + "step": 73310 + }, + { + "epoch": 1.1769049262427969, + "grad_norm": 1.0303200483322144, + "learning_rate": 1.814701180740738e-05, + "loss": 0.7341, + "step": 73320 + }, + { + "epoch": 1.177065442462961, + "grad_norm": 2.4739537239074707, + "learning_rate": 1.8140949044902834e-05, + "loss": 0.7572, + "step": 73330 + }, + { + "epoch": 1.1772259586831249, + "grad_norm": 0.8094313144683838, + "learning_rate": 1.8134886718593662e-05, + "loss": 0.6235, + "step": 73340 + }, + { + "epoch": 1.177386474903289, + "grad_norm": 1.643262505531311, + "learning_rate": 1.8128824828865413e-05, + "loss": 0.7239, + "step": 73350 + }, + { + "epoch": 1.177546991123453, + "grad_norm": 0.8168038725852966, + "learning_rate": 1.8122763376103568e-05, + "loss": 0.6845, + "step": 73360 + }, + { + "epoch": 1.1777075073436172, + "grad_norm": 1.0784026384353638, + "learning_rate": 1.8116702360693606e-05, + "loss": 0.7288, + "step": 73370 + }, + { + "epoch": 1.177868023563781, + "grad_norm": 1.2012293338775635, + "learning_rate": 1.8110641783020975e-05, + "loss": 0.7258, + "step": 73380 + }, + { + "epoch": 1.1780285397839452, + "grad_norm": 1.0915606021881104, + "learning_rate": 1.8104581643471087e-05, + "loss": 0.7542, + "step": 73390 + }, + { + "epoch": 1.1781890560041093, + "grad_norm": 1.2305188179016113, + "learning_rate": 1.8098521942429332e-05, + "loss": 0.6503, + "step": 73400 + }, + { + "epoch": 1.1783495722242732, + "grad_norm": 0.9474614858627319, + "learning_rate": 1.8092462680281077e-05, + "loss": 0.7286, + "step": 73410 + }, + { + "epoch": 1.1785100884444373, + "grad_norm": 0.9981006383895874, + "learning_rate": 1.8086403857411654e-05, + "loss": 0.8378, + "step": 73420 + }, + { + "epoch": 1.1786706046646014, + "grad_norm": 0.7639000415802002, + "learning_rate": 1.808034547420637e-05, + "loss": 0.7384, + "step": 73430 + }, + { + "epoch": 1.1788311208847655, + "grad_norm": 1.126499056816101, + "learning_rate": 1.80742875310505e-05, + "loss": 0.6701, + "step": 73440 + }, + { + "epoch": 1.1789916371049294, + "grad_norm": 2.0504231452941895, + "learning_rate": 1.8068230028329288e-05, + "loss": 0.6174, + "step": 73450 + }, + { + "epoch": 1.1791521533250935, + "grad_norm": 1.0276241302490234, + "learning_rate": 1.806217296642796e-05, + "loss": 0.6559, + "step": 73460 + }, + { + "epoch": 1.1793126695452576, + "grad_norm": 1.001449704170227, + "learning_rate": 1.8056116345731712e-05, + "loss": 0.76, + "step": 73470 + }, + { + "epoch": 1.1794731857654215, + "grad_norm": 0.8933233022689819, + "learning_rate": 1.8050060166625705e-05, + "loss": 0.7125, + "step": 73480 + }, + { + "epoch": 1.1796337019855856, + "grad_norm": 0.7670060992240906, + "learning_rate": 1.8044004429495093e-05, + "loss": 0.7677, + "step": 73490 + }, + { + "epoch": 1.1797942182057497, + "grad_norm": 1.377428412437439, + "learning_rate": 1.803794913472496e-05, + "loss": 0.7687, + "step": 73500 + }, + { + "epoch": 1.1799547344259138, + "grad_norm": 1.4096206426620483, + "learning_rate": 1.80318942827004e-05, + "loss": 0.7552, + "step": 73510 + }, + { + "epoch": 1.1801152506460777, + "grad_norm": 0.86366206407547, + "learning_rate": 1.8025839873806462e-05, + "loss": 0.7316, + "step": 73520 + }, + { + "epoch": 1.1802757668662418, + "grad_norm": 1.1236027479171753, + "learning_rate": 1.8019785908428178e-05, + "loss": 0.7606, + "step": 73530 + }, + { + "epoch": 1.180436283086406, + "grad_norm": 1.0565800666809082, + "learning_rate": 1.8013732386950537e-05, + "loss": 0.7154, + "step": 73540 + }, + { + "epoch": 1.18059679930657, + "grad_norm": 1.1922857761383057, + "learning_rate": 1.8007679309758525e-05, + "loss": 0.6484, + "step": 73550 + }, + { + "epoch": 1.180757315526734, + "grad_norm": 0.8554391264915466, + "learning_rate": 1.8001626677237053e-05, + "loss": 0.6725, + "step": 73560 + }, + { + "epoch": 1.180917831746898, + "grad_norm": 0.8491227626800537, + "learning_rate": 1.799557448977105e-05, + "loss": 0.7302, + "step": 73570 + }, + { + "epoch": 1.1810783479670621, + "grad_norm": 1.0652166604995728, + "learning_rate": 1.79895227477454e-05, + "loss": 0.7487, + "step": 73580 + }, + { + "epoch": 1.1812388641872262, + "grad_norm": 0.7735791206359863, + "learning_rate": 1.7983471451544952e-05, + "loss": 0.7799, + "step": 73590 + }, + { + "epoch": 1.1813993804073901, + "grad_norm": 1.0242308378219604, + "learning_rate": 1.797742060155454e-05, + "loss": 0.8189, + "step": 73600 + }, + { + "epoch": 1.1815598966275542, + "grad_norm": 1.2339340448379517, + "learning_rate": 1.7971370198158967e-05, + "loss": 0.7706, + "step": 73610 + }, + { + "epoch": 1.1817204128477183, + "grad_norm": 1.35574471950531, + "learning_rate": 1.7965320241742983e-05, + "loss": 0.6937, + "step": 73620 + }, + { + "epoch": 1.1818809290678822, + "grad_norm": 1.3037775754928589, + "learning_rate": 1.7959270732691347e-05, + "loss": 0.7022, + "step": 73630 + }, + { + "epoch": 1.1820414452880463, + "grad_norm": 1.9756101369857788, + "learning_rate": 1.7953221671388767e-05, + "loss": 0.7251, + "step": 73640 + }, + { + "epoch": 1.1822019615082104, + "grad_norm": 1.1374365091323853, + "learning_rate": 1.7947173058219928e-05, + "loss": 0.8394, + "step": 73650 + }, + { + "epoch": 1.1823624777283746, + "grad_norm": 1.1021050214767456, + "learning_rate": 1.7941124893569495e-05, + "loss": 0.684, + "step": 73660 + }, + { + "epoch": 1.1825229939485384, + "grad_norm": 0.9667333364486694, + "learning_rate": 1.7935077177822083e-05, + "loss": 0.6605, + "step": 73670 + }, + { + "epoch": 1.1826835101687025, + "grad_norm": 0.8728442192077637, + "learning_rate": 1.7929029911362287e-05, + "loss": 0.7019, + "step": 73680 + }, + { + "epoch": 1.1828440263888667, + "grad_norm": 0.9833809733390808, + "learning_rate": 1.7922983094574692e-05, + "loss": 0.7678, + "step": 73690 + }, + { + "epoch": 1.1830045426090305, + "grad_norm": 1.2833691835403442, + "learning_rate": 1.7916936727843837e-05, + "loss": 0.7795, + "step": 73700 + }, + { + "epoch": 1.1831650588291946, + "grad_norm": 0.9668800830841064, + "learning_rate": 1.7910890811554228e-05, + "loss": 0.7851, + "step": 73710 + }, + { + "epoch": 1.1833255750493588, + "grad_norm": 1.339614748954773, + "learning_rate": 1.7904845346090356e-05, + "loss": 0.7014, + "step": 73720 + }, + { + "epoch": 1.1834860912695229, + "grad_norm": 1.2356328964233398, + "learning_rate": 1.7898800331836672e-05, + "loss": 0.7698, + "step": 73730 + }, + { + "epoch": 1.1836466074896868, + "grad_norm": 0.8532010912895203, + "learning_rate": 1.7892755769177622e-05, + "loss": 0.7237, + "step": 73740 + }, + { + "epoch": 1.1838071237098509, + "grad_norm": 1.333736538887024, + "learning_rate": 1.788671165849758e-05, + "loss": 0.8207, + "step": 73750 + }, + { + "epoch": 1.183967639930015, + "grad_norm": 1.4717612266540527, + "learning_rate": 1.7880668000180922e-05, + "loss": 0.6571, + "step": 73760 + }, + { + "epoch": 1.1841281561501789, + "grad_norm": 0.8792722225189209, + "learning_rate": 1.7874624794611995e-05, + "loss": 0.663, + "step": 73770 + }, + { + "epoch": 1.184288672370343, + "grad_norm": 0.6567350625991821, + "learning_rate": 1.786858204217511e-05, + "loss": 0.6876, + "step": 73780 + }, + { + "epoch": 1.184449188590507, + "grad_norm": 0.8446241617202759, + "learning_rate": 1.7862539743254546e-05, + "loss": 0.8538, + "step": 73790 + }, + { + "epoch": 1.1846097048106712, + "grad_norm": 0.7149763703346252, + "learning_rate": 1.7856497898234574e-05, + "loss": 0.6707, + "step": 73800 + }, + { + "epoch": 1.184770221030835, + "grad_norm": 1.0858137607574463, + "learning_rate": 1.78504565074994e-05, + "loss": 0.7096, + "step": 73810 + }, + { + "epoch": 1.1849307372509992, + "grad_norm": 1.5953320264816284, + "learning_rate": 1.784441557143323e-05, + "loss": 0.762, + "step": 73820 + }, + { + "epoch": 1.1850912534711633, + "grad_norm": 0.914723813533783, + "learning_rate": 1.7838375090420225e-05, + "loss": 0.7026, + "step": 73830 + }, + { + "epoch": 1.1852517696913274, + "grad_norm": 0.8962745070457458, + "learning_rate": 1.7832335064844534e-05, + "loss": 0.7206, + "step": 73840 + }, + { + "epoch": 1.1854122859114913, + "grad_norm": 1.327528953552246, + "learning_rate": 1.7826295495090262e-05, + "loss": 0.7996, + "step": 73850 + }, + { + "epoch": 1.1855728021316554, + "grad_norm": 1.1918851137161255, + "learning_rate": 1.78202563815415e-05, + "loss": 0.7799, + "step": 73860 + }, + { + "epoch": 1.1857333183518195, + "grad_norm": 0.7059483528137207, + "learning_rate": 1.781421772458228e-05, + "loss": 0.7246, + "step": 73870 + }, + { + "epoch": 1.1858938345719836, + "grad_norm": 0.7808214426040649, + "learning_rate": 1.7808179524596638e-05, + "loss": 0.7158, + "step": 73880 + }, + { + "epoch": 1.1860543507921475, + "grad_norm": 0.8092349767684937, + "learning_rate": 1.7802141781968568e-05, + "loss": 0.7512, + "step": 73890 + }, + { + "epoch": 1.1862148670123116, + "grad_norm": 0.8625509738922119, + "learning_rate": 1.7796104497082032e-05, + "loss": 0.6851, + "step": 73900 + }, + { + "epoch": 1.1863753832324757, + "grad_norm": 1.4922610521316528, + "learning_rate": 1.7790067670320967e-05, + "loss": 0.8025, + "step": 73910 + }, + { + "epoch": 1.1865358994526396, + "grad_norm": 0.8065730333328247, + "learning_rate": 1.778403130206929e-05, + "loss": 0.7193, + "step": 73920 + }, + { + "epoch": 1.1866964156728037, + "grad_norm": 1.2592568397521973, + "learning_rate": 1.7777995392710856e-05, + "loss": 0.8072, + "step": 73930 + }, + { + "epoch": 1.1868569318929678, + "grad_norm": 1.0111490488052368, + "learning_rate": 1.777195994262953e-05, + "loss": 0.7358, + "step": 73940 + }, + { + "epoch": 1.187017448113132, + "grad_norm": 0.9535436630249023, + "learning_rate": 1.7765924952209124e-05, + "loss": 0.7541, + "step": 73950 + }, + { + "epoch": 1.1871779643332958, + "grad_norm": 0.9265869855880737, + "learning_rate": 1.7759890421833435e-05, + "loss": 0.7227, + "step": 73960 + }, + { + "epoch": 1.18733848055346, + "grad_norm": 0.9972652196884155, + "learning_rate": 1.7753856351886227e-05, + "loss": 0.66, + "step": 73970 + }, + { + "epoch": 1.187498996773624, + "grad_norm": 0.9904355406761169, + "learning_rate": 1.7747822742751218e-05, + "loss": 0.7038, + "step": 73980 + }, + { + "epoch": 1.187659512993788, + "grad_norm": 1.3918795585632324, + "learning_rate": 1.774178959481212e-05, + "loss": 0.7808, + "step": 73990 + }, + { + "epoch": 1.187820029213952, + "grad_norm": 1.1098833084106445, + "learning_rate": 1.7735756908452597e-05, + "loss": 0.7023, + "step": 74000 + }, + { + "epoch": 1.1879805454341161, + "grad_norm": 1.0277695655822754, + "learning_rate": 1.7729724684056296e-05, + "loss": 0.7765, + "step": 74010 + }, + { + "epoch": 1.1881410616542802, + "grad_norm": 1.128543734550476, + "learning_rate": 1.7723692922006838e-05, + "loss": 0.6956, + "step": 74020 + }, + { + "epoch": 1.1883015778744441, + "grad_norm": 1.0573822259902954, + "learning_rate": 1.77176616226878e-05, + "loss": 0.8285, + "step": 74030 + }, + { + "epoch": 1.1884620940946082, + "grad_norm": 1.263924479484558, + "learning_rate": 1.771163078648274e-05, + "loss": 0.6874, + "step": 74040 + }, + { + "epoch": 1.1886226103147723, + "grad_norm": 0.8599129319190979, + "learning_rate": 1.7705600413775194e-05, + "loss": 0.6262, + "step": 74050 + }, + { + "epoch": 1.1887831265349365, + "grad_norm": 1.0083788633346558, + "learning_rate": 1.7699570504948636e-05, + "loss": 0.6328, + "step": 74060 + }, + { + "epoch": 1.1889436427551003, + "grad_norm": 1.5458974838256836, + "learning_rate": 1.769354106038655e-05, + "loss": 0.7282, + "step": 74070 + }, + { + "epoch": 1.1891041589752644, + "grad_norm": 1.2082785367965698, + "learning_rate": 1.7687512080472364e-05, + "loss": 0.8245, + "step": 74080 + }, + { + "epoch": 1.1892646751954286, + "grad_norm": 2.04634952545166, + "learning_rate": 1.7681483565589486e-05, + "loss": 0.6758, + "step": 74090 + }, + { + "epoch": 1.1894251914155927, + "grad_norm": 0.903251051902771, + "learning_rate": 1.7675455516121302e-05, + "loss": 0.674, + "step": 74100 + }, + { + "epoch": 1.1895857076357566, + "grad_norm": 1.026177167892456, + "learning_rate": 1.766942793245117e-05, + "loss": 0.6755, + "step": 74110 + }, + { + "epoch": 1.1897462238559207, + "grad_norm": 0.7959609627723694, + "learning_rate": 1.766340081496238e-05, + "loss": 0.7332, + "step": 74120 + }, + { + "epoch": 1.1899067400760848, + "grad_norm": 0.9849035739898682, + "learning_rate": 1.7657374164038237e-05, + "loss": 0.7383, + "step": 74130 + }, + { + "epoch": 1.1900672562962487, + "grad_norm": 0.7793600559234619, + "learning_rate": 1.7651347980062006e-05, + "loss": 0.7917, + "step": 74140 + }, + { + "epoch": 1.1902277725164128, + "grad_norm": 1.0163054466247559, + "learning_rate": 1.76453222634169e-05, + "loss": 0.7178, + "step": 74150 + }, + { + "epoch": 1.1903882887365769, + "grad_norm": 1.0258920192718506, + "learning_rate": 1.763929701448614e-05, + "loss": 0.766, + "step": 74160 + }, + { + "epoch": 1.190548804956741, + "grad_norm": 1.3721611499786377, + "learning_rate": 1.7633272233652896e-05, + "loss": 0.718, + "step": 74170 + }, + { + "epoch": 1.1907093211769049, + "grad_norm": 1.3787413835525513, + "learning_rate": 1.762724792130029e-05, + "loss": 0.6415, + "step": 74180 + }, + { + "epoch": 1.190869837397069, + "grad_norm": 0.817507803440094, + "learning_rate": 1.7621224077811437e-05, + "loss": 0.5942, + "step": 74190 + }, + { + "epoch": 1.191030353617233, + "grad_norm": 1.4122029542922974, + "learning_rate": 1.761520070356943e-05, + "loss": 0.6358, + "step": 74200 + }, + { + "epoch": 1.191190869837397, + "grad_norm": 0.8956156373023987, + "learning_rate": 1.760917779895731e-05, + "loss": 0.6647, + "step": 74210 + }, + { + "epoch": 1.191351386057561, + "grad_norm": 1.0736058950424194, + "learning_rate": 1.7603155364358105e-05, + "loss": 0.6536, + "step": 74220 + }, + { + "epoch": 1.1915119022777252, + "grad_norm": 0.9327526092529297, + "learning_rate": 1.759713340015481e-05, + "loss": 0.661, + "step": 74230 + }, + { + "epoch": 1.1916724184978893, + "grad_norm": 0.7632979154586792, + "learning_rate": 1.7591111906730368e-05, + "loss": 0.7911, + "step": 74240 + }, + { + "epoch": 1.1918329347180532, + "grad_norm": 1.266977071762085, + "learning_rate": 1.758509088446773e-05, + "loss": 0.7851, + "step": 74250 + }, + { + "epoch": 1.1919934509382173, + "grad_norm": 0.977074921131134, + "learning_rate": 1.7579070333749786e-05, + "loss": 0.7171, + "step": 74260 + }, + { + "epoch": 1.1921539671583814, + "grad_norm": 0.7341325283050537, + "learning_rate": 1.7573050254959414e-05, + "loss": 0.6603, + "step": 74270 + }, + { + "epoch": 1.1923144833785453, + "grad_norm": 1.0064162015914917, + "learning_rate": 1.756703064847946e-05, + "loss": 0.6766, + "step": 74280 + }, + { + "epoch": 1.1924749995987094, + "grad_norm": 1.3096444606781006, + "learning_rate": 1.7561011514692723e-05, + "loss": 0.6955, + "step": 74290 + }, + { + "epoch": 1.1926355158188735, + "grad_norm": 0.9057181477546692, + "learning_rate": 1.7554992853982e-05, + "loss": 0.6864, + "step": 74300 + }, + { + "epoch": 1.1927960320390376, + "grad_norm": 0.7599760293960571, + "learning_rate": 1.7548974666730024e-05, + "loss": 0.7167, + "step": 74310 + }, + { + "epoch": 1.1929565482592015, + "grad_norm": 0.9264327883720398, + "learning_rate": 1.7542956953319527e-05, + "loss": 0.7774, + "step": 74320 + }, + { + "epoch": 1.1931170644793656, + "grad_norm": 1.0674172639846802, + "learning_rate": 1.75369397141332e-05, + "loss": 0.6106, + "step": 74330 + }, + { + "epoch": 1.1932775806995297, + "grad_norm": 0.8358743190765381, + "learning_rate": 1.7530922949553707e-05, + "loss": 0.6748, + "step": 74340 + }, + { + "epoch": 1.1934380969196938, + "grad_norm": 0.9849324822425842, + "learning_rate": 1.7524906659963673e-05, + "loss": 0.642, + "step": 74350 + }, + { + "epoch": 1.1935986131398577, + "grad_norm": 1.024770975112915, + "learning_rate": 1.751889084574571e-05, + "loss": 0.6678, + "step": 74360 + }, + { + "epoch": 1.1937591293600218, + "grad_norm": 1.1784472465515137, + "learning_rate": 1.751287550728237e-05, + "loss": 0.6978, + "step": 74370 + }, + { + "epoch": 1.193919645580186, + "grad_norm": 1.563461422920227, + "learning_rate": 1.7506860644956203e-05, + "loss": 0.6627, + "step": 74380 + }, + { + "epoch": 1.19408016180035, + "grad_norm": 0.8146748542785645, + "learning_rate": 1.7500846259149716e-05, + "loss": 0.7998, + "step": 74390 + }, + { + "epoch": 1.194240678020514, + "grad_norm": 1.3344523906707764, + "learning_rate": 1.7494832350245398e-05, + "loss": 0.7627, + "step": 74400 + }, + { + "epoch": 1.194401194240678, + "grad_norm": 1.5407015085220337, + "learning_rate": 1.7488818918625687e-05, + "loss": 0.7511, + "step": 74410 + }, + { + "epoch": 1.1945617104608421, + "grad_norm": 0.9737513661384583, + "learning_rate": 1.7482805964673015e-05, + "loss": 0.7435, + "step": 74420 + }, + { + "epoch": 1.194722226681006, + "grad_norm": 0.7974161505699158, + "learning_rate": 1.7476793488769753e-05, + "loss": 0.6881, + "step": 74430 + }, + { + "epoch": 1.1948827429011701, + "grad_norm": 0.8231331706047058, + "learning_rate": 1.7470781491298273e-05, + "loss": 0.7146, + "step": 74440 + }, + { + "epoch": 1.1950432591213342, + "grad_norm": 0.890822172164917, + "learning_rate": 1.7464769972640894e-05, + "loss": 0.7322, + "step": 74450 + }, + { + "epoch": 1.1952037753414984, + "grad_norm": 1.2916290760040283, + "learning_rate": 1.745875893317992e-05, + "loss": 0.6889, + "step": 74460 + }, + { + "epoch": 1.1953642915616622, + "grad_norm": 0.9397755265235901, + "learning_rate": 1.7452748373297612e-05, + "loss": 0.6836, + "step": 74470 + }, + { + "epoch": 1.1955248077818263, + "grad_norm": 1.3933144807815552, + "learning_rate": 1.744673829337622e-05, + "loss": 0.7745, + "step": 74480 + }, + { + "epoch": 1.1956853240019905, + "grad_norm": 1.0036756992340088, + "learning_rate": 1.744072869379793e-05, + "loss": 0.8317, + "step": 74490 + }, + { + "epoch": 1.1958458402221543, + "grad_norm": 0.9504613280296326, + "learning_rate": 1.743471957494493e-05, + "loss": 0.728, + "step": 74500 + }, + { + "epoch": 1.1960063564423185, + "grad_norm": 0.7260792851448059, + "learning_rate": 1.742871093719936e-05, + "loss": 0.778, + "step": 74510 + }, + { + "epoch": 1.1961668726624826, + "grad_norm": 0.8875165581703186, + "learning_rate": 1.742270278094333e-05, + "loss": 0.867, + "step": 74520 + }, + { + "epoch": 1.1963273888826467, + "grad_norm": 0.9676148891448975, + "learning_rate": 1.7416695106558932e-05, + "loss": 0.7492, + "step": 74530 + }, + { + "epoch": 1.1964879051028106, + "grad_norm": 0.9661509990692139, + "learning_rate": 1.741068791442822e-05, + "loss": 0.7555, + "step": 74540 + }, + { + "epoch": 1.1966484213229747, + "grad_norm": 1.1863818168640137, + "learning_rate": 1.7404681204933212e-05, + "loss": 0.7218, + "step": 74550 + }, + { + "epoch": 1.1968089375431388, + "grad_norm": 0.8879155516624451, + "learning_rate": 1.7398674978455898e-05, + "loss": 0.8091, + "step": 74560 + }, + { + "epoch": 1.1969694537633027, + "grad_norm": 1.1974185705184937, + "learning_rate": 1.7392669235378237e-05, + "loss": 0.6146, + "step": 74570 + }, + { + "epoch": 1.1971299699834668, + "grad_norm": 0.8215389847755432, + "learning_rate": 1.7386663976082164e-05, + "loss": 0.6436, + "step": 74580 + }, + { + "epoch": 1.1972904862036309, + "grad_norm": 0.9002476334571838, + "learning_rate": 1.7380659200949574e-05, + "loss": 0.731, + "step": 74590 + }, + { + "epoch": 1.197451002423795, + "grad_norm": 0.8270787596702576, + "learning_rate": 1.7374654910362344e-05, + "loss": 0.7394, + "step": 74600 + }, + { + "epoch": 1.197611518643959, + "grad_norm": 0.7816817760467529, + "learning_rate": 1.7368651104702306e-05, + "loss": 0.6256, + "step": 74610 + }, + { + "epoch": 1.197772034864123, + "grad_norm": 0.6531346440315247, + "learning_rate": 1.736264778435126e-05, + "loss": 0.643, + "step": 74620 + }, + { + "epoch": 1.197932551084287, + "grad_norm": 0.88628089427948, + "learning_rate": 1.7356644949690992e-05, + "loss": 0.7772, + "step": 74630 + }, + { + "epoch": 1.1980930673044512, + "grad_norm": 0.7623951435089111, + "learning_rate": 1.735064260110324e-05, + "loss": 0.6838, + "step": 74640 + }, + { + "epoch": 1.198253583524615, + "grad_norm": 0.7121040225028992, + "learning_rate": 1.734464073896972e-05, + "loss": 0.8992, + "step": 74650 + }, + { + "epoch": 1.1984140997447792, + "grad_norm": 0.7912382483482361, + "learning_rate": 1.7338639363672122e-05, + "loss": 0.6367, + "step": 74660 + }, + { + "epoch": 1.1985746159649433, + "grad_norm": 0.8952768445014954, + "learning_rate": 1.7332638475592098e-05, + "loss": 0.6504, + "step": 74670 + }, + { + "epoch": 1.1987351321851074, + "grad_norm": 1.1607123613357544, + "learning_rate": 1.7326638075111258e-05, + "loss": 0.6887, + "step": 74680 + }, + { + "epoch": 1.1988956484052713, + "grad_norm": 0.6190829277038574, + "learning_rate": 1.7320638162611192e-05, + "loss": 0.8277, + "step": 74690 + }, + { + "epoch": 1.1990561646254354, + "grad_norm": 1.1820344924926758, + "learning_rate": 1.731463873847347e-05, + "loss": 0.755, + "step": 74700 + }, + { + "epoch": 1.1992166808455995, + "grad_norm": 1.0699799060821533, + "learning_rate": 1.7308639803079614e-05, + "loss": 0.7585, + "step": 74710 + }, + { + "epoch": 1.1993771970657634, + "grad_norm": 0.9856334924697876, + "learning_rate": 1.7302641356811126e-05, + "loss": 0.8065, + "step": 74720 + }, + { + "epoch": 1.1995377132859275, + "grad_norm": 0.8798818588256836, + "learning_rate": 1.7296643400049477e-05, + "loss": 0.7131, + "step": 74730 + }, + { + "epoch": 1.1996982295060916, + "grad_norm": 0.834159791469574, + "learning_rate": 1.7290645933176083e-05, + "loss": 0.7397, + "step": 74740 + }, + { + "epoch": 1.1998587457262557, + "grad_norm": 0.9237750172615051, + "learning_rate": 1.7284648956572358e-05, + "loss": 0.6679, + "step": 74750 + }, + { + "epoch": 1.2000192619464196, + "grad_norm": 1.379791498184204, + "learning_rate": 1.7278652470619677e-05, + "loss": 0.7018, + "step": 74760 + }, + { + "epoch": 1.2001797781665837, + "grad_norm": 0.6608775854110718, + "learning_rate": 1.727265647569938e-05, + "loss": 0.714, + "step": 74770 + }, + { + "epoch": 1.2003402943867478, + "grad_norm": 0.781033456325531, + "learning_rate": 1.726666097219277e-05, + "loss": 0.7221, + "step": 74780 + }, + { + "epoch": 1.2005008106069117, + "grad_norm": 1.01268470287323, + "learning_rate": 1.726066596048114e-05, + "loss": 0.7363, + "step": 74790 + }, + { + "epoch": 1.2006613268270758, + "grad_norm": 0.9503752589225769, + "learning_rate": 1.7254671440945735e-05, + "loss": 0.6328, + "step": 74800 + }, + { + "epoch": 1.20082184304724, + "grad_norm": 0.7534270286560059, + "learning_rate": 1.7248677413967756e-05, + "loss": 0.7551, + "step": 74810 + }, + { + "epoch": 1.200982359267404, + "grad_norm": 1.052774429321289, + "learning_rate": 1.7242683879928397e-05, + "loss": 0.7412, + "step": 74820 + }, + { + "epoch": 1.201142875487568, + "grad_norm": 0.9477763772010803, + "learning_rate": 1.7236690839208812e-05, + "loss": 0.678, + "step": 74830 + }, + { + "epoch": 1.201303391707732, + "grad_norm": 0.8183243870735168, + "learning_rate": 1.723069829219013e-05, + "loss": 0.8178, + "step": 74840 + }, + { + "epoch": 1.2014639079278961, + "grad_norm": 0.7288262844085693, + "learning_rate": 1.722470623925343e-05, + "loss": 0.6731, + "step": 74850 + }, + { + "epoch": 1.2016244241480603, + "grad_norm": 1.4395872354507446, + "learning_rate": 1.7218714680779783e-05, + "loss": 0.717, + "step": 74860 + }, + { + "epoch": 1.2017849403682241, + "grad_norm": 0.9640906453132629, + "learning_rate": 1.7212723617150206e-05, + "loss": 0.705, + "step": 74870 + }, + { + "epoch": 1.2019454565883883, + "grad_norm": 1.1383417844772339, + "learning_rate": 1.72067330487457e-05, + "loss": 0.7189, + "step": 74880 + }, + { + "epoch": 1.2021059728085524, + "grad_norm": 0.6208619475364685, + "learning_rate": 1.720074297594723e-05, + "loss": 0.7087, + "step": 74890 + }, + { + "epoch": 1.2022664890287165, + "grad_norm": 0.7429697513580322, + "learning_rate": 1.7194753399135726e-05, + "loss": 0.775, + "step": 74900 + }, + { + "epoch": 1.2024270052488804, + "grad_norm": 1.3045015335083008, + "learning_rate": 1.7188764318692106e-05, + "loss": 0.6611, + "step": 74910 + }, + { + "epoch": 1.2025875214690445, + "grad_norm": 0.7629736661911011, + "learning_rate": 1.7182775734997227e-05, + "loss": 0.6636, + "step": 74920 + }, + { + "epoch": 1.2027480376892086, + "grad_norm": 0.8380823135375977, + "learning_rate": 1.717678764843192e-05, + "loss": 0.6865, + "step": 74930 + }, + { + "epoch": 1.2029085539093725, + "grad_norm": 1.1383196115493774, + "learning_rate": 1.7170800059377e-05, + "loss": 0.8297, + "step": 74940 + }, + { + "epoch": 1.2030690701295366, + "grad_norm": 0.7067494988441467, + "learning_rate": 1.7164812968213246e-05, + "loss": 0.6948, + "step": 74950 + }, + { + "epoch": 1.2032295863497007, + "grad_norm": 0.9067361354827881, + "learning_rate": 1.71588263753214e-05, + "loss": 0.7781, + "step": 74960 + }, + { + "epoch": 1.2033901025698648, + "grad_norm": 0.9146612882614136, + "learning_rate": 1.7152840281082173e-05, + "loss": 0.7703, + "step": 74970 + }, + { + "epoch": 1.2035506187900287, + "grad_norm": 0.8102425336837769, + "learning_rate": 1.7146854685876256e-05, + "loss": 0.6832, + "step": 74980 + }, + { + "epoch": 1.2037111350101928, + "grad_norm": 1.6245094537734985, + "learning_rate": 1.714086959008428e-05, + "loss": 0.6668, + "step": 74990 + }, + { + "epoch": 1.2038716512303569, + "grad_norm": 1.276032567024231, + "learning_rate": 1.7134884994086863e-05, + "loss": 0.7084, + "step": 75000 + }, + { + "epoch": 1.2040321674505208, + "grad_norm": 1.1250169277191162, + "learning_rate": 1.7128900898264598e-05, + "loss": 0.6305, + "step": 75010 + }, + { + "epoch": 1.2041926836706849, + "grad_norm": 0.9311342835426331, + "learning_rate": 1.7122917302998042e-05, + "loss": 0.7907, + "step": 75020 + }, + { + "epoch": 1.204353199890849, + "grad_norm": 1.0641329288482666, + "learning_rate": 1.7116934208667713e-05, + "loss": 0.7264, + "step": 75030 + }, + { + "epoch": 1.204513716111013, + "grad_norm": 0.9664627313613892, + "learning_rate": 1.7110951615654102e-05, + "loss": 0.7113, + "step": 75040 + }, + { + "epoch": 1.204674232331177, + "grad_norm": 0.5224878191947937, + "learning_rate": 1.7104969524337662e-05, + "loss": 0.755, + "step": 75050 + }, + { + "epoch": 1.204834748551341, + "grad_norm": 0.8709831833839417, + "learning_rate": 1.7098987935098815e-05, + "loss": 0.722, + "step": 75060 + }, + { + "epoch": 1.2049952647715052, + "grad_norm": 1.3374793529510498, + "learning_rate": 1.7093006848317968e-05, + "loss": 0.7289, + "step": 75070 + }, + { + "epoch": 1.205155780991669, + "grad_norm": 1.40370512008667, + "learning_rate": 1.7087026264375468e-05, + "loss": 0.7362, + "step": 75080 + }, + { + "epoch": 1.2053162972118332, + "grad_norm": 0.6196061968803406, + "learning_rate": 1.7081046183651658e-05, + "loss": 0.6899, + "step": 75090 + }, + { + "epoch": 1.2054768134319973, + "grad_norm": 1.4804164171218872, + "learning_rate": 1.7075066606526828e-05, + "loss": 0.7694, + "step": 75100 + }, + { + "epoch": 1.2056373296521614, + "grad_norm": 0.7615329027175903, + "learning_rate": 1.706908753338126e-05, + "loss": 0.7055, + "step": 75110 + }, + { + "epoch": 1.2057978458723253, + "grad_norm": 0.9723572731018066, + "learning_rate": 1.7063108964595165e-05, + "loss": 0.7117, + "step": 75120 + }, + { + "epoch": 1.2059583620924894, + "grad_norm": 1.15120530128479, + "learning_rate": 1.705713090054876e-05, + "loss": 0.667, + "step": 75130 + }, + { + "epoch": 1.2061188783126535, + "grad_norm": 1.1224968433380127, + "learning_rate": 1.70511533416222e-05, + "loss": 0.7371, + "step": 75140 + }, + { + "epoch": 1.2062793945328176, + "grad_norm": 0.9258280992507935, + "learning_rate": 1.7045176288195637e-05, + "loss": 0.8922, + "step": 75150 + }, + { + "epoch": 1.2064399107529815, + "grad_norm": 1.8711661100387573, + "learning_rate": 1.703919974064917e-05, + "loss": 0.6773, + "step": 75160 + }, + { + "epoch": 1.2066004269731456, + "grad_norm": 0.8235915899276733, + "learning_rate": 1.703322369936288e-05, + "loss": 0.7616, + "step": 75170 + }, + { + "epoch": 1.2067609431933097, + "grad_norm": 1.0561383962631226, + "learning_rate": 1.70272481647168e-05, + "loss": 0.8119, + "step": 75180 + }, + { + "epoch": 1.2069214594134738, + "grad_norm": 1.6062414646148682, + "learning_rate": 1.7021273137090943e-05, + "loss": 0.7216, + "step": 75190 + }, + { + "epoch": 1.2070819756336377, + "grad_norm": 1.9717686176300049, + "learning_rate": 1.7015298616865277e-05, + "loss": 0.7276, + "step": 75200 + }, + { + "epoch": 1.2072424918538018, + "grad_norm": 0.7565680742263794, + "learning_rate": 1.700932460441976e-05, + "loss": 0.8019, + "step": 75210 + }, + { + "epoch": 1.207403008073966, + "grad_norm": 0.9257864952087402, + "learning_rate": 1.70033511001343e-05, + "loss": 0.8142, + "step": 75220 + }, + { + "epoch": 1.2075635242941298, + "grad_norm": 1.1809062957763672, + "learning_rate": 1.6997378104388775e-05, + "loss": 0.8136, + "step": 75230 + }, + { + "epoch": 1.207724040514294, + "grad_norm": 0.865972101688385, + "learning_rate": 1.6991405617563026e-05, + "loss": 0.8143, + "step": 75240 + }, + { + "epoch": 1.207884556734458, + "grad_norm": 0.7137873768806458, + "learning_rate": 1.6985433640036874e-05, + "loss": 0.7689, + "step": 75250 + }, + { + "epoch": 1.2080450729546222, + "grad_norm": 1.553358554840088, + "learning_rate": 1.6979462172190104e-05, + "loss": 0.6277, + "step": 75260 + }, + { + "epoch": 1.208205589174786, + "grad_norm": 1.190467357635498, + "learning_rate": 1.697349121440246e-05, + "loss": 0.669, + "step": 75270 + }, + { + "epoch": 1.2083661053949502, + "grad_norm": 0.9834647178649902, + "learning_rate": 1.6967520767053668e-05, + "loss": 0.7376, + "step": 75280 + }, + { + "epoch": 1.2085266216151143, + "grad_norm": 0.8510430455207825, + "learning_rate": 1.696155083052342e-05, + "loss": 0.6412, + "step": 75290 + }, + { + "epoch": 1.2086871378352781, + "grad_norm": 0.7574516534805298, + "learning_rate": 1.6955581405191345e-05, + "loss": 0.7688, + "step": 75300 + }, + { + "epoch": 1.2088476540554423, + "grad_norm": 1.9146827459335327, + "learning_rate": 1.694961249143708e-05, + "loss": 0.7063, + "step": 75310 + }, + { + "epoch": 1.2090081702756064, + "grad_norm": 0.6592751741409302, + "learning_rate": 1.6943644089640204e-05, + "loss": 0.6062, + "step": 75320 + }, + { + "epoch": 1.2091686864957705, + "grad_norm": 1.142932653427124, + "learning_rate": 1.6937676200180282e-05, + "loss": 0.8269, + "step": 75330 + }, + { + "epoch": 1.2093292027159344, + "grad_norm": 0.7605511546134949, + "learning_rate": 1.6931708823436833e-05, + "loss": 0.6853, + "step": 75340 + }, + { + "epoch": 1.2094897189360985, + "grad_norm": 0.6445212364196777, + "learning_rate": 1.6925741959789345e-05, + "loss": 0.7359, + "step": 75350 + }, + { + "epoch": 1.2096502351562626, + "grad_norm": 1.6813461780548096, + "learning_rate": 1.6919775609617288e-05, + "loss": 0.6933, + "step": 75360 + }, + { + "epoch": 1.2098107513764265, + "grad_norm": 0.7609135508537292, + "learning_rate": 1.6913809773300066e-05, + "loss": 0.5709, + "step": 75370 + }, + { + "epoch": 1.2099712675965906, + "grad_norm": 0.7452055215835571, + "learning_rate": 1.690784445121708e-05, + "loss": 0.7766, + "step": 75380 + }, + { + "epoch": 1.2101317838167547, + "grad_norm": 0.8368682861328125, + "learning_rate": 1.6901879643747694e-05, + "loss": 0.6872, + "step": 75390 + }, + { + "epoch": 1.2102923000369188, + "grad_norm": 0.7181956768035889, + "learning_rate": 1.689591535127123e-05, + "loss": 0.7246, + "step": 75400 + }, + { + "epoch": 1.210452816257083, + "grad_norm": 1.3473453521728516, + "learning_rate": 1.6889951574166984e-05, + "loss": 0.6659, + "step": 75410 + }, + { + "epoch": 1.2106133324772468, + "grad_norm": 0.7351340651512146, + "learning_rate": 1.6883988312814224e-05, + "loss": 0.6664, + "step": 75420 + }, + { + "epoch": 1.210773848697411, + "grad_norm": 1.2603498697280884, + "learning_rate": 1.6878025567592165e-05, + "loss": 0.6879, + "step": 75430 + }, + { + "epoch": 1.210934364917575, + "grad_norm": 1.2102460861206055, + "learning_rate": 1.6872063338880006e-05, + "loss": 0.7479, + "step": 75440 + }, + { + "epoch": 1.2110948811377389, + "grad_norm": 1.1415400505065918, + "learning_rate": 1.6866101627056912e-05, + "loss": 0.701, + "step": 75450 + }, + { + "epoch": 1.211255397357903, + "grad_norm": 1.0661357641220093, + "learning_rate": 1.6860140432502017e-05, + "loss": 0.6153, + "step": 75460 + }, + { + "epoch": 1.211415913578067, + "grad_norm": 0.8732959032058716, + "learning_rate": 1.6854179755594414e-05, + "loss": 0.7656, + "step": 75470 + }, + { + "epoch": 1.2115764297982312, + "grad_norm": 1.248441457748413, + "learning_rate": 1.6848219596713167e-05, + "loss": 0.6484, + "step": 75480 + }, + { + "epoch": 1.211736946018395, + "grad_norm": 0.9057785868644714, + "learning_rate": 1.6842259956237304e-05, + "loss": 0.6845, + "step": 75490 + }, + { + "epoch": 1.2118974622385592, + "grad_norm": 0.8331135511398315, + "learning_rate": 1.6836300834545832e-05, + "loss": 0.7394, + "step": 75500 + }, + { + "epoch": 1.2120579784587233, + "grad_norm": 0.9609780311584473, + "learning_rate": 1.683034223201771e-05, + "loss": 0.6034, + "step": 75510 + }, + { + "epoch": 1.2122184946788872, + "grad_norm": 1.0642229318618774, + "learning_rate": 1.6824384149031866e-05, + "loss": 0.7737, + "step": 75520 + }, + { + "epoch": 1.2123790108990513, + "grad_norm": 1.8050494194030762, + "learning_rate": 1.6818426585967207e-05, + "loss": 0.7562, + "step": 75530 + }, + { + "epoch": 1.2125395271192154, + "grad_norm": 1.196372389793396, + "learning_rate": 1.681246954320261e-05, + "loss": 0.7733, + "step": 75540 + }, + { + "epoch": 1.2127000433393795, + "grad_norm": 1.4648964405059814, + "learning_rate": 1.6806513021116875e-05, + "loss": 0.5909, + "step": 75550 + }, + { + "epoch": 1.2128605595595434, + "grad_norm": 0.8525046110153198, + "learning_rate": 1.6800557020088825e-05, + "loss": 0.7637, + "step": 75560 + }, + { + "epoch": 1.2130210757797075, + "grad_norm": 0.8128588795661926, + "learning_rate": 1.6794601540497225e-05, + "loss": 0.7905, + "step": 75570 + }, + { + "epoch": 1.2131815919998716, + "grad_norm": 1.4002878665924072, + "learning_rate": 1.678864658272081e-05, + "loss": 0.689, + "step": 75580 + }, + { + "epoch": 1.2133421082200355, + "grad_norm": 0.6611341238021851, + "learning_rate": 1.678269214713827e-05, + "loss": 0.6864, + "step": 75590 + }, + { + "epoch": 1.2135026244401996, + "grad_norm": 0.9158505201339722, + "learning_rate": 1.6776738234128277e-05, + "loss": 0.7558, + "step": 75600 + }, + { + "epoch": 1.2136631406603637, + "grad_norm": 1.5748536586761475, + "learning_rate": 1.6770784844069482e-05, + "loss": 0.6698, + "step": 75610 + }, + { + "epoch": 1.2138236568805278, + "grad_norm": 1.3151911497116089, + "learning_rate": 1.6764831977340456e-05, + "loss": 0.7577, + "step": 75620 + }, + { + "epoch": 1.2139841731006917, + "grad_norm": 1.1459749937057495, + "learning_rate": 1.6758879634319785e-05, + "loss": 0.7062, + "step": 75630 + }, + { + "epoch": 1.2141446893208558, + "grad_norm": 1.0287296772003174, + "learning_rate": 1.6752927815385995e-05, + "loss": 0.7507, + "step": 75640 + }, + { + "epoch": 1.21430520554102, + "grad_norm": 1.0297551155090332, + "learning_rate": 1.6746976520917594e-05, + "loss": 0.6713, + "step": 75650 + }, + { + "epoch": 1.214465721761184, + "grad_norm": 2.0507519245147705, + "learning_rate": 1.6741025751293043e-05, + "loss": 0.7795, + "step": 75660 + }, + { + "epoch": 1.214626237981348, + "grad_norm": 0.9209843873977661, + "learning_rate": 1.673507550689079e-05, + "loss": 0.6345, + "step": 75670 + }, + { + "epoch": 1.214786754201512, + "grad_norm": 1.2079541683197021, + "learning_rate": 1.6729125788089216e-05, + "loss": 0.609, + "step": 75680 + }, + { + "epoch": 1.2149472704216762, + "grad_norm": 1.0999672412872314, + "learning_rate": 1.6723176595266692e-05, + "loss": 0.7005, + "step": 75690 + }, + { + "epoch": 1.2151077866418403, + "grad_norm": 1.3206136226654053, + "learning_rate": 1.671722792880156e-05, + "loss": 0.7806, + "step": 75700 + }, + { + "epoch": 1.2152683028620042, + "grad_norm": 1.2699599266052246, + "learning_rate": 1.6711279789072122e-05, + "loss": 0.6721, + "step": 75710 + }, + { + "epoch": 1.2154288190821683, + "grad_norm": 1.1750462055206299, + "learning_rate": 1.6705332176456633e-05, + "loss": 0.749, + "step": 75720 + }, + { + "epoch": 1.2155893353023324, + "grad_norm": 0.8499077558517456, + "learning_rate": 1.6699385091333344e-05, + "loss": 0.7656, + "step": 75730 + }, + { + "epoch": 1.2157498515224963, + "grad_norm": 0.7892854809761047, + "learning_rate": 1.6693438534080435e-05, + "loss": 0.6808, + "step": 75740 + }, + { + "epoch": 1.2159103677426604, + "grad_norm": 0.8230244517326355, + "learning_rate": 1.668749250507608e-05, + "loss": 0.6463, + "step": 75750 + }, + { + "epoch": 1.2160708839628245, + "grad_norm": 1.1809101104736328, + "learning_rate": 1.6681547004698412e-05, + "loss": 0.7605, + "step": 75760 + }, + { + "epoch": 1.2162314001829886, + "grad_norm": 2.2349720001220703, + "learning_rate": 1.6675602033325528e-05, + "loss": 0.7105, + "step": 75770 + }, + { + "epoch": 1.2163919164031525, + "grad_norm": 1.0771195888519287, + "learning_rate": 1.6669657591335496e-05, + "loss": 0.5807, + "step": 75780 + }, + { + "epoch": 1.2165524326233166, + "grad_norm": 0.7116091847419739, + "learning_rate": 1.6663713679106355e-05, + "loss": 0.536, + "step": 75790 + }, + { + "epoch": 1.2167129488434807, + "grad_norm": 1.3285117149353027, + "learning_rate": 1.6657770297016087e-05, + "loss": 0.6634, + "step": 75800 + }, + { + "epoch": 1.2168734650636446, + "grad_norm": 1.2001078128814697, + "learning_rate": 1.6651827445442666e-05, + "loss": 0.6991, + "step": 75810 + }, + { + "epoch": 1.2170339812838087, + "grad_norm": 1.125571370124817, + "learning_rate": 1.664588512476402e-05, + "loss": 0.7068, + "step": 75820 + }, + { + "epoch": 1.2171944975039728, + "grad_norm": 1.2013345956802368, + "learning_rate": 1.6639943335358047e-05, + "loss": 0.638, + "step": 75830 + }, + { + "epoch": 1.217355013724137, + "grad_norm": 1.5416723489761353, + "learning_rate": 1.6634002077602606e-05, + "loss": 0.5764, + "step": 75840 + }, + { + "epoch": 1.2175155299443008, + "grad_norm": 1.1087374687194824, + "learning_rate": 1.6628061351875546e-05, + "loss": 0.7149, + "step": 75850 + }, + { + "epoch": 1.217676046164465, + "grad_norm": 1.0306813716888428, + "learning_rate": 1.6622121158554632e-05, + "loss": 0.6394, + "step": 75860 + }, + { + "epoch": 1.217836562384629, + "grad_norm": 1.393205165863037, + "learning_rate": 1.661618149801764e-05, + "loss": 0.6723, + "step": 75870 + }, + { + "epoch": 1.217997078604793, + "grad_norm": 0.9247450232505798, + "learning_rate": 1.6610242370642292e-05, + "loss": 0.7506, + "step": 75880 + }, + { + "epoch": 1.218157594824957, + "grad_norm": 0.9228038191795349, + "learning_rate": 1.660430377680629e-05, + "loss": 0.6734, + "step": 75890 + }, + { + "epoch": 1.218318111045121, + "grad_norm": 1.059127926826477, + "learning_rate": 1.659836571688729e-05, + "loss": 0.6545, + "step": 75900 + }, + { + "epoch": 1.2184786272652852, + "grad_norm": 1.49658203125, + "learning_rate": 1.659242819126292e-05, + "loss": 0.6965, + "step": 75910 + }, + { + "epoch": 1.2186391434854493, + "grad_norm": 1.0032744407653809, + "learning_rate": 1.6586491200310776e-05, + "loss": 0.7179, + "step": 75920 + }, + { + "epoch": 1.2187996597056132, + "grad_norm": 1.4951251745224, + "learning_rate": 1.6580554744408403e-05, + "loss": 0.7276, + "step": 75930 + }, + { + "epoch": 1.2189601759257773, + "grad_norm": 1.0577092170715332, + "learning_rate": 1.657461882393333e-05, + "loss": 0.6668, + "step": 75940 + }, + { + "epoch": 1.2191206921459414, + "grad_norm": 0.6695976853370667, + "learning_rate": 1.6568683439263046e-05, + "loss": 0.7319, + "step": 75950 + }, + { + "epoch": 1.2192812083661053, + "grad_norm": 0.8474462628364563, + "learning_rate": 1.656274859077501e-05, + "loss": 0.7185, + "step": 75960 + }, + { + "epoch": 1.2194417245862694, + "grad_norm": 0.6999849081039429, + "learning_rate": 1.655681427884664e-05, + "loss": 0.6932, + "step": 75970 + }, + { + "epoch": 1.2196022408064335, + "grad_norm": 1.1968876123428345, + "learning_rate": 1.6550880503855336e-05, + "loss": 0.6677, + "step": 75980 + }, + { + "epoch": 1.2197627570265976, + "grad_norm": 1.015384554862976, + "learning_rate": 1.6544947266178433e-05, + "loss": 0.7547, + "step": 75990 + }, + { + "epoch": 1.2199232732467615, + "grad_norm": 0.8121821880340576, + "learning_rate": 1.6539014566193255e-05, + "loss": 0.7139, + "step": 76000 + }, + { + "epoch": 1.2199232732467615, + "eval_loss": 0.7756150960922241, + "eval_runtime": 1834.05, + "eval_samples_per_second": 14.302, + "eval_steps_per_second": 1.788, + "step": 76000 + }, + { + "epoch": 1.2200837894669256, + "grad_norm": 1.3671964406967163, + "learning_rate": 1.6533082404277096e-05, + "loss": 0.6777, + "step": 76010 + }, + { + "epoch": 1.2202443056870897, + "grad_norm": 0.7997838258743286, + "learning_rate": 1.6527150780807195e-05, + "loss": 0.7976, + "step": 76020 + }, + { + "epoch": 1.2204048219072536, + "grad_norm": 1.3918226957321167, + "learning_rate": 1.6521219696160773e-05, + "loss": 0.7174, + "step": 76030 + }, + { + "epoch": 1.2205653381274177, + "grad_norm": 0.5238789916038513, + "learning_rate": 1.6515289150715023e-05, + "loss": 0.691, + "step": 76040 + }, + { + "epoch": 1.2207258543475819, + "grad_norm": 0.8012605905532837, + "learning_rate": 1.6509359144847072e-05, + "loss": 0.7149, + "step": 76050 + }, + { + "epoch": 1.220886370567746, + "grad_norm": 0.8940675258636475, + "learning_rate": 1.6503429678934046e-05, + "loss": 0.6881, + "step": 76060 + }, + { + "epoch": 1.2210468867879098, + "grad_norm": 0.8619933128356934, + "learning_rate": 1.6497500753353024e-05, + "loss": 0.772, + "step": 76070 + }, + { + "epoch": 1.221207403008074, + "grad_norm": 1.88601553440094, + "learning_rate": 1.6491572368481046e-05, + "loss": 0.6559, + "step": 76080 + }, + { + "epoch": 1.221367919228238, + "grad_norm": 1.251119613647461, + "learning_rate": 1.6485644524695132e-05, + "loss": 0.7129, + "step": 76090 + }, + { + "epoch": 1.221528435448402, + "grad_norm": 0.82793128490448, + "learning_rate": 1.647971722237225e-05, + "loss": 0.6583, + "step": 76100 + }, + { + "epoch": 1.221688951668566, + "grad_norm": 0.982830286026001, + "learning_rate": 1.6473790461889344e-05, + "loss": 0.6873, + "step": 76110 + }, + { + "epoch": 1.2218494678887302, + "grad_norm": 0.7644674181938171, + "learning_rate": 1.646786424362332e-05, + "loss": 0.6792, + "step": 76120 + }, + { + "epoch": 1.2220099841088943, + "grad_norm": 0.9900162220001221, + "learning_rate": 1.6461938567951047e-05, + "loss": 0.6076, + "step": 76130 + }, + { + "epoch": 1.2221705003290582, + "grad_norm": 0.6671926379203796, + "learning_rate": 1.6456013435249373e-05, + "loss": 0.6887, + "step": 76140 + }, + { + "epoch": 1.2223310165492223, + "grad_norm": 0.9148052334785461, + "learning_rate": 1.6450088845895095e-05, + "loss": 0.7762, + "step": 76150 + }, + { + "epoch": 1.2224915327693864, + "grad_norm": 1.1857444047927856, + "learning_rate": 1.6444164800264988e-05, + "loss": 0.6826, + "step": 76160 + }, + { + "epoch": 1.2226520489895505, + "grad_norm": 0.9412534832954407, + "learning_rate": 1.643824129873578e-05, + "loss": 0.6783, + "step": 76170 + }, + { + "epoch": 1.2228125652097144, + "grad_norm": 0.652093768119812, + "learning_rate": 1.643231834168417e-05, + "loss": 0.7391, + "step": 76180 + }, + { + "epoch": 1.2229730814298785, + "grad_norm": 0.7920336723327637, + "learning_rate": 1.6426395929486826e-05, + "loss": 0.6503, + "step": 76190 + }, + { + "epoch": 1.2231335976500426, + "grad_norm": 1.3818775415420532, + "learning_rate": 1.642047406252038e-05, + "loss": 0.8076, + "step": 76200 + }, + { + "epoch": 1.2232941138702067, + "grad_norm": 0.6173205375671387, + "learning_rate": 1.6414552741161427e-05, + "loss": 0.711, + "step": 76210 + }, + { + "epoch": 1.2234546300903706, + "grad_norm": 0.7616480588912964, + "learning_rate": 1.6408631965786525e-05, + "loss": 0.7589, + "step": 76220 + }, + { + "epoch": 1.2236151463105347, + "grad_norm": 0.7124353051185608, + "learning_rate": 1.640271173677222e-05, + "loss": 0.6911, + "step": 76230 + }, + { + "epoch": 1.2237756625306988, + "grad_norm": 0.9409170746803284, + "learning_rate": 1.6396792054494973e-05, + "loss": 0.8313, + "step": 76240 + }, + { + "epoch": 1.2239361787508627, + "grad_norm": 1.0971258878707886, + "learning_rate": 1.6390872919331257e-05, + "loss": 0.7324, + "step": 76250 + }, + { + "epoch": 1.2240966949710268, + "grad_norm": 1.2176008224487305, + "learning_rate": 1.6384954331657493e-05, + "loss": 0.7531, + "step": 76260 + }, + { + "epoch": 1.224257211191191, + "grad_norm": 0.5860143303871155, + "learning_rate": 1.6379036291850063e-05, + "loss": 0.6634, + "step": 76270 + }, + { + "epoch": 1.224417727411355, + "grad_norm": 1.1127169132232666, + "learning_rate": 1.637311880028533e-05, + "loss": 0.7021, + "step": 76280 + }, + { + "epoch": 1.224578243631519, + "grad_norm": 1.1114559173583984, + "learning_rate": 1.6367201857339618e-05, + "loss": 0.6304, + "step": 76290 + }, + { + "epoch": 1.224738759851683, + "grad_norm": 0.8631250858306885, + "learning_rate": 1.6361285463389185e-05, + "loss": 0.7226, + "step": 76300 + }, + { + "epoch": 1.2248992760718471, + "grad_norm": 1.0821104049682617, + "learning_rate": 1.6355369618810295e-05, + "loss": 0.6868, + "step": 76310 + }, + { + "epoch": 1.225059792292011, + "grad_norm": 1.0687899589538574, + "learning_rate": 1.6349454323979156e-05, + "loss": 0.6787, + "step": 76320 + }, + { + "epoch": 1.2252203085121751, + "grad_norm": 1.1406304836273193, + "learning_rate": 1.6343539579271952e-05, + "loss": 0.6627, + "step": 76330 + }, + { + "epoch": 1.2253808247323392, + "grad_norm": 1.1708860397338867, + "learning_rate": 1.633762538506482e-05, + "loss": 0.639, + "step": 76340 + }, + { + "epoch": 1.2255413409525033, + "grad_norm": 0.998625636100769, + "learning_rate": 1.6331711741733875e-05, + "loss": 0.7032, + "step": 76350 + }, + { + "epoch": 1.2257018571726672, + "grad_norm": 1.0031108856201172, + "learning_rate": 1.632579864965519e-05, + "loss": 0.74, + "step": 76360 + }, + { + "epoch": 1.2258623733928313, + "grad_norm": 0.9849521517753601, + "learning_rate": 1.6319886109204788e-05, + "loss": 0.7685, + "step": 76370 + }, + { + "epoch": 1.2260228896129954, + "grad_norm": 0.8079798221588135, + "learning_rate": 1.6313974120758684e-05, + "loss": 0.6105, + "step": 76380 + }, + { + "epoch": 1.2261834058331593, + "grad_norm": 0.811233401298523, + "learning_rate": 1.6308062684692842e-05, + "loss": 0.7008, + "step": 76390 + }, + { + "epoch": 1.2263439220533234, + "grad_norm": 1.141463041305542, + "learning_rate": 1.63021518013832e-05, + "loss": 0.6931, + "step": 76400 + }, + { + "epoch": 1.2265044382734875, + "grad_norm": 0.77812260389328, + "learning_rate": 1.629624147120566e-05, + "loss": 0.6751, + "step": 76410 + }, + { + "epoch": 1.2266649544936516, + "grad_norm": 1.2352526187896729, + "learning_rate": 1.6290331694536067e-05, + "loss": 0.7453, + "step": 76420 + }, + { + "epoch": 1.2268254707138155, + "grad_norm": 0.5737971067428589, + "learning_rate": 1.6284422471750256e-05, + "loss": 0.6657, + "step": 76430 + }, + { + "epoch": 1.2269859869339796, + "grad_norm": 0.8163816928863525, + "learning_rate": 1.6278513803224023e-05, + "loss": 0.6527, + "step": 76440 + }, + { + "epoch": 1.2271465031541438, + "grad_norm": 0.9617181420326233, + "learning_rate": 1.6272605689333127e-05, + "loss": 0.6738, + "step": 76450 + }, + { + "epoch": 1.2273070193743079, + "grad_norm": 1.2394630908966064, + "learning_rate": 1.6266698130453278e-05, + "loss": 0.823, + "step": 76460 + }, + { + "epoch": 1.2274675355944717, + "grad_norm": 1.3383015394210815, + "learning_rate": 1.6260791126960173e-05, + "loss": 0.721, + "step": 76470 + }, + { + "epoch": 1.2276280518146359, + "grad_norm": 1.4451340436935425, + "learning_rate": 1.625488467922946e-05, + "loss": 0.7896, + "step": 76480 + }, + { + "epoch": 1.2277885680348, + "grad_norm": 0.7281571626663208, + "learning_rate": 1.624897878763675e-05, + "loss": 0.6052, + "step": 76490 + }, + { + "epoch": 1.227949084254964, + "grad_norm": 1.0024431943893433, + "learning_rate": 1.6243073452557625e-05, + "loss": 0.7604, + "step": 76500 + }, + { + "epoch": 1.228109600475128, + "grad_norm": 0.6521099209785461, + "learning_rate": 1.6237168674367632e-05, + "loss": 0.6586, + "step": 76510 + }, + { + "epoch": 1.228270116695292, + "grad_norm": 0.720816433429718, + "learning_rate": 1.6231264453442273e-05, + "loss": 0.7287, + "step": 76520 + }, + { + "epoch": 1.2284306329154562, + "grad_norm": 1.1535691022872925, + "learning_rate": 1.6225360790157036e-05, + "loss": 0.793, + "step": 76530 + }, + { + "epoch": 1.22859114913562, + "grad_norm": 0.8121287822723389, + "learning_rate": 1.6219457684887353e-05, + "loss": 0.6743, + "step": 76540 + }, + { + "epoch": 1.2287516653557842, + "grad_norm": 0.8234773278236389, + "learning_rate": 1.6213555138008617e-05, + "loss": 0.6514, + "step": 76550 + }, + { + "epoch": 1.2289121815759483, + "grad_norm": 1.0156023502349854, + "learning_rate": 1.6207653149896207e-05, + "loss": 0.7856, + "step": 76560 + }, + { + "epoch": 1.2290726977961124, + "grad_norm": 0.8179785013198853, + "learning_rate": 1.6201751720925446e-05, + "loss": 0.7078, + "step": 76570 + }, + { + "epoch": 1.2292332140162763, + "grad_norm": 0.7560521960258484, + "learning_rate": 1.6195850851471635e-05, + "loss": 0.6928, + "step": 76580 + }, + { + "epoch": 1.2293937302364404, + "grad_norm": 0.8139846920967102, + "learning_rate": 1.618995054191004e-05, + "loss": 0.676, + "step": 76590 + }, + { + "epoch": 1.2295542464566045, + "grad_norm": 0.9107058048248291, + "learning_rate": 1.6184050792615884e-05, + "loss": 0.7562, + "step": 76600 + }, + { + "epoch": 1.2297147626767684, + "grad_norm": 1.100073218345642, + "learning_rate": 1.6178151603964347e-05, + "loss": 0.6111, + "step": 76610 + }, + { + "epoch": 1.2298752788969325, + "grad_norm": 1.2539583444595337, + "learning_rate": 1.617225297633059e-05, + "loss": 0.7149, + "step": 76620 + }, + { + "epoch": 1.2300357951170966, + "grad_norm": 1.2710083723068237, + "learning_rate": 1.6166354910089725e-05, + "loss": 0.6617, + "step": 76630 + }, + { + "epoch": 1.2301963113372607, + "grad_norm": 1.7488124370574951, + "learning_rate": 1.6160457405616846e-05, + "loss": 0.7335, + "step": 76640 + }, + { + "epoch": 1.2303568275574246, + "grad_norm": 0.8874931931495667, + "learning_rate": 1.6154560463286988e-05, + "loss": 0.6476, + "step": 76650 + }, + { + "epoch": 1.2305173437775887, + "grad_norm": 0.9792973399162292, + "learning_rate": 1.6148664083475176e-05, + "loss": 0.6454, + "step": 76660 + }, + { + "epoch": 1.2306778599977528, + "grad_norm": 1.0696841478347778, + "learning_rate": 1.6142768266556372e-05, + "loss": 0.6249, + "step": 76670 + }, + { + "epoch": 1.2308383762179167, + "grad_norm": 1.0005919933319092, + "learning_rate": 1.6136873012905517e-05, + "loss": 0.7483, + "step": 76680 + }, + { + "epoch": 1.2309988924380808, + "grad_norm": 1.002068281173706, + "learning_rate": 1.613097832289752e-05, + "loss": 0.7612, + "step": 76690 + }, + { + "epoch": 1.231159408658245, + "grad_norm": 0.7016863822937012, + "learning_rate": 1.612508419690724e-05, + "loss": 0.7405, + "step": 76700 + }, + { + "epoch": 1.231319924878409, + "grad_norm": 0.8668609261512756, + "learning_rate": 1.611919063530952e-05, + "loss": 0.7731, + "step": 76710 + }, + { + "epoch": 1.2314804410985731, + "grad_norm": 1.133926510810852, + "learning_rate": 1.6113297638479147e-05, + "loss": 0.5841, + "step": 76720 + }, + { + "epoch": 1.231640957318737, + "grad_norm": 1.5263912677764893, + "learning_rate": 1.6107405206790895e-05, + "loss": 0.768, + "step": 76730 + }, + { + "epoch": 1.2318014735389011, + "grad_norm": 1.1855007410049438, + "learning_rate": 1.6101513340619464e-05, + "loss": 0.6043, + "step": 76740 + }, + { + "epoch": 1.2319619897590652, + "grad_norm": 1.1042156219482422, + "learning_rate": 1.609562204033956e-05, + "loss": 0.6799, + "step": 76750 + }, + { + "epoch": 1.2321225059792291, + "grad_norm": 0.948907196521759, + "learning_rate": 1.6089731306325834e-05, + "loss": 0.7394, + "step": 76760 + }, + { + "epoch": 1.2322830221993932, + "grad_norm": 1.187520980834961, + "learning_rate": 1.6083841138952894e-05, + "loss": 0.8142, + "step": 76770 + }, + { + "epoch": 1.2324435384195573, + "grad_norm": 1.087785243988037, + "learning_rate": 1.6077951538595325e-05, + "loss": 0.7075, + "step": 76780 + }, + { + "epoch": 1.2326040546397214, + "grad_norm": 1.1913033723831177, + "learning_rate": 1.6072062505627682e-05, + "loss": 0.6241, + "step": 76790 + }, + { + "epoch": 1.2327645708598853, + "grad_norm": 0.9269064664840698, + "learning_rate": 1.6066174040424452e-05, + "loss": 0.621, + "step": 76800 + }, + { + "epoch": 1.2329250870800494, + "grad_norm": 0.9890971779823303, + "learning_rate": 1.606028614336012e-05, + "loss": 0.8424, + "step": 76810 + }, + { + "epoch": 1.2330856033002136, + "grad_norm": 1.2411364316940308, + "learning_rate": 1.6054398814809118e-05, + "loss": 0.6724, + "step": 76820 + }, + { + "epoch": 1.2332461195203774, + "grad_norm": 1.3956084251403809, + "learning_rate": 1.6048512055145843e-05, + "loss": 0.8196, + "step": 76830 + }, + { + "epoch": 1.2334066357405415, + "grad_norm": 1.477284550666809, + "learning_rate": 1.6042625864744663e-05, + "loss": 0.741, + "step": 76840 + }, + { + "epoch": 1.2335671519607057, + "grad_norm": 0.7233285307884216, + "learning_rate": 1.6036740243979913e-05, + "loss": 0.7271, + "step": 76850 + }, + { + "epoch": 1.2337276681808698, + "grad_norm": 0.869944155216217, + "learning_rate": 1.6030855193225864e-05, + "loss": 0.803, + "step": 76860 + }, + { + "epoch": 1.2338881844010336, + "grad_norm": 0.8182713985443115, + "learning_rate": 1.602497071285678e-05, + "loss": 0.7285, + "step": 76870 + }, + { + "epoch": 1.2340487006211978, + "grad_norm": 0.8666988611221313, + "learning_rate": 1.6019086803246886e-05, + "loss": 0.8075, + "step": 76880 + }, + { + "epoch": 1.2342092168413619, + "grad_norm": 1.316752314567566, + "learning_rate": 1.6013203464770355e-05, + "loss": 0.6457, + "step": 76890 + }, + { + "epoch": 1.2343697330615258, + "grad_norm": 0.8563502430915833, + "learning_rate": 1.6007320697801334e-05, + "loss": 0.7521, + "step": 76900 + }, + { + "epoch": 1.2345302492816899, + "grad_norm": 0.6626951098442078, + "learning_rate": 1.600143850271395e-05, + "loss": 0.6666, + "step": 76910 + }, + { + "epoch": 1.234690765501854, + "grad_norm": 1.1443331241607666, + "learning_rate": 1.5995556879882246e-05, + "loss": 0.704, + "step": 76920 + }, + { + "epoch": 1.234851281722018, + "grad_norm": 1.030880331993103, + "learning_rate": 1.5989675829680277e-05, + "loss": 0.5833, + "step": 76930 + }, + { + "epoch": 1.235011797942182, + "grad_norm": 0.9933192729949951, + "learning_rate": 1.5983795352482038e-05, + "loss": 0.6569, + "step": 76940 + }, + { + "epoch": 1.235172314162346, + "grad_norm": 0.6981860995292664, + "learning_rate": 1.5977915448661496e-05, + "loss": 0.7187, + "step": 76950 + }, + { + "epoch": 1.2353328303825102, + "grad_norm": 0.7626198530197144, + "learning_rate": 1.5972036118592575e-05, + "loss": 0.715, + "step": 76960 + }, + { + "epoch": 1.2354933466026743, + "grad_norm": 0.8867999315261841, + "learning_rate": 1.5966157362649173e-05, + "loss": 0.6739, + "step": 76970 + }, + { + "epoch": 1.2356538628228382, + "grad_norm": 0.8598698377609253, + "learning_rate": 1.5960279181205145e-05, + "loss": 0.7585, + "step": 76980 + }, + { + "epoch": 1.2358143790430023, + "grad_norm": 1.0757862329483032, + "learning_rate": 1.5954401574634297e-05, + "loss": 0.7714, + "step": 76990 + }, + { + "epoch": 1.2359748952631664, + "grad_norm": 1.1475613117218018, + "learning_rate": 1.5948524543310413e-05, + "loss": 0.7462, + "step": 77000 + }, + { + "epoch": 1.2361354114833305, + "grad_norm": 1.6067026853561401, + "learning_rate": 1.5942648087607242e-05, + "loss": 0.7134, + "step": 77010 + }, + { + "epoch": 1.2362959277034944, + "grad_norm": 1.1131185293197632, + "learning_rate": 1.5936772207898494e-05, + "loss": 0.7557, + "step": 77020 + }, + { + "epoch": 1.2364564439236585, + "grad_norm": 1.062626600265503, + "learning_rate": 1.5930896904557834e-05, + "loss": 0.6152, + "step": 77030 + }, + { + "epoch": 1.2366169601438226, + "grad_norm": 0.7655528783798218, + "learning_rate": 1.592502217795891e-05, + "loss": 0.6486, + "step": 77040 + }, + { + "epoch": 1.2367774763639865, + "grad_norm": 0.8441984057426453, + "learning_rate": 1.5919148028475306e-05, + "loss": 0.7709, + "step": 77050 + }, + { + "epoch": 1.2369379925841506, + "grad_norm": 1.0412907600402832, + "learning_rate": 1.5913274456480585e-05, + "loss": 0.7806, + "step": 77060 + }, + { + "epoch": 1.2370985088043147, + "grad_norm": 0.8645069003105164, + "learning_rate": 1.5907401462348274e-05, + "loss": 0.6779, + "step": 77070 + }, + { + "epoch": 1.2372590250244788, + "grad_norm": 1.3142188787460327, + "learning_rate": 1.5901529046451862e-05, + "loss": 0.7018, + "step": 77080 + }, + { + "epoch": 1.2374195412446427, + "grad_norm": 0.822442352771759, + "learning_rate": 1.5895657209164805e-05, + "loss": 0.7691, + "step": 77090 + }, + { + "epoch": 1.2375800574648068, + "grad_norm": 0.7355928421020508, + "learning_rate": 1.5889785950860515e-05, + "loss": 0.6648, + "step": 77100 + }, + { + "epoch": 1.237740573684971, + "grad_norm": 1.15119469165802, + "learning_rate": 1.5883915271912365e-05, + "loss": 0.7935, + "step": 77110 + }, + { + "epoch": 1.2379010899051348, + "grad_norm": 0.676233172416687, + "learning_rate": 1.5878045172693693e-05, + "loss": 0.65, + "step": 77120 + }, + { + "epoch": 1.238061606125299, + "grad_norm": 1.1753298044204712, + "learning_rate": 1.587217565357781e-05, + "loss": 0.6774, + "step": 77130 + }, + { + "epoch": 1.238222122345463, + "grad_norm": 0.9730895161628723, + "learning_rate": 1.5866306714937977e-05, + "loss": 0.7431, + "step": 77140 + }, + { + "epoch": 1.2383826385656271, + "grad_norm": 1.7579642534255981, + "learning_rate": 1.586043835714743e-05, + "loss": 0.7414, + "step": 77150 + }, + { + "epoch": 1.238543154785791, + "grad_norm": 0.8260015845298767, + "learning_rate": 1.5854570580579367e-05, + "loss": 0.7284, + "step": 77160 + }, + { + "epoch": 1.2387036710059551, + "grad_norm": 0.8527688980102539, + "learning_rate": 1.584870338560693e-05, + "loss": 0.6766, + "step": 77170 + }, + { + "epoch": 1.2388641872261192, + "grad_norm": 1.5507844686508179, + "learning_rate": 1.5842836772603243e-05, + "loss": 0.7477, + "step": 77180 + }, + { + "epoch": 1.2390247034462831, + "grad_norm": 0.8597396612167358, + "learning_rate": 1.5836970741941392e-05, + "loss": 0.6811, + "step": 77190 + }, + { + "epoch": 1.2391852196664472, + "grad_norm": 1.196641445159912, + "learning_rate": 1.5831105293994418e-05, + "loss": 0.7997, + "step": 77200 + }, + { + "epoch": 1.2393457358866113, + "grad_norm": 0.8654150366783142, + "learning_rate": 1.5825240429135336e-05, + "loss": 0.8076, + "step": 77210 + }, + { + "epoch": 1.2395062521067755, + "grad_norm": 1.2534576654434204, + "learning_rate": 1.5819376147737116e-05, + "loss": 0.6522, + "step": 77220 + }, + { + "epoch": 1.2396667683269396, + "grad_norm": 1.9643033742904663, + "learning_rate": 1.581351245017268e-05, + "loss": 0.8093, + "step": 77230 + }, + { + "epoch": 1.2398272845471034, + "grad_norm": 0.8370552062988281, + "learning_rate": 1.5807649336814935e-05, + "loss": 0.7755, + "step": 77240 + }, + { + "epoch": 1.2399878007672676, + "grad_norm": 1.0051895380020142, + "learning_rate": 1.580178680803674e-05, + "loss": 0.7097, + "step": 77250 + }, + { + "epoch": 1.2401483169874317, + "grad_norm": 1.0468766689300537, + "learning_rate": 1.5795924864210916e-05, + "loss": 0.748, + "step": 77260 + }, + { + "epoch": 1.2403088332075956, + "grad_norm": 0.8131490349769592, + "learning_rate": 1.5790063505710242e-05, + "loss": 0.6485, + "step": 77270 + }, + { + "epoch": 1.2404693494277597, + "grad_norm": 0.7982804775238037, + "learning_rate": 1.5784202732907478e-05, + "loss": 0.7515, + "step": 77280 + }, + { + "epoch": 1.2406298656479238, + "grad_norm": 0.7328596711158752, + "learning_rate": 1.577834254617533e-05, + "loss": 0.6456, + "step": 77290 + }, + { + "epoch": 1.2407903818680879, + "grad_norm": 2.0198254585266113, + "learning_rate": 1.5772482945886476e-05, + "loss": 0.6835, + "step": 77300 + }, + { + "epoch": 1.2409508980882518, + "grad_norm": 0.6335110068321228, + "learning_rate": 1.576662393241354e-05, + "loss": 0.6363, + "step": 77310 + }, + { + "epoch": 1.2411114143084159, + "grad_norm": 1.0689189434051514, + "learning_rate": 1.5760765506129123e-05, + "loss": 0.7621, + "step": 77320 + }, + { + "epoch": 1.24127193052858, + "grad_norm": 0.8628936409950256, + "learning_rate": 1.5754907667405794e-05, + "loss": 0.6955, + "step": 77330 + }, + { + "epoch": 1.2414324467487439, + "grad_norm": 1.2986536026000977, + "learning_rate": 1.5749050416616074e-05, + "loss": 0.7332, + "step": 77340 + }, + { + "epoch": 1.241592962968908, + "grad_norm": 0.9235920310020447, + "learning_rate": 1.574319375413245e-05, + "loss": 0.7949, + "step": 77350 + }, + { + "epoch": 1.241753479189072, + "grad_norm": 0.9518804550170898, + "learning_rate": 1.5737337680327374e-05, + "loss": 0.6939, + "step": 77360 + }, + { + "epoch": 1.2419139954092362, + "grad_norm": 0.9650624990463257, + "learning_rate": 1.573148219557325e-05, + "loss": 0.7581, + "step": 77370 + }, + { + "epoch": 1.2420745116294, + "grad_norm": 0.7801077365875244, + "learning_rate": 1.5725627300242464e-05, + "loss": 0.6624, + "step": 77380 + }, + { + "epoch": 1.2422350278495642, + "grad_norm": 0.9486892819404602, + "learning_rate": 1.571977299470734e-05, + "loss": 0.7393, + "step": 77390 + }, + { + "epoch": 1.2423955440697283, + "grad_norm": 0.9774121046066284, + "learning_rate": 1.5713919279340188e-05, + "loss": 0.6951, + "step": 77400 + }, + { + "epoch": 1.2425560602898922, + "grad_norm": 0.8076041340827942, + "learning_rate": 1.5708066154513273e-05, + "loss": 0.723, + "step": 77410 + }, + { + "epoch": 1.2427165765100563, + "grad_norm": 0.9370454549789429, + "learning_rate": 1.5702213620598803e-05, + "loss": 0.7216, + "step": 77420 + }, + { + "epoch": 1.2428770927302204, + "grad_norm": 1.1324490308761597, + "learning_rate": 1.569636167796898e-05, + "loss": 0.6162, + "step": 77430 + }, + { + "epoch": 1.2430376089503845, + "grad_norm": 0.8688700795173645, + "learning_rate": 1.5690510326995943e-05, + "loss": 0.8617, + "step": 77440 + }, + { + "epoch": 1.2431981251705484, + "grad_norm": 1.1917792558670044, + "learning_rate": 1.5684659568051808e-05, + "loss": 0.7306, + "step": 77450 + }, + { + "epoch": 1.2433586413907125, + "grad_norm": 1.0771329402923584, + "learning_rate": 1.5678809401508648e-05, + "loss": 0.6691, + "step": 77460 + }, + { + "epoch": 1.2435191576108766, + "grad_norm": 1.622384786605835, + "learning_rate": 1.567295982773851e-05, + "loss": 0.7155, + "step": 77470 + }, + { + "epoch": 1.2436796738310407, + "grad_norm": 1.0841823816299438, + "learning_rate": 1.5667110847113377e-05, + "loss": 0.6784, + "step": 77480 + }, + { + "epoch": 1.2438401900512046, + "grad_norm": 0.8400924801826477, + "learning_rate": 1.566126246000521e-05, + "loss": 0.7612, + "step": 77490 + }, + { + "epoch": 1.2440007062713687, + "grad_norm": 1.078747034072876, + "learning_rate": 1.5655414666785943e-05, + "loss": 0.595, + "step": 77500 + }, + { + "epoch": 1.2441612224915328, + "grad_norm": 1.0115129947662354, + "learning_rate": 1.5649567467827454e-05, + "loss": 0.5177, + "step": 77510 + }, + { + "epoch": 1.244321738711697, + "grad_norm": 1.0470679998397827, + "learning_rate": 1.564372086350159e-05, + "loss": 0.6503, + "step": 77520 + }, + { + "epoch": 1.2444822549318608, + "grad_norm": 0.8517902493476868, + "learning_rate": 1.5637874854180168e-05, + "loss": 0.8097, + "step": 77530 + }, + { + "epoch": 1.244642771152025, + "grad_norm": 1.1235226392745972, + "learning_rate": 1.563202944023496e-05, + "loss": 0.776, + "step": 77540 + }, + { + "epoch": 1.244803287372189, + "grad_norm": 0.8043145537376404, + "learning_rate": 1.562618462203769e-05, + "loss": 0.7064, + "step": 77550 + }, + { + "epoch": 1.244963803592353, + "grad_norm": 0.9850381016731262, + "learning_rate": 1.5620340399960053e-05, + "loss": 0.8089, + "step": 77560 + }, + { + "epoch": 1.245124319812517, + "grad_norm": 0.9093686938285828, + "learning_rate": 1.5614496774373722e-05, + "loss": 0.6952, + "step": 77570 + }, + { + "epoch": 1.2452848360326811, + "grad_norm": 1.060164213180542, + "learning_rate": 1.5608653745650303e-05, + "loss": 0.7181, + "step": 77580 + }, + { + "epoch": 1.2454453522528453, + "grad_norm": 0.7592608332633972, + "learning_rate": 1.5602811314161385e-05, + "loss": 0.7775, + "step": 77590 + }, + { + "epoch": 1.2456058684730091, + "grad_norm": 1.0255451202392578, + "learning_rate": 1.5596969480278518e-05, + "loss": 0.6825, + "step": 77600 + }, + { + "epoch": 1.2457663846931732, + "grad_norm": 0.9439435601234436, + "learning_rate": 1.55911282443732e-05, + "loss": 0.7201, + "step": 77610 + }, + { + "epoch": 1.2459269009133374, + "grad_norm": 1.0897480249404907, + "learning_rate": 1.55852876068169e-05, + "loss": 0.7172, + "step": 77620 + }, + { + "epoch": 1.2460874171335012, + "grad_norm": 0.8171595931053162, + "learning_rate": 1.5579447567981043e-05, + "loss": 0.669, + "step": 77630 + }, + { + "epoch": 1.2462479333536653, + "grad_norm": 1.2870832681655884, + "learning_rate": 1.5573608128237032e-05, + "loss": 0.7066, + "step": 77640 + }, + { + "epoch": 1.2464084495738295, + "grad_norm": 1.181403398513794, + "learning_rate": 1.5567769287956215e-05, + "loss": 0.6826, + "step": 77650 + }, + { + "epoch": 1.2465689657939936, + "grad_norm": 0.8219736814498901, + "learning_rate": 1.5561931047509918e-05, + "loss": 0.6238, + "step": 77660 + }, + { + "epoch": 1.2467294820141575, + "grad_norm": 1.2957763671875, + "learning_rate": 1.5556093407269402e-05, + "loss": 0.5978, + "step": 77670 + }, + { + "epoch": 1.2468899982343216, + "grad_norm": 1.0281533002853394, + "learning_rate": 1.5550256367605918e-05, + "loss": 0.7182, + "step": 77680 + }, + { + "epoch": 1.2470505144544857, + "grad_norm": 0.9023261666297913, + "learning_rate": 1.5544419928890664e-05, + "loss": 0.6221, + "step": 77690 + }, + { + "epoch": 1.2472110306746496, + "grad_norm": 1.364530324935913, + "learning_rate": 1.55385840914948e-05, + "loss": 0.6497, + "step": 77700 + }, + { + "epoch": 1.2473715468948137, + "grad_norm": 0.7460383772850037, + "learning_rate": 1.5532748855789458e-05, + "loss": 0.6379, + "step": 77710 + }, + { + "epoch": 1.2475320631149778, + "grad_norm": 1.089057445526123, + "learning_rate": 1.5526914222145734e-05, + "loss": 0.8596, + "step": 77720 + }, + { + "epoch": 1.2476925793351419, + "grad_norm": 0.9082130789756775, + "learning_rate": 1.552108019093465e-05, + "loss": 0.5926, + "step": 77730 + }, + { + "epoch": 1.2478530955553058, + "grad_norm": 0.7261349558830261, + "learning_rate": 1.551524676252723e-05, + "loss": 0.6457, + "step": 77740 + }, + { + "epoch": 1.2480136117754699, + "grad_norm": 1.6714060306549072, + "learning_rate": 1.5509413937294448e-05, + "loss": 0.7287, + "step": 77750 + }, + { + "epoch": 1.248174127995634, + "grad_norm": 0.9495734572410583, + "learning_rate": 1.5503581715607237e-05, + "loss": 0.8562, + "step": 77760 + }, + { + "epoch": 1.248334644215798, + "grad_norm": 1.1501859426498413, + "learning_rate": 1.549775009783649e-05, + "loss": 0.6998, + "step": 77770 + }, + { + "epoch": 1.248495160435962, + "grad_norm": 1.1986716985702515, + "learning_rate": 1.5491919084353062e-05, + "loss": 0.6183, + "step": 77780 + }, + { + "epoch": 1.248655676656126, + "grad_norm": 0.9408127069473267, + "learning_rate": 1.5486088675527784e-05, + "loss": 0.7009, + "step": 77790 + }, + { + "epoch": 1.2488161928762902, + "grad_norm": 0.9424430131912231, + "learning_rate": 1.5480258871731418e-05, + "loss": 0.7366, + "step": 77800 + }, + { + "epoch": 1.2489767090964543, + "grad_norm": 1.138646125793457, + "learning_rate": 1.547442967333471e-05, + "loss": 0.7283, + "step": 77810 + }, + { + "epoch": 1.2491372253166182, + "grad_norm": 1.2375432252883911, + "learning_rate": 1.5468601080708368e-05, + "loss": 0.7095, + "step": 77820 + }, + { + "epoch": 1.2492977415367823, + "grad_norm": 1.1660312414169312, + "learning_rate": 1.5462773094223047e-05, + "loss": 0.7829, + "step": 77830 + }, + { + "epoch": 1.2494582577569464, + "grad_norm": 1.039412498474121, + "learning_rate": 1.5456945714249387e-05, + "loss": 0.7174, + "step": 77840 + }, + { + "epoch": 1.2496187739771103, + "grad_norm": 1.5260850191116333, + "learning_rate": 1.5451118941157974e-05, + "loss": 0.6535, + "step": 77850 + }, + { + "epoch": 1.2497792901972744, + "grad_norm": 0.9425176978111267, + "learning_rate": 1.544529277531934e-05, + "loss": 0.5019, + "step": 77860 + }, + { + "epoch": 1.2499398064174385, + "grad_norm": 1.194753885269165, + "learning_rate": 1.5439467217104007e-05, + "loss": 0.4818, + "step": 77870 + }, + { + "epoch": 1.2501003226376026, + "grad_norm": 1.2361083030700684, + "learning_rate": 1.5433642266882442e-05, + "loss": 0.6745, + "step": 77880 + }, + { + "epoch": 1.2502608388577665, + "grad_norm": 1.1705830097198486, + "learning_rate": 1.5427817925025083e-05, + "loss": 0.7851, + "step": 77890 + }, + { + "epoch": 1.2504213550779306, + "grad_norm": 0.8926587700843811, + "learning_rate": 1.5421994191902317e-05, + "loss": 0.7576, + "step": 77900 + }, + { + "epoch": 1.2505818712980947, + "grad_norm": 1.2278872728347778, + "learning_rate": 1.541617106788451e-05, + "loss": 0.7327, + "step": 77910 + }, + { + "epoch": 1.2507423875182586, + "grad_norm": 0.7900294661521912, + "learning_rate": 1.541034855334197e-05, + "loss": 0.7328, + "step": 77920 + }, + { + "epoch": 1.2509029037384227, + "grad_norm": 1.6975572109222412, + "learning_rate": 1.5404526648644972e-05, + "loss": 0.7069, + "step": 77930 + }, + { + "epoch": 1.2510634199585868, + "grad_norm": 1.5320746898651123, + "learning_rate": 1.5398705354163763e-05, + "loss": 0.6188, + "step": 77940 + }, + { + "epoch": 1.251223936178751, + "grad_norm": 0.8110014200210571, + "learning_rate": 1.5392884670268538e-05, + "loss": 0.5867, + "step": 77950 + }, + { + "epoch": 1.251384452398915, + "grad_norm": 1.0353202819824219, + "learning_rate": 1.538706459732946e-05, + "loss": 0.6924, + "step": 77960 + }, + { + "epoch": 1.251544968619079, + "grad_norm": 0.9550654292106628, + "learning_rate": 1.5381245135716654e-05, + "loss": 0.5789, + "step": 77970 + }, + { + "epoch": 1.251705484839243, + "grad_norm": 0.760597288608551, + "learning_rate": 1.53754262858002e-05, + "loss": 0.7251, + "step": 77980 + }, + { + "epoch": 1.251866001059407, + "grad_norm": 1.1562392711639404, + "learning_rate": 1.536960804795014e-05, + "loss": 0.728, + "step": 77990 + }, + { + "epoch": 1.252026517279571, + "grad_norm": 0.8720255494117737, + "learning_rate": 1.5363790422536483e-05, + "loss": 0.6719, + "step": 78000 + }, + { + "epoch": 1.2521870334997351, + "grad_norm": 0.7540183663368225, + "learning_rate": 1.5357973409929198e-05, + "loss": 0.6395, + "step": 78010 + }, + { + "epoch": 1.2523475497198993, + "grad_norm": 0.8229268193244934, + "learning_rate": 1.5352157010498215e-05, + "loss": 0.7593, + "step": 78020 + }, + { + "epoch": 1.2525080659400634, + "grad_norm": 1.0058578252792358, + "learning_rate": 1.5346341224613427e-05, + "loss": 0.769, + "step": 78030 + }, + { + "epoch": 1.2526685821602273, + "grad_norm": 1.1848362684249878, + "learning_rate": 1.534052605264466e-05, + "loss": 0.5719, + "step": 78040 + }, + { + "epoch": 1.2528290983803914, + "grad_norm": 1.273906946182251, + "learning_rate": 1.5334711494961745e-05, + "loss": 0.8056, + "step": 78050 + }, + { + "epoch": 1.2529896146005555, + "grad_norm": 0.9377033710479736, + "learning_rate": 1.532889755193445e-05, + "loss": 0.7165, + "step": 78060 + }, + { + "epoch": 1.2531501308207194, + "grad_norm": 0.99953693151474, + "learning_rate": 1.5323084223932508e-05, + "loss": 0.6939, + "step": 78070 + }, + { + "epoch": 1.2533106470408835, + "grad_norm": 0.8632817268371582, + "learning_rate": 1.531727151132561e-05, + "loss": 0.6599, + "step": 78080 + }, + { + "epoch": 1.2534711632610476, + "grad_norm": 0.9541622400283813, + "learning_rate": 1.531145941448341e-05, + "loss": 0.6535, + "step": 78090 + }, + { + "epoch": 1.2536316794812117, + "grad_norm": 0.8347681164741516, + "learning_rate": 1.530564793377554e-05, + "loss": 0.7406, + "step": 78100 + }, + { + "epoch": 1.2537921957013756, + "grad_norm": 1.127609133720398, + "learning_rate": 1.529983706957155e-05, + "loss": 0.6462, + "step": 78110 + }, + { + "epoch": 1.2539527119215397, + "grad_norm": 1.4995157718658447, + "learning_rate": 1.5294026822240988e-05, + "loss": 0.6887, + "step": 78120 + }, + { + "epoch": 1.2541132281417038, + "grad_norm": 0.6131995320320129, + "learning_rate": 1.5288217192153348e-05, + "loss": 0.5958, + "step": 78130 + }, + { + "epoch": 1.2542737443618677, + "grad_norm": 0.8423106074333191, + "learning_rate": 1.5282408179678094e-05, + "loss": 0.7396, + "step": 78140 + }, + { + "epoch": 1.2544342605820318, + "grad_norm": 1.6532248258590698, + "learning_rate": 1.5276599785184644e-05, + "loss": 0.7136, + "step": 78150 + }, + { + "epoch": 1.2545947768021959, + "grad_norm": 0.9120277762413025, + "learning_rate": 1.527079200904239e-05, + "loss": 0.5688, + "step": 78160 + }, + { + "epoch": 1.25475529302236, + "grad_norm": 1.0196369886398315, + "learning_rate": 1.5264984851620644e-05, + "loss": 0.6937, + "step": 78170 + }, + { + "epoch": 1.2549158092425239, + "grad_norm": 1.1023441553115845, + "learning_rate": 1.5259178313288726e-05, + "loss": 0.6166, + "step": 78180 + }, + { + "epoch": 1.255076325462688, + "grad_norm": 1.0747032165527344, + "learning_rate": 1.5253372394415893e-05, + "loss": 0.7551, + "step": 78190 + }, + { + "epoch": 1.255236841682852, + "grad_norm": 0.6364234089851379, + "learning_rate": 1.5247567095371374e-05, + "loss": 0.6494, + "step": 78200 + }, + { + "epoch": 1.255397357903016, + "grad_norm": 0.8451074957847595, + "learning_rate": 1.5241762416524342e-05, + "loss": 0.7121, + "step": 78210 + }, + { + "epoch": 1.25555787412318, + "grad_norm": 0.9794045090675354, + "learning_rate": 1.5235958358243952e-05, + "loss": 0.7561, + "step": 78220 + }, + { + "epoch": 1.2557183903433442, + "grad_norm": 1.3240256309509277, + "learning_rate": 1.5230154920899299e-05, + "loss": 0.7159, + "step": 78230 + }, + { + "epoch": 1.2558789065635083, + "grad_norm": 0.9215763807296753, + "learning_rate": 1.5224352104859446e-05, + "loss": 0.7148, + "step": 78240 + }, + { + "epoch": 1.2560394227836724, + "grad_norm": 1.1366441249847412, + "learning_rate": 1.521854991049342e-05, + "loss": 0.7611, + "step": 78250 + }, + { + "epoch": 1.2561999390038363, + "grad_norm": 1.4160538911819458, + "learning_rate": 1.5212748338170214e-05, + "loss": 0.7074, + "step": 78260 + }, + { + "epoch": 1.2563604552240004, + "grad_norm": 1.0566236972808838, + "learning_rate": 1.5206947388258763e-05, + "loss": 0.6826, + "step": 78270 + }, + { + "epoch": 1.2565209714441643, + "grad_norm": 1.4919419288635254, + "learning_rate": 1.5201147061127985e-05, + "loss": 0.6555, + "step": 78280 + }, + { + "epoch": 1.2566814876643284, + "grad_norm": 1.12748384475708, + "learning_rate": 1.5195347357146736e-05, + "loss": 0.6097, + "step": 78290 + }, + { + "epoch": 1.2568420038844925, + "grad_norm": 1.5902622938156128, + "learning_rate": 1.5189548276683846e-05, + "loss": 0.7082, + "step": 78300 + }, + { + "epoch": 1.2570025201046566, + "grad_norm": 1.5171328783035278, + "learning_rate": 1.5183749820108107e-05, + "loss": 0.6899, + "step": 78310 + }, + { + "epoch": 1.2571630363248207, + "grad_norm": 0.6151436567306519, + "learning_rate": 1.5177951987788263e-05, + "loss": 0.7166, + "step": 78320 + }, + { + "epoch": 1.2573235525449846, + "grad_norm": 1.2661736011505127, + "learning_rate": 1.5172154780093024e-05, + "loss": 0.7958, + "step": 78330 + }, + { + "epoch": 1.2574840687651487, + "grad_norm": 1.313694715499878, + "learning_rate": 1.5166358197391057e-05, + "loss": 0.6602, + "step": 78340 + }, + { + "epoch": 1.2576445849853128, + "grad_norm": 1.022273063659668, + "learning_rate": 1.5160562240050999e-05, + "loss": 0.8061, + "step": 78350 + }, + { + "epoch": 1.2578051012054767, + "grad_norm": 0.7297672033309937, + "learning_rate": 1.515476690844142e-05, + "loss": 0.7159, + "step": 78360 + }, + { + "epoch": 1.2579656174256408, + "grad_norm": 1.093869686126709, + "learning_rate": 1.5148972202930883e-05, + "loss": 0.8164, + "step": 78370 + }, + { + "epoch": 1.258126133645805, + "grad_norm": 1.0206948518753052, + "learning_rate": 1.5143178123887892e-05, + "loss": 0.7132, + "step": 78380 + }, + { + "epoch": 1.258286649865969, + "grad_norm": 1.0386754274368286, + "learning_rate": 1.5137384671680921e-05, + "loss": 0.7062, + "step": 78390 + }, + { + "epoch": 1.258447166086133, + "grad_norm": 1.1543747186660767, + "learning_rate": 1.5131591846678394e-05, + "loss": 0.7477, + "step": 78400 + }, + { + "epoch": 1.258607682306297, + "grad_norm": 0.6725614666938782, + "learning_rate": 1.5125799649248717e-05, + "loss": 0.6649, + "step": 78410 + }, + { + "epoch": 1.2587681985264612, + "grad_norm": 0.6450100541114807, + "learning_rate": 1.5120008079760217e-05, + "loss": 0.6295, + "step": 78420 + }, + { + "epoch": 1.258928714746625, + "grad_norm": 0.8756811618804932, + "learning_rate": 1.511421713858121e-05, + "loss": 0.7312, + "step": 78430 + }, + { + "epoch": 1.2590892309667892, + "grad_norm": 1.3817129135131836, + "learning_rate": 1.5108426826079975e-05, + "loss": 0.6821, + "step": 78440 + }, + { + "epoch": 1.2592497471869533, + "grad_norm": 0.8022101521492004, + "learning_rate": 1.5102637142624732e-05, + "loss": 0.7909, + "step": 78450 + }, + { + "epoch": 1.2594102634071174, + "grad_norm": 0.7905492782592773, + "learning_rate": 1.5096848088583673e-05, + "loss": 0.7934, + "step": 78460 + }, + { + "epoch": 1.2595707796272813, + "grad_norm": 1.29241943359375, + "learning_rate": 1.5091059664324964e-05, + "loss": 0.7463, + "step": 78470 + }, + { + "epoch": 1.2597312958474454, + "grad_norm": 1.081510066986084, + "learning_rate": 1.5085271870216688e-05, + "loss": 0.7126, + "step": 78480 + }, + { + "epoch": 1.2598918120676095, + "grad_norm": 0.8568189740180969, + "learning_rate": 1.5079484706626928e-05, + "loss": 0.6869, + "step": 78490 + }, + { + "epoch": 1.2600523282877734, + "grad_norm": 0.9175143241882324, + "learning_rate": 1.5073698173923717e-05, + "loss": 0.6456, + "step": 78500 + }, + { + "epoch": 1.2602128445079375, + "grad_norm": 0.9892019629478455, + "learning_rate": 1.5067912272475037e-05, + "loss": 0.8443, + "step": 78510 + }, + { + "epoch": 1.2603733607281016, + "grad_norm": 1.0982434749603271, + "learning_rate": 1.506212700264884e-05, + "loss": 0.7922, + "step": 78520 + }, + { + "epoch": 1.2605338769482657, + "grad_norm": 0.9929289817810059, + "learning_rate": 1.5056342364813045e-05, + "loss": 0.7847, + "step": 78530 + }, + { + "epoch": 1.2606943931684298, + "grad_norm": 0.7844652533531189, + "learning_rate": 1.5050558359335504e-05, + "loss": 0.7209, + "step": 78540 + }, + { + "epoch": 1.2608549093885937, + "grad_norm": 0.8709708452224731, + "learning_rate": 1.5044774986584059e-05, + "loss": 0.7215, + "step": 78550 + }, + { + "epoch": 1.2610154256087578, + "grad_norm": 1.7890918254852295, + "learning_rate": 1.5038992246926492e-05, + "loss": 0.706, + "step": 78560 + }, + { + "epoch": 1.2611759418289217, + "grad_norm": 1.3272250890731812, + "learning_rate": 1.503321014073055e-05, + "loss": 0.8453, + "step": 78570 + }, + { + "epoch": 1.2613364580490858, + "grad_norm": 0.7819147109985352, + "learning_rate": 1.5027428668363947e-05, + "loss": 0.8475, + "step": 78580 + }, + { + "epoch": 1.26149697426925, + "grad_norm": 0.7148312926292419, + "learning_rate": 1.502164783019435e-05, + "loss": 0.7606, + "step": 78590 + }, + { + "epoch": 1.261657490489414, + "grad_norm": 0.8301345705986023, + "learning_rate": 1.5015867626589378e-05, + "loss": 0.606, + "step": 78600 + }, + { + "epoch": 1.261818006709578, + "grad_norm": 1.0001132488250732, + "learning_rate": 1.5010088057916628e-05, + "loss": 0.6682, + "step": 78610 + }, + { + "epoch": 1.261978522929742, + "grad_norm": 0.8433442115783691, + "learning_rate": 1.5004309124543636e-05, + "loss": 0.6583, + "step": 78620 + }, + { + "epoch": 1.262139039149906, + "grad_norm": 1.0910284519195557, + "learning_rate": 1.4998530826837919e-05, + "loss": 0.749, + "step": 78630 + }, + { + "epoch": 1.2622995553700702, + "grad_norm": 1.7208117246627808, + "learning_rate": 1.499275316516694e-05, + "loss": 0.7734, + "step": 78640 + }, + { + "epoch": 1.262460071590234, + "grad_norm": 1.456334114074707, + "learning_rate": 1.4986976139898123e-05, + "loss": 0.8968, + "step": 78650 + }, + { + "epoch": 1.2626205878103982, + "grad_norm": 1.039989948272705, + "learning_rate": 1.498119975139886e-05, + "loss": 0.6576, + "step": 78660 + }, + { + "epoch": 1.2627811040305623, + "grad_norm": 0.8610975742340088, + "learning_rate": 1.4975424000036481e-05, + "loss": 0.7354, + "step": 78670 + }, + { + "epoch": 1.2629416202507264, + "grad_norm": 1.069750428199768, + "learning_rate": 1.4969648886178298e-05, + "loss": 0.6987, + "step": 78680 + }, + { + "epoch": 1.2631021364708903, + "grad_norm": 0.595093309879303, + "learning_rate": 1.496387441019157e-05, + "loss": 0.6657, + "step": 78690 + }, + { + "epoch": 1.2632626526910544, + "grad_norm": 1.1319963932037354, + "learning_rate": 1.495810057244353e-05, + "loss": 0.7243, + "step": 78700 + }, + { + "epoch": 1.2634231689112185, + "grad_norm": 1.4779959917068481, + "learning_rate": 1.4952327373301345e-05, + "loss": 0.7971, + "step": 78710 + }, + { + "epoch": 1.2635836851313824, + "grad_norm": 1.002113938331604, + "learning_rate": 1.4946554813132179e-05, + "loss": 0.7198, + "step": 78720 + }, + { + "epoch": 1.2637442013515465, + "grad_norm": 1.9772850275039673, + "learning_rate": 1.4940782892303113e-05, + "loss": 0.8419, + "step": 78730 + }, + { + "epoch": 1.2639047175717106, + "grad_norm": 1.5324887037277222, + "learning_rate": 1.493501161118121e-05, + "loss": 0.6864, + "step": 78740 + }, + { + "epoch": 1.2640652337918747, + "grad_norm": 0.9286466240882874, + "learning_rate": 1.4929240970133493e-05, + "loss": 0.7155, + "step": 78750 + }, + { + "epoch": 1.2642257500120389, + "grad_norm": 1.1371468305587769, + "learning_rate": 1.492347096952694e-05, + "loss": 0.6486, + "step": 78760 + }, + { + "epoch": 1.2643862662322027, + "grad_norm": 0.8098145127296448, + "learning_rate": 1.491770160972849e-05, + "loss": 0.7506, + "step": 78770 + }, + { + "epoch": 1.2645467824523668, + "grad_norm": 0.9170480966567993, + "learning_rate": 1.4911932891105052e-05, + "loss": 0.7802, + "step": 78780 + }, + { + "epoch": 1.2647072986725307, + "grad_norm": 0.8940113186836243, + "learning_rate": 1.4906164814023463e-05, + "loss": 0.7503, + "step": 78790 + }, + { + "epoch": 1.2648678148926948, + "grad_norm": 0.7533485293388367, + "learning_rate": 1.4900397378850545e-05, + "loss": 0.7826, + "step": 78800 + }, + { + "epoch": 1.265028331112859, + "grad_norm": 9.06766414642334, + "learning_rate": 1.4894630585953073e-05, + "loss": 0.7248, + "step": 78810 + }, + { + "epoch": 1.265188847333023, + "grad_norm": 0.7188559174537659, + "learning_rate": 1.4888864435697785e-05, + "loss": 0.6186, + "step": 78820 + }, + { + "epoch": 1.2653493635531872, + "grad_norm": 0.6450923681259155, + "learning_rate": 1.4883098928451375e-05, + "loss": 0.6244, + "step": 78830 + }, + { + "epoch": 1.265509879773351, + "grad_norm": 0.9619137644767761, + "learning_rate": 1.4877334064580494e-05, + "loss": 0.6075, + "step": 78840 + }, + { + "epoch": 1.2656703959935152, + "grad_norm": 0.9912307262420654, + "learning_rate": 1.4871569844451748e-05, + "loss": 0.7136, + "step": 78850 + }, + { + "epoch": 1.2658309122136793, + "grad_norm": 1.1968485116958618, + "learning_rate": 1.486580626843172e-05, + "loss": 0.6889, + "step": 78860 + }, + { + "epoch": 1.2659914284338432, + "grad_norm": 1.2350140810012817, + "learning_rate": 1.4860043336886925e-05, + "loss": 0.7255, + "step": 78870 + }, + { + "epoch": 1.2661519446540073, + "grad_norm": 1.0204721689224243, + "learning_rate": 1.485428105018386e-05, + "loss": 0.6939, + "step": 78880 + }, + { + "epoch": 1.2663124608741714, + "grad_norm": 0.9774091243743896, + "learning_rate": 1.4848519408688968e-05, + "loss": 0.6864, + "step": 78890 + }, + { + "epoch": 1.2664729770943355, + "grad_norm": 1.7298202514648438, + "learning_rate": 1.4842758412768659e-05, + "loss": 0.689, + "step": 78900 + }, + { + "epoch": 1.2666334933144994, + "grad_norm": 0.8060284852981567, + "learning_rate": 1.4836998062789304e-05, + "loss": 0.688, + "step": 78910 + }, + { + "epoch": 1.2667940095346635, + "grad_norm": 1.0905271768569946, + "learning_rate": 1.4831238359117216e-05, + "loss": 0.7773, + "step": 78920 + }, + { + "epoch": 1.2669545257548276, + "grad_norm": 1.2643640041351318, + "learning_rate": 1.4825479302118683e-05, + "loss": 0.6514, + "step": 78930 + }, + { + "epoch": 1.2671150419749915, + "grad_norm": 0.8211184740066528, + "learning_rate": 1.481972089215995e-05, + "loss": 0.715, + "step": 78940 + }, + { + "epoch": 1.2672755581951556, + "grad_norm": 0.7516048550605774, + "learning_rate": 1.4813963129607216e-05, + "loss": 0.7152, + "step": 78950 + }, + { + "epoch": 1.2674360744153197, + "grad_norm": 0.8329293131828308, + "learning_rate": 1.4808206014826639e-05, + "loss": 0.6919, + "step": 78960 + }, + { + "epoch": 1.2675965906354838, + "grad_norm": 1.1467076539993286, + "learning_rate": 1.480244954818435e-05, + "loss": 0.6575, + "step": 78970 + }, + { + "epoch": 1.2677571068556477, + "grad_norm": 1.5539443492889404, + "learning_rate": 1.4796693730046407e-05, + "loss": 0.6337, + "step": 78980 + }, + { + "epoch": 1.2679176230758118, + "grad_norm": 1.1217461824417114, + "learning_rate": 1.4790938560778856e-05, + "loss": 0.6829, + "step": 78990 + }, + { + "epoch": 1.268078139295976, + "grad_norm": 1.5394341945648193, + "learning_rate": 1.478518404074769e-05, + "loss": 0.6873, + "step": 79000 + }, + { + "epoch": 1.2682386555161398, + "grad_norm": 1.05461585521698, + "learning_rate": 1.4779430170318864e-05, + "loss": 0.6991, + "step": 79010 + }, + { + "epoch": 1.268399171736304, + "grad_norm": 1.7570034265518188, + "learning_rate": 1.4773676949858292e-05, + "loss": 0.6841, + "step": 79020 + }, + { + "epoch": 1.268559687956468, + "grad_norm": 1.8057029247283936, + "learning_rate": 1.4767924379731854e-05, + "loss": 0.7426, + "step": 79030 + }, + { + "epoch": 1.2687202041766321, + "grad_norm": 1.1488484144210815, + "learning_rate": 1.4762172460305357e-05, + "loss": 0.6485, + "step": 79040 + }, + { + "epoch": 1.2688807203967962, + "grad_norm": 0.9072607159614563, + "learning_rate": 1.4756421191944602e-05, + "loss": 0.7526, + "step": 79050 + }, + { + "epoch": 1.2690412366169601, + "grad_norm": 0.9462841153144836, + "learning_rate": 1.4750670575015339e-05, + "loss": 0.6368, + "step": 79060 + }, + { + "epoch": 1.2692017528371242, + "grad_norm": 1.340294599533081, + "learning_rate": 1.4744920609883266e-05, + "loss": 0.7264, + "step": 79070 + }, + { + "epoch": 1.269362269057288, + "grad_norm": 1.1286901235580444, + "learning_rate": 1.473917129691405e-05, + "loss": 0.6612, + "step": 79080 + }, + { + "epoch": 1.2695227852774522, + "grad_norm": 1.067910075187683, + "learning_rate": 1.4733422636473329e-05, + "loss": 0.6757, + "step": 79090 + }, + { + "epoch": 1.2696833014976163, + "grad_norm": 1.193390965461731, + "learning_rate": 1.4727674628926658e-05, + "loss": 0.7344, + "step": 79100 + }, + { + "epoch": 1.2698438177177804, + "grad_norm": 0.8547729253768921, + "learning_rate": 1.4721927274639585e-05, + "loss": 0.7585, + "step": 79110 + }, + { + "epoch": 1.2700043339379445, + "grad_norm": 0.9150316119194031, + "learning_rate": 1.4716180573977618e-05, + "loss": 0.8885, + "step": 79120 + }, + { + "epoch": 1.2701648501581084, + "grad_norm": 1.0196079015731812, + "learning_rate": 1.47104345273062e-05, + "loss": 0.653, + "step": 79130 + }, + { + "epoch": 1.2703253663782725, + "grad_norm": 0.6759986877441406, + "learning_rate": 1.4704689134990755e-05, + "loss": 0.7687, + "step": 79140 + }, + { + "epoch": 1.2704858825984366, + "grad_norm": 1.3049447536468506, + "learning_rate": 1.4698944397396652e-05, + "loss": 0.8458, + "step": 79150 + }, + { + "epoch": 1.2706463988186005, + "grad_norm": 0.9090332984924316, + "learning_rate": 1.4693200314889233e-05, + "loss": 0.7387, + "step": 79160 + }, + { + "epoch": 1.2708069150387646, + "grad_norm": 0.9123874306678772, + "learning_rate": 1.4687456887833775e-05, + "loss": 0.7223, + "step": 79170 + }, + { + "epoch": 1.2709674312589287, + "grad_norm": 1.7295693159103394, + "learning_rate": 1.4681714116595529e-05, + "loss": 0.6901, + "step": 79180 + }, + { + "epoch": 1.2711279474790929, + "grad_norm": 0.7176494598388672, + "learning_rate": 1.46759720015397e-05, + "loss": 0.6327, + "step": 79190 + }, + { + "epoch": 1.2712884636992567, + "grad_norm": 0.9240027070045471, + "learning_rate": 1.4670230543031458e-05, + "loss": 0.6166, + "step": 79200 + }, + { + "epoch": 1.2714489799194209, + "grad_norm": 1.6097464561462402, + "learning_rate": 1.4664489741435921e-05, + "loss": 0.7838, + "step": 79210 + }, + { + "epoch": 1.271609496139585, + "grad_norm": 1.6081141233444214, + "learning_rate": 1.4658749597118182e-05, + "loss": 0.6596, + "step": 79220 + }, + { + "epoch": 1.2717700123597488, + "grad_norm": 0.9508723020553589, + "learning_rate": 1.4653010110443267e-05, + "loss": 0.8331, + "step": 79230 + }, + { + "epoch": 1.271930528579913, + "grad_norm": 1.2014423608779907, + "learning_rate": 1.4647271281776173e-05, + "loss": 0.8265, + "step": 79240 + }, + { + "epoch": 1.272091044800077, + "grad_norm": 0.8892713189125061, + "learning_rate": 1.4641533111481864e-05, + "loss": 0.651, + "step": 79250 + }, + { + "epoch": 1.2722515610202412, + "grad_norm": 0.9558422565460205, + "learning_rate": 1.463579559992525e-05, + "loss": 0.7282, + "step": 79260 + }, + { + "epoch": 1.2724120772404053, + "grad_norm": 1.3559776544570923, + "learning_rate": 1.463005874747121e-05, + "loss": 0.7499, + "step": 79270 + }, + { + "epoch": 1.2725725934605692, + "grad_norm": 0.9837824702262878, + "learning_rate": 1.462432255448457e-05, + "loss": 0.5647, + "step": 79280 + }, + { + "epoch": 1.2727331096807333, + "grad_norm": 1.2118431329727173, + "learning_rate": 1.461858702133011e-05, + "loss": 0.6999, + "step": 79290 + }, + { + "epoch": 1.2728936259008972, + "grad_norm": 1.481285572052002, + "learning_rate": 1.4612852148372585e-05, + "loss": 0.7105, + "step": 79300 + }, + { + "epoch": 1.2730541421210613, + "grad_norm": 0.7367807030677795, + "learning_rate": 1.4607117935976695e-05, + "loss": 0.67, + "step": 79310 + }, + { + "epoch": 1.2732146583412254, + "grad_norm": 0.6900240182876587, + "learning_rate": 1.4601384384507105e-05, + "loss": 0.7491, + "step": 79320 + }, + { + "epoch": 1.2733751745613895, + "grad_norm": 1.468518614768982, + "learning_rate": 1.4595651494328432e-05, + "loss": 0.766, + "step": 79330 + }, + { + "epoch": 1.2735356907815536, + "grad_norm": 0.9503509998321533, + "learning_rate": 1.4589919265805269e-05, + "loss": 0.7065, + "step": 79340 + }, + { + "epoch": 1.2736962070017175, + "grad_norm": 1.0180567502975464, + "learning_rate": 1.4584187699302129e-05, + "loss": 0.6684, + "step": 79350 + }, + { + "epoch": 1.2738567232218816, + "grad_norm": 0.9285694360733032, + "learning_rate": 1.4578456795183515e-05, + "loss": 0.6896, + "step": 79360 + }, + { + "epoch": 1.2740172394420457, + "grad_norm": 1.1635148525238037, + "learning_rate": 1.4572726553813885e-05, + "loss": 0.8462, + "step": 79370 + }, + { + "epoch": 1.2741777556622096, + "grad_norm": 0.8861281871795654, + "learning_rate": 1.456699697555764e-05, + "loss": 0.7312, + "step": 79380 + }, + { + "epoch": 1.2743382718823737, + "grad_norm": 0.7251657247543335, + "learning_rate": 1.4561268060779154e-05, + "loss": 0.7801, + "step": 79390 + }, + { + "epoch": 1.2744987881025378, + "grad_norm": 0.7829038500785828, + "learning_rate": 1.4555539809842761e-05, + "loss": 0.6649, + "step": 79400 + }, + { + "epoch": 1.274659304322702, + "grad_norm": 1.3387142419815063, + "learning_rate": 1.4549812223112719e-05, + "loss": 0.6965, + "step": 79410 + }, + { + "epoch": 1.2748198205428658, + "grad_norm": 0.935124397277832, + "learning_rate": 1.4544085300953285e-05, + "loss": 0.6763, + "step": 79420 + }, + { + "epoch": 1.27498033676303, + "grad_norm": 1.6829544305801392, + "learning_rate": 1.4538359043728661e-05, + "loss": 0.6391, + "step": 79430 + }, + { + "epoch": 1.275140852983194, + "grad_norm": 1.8193488121032715, + "learning_rate": 1.4532633451802994e-05, + "loss": 0.8745, + "step": 79440 + }, + { + "epoch": 1.275301369203358, + "grad_norm": 0.9912759065628052, + "learning_rate": 1.45269085255404e-05, + "loss": 0.7431, + "step": 79450 + }, + { + "epoch": 1.275461885423522, + "grad_norm": 1.0162944793701172, + "learning_rate": 1.4521184265304958e-05, + "loss": 0.7951, + "step": 79460 + }, + { + "epoch": 1.2756224016436861, + "grad_norm": 0.997498095035553, + "learning_rate": 1.45154606714607e-05, + "loss": 0.7071, + "step": 79470 + }, + { + "epoch": 1.2757829178638502, + "grad_norm": 0.9954838752746582, + "learning_rate": 1.4509737744371592e-05, + "loss": 0.7066, + "step": 79480 + }, + { + "epoch": 1.2759434340840141, + "grad_norm": 0.6918855905532837, + "learning_rate": 1.4504015484401598e-05, + "loss": 0.6964, + "step": 79490 + }, + { + "epoch": 1.2761039503041782, + "grad_norm": 0.9336999654769897, + "learning_rate": 1.449829389191461e-05, + "loss": 0.7533, + "step": 79500 + }, + { + "epoch": 1.2762644665243423, + "grad_norm": 1.6811596155166626, + "learning_rate": 1.4492572967274493e-05, + "loss": 0.7005, + "step": 79510 + }, + { + "epoch": 1.2764249827445062, + "grad_norm": 1.0189687013626099, + "learning_rate": 1.4486852710845061e-05, + "loss": 0.6674, + "step": 79520 + }, + { + "epoch": 1.2765854989646703, + "grad_norm": 1.0540823936462402, + "learning_rate": 1.4481133122990103e-05, + "loss": 0.6555, + "step": 79530 + }, + { + "epoch": 1.2767460151848344, + "grad_norm": 1.2769891023635864, + "learning_rate": 1.4475414204073329e-05, + "loss": 0.7897, + "step": 79540 + }, + { + "epoch": 1.2769065314049985, + "grad_norm": 0.6334158778190613, + "learning_rate": 1.446969595445844e-05, + "loss": 0.7376, + "step": 79550 + }, + { + "epoch": 1.2770670476251627, + "grad_norm": 1.036393404006958, + "learning_rate": 1.4463978374509079e-05, + "loss": 0.7166, + "step": 79560 + }, + { + "epoch": 1.2772275638453265, + "grad_norm": 1.2641388177871704, + "learning_rate": 1.445826146458885e-05, + "loss": 0.6828, + "step": 79570 + }, + { + "epoch": 1.2773880800654906, + "grad_norm": 1.0416516065597534, + "learning_rate": 1.4452545225061325e-05, + "loss": 0.6436, + "step": 79580 + }, + { + "epoch": 1.2775485962856545, + "grad_norm": 0.7617206573486328, + "learning_rate": 1.4446829656290023e-05, + "loss": 0.6309, + "step": 79590 + }, + { + "epoch": 1.2777091125058186, + "grad_norm": 1.6325188875198364, + "learning_rate": 1.4441114758638405e-05, + "loss": 0.6665, + "step": 79600 + }, + { + "epoch": 1.2778696287259828, + "grad_norm": 1.0460073947906494, + "learning_rate": 1.443540053246991e-05, + "loss": 0.6618, + "step": 79610 + }, + { + "epoch": 1.2780301449461469, + "grad_norm": 0.8949933648109436, + "learning_rate": 1.4429686978147939e-05, + "loss": 0.7637, + "step": 79620 + }, + { + "epoch": 1.278190661166311, + "grad_norm": 0.8045145869255066, + "learning_rate": 1.4423974096035833e-05, + "loss": 0.6842, + "step": 79630 + }, + { + "epoch": 1.2783511773864749, + "grad_norm": 1.1648402214050293, + "learning_rate": 1.4418261886496897e-05, + "loss": 0.7297, + "step": 79640 + }, + { + "epoch": 1.278511693606639, + "grad_norm": 1.0477226972579956, + "learning_rate": 1.4412550349894399e-05, + "loss": 0.7321, + "step": 79650 + }, + { + "epoch": 1.278672209826803, + "grad_norm": 0.8940422534942627, + "learning_rate": 1.4406839486591555e-05, + "loss": 0.6454, + "step": 79660 + }, + { + "epoch": 1.278832726046967, + "grad_norm": 0.9145665764808655, + "learning_rate": 1.4401129296951548e-05, + "loss": 0.8671, + "step": 79670 + }, + { + "epoch": 1.278993242267131, + "grad_norm": 1.1962051391601562, + "learning_rate": 1.4395419781337505e-05, + "loss": 0.7296, + "step": 79680 + }, + { + "epoch": 1.2791537584872952, + "grad_norm": 0.8564157485961914, + "learning_rate": 1.4389710940112521e-05, + "loss": 0.643, + "step": 79690 + }, + { + "epoch": 1.2793142747074593, + "grad_norm": 0.9348099827766418, + "learning_rate": 1.4384002773639657e-05, + "loss": 0.6883, + "step": 79700 + }, + { + "epoch": 1.2794747909276232, + "grad_norm": 1.0991321802139282, + "learning_rate": 1.4378295282281896e-05, + "loss": 0.6968, + "step": 79710 + }, + { + "epoch": 1.2796353071477873, + "grad_norm": 0.978914201259613, + "learning_rate": 1.4372588466402215e-05, + "loss": 0.8572, + "step": 79720 + }, + { + "epoch": 1.2797958233679514, + "grad_norm": 1.1404645442962646, + "learning_rate": 1.4366882326363526e-05, + "loss": 0.6499, + "step": 79730 + }, + { + "epoch": 1.2799563395881153, + "grad_norm": 0.7784029841423035, + "learning_rate": 1.4361176862528714e-05, + "loss": 0.7403, + "step": 79740 + }, + { + "epoch": 1.2801168558082794, + "grad_norm": 1.0157991647720337, + "learning_rate": 1.4355472075260609e-05, + "loss": 0.7917, + "step": 79750 + }, + { + "epoch": 1.2802773720284435, + "grad_norm": 0.6957995295524597, + "learning_rate": 1.4349767964922004e-05, + "loss": 0.6883, + "step": 79760 + }, + { + "epoch": 1.2804378882486076, + "grad_norm": 1.161336898803711, + "learning_rate": 1.4344064531875646e-05, + "loss": 0.6366, + "step": 79770 + }, + { + "epoch": 1.2805984044687715, + "grad_norm": 0.8684191107749939, + "learning_rate": 1.4338361776484249e-05, + "loss": 0.786, + "step": 79780 + }, + { + "epoch": 1.2807589206889356, + "grad_norm": 0.8587758541107178, + "learning_rate": 1.4332659699110454e-05, + "loss": 0.7131, + "step": 79790 + }, + { + "epoch": 1.2809194369090997, + "grad_norm": 1.0716452598571777, + "learning_rate": 1.4326958300116894e-05, + "loss": 0.7415, + "step": 79800 + }, + { + "epoch": 1.2810799531292636, + "grad_norm": 1.2448134422302246, + "learning_rate": 1.4321257579866137e-05, + "loss": 0.7602, + "step": 79810 + }, + { + "epoch": 1.2812404693494277, + "grad_norm": 0.7336066961288452, + "learning_rate": 1.4315557538720725e-05, + "loss": 0.684, + "step": 79820 + }, + { + "epoch": 1.2814009855695918, + "grad_norm": 0.9709814190864563, + "learning_rate": 1.4309858177043139e-05, + "loss": 0.6245, + "step": 79830 + }, + { + "epoch": 1.281561501789756, + "grad_norm": 0.9590495824813843, + "learning_rate": 1.430415949519584e-05, + "loss": 0.7196, + "step": 79840 + }, + { + "epoch": 1.28172201800992, + "grad_norm": 0.9520944952964783, + "learning_rate": 1.429846149354121e-05, + "loss": 0.7142, + "step": 79850 + }, + { + "epoch": 1.281882534230084, + "grad_norm": 0.960252583026886, + "learning_rate": 1.4292764172441617e-05, + "loss": 0.6143, + "step": 79860 + }, + { + "epoch": 1.282043050450248, + "grad_norm": 1.0336142778396606, + "learning_rate": 1.4287067532259377e-05, + "loss": 0.7519, + "step": 79870 + }, + { + "epoch": 1.282203566670412, + "grad_norm": 1.0152264833450317, + "learning_rate": 1.428137157335676e-05, + "loss": 0.6787, + "step": 79880 + }, + { + "epoch": 1.282364082890576, + "grad_norm": 0.7500178217887878, + "learning_rate": 1.4275676296096005e-05, + "loss": 0.7536, + "step": 79890 + }, + { + "epoch": 1.2825245991107401, + "grad_norm": 0.9364399909973145, + "learning_rate": 1.4269981700839296e-05, + "loss": 0.7667, + "step": 79900 + }, + { + "epoch": 1.2826851153309042, + "grad_norm": 0.7417507171630859, + "learning_rate": 1.4264287787948766e-05, + "loss": 0.635, + "step": 79910 + }, + { + "epoch": 1.2828456315510683, + "grad_norm": 1.541849970817566, + "learning_rate": 1.425859455778652e-05, + "loss": 0.7129, + "step": 79920 + }, + { + "epoch": 1.2830061477712322, + "grad_norm": 1.0938915014266968, + "learning_rate": 1.4252902010714614e-05, + "loss": 0.7656, + "step": 79930 + }, + { + "epoch": 1.2831666639913963, + "grad_norm": 1.6149259805679321, + "learning_rate": 1.4247210147095064e-05, + "loss": 0.7991, + "step": 79940 + }, + { + "epoch": 1.2833271802115604, + "grad_norm": 2.0485832691192627, + "learning_rate": 1.4241518967289833e-05, + "loss": 0.624, + "step": 79950 + }, + { + "epoch": 1.2834876964317243, + "grad_norm": 0.8914511203765869, + "learning_rate": 1.423582847166086e-05, + "loss": 0.7568, + "step": 79960 + }, + { + "epoch": 1.2836482126518884, + "grad_norm": 1.1497222185134888, + "learning_rate": 1.4230138660570008e-05, + "loss": 0.725, + "step": 79970 + }, + { + "epoch": 1.2838087288720526, + "grad_norm": 1.0297322273254395, + "learning_rate": 1.4224449534379122e-05, + "loss": 0.6364, + "step": 79980 + }, + { + "epoch": 1.2839692450922167, + "grad_norm": 0.9979276657104492, + "learning_rate": 1.4218761093450001e-05, + "loss": 0.6373, + "step": 79990 + }, + { + "epoch": 1.2841297613123805, + "grad_norm": 0.7724529504776001, + "learning_rate": 1.4213073338144395e-05, + "loss": 0.6896, + "step": 80000 + }, + { + "epoch": 1.2841297613123805, + "eval_loss": 0.7735086679458618, + "eval_runtime": 1834.2202, + "eval_samples_per_second": 14.301, + "eval_steps_per_second": 1.788, + "step": 80000 + }, + { + "epoch": 1.2842902775325447, + "grad_norm": 0.7772813439369202, + "learning_rate": 1.4207386268824013e-05, + "loss": 0.6925, + "step": 80010 + }, + { + "epoch": 1.2844507937527088, + "grad_norm": 0.9609172940254211, + "learning_rate": 1.4201699885850516e-05, + "loss": 0.7498, + "step": 80020 + }, + { + "epoch": 1.2846113099728727, + "grad_norm": 0.905562162399292, + "learning_rate": 1.4196014189585538e-05, + "loss": 0.7434, + "step": 80030 + }, + { + "epoch": 1.2847718261930368, + "grad_norm": 0.8285542726516724, + "learning_rate": 1.4190329180390635e-05, + "loss": 0.6565, + "step": 80040 + }, + { + "epoch": 1.2849323424132009, + "grad_norm": 0.791386067867279, + "learning_rate": 1.418464485862735e-05, + "loss": 0.8666, + "step": 80050 + }, + { + "epoch": 1.285092858633365, + "grad_norm": 1.0210633277893066, + "learning_rate": 1.4178961224657172e-05, + "loss": 0.5637, + "step": 80060 + }, + { + "epoch": 1.285253374853529, + "grad_norm": 1.2497166395187378, + "learning_rate": 1.4173278278841546e-05, + "loss": 0.7434, + "step": 80070 + }, + { + "epoch": 1.285413891073693, + "grad_norm": 1.3024035692214966, + "learning_rate": 1.4167596021541873e-05, + "loss": 0.665, + "step": 80080 + }, + { + "epoch": 1.285574407293857, + "grad_norm": 2.6109769344329834, + "learning_rate": 1.4161914453119529e-05, + "loss": 0.6245, + "step": 80090 + }, + { + "epoch": 1.285734923514021, + "grad_norm": 1.155569076538086, + "learning_rate": 1.4156233573935798e-05, + "loss": 0.7274, + "step": 80100 + }, + { + "epoch": 1.285895439734185, + "grad_norm": 0.7425225377082825, + "learning_rate": 1.4150553384351967e-05, + "loss": 0.7786, + "step": 80110 + }, + { + "epoch": 1.2860559559543492, + "grad_norm": 1.4083747863769531, + "learning_rate": 1.414487388472926e-05, + "loss": 0.728, + "step": 80120 + }, + { + "epoch": 1.2862164721745133, + "grad_norm": 0.8363215327262878, + "learning_rate": 1.4139195075428863e-05, + "loss": 0.6233, + "step": 80130 + }, + { + "epoch": 1.2863769883946774, + "grad_norm": 1.2006456851959229, + "learning_rate": 1.413351695681191e-05, + "loss": 0.8021, + "step": 80140 + }, + { + "epoch": 1.2865375046148413, + "grad_norm": 1.0324662923812866, + "learning_rate": 1.4127839529239511e-05, + "loss": 0.7928, + "step": 80150 + }, + { + "epoch": 1.2866980208350054, + "grad_norm": 0.7907307147979736, + "learning_rate": 1.4122162793072694e-05, + "loss": 0.6601, + "step": 80160 + }, + { + "epoch": 1.2868585370551695, + "grad_norm": 2.347852945327759, + "learning_rate": 1.4116486748672476e-05, + "loss": 0.677, + "step": 80170 + }, + { + "epoch": 1.2870190532753334, + "grad_norm": 1.2996028661727905, + "learning_rate": 1.4110811396399822e-05, + "loss": 0.6301, + "step": 80180 + }, + { + "epoch": 1.2871795694954975, + "grad_norm": 0.876847505569458, + "learning_rate": 1.4105136736615652e-05, + "loss": 0.7736, + "step": 80190 + }, + { + "epoch": 1.2873400857156616, + "grad_norm": 0.8169390559196472, + "learning_rate": 1.4099462769680835e-05, + "loss": 0.7622, + "step": 80200 + }, + { + "epoch": 1.2875006019358257, + "grad_norm": 1.2820014953613281, + "learning_rate": 1.4093789495956219e-05, + "loss": 0.707, + "step": 80210 + }, + { + "epoch": 1.2876611181559896, + "grad_norm": 0.8587291836738586, + "learning_rate": 1.4088116915802568e-05, + "loss": 0.7787, + "step": 80220 + }, + { + "epoch": 1.2878216343761537, + "grad_norm": 1.060283899307251, + "learning_rate": 1.4082445029580634e-05, + "loss": 0.7037, + "step": 80230 + }, + { + "epoch": 1.2879821505963178, + "grad_norm": 0.919632077217102, + "learning_rate": 1.4076773837651117e-05, + "loss": 0.8407, + "step": 80240 + }, + { + "epoch": 1.2881426668164817, + "grad_norm": 2.9479691982269287, + "learning_rate": 1.407110334037467e-05, + "loss": 0.6397, + "step": 80250 + }, + { + "epoch": 1.2883031830366458, + "grad_norm": 1.0562528371810913, + "learning_rate": 1.4065433538111904e-05, + "loss": 0.6226, + "step": 80260 + }, + { + "epoch": 1.28846369925681, + "grad_norm": 0.7092341780662537, + "learning_rate": 1.4059764431223391e-05, + "loss": 0.6967, + "step": 80270 + }, + { + "epoch": 1.288624215476974, + "grad_norm": 0.7387295365333557, + "learning_rate": 1.4054096020069643e-05, + "loss": 0.6873, + "step": 80280 + }, + { + "epoch": 1.288784731697138, + "grad_norm": 0.8883929252624512, + "learning_rate": 1.404842830501114e-05, + "loss": 0.7636, + "step": 80290 + }, + { + "epoch": 1.288945247917302, + "grad_norm": 1.1706849336624146, + "learning_rate": 1.4042761286408323e-05, + "loss": 0.6317, + "step": 80300 + }, + { + "epoch": 1.2891057641374661, + "grad_norm": 0.9247460961341858, + "learning_rate": 1.4037094964621573e-05, + "loss": 0.7221, + "step": 80310 + }, + { + "epoch": 1.28926628035763, + "grad_norm": 0.9902496933937073, + "learning_rate": 1.4031429340011248e-05, + "loss": 0.6104, + "step": 80320 + }, + { + "epoch": 1.2894267965777941, + "grad_norm": 1.1951124668121338, + "learning_rate": 1.4025764412937629e-05, + "loss": 0.6745, + "step": 80330 + }, + { + "epoch": 1.2895873127979582, + "grad_norm": 1.3943016529083252, + "learning_rate": 1.4020100183760979e-05, + "loss": 0.768, + "step": 80340 + }, + { + "epoch": 1.2897478290181223, + "grad_norm": 1.1958106756210327, + "learning_rate": 1.4014436652841512e-05, + "loss": 0.7519, + "step": 80350 + }, + { + "epoch": 1.2899083452382865, + "grad_norm": 1.3014057874679565, + "learning_rate": 1.4008773820539395e-05, + "loss": 0.6817, + "step": 80360 + }, + { + "epoch": 1.2900688614584503, + "grad_norm": 0.8896685838699341, + "learning_rate": 1.4003111687214754e-05, + "loss": 0.7687, + "step": 80370 + }, + { + "epoch": 1.2902293776786145, + "grad_norm": 0.889354944229126, + "learning_rate": 1.3997450253227661e-05, + "loss": 0.6613, + "step": 80380 + }, + { + "epoch": 1.2903898938987783, + "grad_norm": 0.9215013980865479, + "learning_rate": 1.3991789518938152e-05, + "loss": 0.5806, + "step": 80390 + }, + { + "epoch": 1.2905504101189424, + "grad_norm": 1.763527750968933, + "learning_rate": 1.3986129484706235e-05, + "loss": 0.7776, + "step": 80400 + }, + { + "epoch": 1.2907109263391066, + "grad_norm": 1.1464123725891113, + "learning_rate": 1.3980470150891822e-05, + "loss": 0.5577, + "step": 80410 + }, + { + "epoch": 1.2908714425592707, + "grad_norm": 1.033837914466858, + "learning_rate": 1.397481151785483e-05, + "loss": 0.6432, + "step": 80420 + }, + { + "epoch": 1.2910319587794348, + "grad_norm": 0.9817892909049988, + "learning_rate": 1.3969153585955116e-05, + "loss": 0.7181, + "step": 80430 + }, + { + "epoch": 1.2911924749995987, + "grad_norm": 0.8754597306251526, + "learning_rate": 1.3963496355552491e-05, + "loss": 0.7472, + "step": 80440 + }, + { + "epoch": 1.2913529912197628, + "grad_norm": 0.8996859788894653, + "learning_rate": 1.3957839827006715e-05, + "loss": 0.801, + "step": 80450 + }, + { + "epoch": 1.2915135074399269, + "grad_norm": 1.0387828350067139, + "learning_rate": 1.395218400067753e-05, + "loss": 0.7165, + "step": 80460 + }, + { + "epoch": 1.2916740236600908, + "grad_norm": 0.8777785897254944, + "learning_rate": 1.3946528876924587e-05, + "loss": 0.6053, + "step": 80470 + }, + { + "epoch": 1.2918345398802549, + "grad_norm": 1.2205650806427002, + "learning_rate": 1.3940874456107528e-05, + "loss": 0.5275, + "step": 80480 + }, + { + "epoch": 1.291995056100419, + "grad_norm": 0.9449856281280518, + "learning_rate": 1.3935220738585945e-05, + "loss": 0.7327, + "step": 80490 + }, + { + "epoch": 1.292155572320583, + "grad_norm": 0.864722490310669, + "learning_rate": 1.3929567724719378e-05, + "loss": 0.6454, + "step": 80500 + }, + { + "epoch": 1.292316088540747, + "grad_norm": 1.3824424743652344, + "learning_rate": 1.3923915414867327e-05, + "loss": 0.7727, + "step": 80510 + }, + { + "epoch": 1.292476604760911, + "grad_norm": 0.9650262594223022, + "learning_rate": 1.3918263809389249e-05, + "loss": 0.6696, + "step": 80520 + }, + { + "epoch": 1.2926371209810752, + "grad_norm": 1.2697727680206299, + "learning_rate": 1.3912612908644556e-05, + "loss": 0.7189, + "step": 80530 + }, + { + "epoch": 1.292797637201239, + "grad_norm": 0.9563111662864685, + "learning_rate": 1.3906962712992598e-05, + "loss": 0.6287, + "step": 80540 + }, + { + "epoch": 1.2929581534214032, + "grad_norm": 1.1034207344055176, + "learning_rate": 1.39013132227927e-05, + "loss": 0.7224, + "step": 80550 + }, + { + "epoch": 1.2931186696415673, + "grad_norm": 0.6725316047668457, + "learning_rate": 1.3895664438404138e-05, + "loss": 0.6205, + "step": 80560 + }, + { + "epoch": 1.2932791858617314, + "grad_norm": 1.016884684562683, + "learning_rate": 1.3890016360186142e-05, + "loss": 0.894, + "step": 80570 + }, + { + "epoch": 1.2934397020818953, + "grad_norm": 1.2151527404785156, + "learning_rate": 1.3884368988497898e-05, + "loss": 0.8804, + "step": 80580 + }, + { + "epoch": 1.2936002183020594, + "grad_norm": 1.409550666809082, + "learning_rate": 1.3878722323698554e-05, + "loss": 0.7707, + "step": 80590 + }, + { + "epoch": 1.2937607345222235, + "grad_norm": 0.9384359121322632, + "learning_rate": 1.3873076366147183e-05, + "loss": 0.692, + "step": 80600 + }, + { + "epoch": 1.2939212507423874, + "grad_norm": 1.0251668691635132, + "learning_rate": 1.3867431116202851e-05, + "loss": 0.7354, + "step": 80610 + }, + { + "epoch": 1.2940817669625515, + "grad_norm": 0.9131772518157959, + "learning_rate": 1.3861786574224553e-05, + "loss": 0.6046, + "step": 80620 + }, + { + "epoch": 1.2942422831827156, + "grad_norm": 0.8003970384597778, + "learning_rate": 1.3856142740571257e-05, + "loss": 0.5815, + "step": 80630 + }, + { + "epoch": 1.2944027994028797, + "grad_norm": 0.8372609615325928, + "learning_rate": 1.3850499615601875e-05, + "loss": 0.6171, + "step": 80640 + }, + { + "epoch": 1.2945633156230438, + "grad_norm": 0.9303085803985596, + "learning_rate": 1.3844857199675287e-05, + "loss": 0.7652, + "step": 80650 + }, + { + "epoch": 1.2947238318432077, + "grad_norm": 0.8572179079055786, + "learning_rate": 1.3839215493150296e-05, + "loss": 0.729, + "step": 80660 + }, + { + "epoch": 1.2948843480633718, + "grad_norm": 1.4239269495010376, + "learning_rate": 1.3833574496385693e-05, + "loss": 0.8064, + "step": 80670 + }, + { + "epoch": 1.2950448642835357, + "grad_norm": 1.1793969869613647, + "learning_rate": 1.3827934209740213e-05, + "loss": 0.6839, + "step": 80680 + }, + { + "epoch": 1.2952053805036998, + "grad_norm": 0.9548568725585938, + "learning_rate": 1.3822294633572542e-05, + "loss": 0.6098, + "step": 80690 + }, + { + "epoch": 1.295365896723864, + "grad_norm": 0.5275778770446777, + "learning_rate": 1.3816655768241324e-05, + "loss": 0.7047, + "step": 80700 + }, + { + "epoch": 1.295526412944028, + "grad_norm": 2.2502946853637695, + "learning_rate": 1.381101761410517e-05, + "loss": 0.855, + "step": 80710 + }, + { + "epoch": 1.2956869291641921, + "grad_norm": 1.1298649311065674, + "learning_rate": 1.3805380171522617e-05, + "loss": 0.686, + "step": 80720 + }, + { + "epoch": 1.295847445384356, + "grad_norm": 1.0124099254608154, + "learning_rate": 1.3799743440852176e-05, + "loss": 0.5947, + "step": 80730 + }, + { + "epoch": 1.2960079616045201, + "grad_norm": 2.5585951805114746, + "learning_rate": 1.3794107422452315e-05, + "loss": 0.7776, + "step": 80740 + }, + { + "epoch": 1.2961684778246843, + "grad_norm": 0.8786316514015198, + "learning_rate": 1.378847211668145e-05, + "loss": 0.69, + "step": 80750 + }, + { + "epoch": 1.2963289940448481, + "grad_norm": 1.044395923614502, + "learning_rate": 1.3782837523897951e-05, + "loss": 0.7024, + "step": 80760 + }, + { + "epoch": 1.2964895102650122, + "grad_norm": 0.9265943169593811, + "learning_rate": 1.3777203644460163e-05, + "loss": 0.7828, + "step": 80770 + }, + { + "epoch": 1.2966500264851764, + "grad_norm": 1.442947268486023, + "learning_rate": 1.3771570478726337e-05, + "loss": 0.6446, + "step": 80780 + }, + { + "epoch": 1.2968105427053405, + "grad_norm": 0.6835657954216003, + "learning_rate": 1.3765938027054726e-05, + "loss": 0.7592, + "step": 80790 + }, + { + "epoch": 1.2969710589255043, + "grad_norm": 1.010611891746521, + "learning_rate": 1.3760306289803521e-05, + "loss": 0.784, + "step": 80800 + }, + { + "epoch": 1.2971315751456685, + "grad_norm": 1.054158091545105, + "learning_rate": 1.3754675267330863e-05, + "loss": 0.733, + "step": 80810 + }, + { + "epoch": 1.2972920913658326, + "grad_norm": 2.0491397380828857, + "learning_rate": 1.3749044959994856e-05, + "loss": 0.7363, + "step": 80820 + }, + { + "epoch": 1.2974526075859965, + "grad_norm": 1.0289204120635986, + "learning_rate": 1.374341536815355e-05, + "loss": 0.7182, + "step": 80830 + }, + { + "epoch": 1.2976131238061606, + "grad_norm": 0.9432598352432251, + "learning_rate": 1.373778649216497e-05, + "loss": 0.6416, + "step": 80840 + }, + { + "epoch": 1.2977736400263247, + "grad_norm": 1.30915105342865, + "learning_rate": 1.3732158332387057e-05, + "loss": 0.7441, + "step": 80850 + }, + { + "epoch": 1.2979341562464888, + "grad_norm": 0.7876240015029907, + "learning_rate": 1.3726530889177736e-05, + "loss": 0.806, + "step": 80860 + }, + { + "epoch": 1.2980946724666529, + "grad_norm": 0.7848319411277771, + "learning_rate": 1.3720904162894885e-05, + "loss": 0.6649, + "step": 80870 + }, + { + "epoch": 1.2982551886868168, + "grad_norm": 0.6369417309761047, + "learning_rate": 1.3715278153896327e-05, + "loss": 0.6804, + "step": 80880 + }, + { + "epoch": 1.2984157049069809, + "grad_norm": 1.0294218063354492, + "learning_rate": 1.3709652862539843e-05, + "loss": 0.6953, + "step": 80890 + }, + { + "epoch": 1.2985762211271448, + "grad_norm": 1.7398629188537598, + "learning_rate": 1.3704028289183168e-05, + "loss": 0.6412, + "step": 80900 + }, + { + "epoch": 1.2987367373473089, + "grad_norm": 1.2111026048660278, + "learning_rate": 1.3698404434183995e-05, + "loss": 0.7928, + "step": 80910 + }, + { + "epoch": 1.298897253567473, + "grad_norm": 0.7746222019195557, + "learning_rate": 1.3692781297899965e-05, + "loss": 0.7022, + "step": 80920 + }, + { + "epoch": 1.299057769787637, + "grad_norm": 0.9814722537994385, + "learning_rate": 1.368715888068868e-05, + "loss": 0.6361, + "step": 80930 + }, + { + "epoch": 1.2992182860078012, + "grad_norm": 1.202738642692566, + "learning_rate": 1.3681537182907689e-05, + "loss": 0.7692, + "step": 80940 + }, + { + "epoch": 1.299378802227965, + "grad_norm": 0.9943771958351135, + "learning_rate": 1.3675916204914513e-05, + "loss": 0.7396, + "step": 80950 + }, + { + "epoch": 1.2995393184481292, + "grad_norm": 1.5029104948043823, + "learning_rate": 1.3670295947066586e-05, + "loss": 0.6846, + "step": 80960 + }, + { + "epoch": 1.2996998346682933, + "grad_norm": 1.190443754196167, + "learning_rate": 1.3664676409721344e-05, + "loss": 0.6992, + "step": 80970 + }, + { + "epoch": 1.2998603508884572, + "grad_norm": 1.073196291923523, + "learning_rate": 1.3659057593236147e-05, + "loss": 0.6411, + "step": 80980 + }, + { + "epoch": 1.3000208671086213, + "grad_norm": 1.1429630517959595, + "learning_rate": 1.3653439497968324e-05, + "loss": 0.7192, + "step": 80990 + }, + { + "epoch": 1.3001813833287854, + "grad_norm": 0.8295759558677673, + "learning_rate": 1.364782212427515e-05, + "loss": 0.7799, + "step": 81000 + }, + { + "epoch": 1.3003418995489495, + "grad_norm": 1.7153022289276123, + "learning_rate": 1.3642205472513858e-05, + "loss": 0.7112, + "step": 81010 + }, + { + "epoch": 1.3005024157691134, + "grad_norm": 1.064621090888977, + "learning_rate": 1.3636589543041644e-05, + "loss": 0.733, + "step": 81020 + }, + { + "epoch": 1.3006629319892775, + "grad_norm": 0.6318098902702332, + "learning_rate": 1.363097433621563e-05, + "loss": 0.5782, + "step": 81030 + }, + { + "epoch": 1.3008234482094416, + "grad_norm": 1.6635322570800781, + "learning_rate": 1.3625359852392919e-05, + "loss": 0.7395, + "step": 81040 + }, + { + "epoch": 1.3009839644296055, + "grad_norm": 1.120378017425537, + "learning_rate": 1.3619746091930558e-05, + "loss": 0.6067, + "step": 81050 + }, + { + "epoch": 1.3011444806497696, + "grad_norm": 0.8702584505081177, + "learning_rate": 1.3614133055185551e-05, + "loss": 0.8168, + "step": 81060 + }, + { + "epoch": 1.3013049968699337, + "grad_norm": 1.1649260520935059, + "learning_rate": 1.3608520742514855e-05, + "loss": 0.8084, + "step": 81070 + }, + { + "epoch": 1.3014655130900978, + "grad_norm": 1.1376266479492188, + "learning_rate": 1.3602909154275379e-05, + "loss": 0.6979, + "step": 81080 + }, + { + "epoch": 1.3016260293102617, + "grad_norm": 0.8494373559951782, + "learning_rate": 1.3597298290823993e-05, + "loss": 0.7464, + "step": 81090 + }, + { + "epoch": 1.3017865455304258, + "grad_norm": 0.5920217037200928, + "learning_rate": 1.35916881525175e-05, + "loss": 0.7086, + "step": 81100 + }, + { + "epoch": 1.30194706175059, + "grad_norm": 0.8442147970199585, + "learning_rate": 1.3586078739712682e-05, + "loss": 0.7736, + "step": 81110 + }, + { + "epoch": 1.3021075779707538, + "grad_norm": 1.1602178812026978, + "learning_rate": 1.3580470052766259e-05, + "loss": 0.7298, + "step": 81120 + }, + { + "epoch": 1.302268094190918, + "grad_norm": 1.110368251800537, + "learning_rate": 1.3574862092034918e-05, + "loss": 0.7216, + "step": 81130 + }, + { + "epoch": 1.302428610411082, + "grad_norm": 0.9701312184333801, + "learning_rate": 1.356925485787529e-05, + "loss": 0.7392, + "step": 81140 + }, + { + "epoch": 1.3025891266312462, + "grad_norm": 1.0196443796157837, + "learning_rate": 1.356364835064397e-05, + "loss": 0.6266, + "step": 81150 + }, + { + "epoch": 1.3027496428514103, + "grad_norm": 1.1879464387893677, + "learning_rate": 1.3558042570697482e-05, + "loss": 0.7111, + "step": 81160 + }, + { + "epoch": 1.3029101590715741, + "grad_norm": 0.8498045802116394, + "learning_rate": 1.3552437518392328e-05, + "loss": 0.7347, + "step": 81170 + }, + { + "epoch": 1.3030706752917383, + "grad_norm": 1.1521241664886475, + "learning_rate": 1.3546833194084956e-05, + "loss": 0.7799, + "step": 81180 + }, + { + "epoch": 1.3032311915119021, + "grad_norm": 0.874121904373169, + "learning_rate": 1.3541229598131771e-05, + "loss": 0.8216, + "step": 81190 + }, + { + "epoch": 1.3033917077320663, + "grad_norm": 1.1580716371536255, + "learning_rate": 1.3535626730889129e-05, + "loss": 0.666, + "step": 81200 + }, + { + "epoch": 1.3035522239522304, + "grad_norm": 1.255914330482483, + "learning_rate": 1.3530024592713344e-05, + "loss": 0.802, + "step": 81210 + }, + { + "epoch": 1.3037127401723945, + "grad_norm": 1.5194040536880493, + "learning_rate": 1.3524423183960665e-05, + "loss": 0.6569, + "step": 81220 + }, + { + "epoch": 1.3038732563925586, + "grad_norm": 0.8645784258842468, + "learning_rate": 1.3518822504987316e-05, + "loss": 0.8318, + "step": 81230 + }, + { + "epoch": 1.3040337726127225, + "grad_norm": 2.201869249343872, + "learning_rate": 1.3513222556149468e-05, + "loss": 0.6978, + "step": 81240 + }, + { + "epoch": 1.3041942888328866, + "grad_norm": 1.486291766166687, + "learning_rate": 1.3507623337803248e-05, + "loss": 0.6944, + "step": 81250 + }, + { + "epoch": 1.3043548050530507, + "grad_norm": 0.7332134246826172, + "learning_rate": 1.3502024850304728e-05, + "loss": 0.7197, + "step": 81260 + }, + { + "epoch": 1.3045153212732146, + "grad_norm": 1.1957818269729614, + "learning_rate": 1.349642709400995e-05, + "loss": 0.629, + "step": 81270 + }, + { + "epoch": 1.3046758374933787, + "grad_norm": 1.0205731391906738, + "learning_rate": 1.3490830069274881e-05, + "loss": 0.7105, + "step": 81280 + }, + { + "epoch": 1.3048363537135428, + "grad_norm": 1.0848445892333984, + "learning_rate": 1.348523377645547e-05, + "loss": 0.8441, + "step": 81290 + }, + { + "epoch": 1.304996869933707, + "grad_norm": 0.9862185716629028, + "learning_rate": 1.3479638215907603e-05, + "loss": 0.6956, + "step": 81300 + }, + { + "epoch": 1.3051573861538708, + "grad_norm": 1.0669047832489014, + "learning_rate": 1.3474043387987129e-05, + "loss": 0.7883, + "step": 81310 + }, + { + "epoch": 1.3053179023740349, + "grad_norm": 0.5695425271987915, + "learning_rate": 1.3468449293049846e-05, + "loss": 0.6639, + "step": 81320 + }, + { + "epoch": 1.305478418594199, + "grad_norm": 0.9212682247161865, + "learning_rate": 1.3462855931451501e-05, + "loss": 0.6894, + "step": 81330 + }, + { + "epoch": 1.3056389348143629, + "grad_norm": 1.3889952898025513, + "learning_rate": 1.3457263303547812e-05, + "loss": 0.7102, + "step": 81340 + }, + { + "epoch": 1.305799451034527, + "grad_norm": 0.7301838994026184, + "learning_rate": 1.3451671409694422e-05, + "loss": 0.726, + "step": 81350 + }, + { + "epoch": 1.305959967254691, + "grad_norm": 0.8515526652336121, + "learning_rate": 1.3446080250246948e-05, + "loss": 0.6323, + "step": 81360 + }, + { + "epoch": 1.3061204834748552, + "grad_norm": 0.9863777160644531, + "learning_rate": 1.3440489825560953e-05, + "loss": 0.7949, + "step": 81370 + }, + { + "epoch": 1.3062809996950193, + "grad_norm": 1.2593731880187988, + "learning_rate": 1.3434900135991962e-05, + "loss": 0.724, + "step": 81380 + }, + { + "epoch": 1.3064415159151832, + "grad_norm": 1.2069209814071655, + "learning_rate": 1.3429311181895437e-05, + "loss": 0.6541, + "step": 81390 + }, + { + "epoch": 1.3066020321353473, + "grad_norm": 2.073073148727417, + "learning_rate": 1.3423722963626822e-05, + "loss": 0.6544, + "step": 81400 + }, + { + "epoch": 1.3067625483555112, + "grad_norm": 0.8714665770530701, + "learning_rate": 1.3418135481541466e-05, + "loss": 0.6502, + "step": 81410 + }, + { + "epoch": 1.3069230645756753, + "grad_norm": 0.823506772518158, + "learning_rate": 1.3412548735994718e-05, + "loss": 0.6452, + "step": 81420 + }, + { + "epoch": 1.3070835807958394, + "grad_norm": 0.8020366430282593, + "learning_rate": 1.3406962727341859e-05, + "loss": 0.6391, + "step": 81430 + }, + { + "epoch": 1.3072440970160035, + "grad_norm": 0.9547913670539856, + "learning_rate": 1.3401377455938127e-05, + "loss": 0.6148, + "step": 81440 + }, + { + "epoch": 1.3074046132361676, + "grad_norm": 1.2834110260009766, + "learning_rate": 1.3395792922138711e-05, + "loss": 0.7596, + "step": 81450 + }, + { + "epoch": 1.3075651294563315, + "grad_norm": 1.3457762002944946, + "learning_rate": 1.3390209126298764e-05, + "loss": 0.6212, + "step": 81460 + }, + { + "epoch": 1.3077256456764956, + "grad_norm": 0.943367600440979, + "learning_rate": 1.3384626068773365e-05, + "loss": 0.6573, + "step": 81470 + }, + { + "epoch": 1.3078861618966597, + "grad_norm": 1.4346377849578857, + "learning_rate": 1.3379043749917572e-05, + "loss": 0.6189, + "step": 81480 + }, + { + "epoch": 1.3080466781168236, + "grad_norm": 1.1114768981933594, + "learning_rate": 1.337346217008639e-05, + "loss": 0.7686, + "step": 81490 + }, + { + "epoch": 1.3082071943369877, + "grad_norm": 1.0324182510375977, + "learning_rate": 1.3367881329634773e-05, + "loss": 0.7922, + "step": 81500 + }, + { + "epoch": 1.3083677105571518, + "grad_norm": 1.2547588348388672, + "learning_rate": 1.3362301228917629e-05, + "loss": 0.6319, + "step": 81510 + }, + { + "epoch": 1.308528226777316, + "grad_norm": 0.8849076628684998, + "learning_rate": 1.3356721868289818e-05, + "loss": 0.729, + "step": 81520 + }, + { + "epoch": 1.3086887429974798, + "grad_norm": 1.0818380117416382, + "learning_rate": 1.3351143248106163e-05, + "loss": 0.7687, + "step": 81530 + }, + { + "epoch": 1.308849259217644, + "grad_norm": 0.9286096692085266, + "learning_rate": 1.3345565368721418e-05, + "loss": 0.7019, + "step": 81540 + }, + { + "epoch": 1.309009775437808, + "grad_norm": 0.9304673075675964, + "learning_rate": 1.333998823049032e-05, + "loss": 0.6106, + "step": 81550 + }, + { + "epoch": 1.309170291657972, + "grad_norm": 1.0891305208206177, + "learning_rate": 1.3334411833767524e-05, + "loss": 0.7226, + "step": 81560 + }, + { + "epoch": 1.309330807878136, + "grad_norm": 1.8414334058761597, + "learning_rate": 1.3328836178907672e-05, + "loss": 0.717, + "step": 81570 + }, + { + "epoch": 1.3094913240983002, + "grad_norm": 1.196751594543457, + "learning_rate": 1.3323261266265344e-05, + "loss": 0.6909, + "step": 81580 + }, + { + "epoch": 1.3096518403184643, + "grad_norm": 0.7946603894233704, + "learning_rate": 1.3317687096195052e-05, + "loss": 0.6328, + "step": 81590 + }, + { + "epoch": 1.3098123565386282, + "grad_norm": 0.9291207194328308, + "learning_rate": 1.3312113669051296e-05, + "loss": 0.6891, + "step": 81600 + }, + { + "epoch": 1.3099728727587923, + "grad_norm": 0.7526233196258545, + "learning_rate": 1.330654098518851e-05, + "loss": 0.7007, + "step": 81610 + }, + { + "epoch": 1.3101333889789564, + "grad_norm": 0.9778485298156738, + "learning_rate": 1.3300969044961082e-05, + "loss": 0.7034, + "step": 81620 + }, + { + "epoch": 1.3102939051991203, + "grad_norm": 1.1625257730484009, + "learning_rate": 1.3295397848723354e-05, + "loss": 0.7025, + "step": 81630 + }, + { + "epoch": 1.3104544214192844, + "grad_norm": 0.7334359884262085, + "learning_rate": 1.3289827396829623e-05, + "loss": 0.6531, + "step": 81640 + }, + { + "epoch": 1.3106149376394485, + "grad_norm": 1.5493123531341553, + "learning_rate": 1.3284257689634153e-05, + "loss": 0.7063, + "step": 81650 + }, + { + "epoch": 1.3107754538596126, + "grad_norm": 1.2254284620285034, + "learning_rate": 1.3278688727491117e-05, + "loss": 0.743, + "step": 81660 + }, + { + "epoch": 1.3109359700797767, + "grad_norm": 1.4301795959472656, + "learning_rate": 1.3273120510754684e-05, + "loss": 0.7326, + "step": 81670 + }, + { + "epoch": 1.3110964862999406, + "grad_norm": 0.7994319200515747, + "learning_rate": 1.3267553039778952e-05, + "loss": 0.7293, + "step": 81680 + }, + { + "epoch": 1.3112570025201047, + "grad_norm": 0.78889000415802, + "learning_rate": 1.3261986314917984e-05, + "loss": 0.6268, + "step": 81690 + }, + { + "epoch": 1.3114175187402686, + "grad_norm": 0.7448105812072754, + "learning_rate": 1.3256420336525794e-05, + "loss": 0.6264, + "step": 81700 + }, + { + "epoch": 1.3115780349604327, + "grad_norm": 0.8746028542518616, + "learning_rate": 1.3250855104956353e-05, + "loss": 0.7203, + "step": 81710 + }, + { + "epoch": 1.3117385511805968, + "grad_norm": 1.1581236124038696, + "learning_rate": 1.3245290620563555e-05, + "loss": 0.7578, + "step": 81720 + }, + { + "epoch": 1.311899067400761, + "grad_norm": 1.0160778760910034, + "learning_rate": 1.323972688370128e-05, + "loss": 0.7615, + "step": 81730 + }, + { + "epoch": 1.312059583620925, + "grad_norm": 0.7953249216079712, + "learning_rate": 1.323416389472335e-05, + "loss": 0.6171, + "step": 81740 + }, + { + "epoch": 1.312220099841089, + "grad_norm": 0.9607376456260681, + "learning_rate": 1.322860165398354e-05, + "loss": 0.6749, + "step": 81750 + }, + { + "epoch": 1.312380616061253, + "grad_norm": 0.8059455752372742, + "learning_rate": 1.3223040161835573e-05, + "loss": 0.5873, + "step": 81760 + }, + { + "epoch": 1.3125411322814171, + "grad_norm": 0.6212871670722961, + "learning_rate": 1.3217479418633138e-05, + "loss": 0.7833, + "step": 81770 + }, + { + "epoch": 1.312701648501581, + "grad_norm": 1.1124951839447021, + "learning_rate": 1.3211919424729843e-05, + "loss": 0.6381, + "step": 81780 + }, + { + "epoch": 1.312862164721745, + "grad_norm": 0.7453697919845581, + "learning_rate": 1.3206360180479289e-05, + "loss": 0.7341, + "step": 81790 + }, + { + "epoch": 1.3130226809419092, + "grad_norm": 1.4226007461547852, + "learning_rate": 1.3200801686234998e-05, + "loss": 0.7194, + "step": 81800 + }, + { + "epoch": 1.3131831971620733, + "grad_norm": 0.7790101766586304, + "learning_rate": 1.319524394235047e-05, + "loss": 0.7161, + "step": 81810 + }, + { + "epoch": 1.3133437133822372, + "grad_norm": 0.6839580535888672, + "learning_rate": 1.3189686949179142e-05, + "loss": 0.6915, + "step": 81820 + }, + { + "epoch": 1.3135042296024013, + "grad_norm": 2.959503650665283, + "learning_rate": 1.318413070707441e-05, + "loss": 0.7187, + "step": 81830 + }, + { + "epoch": 1.3136647458225654, + "grad_norm": 1.3749442100524902, + "learning_rate": 1.3178575216389607e-05, + "loss": 0.7396, + "step": 81840 + }, + { + "epoch": 1.3138252620427293, + "grad_norm": 0.7853378057479858, + "learning_rate": 1.3173020477478034e-05, + "loss": 0.8195, + "step": 81850 + }, + { + "epoch": 1.3139857782628934, + "grad_norm": 1.0338919162750244, + "learning_rate": 1.3167466490692943e-05, + "loss": 0.5989, + "step": 81860 + }, + { + "epoch": 1.3141462944830575, + "grad_norm": 1.0072437524795532, + "learning_rate": 1.3161913256387531e-05, + "loss": 0.6738, + "step": 81870 + }, + { + "epoch": 1.3143068107032216, + "grad_norm": 1.0714352130889893, + "learning_rate": 1.3156360774914955e-05, + "loss": 0.6592, + "step": 81880 + }, + { + "epoch": 1.3144673269233855, + "grad_norm": 1.3543596267700195, + "learning_rate": 1.3150809046628316e-05, + "loss": 0.7252, + "step": 81890 + }, + { + "epoch": 1.3146278431435496, + "grad_norm": 0.8739212155342102, + "learning_rate": 1.3145258071880684e-05, + "loss": 0.6097, + "step": 81900 + }, + { + "epoch": 1.3147883593637137, + "grad_norm": 1.5354437828063965, + "learning_rate": 1.3139707851025052e-05, + "loss": 0.7264, + "step": 81910 + }, + { + "epoch": 1.3149488755838776, + "grad_norm": 1.3791965246200562, + "learning_rate": 1.3134158384414386e-05, + "loss": 0.6866, + "step": 81920 + }, + { + "epoch": 1.3151093918040417, + "grad_norm": 0.9851775169372559, + "learning_rate": 1.31286096724016e-05, + "loss": 0.7502, + "step": 81930 + }, + { + "epoch": 1.3152699080242058, + "grad_norm": 0.9455615878105164, + "learning_rate": 1.3123061715339563e-05, + "loss": 0.711, + "step": 81940 + }, + { + "epoch": 1.31543042424437, + "grad_norm": 0.9281620383262634, + "learning_rate": 1.311751451358109e-05, + "loss": 0.7541, + "step": 81950 + }, + { + "epoch": 1.315590940464534, + "grad_norm": 0.8447808027267456, + "learning_rate": 1.3111968067478958e-05, + "loss": 0.719, + "step": 81960 + }, + { + "epoch": 1.315751456684698, + "grad_norm": 0.8526858687400818, + "learning_rate": 1.3106422377385874e-05, + "loss": 0.6375, + "step": 81970 + }, + { + "epoch": 1.315911972904862, + "grad_norm": 1.3475807905197144, + "learning_rate": 1.3100877443654518e-05, + "loss": 0.6578, + "step": 81980 + }, + { + "epoch": 1.316072489125026, + "grad_norm": 1.0237189531326294, + "learning_rate": 1.3095333266637517e-05, + "loss": 0.736, + "step": 81990 + }, + { + "epoch": 1.31623300534519, + "grad_norm": 1.0401374101638794, + "learning_rate": 1.3089789846687445e-05, + "loss": 0.6911, + "step": 82000 + }, + { + "epoch": 1.3163935215653542, + "grad_norm": 0.7968671917915344, + "learning_rate": 1.3084247184156834e-05, + "loss": 0.759, + "step": 82010 + }, + { + "epoch": 1.3165540377855183, + "grad_norm": 1.2025164365768433, + "learning_rate": 1.3078705279398174e-05, + "loss": 0.7135, + "step": 82020 + }, + { + "epoch": 1.3167145540056824, + "grad_norm": 0.7544407248497009, + "learning_rate": 1.3073164132763877e-05, + "loss": 0.6659, + "step": 82030 + }, + { + "epoch": 1.3168750702258463, + "grad_norm": 1.0829155445098877, + "learning_rate": 1.3067623744606336e-05, + "loss": 0.7676, + "step": 82040 + }, + { + "epoch": 1.3170355864460104, + "grad_norm": 1.175402045249939, + "learning_rate": 1.306208411527789e-05, + "loss": 0.6688, + "step": 82050 + }, + { + "epoch": 1.3171961026661745, + "grad_norm": 0.9076882004737854, + "learning_rate": 1.3056545245130824e-05, + "loss": 0.782, + "step": 82060 + }, + { + "epoch": 1.3173566188863384, + "grad_norm": 0.9398937225341797, + "learning_rate": 1.3051007134517384e-05, + "loss": 0.6069, + "step": 82070 + }, + { + "epoch": 1.3175171351065025, + "grad_norm": 0.9509221315383911, + "learning_rate": 1.3045469783789765e-05, + "loss": 0.6654, + "step": 82080 + }, + { + "epoch": 1.3176776513266666, + "grad_norm": 1.29142165184021, + "learning_rate": 1.3039933193300092e-05, + "loss": 0.7793, + "step": 82090 + }, + { + "epoch": 1.3178381675468307, + "grad_norm": 1.0540848970413208, + "learning_rate": 1.3034397363400467e-05, + "loss": 0.7753, + "step": 82100 + }, + { + "epoch": 1.3179986837669946, + "grad_norm": 1.1039150953292847, + "learning_rate": 1.302886229444294e-05, + "loss": 0.6725, + "step": 82110 + }, + { + "epoch": 1.3181591999871587, + "grad_norm": 1.749144196510315, + "learning_rate": 1.302332798677951e-05, + "loss": 0.7497, + "step": 82120 + }, + { + "epoch": 1.3183197162073228, + "grad_norm": 1.2885758876800537, + "learning_rate": 1.3017794440762122e-05, + "loss": 0.5404, + "step": 82130 + }, + { + "epoch": 1.3184802324274867, + "grad_norm": 0.7766202092170715, + "learning_rate": 1.3012261656742678e-05, + "loss": 0.7692, + "step": 82140 + }, + { + "epoch": 1.3186407486476508, + "grad_norm": 0.6675050258636475, + "learning_rate": 1.3006729635073033e-05, + "loss": 0.7108, + "step": 82150 + }, + { + "epoch": 1.318801264867815, + "grad_norm": 1.3630210161209106, + "learning_rate": 1.300119837610499e-05, + "loss": 0.8274, + "step": 82160 + }, + { + "epoch": 1.318961781087979, + "grad_norm": 1.0606857538223267, + "learning_rate": 1.2995667880190304e-05, + "loss": 0.6404, + "step": 82170 + }, + { + "epoch": 1.3191222973081431, + "grad_norm": 0.866198718547821, + "learning_rate": 1.299013814768068e-05, + "loss": 0.7453, + "step": 82180 + }, + { + "epoch": 1.319282813528307, + "grad_norm": 0.9469107389450073, + "learning_rate": 1.2984609178927784e-05, + "loss": 0.7487, + "step": 82190 + }, + { + "epoch": 1.3194433297484711, + "grad_norm": 0.8326146006584167, + "learning_rate": 1.2979080974283231e-05, + "loss": 0.6326, + "step": 82200 + }, + { + "epoch": 1.319603845968635, + "grad_norm": 1.086371660232544, + "learning_rate": 1.297355353409856e-05, + "loss": 0.755, + "step": 82210 + }, + { + "epoch": 1.3197643621887991, + "grad_norm": 1.3165589570999146, + "learning_rate": 1.2968026858725297e-05, + "loss": 0.6408, + "step": 82220 + }, + { + "epoch": 1.3199248784089632, + "grad_norm": 1.1105725765228271, + "learning_rate": 1.2962500948514902e-05, + "loss": 0.7008, + "step": 82230 + }, + { + "epoch": 1.3200853946291273, + "grad_norm": 0.607511579990387, + "learning_rate": 1.2956975803818796e-05, + "loss": 0.9095, + "step": 82240 + }, + { + "epoch": 1.3202459108492914, + "grad_norm": 0.7510808110237122, + "learning_rate": 1.2951451424988345e-05, + "loss": 0.7354, + "step": 82250 + }, + { + "epoch": 1.3204064270694553, + "grad_norm": 0.8992972373962402, + "learning_rate": 1.2945927812374864e-05, + "loss": 0.7778, + "step": 82260 + }, + { + "epoch": 1.3205669432896194, + "grad_norm": 1.0179513692855835, + "learning_rate": 1.2940404966329633e-05, + "loss": 0.6501, + "step": 82270 + }, + { + "epoch": 1.3207274595097835, + "grad_norm": 0.8301910758018494, + "learning_rate": 1.2934882887203859e-05, + "loss": 0.656, + "step": 82280 + }, + { + "epoch": 1.3208879757299474, + "grad_norm": 1.5818274021148682, + "learning_rate": 1.2929361575348714e-05, + "loss": 0.6962, + "step": 82290 + }, + { + "epoch": 1.3210484919501115, + "grad_norm": 0.9329640865325928, + "learning_rate": 1.2923841031115325e-05, + "loss": 0.6654, + "step": 82300 + }, + { + "epoch": 1.3212090081702756, + "grad_norm": 0.7597145438194275, + "learning_rate": 1.2918321254854765e-05, + "loss": 0.5802, + "step": 82310 + }, + { + "epoch": 1.3213695243904398, + "grad_norm": 0.9991561770439148, + "learning_rate": 1.2912802246918066e-05, + "loss": 0.6597, + "step": 82320 + }, + { + "epoch": 1.3215300406106036, + "grad_norm": 0.7513924837112427, + "learning_rate": 1.2907284007656206e-05, + "loss": 0.6557, + "step": 82330 + }, + { + "epoch": 1.3216905568307677, + "grad_norm": 0.6950154900550842, + "learning_rate": 1.2901766537420095e-05, + "loss": 0.643, + "step": 82340 + }, + { + "epoch": 1.3218510730509319, + "grad_norm": 1.4112119674682617, + "learning_rate": 1.2896249836560626e-05, + "loss": 0.8033, + "step": 82350 + }, + { + "epoch": 1.3220115892710957, + "grad_norm": 1.5948517322540283, + "learning_rate": 1.2890733905428624e-05, + "loss": 0.6768, + "step": 82360 + }, + { + "epoch": 1.3221721054912599, + "grad_norm": 0.6617510914802551, + "learning_rate": 1.2885218744374872e-05, + "loss": 0.6235, + "step": 82370 + }, + { + "epoch": 1.322332621711424, + "grad_norm": 1.101204514503479, + "learning_rate": 1.2879704353750099e-05, + "loss": 0.8224, + "step": 82380 + }, + { + "epoch": 1.322493137931588, + "grad_norm": 0.7687272429466248, + "learning_rate": 1.2874190733905001e-05, + "loss": 0.6157, + "step": 82390 + }, + { + "epoch": 1.322653654151752, + "grad_norm": 1.1673145294189453, + "learning_rate": 1.2868677885190194e-05, + "loss": 0.6762, + "step": 82400 + }, + { + "epoch": 1.322814170371916, + "grad_norm": 2.5123579502105713, + "learning_rate": 1.2863165807956268e-05, + "loss": 0.735, + "step": 82410 + }, + { + "epoch": 1.3229746865920802, + "grad_norm": 1.3484629392623901, + "learning_rate": 1.285765450255376e-05, + "loss": 0.713, + "step": 82420 + }, + { + "epoch": 1.323135202812244, + "grad_norm": 1.3311641216278076, + "learning_rate": 1.2852143969333157e-05, + "loss": 0.7614, + "step": 82430 + }, + { + "epoch": 1.3232957190324082, + "grad_norm": 1.546886920928955, + "learning_rate": 1.2846634208644897e-05, + "loss": 0.6942, + "step": 82440 + }, + { + "epoch": 1.3234562352525723, + "grad_norm": 0.8029254078865051, + "learning_rate": 1.284112522083937e-05, + "loss": 0.7976, + "step": 82450 + }, + { + "epoch": 1.3236167514727364, + "grad_norm": 1.3531498908996582, + "learning_rate": 1.2835617006266923e-05, + "loss": 0.5908, + "step": 82460 + }, + { + "epoch": 1.3237772676929005, + "grad_norm": 1.451842188835144, + "learning_rate": 1.2830109565277826e-05, + "loss": 0.7711, + "step": 82470 + }, + { + "epoch": 1.3239377839130644, + "grad_norm": 1.108634114265442, + "learning_rate": 1.2824602898222333e-05, + "loss": 0.7707, + "step": 82480 + }, + { + "epoch": 1.3240983001332285, + "grad_norm": 0.7400292158126831, + "learning_rate": 1.281909700545063e-05, + "loss": 0.742, + "step": 82490 + }, + { + "epoch": 1.3242588163533924, + "grad_norm": 0.6951978802680969, + "learning_rate": 1.2813591887312865e-05, + "loss": 0.7452, + "step": 82500 + }, + { + "epoch": 1.3244193325735565, + "grad_norm": 0.6786137223243713, + "learning_rate": 1.2808087544159125e-05, + "loss": 0.846, + "step": 82510 + }, + { + "epoch": 1.3245798487937206, + "grad_norm": 0.9308404326438904, + "learning_rate": 1.2802583976339473e-05, + "loss": 0.7043, + "step": 82520 + }, + { + "epoch": 1.3247403650138847, + "grad_norm": 0.8202348947525024, + "learning_rate": 1.2797081184203877e-05, + "loss": 0.6599, + "step": 82530 + }, + { + "epoch": 1.3249008812340488, + "grad_norm": 0.9493246674537659, + "learning_rate": 1.2791579168102291e-05, + "loss": 0.7892, + "step": 82540 + }, + { + "epoch": 1.3250613974542127, + "grad_norm": 1.1303696632385254, + "learning_rate": 1.2786077928384619e-05, + "loss": 0.8885, + "step": 82550 + }, + { + "epoch": 1.3252219136743768, + "grad_norm": 0.8558535575866699, + "learning_rate": 1.27805774654007e-05, + "loss": 0.8035, + "step": 82560 + }, + { + "epoch": 1.325382429894541, + "grad_norm": 0.9078492522239685, + "learning_rate": 1.2775077779500333e-05, + "loss": 0.6476, + "step": 82570 + }, + { + "epoch": 1.3255429461147048, + "grad_norm": 2.188948154449463, + "learning_rate": 1.2769578871033275e-05, + "loss": 0.6828, + "step": 82580 + }, + { + "epoch": 1.325703462334869, + "grad_norm": 0.9500462412834167, + "learning_rate": 1.2764080740349204e-05, + "loss": 0.6268, + "step": 82590 + }, + { + "epoch": 1.325863978555033, + "grad_norm": 1.5491433143615723, + "learning_rate": 1.2758583387797784e-05, + "loss": 0.7112, + "step": 82600 + }, + { + "epoch": 1.3260244947751971, + "grad_norm": 1.6328904628753662, + "learning_rate": 1.275308681372861e-05, + "loss": 0.6483, + "step": 82610 + }, + { + "epoch": 1.326185010995361, + "grad_norm": 1.4500538110733032, + "learning_rate": 1.2747591018491234e-05, + "loss": 0.6292, + "step": 82620 + }, + { + "epoch": 1.3263455272155251, + "grad_norm": 0.8116426467895508, + "learning_rate": 1.2742096002435152e-05, + "loss": 0.7021, + "step": 82630 + }, + { + "epoch": 1.3265060434356892, + "grad_norm": 0.9565368294715881, + "learning_rate": 1.2736601765909827e-05, + "loss": 0.6137, + "step": 82640 + }, + { + "epoch": 1.3266665596558531, + "grad_norm": 1.467550277709961, + "learning_rate": 1.2731108309264644e-05, + "loss": 0.7203, + "step": 82650 + }, + { + "epoch": 1.3268270758760172, + "grad_norm": 0.8186715841293335, + "learning_rate": 1.272561563284896e-05, + "loss": 0.698, + "step": 82660 + }, + { + "epoch": 1.3269875920961813, + "grad_norm": 1.2555673122406006, + "learning_rate": 1.272012373701208e-05, + "loss": 0.6851, + "step": 82670 + }, + { + "epoch": 1.3271481083163454, + "grad_norm": 0.8800225257873535, + "learning_rate": 1.2714632622103254e-05, + "loss": 0.8002, + "step": 82680 + }, + { + "epoch": 1.3273086245365096, + "grad_norm": 1.2115617990493774, + "learning_rate": 1.2709142288471685e-05, + "loss": 0.6882, + "step": 82690 + }, + { + "epoch": 1.3274691407566734, + "grad_norm": 0.7920953035354614, + "learning_rate": 1.2703652736466525e-05, + "loss": 0.6513, + "step": 82700 + }, + { + "epoch": 1.3276296569768375, + "grad_norm": 1.193222165107727, + "learning_rate": 1.269816396643689e-05, + "loss": 0.6829, + "step": 82710 + }, + { + "epoch": 1.3277901731970014, + "grad_norm": 1.2477281093597412, + "learning_rate": 1.2692675978731813e-05, + "loss": 0.683, + "step": 82720 + }, + { + "epoch": 1.3279506894171655, + "grad_norm": 0.8676280379295349, + "learning_rate": 1.2687188773700306e-05, + "loss": 0.7166, + "step": 82730 + }, + { + "epoch": 1.3281112056373297, + "grad_norm": 0.8815311789512634, + "learning_rate": 1.2681702351691324e-05, + "loss": 0.676, + "step": 82740 + }, + { + "epoch": 1.3282717218574938, + "grad_norm": 1.226804494857788, + "learning_rate": 1.2676216713053773e-05, + "loss": 0.7644, + "step": 82750 + }, + { + "epoch": 1.3284322380776579, + "grad_norm": 0.8104398250579834, + "learning_rate": 1.2670731858136503e-05, + "loss": 0.8467, + "step": 82760 + }, + { + "epoch": 1.3285927542978218, + "grad_norm": 1.8327666521072388, + "learning_rate": 1.2665247787288326e-05, + "loss": 0.7557, + "step": 82770 + }, + { + "epoch": 1.3287532705179859, + "grad_norm": 0.9268956184387207, + "learning_rate": 1.2659764500857987e-05, + "loss": 0.6608, + "step": 82780 + }, + { + "epoch": 1.32891378673815, + "grad_norm": 0.9010215997695923, + "learning_rate": 1.2654281999194198e-05, + "loss": 0.7186, + "step": 82790 + }, + { + "epoch": 1.3290743029583139, + "grad_norm": 0.7741445302963257, + "learning_rate": 1.2648800282645613e-05, + "loss": 0.7476, + "step": 82800 + }, + { + "epoch": 1.329234819178478, + "grad_norm": 1.0601128339767456, + "learning_rate": 1.2643319351560833e-05, + "loss": 0.682, + "step": 82810 + }, + { + "epoch": 1.329395335398642, + "grad_norm": 1.3787330389022827, + "learning_rate": 1.2637839206288418e-05, + "loss": 0.6957, + "step": 82820 + }, + { + "epoch": 1.3295558516188062, + "grad_norm": 1.0480433702468872, + "learning_rate": 1.2632359847176881e-05, + "loss": 0.7509, + "step": 82830 + }, + { + "epoch": 1.32971636783897, + "grad_norm": 1.0169583559036255, + "learning_rate": 1.2626881274574657e-05, + "loss": 0.572, + "step": 82840 + }, + { + "epoch": 1.3298768840591342, + "grad_norm": 0.8419861793518066, + "learning_rate": 1.262140348883016e-05, + "loss": 0.6665, + "step": 82850 + }, + { + "epoch": 1.3300374002792983, + "grad_norm": 0.8989015221595764, + "learning_rate": 1.2615926490291747e-05, + "loss": 0.6986, + "step": 82860 + }, + { + "epoch": 1.3301979164994622, + "grad_norm": 0.7027350664138794, + "learning_rate": 1.2610450279307722e-05, + "loss": 0.5945, + "step": 82870 + }, + { + "epoch": 1.3303584327196263, + "grad_norm": 0.9493429064750671, + "learning_rate": 1.260497485622634e-05, + "loss": 0.75, + "step": 82880 + }, + { + "epoch": 1.3305189489397904, + "grad_norm": 1.2205586433410645, + "learning_rate": 1.2599500221395815e-05, + "loss": 0.7397, + "step": 82890 + }, + { + "epoch": 1.3306794651599545, + "grad_norm": 1.5487425327301025, + "learning_rate": 1.2594026375164283e-05, + "loss": 0.684, + "step": 82900 + }, + { + "epoch": 1.3308399813801184, + "grad_norm": 0.993385374546051, + "learning_rate": 1.2588553317879859e-05, + "loss": 0.7032, + "step": 82910 + }, + { + "epoch": 1.3310004976002825, + "grad_norm": 0.7103989124298096, + "learning_rate": 1.2583081049890594e-05, + "loss": 0.6574, + "step": 82920 + }, + { + "epoch": 1.3311610138204466, + "grad_norm": 0.959307849407196, + "learning_rate": 1.2577609571544496e-05, + "loss": 0.7721, + "step": 82930 + }, + { + "epoch": 1.3313215300406105, + "grad_norm": 0.980225145816803, + "learning_rate": 1.2572138883189516e-05, + "loss": 0.7251, + "step": 82940 + }, + { + "epoch": 1.3314820462607746, + "grad_norm": 1.1489509344100952, + "learning_rate": 1.2566668985173569e-05, + "loss": 0.7291, + "step": 82950 + }, + { + "epoch": 1.3316425624809387, + "grad_norm": 1.2022111415863037, + "learning_rate": 1.2561199877844487e-05, + "loss": 0.7112, + "step": 82960 + }, + { + "epoch": 1.3318030787011028, + "grad_norm": 1.9069970846176147, + "learning_rate": 1.2555731561550088e-05, + "loss": 0.5764, + "step": 82970 + }, + { + "epoch": 1.331963594921267, + "grad_norm": 1.0116561651229858, + "learning_rate": 1.2550264036638121e-05, + "loss": 0.882, + "step": 82980 + }, + { + "epoch": 1.3321241111414308, + "grad_norm": 1.417973518371582, + "learning_rate": 1.2544797303456285e-05, + "loss": 0.7163, + "step": 82990 + }, + { + "epoch": 1.332284627361595, + "grad_norm": 1.7230010032653809, + "learning_rate": 1.2539331362352238e-05, + "loss": 0.7521, + "step": 83000 + }, + { + "epoch": 1.3324451435817588, + "grad_norm": 1.1764057874679565, + "learning_rate": 1.2533866213673579e-05, + "loss": 0.7615, + "step": 83010 + }, + { + "epoch": 1.332605659801923, + "grad_norm": 1.1727474927902222, + "learning_rate": 1.2528401857767871e-05, + "loss": 0.7605, + "step": 83020 + }, + { + "epoch": 1.332766176022087, + "grad_norm": 0.8463351130485535, + "learning_rate": 1.2522938294982594e-05, + "loss": 0.6965, + "step": 83030 + }, + { + "epoch": 1.3329266922422511, + "grad_norm": 1.775602102279663, + "learning_rate": 1.251747552566521e-05, + "loss": 0.6051, + "step": 83040 + }, + { + "epoch": 1.3330872084624152, + "grad_norm": 1.8952362537384033, + "learning_rate": 1.2512013550163116e-05, + "loss": 0.7647, + "step": 83050 + }, + { + "epoch": 1.3332477246825791, + "grad_norm": 1.104189395904541, + "learning_rate": 1.250655236882366e-05, + "loss": 0.6674, + "step": 83060 + }, + { + "epoch": 1.3334082409027432, + "grad_norm": 0.9286359548568726, + "learning_rate": 1.250109198199415e-05, + "loss": 0.8249, + "step": 83070 + }, + { + "epoch": 1.3335687571229073, + "grad_norm": 1.2788639068603516, + "learning_rate": 1.2495632390021835e-05, + "loss": 0.644, + "step": 83080 + }, + { + "epoch": 1.3337292733430712, + "grad_norm": 1.6513100862503052, + "learning_rate": 1.2490173593253898e-05, + "loss": 0.8585, + "step": 83090 + }, + { + "epoch": 1.3338897895632353, + "grad_norm": 1.1214590072631836, + "learning_rate": 1.2484715592037494e-05, + "loss": 0.7819, + "step": 83100 + }, + { + "epoch": 1.3340503057833994, + "grad_norm": 1.1866357326507568, + "learning_rate": 1.2479258386719722e-05, + "loss": 0.7711, + "step": 83110 + }, + { + "epoch": 1.3342108220035636, + "grad_norm": 1.8832050561904907, + "learning_rate": 1.2473801977647628e-05, + "loss": 0.5921, + "step": 83120 + }, + { + "epoch": 1.3343713382237274, + "grad_norm": 0.9446771740913391, + "learning_rate": 1.2468346365168207e-05, + "loss": 0.8108, + "step": 83130 + }, + { + "epoch": 1.3345318544438916, + "grad_norm": 0.8148835301399231, + "learning_rate": 1.2462891549628413e-05, + "loss": 0.6709, + "step": 83140 + }, + { + "epoch": 1.3346923706640557, + "grad_norm": 0.8497385382652283, + "learning_rate": 1.2457437531375121e-05, + "loss": 0.6116, + "step": 83150 + }, + { + "epoch": 1.3348528868842195, + "grad_norm": 1.4148070812225342, + "learning_rate": 1.2451984310755185e-05, + "loss": 0.6659, + "step": 83160 + }, + { + "epoch": 1.3350134031043837, + "grad_norm": 1.9004915952682495, + "learning_rate": 1.2446531888115399e-05, + "loss": 0.7947, + "step": 83170 + }, + { + "epoch": 1.3351739193245478, + "grad_norm": 1.1849368810653687, + "learning_rate": 1.2441080263802501e-05, + "loss": 0.6834, + "step": 83180 + }, + { + "epoch": 1.3353344355447119, + "grad_norm": 1.1833302974700928, + "learning_rate": 1.2435629438163185e-05, + "loss": 0.6636, + "step": 83190 + }, + { + "epoch": 1.3354949517648758, + "grad_norm": 0.992701530456543, + "learning_rate": 1.2430179411544101e-05, + "loss": 0.8471, + "step": 83200 + }, + { + "epoch": 1.3356554679850399, + "grad_norm": 1.0146918296813965, + "learning_rate": 1.2424730184291822e-05, + "loss": 0.7039, + "step": 83210 + }, + { + "epoch": 1.335815984205204, + "grad_norm": 1.371101975440979, + "learning_rate": 1.241928175675289e-05, + "loss": 0.7565, + "step": 83220 + }, + { + "epoch": 1.3359765004253679, + "grad_norm": 1.1571987867355347, + "learning_rate": 1.2413834129273798e-05, + "loss": 0.6204, + "step": 83230 + }, + { + "epoch": 1.336137016645532, + "grad_norm": 0.8751712441444397, + "learning_rate": 1.2408387302200985e-05, + "loss": 0.7687, + "step": 83240 + }, + { + "epoch": 1.336297532865696, + "grad_norm": 0.7109758257865906, + "learning_rate": 1.2402941275880833e-05, + "loss": 0.804, + "step": 83250 + }, + { + "epoch": 1.3364580490858602, + "grad_norm": 1.189833641052246, + "learning_rate": 1.2397496050659679e-05, + "loss": 0.7077, + "step": 83260 + }, + { + "epoch": 1.3366185653060243, + "grad_norm": 0.9349237680435181, + "learning_rate": 1.2392051626883816e-05, + "loss": 0.5407, + "step": 83270 + }, + { + "epoch": 1.3367790815261882, + "grad_norm": 1.6188441514968872, + "learning_rate": 1.2386608004899458e-05, + "loss": 0.8509, + "step": 83280 + }, + { + "epoch": 1.3369395977463523, + "grad_norm": 1.4815700054168701, + "learning_rate": 1.2381165185052798e-05, + "loss": 0.7947, + "step": 83290 + }, + { + "epoch": 1.3371001139665162, + "grad_norm": 0.8501349687576294, + "learning_rate": 1.2375723167689967e-05, + "loss": 0.693, + "step": 83300 + }, + { + "epoch": 1.3372606301866803, + "grad_norm": 0.7743089199066162, + "learning_rate": 1.2370281953157045e-05, + "loss": 0.7113, + "step": 83310 + }, + { + "epoch": 1.3374211464068444, + "grad_norm": 1.8260126113891602, + "learning_rate": 1.2364841541800065e-05, + "loss": 0.7651, + "step": 83320 + }, + { + "epoch": 1.3375816626270085, + "grad_norm": 1.4235702753067017, + "learning_rate": 1.2359401933964997e-05, + "loss": 0.8095, + "step": 83330 + }, + { + "epoch": 1.3377421788471726, + "grad_norm": 1.0590269565582275, + "learning_rate": 1.235396312999779e-05, + "loss": 0.7527, + "step": 83340 + }, + { + "epoch": 1.3379026950673365, + "grad_norm": 1.05699622631073, + "learning_rate": 1.2348525130244288e-05, + "loss": 0.6051, + "step": 83350 + }, + { + "epoch": 1.3380632112875006, + "grad_norm": 1.1893185377120972, + "learning_rate": 1.2343087935050333e-05, + "loss": 0.7798, + "step": 83360 + }, + { + "epoch": 1.3382237275076647, + "grad_norm": 0.9432458281517029, + "learning_rate": 1.2337651544761694e-05, + "loss": 0.6882, + "step": 83370 + }, + { + "epoch": 1.3383842437278286, + "grad_norm": 1.1504203081130981, + "learning_rate": 1.23322159597241e-05, + "loss": 0.6273, + "step": 83380 + }, + { + "epoch": 1.3385447599479927, + "grad_norm": 0.9640244245529175, + "learning_rate": 1.2326781180283218e-05, + "loss": 0.7322, + "step": 83390 + }, + { + "epoch": 1.3387052761681568, + "grad_norm": 1.1133891344070435, + "learning_rate": 1.2321347206784666e-05, + "loss": 0.6689, + "step": 83400 + }, + { + "epoch": 1.338865792388321, + "grad_norm": 1.204648494720459, + "learning_rate": 1.231591403957402e-05, + "loss": 0.721, + "step": 83410 + }, + { + "epoch": 1.3390263086084848, + "grad_norm": 0.7917761206626892, + "learning_rate": 1.2310481678996787e-05, + "loss": 0.6487, + "step": 83420 + }, + { + "epoch": 1.339186824828649, + "grad_norm": 0.9764518141746521, + "learning_rate": 1.2305050125398446e-05, + "loss": 0.8532, + "step": 83430 + }, + { + "epoch": 1.339347341048813, + "grad_norm": 1.1902744770050049, + "learning_rate": 1.2299619379124402e-05, + "loss": 0.6627, + "step": 83440 + }, + { + "epoch": 1.339507857268977, + "grad_norm": 1.0951356887817383, + "learning_rate": 1.2294189440520031e-05, + "loss": 0.6982, + "step": 83450 + }, + { + "epoch": 1.339668373489141, + "grad_norm": 1.4364850521087646, + "learning_rate": 1.2288760309930628e-05, + "loss": 0.7286, + "step": 83460 + }, + { + "epoch": 1.3398288897093051, + "grad_norm": 1.1429837942123413, + "learning_rate": 1.228333198770146e-05, + "loss": 0.6912, + "step": 83470 + }, + { + "epoch": 1.3399894059294692, + "grad_norm": 0.501032292842865, + "learning_rate": 1.2277904474177737e-05, + "loss": 0.6641, + "step": 83480 + }, + { + "epoch": 1.3401499221496334, + "grad_norm": 0.8063519597053528, + "learning_rate": 1.2272477769704621e-05, + "loss": 0.7721, + "step": 83490 + }, + { + "epoch": 1.3403104383697972, + "grad_norm": 0.9122986793518066, + "learning_rate": 1.2267051874627216e-05, + "loss": 0.6639, + "step": 83500 + }, + { + "epoch": 1.3404709545899613, + "grad_norm": 1.2274585962295532, + "learning_rate": 1.2261626789290575e-05, + "loss": 0.6592, + "step": 83510 + }, + { + "epoch": 1.3406314708101252, + "grad_norm": 1.497604250907898, + "learning_rate": 1.2256202514039717e-05, + "loss": 0.7746, + "step": 83520 + }, + { + "epoch": 1.3407919870302893, + "grad_norm": 1.2844446897506714, + "learning_rate": 1.2250779049219569e-05, + "loss": 0.7044, + "step": 83530 + }, + { + "epoch": 1.3409525032504535, + "grad_norm": 0.9232126474380493, + "learning_rate": 1.2245356395175042e-05, + "loss": 0.6955, + "step": 83540 + }, + { + "epoch": 1.3411130194706176, + "grad_norm": 1.0078281164169312, + "learning_rate": 1.2239934552250986e-05, + "loss": 0.7215, + "step": 83550 + }, + { + "epoch": 1.3412735356907817, + "grad_norm": 1.028801679611206, + "learning_rate": 1.2234513520792199e-05, + "loss": 0.6914, + "step": 83560 + }, + { + "epoch": 1.3414340519109456, + "grad_norm": 1.2249194383621216, + "learning_rate": 1.2229093301143427e-05, + "loss": 0.5972, + "step": 83570 + }, + { + "epoch": 1.3415945681311097, + "grad_norm": 1.970324158668518, + "learning_rate": 1.222367389364937e-05, + "loss": 0.6997, + "step": 83580 + }, + { + "epoch": 1.3417550843512738, + "grad_norm": 0.9209614992141724, + "learning_rate": 1.2218255298654657e-05, + "loss": 0.7519, + "step": 83590 + }, + { + "epoch": 1.3419156005714377, + "grad_norm": 0.8481440544128418, + "learning_rate": 1.2212837516503887e-05, + "loss": 0.7248, + "step": 83600 + }, + { + "epoch": 1.3420761167916018, + "grad_norm": 0.7913222312927246, + "learning_rate": 1.2207420547541595e-05, + "loss": 0.7667, + "step": 83610 + }, + { + "epoch": 1.3422366330117659, + "grad_norm": 0.9447056651115417, + "learning_rate": 1.2202004392112273e-05, + "loss": 0.6816, + "step": 83620 + }, + { + "epoch": 1.34239714923193, + "grad_norm": 0.6965305805206299, + "learning_rate": 1.2196589050560356e-05, + "loss": 0.7138, + "step": 83630 + }, + { + "epoch": 1.3425576654520939, + "grad_norm": 0.8097999095916748, + "learning_rate": 1.2191174523230237e-05, + "loss": 0.7153, + "step": 83640 + }, + { + "epoch": 1.342718181672258, + "grad_norm": 0.9477149844169617, + "learning_rate": 1.2185760810466227e-05, + "loss": 0.6879, + "step": 83650 + }, + { + "epoch": 1.342878697892422, + "grad_norm": 0.7420792579650879, + "learning_rate": 1.2180347912612616e-05, + "loss": 0.7135, + "step": 83660 + }, + { + "epoch": 1.343039214112586, + "grad_norm": 1.0326871871948242, + "learning_rate": 1.2174935830013636e-05, + "loss": 0.6461, + "step": 83670 + }, + { + "epoch": 1.34319973033275, + "grad_norm": 0.9385388493537903, + "learning_rate": 1.2169524563013462e-05, + "loss": 0.7012, + "step": 83680 + }, + { + "epoch": 1.3433602465529142, + "grad_norm": 0.9174789190292358, + "learning_rate": 1.2164114111956218e-05, + "loss": 0.6968, + "step": 83690 + }, + { + "epoch": 1.3435207627730783, + "grad_norm": 0.4532835781574249, + "learning_rate": 1.2158704477185986e-05, + "loss": 0.7695, + "step": 83700 + }, + { + "epoch": 1.3436812789932422, + "grad_norm": 0.6881856918334961, + "learning_rate": 1.215329565904677e-05, + "loss": 0.6709, + "step": 83710 + }, + { + "epoch": 1.3438417952134063, + "grad_norm": 0.9251907467842102, + "learning_rate": 1.2147887657882547e-05, + "loss": 0.5997, + "step": 83720 + }, + { + "epoch": 1.3440023114335704, + "grad_norm": 0.9402714371681213, + "learning_rate": 1.2142480474037235e-05, + "loss": 0.6043, + "step": 83730 + }, + { + "epoch": 1.3441628276537343, + "grad_norm": 1.3477976322174072, + "learning_rate": 1.21370741078547e-05, + "loss": 0.6395, + "step": 83740 + }, + { + "epoch": 1.3443233438738984, + "grad_norm": 1.7855148315429688, + "learning_rate": 1.2131668559678751e-05, + "loss": 0.6811, + "step": 83750 + }, + { + "epoch": 1.3444838600940625, + "grad_norm": 1.0140444040298462, + "learning_rate": 1.2126263829853166e-05, + "loss": 0.6996, + "step": 83760 + }, + { + "epoch": 1.3446443763142266, + "grad_norm": 1.1497066020965576, + "learning_rate": 1.2120859918721627e-05, + "loss": 0.6958, + "step": 83770 + }, + { + "epoch": 1.3448048925343907, + "grad_norm": 1.8814046382904053, + "learning_rate": 1.2115456826627807e-05, + "loss": 0.7711, + "step": 83780 + }, + { + "epoch": 1.3449654087545546, + "grad_norm": 0.7861173152923584, + "learning_rate": 1.2110054553915307e-05, + "loss": 0.7533, + "step": 83790 + }, + { + "epoch": 1.3451259249747187, + "grad_norm": 1.0861846208572388, + "learning_rate": 1.2104653100927682e-05, + "loss": 0.6176, + "step": 83800 + }, + { + "epoch": 1.3452864411948826, + "grad_norm": 1.368775725364685, + "learning_rate": 1.2099252468008429e-05, + "loss": 0.8066, + "step": 83810 + }, + { + "epoch": 1.3454469574150467, + "grad_norm": 0.6463556289672852, + "learning_rate": 1.2093852655501003e-05, + "loss": 0.6826, + "step": 83820 + }, + { + "epoch": 1.3456074736352108, + "grad_norm": 0.9458009600639343, + "learning_rate": 1.2088453663748803e-05, + "loss": 0.6591, + "step": 83830 + }, + { + "epoch": 1.345767989855375, + "grad_norm": 1.4057217836380005, + "learning_rate": 1.2083055493095161e-05, + "loss": 0.7045, + "step": 83840 + }, + { + "epoch": 1.345928506075539, + "grad_norm": 1.471020221710205, + "learning_rate": 1.2077658143883372e-05, + "loss": 0.6103, + "step": 83850 + }, + { + "epoch": 1.346089022295703, + "grad_norm": 1.4271717071533203, + "learning_rate": 1.2072261616456679e-05, + "loss": 0.742, + "step": 83860 + }, + { + "epoch": 1.346249538515867, + "grad_norm": 0.9771077632904053, + "learning_rate": 1.206686591115827e-05, + "loss": 0.6576, + "step": 83870 + }, + { + "epoch": 1.3464100547360311, + "grad_norm": 0.8799369931221008, + "learning_rate": 1.2061471028331279e-05, + "loss": 0.6939, + "step": 83880 + }, + { + "epoch": 1.346570570956195, + "grad_norm": 1.1549745798110962, + "learning_rate": 1.2056076968318796e-05, + "loss": 0.7042, + "step": 83890 + }, + { + "epoch": 1.3467310871763591, + "grad_norm": 0.9974158406257629, + "learning_rate": 1.2050683731463838e-05, + "loss": 0.8597, + "step": 83900 + }, + { + "epoch": 1.3468916033965233, + "grad_norm": 1.6012468338012695, + "learning_rate": 1.204529131810939e-05, + "loss": 0.7564, + "step": 83910 + }, + { + "epoch": 1.3470521196166874, + "grad_norm": 0.780163586139679, + "learning_rate": 1.2039899728598378e-05, + "loss": 0.7004, + "step": 83920 + }, + { + "epoch": 1.3472126358368512, + "grad_norm": 1.1537455320358276, + "learning_rate": 1.2034508963273674e-05, + "loss": 0.7567, + "step": 83930 + }, + { + "epoch": 1.3473731520570154, + "grad_norm": 1.8649896383285522, + "learning_rate": 1.20291190224781e-05, + "loss": 0.8282, + "step": 83940 + }, + { + "epoch": 1.3475336682771795, + "grad_norm": 1.168674111366272, + "learning_rate": 1.2023729906554426e-05, + "loss": 0.6775, + "step": 83950 + }, + { + "epoch": 1.3476941844973434, + "grad_norm": 1.0853954553604126, + "learning_rate": 1.2018341615845374e-05, + "loss": 0.7381, + "step": 83960 + }, + { + "epoch": 1.3478547007175075, + "grad_norm": 0.9721826910972595, + "learning_rate": 1.2012954150693595e-05, + "loss": 0.5949, + "step": 83970 + }, + { + "epoch": 1.3480152169376716, + "grad_norm": 0.9651376605033875, + "learning_rate": 1.2007567511441704e-05, + "loss": 0.7858, + "step": 83980 + }, + { + "epoch": 1.3481757331578357, + "grad_norm": 1.0879871845245361, + "learning_rate": 1.200218169843226e-05, + "loss": 0.6834, + "step": 83990 + }, + { + "epoch": 1.3483362493779998, + "grad_norm": 0.8067581057548523, + "learning_rate": 1.1996796712007772e-05, + "loss": 0.7815, + "step": 84000 + }, + { + "epoch": 1.3483362493779998, + "eval_loss": 0.773542582988739, + "eval_runtime": 1834.2878, + "eval_samples_per_second": 14.3, + "eval_steps_per_second": 1.788, + "step": 84000 + }, + { + "epoch": 1.3484967655981637, + "grad_norm": 1.8508962392807007, + "learning_rate": 1.1991412552510692e-05, + "loss": 0.7558, + "step": 84010 + }, + { + "epoch": 1.3486572818183278, + "grad_norm": 1.5022846460342407, + "learning_rate": 1.1986029220283421e-05, + "loss": 0.9073, + "step": 84020 + }, + { + "epoch": 1.3488177980384917, + "grad_norm": 0.8728272914886475, + "learning_rate": 1.1980646715668306e-05, + "loss": 0.6189, + "step": 84030 + }, + { + "epoch": 1.3489783142586558, + "grad_norm": 0.6128484010696411, + "learning_rate": 1.1975265039007647e-05, + "loss": 0.7121, + "step": 84040 + }, + { + "epoch": 1.3491388304788199, + "grad_norm": 0.9033302068710327, + "learning_rate": 1.1969884190643682e-05, + "loss": 0.6796, + "step": 84050 + }, + { + "epoch": 1.349299346698984, + "grad_norm": 1.3213016986846924, + "learning_rate": 1.1964504170918602e-05, + "loss": 0.7063, + "step": 84060 + }, + { + "epoch": 1.349459862919148, + "grad_norm": 0.8838151693344116, + "learning_rate": 1.1959124980174549e-05, + "loss": 0.8116, + "step": 84070 + }, + { + "epoch": 1.349620379139312, + "grad_norm": 1.3375639915466309, + "learning_rate": 1.1953746618753614e-05, + "loss": 0.7748, + "step": 84080 + }, + { + "epoch": 1.349780895359476, + "grad_norm": 1.1149625778198242, + "learning_rate": 1.194836908699781e-05, + "loss": 0.7285, + "step": 84090 + }, + { + "epoch": 1.3499414115796402, + "grad_norm": 0.7090871334075928, + "learning_rate": 1.1942992385249126e-05, + "loss": 0.6047, + "step": 84100 + }, + { + "epoch": 1.350101927799804, + "grad_norm": 0.772484302520752, + "learning_rate": 1.193761651384949e-05, + "loss": 0.7323, + "step": 84110 + }, + { + "epoch": 1.3502624440199682, + "grad_norm": 0.7505108118057251, + "learning_rate": 1.1932241473140776e-05, + "loss": 0.7766, + "step": 84120 + }, + { + "epoch": 1.3504229602401323, + "grad_norm": 0.7765289545059204, + "learning_rate": 1.1926867263464805e-05, + "loss": 0.6618, + "step": 84130 + }, + { + "epoch": 1.3505834764602964, + "grad_norm": 1.1154499053955078, + "learning_rate": 1.1921493885163354e-05, + "loss": 0.7227, + "step": 84140 + }, + { + "epoch": 1.3507439926804603, + "grad_norm": 0.9845353960990906, + "learning_rate": 1.1916121338578117e-05, + "loss": 0.6852, + "step": 84150 + }, + { + "epoch": 1.3509045089006244, + "grad_norm": 0.9882499575614929, + "learning_rate": 1.1910749624050772e-05, + "loss": 0.7008, + "step": 84160 + }, + { + "epoch": 1.3510650251207885, + "grad_norm": 1.0930012464523315, + "learning_rate": 1.190537874192292e-05, + "loss": 0.6798, + "step": 84170 + }, + { + "epoch": 1.3512255413409524, + "grad_norm": 0.96038818359375, + "learning_rate": 1.1900008692536126e-05, + "loss": 0.7928, + "step": 84180 + }, + { + "epoch": 1.3513860575611165, + "grad_norm": 1.2010669708251953, + "learning_rate": 1.1894639476231889e-05, + "loss": 0.6299, + "step": 84190 + }, + { + "epoch": 1.3515465737812806, + "grad_norm": 1.307680368423462, + "learning_rate": 1.188927109335167e-05, + "loss": 0.5959, + "step": 84200 + }, + { + "epoch": 1.3517070900014447, + "grad_norm": 1.0772244930267334, + "learning_rate": 1.1883903544236849e-05, + "loss": 0.6831, + "step": 84210 + }, + { + "epoch": 1.3518676062216086, + "grad_norm": 1.2486804723739624, + "learning_rate": 1.1878536829228775e-05, + "loss": 0.8156, + "step": 84220 + }, + { + "epoch": 1.3520281224417727, + "grad_norm": 0.9239603281021118, + "learning_rate": 1.1873170948668749e-05, + "loss": 0.7512, + "step": 84230 + }, + { + "epoch": 1.3521886386619368, + "grad_norm": 0.8848470449447632, + "learning_rate": 1.1867805902898003e-05, + "loss": 0.64, + "step": 84240 + }, + { + "epoch": 1.3523491548821007, + "grad_norm": 0.9962363243103027, + "learning_rate": 1.186244169225772e-05, + "loss": 0.6824, + "step": 84250 + }, + { + "epoch": 1.3525096711022648, + "grad_norm": 1.825828194618225, + "learning_rate": 1.1857078317089046e-05, + "loss": 0.727, + "step": 84260 + }, + { + "epoch": 1.352670187322429, + "grad_norm": 0.842032790184021, + "learning_rate": 1.1851715777733041e-05, + "loss": 0.8565, + "step": 84270 + }, + { + "epoch": 1.352830703542593, + "grad_norm": 1.110234022140503, + "learning_rate": 1.184635407453074e-05, + "loss": 0.8402, + "step": 84280 + }, + { + "epoch": 1.3529912197627572, + "grad_norm": 1.002404808998108, + "learning_rate": 1.1840993207823118e-05, + "loss": 0.7081, + "step": 84290 + }, + { + "epoch": 1.353151735982921, + "grad_norm": 1.2576905488967896, + "learning_rate": 1.1835633177951091e-05, + "loss": 0.7362, + "step": 84300 + }, + { + "epoch": 1.3533122522030852, + "grad_norm": 0.9868327379226685, + "learning_rate": 1.1830273985255524e-05, + "loss": 0.7388, + "step": 84310 + }, + { + "epoch": 1.353472768423249, + "grad_norm": 0.9790274500846863, + "learning_rate": 1.1824915630077245e-05, + "loss": 0.7476, + "step": 84320 + }, + { + "epoch": 1.3536332846434131, + "grad_norm": 1.0969125032424927, + "learning_rate": 1.1819558112756995e-05, + "loss": 0.8607, + "step": 84330 + }, + { + "epoch": 1.3537938008635773, + "grad_norm": 1.08942449092865, + "learning_rate": 1.1814201433635486e-05, + "loss": 0.7427, + "step": 84340 + }, + { + "epoch": 1.3539543170837414, + "grad_norm": 1.0338586568832397, + "learning_rate": 1.1808845593053372e-05, + "loss": 0.7369, + "step": 84350 + }, + { + "epoch": 1.3541148333039055, + "grad_norm": 1.368172526359558, + "learning_rate": 1.1803490591351255e-05, + "loss": 0.7225, + "step": 84360 + }, + { + "epoch": 1.3542753495240694, + "grad_norm": 0.7379749417304993, + "learning_rate": 1.179813642886968e-05, + "loss": 0.6722, + "step": 84370 + }, + { + "epoch": 1.3544358657442335, + "grad_norm": 0.9260835647583008, + "learning_rate": 1.179278310594914e-05, + "loss": 0.7276, + "step": 84380 + }, + { + "epoch": 1.3545963819643976, + "grad_norm": 1.6219117641448975, + "learning_rate": 1.1787430622930087e-05, + "loss": 0.6865, + "step": 84390 + }, + { + "epoch": 1.3547568981845615, + "grad_norm": 1.9278379678726196, + "learning_rate": 1.1782078980152886e-05, + "loss": 0.6454, + "step": 84400 + }, + { + "epoch": 1.3549174144047256, + "grad_norm": 0.9703067541122437, + "learning_rate": 1.1776728177957882e-05, + "loss": 0.7421, + "step": 84410 + }, + { + "epoch": 1.3550779306248897, + "grad_norm": 1.2567813396453857, + "learning_rate": 1.1771378216685353e-05, + "loss": 0.6813, + "step": 84420 + }, + { + "epoch": 1.3552384468450538, + "grad_norm": 1.3841959238052368, + "learning_rate": 1.1766029096675524e-05, + "loss": 0.7064, + "step": 84430 + }, + { + "epoch": 1.3553989630652177, + "grad_norm": 1.0979188680648804, + "learning_rate": 1.1760680818268568e-05, + "loss": 0.7137, + "step": 84440 + }, + { + "epoch": 1.3555594792853818, + "grad_norm": 1.0142196416854858, + "learning_rate": 1.1755333381804617e-05, + "loss": 0.6784, + "step": 84450 + }, + { + "epoch": 1.355719995505546, + "grad_norm": 0.9270263314247131, + "learning_rate": 1.1749986787623716e-05, + "loss": 0.649, + "step": 84460 + }, + { + "epoch": 1.3558805117257098, + "grad_norm": 1.0232547521591187, + "learning_rate": 1.1744641036065882e-05, + "loss": 0.5889, + "step": 84470 + }, + { + "epoch": 1.356041027945874, + "grad_norm": 0.9800785183906555, + "learning_rate": 1.1739296127471081e-05, + "loss": 0.7411, + "step": 84480 + }, + { + "epoch": 1.356201544166038, + "grad_norm": 0.8538102507591248, + "learning_rate": 1.173395206217921e-05, + "loss": 0.668, + "step": 84490 + }, + { + "epoch": 1.356362060386202, + "grad_norm": 0.7634408473968506, + "learning_rate": 1.1728608840530125e-05, + "loss": 0.7046, + "step": 84500 + }, + { + "epoch": 1.356522576606366, + "grad_norm": 1.5113071203231812, + "learning_rate": 1.1723266462863633e-05, + "loss": 0.7648, + "step": 84510 + }, + { + "epoch": 1.35668309282653, + "grad_norm": 1.1978785991668701, + "learning_rate": 1.1717924929519458e-05, + "loss": 0.7386, + "step": 84520 + }, + { + "epoch": 1.3568436090466942, + "grad_norm": 0.7515418529510498, + "learning_rate": 1.1712584240837301e-05, + "loss": 0.6901, + "step": 84530 + }, + { + "epoch": 1.357004125266858, + "grad_norm": 0.8568819165229797, + "learning_rate": 1.1707244397156794e-05, + "loss": 0.5892, + "step": 84540 + }, + { + "epoch": 1.3571646414870222, + "grad_norm": 1.0255839824676514, + "learning_rate": 1.1701905398817526e-05, + "loss": 0.6652, + "step": 84550 + }, + { + "epoch": 1.3573251577071863, + "grad_norm": 1.1408259868621826, + "learning_rate": 1.169656724615902e-05, + "loss": 0.7793, + "step": 84560 + }, + { + "epoch": 1.3574856739273504, + "grad_norm": 1.7388150691986084, + "learning_rate": 1.1691229939520756e-05, + "loss": 0.7465, + "step": 84570 + }, + { + "epoch": 1.3576461901475145, + "grad_norm": 0.799095869064331, + "learning_rate": 1.1685893479242151e-05, + "loss": 0.676, + "step": 84580 + }, + { + "epoch": 1.3578067063676784, + "grad_norm": 0.867562472820282, + "learning_rate": 1.1680557865662587e-05, + "loss": 0.6651, + "step": 84590 + }, + { + "epoch": 1.3579672225878425, + "grad_norm": 0.8376501798629761, + "learning_rate": 1.1675223099121355e-05, + "loss": 0.7875, + "step": 84600 + }, + { + "epoch": 1.3581277388080064, + "grad_norm": 1.732627272605896, + "learning_rate": 1.1669889179957725e-05, + "loss": 0.7408, + "step": 84610 + }, + { + "epoch": 1.3582882550281705, + "grad_norm": 0.7220125198364258, + "learning_rate": 1.1664556108510902e-05, + "loss": 0.5823, + "step": 84620 + }, + { + "epoch": 1.3584487712483346, + "grad_norm": 1.2861098051071167, + "learning_rate": 1.1659223885120041e-05, + "loss": 0.8302, + "step": 84630 + }, + { + "epoch": 1.3586092874684987, + "grad_norm": 0.9886950850486755, + "learning_rate": 1.1653892510124239e-05, + "loss": 0.6774, + "step": 84640 + }, + { + "epoch": 1.3587698036886628, + "grad_norm": 1.075566291809082, + "learning_rate": 1.1648561983862538e-05, + "loss": 0.6503, + "step": 84650 + }, + { + "epoch": 1.3589303199088267, + "grad_norm": 0.7734619379043579, + "learning_rate": 1.1643232306673931e-05, + "loss": 0.6273, + "step": 84660 + }, + { + "epoch": 1.3590908361289908, + "grad_norm": 1.3022727966308594, + "learning_rate": 1.1637903478897352e-05, + "loss": 0.8196, + "step": 84670 + }, + { + "epoch": 1.359251352349155, + "grad_norm": 0.94758141040802, + "learning_rate": 1.1632575500871687e-05, + "loss": 0.6861, + "step": 84680 + }, + { + "epoch": 1.3594118685693188, + "grad_norm": 0.8676806092262268, + "learning_rate": 1.162724837293576e-05, + "loss": 0.7737, + "step": 84690 + }, + { + "epoch": 1.359572384789483, + "grad_norm": 0.8218750953674316, + "learning_rate": 1.1621922095428353e-05, + "loss": 0.5638, + "step": 84700 + }, + { + "epoch": 1.359732901009647, + "grad_norm": 0.9818390607833862, + "learning_rate": 1.1616596668688174e-05, + "loss": 0.6976, + "step": 84710 + }, + { + "epoch": 1.3598934172298112, + "grad_norm": 0.6655218005180359, + "learning_rate": 1.1611272093053891e-05, + "loss": 0.6824, + "step": 84720 + }, + { + "epoch": 1.360053933449975, + "grad_norm": 1.0498600006103516, + "learning_rate": 1.1605948368864122e-05, + "loss": 0.602, + "step": 84730 + }, + { + "epoch": 1.3602144496701392, + "grad_norm": 1.1975957155227661, + "learning_rate": 1.160062549645742e-05, + "loss": 0.6581, + "step": 84740 + }, + { + "epoch": 1.3603749658903033, + "grad_norm": 0.8587167859077454, + "learning_rate": 1.1595303476172289e-05, + "loss": 0.6796, + "step": 84750 + }, + { + "epoch": 1.3605354821104672, + "grad_norm": 0.8543789386749268, + "learning_rate": 1.1589982308347191e-05, + "loss": 0.6083, + "step": 84760 + }, + { + "epoch": 1.3606959983306313, + "grad_norm": 0.9831714630126953, + "learning_rate": 1.1584661993320497e-05, + "loss": 0.7707, + "step": 84770 + }, + { + "epoch": 1.3608565145507954, + "grad_norm": 1.1615002155303955, + "learning_rate": 1.1579342531430563e-05, + "loss": 0.6595, + "step": 84780 + }, + { + "epoch": 1.3610170307709595, + "grad_norm": 3.136396646499634, + "learning_rate": 1.1574023923015673e-05, + "loss": 0.7905, + "step": 84790 + }, + { + "epoch": 1.3611775469911236, + "grad_norm": 0.7897974252700806, + "learning_rate": 1.1568706168414056e-05, + "loss": 0.7136, + "step": 84800 + }, + { + "epoch": 1.3613380632112875, + "grad_norm": 1.0559269189834595, + "learning_rate": 1.1563389267963896e-05, + "loss": 0.6496, + "step": 84810 + }, + { + "epoch": 1.3614985794314516, + "grad_norm": 0.7883936762809753, + "learning_rate": 1.155807322200332e-05, + "loss": 0.6799, + "step": 84820 + }, + { + "epoch": 1.3616590956516155, + "grad_norm": 0.7402806282043457, + "learning_rate": 1.1552758030870386e-05, + "loss": 0.6426, + "step": 84830 + }, + { + "epoch": 1.3618196118717796, + "grad_norm": 1.2086678743362427, + "learning_rate": 1.1547443694903109e-05, + "loss": 0.7591, + "step": 84840 + }, + { + "epoch": 1.3619801280919437, + "grad_norm": 1.2001839876174927, + "learning_rate": 1.1542130214439458e-05, + "loss": 0.6503, + "step": 84850 + }, + { + "epoch": 1.3621406443121078, + "grad_norm": 0.965160071849823, + "learning_rate": 1.1536817589817336e-05, + "loss": 0.8116, + "step": 84860 + }, + { + "epoch": 1.362301160532272, + "grad_norm": 1.1921335458755493, + "learning_rate": 1.1531505821374591e-05, + "loss": 0.6611, + "step": 84870 + }, + { + "epoch": 1.3624616767524358, + "grad_norm": 1.1836940050125122, + "learning_rate": 1.1526194909449026e-05, + "loss": 0.7135, + "step": 84880 + }, + { + "epoch": 1.3626221929726, + "grad_norm": 0.5734943151473999, + "learning_rate": 1.1520884854378391e-05, + "loss": 0.7859, + "step": 84890 + }, + { + "epoch": 1.362782709192764, + "grad_norm": 0.8218778371810913, + "learning_rate": 1.1515575656500355e-05, + "loss": 0.7961, + "step": 84900 + }, + { + "epoch": 1.362943225412928, + "grad_norm": 1.1345276832580566, + "learning_rate": 1.1510267316152563e-05, + "loss": 0.7707, + "step": 84910 + }, + { + "epoch": 1.363103741633092, + "grad_norm": 2.1513493061065674, + "learning_rate": 1.150495983367259e-05, + "loss": 0.6829, + "step": 84920 + }, + { + "epoch": 1.3632642578532561, + "grad_norm": 0.9105267524719238, + "learning_rate": 1.1499653209397967e-05, + "loss": 0.7083, + "step": 84930 + }, + { + "epoch": 1.3634247740734202, + "grad_norm": 0.9363168478012085, + "learning_rate": 1.1494347443666159e-05, + "loss": 0.7423, + "step": 84940 + }, + { + "epoch": 1.363585290293584, + "grad_norm": 0.6525752544403076, + "learning_rate": 1.1489042536814592e-05, + "loss": 0.6932, + "step": 84950 + }, + { + "epoch": 1.3637458065137482, + "grad_norm": 1.2648166418075562, + "learning_rate": 1.1483738489180613e-05, + "loss": 0.6932, + "step": 84960 + }, + { + "epoch": 1.3639063227339123, + "grad_norm": 0.9640535116195679, + "learning_rate": 1.1478435301101531e-05, + "loss": 0.7719, + "step": 84970 + }, + { + "epoch": 1.3640668389540762, + "grad_norm": 1.274213433265686, + "learning_rate": 1.1473132972914599e-05, + "loss": 0.687, + "step": 84980 + }, + { + "epoch": 1.3642273551742403, + "grad_norm": 1.2522038221359253, + "learning_rate": 1.1467831504957016e-05, + "loss": 0.6942, + "step": 84990 + }, + { + "epoch": 1.3643878713944044, + "grad_norm": 0.8111152648925781, + "learning_rate": 1.1462530897565926e-05, + "loss": 0.6882, + "step": 85000 + }, + { + "epoch": 1.3645483876145685, + "grad_norm": 1.1023921966552734, + "learning_rate": 1.145723115107842e-05, + "loss": 0.7392, + "step": 85010 + }, + { + "epoch": 1.3647089038347324, + "grad_norm": 1.0723152160644531, + "learning_rate": 1.145193226583152e-05, + "loss": 0.7315, + "step": 85020 + }, + { + "epoch": 1.3648694200548965, + "grad_norm": 0.7809601426124573, + "learning_rate": 1.1446634242162204e-05, + "loss": 0.7345, + "step": 85030 + }, + { + "epoch": 1.3650299362750606, + "grad_norm": 1.6037641763687134, + "learning_rate": 1.1441337080407402e-05, + "loss": 0.7739, + "step": 85040 + }, + { + "epoch": 1.3651904524952245, + "grad_norm": 1.193173885345459, + "learning_rate": 1.143604078090398e-05, + "loss": 0.7615, + "step": 85050 + }, + { + "epoch": 1.3653509687153886, + "grad_norm": 0.9262489676475525, + "learning_rate": 1.1430745343988752e-05, + "loss": 0.707, + "step": 85060 + }, + { + "epoch": 1.3655114849355527, + "grad_norm": 1.504935383796692, + "learning_rate": 1.1425450769998486e-05, + "loss": 0.6485, + "step": 85070 + }, + { + "epoch": 1.3656720011557169, + "grad_norm": 1.0880736112594604, + "learning_rate": 1.1420157059269867e-05, + "loss": 0.7633, + "step": 85080 + }, + { + "epoch": 1.365832517375881, + "grad_norm": 1.0331189632415771, + "learning_rate": 1.1414864212139554e-05, + "loss": 0.7102, + "step": 85090 + }, + { + "epoch": 1.3659930335960448, + "grad_norm": 0.7060509920120239, + "learning_rate": 1.140957222894414e-05, + "loss": 0.6777, + "step": 85100 + }, + { + "epoch": 1.366153549816209, + "grad_norm": 0.9296097755432129, + "learning_rate": 1.1404281110020166e-05, + "loss": 0.724, + "step": 85110 + }, + { + "epoch": 1.3663140660363728, + "grad_norm": 0.878687858581543, + "learning_rate": 1.1398990855704115e-05, + "loss": 0.6269, + "step": 85120 + }, + { + "epoch": 1.366474582256537, + "grad_norm": 1.1457804441452026, + "learning_rate": 1.139370146633242e-05, + "loss": 0.5575, + "step": 85130 + }, + { + "epoch": 1.366635098476701, + "grad_norm": 1.4642033576965332, + "learning_rate": 1.1388412942241447e-05, + "loss": 0.6676, + "step": 85140 + }, + { + "epoch": 1.3667956146968652, + "grad_norm": 1.2299257516860962, + "learning_rate": 1.1383125283767518e-05, + "loss": 0.6203, + "step": 85150 + }, + { + "epoch": 1.3669561309170293, + "grad_norm": 5.0501885414123535, + "learning_rate": 1.1377838491246895e-05, + "loss": 0.688, + "step": 85160 + }, + { + "epoch": 1.3671166471371932, + "grad_norm": 1.1196398735046387, + "learning_rate": 1.1372552565015793e-05, + "loss": 0.6597, + "step": 85170 + }, + { + "epoch": 1.3672771633573573, + "grad_norm": 0.7640538811683655, + "learning_rate": 1.1367267505410362e-05, + "loss": 0.7452, + "step": 85180 + }, + { + "epoch": 1.3674376795775214, + "grad_norm": 0.7784835696220398, + "learning_rate": 1.1361983312766703e-05, + "loss": 0.7477, + "step": 85190 + }, + { + "epoch": 1.3675981957976853, + "grad_norm": 1.633409857749939, + "learning_rate": 1.1356699987420855e-05, + "loss": 0.6716, + "step": 85200 + }, + { + "epoch": 1.3677587120178494, + "grad_norm": 0.8492593765258789, + "learning_rate": 1.1351417529708822e-05, + "loss": 0.7127, + "step": 85210 + }, + { + "epoch": 1.3679192282380135, + "grad_norm": 0.9581201076507568, + "learning_rate": 1.1346135939966516e-05, + "loss": 0.7662, + "step": 85220 + }, + { + "epoch": 1.3680797444581776, + "grad_norm": 0.7322894334793091, + "learning_rate": 1.134085521852982e-05, + "loss": 0.7332, + "step": 85230 + }, + { + "epoch": 1.3682402606783415, + "grad_norm": 2.3692007064819336, + "learning_rate": 1.1335575365734563e-05, + "loss": 0.7117, + "step": 85240 + }, + { + "epoch": 1.3684007768985056, + "grad_norm": 1.173947811126709, + "learning_rate": 1.1330296381916513e-05, + "loss": 0.7216, + "step": 85250 + }, + { + "epoch": 1.3685612931186697, + "grad_norm": 0.9467964172363281, + "learning_rate": 1.1325018267411378e-05, + "loss": 0.7176, + "step": 85260 + }, + { + "epoch": 1.3687218093388336, + "grad_norm": 1.3482518196105957, + "learning_rate": 1.131974102255482e-05, + "loss": 0.709, + "step": 85270 + }, + { + "epoch": 1.3688823255589977, + "grad_norm": 1.0626147985458374, + "learning_rate": 1.1314464647682435e-05, + "loss": 0.6444, + "step": 85280 + }, + { + "epoch": 1.3690428417791618, + "grad_norm": 1.1086533069610596, + "learning_rate": 1.1309189143129772e-05, + "loss": 0.7213, + "step": 85290 + }, + { + "epoch": 1.369203357999326, + "grad_norm": 0.9805077314376831, + "learning_rate": 1.1303914509232325e-05, + "loss": 0.7327, + "step": 85300 + }, + { + "epoch": 1.36936387421949, + "grad_norm": 0.9234163165092468, + "learning_rate": 1.1298640746325526e-05, + "loss": 0.6855, + "step": 85310 + }, + { + "epoch": 1.369524390439654, + "grad_norm": 2.019000768661499, + "learning_rate": 1.1293367854744766e-05, + "loss": 0.8225, + "step": 85320 + }, + { + "epoch": 1.369684906659818, + "grad_norm": 1.062179684638977, + "learning_rate": 1.1288095834825352e-05, + "loss": 0.6551, + "step": 85330 + }, + { + "epoch": 1.369845422879982, + "grad_norm": 1.0250693559646606, + "learning_rate": 1.1282824686902563e-05, + "loss": 0.6464, + "step": 85340 + }, + { + "epoch": 1.370005939100146, + "grad_norm": 1.0102099180221558, + "learning_rate": 1.1277554411311611e-05, + "loss": 0.8147, + "step": 85350 + }, + { + "epoch": 1.3701664553203101, + "grad_norm": 1.497228741645813, + "learning_rate": 1.1272285008387657e-05, + "loss": 0.7252, + "step": 85360 + }, + { + "epoch": 1.3703269715404742, + "grad_norm": 1.1691272258758545, + "learning_rate": 1.12670164784658e-05, + "loss": 0.4749, + "step": 85370 + }, + { + "epoch": 1.3704874877606383, + "grad_norm": 1.1636940240859985, + "learning_rate": 1.1261748821881102e-05, + "loss": 0.6419, + "step": 85380 + }, + { + "epoch": 1.3706480039808022, + "grad_norm": 1.3186198472976685, + "learning_rate": 1.1256482038968536e-05, + "loss": 0.7865, + "step": 85390 + }, + { + "epoch": 1.3708085202009663, + "grad_norm": 1.0932663679122925, + "learning_rate": 1.1251216130063041e-05, + "loss": 0.6918, + "step": 85400 + }, + { + "epoch": 1.3709690364211304, + "grad_norm": 1.0307642221450806, + "learning_rate": 1.1245951095499507e-05, + "loss": 0.689, + "step": 85410 + }, + { + "epoch": 1.3711295526412943, + "grad_norm": 0.940466582775116, + "learning_rate": 1.1240686935612755e-05, + "loss": 0.7986, + "step": 85420 + }, + { + "epoch": 1.3712900688614584, + "grad_norm": 1.0265800952911377, + "learning_rate": 1.1235423650737553e-05, + "loss": 0.7099, + "step": 85430 + }, + { + "epoch": 1.3714505850816225, + "grad_norm": 1.1815040111541748, + "learning_rate": 1.123016124120862e-05, + "loss": 0.6874, + "step": 85440 + }, + { + "epoch": 1.3716111013017867, + "grad_norm": 1.3566460609436035, + "learning_rate": 1.1224899707360616e-05, + "loss": 0.7666, + "step": 85450 + }, + { + "epoch": 1.3717716175219505, + "grad_norm": 0.7178074717521667, + "learning_rate": 1.1219639049528136e-05, + "loss": 0.7539, + "step": 85460 + }, + { + "epoch": 1.3719321337421146, + "grad_norm": 1.09394371509552, + "learning_rate": 1.1214379268045724e-05, + "loss": 0.7529, + "step": 85470 + }, + { + "epoch": 1.3720926499622788, + "grad_norm": 0.9914615154266357, + "learning_rate": 1.120912036324788e-05, + "loss": 0.7442, + "step": 85480 + }, + { + "epoch": 1.3722531661824426, + "grad_norm": 1.304383397102356, + "learning_rate": 1.1203862335469038e-05, + "loss": 0.6588, + "step": 85490 + }, + { + "epoch": 1.3724136824026067, + "grad_norm": 1.0231883525848389, + "learning_rate": 1.1198605185043573e-05, + "loss": 0.6849, + "step": 85500 + }, + { + "epoch": 1.3725741986227709, + "grad_norm": 1.0539964437484741, + "learning_rate": 1.1193348912305823e-05, + "loss": 0.6611, + "step": 85510 + }, + { + "epoch": 1.372734714842935, + "grad_norm": 0.8817369341850281, + "learning_rate": 1.1188093517590038e-05, + "loss": 0.5826, + "step": 85520 + }, + { + "epoch": 1.3728952310630989, + "grad_norm": 1.479509949684143, + "learning_rate": 1.1182839001230436e-05, + "loss": 0.6727, + "step": 85530 + }, + { + "epoch": 1.373055747283263, + "grad_norm": 1.116058588027954, + "learning_rate": 1.1177585363561174e-05, + "loss": 0.6734, + "step": 85540 + }, + { + "epoch": 1.373216263503427, + "grad_norm": 1.095413088798523, + "learning_rate": 1.1172332604916356e-05, + "loss": 0.6836, + "step": 85550 + }, + { + "epoch": 1.373376779723591, + "grad_norm": 0.8554247617721558, + "learning_rate": 1.1167080725630023e-05, + "loss": 0.6295, + "step": 85560 + }, + { + "epoch": 1.373537295943755, + "grad_norm": 0.7700021266937256, + "learning_rate": 1.1161829726036171e-05, + "loss": 0.727, + "step": 85570 + }, + { + "epoch": 1.3736978121639192, + "grad_norm": 1.0228923559188843, + "learning_rate": 1.115657960646872e-05, + "loss": 0.7383, + "step": 85580 + }, + { + "epoch": 1.3738583283840833, + "grad_norm": 1.0198333263397217, + "learning_rate": 1.1151330367261553e-05, + "loss": 0.7884, + "step": 85590 + }, + { + "epoch": 1.3740188446042474, + "grad_norm": 0.888423502445221, + "learning_rate": 1.1146082008748492e-05, + "loss": 0.7074, + "step": 85600 + }, + { + "epoch": 1.3741793608244113, + "grad_norm": 1.5299293994903564, + "learning_rate": 1.1140834531263303e-05, + "loss": 0.6735, + "step": 85610 + }, + { + "epoch": 1.3743398770445754, + "grad_norm": 1.2289024591445923, + "learning_rate": 1.1135587935139688e-05, + "loss": 0.7339, + "step": 85620 + }, + { + "epoch": 1.3745003932647393, + "grad_norm": 1.0177981853485107, + "learning_rate": 1.1130342220711318e-05, + "loss": 0.7193, + "step": 85630 + }, + { + "epoch": 1.3746609094849034, + "grad_norm": 0.9306913018226624, + "learning_rate": 1.1125097388311765e-05, + "loss": 0.6543, + "step": 85640 + }, + { + "epoch": 1.3748214257050675, + "grad_norm": 1.131875991821289, + "learning_rate": 1.1119853438274581e-05, + "loss": 0.8121, + "step": 85650 + }, + { + "epoch": 1.3749819419252316, + "grad_norm": 0.8643496036529541, + "learning_rate": 1.1114610370933249e-05, + "loss": 0.6603, + "step": 85660 + }, + { + "epoch": 1.3751424581453957, + "grad_norm": 0.7744705677032471, + "learning_rate": 1.1109368186621199e-05, + "loss": 0.7065, + "step": 85670 + }, + { + "epoch": 1.3753029743655596, + "grad_norm": 0.7625482082366943, + "learning_rate": 1.1104126885671804e-05, + "loss": 0.6046, + "step": 85680 + }, + { + "epoch": 1.3754634905857237, + "grad_norm": 0.7804808616638184, + "learning_rate": 1.1098886468418388e-05, + "loss": 0.815, + "step": 85690 + }, + { + "epoch": 1.3756240068058878, + "grad_norm": 0.8379820585250854, + "learning_rate": 1.1093646935194189e-05, + "loss": 0.6606, + "step": 85700 + }, + { + "epoch": 1.3757845230260517, + "grad_norm": 0.937261700630188, + "learning_rate": 1.1088408286332424e-05, + "loss": 0.7481, + "step": 85710 + }, + { + "epoch": 1.3759450392462158, + "grad_norm": 1.1801199913024902, + "learning_rate": 1.1083170522166243e-05, + "loss": 0.6955, + "step": 85720 + }, + { + "epoch": 1.37610555546638, + "grad_norm": 3.0262019634246826, + "learning_rate": 1.107793364302873e-05, + "loss": 0.7013, + "step": 85730 + }, + { + "epoch": 1.376266071686544, + "grad_norm": 0.8990810513496399, + "learning_rate": 1.1072697649252922e-05, + "loss": 0.7084, + "step": 85740 + }, + { + "epoch": 1.376426587906708, + "grad_norm": 2.617387056350708, + "learning_rate": 1.10674625411718e-05, + "loss": 0.7232, + "step": 85750 + }, + { + "epoch": 1.376587104126872, + "grad_norm": 1.1695202589035034, + "learning_rate": 1.1062228319118293e-05, + "loss": 0.663, + "step": 85760 + }, + { + "epoch": 1.3767476203470361, + "grad_norm": 0.6475804448127747, + "learning_rate": 1.105699498342525e-05, + "loss": 0.672, + "step": 85770 + }, + { + "epoch": 1.3769081365672, + "grad_norm": 0.9305492639541626, + "learning_rate": 1.1051762534425488e-05, + "loss": 0.686, + "step": 85780 + }, + { + "epoch": 1.3770686527873641, + "grad_norm": 0.9180136322975159, + "learning_rate": 1.1046530972451763e-05, + "loss": 0.6368, + "step": 85790 + }, + { + "epoch": 1.3772291690075282, + "grad_norm": 1.2689064741134644, + "learning_rate": 1.1041300297836768e-05, + "loss": 0.6735, + "step": 85800 + }, + { + "epoch": 1.3773896852276923, + "grad_norm": 0.8153706789016724, + "learning_rate": 1.1036070510913146e-05, + "loss": 0.7122, + "step": 85810 + }, + { + "epoch": 1.3775502014478562, + "grad_norm": 1.1967133283615112, + "learning_rate": 1.1030841612013478e-05, + "loss": 0.7615, + "step": 85820 + }, + { + "epoch": 1.3777107176680203, + "grad_norm": 1.0825470685958862, + "learning_rate": 1.1025613601470293e-05, + "loss": 0.5967, + "step": 85830 + }, + { + "epoch": 1.3778712338881844, + "grad_norm": 1.0162150859832764, + "learning_rate": 1.1020386479616074e-05, + "loss": 0.9058, + "step": 85840 + }, + { + "epoch": 1.3780317501083483, + "grad_norm": 2.0462846755981445, + "learning_rate": 1.101516024678321e-05, + "loss": 0.6704, + "step": 85850 + }, + { + "epoch": 1.3781922663285124, + "grad_norm": 1.4119298458099365, + "learning_rate": 1.1009934903304072e-05, + "loss": 0.7038, + "step": 85860 + }, + { + "epoch": 1.3783527825486765, + "grad_norm": 1.0766017436981201, + "learning_rate": 1.1004710449510963e-05, + "loss": 0.6913, + "step": 85870 + }, + { + "epoch": 1.3785132987688407, + "grad_norm": 0.7151301503181458, + "learning_rate": 1.0999486885736124e-05, + "loss": 0.5583, + "step": 85880 + }, + { + "epoch": 1.3786738149890048, + "grad_norm": 0.7985855340957642, + "learning_rate": 1.0994264212311745e-05, + "loss": 0.7116, + "step": 85890 + }, + { + "epoch": 1.3788343312091687, + "grad_norm": 1.0231181383132935, + "learning_rate": 1.0989042429569956e-05, + "loss": 0.6828, + "step": 85900 + }, + { + "epoch": 1.3789948474293328, + "grad_norm": 0.8736083507537842, + "learning_rate": 1.0983821537842834e-05, + "loss": 0.8748, + "step": 85910 + }, + { + "epoch": 1.3791553636494966, + "grad_norm": 1.1861047744750977, + "learning_rate": 1.0978601537462393e-05, + "loss": 0.7377, + "step": 85920 + }, + { + "epoch": 1.3793158798696608, + "grad_norm": 0.9074804782867432, + "learning_rate": 1.0973382428760598e-05, + "loss": 0.7234, + "step": 85930 + }, + { + "epoch": 1.3794763960898249, + "grad_norm": 0.6684462428092957, + "learning_rate": 1.0968164212069361e-05, + "loss": 0.7065, + "step": 85940 + }, + { + "epoch": 1.379636912309989, + "grad_norm": 0.9335461854934692, + "learning_rate": 1.0962946887720516e-05, + "loss": 0.6413, + "step": 85950 + }, + { + "epoch": 1.379797428530153, + "grad_norm": 1.1181824207305908, + "learning_rate": 1.0957730456045858e-05, + "loss": 0.7495, + "step": 85960 + }, + { + "epoch": 1.379957944750317, + "grad_norm": 1.2054270505905151, + "learning_rate": 1.0952514917377121e-05, + "loss": 0.6055, + "step": 85970 + }, + { + "epoch": 1.380118460970481, + "grad_norm": 1.519682765007019, + "learning_rate": 1.0947300272045987e-05, + "loss": 0.7176, + "step": 85980 + }, + { + "epoch": 1.3802789771906452, + "grad_norm": 1.203299641609192, + "learning_rate": 1.0942086520384076e-05, + "loss": 0.6247, + "step": 85990 + }, + { + "epoch": 1.380439493410809, + "grad_norm": 1.0228201150894165, + "learning_rate": 1.0936873662722948e-05, + "loss": 0.6196, + "step": 86000 + }, + { + "epoch": 1.3806000096309732, + "grad_norm": 0.6171165108680725, + "learning_rate": 1.0931661699394121e-05, + "loss": 0.6636, + "step": 86010 + }, + { + "epoch": 1.3807605258511373, + "grad_norm": 0.9204326272010803, + "learning_rate": 1.0926450630729031e-05, + "loss": 0.7268, + "step": 86020 + }, + { + "epoch": 1.3809210420713014, + "grad_norm": 1.1213953495025635, + "learning_rate": 1.0921240457059079e-05, + "loss": 0.6359, + "step": 86030 + }, + { + "epoch": 1.3810815582914653, + "grad_norm": 0.9833868145942688, + "learning_rate": 1.0916031178715597e-05, + "loss": 0.6404, + "step": 86040 + }, + { + "epoch": 1.3812420745116294, + "grad_norm": 1.9943054914474487, + "learning_rate": 1.0910822796029869e-05, + "loss": 0.7424, + "step": 86050 + }, + { + "epoch": 1.3814025907317935, + "grad_norm": 0.8407278060913086, + "learning_rate": 1.0905615309333114e-05, + "loss": 0.7278, + "step": 86060 + }, + { + "epoch": 1.3815631069519574, + "grad_norm": 1.05020272731781, + "learning_rate": 1.0900408718956509e-05, + "loss": 0.6836, + "step": 86070 + }, + { + "epoch": 1.3817236231721215, + "grad_norm": 1.4192527532577515, + "learning_rate": 1.0895203025231147e-05, + "loss": 0.6281, + "step": 86080 + }, + { + "epoch": 1.3818841393922856, + "grad_norm": 0.8466793894767761, + "learning_rate": 1.0889998228488083e-05, + "loss": 0.6653, + "step": 86090 + }, + { + "epoch": 1.3820446556124497, + "grad_norm": 1.0413098335266113, + "learning_rate": 1.0884794329058315e-05, + "loss": 0.6925, + "step": 86100 + }, + { + "epoch": 1.3822051718326138, + "grad_norm": 1.3011882305145264, + "learning_rate": 1.087959132727278e-05, + "loss": 0.8118, + "step": 86110 + }, + { + "epoch": 1.3823656880527777, + "grad_norm": 1.845245361328125, + "learning_rate": 1.0874389223462359e-05, + "loss": 0.602, + "step": 86120 + }, + { + "epoch": 1.3825262042729418, + "grad_norm": 1.1309102773666382, + "learning_rate": 1.0869188017957883e-05, + "loss": 0.5575, + "step": 86130 + }, + { + "epoch": 1.3826867204931057, + "grad_norm": 1.6862515211105347, + "learning_rate": 1.0863987711090101e-05, + "loss": 0.7223, + "step": 86140 + }, + { + "epoch": 1.3828472367132698, + "grad_norm": 1.1055574417114258, + "learning_rate": 1.085878830318973e-05, + "loss": 0.7175, + "step": 86150 + }, + { + "epoch": 1.383007752933434, + "grad_norm": 1.6065328121185303, + "learning_rate": 1.085358979458742e-05, + "loss": 0.8599, + "step": 86160 + }, + { + "epoch": 1.383168269153598, + "grad_norm": 1.87996506690979, + "learning_rate": 1.0848392185613773e-05, + "loss": 0.8262, + "step": 86170 + }, + { + "epoch": 1.3833287853737621, + "grad_norm": 1.204239010810852, + "learning_rate": 1.0843195476599318e-05, + "loss": 0.8371, + "step": 86180 + }, + { + "epoch": 1.383489301593926, + "grad_norm": 1.0465930700302124, + "learning_rate": 1.0837999667874548e-05, + "loss": 0.7296, + "step": 86190 + }, + { + "epoch": 1.3836498178140901, + "grad_norm": 0.5927920937538147, + "learning_rate": 1.0832804759769868e-05, + "loss": 0.6562, + "step": 86200 + }, + { + "epoch": 1.3838103340342542, + "grad_norm": 0.9981905221939087, + "learning_rate": 1.0827610752615655e-05, + "loss": 0.7031, + "step": 86210 + }, + { + "epoch": 1.3839708502544181, + "grad_norm": 0.8455749154090881, + "learning_rate": 1.0822417646742212e-05, + "loss": 0.7067, + "step": 86220 + }, + { + "epoch": 1.3841313664745822, + "grad_norm": 0.7233341932296753, + "learning_rate": 1.0817225442479792e-05, + "loss": 0.5624, + "step": 86230 + }, + { + "epoch": 1.3842918826947463, + "grad_norm": 0.8507183790206909, + "learning_rate": 1.0812034140158591e-05, + "loss": 0.6825, + "step": 86240 + }, + { + "epoch": 1.3844523989149105, + "grad_norm": 1.388709306716919, + "learning_rate": 1.0806843740108743e-05, + "loss": 0.7883, + "step": 86250 + }, + { + "epoch": 1.3846129151350743, + "grad_norm": 1.095920205116272, + "learning_rate": 1.0801654242660336e-05, + "loss": 0.6455, + "step": 86260 + }, + { + "epoch": 1.3847734313552384, + "grad_norm": 0.8310797810554504, + "learning_rate": 1.0796465648143375e-05, + "loss": 0.6683, + "step": 86270 + }, + { + "epoch": 1.3849339475754026, + "grad_norm": 0.7544679641723633, + "learning_rate": 1.0791277956887833e-05, + "loss": 0.6423, + "step": 86280 + }, + { + "epoch": 1.3850944637955664, + "grad_norm": 1.3706682920455933, + "learning_rate": 1.0786091169223617e-05, + "loss": 0.6084, + "step": 86290 + }, + { + "epoch": 1.3852549800157306, + "grad_norm": 1.0347157716751099, + "learning_rate": 1.0780905285480575e-05, + "loss": 0.6411, + "step": 86300 + }, + { + "epoch": 1.3854154962358947, + "grad_norm": 1.003981590270996, + "learning_rate": 1.0775720305988498e-05, + "loss": 0.6012, + "step": 86310 + }, + { + "epoch": 1.3855760124560588, + "grad_norm": 1.0410237312316895, + "learning_rate": 1.077053623107713e-05, + "loss": 0.7192, + "step": 86320 + }, + { + "epoch": 1.3857365286762227, + "grad_norm": 1.4396750926971436, + "learning_rate": 1.0765353061076134e-05, + "loss": 0.6896, + "step": 86330 + }, + { + "epoch": 1.3858970448963868, + "grad_norm": 1.0681265592575073, + "learning_rate": 1.076017079631513e-05, + "loss": 0.7528, + "step": 86340 + }, + { + "epoch": 1.3860575611165509, + "grad_norm": 0.7737208008766174, + "learning_rate": 1.0754989437123686e-05, + "loss": 0.6562, + "step": 86350 + }, + { + "epoch": 1.3862180773367148, + "grad_norm": 0.9055004119873047, + "learning_rate": 1.0749808983831303e-05, + "loss": 0.7673, + "step": 86360 + }, + { + "epoch": 1.3863785935568789, + "grad_norm": 0.8735693097114563, + "learning_rate": 1.0744629436767431e-05, + "loss": 0.7585, + "step": 86370 + }, + { + "epoch": 1.386539109777043, + "grad_norm": 0.9663300514221191, + "learning_rate": 1.0739450796261463e-05, + "loss": 0.6789, + "step": 86380 + }, + { + "epoch": 1.386699625997207, + "grad_norm": 0.9917658567428589, + "learning_rate": 1.0734273062642716e-05, + "loss": 0.7326, + "step": 86390 + }, + { + "epoch": 1.3868601422173712, + "grad_norm": 1.0928226709365845, + "learning_rate": 1.0729096236240468e-05, + "loss": 0.7897, + "step": 86400 + }, + { + "epoch": 1.387020658437535, + "grad_norm": 0.9407363533973694, + "learning_rate": 1.072392031738394e-05, + "loss": 0.8027, + "step": 86410 + }, + { + "epoch": 1.3871811746576992, + "grad_norm": 0.7779992818832397, + "learning_rate": 1.071874530640229e-05, + "loss": 0.7193, + "step": 86420 + }, + { + "epoch": 1.387341690877863, + "grad_norm": 1.0177425146102905, + "learning_rate": 1.0713571203624612e-05, + "loss": 0.8572, + "step": 86430 + }, + { + "epoch": 1.3875022070980272, + "grad_norm": 0.8197349905967712, + "learning_rate": 1.0708398009379955e-05, + "loss": 0.624, + "step": 86440 + }, + { + "epoch": 1.3876627233181913, + "grad_norm": 1.0228911638259888, + "learning_rate": 1.0703225723997301e-05, + "loss": 0.7183, + "step": 86450 + }, + { + "epoch": 1.3878232395383554, + "grad_norm": 1.2690753936767578, + "learning_rate": 1.0698054347805583e-05, + "loss": 0.8197, + "step": 86460 + }, + { + "epoch": 1.3879837557585195, + "grad_norm": 1.3539506196975708, + "learning_rate": 1.0692883881133661e-05, + "loss": 0.7779, + "step": 86470 + }, + { + "epoch": 1.3881442719786834, + "grad_norm": 0.9309757947921753, + "learning_rate": 1.0687714324310346e-05, + "loss": 0.7113, + "step": 86480 + }, + { + "epoch": 1.3883047881988475, + "grad_norm": 1.2564715147018433, + "learning_rate": 1.0682545677664396e-05, + "loss": 0.6585, + "step": 86490 + }, + { + "epoch": 1.3884653044190116, + "grad_norm": 1.2802726030349731, + "learning_rate": 1.0677377941524508e-05, + "loss": 0.7078, + "step": 86500 + }, + { + "epoch": 1.3886258206391755, + "grad_norm": 1.8098795413970947, + "learning_rate": 1.0672211116219316e-05, + "loss": 0.6725, + "step": 86510 + }, + { + "epoch": 1.3887863368593396, + "grad_norm": 0.6965939402580261, + "learning_rate": 1.0667045202077402e-05, + "loss": 0.8595, + "step": 86520 + }, + { + "epoch": 1.3889468530795037, + "grad_norm": 1.1663997173309326, + "learning_rate": 1.0661880199427288e-05, + "loss": 0.5811, + "step": 86530 + }, + { + "epoch": 1.3891073692996678, + "grad_norm": 0.7978525161743164, + "learning_rate": 1.0656716108597436e-05, + "loss": 0.6863, + "step": 86540 + }, + { + "epoch": 1.3892678855198317, + "grad_norm": 0.9443690776824951, + "learning_rate": 1.0651552929916253e-05, + "loss": 0.6858, + "step": 86550 + }, + { + "epoch": 1.3894284017399958, + "grad_norm": 0.8359951972961426, + "learning_rate": 1.0646390663712086e-05, + "loss": 0.6822, + "step": 86560 + }, + { + "epoch": 1.38958891796016, + "grad_norm": 0.9401812553405762, + "learning_rate": 1.0641229310313236e-05, + "loss": 0.6917, + "step": 86570 + }, + { + "epoch": 1.3897494341803238, + "grad_norm": 1.2226307392120361, + "learning_rate": 1.0636068870047915e-05, + "loss": 0.7053, + "step": 86580 + }, + { + "epoch": 1.389909950400488, + "grad_norm": 1.5230646133422852, + "learning_rate": 1.0630909343244308e-05, + "loss": 0.7343, + "step": 86590 + }, + { + "epoch": 1.390070466620652, + "grad_norm": 1.1918938159942627, + "learning_rate": 1.0625750730230527e-05, + "loss": 0.8535, + "step": 86600 + }, + { + "epoch": 1.3902309828408161, + "grad_norm": 1.0191402435302734, + "learning_rate": 1.062059303133463e-05, + "loss": 0.7172, + "step": 86610 + }, + { + "epoch": 1.39039149906098, + "grad_norm": 1.0892010927200317, + "learning_rate": 1.0615436246884619e-05, + "loss": 0.8427, + "step": 86620 + }, + { + "epoch": 1.3905520152811441, + "grad_norm": 1.0471760034561157, + "learning_rate": 1.0610280377208443e-05, + "loss": 0.6555, + "step": 86630 + }, + { + "epoch": 1.3907125315013082, + "grad_norm": 0.8883242607116699, + "learning_rate": 1.0605125422633966e-05, + "loss": 0.7139, + "step": 86640 + }, + { + "epoch": 1.3908730477214721, + "grad_norm": 0.9742724299430847, + "learning_rate": 1.0599971383489025e-05, + "loss": 0.7997, + "step": 86650 + }, + { + "epoch": 1.3910335639416362, + "grad_norm": 0.7562812566757202, + "learning_rate": 1.0594818260101383e-05, + "loss": 0.8341, + "step": 86660 + }, + { + "epoch": 1.3911940801618004, + "grad_norm": 1.0935883522033691, + "learning_rate": 1.058966605279875e-05, + "loss": 0.7063, + "step": 86670 + }, + { + "epoch": 1.3913545963819645, + "grad_norm": 0.9757621884346008, + "learning_rate": 1.0584514761908775e-05, + "loss": 0.6779, + "step": 86680 + }, + { + "epoch": 1.3915151126021286, + "grad_norm": 0.9600942134857178, + "learning_rate": 1.0579364387759061e-05, + "loss": 0.7659, + "step": 86690 + }, + { + "epoch": 1.3916756288222925, + "grad_norm": 1.0599294900894165, + "learning_rate": 1.057421493067712e-05, + "loss": 0.7134, + "step": 86700 + }, + { + "epoch": 1.3918361450424566, + "grad_norm": 1.3224998712539673, + "learning_rate": 1.0569066390990443e-05, + "loss": 0.6351, + "step": 86710 + }, + { + "epoch": 1.3919966612626204, + "grad_norm": 0.6963870525360107, + "learning_rate": 1.056391876902644e-05, + "loss": 0.6286, + "step": 86720 + }, + { + "epoch": 1.3921571774827846, + "grad_norm": 0.8049250245094299, + "learning_rate": 1.0558772065112471e-05, + "loss": 0.7889, + "step": 86730 + }, + { + "epoch": 1.3923176937029487, + "grad_norm": 1.1399540901184082, + "learning_rate": 1.055362627957584e-05, + "loss": 0.7081, + "step": 86740 + }, + { + "epoch": 1.3924782099231128, + "grad_norm": 1.059960126876831, + "learning_rate": 1.0548481412743794e-05, + "loss": 0.674, + "step": 86750 + }, + { + "epoch": 1.3926387261432769, + "grad_norm": 1.0180305242538452, + "learning_rate": 1.0543337464943497e-05, + "loss": 0.6778, + "step": 86760 + }, + { + "epoch": 1.3927992423634408, + "grad_norm": 1.0344080924987793, + "learning_rate": 1.0538194436502089e-05, + "loss": 0.7355, + "step": 86770 + }, + { + "epoch": 1.3929597585836049, + "grad_norm": 1.2994840145111084, + "learning_rate": 1.0533052327746629e-05, + "loss": 0.6955, + "step": 86780 + }, + { + "epoch": 1.393120274803769, + "grad_norm": 1.7994074821472168, + "learning_rate": 1.0527911139004132e-05, + "loss": 0.757, + "step": 86790 + }, + { + "epoch": 1.3932807910239329, + "grad_norm": 0.8044752478599548, + "learning_rate": 1.0522770870601542e-05, + "loss": 0.7148, + "step": 86800 + }, + { + "epoch": 1.393441307244097, + "grad_norm": 0.9992082118988037, + "learning_rate": 1.0517631522865751e-05, + "loss": 0.7104, + "step": 86810 + }, + { + "epoch": 1.393601823464261, + "grad_norm": 0.6973498463630676, + "learning_rate": 1.0512493096123605e-05, + "loss": 0.6946, + "step": 86820 + }, + { + "epoch": 1.3937623396844252, + "grad_norm": 1.0107351541519165, + "learning_rate": 1.0507355590701854e-05, + "loss": 0.7964, + "step": 86830 + }, + { + "epoch": 1.393922855904589, + "grad_norm": 1.234436273574829, + "learning_rate": 1.0502219006927221e-05, + "loss": 0.6582, + "step": 86840 + }, + { + "epoch": 1.3940833721247532, + "grad_norm": 1.0518018007278442, + "learning_rate": 1.0497083345126368e-05, + "loss": 0.6893, + "step": 86850 + }, + { + "epoch": 1.3942438883449173, + "grad_norm": 0.8415085077285767, + "learning_rate": 1.0491948605625892e-05, + "loss": 0.6689, + "step": 86860 + }, + { + "epoch": 1.3944044045650812, + "grad_norm": 0.895039975643158, + "learning_rate": 1.0486814788752331e-05, + "loss": 0.6881, + "step": 86870 + }, + { + "epoch": 1.3945649207852453, + "grad_norm": 1.515985369682312, + "learning_rate": 1.0481681894832173e-05, + "loss": 0.7029, + "step": 86880 + }, + { + "epoch": 1.3947254370054094, + "grad_norm": 0.6932781934738159, + "learning_rate": 1.0476549924191827e-05, + "loss": 0.7587, + "step": 86890 + }, + { + "epoch": 1.3948859532255735, + "grad_norm": 1.0374755859375, + "learning_rate": 1.0471418877157659e-05, + "loss": 0.7848, + "step": 86900 + }, + { + "epoch": 1.3950464694457376, + "grad_norm": 1.0388939380645752, + "learning_rate": 1.0466288754055978e-05, + "loss": 0.7983, + "step": 86910 + }, + { + "epoch": 1.3952069856659015, + "grad_norm": 1.0923672914505005, + "learning_rate": 1.0461159555213026e-05, + "loss": 0.755, + "step": 86920 + }, + { + "epoch": 1.3953675018860656, + "grad_norm": 1.394505262374878, + "learning_rate": 1.0456031280954992e-05, + "loss": 0.699, + "step": 86930 + }, + { + "epoch": 1.3955280181062295, + "grad_norm": 1.1734837293624878, + "learning_rate": 1.0450903931608016e-05, + "loss": 0.796, + "step": 86940 + }, + { + "epoch": 1.3956885343263936, + "grad_norm": 0.9966473579406738, + "learning_rate": 1.0445777507498145e-05, + "loss": 0.8263, + "step": 86950 + }, + { + "epoch": 1.3958490505465577, + "grad_norm": 0.9431701302528381, + "learning_rate": 1.0440652008951399e-05, + "loss": 0.6504, + "step": 86960 + }, + { + "epoch": 1.3960095667667218, + "grad_norm": 0.8134439587593079, + "learning_rate": 1.043552743629373e-05, + "loss": 0.6195, + "step": 86970 + }, + { + "epoch": 1.396170082986886, + "grad_norm": 1.7224749326705933, + "learning_rate": 1.0430403789851031e-05, + "loss": 0.7485, + "step": 86980 + }, + { + "epoch": 1.3963305992070498, + "grad_norm": 0.8129833936691284, + "learning_rate": 1.042528106994914e-05, + "loss": 0.6705, + "step": 86990 + }, + { + "epoch": 1.396491115427214, + "grad_norm": 0.8189294934272766, + "learning_rate": 1.0420159276913832e-05, + "loss": 0.7094, + "step": 87000 + }, + { + "epoch": 1.396651631647378, + "grad_norm": 1.4434696435928345, + "learning_rate": 1.0415038411070812e-05, + "loss": 0.6844, + "step": 87010 + }, + { + "epoch": 1.396812147867542, + "grad_norm": 0.5938977599143982, + "learning_rate": 1.0409918472745745e-05, + "loss": 0.6723, + "step": 87020 + }, + { + "epoch": 1.396972664087706, + "grad_norm": 0.8185879588127136, + "learning_rate": 1.0404799462264228e-05, + "loss": 0.7362, + "step": 87030 + }, + { + "epoch": 1.3971331803078701, + "grad_norm": 1.197464108467102, + "learning_rate": 1.03996813799518e-05, + "loss": 0.7054, + "step": 87040 + }, + { + "epoch": 1.3972936965280343, + "grad_norm": 1.0966882705688477, + "learning_rate": 1.039456422613394e-05, + "loss": 0.6836, + "step": 87050 + }, + { + "epoch": 1.3974542127481981, + "grad_norm": 1.0429717302322388, + "learning_rate": 1.038944800113607e-05, + "loss": 0.6002, + "step": 87060 + }, + { + "epoch": 1.3976147289683623, + "grad_norm": 0.9034978747367859, + "learning_rate": 1.0384332705283553e-05, + "loss": 0.7173, + "step": 87070 + }, + { + "epoch": 1.3977752451885264, + "grad_norm": 0.9126710891723633, + "learning_rate": 1.0379218338901688e-05, + "loss": 0.7171, + "step": 87080 + }, + { + "epoch": 1.3979357614086902, + "grad_norm": 1.3302454948425293, + "learning_rate": 1.0374104902315734e-05, + "loss": 0.6579, + "step": 87090 + }, + { + "epoch": 1.3980962776288544, + "grad_norm": 1.1285349130630493, + "learning_rate": 1.0368992395850851e-05, + "loss": 0.7471, + "step": 87100 + }, + { + "epoch": 1.3982567938490185, + "grad_norm": 0.8497422337532043, + "learning_rate": 1.0363880819832181e-05, + "loss": 0.7136, + "step": 87110 + }, + { + "epoch": 1.3984173100691826, + "grad_norm": 1.8164172172546387, + "learning_rate": 1.035877017458478e-05, + "loss": 0.7265, + "step": 87120 + }, + { + "epoch": 1.3985778262893465, + "grad_norm": 1.112363338470459, + "learning_rate": 1.0353660460433664e-05, + "loss": 0.7142, + "step": 87130 + }, + { + "epoch": 1.3987383425095106, + "grad_norm": 1.5195202827453613, + "learning_rate": 1.0348551677703777e-05, + "loss": 0.682, + "step": 87140 + }, + { + "epoch": 1.3988988587296747, + "grad_norm": 1.3200916051864624, + "learning_rate": 1.0343443826720005e-05, + "loss": 0.7781, + "step": 87150 + }, + { + "epoch": 1.3990593749498386, + "grad_norm": 0.951262354850769, + "learning_rate": 1.0338336907807184e-05, + "loss": 0.7015, + "step": 87160 + }, + { + "epoch": 1.3992198911700027, + "grad_norm": 0.7980061769485474, + "learning_rate": 1.033323092129008e-05, + "loss": 0.7638, + "step": 87170 + }, + { + "epoch": 1.3993804073901668, + "grad_norm": 1.5423816442489624, + "learning_rate": 1.0328125867493401e-05, + "loss": 0.8069, + "step": 87180 + }, + { + "epoch": 1.399540923610331, + "grad_norm": 0.9852812886238098, + "learning_rate": 1.0323021746741815e-05, + "loss": 0.6942, + "step": 87190 + }, + { + "epoch": 1.399701439830495, + "grad_norm": 0.979119062423706, + "learning_rate": 1.031791855935989e-05, + "loss": 0.689, + "step": 87200 + }, + { + "epoch": 1.3998619560506589, + "grad_norm": 0.7733275890350342, + "learning_rate": 1.0312816305672169e-05, + "loss": 0.7748, + "step": 87210 + }, + { + "epoch": 1.400022472270823, + "grad_norm": 1.1320757865905762, + "learning_rate": 1.0307714986003125e-05, + "loss": 0.6476, + "step": 87220 + }, + { + "epoch": 1.4001829884909869, + "grad_norm": 1.0444436073303223, + "learning_rate": 1.0302614600677174e-05, + "loss": 0.7284, + "step": 87230 + }, + { + "epoch": 1.400343504711151, + "grad_norm": 0.8391829133033752, + "learning_rate": 1.0297515150018671e-05, + "loss": 0.7383, + "step": 87240 + }, + { + "epoch": 1.400504020931315, + "grad_norm": 0.9284687638282776, + "learning_rate": 1.0292416634351918e-05, + "loss": 0.7173, + "step": 87250 + }, + { + "epoch": 1.4006645371514792, + "grad_norm": 0.868436336517334, + "learning_rate": 1.0287319054001133e-05, + "loss": 0.6142, + "step": 87260 + }, + { + "epoch": 1.4008250533716433, + "grad_norm": 0.8312922716140747, + "learning_rate": 1.02822224092905e-05, + "loss": 0.6653, + "step": 87270 + }, + { + "epoch": 1.4009855695918072, + "grad_norm": 1.098498821258545, + "learning_rate": 1.0277126700544137e-05, + "loss": 0.5974, + "step": 87280 + }, + { + "epoch": 1.4011460858119713, + "grad_norm": 0.9428097009658813, + "learning_rate": 1.0272031928086103e-05, + "loss": 0.6357, + "step": 87290 + }, + { + "epoch": 1.4013066020321354, + "grad_norm": 1.7345982789993286, + "learning_rate": 1.0266938092240394e-05, + "loss": 0.7284, + "step": 87300 + }, + { + "epoch": 1.4014671182522993, + "grad_norm": 1.2046806812286377, + "learning_rate": 1.0261845193330957e-05, + "loss": 0.6619, + "step": 87310 + }, + { + "epoch": 1.4016276344724634, + "grad_norm": 0.9593937993049622, + "learning_rate": 1.0256753231681654e-05, + "loss": 0.6934, + "step": 87320 + }, + { + "epoch": 1.4017881506926275, + "grad_norm": 0.9183451533317566, + "learning_rate": 1.0251662207616308e-05, + "loss": 0.7921, + "step": 87330 + }, + { + "epoch": 1.4019486669127916, + "grad_norm": 0.797244131565094, + "learning_rate": 1.0246572121458683e-05, + "loss": 0.6979, + "step": 87340 + }, + { + "epoch": 1.4021091831329555, + "grad_norm": 1.2029614448547363, + "learning_rate": 1.0241482973532479e-05, + "loss": 0.6426, + "step": 87350 + }, + { + "epoch": 1.4022696993531196, + "grad_norm": 1.1267727613449097, + "learning_rate": 1.0236394764161334e-05, + "loss": 0.6438, + "step": 87360 + }, + { + "epoch": 1.4024302155732837, + "grad_norm": 1.7240592241287231, + "learning_rate": 1.0231307493668827e-05, + "loss": 0.6869, + "step": 87370 + }, + { + "epoch": 1.4025907317934476, + "grad_norm": 0.9845771789550781, + "learning_rate": 1.0226221162378489e-05, + "loss": 0.6827, + "step": 87380 + }, + { + "epoch": 1.4027512480136117, + "grad_norm": 1.2524312734603882, + "learning_rate": 1.022113577061376e-05, + "loss": 0.8397, + "step": 87390 + }, + { + "epoch": 1.4029117642337758, + "grad_norm": 0.7194914221763611, + "learning_rate": 1.0216051318698059e-05, + "loss": 0.5289, + "step": 87400 + }, + { + "epoch": 1.40307228045394, + "grad_norm": 1.103532314300537, + "learning_rate": 1.0210967806954717e-05, + "loss": 0.7607, + "step": 87410 + }, + { + "epoch": 1.403232796674104, + "grad_norm": 0.9652681946754456, + "learning_rate": 1.0205885235707018e-05, + "loss": 0.7186, + "step": 87420 + }, + { + "epoch": 1.403393312894268, + "grad_norm": 1.2429225444793701, + "learning_rate": 1.020080360527819e-05, + "loss": 0.7044, + "step": 87430 + }, + { + "epoch": 1.403553829114432, + "grad_norm": 0.8774268627166748, + "learning_rate": 1.0195722915991395e-05, + "loss": 0.7797, + "step": 87440 + }, + { + "epoch": 1.403714345334596, + "grad_norm": 0.9187206625938416, + "learning_rate": 1.0190643168169724e-05, + "loss": 0.7289, + "step": 87450 + }, + { + "epoch": 1.40387486155476, + "grad_norm": 1.1674742698669434, + "learning_rate": 1.0185564362136227e-05, + "loss": 0.6458, + "step": 87460 + }, + { + "epoch": 1.4040353777749242, + "grad_norm": 0.8526319265365601, + "learning_rate": 1.0180486498213882e-05, + "loss": 0.7039, + "step": 87470 + }, + { + "epoch": 1.4041958939950883, + "grad_norm": 1.0834074020385742, + "learning_rate": 1.0175409576725613e-05, + "loss": 0.736, + "step": 87480 + }, + { + "epoch": 1.4043564102152524, + "grad_norm": 1.9416131973266602, + "learning_rate": 1.0170333597994287e-05, + "loss": 0.6897, + "step": 87490 + }, + { + "epoch": 1.4045169264354163, + "grad_norm": 0.732707142829895, + "learning_rate": 1.0165258562342708e-05, + "loss": 0.6243, + "step": 87500 + }, + { + "epoch": 1.4046774426555804, + "grad_norm": 1.2829009294509888, + "learning_rate": 1.0160184470093607e-05, + "loss": 0.6811, + "step": 87510 + }, + { + "epoch": 1.4048379588757445, + "grad_norm": 0.8626675605773926, + "learning_rate": 1.015511132156967e-05, + "loss": 0.6497, + "step": 87520 + }, + { + "epoch": 1.4049984750959084, + "grad_norm": 1.2398680448532104, + "learning_rate": 1.0150039117093527e-05, + "loss": 0.6407, + "step": 87530 + }, + { + "epoch": 1.4051589913160725, + "grad_norm": 2.0590970516204834, + "learning_rate": 1.0144967856987732e-05, + "loss": 0.7271, + "step": 87540 + }, + { + "epoch": 1.4053195075362366, + "grad_norm": 0.8812441825866699, + "learning_rate": 1.0139897541574792e-05, + "loss": 0.8072, + "step": 87550 + }, + { + "epoch": 1.4054800237564007, + "grad_norm": 1.3709723949432373, + "learning_rate": 1.0134828171177158e-05, + "loss": 0.6909, + "step": 87560 + }, + { + "epoch": 1.4056405399765646, + "grad_norm": 0.910304069519043, + "learning_rate": 1.0129759746117193e-05, + "loss": 0.7727, + "step": 87570 + }, + { + "epoch": 1.4058010561967287, + "grad_norm": 0.971449613571167, + "learning_rate": 1.0124692266717229e-05, + "loss": 0.669, + "step": 87580 + }, + { + "epoch": 1.4059615724168928, + "grad_norm": 1.2929232120513916, + "learning_rate": 1.0119625733299527e-05, + "loss": 0.6875, + "step": 87590 + }, + { + "epoch": 1.4061220886370567, + "grad_norm": 1.3096979856491089, + "learning_rate": 1.011456014618629e-05, + "loss": 0.6277, + "step": 87600 + }, + { + "epoch": 1.4062826048572208, + "grad_norm": 1.0494531393051147, + "learning_rate": 1.0109495505699659e-05, + "loss": 0.6206, + "step": 87610 + }, + { + "epoch": 1.406443121077385, + "grad_norm": 1.0505517721176147, + "learning_rate": 1.0104431812161716e-05, + "loss": 0.7694, + "step": 87620 + }, + { + "epoch": 1.406603637297549, + "grad_norm": 0.9936231970787048, + "learning_rate": 1.009936906589449e-05, + "loss": 0.7082, + "step": 87630 + }, + { + "epoch": 1.406764153517713, + "grad_norm": 1.1974427700042725, + "learning_rate": 1.0094307267219925e-05, + "loss": 0.804, + "step": 87640 + }, + { + "epoch": 1.406924669737877, + "grad_norm": 1.6582478284835815, + "learning_rate": 1.0089246416459928e-05, + "loss": 0.7393, + "step": 87650 + }, + { + "epoch": 1.407085185958041, + "grad_norm": 1.7147809267044067, + "learning_rate": 1.0084186513936344e-05, + "loss": 0.7538, + "step": 87660 + }, + { + "epoch": 1.407245702178205, + "grad_norm": 1.175061821937561, + "learning_rate": 1.0079127559970949e-05, + "loss": 0.8173, + "step": 87670 + }, + { + "epoch": 1.407406218398369, + "grad_norm": 1.5916744470596313, + "learning_rate": 1.0074069554885467e-05, + "loss": 0.6715, + "step": 87680 + }, + { + "epoch": 1.4075667346185332, + "grad_norm": 0.859457790851593, + "learning_rate": 1.0069012499001554e-05, + "loss": 0.853, + "step": 87690 + }, + { + "epoch": 1.4077272508386973, + "grad_norm": 1.0776780843734741, + "learning_rate": 1.0063956392640812e-05, + "loss": 0.5983, + "step": 87700 + }, + { + "epoch": 1.4078877670588614, + "grad_norm": 0.9298960566520691, + "learning_rate": 1.0058901236124782e-05, + "loss": 0.6768, + "step": 87710 + }, + { + "epoch": 1.4080482832790253, + "grad_norm": 1.1694910526275635, + "learning_rate": 1.0053847029774931e-05, + "loss": 0.6951, + "step": 87720 + }, + { + "epoch": 1.4082087994991894, + "grad_norm": 1.2104179859161377, + "learning_rate": 1.0048793773912684e-05, + "loss": 0.6435, + "step": 87730 + }, + { + "epoch": 1.4083693157193533, + "grad_norm": 0.9585343599319458, + "learning_rate": 1.00437414688594e-05, + "loss": 0.6231, + "step": 87740 + }, + { + "epoch": 1.4085298319395174, + "grad_norm": 1.1407047510147095, + "learning_rate": 1.0038690114936374e-05, + "loss": 0.733, + "step": 87750 + }, + { + "epoch": 1.4086903481596815, + "grad_norm": 1.1332865953445435, + "learning_rate": 1.0033639712464843e-05, + "loss": 0.7465, + "step": 87760 + }, + { + "epoch": 1.4088508643798456, + "grad_norm": 1.1025056838989258, + "learning_rate": 1.002859026176598e-05, + "loss": 0.5833, + "step": 87770 + }, + { + "epoch": 1.4090113806000097, + "grad_norm": 1.1259845495224, + "learning_rate": 1.0023541763160904e-05, + "loss": 0.7472, + "step": 87780 + }, + { + "epoch": 1.4091718968201736, + "grad_norm": 0.9076866507530212, + "learning_rate": 1.001849421697067e-05, + "loss": 0.6583, + "step": 87790 + }, + { + "epoch": 1.4093324130403377, + "grad_norm": 1.141536831855774, + "learning_rate": 1.0013447623516274e-05, + "loss": 0.7857, + "step": 87800 + }, + { + "epoch": 1.4094929292605018, + "grad_norm": 0.7917884588241577, + "learning_rate": 1.0008401983118651e-05, + "loss": 0.7051, + "step": 87810 + }, + { + "epoch": 1.4096534454806657, + "grad_norm": 1.2178821563720703, + "learning_rate": 1.0003357296098661e-05, + "loss": 0.6937, + "step": 87820 + }, + { + "epoch": 1.4098139617008298, + "grad_norm": 0.9789232611656189, + "learning_rate": 9.998313562777128e-06, + "loss": 0.5151, + "step": 87830 + }, + { + "epoch": 1.409974477920994, + "grad_norm": 0.8300692439079285, + "learning_rate": 9.993270783474801e-06, + "loss": 0.6793, + "step": 87840 + }, + { + "epoch": 1.410134994141158, + "grad_norm": 1.2195322513580322, + "learning_rate": 9.988228958512372e-06, + "loss": 0.5949, + "step": 87850 + }, + { + "epoch": 1.410295510361322, + "grad_norm": 1.4344773292541504, + "learning_rate": 9.98318808821047e-06, + "loss": 0.6665, + "step": 87860 + }, + { + "epoch": 1.410456026581486, + "grad_norm": 0.8414912223815918, + "learning_rate": 9.978148172889675e-06, + "loss": 0.603, + "step": 87870 + }, + { + "epoch": 1.4106165428016502, + "grad_norm": 1.0915476083755493, + "learning_rate": 9.97310921287048e-06, + "loss": 0.8424, + "step": 87880 + }, + { + "epoch": 1.410777059021814, + "grad_norm": 1.2631350755691528, + "learning_rate": 9.968071208473339e-06, + "loss": 0.7657, + "step": 87890 + }, + { + "epoch": 1.4109375752419782, + "grad_norm": 1.125070333480835, + "learning_rate": 9.963034160018641e-06, + "loss": 0.5839, + "step": 87900 + }, + { + "epoch": 1.4110980914621423, + "grad_norm": 0.7210896611213684, + "learning_rate": 9.957998067826712e-06, + "loss": 0.7845, + "step": 87910 + }, + { + "epoch": 1.4112586076823064, + "grad_norm": 1.0950247049331665, + "learning_rate": 9.952962932217818e-06, + "loss": 0.7575, + "step": 87920 + }, + { + "epoch": 1.4114191239024703, + "grad_norm": 0.8081896305084229, + "learning_rate": 9.947928753512165e-06, + "loss": 0.5984, + "step": 87930 + }, + { + "epoch": 1.4115796401226344, + "grad_norm": 1.0184662342071533, + "learning_rate": 9.942895532029902e-06, + "loss": 0.7313, + "step": 87940 + }, + { + "epoch": 1.4117401563427985, + "grad_norm": 0.9908921122550964, + "learning_rate": 9.937863268091102e-06, + "loss": 0.7642, + "step": 87950 + }, + { + "epoch": 1.4119006725629624, + "grad_norm": 0.9440973997116089, + "learning_rate": 9.932831962015793e-06, + "loss": 0.6074, + "step": 87960 + }, + { + "epoch": 1.4120611887831265, + "grad_norm": 0.9349722266197205, + "learning_rate": 9.927801614123933e-06, + "loss": 0.6431, + "step": 87970 + }, + { + "epoch": 1.4122217050032906, + "grad_norm": 0.8606159687042236, + "learning_rate": 9.922772224735426e-06, + "loss": 0.7613, + "step": 87980 + }, + { + "epoch": 1.4123822212234547, + "grad_norm": 1.7803573608398438, + "learning_rate": 9.917743794170112e-06, + "loss": 0.6995, + "step": 87990 + }, + { + "epoch": 1.4125427374436188, + "grad_norm": 0.7543964982032776, + "learning_rate": 9.912716322747779e-06, + "loss": 0.7686, + "step": 88000 + }, + { + "epoch": 1.4125427374436188, + "eval_loss": 0.7719078660011292, + "eval_runtime": 1833.8132, + "eval_samples_per_second": 14.304, + "eval_steps_per_second": 1.788, + "step": 88000 + }, + { + "epoch": 1.4127032536637827, + "grad_norm": 0.877136766910553, + "learning_rate": 9.907689810788123e-06, + "loss": 0.732, + "step": 88010 + }, + { + "epoch": 1.4128637698839468, + "grad_norm": 0.7662142515182495, + "learning_rate": 9.902664258610813e-06, + "loss": 0.7445, + "step": 88020 + }, + { + "epoch": 1.4130242861041107, + "grad_norm": 0.7506973147392273, + "learning_rate": 9.897639666535447e-06, + "loss": 0.6605, + "step": 88030 + }, + { + "epoch": 1.4131848023242748, + "grad_norm": 0.86881023645401, + "learning_rate": 9.892616034881552e-06, + "loss": 0.7067, + "step": 88040 + }, + { + "epoch": 1.413345318544439, + "grad_norm": 1.5746726989746094, + "learning_rate": 9.887593363968608e-06, + "loss": 0.7351, + "step": 88050 + }, + { + "epoch": 1.413505834764603, + "grad_norm": 1.3161003589630127, + "learning_rate": 9.882571654116032e-06, + "loss": 0.6639, + "step": 88060 + }, + { + "epoch": 1.4136663509847671, + "grad_norm": 0.9109221696853638, + "learning_rate": 9.877550905643163e-06, + "loss": 0.6806, + "step": 88070 + }, + { + "epoch": 1.413826867204931, + "grad_norm": 1.0587713718414307, + "learning_rate": 9.872531118869294e-06, + "loss": 0.8802, + "step": 88080 + }, + { + "epoch": 1.4139873834250951, + "grad_norm": 1.0826002359390259, + "learning_rate": 9.867512294113657e-06, + "loss": 0.5537, + "step": 88090 + }, + { + "epoch": 1.4141478996452592, + "grad_norm": 0.9939835667610168, + "learning_rate": 9.862494431695421e-06, + "loss": 0.7549, + "step": 88100 + }, + { + "epoch": 1.414308415865423, + "grad_norm": 0.7994885444641113, + "learning_rate": 9.85747753193369e-06, + "loss": 0.6826, + "step": 88110 + }, + { + "epoch": 1.4144689320855872, + "grad_norm": 1.4883829355239868, + "learning_rate": 9.852461595147517e-06, + "loss": 0.7084, + "step": 88120 + }, + { + "epoch": 1.4146294483057513, + "grad_norm": 0.7960163354873657, + "learning_rate": 9.847446621655873e-06, + "loss": 0.6779, + "step": 88130 + }, + { + "epoch": 1.4147899645259154, + "grad_norm": 1.3579440116882324, + "learning_rate": 9.842432611777688e-06, + "loss": 0.6619, + "step": 88140 + }, + { + "epoch": 1.4149504807460793, + "grad_norm": 0.8038487434387207, + "learning_rate": 9.83741956583182e-06, + "loss": 0.7247, + "step": 88150 + }, + { + "epoch": 1.4151109969662434, + "grad_norm": 1.1128101348876953, + "learning_rate": 9.832407484137077e-06, + "loss": 0.7824, + "step": 88160 + }, + { + "epoch": 1.4152715131864075, + "grad_norm": 0.9607415199279785, + "learning_rate": 9.82739636701219e-06, + "loss": 0.5726, + "step": 88170 + }, + { + "epoch": 1.4154320294065714, + "grad_norm": 0.9162858724594116, + "learning_rate": 9.822386214775842e-06, + "loss": 0.7907, + "step": 88180 + }, + { + "epoch": 1.4155925456267355, + "grad_norm": 0.8024137616157532, + "learning_rate": 9.817377027746652e-06, + "loss": 0.725, + "step": 88190 + }, + { + "epoch": 1.4157530618468996, + "grad_norm": 0.803484320640564, + "learning_rate": 9.812368806243163e-06, + "loss": 0.6519, + "step": 88200 + }, + { + "epoch": 1.4159135780670637, + "grad_norm": 1.443576693534851, + "learning_rate": 9.80736155058388e-06, + "loss": 0.7322, + "step": 88210 + }, + { + "epoch": 1.4160740942872279, + "grad_norm": 1.315825343132019, + "learning_rate": 9.802355261087226e-06, + "loss": 0.843, + "step": 88220 + }, + { + "epoch": 1.4162346105073917, + "grad_norm": 0.8388460874557495, + "learning_rate": 9.797349938071577e-06, + "loss": 0.684, + "step": 88230 + }, + { + "epoch": 1.4163951267275559, + "grad_norm": 1.5302181243896484, + "learning_rate": 9.792345581855242e-06, + "loss": 0.7047, + "step": 88240 + }, + { + "epoch": 1.4165556429477197, + "grad_norm": 1.1042088270187378, + "learning_rate": 9.787342192756478e-06, + "loss": 0.8152, + "step": 88250 + }, + { + "epoch": 1.4167161591678838, + "grad_norm": 1.2901920080184937, + "learning_rate": 9.78233977109345e-06, + "loss": 0.4867, + "step": 88260 + }, + { + "epoch": 1.416876675388048, + "grad_norm": 1.3351001739501953, + "learning_rate": 9.777338317184295e-06, + "loss": 0.7403, + "step": 88270 + }, + { + "epoch": 1.417037191608212, + "grad_norm": 0.95061856508255, + "learning_rate": 9.772337831347075e-06, + "loss": 0.8133, + "step": 88280 + }, + { + "epoch": 1.4171977078283762, + "grad_norm": 0.9641635417938232, + "learning_rate": 9.767338313899793e-06, + "loss": 0.6874, + "step": 88290 + }, + { + "epoch": 1.41735822404854, + "grad_norm": 0.9775354266166687, + "learning_rate": 9.762339765160386e-06, + "loss": 0.7131, + "step": 88300 + }, + { + "epoch": 1.4175187402687042, + "grad_norm": 1.7189152240753174, + "learning_rate": 9.757342185446733e-06, + "loss": 0.7719, + "step": 88310 + }, + { + "epoch": 1.4176792564888683, + "grad_norm": 0.9657423496246338, + "learning_rate": 9.752345575076654e-06, + "loss": 0.7021, + "step": 88320 + }, + { + "epoch": 1.4178397727090322, + "grad_norm": 0.9362476468086243, + "learning_rate": 9.7473499343679e-06, + "loss": 0.666, + "step": 88330 + }, + { + "epoch": 1.4180002889291963, + "grad_norm": 0.9176216125488281, + "learning_rate": 9.742355263638175e-06, + "loss": 0.6388, + "step": 88340 + }, + { + "epoch": 1.4181608051493604, + "grad_norm": 0.922341525554657, + "learning_rate": 9.737361563205096e-06, + "loss": 0.6863, + "step": 88350 + }, + { + "epoch": 1.4183213213695245, + "grad_norm": 1.1643038988113403, + "learning_rate": 9.732368833386238e-06, + "loss": 0.7786, + "step": 88360 + }, + { + "epoch": 1.4184818375896884, + "grad_norm": 0.7657498121261597, + "learning_rate": 9.727377074499109e-06, + "loss": 0.738, + "step": 88370 + }, + { + "epoch": 1.4186423538098525, + "grad_norm": 0.8316720128059387, + "learning_rate": 9.722386286861157e-06, + "loss": 0.6979, + "step": 88380 + }, + { + "epoch": 1.4188028700300166, + "grad_norm": 1.0048764944076538, + "learning_rate": 9.717396470789766e-06, + "loss": 0.6597, + "step": 88390 + }, + { + "epoch": 1.4189633862501805, + "grad_norm": 1.1031064987182617, + "learning_rate": 9.71240762660226e-06, + "loss": 0.751, + "step": 88400 + }, + { + "epoch": 1.4191239024703446, + "grad_norm": 0.9355453252792358, + "learning_rate": 9.707419754615901e-06, + "loss": 0.7401, + "step": 88410 + }, + { + "epoch": 1.4192844186905087, + "grad_norm": 0.946751058101654, + "learning_rate": 9.702432855147887e-06, + "loss": 0.775, + "step": 88420 + }, + { + "epoch": 1.4194449349106728, + "grad_norm": 1.0000298023223877, + "learning_rate": 9.697446928515356e-06, + "loss": 0.7454, + "step": 88430 + }, + { + "epoch": 1.4196054511308367, + "grad_norm": 0.8786251544952393, + "learning_rate": 9.692461975035391e-06, + "loss": 0.6791, + "step": 88440 + }, + { + "epoch": 1.4197659673510008, + "grad_norm": 0.9738619923591614, + "learning_rate": 9.68747799502499e-06, + "loss": 0.6519, + "step": 88450 + }, + { + "epoch": 1.419926483571165, + "grad_norm": 0.9329949021339417, + "learning_rate": 9.682494988801114e-06, + "loss": 0.7167, + "step": 88460 + }, + { + "epoch": 1.4200869997913288, + "grad_norm": 1.48002290725708, + "learning_rate": 9.67751295668065e-06, + "loss": 0.5609, + "step": 88470 + }, + { + "epoch": 1.420247516011493, + "grad_norm": 1.1038881540298462, + "learning_rate": 9.67253189898043e-06, + "loss": 0.814, + "step": 88480 + }, + { + "epoch": 1.420408032231657, + "grad_norm": 1.7575331926345825, + "learning_rate": 9.667551816017218e-06, + "loss": 0.6845, + "step": 88490 + }, + { + "epoch": 1.4205685484518211, + "grad_norm": 1.2149227857589722, + "learning_rate": 9.662572708107727e-06, + "loss": 0.7101, + "step": 88500 + }, + { + "epoch": 1.4207290646719852, + "grad_norm": 0.900489330291748, + "learning_rate": 9.657594575568582e-06, + "loss": 0.7273, + "step": 88510 + }, + { + "epoch": 1.4208895808921491, + "grad_norm": 0.8611798286437988, + "learning_rate": 9.65261741871637e-06, + "loss": 0.6253, + "step": 88520 + }, + { + "epoch": 1.4210500971123132, + "grad_norm": 0.8630499243736267, + "learning_rate": 9.64764123786761e-06, + "loss": 0.5787, + "step": 88530 + }, + { + "epoch": 1.4212106133324771, + "grad_norm": 1.3168762922286987, + "learning_rate": 9.642666033338756e-06, + "loss": 0.6642, + "step": 88540 + }, + { + "epoch": 1.4213711295526412, + "grad_norm": 0.7652456760406494, + "learning_rate": 9.63769180544621e-06, + "loss": 0.768, + "step": 88550 + }, + { + "epoch": 1.4215316457728053, + "grad_norm": 1.017905831336975, + "learning_rate": 9.632718554506302e-06, + "loss": 0.7172, + "step": 88560 + }, + { + "epoch": 1.4216921619929694, + "grad_norm": 1.2390103340148926, + "learning_rate": 9.62774628083529e-06, + "loss": 0.6384, + "step": 88570 + }, + { + "epoch": 1.4218526782131335, + "grad_norm": 0.8430167436599731, + "learning_rate": 9.622774984749386e-06, + "loss": 0.7516, + "step": 88580 + }, + { + "epoch": 1.4220131944332974, + "grad_norm": 0.9322136044502258, + "learning_rate": 9.61780466656474e-06, + "loss": 0.6742, + "step": 88590 + }, + { + "epoch": 1.4221737106534615, + "grad_norm": 0.9286547303199768, + "learning_rate": 9.612835326597434e-06, + "loss": 0.7231, + "step": 88600 + }, + { + "epoch": 1.4223342268736257, + "grad_norm": 1.5382603406906128, + "learning_rate": 9.607866965163484e-06, + "loss": 0.7765, + "step": 88610 + }, + { + "epoch": 1.4224947430937895, + "grad_norm": 1.0633436441421509, + "learning_rate": 9.602899582578864e-06, + "loss": 0.8216, + "step": 88620 + }, + { + "epoch": 1.4226552593139536, + "grad_norm": 1.3813236951828003, + "learning_rate": 9.597933179159449e-06, + "loss": 0.714, + "step": 88630 + }, + { + "epoch": 1.4228157755341178, + "grad_norm": 0.858902633190155, + "learning_rate": 9.592967755221084e-06, + "loss": 0.6846, + "step": 88640 + }, + { + "epoch": 1.4229762917542819, + "grad_norm": 1.0037912130355835, + "learning_rate": 9.588003311079538e-06, + "loss": 0.7117, + "step": 88650 + }, + { + "epoch": 1.4231368079744457, + "grad_norm": 1.196845531463623, + "learning_rate": 9.583039847050523e-06, + "loss": 0.6788, + "step": 88660 + }, + { + "epoch": 1.4232973241946099, + "grad_norm": 1.2259358167648315, + "learning_rate": 9.578077363449684e-06, + "loss": 0.6371, + "step": 88670 + }, + { + "epoch": 1.423457840414774, + "grad_norm": 0.9541433453559875, + "learning_rate": 9.57311586059262e-06, + "loss": 0.7744, + "step": 88680 + }, + { + "epoch": 1.4236183566349379, + "grad_norm": 1.3755556344985962, + "learning_rate": 9.568155338794827e-06, + "loss": 0.7111, + "step": 88690 + }, + { + "epoch": 1.423778872855102, + "grad_norm": 0.8007945418357849, + "learning_rate": 9.56319579837178e-06, + "loss": 0.764, + "step": 88700 + }, + { + "epoch": 1.423939389075266, + "grad_norm": 0.9510746598243713, + "learning_rate": 9.558237239638876e-06, + "loss": 0.7045, + "step": 88710 + }, + { + "epoch": 1.4240999052954302, + "grad_norm": 1.033962607383728, + "learning_rate": 9.553279662911452e-06, + "loss": 0.6917, + "step": 88720 + }, + { + "epoch": 1.4242604215155943, + "grad_norm": 2.495588541030884, + "learning_rate": 9.548323068504775e-06, + "loss": 0.7782, + "step": 88730 + }, + { + "epoch": 1.4244209377357582, + "grad_norm": 1.0160768032073975, + "learning_rate": 9.543367456734059e-06, + "loss": 0.722, + "step": 88740 + }, + { + "epoch": 1.4245814539559223, + "grad_norm": 0.5927950739860535, + "learning_rate": 9.538412827914461e-06, + "loss": 0.6403, + "step": 88750 + }, + { + "epoch": 1.4247419701760862, + "grad_norm": 0.8539732098579407, + "learning_rate": 9.53345918236105e-06, + "loss": 0.7511, + "step": 88760 + }, + { + "epoch": 1.4249024863962503, + "grad_norm": 1.5470383167266846, + "learning_rate": 9.528506520388852e-06, + "loss": 0.7558, + "step": 88770 + }, + { + "epoch": 1.4250630026164144, + "grad_norm": 0.9635469317436218, + "learning_rate": 9.523554842312835e-06, + "loss": 0.6932, + "step": 88780 + }, + { + "epoch": 1.4252235188365785, + "grad_norm": 0.7362543344497681, + "learning_rate": 9.51860414844789e-06, + "loss": 0.6744, + "step": 88790 + }, + { + "epoch": 1.4253840350567426, + "grad_norm": 1.3613157272338867, + "learning_rate": 9.513654439108855e-06, + "loss": 0.6511, + "step": 88800 + }, + { + "epoch": 1.4255445512769065, + "grad_norm": 0.8849863409996033, + "learning_rate": 9.508705714610511e-06, + "loss": 0.7235, + "step": 88810 + }, + { + "epoch": 1.4257050674970706, + "grad_norm": 0.6605616211891174, + "learning_rate": 9.50375797526755e-06, + "loss": 0.7204, + "step": 88820 + }, + { + "epoch": 1.4258655837172347, + "grad_norm": 0.9475060105323792, + "learning_rate": 9.498811221394627e-06, + "loss": 0.7062, + "step": 88830 + }, + { + "epoch": 1.4260260999373986, + "grad_norm": 0.8529754877090454, + "learning_rate": 9.493865453306328e-06, + "loss": 0.7632, + "step": 88840 + }, + { + "epoch": 1.4261866161575627, + "grad_norm": 0.9467063546180725, + "learning_rate": 9.488920671317175e-06, + "loss": 0.7376, + "step": 88850 + }, + { + "epoch": 1.4263471323777268, + "grad_norm": 0.829896092414856, + "learning_rate": 9.483976875741627e-06, + "loss": 0.66, + "step": 88860 + }, + { + "epoch": 1.426507648597891, + "grad_norm": 0.8871930837631226, + "learning_rate": 9.479034066894088e-06, + "loss": 0.5697, + "step": 88870 + }, + { + "epoch": 1.4266681648180548, + "grad_norm": 1.012603998184204, + "learning_rate": 9.474092245088876e-06, + "loss": 0.734, + "step": 88880 + }, + { + "epoch": 1.426828681038219, + "grad_norm": 1.4996153116226196, + "learning_rate": 9.469151410640267e-06, + "loss": 0.752, + "step": 88890 + }, + { + "epoch": 1.426989197258383, + "grad_norm": 1.2569763660430908, + "learning_rate": 9.46421156386247e-06, + "loss": 0.6637, + "step": 88900 + }, + { + "epoch": 1.427149713478547, + "grad_norm": 1.1803510189056396, + "learning_rate": 9.459272705069635e-06, + "loss": 0.6992, + "step": 88910 + }, + { + "epoch": 1.427310229698711, + "grad_norm": 1.2500966787338257, + "learning_rate": 9.454334834575842e-06, + "loss": 0.7361, + "step": 88920 + }, + { + "epoch": 1.4274707459188751, + "grad_norm": 1.9756078720092773, + "learning_rate": 9.449397952695108e-06, + "loss": 0.6725, + "step": 88930 + }, + { + "epoch": 1.4276312621390392, + "grad_norm": 1.683031678199768, + "learning_rate": 9.444462059741391e-06, + "loss": 0.6612, + "step": 88940 + }, + { + "epoch": 1.4277917783592031, + "grad_norm": 0.8908933997154236, + "learning_rate": 9.439527156028585e-06, + "loss": 0.7392, + "step": 88950 + }, + { + "epoch": 1.4279522945793672, + "grad_norm": 2.599780321121216, + "learning_rate": 9.434593241870531e-06, + "loss": 0.8225, + "step": 88960 + }, + { + "epoch": 1.4281128107995313, + "grad_norm": 0.9674651622772217, + "learning_rate": 9.42966031758098e-06, + "loss": 0.8587, + "step": 88970 + }, + { + "epoch": 1.4282733270196952, + "grad_norm": 1.0668439865112305, + "learning_rate": 9.424728383473641e-06, + "loss": 0.7362, + "step": 88980 + }, + { + "epoch": 1.4284338432398593, + "grad_norm": 1.0930269956588745, + "learning_rate": 9.41979743986216e-06, + "loss": 0.7043, + "step": 88990 + }, + { + "epoch": 1.4285943594600234, + "grad_norm": 1.087710976600647, + "learning_rate": 9.414867487060116e-06, + "loss": 0.6473, + "step": 89000 + }, + { + "epoch": 1.4287548756801876, + "grad_norm": 1.5939379930496216, + "learning_rate": 9.409938525381027e-06, + "loss": 0.7467, + "step": 89010 + }, + { + "epoch": 1.4289153919003517, + "grad_norm": 1.4188836812973022, + "learning_rate": 9.40501055513834e-06, + "loss": 0.6628, + "step": 89020 + }, + { + "epoch": 1.4290759081205155, + "grad_norm": 1.1829285621643066, + "learning_rate": 9.40008357664545e-06, + "loss": 0.6471, + "step": 89030 + }, + { + "epoch": 1.4292364243406797, + "grad_norm": 1.6854957342147827, + "learning_rate": 9.39515759021568e-06, + "loss": 0.7687, + "step": 89040 + }, + { + "epoch": 1.4293969405608435, + "grad_norm": 1.1162010431289673, + "learning_rate": 9.3902325961623e-06, + "loss": 0.7306, + "step": 89050 + }, + { + "epoch": 1.4295574567810077, + "grad_norm": 0.7873855829238892, + "learning_rate": 9.385308594798511e-06, + "loss": 0.7666, + "step": 89060 + }, + { + "epoch": 1.4297179730011718, + "grad_norm": 0.8409344553947449, + "learning_rate": 9.38038558643744e-06, + "loss": 0.6655, + "step": 89070 + }, + { + "epoch": 1.4298784892213359, + "grad_norm": 1.200905203819275, + "learning_rate": 9.375463571392167e-06, + "loss": 0.7688, + "step": 89080 + }, + { + "epoch": 1.4300390054415, + "grad_norm": 1.2537603378295898, + "learning_rate": 9.370542549975705e-06, + "loss": 0.6699, + "step": 89090 + }, + { + "epoch": 1.4301995216616639, + "grad_norm": 1.1107571125030518, + "learning_rate": 9.365622522501002e-06, + "loss": 0.6552, + "step": 89100 + }, + { + "epoch": 1.430360037881828, + "grad_norm": 0.9818306565284729, + "learning_rate": 9.360703489280942e-06, + "loss": 0.8047, + "step": 89110 + }, + { + "epoch": 1.430520554101992, + "grad_norm": 1.142911434173584, + "learning_rate": 9.355785450628355e-06, + "loss": 0.642, + "step": 89120 + }, + { + "epoch": 1.430681070322156, + "grad_norm": 0.5926787257194519, + "learning_rate": 9.35086840685598e-06, + "loss": 0.6584, + "step": 89130 + }, + { + "epoch": 1.43084158654232, + "grad_norm": 2.46016263961792, + "learning_rate": 9.345952358276528e-06, + "loss": 0.6558, + "step": 89140 + }, + { + "epoch": 1.4310021027624842, + "grad_norm": 0.8233826160430908, + "learning_rate": 9.34103730520262e-06, + "loss": 0.7211, + "step": 89150 + }, + { + "epoch": 1.4311626189826483, + "grad_norm": 1.2820285558700562, + "learning_rate": 9.336123247946835e-06, + "loss": 0.6955, + "step": 89160 + }, + { + "epoch": 1.4313231352028122, + "grad_norm": 0.9071996808052063, + "learning_rate": 9.331210186821674e-06, + "loss": 0.6928, + "step": 89170 + }, + { + "epoch": 1.4314836514229763, + "grad_norm": 0.8151643872261047, + "learning_rate": 9.326298122139585e-06, + "loss": 0.7835, + "step": 89180 + }, + { + "epoch": 1.4316441676431404, + "grad_norm": 0.7939061522483826, + "learning_rate": 9.321387054212932e-06, + "loss": 0.7347, + "step": 89190 + }, + { + "epoch": 1.4318046838633043, + "grad_norm": 1.151843786239624, + "learning_rate": 9.316476983354039e-06, + "loss": 0.7669, + "step": 89200 + }, + { + "epoch": 1.4319652000834684, + "grad_norm": 0.9122288823127747, + "learning_rate": 9.311567909875157e-06, + "loss": 0.7098, + "step": 89210 + }, + { + "epoch": 1.4321257163036325, + "grad_norm": 0.9062188267707825, + "learning_rate": 9.306659834088474e-06, + "loss": 0.62, + "step": 89220 + }, + { + "epoch": 1.4322862325237966, + "grad_norm": 1.1361278295516968, + "learning_rate": 9.301752756306115e-06, + "loss": 0.6113, + "step": 89230 + }, + { + "epoch": 1.4324467487439605, + "grad_norm": 1.777898907661438, + "learning_rate": 9.296846676840143e-06, + "loss": 0.7037, + "step": 89240 + }, + { + "epoch": 1.4326072649641246, + "grad_norm": 0.7992686629295349, + "learning_rate": 9.29194159600256e-06, + "loss": 0.6868, + "step": 89250 + }, + { + "epoch": 1.4327677811842887, + "grad_norm": 0.7973148226737976, + "learning_rate": 9.287037514105288e-06, + "loss": 0.7092, + "step": 89260 + }, + { + "epoch": 1.4329282974044526, + "grad_norm": 0.9661599397659302, + "learning_rate": 9.282134431460201e-06, + "loss": 0.7296, + "step": 89270 + }, + { + "epoch": 1.4330888136246167, + "grad_norm": 0.8842888474464417, + "learning_rate": 9.277232348379114e-06, + "loss": 0.6846, + "step": 89280 + }, + { + "epoch": 1.4332493298447808, + "grad_norm": 0.9847618341445923, + "learning_rate": 9.272331265173765e-06, + "loss": 0.597, + "step": 89290 + }, + { + "epoch": 1.433409846064945, + "grad_norm": 1.1169426441192627, + "learning_rate": 9.267431182155833e-06, + "loss": 0.705, + "step": 89300 + }, + { + "epoch": 1.433570362285109, + "grad_norm": 0.9147589206695557, + "learning_rate": 9.262532099636947e-06, + "loss": 0.8016, + "step": 89310 + }, + { + "epoch": 1.433730878505273, + "grad_norm": 1.3127354383468628, + "learning_rate": 9.257634017928644e-06, + "loss": 0.78, + "step": 89320 + }, + { + "epoch": 1.433891394725437, + "grad_norm": 1.0540015697479248, + "learning_rate": 9.252736937342415e-06, + "loss": 0.6496, + "step": 89330 + }, + { + "epoch": 1.434051910945601, + "grad_norm": 1.4051005840301514, + "learning_rate": 9.247840858189693e-06, + "loss": 0.7033, + "step": 89340 + }, + { + "epoch": 1.434212427165765, + "grad_norm": 1.494078278541565, + "learning_rate": 9.242945780781836e-06, + "loss": 0.6129, + "step": 89350 + }, + { + "epoch": 1.4343729433859291, + "grad_norm": 0.8225623965263367, + "learning_rate": 9.238051705430143e-06, + "loss": 0.6609, + "step": 89360 + }, + { + "epoch": 1.4345334596060932, + "grad_norm": 0.846028208732605, + "learning_rate": 9.233158632445854e-06, + "loss": 0.6765, + "step": 89370 + }, + { + "epoch": 1.4346939758262574, + "grad_norm": 1.2585375308990479, + "learning_rate": 9.228266562140129e-06, + "loss": 0.6799, + "step": 89380 + }, + { + "epoch": 1.4348544920464212, + "grad_norm": 1.3069753646850586, + "learning_rate": 9.223375494824077e-06, + "loss": 0.6468, + "step": 89390 + }, + { + "epoch": 1.4350150082665853, + "grad_norm": 1.2200196981430054, + "learning_rate": 9.218485430808745e-06, + "loss": 0.6706, + "step": 89400 + }, + { + "epoch": 1.4351755244867495, + "grad_norm": 1.0029397010803223, + "learning_rate": 9.213596370405114e-06, + "loss": 0.7167, + "step": 89410 + }, + { + "epoch": 1.4353360407069133, + "grad_norm": 0.9278778433799744, + "learning_rate": 9.208708313924094e-06, + "loss": 0.6732, + "step": 89420 + }, + { + "epoch": 1.4354965569270774, + "grad_norm": 0.8635597825050354, + "learning_rate": 9.20382126167655e-06, + "loss": 0.6692, + "step": 89430 + }, + { + "epoch": 1.4356570731472416, + "grad_norm": 1.0671637058258057, + "learning_rate": 9.19893521397325e-06, + "loss": 0.705, + "step": 89440 + }, + { + "epoch": 1.4358175893674057, + "grad_norm": 1.5452381372451782, + "learning_rate": 9.19405017112493e-06, + "loss": 0.7189, + "step": 89450 + }, + { + "epoch": 1.4359781055875696, + "grad_norm": 1.101365089416504, + "learning_rate": 9.189166133442245e-06, + "loss": 0.6504, + "step": 89460 + }, + { + "epoch": 1.4361386218077337, + "grad_norm": 0.9773374199867249, + "learning_rate": 9.184283101235794e-06, + "loss": 0.6051, + "step": 89470 + }, + { + "epoch": 1.4362991380278978, + "grad_norm": 0.8298909664154053, + "learning_rate": 9.17940107481611e-06, + "loss": 0.6901, + "step": 89480 + }, + { + "epoch": 1.4364596542480617, + "grad_norm": 0.9949231743812561, + "learning_rate": 9.174520054493668e-06, + "loss": 0.689, + "step": 89490 + }, + { + "epoch": 1.4366201704682258, + "grad_norm": 0.8860464692115784, + "learning_rate": 9.169640040578855e-06, + "loss": 0.6776, + "step": 89500 + }, + { + "epoch": 1.4367806866883899, + "grad_norm": 1.1807379722595215, + "learning_rate": 9.164761033382021e-06, + "loss": 0.6881, + "step": 89510 + }, + { + "epoch": 1.436941202908554, + "grad_norm": 0.8955403566360474, + "learning_rate": 9.159883033213443e-06, + "loss": 0.6486, + "step": 89520 + }, + { + "epoch": 1.437101719128718, + "grad_norm": 1.0132824182510376, + "learning_rate": 9.155006040383332e-06, + "loss": 0.6987, + "step": 89530 + }, + { + "epoch": 1.437262235348882, + "grad_norm": 1.0140403509140015, + "learning_rate": 9.150130055201836e-06, + "loss": 0.7706, + "step": 89540 + }, + { + "epoch": 1.437422751569046, + "grad_norm": 1.1302531957626343, + "learning_rate": 9.145255077979036e-06, + "loss": 0.7322, + "step": 89550 + }, + { + "epoch": 1.43758326778921, + "grad_norm": 0.9019303917884827, + "learning_rate": 9.14038110902496e-06, + "loss": 0.796, + "step": 89560 + }, + { + "epoch": 1.437743784009374, + "grad_norm": 1.0199768543243408, + "learning_rate": 9.135508148649557e-06, + "loss": 0.7045, + "step": 89570 + }, + { + "epoch": 1.4379043002295382, + "grad_norm": 0.9798575043678284, + "learning_rate": 9.13063619716272e-06, + "loss": 0.7461, + "step": 89580 + }, + { + "epoch": 1.4380648164497023, + "grad_norm": 1.1947143077850342, + "learning_rate": 9.125765254874285e-06, + "loss": 0.6376, + "step": 89590 + }, + { + "epoch": 1.4382253326698664, + "grad_norm": 2.0372345447540283, + "learning_rate": 9.120895322094e-06, + "loss": 0.5555, + "step": 89600 + }, + { + "epoch": 1.4383858488900303, + "grad_norm": 9.89968490600586, + "learning_rate": 9.116026399131572e-06, + "loss": 0.6837, + "step": 89610 + }, + { + "epoch": 1.4385463651101944, + "grad_norm": 0.7233784794807434, + "learning_rate": 9.111158486296637e-06, + "loss": 0.698, + "step": 89620 + }, + { + "epoch": 1.4387068813303585, + "grad_norm": 1.332430124282837, + "learning_rate": 9.106291583898762e-06, + "loss": 0.6402, + "step": 89630 + }, + { + "epoch": 1.4388673975505224, + "grad_norm": 1.4995423555374146, + "learning_rate": 9.101425692247457e-06, + "loss": 0.6628, + "step": 89640 + }, + { + "epoch": 1.4390279137706865, + "grad_norm": 0.7286564707756042, + "learning_rate": 9.09656081165216e-06, + "loss": 0.7669, + "step": 89650 + }, + { + "epoch": 1.4391884299908506, + "grad_norm": 1.2811346054077148, + "learning_rate": 9.091696942422256e-06, + "loss": 0.682, + "step": 89660 + }, + { + "epoch": 1.4393489462110147, + "grad_norm": 0.8392994403839111, + "learning_rate": 9.086834084867052e-06, + "loss": 0.6493, + "step": 89670 + }, + { + "epoch": 1.4395094624311786, + "grad_norm": 1.1515072584152222, + "learning_rate": 9.081972239295809e-06, + "loss": 0.7388, + "step": 89680 + }, + { + "epoch": 1.4396699786513427, + "grad_norm": 0.716391384601593, + "learning_rate": 9.077111406017691e-06, + "loss": 0.7675, + "step": 89690 + }, + { + "epoch": 1.4398304948715068, + "grad_norm": 0.8153060078620911, + "learning_rate": 9.072251585341832e-06, + "loss": 0.7188, + "step": 89700 + }, + { + "epoch": 1.4399910110916707, + "grad_norm": 1.2400174140930176, + "learning_rate": 9.067392777577283e-06, + "loss": 0.6957, + "step": 89710 + }, + { + "epoch": 1.4401515273118348, + "grad_norm": 1.1367583274841309, + "learning_rate": 9.062534983033041e-06, + "loss": 0.6074, + "step": 89720 + }, + { + "epoch": 1.440312043531999, + "grad_norm": 0.8780204653739929, + "learning_rate": 9.05767820201803e-06, + "loss": 0.6745, + "step": 89730 + }, + { + "epoch": 1.440472559752163, + "grad_norm": 1.0972284078598022, + "learning_rate": 9.052822434841124e-06, + "loss": 0.8567, + "step": 89740 + }, + { + "epoch": 1.440633075972327, + "grad_norm": 0.9375779032707214, + "learning_rate": 9.047967681811099e-06, + "loss": 0.5983, + "step": 89750 + }, + { + "epoch": 1.440793592192491, + "grad_norm": 0.7959539890289307, + "learning_rate": 9.043113943236704e-06, + "loss": 0.7049, + "step": 89760 + }, + { + "epoch": 1.4409541084126551, + "grad_norm": 1.332661747932434, + "learning_rate": 9.0382612194266e-06, + "loss": 0.7853, + "step": 89770 + }, + { + "epoch": 1.441114624632819, + "grad_norm": 1.9109548330307007, + "learning_rate": 9.0334095106894e-06, + "loss": 0.6952, + "step": 89780 + }, + { + "epoch": 1.4412751408529831, + "grad_norm": 1.3709908723831177, + "learning_rate": 9.02855881733364e-06, + "loss": 0.7013, + "step": 89790 + }, + { + "epoch": 1.4414356570731472, + "grad_norm": 0.964154064655304, + "learning_rate": 9.023709139667796e-06, + "loss": 0.7201, + "step": 89800 + }, + { + "epoch": 1.4415961732933114, + "grad_norm": 0.7306498885154724, + "learning_rate": 9.018860478000287e-06, + "loss": 0.714, + "step": 89810 + }, + { + "epoch": 1.4417566895134755, + "grad_norm": 1.0476640462875366, + "learning_rate": 9.014012832639445e-06, + "loss": 0.8167, + "step": 89820 + }, + { + "epoch": 1.4419172057336394, + "grad_norm": 1.3448463678359985, + "learning_rate": 9.009166203893557e-06, + "loss": 0.7088, + "step": 89830 + }, + { + "epoch": 1.4420777219538035, + "grad_norm": 0.9910193085670471, + "learning_rate": 9.004320592070844e-06, + "loss": 0.7018, + "step": 89840 + }, + { + "epoch": 1.4422382381739673, + "grad_norm": 0.6382800936698914, + "learning_rate": 8.999475997479455e-06, + "loss": 0.6225, + "step": 89850 + }, + { + "epoch": 1.4423987543941315, + "grad_norm": 1.3000847101211548, + "learning_rate": 8.99463242042748e-06, + "loss": 0.7746, + "step": 89860 + }, + { + "epoch": 1.4425592706142956, + "grad_norm": 0.9777624607086182, + "learning_rate": 8.989789861222948e-06, + "loss": 0.6628, + "step": 89870 + }, + { + "epoch": 1.4427197868344597, + "grad_norm": 0.9439882040023804, + "learning_rate": 8.984948320173803e-06, + "loss": 0.7015, + "step": 89880 + }, + { + "epoch": 1.4428803030546238, + "grad_norm": 1.6572458744049072, + "learning_rate": 8.980107797587944e-06, + "loss": 0.8111, + "step": 89890 + }, + { + "epoch": 1.4430408192747877, + "grad_norm": 0.659465491771698, + "learning_rate": 8.975268293773206e-06, + "loss": 0.6122, + "step": 89900 + }, + { + "epoch": 1.4432013354949518, + "grad_norm": 0.9521952867507935, + "learning_rate": 8.970429809037345e-06, + "loss": 0.6869, + "step": 89910 + }, + { + "epoch": 1.4433618517151159, + "grad_norm": 1.6426928043365479, + "learning_rate": 8.965592343688068e-06, + "loss": 0.7267, + "step": 89920 + }, + { + "epoch": 1.4435223679352798, + "grad_norm": 0.49250155687332153, + "learning_rate": 8.96075589803301e-06, + "loss": 0.6903, + "step": 89930 + }, + { + "epoch": 1.4436828841554439, + "grad_norm": 1.0567938089370728, + "learning_rate": 8.955920472379727e-06, + "loss": 0.5978, + "step": 89940 + }, + { + "epoch": 1.443843400375608, + "grad_norm": 0.9273326992988586, + "learning_rate": 8.951086067035736e-06, + "loss": 0.6751, + "step": 89950 + }, + { + "epoch": 1.444003916595772, + "grad_norm": 1.0231858491897583, + "learning_rate": 8.946252682308473e-06, + "loss": 0.6966, + "step": 89960 + }, + { + "epoch": 1.444164432815936, + "grad_norm": 0.9030721783638, + "learning_rate": 8.941420318505313e-06, + "loss": 0.7776, + "step": 89970 + }, + { + "epoch": 1.4443249490361, + "grad_norm": 1.2612136602401733, + "learning_rate": 8.936588975933566e-06, + "loss": 0.6574, + "step": 89980 + }, + { + "epoch": 1.4444854652562642, + "grad_norm": 0.8542338013648987, + "learning_rate": 8.931758654900487e-06, + "loss": 0.7546, + "step": 89990 + }, + { + "epoch": 1.444645981476428, + "grad_norm": 2.8378512859344482, + "learning_rate": 8.926929355713236e-06, + "loss": 0.6445, + "step": 90000 + }, + { + "epoch": 1.4448064976965922, + "grad_norm": 1.5601736307144165, + "learning_rate": 8.92210107867894e-06, + "loss": 0.6615, + "step": 90010 + }, + { + "epoch": 1.4449670139167563, + "grad_norm": 1.007088541984558, + "learning_rate": 8.91727382410465e-06, + "loss": 0.6222, + "step": 90020 + }, + { + "epoch": 1.4451275301369204, + "grad_norm": 1.063485026359558, + "learning_rate": 8.912447592297346e-06, + "loss": 0.7576, + "step": 90030 + }, + { + "epoch": 1.4452880463570845, + "grad_norm": 1.517088532447815, + "learning_rate": 8.907622383563952e-06, + "loss": 0.661, + "step": 90040 + }, + { + "epoch": 1.4454485625772484, + "grad_norm": 0.9423627853393555, + "learning_rate": 8.902798198211329e-06, + "loss": 0.7396, + "step": 90050 + }, + { + "epoch": 1.4456090787974125, + "grad_norm": 1.1598186492919922, + "learning_rate": 8.897975036546251e-06, + "loss": 0.6905, + "step": 90060 + }, + { + "epoch": 1.4457695950175764, + "grad_norm": 1.1020129919052124, + "learning_rate": 8.893152898875452e-06, + "loss": 0.6816, + "step": 90070 + }, + { + "epoch": 1.4459301112377405, + "grad_norm": 1.3455098867416382, + "learning_rate": 8.888331785505591e-06, + "loss": 0.6933, + "step": 90080 + }, + { + "epoch": 1.4460906274579046, + "grad_norm": 0.9129499197006226, + "learning_rate": 8.883511696743264e-06, + "loss": 0.7001, + "step": 90090 + }, + { + "epoch": 1.4462511436780687, + "grad_norm": 0.9701252579689026, + "learning_rate": 8.878692632894997e-06, + "loss": 0.7067, + "step": 90100 + }, + { + "epoch": 1.4464116598982328, + "grad_norm": 11.30141544342041, + "learning_rate": 8.873874594267256e-06, + "loss": 0.7314, + "step": 90110 + }, + { + "epoch": 1.4465721761183967, + "grad_norm": 1.3362232446670532, + "learning_rate": 8.869057581166448e-06, + "loss": 0.7236, + "step": 90120 + }, + { + "epoch": 1.4467326923385608, + "grad_norm": 0.9242321848869324, + "learning_rate": 8.864241593898892e-06, + "loss": 0.7085, + "step": 90130 + }, + { + "epoch": 1.446893208558725, + "grad_norm": 2.0670390129089355, + "learning_rate": 8.859426632770866e-06, + "loss": 0.7919, + "step": 90140 + }, + { + "epoch": 1.4470537247788888, + "grad_norm": 1.3405760526657104, + "learning_rate": 8.854612698088566e-06, + "loss": 0.7523, + "step": 90150 + }, + { + "epoch": 1.447214240999053, + "grad_norm": 0.9812236428260803, + "learning_rate": 8.849799790158134e-06, + "loss": 0.6455, + "step": 90160 + }, + { + "epoch": 1.447374757219217, + "grad_norm": 0.6446471810340881, + "learning_rate": 8.844987909285646e-06, + "loss": 0.6462, + "step": 90170 + }, + { + "epoch": 1.4475352734393812, + "grad_norm": 0.931678056716919, + "learning_rate": 8.840177055777107e-06, + "loss": 0.6445, + "step": 90180 + }, + { + "epoch": 1.447695789659545, + "grad_norm": 1.0274213552474976, + "learning_rate": 8.835367229938457e-06, + "loss": 0.6834, + "step": 90190 + }, + { + "epoch": 1.4478563058797091, + "grad_norm": 0.6913283467292786, + "learning_rate": 8.830558432075575e-06, + "loss": 0.7374, + "step": 90200 + }, + { + "epoch": 1.4480168220998733, + "grad_norm": 1.3162529468536377, + "learning_rate": 8.825750662494273e-06, + "loss": 0.6446, + "step": 90210 + }, + { + "epoch": 1.4481773383200371, + "grad_norm": 0.8907169103622437, + "learning_rate": 8.820943921500303e-06, + "loss": 0.7027, + "step": 90220 + }, + { + "epoch": 1.4483378545402013, + "grad_norm": 0.8066461086273193, + "learning_rate": 8.816138209399327e-06, + "loss": 0.6638, + "step": 90230 + }, + { + "epoch": 1.4484983707603654, + "grad_norm": 1.1451116800308228, + "learning_rate": 8.811333526496976e-06, + "loss": 0.6778, + "step": 90240 + }, + { + "epoch": 1.4486588869805295, + "grad_norm": 1.168536901473999, + "learning_rate": 8.806529873098793e-06, + "loss": 0.643, + "step": 90250 + }, + { + "epoch": 1.4488194032006934, + "grad_norm": 0.9931761026382446, + "learning_rate": 8.801727249510265e-06, + "loss": 0.6788, + "step": 90260 + }, + { + "epoch": 1.4489799194208575, + "grad_norm": 1.1024036407470703, + "learning_rate": 8.796925656036806e-06, + "loss": 0.6771, + "step": 90270 + }, + { + "epoch": 1.4491404356410216, + "grad_norm": 1.3180471658706665, + "learning_rate": 8.792125092983777e-06, + "loss": 0.7878, + "step": 90280 + }, + { + "epoch": 1.4493009518611855, + "grad_norm": 1.4502476453781128, + "learning_rate": 8.787325560656459e-06, + "loss": 0.7017, + "step": 90290 + }, + { + "epoch": 1.4494614680813496, + "grad_norm": 1.453042984008789, + "learning_rate": 8.782527059360086e-06, + "loss": 0.8204, + "step": 90300 + }, + { + "epoch": 1.4496219843015137, + "grad_norm": 0.8595662713050842, + "learning_rate": 8.777729589399797e-06, + "loss": 0.6958, + "step": 90310 + }, + { + "epoch": 1.4497825005216778, + "grad_norm": 1.27681303024292, + "learning_rate": 8.77293315108069e-06, + "loss": 0.6767, + "step": 90320 + }, + { + "epoch": 1.449943016741842, + "grad_norm": 0.8918519616127014, + "learning_rate": 8.768137744707794e-06, + "loss": 0.7066, + "step": 90330 + }, + { + "epoch": 1.4501035329620058, + "grad_norm": 1.0130698680877686, + "learning_rate": 8.763343370586063e-06, + "loss": 0.721, + "step": 90340 + }, + { + "epoch": 1.45026404918217, + "grad_norm": 1.1489521265029907, + "learning_rate": 8.758550029020395e-06, + "loss": 0.7689, + "step": 90350 + }, + { + "epoch": 1.4504245654023338, + "grad_norm": 1.0489842891693115, + "learning_rate": 8.753757720315619e-06, + "loss": 0.6452, + "step": 90360 + }, + { + "epoch": 1.4505850816224979, + "grad_norm": 1.2745803594589233, + "learning_rate": 8.748966444776502e-06, + "loss": 0.7078, + "step": 90370 + }, + { + "epoch": 1.450745597842662, + "grad_norm": 1.1517702341079712, + "learning_rate": 8.74417620270773e-06, + "loss": 0.7143, + "step": 90380 + }, + { + "epoch": 1.450906114062826, + "grad_norm": 1.131622314453125, + "learning_rate": 8.739386994413937e-06, + "loss": 0.6951, + "step": 90390 + }, + { + "epoch": 1.4510666302829902, + "grad_norm": 1.0670716762542725, + "learning_rate": 8.734598820199691e-06, + "loss": 0.6813, + "step": 90400 + }, + { + "epoch": 1.451227146503154, + "grad_norm": 1.2117899656295776, + "learning_rate": 8.729811680369493e-06, + "loss": 0.653, + "step": 90410 + }, + { + "epoch": 1.4513876627233182, + "grad_norm": 1.60556161403656, + "learning_rate": 8.725025575227778e-06, + "loss": 0.8648, + "step": 90420 + }, + { + "epoch": 1.4515481789434823, + "grad_norm": 1.4536478519439697, + "learning_rate": 8.720240505078917e-06, + "loss": 0.6893, + "step": 90430 + }, + { + "epoch": 1.4517086951636462, + "grad_norm": 1.5174487829208374, + "learning_rate": 8.715456470227198e-06, + "loss": 0.7612, + "step": 90440 + }, + { + "epoch": 1.4518692113838103, + "grad_norm": 1.1231240034103394, + "learning_rate": 8.71067347097687e-06, + "loss": 0.7869, + "step": 90450 + }, + { + "epoch": 1.4520297276039744, + "grad_norm": 1.620205283164978, + "learning_rate": 8.7058915076321e-06, + "loss": 0.6953, + "step": 90460 + }, + { + "epoch": 1.4521902438241385, + "grad_norm": 0.7072114944458008, + "learning_rate": 8.701110580496993e-06, + "loss": 0.5991, + "step": 90470 + }, + { + "epoch": 1.4523507600443024, + "grad_norm": 1.2603323459625244, + "learning_rate": 8.696330689875586e-06, + "loss": 0.7091, + "step": 90480 + }, + { + "epoch": 1.4525112762644665, + "grad_norm": 2.0096919536590576, + "learning_rate": 8.691551836071866e-06, + "loss": 0.7955, + "step": 90490 + }, + { + "epoch": 1.4526717924846306, + "grad_norm": 1.0052695274353027, + "learning_rate": 8.686774019389718e-06, + "loss": 0.6712, + "step": 90500 + }, + { + "epoch": 1.4528323087047945, + "grad_norm": 1.3312479257583618, + "learning_rate": 8.681997240132995e-06, + "loss": 0.8073, + "step": 90510 + }, + { + "epoch": 1.4529928249249586, + "grad_norm": 0.6028386950492859, + "learning_rate": 8.67722149860547e-06, + "loss": 0.6941, + "step": 90520 + }, + { + "epoch": 1.4531533411451227, + "grad_norm": 0.9467605948448181, + "learning_rate": 8.672446795110854e-06, + "loss": 0.7989, + "step": 90530 + }, + { + "epoch": 1.4533138573652868, + "grad_norm": 1.1763429641723633, + "learning_rate": 8.667673129952785e-06, + "loss": 0.705, + "step": 90540 + }, + { + "epoch": 1.4534743735854507, + "grad_norm": 1.805093765258789, + "learning_rate": 8.662900503434857e-06, + "loss": 0.7244, + "step": 90550 + }, + { + "epoch": 1.4536348898056148, + "grad_norm": 0.8766070604324341, + "learning_rate": 8.658128915860555e-06, + "loss": 0.7333, + "step": 90560 + }, + { + "epoch": 1.453795406025779, + "grad_norm": 1.1964678764343262, + "learning_rate": 8.65335836753334e-06, + "loss": 0.6107, + "step": 90570 + }, + { + "epoch": 1.4539559222459428, + "grad_norm": 0.882590651512146, + "learning_rate": 8.648588858756587e-06, + "loss": 0.8374, + "step": 90580 + }, + { + "epoch": 1.454116438466107, + "grad_norm": 0.6992864012718201, + "learning_rate": 8.64382038983361e-06, + "loss": 0.8264, + "step": 90590 + }, + { + "epoch": 1.454276954686271, + "grad_norm": 1.266981601715088, + "learning_rate": 8.639052961067653e-06, + "loss": 0.7576, + "step": 90600 + }, + { + "epoch": 1.4544374709064352, + "grad_norm": 0.8537298440933228, + "learning_rate": 8.634286572761902e-06, + "loss": 0.7894, + "step": 90610 + }, + { + "epoch": 1.4545979871265993, + "grad_norm": 1.1595938205718994, + "learning_rate": 8.629521225219476e-06, + "loss": 0.6094, + "step": 90620 + }, + { + "epoch": 1.4547585033467632, + "grad_norm": 1.3850603103637695, + "learning_rate": 8.624756918743404e-06, + "loss": 0.6833, + "step": 90630 + }, + { + "epoch": 1.4549190195669273, + "grad_norm": 0.9772661924362183, + "learning_rate": 8.61999365363668e-06, + "loss": 0.8237, + "step": 90640 + }, + { + "epoch": 1.4550795357870911, + "grad_norm": 1.0777963399887085, + "learning_rate": 8.61523143020222e-06, + "loss": 0.7278, + "step": 90650 + }, + { + "epoch": 1.4552400520072553, + "grad_norm": 0.7656727433204651, + "learning_rate": 8.610470248742871e-06, + "loss": 0.7106, + "step": 90660 + }, + { + "epoch": 1.4554005682274194, + "grad_norm": 1.760209083557129, + "learning_rate": 8.605710109561418e-06, + "loss": 0.6811, + "step": 90670 + }, + { + "epoch": 1.4555610844475835, + "grad_norm": 1.5773470401763916, + "learning_rate": 8.600951012960584e-06, + "loss": 0.7994, + "step": 90680 + }, + { + "epoch": 1.4557216006677476, + "grad_norm": 0.7180977463722229, + "learning_rate": 8.596192959243007e-06, + "loss": 0.6534, + "step": 90690 + }, + { + "epoch": 1.4558821168879115, + "grad_norm": 0.9249919652938843, + "learning_rate": 8.591435948711276e-06, + "loss": 0.7052, + "step": 90700 + }, + { + "epoch": 1.4560426331080756, + "grad_norm": 0.8189207911491394, + "learning_rate": 8.58667998166791e-06, + "loss": 0.7175, + "step": 90710 + }, + { + "epoch": 1.4562031493282397, + "grad_norm": 0.8343746662139893, + "learning_rate": 8.581925058415361e-06, + "loss": 0.7302, + "step": 90720 + }, + { + "epoch": 1.4563636655484036, + "grad_norm": 0.7743750810623169, + "learning_rate": 8.577171179256016e-06, + "loss": 0.7707, + "step": 90730 + }, + { + "epoch": 1.4565241817685677, + "grad_norm": 1.7478866577148438, + "learning_rate": 8.572418344492198e-06, + "loss": 0.6478, + "step": 90740 + }, + { + "epoch": 1.4566846979887318, + "grad_norm": 1.1375936269760132, + "learning_rate": 8.567666554426144e-06, + "loss": 0.7391, + "step": 90750 + }, + { + "epoch": 1.456845214208896, + "grad_norm": 0.5963486433029175, + "learning_rate": 8.56291580936005e-06, + "loss": 0.8043, + "step": 90760 + }, + { + "epoch": 1.4570057304290598, + "grad_norm": 1.0712776184082031, + "learning_rate": 8.558166109596035e-06, + "loss": 0.7281, + "step": 90770 + }, + { + "epoch": 1.457166246649224, + "grad_norm": 0.8864678144454956, + "learning_rate": 8.553417455436153e-06, + "loss": 0.7397, + "step": 90780 + }, + { + "epoch": 1.457326762869388, + "grad_norm": 0.9295701384544373, + "learning_rate": 8.548669847182386e-06, + "loss": 0.7794, + "step": 90790 + }, + { + "epoch": 1.457487279089552, + "grad_norm": 1.2000404596328735, + "learning_rate": 8.54392328513666e-06, + "loss": 0.7594, + "step": 90800 + }, + { + "epoch": 1.457647795309716, + "grad_norm": 0.876929759979248, + "learning_rate": 8.539177769600826e-06, + "loss": 0.629, + "step": 90810 + }, + { + "epoch": 1.45780831152988, + "grad_norm": 1.2852833271026611, + "learning_rate": 8.534433300876668e-06, + "loss": 0.7716, + "step": 90820 + }, + { + "epoch": 1.4579688277500442, + "grad_norm": 1.225503921508789, + "learning_rate": 8.529689879265909e-06, + "loss": 0.6013, + "step": 90830 + }, + { + "epoch": 1.4581293439702083, + "grad_norm": 1.4115960597991943, + "learning_rate": 8.524947505070212e-06, + "loss": 0.6218, + "step": 90840 + }, + { + "epoch": 1.4582898601903722, + "grad_norm": 0.833939254283905, + "learning_rate": 8.520206178591147e-06, + "loss": 0.7365, + "step": 90850 + }, + { + "epoch": 1.4584503764105363, + "grad_norm": 0.7641943097114563, + "learning_rate": 8.515465900130244e-06, + "loss": 0.5623, + "step": 90860 + }, + { + "epoch": 1.4586108926307002, + "grad_norm": 1.4267266988754272, + "learning_rate": 8.510726669988952e-06, + "loss": 0.6585, + "step": 90870 + }, + { + "epoch": 1.4587714088508643, + "grad_norm": 0.9564258456230164, + "learning_rate": 8.505988488468664e-06, + "loss": 0.7887, + "step": 90880 + }, + { + "epoch": 1.4589319250710284, + "grad_norm": 1.220360279083252, + "learning_rate": 8.501251355870698e-06, + "loss": 0.7022, + "step": 90890 + }, + { + "epoch": 1.4590924412911925, + "grad_norm": 1.044716238975525, + "learning_rate": 8.496515272496308e-06, + "loss": 0.698, + "step": 90900 + }, + { + "epoch": 1.4592529575113566, + "grad_norm": 1.0584262609481812, + "learning_rate": 8.491780238646679e-06, + "loss": 0.6496, + "step": 90910 + }, + { + "epoch": 1.4594134737315205, + "grad_norm": 0.9434950947761536, + "learning_rate": 8.487046254622935e-06, + "loss": 0.6682, + "step": 90920 + }, + { + "epoch": 1.4595739899516846, + "grad_norm": 0.9177314639091492, + "learning_rate": 8.482313320726138e-06, + "loss": 0.7231, + "step": 90930 + }, + { + "epoch": 1.4597345061718487, + "grad_norm": 0.9607457518577576, + "learning_rate": 8.477581437257253e-06, + "loss": 0.7249, + "step": 90940 + }, + { + "epoch": 1.4598950223920126, + "grad_norm": 1.1008546352386475, + "learning_rate": 8.472850604517216e-06, + "loss": 0.6944, + "step": 90950 + }, + { + "epoch": 1.4600555386121767, + "grad_norm": 1.0485541820526123, + "learning_rate": 8.46812082280687e-06, + "loss": 0.7728, + "step": 90960 + }, + { + "epoch": 1.4602160548323408, + "grad_norm": 0.8473919034004211, + "learning_rate": 8.463392092427015e-06, + "loss": 0.6973, + "step": 90970 + }, + { + "epoch": 1.460376571052505, + "grad_norm": 0.9395617842674255, + "learning_rate": 8.458664413678358e-06, + "loss": 0.6448, + "step": 90980 + }, + { + "epoch": 1.4605370872726688, + "grad_norm": 1.1185994148254395, + "learning_rate": 8.453937786861565e-06, + "loss": 0.69, + "step": 90990 + }, + { + "epoch": 1.460697603492833, + "grad_norm": 1.1255828142166138, + "learning_rate": 8.449212212277205e-06, + "loss": 0.6748, + "step": 91000 + }, + { + "epoch": 1.460858119712997, + "grad_norm": 1.115036129951477, + "learning_rate": 8.444487690225805e-06, + "loss": 0.8353, + "step": 91010 + }, + { + "epoch": 1.461018635933161, + "grad_norm": 1.2873635292053223, + "learning_rate": 8.439764221007818e-06, + "loss": 0.6682, + "step": 91020 + }, + { + "epoch": 1.461179152153325, + "grad_norm": 1.0158216953277588, + "learning_rate": 8.43504180492363e-06, + "loss": 0.831, + "step": 91030 + }, + { + "epoch": 1.4613396683734892, + "grad_norm": 1.2618640661239624, + "learning_rate": 8.430320442273552e-06, + "loss": 0.747, + "step": 91040 + }, + { + "epoch": 1.4615001845936533, + "grad_norm": 1.049182415008545, + "learning_rate": 8.42560013335785e-06, + "loss": 0.6241, + "step": 91050 + }, + { + "epoch": 1.4616607008138172, + "grad_norm": 1.2152705192565918, + "learning_rate": 8.420880878476689e-06, + "loss": 0.6218, + "step": 91060 + }, + { + "epoch": 1.4618212170339813, + "grad_norm": 2.3923497200012207, + "learning_rate": 8.416162677930196e-06, + "loss": 0.5843, + "step": 91070 + }, + { + "epoch": 1.4619817332541454, + "grad_norm": 0.8902493119239807, + "learning_rate": 8.41144553201842e-06, + "loss": 0.7452, + "step": 91080 + }, + { + "epoch": 1.4621422494743093, + "grad_norm": 0.7939468622207642, + "learning_rate": 8.406729441041341e-06, + "loss": 0.681, + "step": 91090 + }, + { + "epoch": 1.4623027656944734, + "grad_norm": 2.5518035888671875, + "learning_rate": 8.402014405298877e-06, + "loss": 0.6337, + "step": 91100 + }, + { + "epoch": 1.4624632819146375, + "grad_norm": 1.5095584392547607, + "learning_rate": 8.397300425090888e-06, + "loss": 0.7316, + "step": 91110 + }, + { + "epoch": 1.4626237981348016, + "grad_norm": 1.6300766468048096, + "learning_rate": 8.392587500717131e-06, + "loss": 0.6985, + "step": 91120 + }, + { + "epoch": 1.4627843143549657, + "grad_norm": 1.4583979845046997, + "learning_rate": 8.387875632477335e-06, + "loss": 0.5883, + "step": 91130 + }, + { + "epoch": 1.4629448305751296, + "grad_norm": 1.16599702835083, + "learning_rate": 8.383164820671147e-06, + "loss": 0.7626, + "step": 91140 + }, + { + "epoch": 1.4631053467952937, + "grad_norm": 0.9156755208969116, + "learning_rate": 8.378455065598146e-06, + "loss": 0.713, + "step": 91150 + }, + { + "epoch": 1.4632658630154576, + "grad_norm": 1.369518756866455, + "learning_rate": 8.373746367557842e-06, + "loss": 0.8535, + "step": 91160 + }, + { + "epoch": 1.4634263792356217, + "grad_norm": 0.7621539831161499, + "learning_rate": 8.369038726849682e-06, + "loss": 0.5842, + "step": 91170 + }, + { + "epoch": 1.4635868954557858, + "grad_norm": 0.8102837204933167, + "learning_rate": 8.364332143773056e-06, + "loss": 0.7162, + "step": 91180 + }, + { + "epoch": 1.46374741167595, + "grad_norm": 1.1189157962799072, + "learning_rate": 8.359626618627255e-06, + "loss": 0.6123, + "step": 91190 + }, + { + "epoch": 1.463907927896114, + "grad_norm": 1.1156227588653564, + "learning_rate": 8.354922151711533e-06, + "loss": 0.7132, + "step": 91200 + }, + { + "epoch": 1.464068444116278, + "grad_norm": 1.4522422552108765, + "learning_rate": 8.350218743325063e-06, + "loss": 0.6846, + "step": 91210 + }, + { + "epoch": 1.464228960336442, + "grad_norm": 1.5412243604660034, + "learning_rate": 8.345516393766959e-06, + "loss": 0.6567, + "step": 91220 + }, + { + "epoch": 1.4643894765566061, + "grad_norm": 1.0370577573776245, + "learning_rate": 8.340815103336259e-06, + "loss": 0.8769, + "step": 91230 + }, + { + "epoch": 1.46454999277677, + "grad_norm": 0.8641848564147949, + "learning_rate": 8.336114872331949e-06, + "loss": 0.7265, + "step": 91240 + }, + { + "epoch": 1.4647105089969341, + "grad_norm": 1.1859370470046997, + "learning_rate": 8.331415701052916e-06, + "loss": 0.7317, + "step": 91250 + }, + { + "epoch": 1.4648710252170982, + "grad_norm": 1.013994574546814, + "learning_rate": 8.32671758979801e-06, + "loss": 0.6519, + "step": 91260 + }, + { + "epoch": 1.4650315414372623, + "grad_norm": 1.0295171737670898, + "learning_rate": 8.322020538866004e-06, + "loss": 0.5964, + "step": 91270 + }, + { + "epoch": 1.4651920576574262, + "grad_norm": 1.1105514764785767, + "learning_rate": 8.317324548555599e-06, + "loss": 0.8266, + "step": 91280 + }, + { + "epoch": 1.4653525738775903, + "grad_norm": 0.9963943958282471, + "learning_rate": 8.312629619165438e-06, + "loss": 0.6777, + "step": 91290 + }, + { + "epoch": 1.4655130900977544, + "grad_norm": 2.446972370147705, + "learning_rate": 8.307935750994098e-06, + "loss": 0.6753, + "step": 91300 + }, + { + "epoch": 1.4656736063179183, + "grad_norm": 1.5920664072036743, + "learning_rate": 8.303242944340062e-06, + "loss": 0.7854, + "step": 91310 + }, + { + "epoch": 1.4658341225380824, + "grad_norm": 0.9166713953018188, + "learning_rate": 8.298551199501771e-06, + "loss": 0.685, + "step": 91320 + }, + { + "epoch": 1.4659946387582465, + "grad_norm": 1.0233911275863647, + "learning_rate": 8.293860516777602e-06, + "loss": 0.6138, + "step": 91330 + }, + { + "epoch": 1.4661551549784106, + "grad_norm": 1.4950677156448364, + "learning_rate": 8.289170896465847e-06, + "loss": 0.6712, + "step": 91340 + }, + { + "epoch": 1.4663156711985745, + "grad_norm": 1.4260146617889404, + "learning_rate": 8.28448233886474e-06, + "loss": 0.731, + "step": 91350 + }, + { + "epoch": 1.4664761874187386, + "grad_norm": 0.8934862017631531, + "learning_rate": 8.279794844272453e-06, + "loss": 0.7642, + "step": 91360 + }, + { + "epoch": 1.4666367036389028, + "grad_norm": 0.8479598164558411, + "learning_rate": 8.27510841298707e-06, + "loss": 0.6669, + "step": 91370 + }, + { + "epoch": 1.4667972198590666, + "grad_norm": 1.376442551612854, + "learning_rate": 8.270423045306628e-06, + "loss": 0.5705, + "step": 91380 + }, + { + "epoch": 1.4669577360792307, + "grad_norm": 0.823867678642273, + "learning_rate": 8.265738741529083e-06, + "loss": 0.8324, + "step": 91390 + }, + { + "epoch": 1.4671182522993949, + "grad_norm": 1.3025522232055664, + "learning_rate": 8.26105550195234e-06, + "loss": 0.6905, + "step": 91400 + }, + { + "epoch": 1.467278768519559, + "grad_norm": 1.067405104637146, + "learning_rate": 8.256373326874215e-06, + "loss": 0.6077, + "step": 91410 + }, + { + "epoch": 1.467439284739723, + "grad_norm": 1.6295876502990723, + "learning_rate": 8.251692216592471e-06, + "loss": 0.7422, + "step": 91420 + }, + { + "epoch": 1.467599800959887, + "grad_norm": 0.906060516834259, + "learning_rate": 8.247012171404803e-06, + "loss": 0.8154, + "step": 91430 + }, + { + "epoch": 1.467760317180051, + "grad_norm": 1.834416389465332, + "learning_rate": 8.242333191608828e-06, + "loss": 0.6537, + "step": 91440 + }, + { + "epoch": 1.4679208334002152, + "grad_norm": 1.1044106483459473, + "learning_rate": 8.237655277502104e-06, + "loss": 0.6137, + "step": 91450 + }, + { + "epoch": 1.468081349620379, + "grad_norm": 1.1513460874557495, + "learning_rate": 8.232978429382121e-06, + "loss": 0.707, + "step": 91460 + }, + { + "epoch": 1.4682418658405432, + "grad_norm": 0.766886293888092, + "learning_rate": 8.228302647546305e-06, + "loss": 0.7348, + "step": 91470 + }, + { + "epoch": 1.4684023820607073, + "grad_norm": 0.504482626914978, + "learning_rate": 8.223627932291995e-06, + "loss": 0.6894, + "step": 91480 + }, + { + "epoch": 1.4685628982808714, + "grad_norm": 1.2324583530426025, + "learning_rate": 8.218954283916477e-06, + "loss": 0.62, + "step": 91490 + }, + { + "epoch": 1.4687234145010353, + "grad_norm": 0.6687842607498169, + "learning_rate": 8.214281702716975e-06, + "loss": 0.6634, + "step": 91500 + }, + { + "epoch": 1.4688839307211994, + "grad_norm": 0.8047727346420288, + "learning_rate": 8.209610188990632e-06, + "loss": 0.6592, + "step": 91510 + }, + { + "epoch": 1.4690444469413635, + "grad_norm": 0.9184284210205078, + "learning_rate": 8.204939743034532e-06, + "loss": 0.7917, + "step": 91520 + }, + { + "epoch": 1.4692049631615274, + "grad_norm": 0.7318720817565918, + "learning_rate": 8.20027036514569e-06, + "loss": 0.8141, + "step": 91530 + }, + { + "epoch": 1.4693654793816915, + "grad_norm": 1.0854613780975342, + "learning_rate": 8.195602055621046e-06, + "loss": 0.6789, + "step": 91540 + }, + { + "epoch": 1.4695259956018556, + "grad_norm": 0.9242256879806519, + "learning_rate": 8.190934814757486e-06, + "loss": 0.683, + "step": 91550 + }, + { + "epoch": 1.4696865118220197, + "grad_norm": 1.2617201805114746, + "learning_rate": 8.186268642851807e-06, + "loss": 0.8007, + "step": 91560 + }, + { + "epoch": 1.4698470280421836, + "grad_norm": 0.6851112842559814, + "learning_rate": 8.181603540200756e-06, + "loss": 0.7641, + "step": 91570 + }, + { + "epoch": 1.4700075442623477, + "grad_norm": 0.9417204260826111, + "learning_rate": 8.176939507101005e-06, + "loss": 0.7068, + "step": 91580 + }, + { + "epoch": 1.4701680604825118, + "grad_norm": 0.8186387419700623, + "learning_rate": 8.172276543849161e-06, + "loss": 0.7732, + "step": 91590 + }, + { + "epoch": 1.4703285767026757, + "grad_norm": 1.0748709440231323, + "learning_rate": 8.167614650741759e-06, + "loss": 0.7754, + "step": 91600 + }, + { + "epoch": 1.4704890929228398, + "grad_norm": 1.6874983310699463, + "learning_rate": 8.162953828075275e-06, + "loss": 0.6809, + "step": 91610 + }, + { + "epoch": 1.470649609143004, + "grad_norm": 1.3989510536193848, + "learning_rate": 8.1582940761461e-06, + "loss": 0.6083, + "step": 91620 + }, + { + "epoch": 1.470810125363168, + "grad_norm": 1.1134885549545288, + "learning_rate": 8.15363539525057e-06, + "loss": 0.8673, + "step": 91630 + }, + { + "epoch": 1.4709706415833321, + "grad_norm": 1.1205706596374512, + "learning_rate": 8.14897778568495e-06, + "loss": 0.6773, + "step": 91640 + }, + { + "epoch": 1.471131157803496, + "grad_norm": 3.812089681625366, + "learning_rate": 8.144321247745438e-06, + "loss": 0.6863, + "step": 91650 + }, + { + "epoch": 1.4712916740236601, + "grad_norm": 1.4385775327682495, + "learning_rate": 8.139665781728162e-06, + "loss": 0.7328, + "step": 91660 + }, + { + "epoch": 1.471452190243824, + "grad_norm": 0.8987877368927002, + "learning_rate": 8.13501138792919e-06, + "loss": 0.6999, + "step": 91670 + }, + { + "epoch": 1.4716127064639881, + "grad_norm": 1.2253241539001465, + "learning_rate": 8.130358066644497e-06, + "loss": 0.6401, + "step": 91680 + }, + { + "epoch": 1.4717732226841522, + "grad_norm": 0.9778072237968445, + "learning_rate": 8.125705818170018e-06, + "loss": 0.5882, + "step": 91690 + }, + { + "epoch": 1.4719337389043163, + "grad_norm": 1.0027074813842773, + "learning_rate": 8.121054642801605e-06, + "loss": 0.6487, + "step": 91700 + }, + { + "epoch": 1.4720942551244804, + "grad_norm": 0.8128554821014404, + "learning_rate": 8.116404540835048e-06, + "loss": 0.7182, + "step": 91710 + }, + { + "epoch": 1.4722547713446443, + "grad_norm": 1.1022735834121704, + "learning_rate": 8.111755512566067e-06, + "loss": 0.7189, + "step": 91720 + }, + { + "epoch": 1.4724152875648084, + "grad_norm": 0.9330240488052368, + "learning_rate": 8.107107558290312e-06, + "loss": 0.7434, + "step": 91730 + }, + { + "epoch": 1.4725758037849725, + "grad_norm": 1.2634129524230957, + "learning_rate": 8.102460678303369e-06, + "loss": 0.7062, + "step": 91740 + }, + { + "epoch": 1.4727363200051364, + "grad_norm": 1.2546606063842773, + "learning_rate": 8.097814872900745e-06, + "loss": 0.6553, + "step": 91750 + }, + { + "epoch": 1.4728968362253005, + "grad_norm": 1.087226390838623, + "learning_rate": 8.093170142377884e-06, + "loss": 0.7159, + "step": 91760 + }, + { + "epoch": 1.4730573524454647, + "grad_norm": 1.2494478225708008, + "learning_rate": 8.088526487030174e-06, + "loss": 0.6607, + "step": 91770 + }, + { + "epoch": 1.4732178686656288, + "grad_norm": 1.0684573650360107, + "learning_rate": 8.083883907152917e-06, + "loss": 0.7693, + "step": 91780 + }, + { + "epoch": 1.4733783848857926, + "grad_norm": 0.8468582630157471, + "learning_rate": 8.079242403041357e-06, + "loss": 0.759, + "step": 91790 + }, + { + "epoch": 1.4735389011059568, + "grad_norm": 0.8663548827171326, + "learning_rate": 8.074601974990672e-06, + "loss": 0.7331, + "step": 91800 + }, + { + "epoch": 1.4736994173261209, + "grad_norm": 1.2345057725906372, + "learning_rate": 8.069962623295952e-06, + "loss": 0.7597, + "step": 91810 + }, + { + "epoch": 1.4738599335462848, + "grad_norm": 2.680403709411621, + "learning_rate": 8.06532434825224e-06, + "loss": 0.7646, + "step": 91820 + }, + { + "epoch": 1.4740204497664489, + "grad_norm": 1.1297132968902588, + "learning_rate": 8.060687150154503e-06, + "loss": 0.7502, + "step": 91830 + }, + { + "epoch": 1.474180965986613, + "grad_norm": 1.2966169118881226, + "learning_rate": 8.05605102929764e-06, + "loss": 0.7618, + "step": 91840 + }, + { + "epoch": 1.474341482206777, + "grad_norm": 1.2776001691818237, + "learning_rate": 8.05141598597648e-06, + "loss": 0.7578, + "step": 91850 + }, + { + "epoch": 1.474501998426941, + "grad_norm": 0.8707104325294495, + "learning_rate": 8.046782020485793e-06, + "loss": 0.7749, + "step": 91860 + }, + { + "epoch": 1.474662514647105, + "grad_norm": 1.7200002670288086, + "learning_rate": 8.042149133120255e-06, + "loss": 0.7107, + "step": 91870 + }, + { + "epoch": 1.4748230308672692, + "grad_norm": 0.8646041750907898, + "learning_rate": 8.037517324174503e-06, + "loss": 0.5698, + "step": 91880 + }, + { + "epoch": 1.474983547087433, + "grad_norm": 0.9662037491798401, + "learning_rate": 8.032886593943086e-06, + "loss": 0.7776, + "step": 91890 + }, + { + "epoch": 1.4751440633075972, + "grad_norm": 0.7601067423820496, + "learning_rate": 8.028256942720496e-06, + "loss": 0.7597, + "step": 91900 + }, + { + "epoch": 1.4753045795277613, + "grad_norm": 1.0359374284744263, + "learning_rate": 8.023628370801153e-06, + "loss": 0.6936, + "step": 91910 + }, + { + "epoch": 1.4754650957479254, + "grad_norm": 0.7891344428062439, + "learning_rate": 8.01900087847941e-06, + "loss": 0.7094, + "step": 91920 + }, + { + "epoch": 1.4756256119680895, + "grad_norm": 1.2743159532546997, + "learning_rate": 8.014374466049535e-06, + "loss": 0.7089, + "step": 91930 + }, + { + "epoch": 1.4757861281882534, + "grad_norm": 0.7786523103713989, + "learning_rate": 8.00974913380575e-06, + "loss": 0.6366, + "step": 91940 + }, + { + "epoch": 1.4759466444084175, + "grad_norm": 0.9966202974319458, + "learning_rate": 8.005124882042194e-06, + "loss": 0.6667, + "step": 91950 + }, + { + "epoch": 1.4761071606285814, + "grad_norm": 1.2952536344528198, + "learning_rate": 8.00050171105295e-06, + "loss": 0.5619, + "step": 91960 + }, + { + "epoch": 1.4762676768487455, + "grad_norm": 0.8240392208099365, + "learning_rate": 7.995879621132016e-06, + "loss": 0.7362, + "step": 91970 + }, + { + "epoch": 1.4764281930689096, + "grad_norm": 0.873108446598053, + "learning_rate": 7.991258612573337e-06, + "loss": 0.6771, + "step": 91980 + }, + { + "epoch": 1.4765887092890737, + "grad_norm": 0.6276317834854126, + "learning_rate": 7.986638685670785e-06, + "loss": 0.5973, + "step": 91990 + }, + { + "epoch": 1.4767492255092378, + "grad_norm": 1.222180962562561, + "learning_rate": 7.98201984071815e-06, + "loss": 0.701, + "step": 92000 + }, + { + "epoch": 1.4767492255092378, + "eval_loss": 0.7714616656303406, + "eval_runtime": 1833.3541, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 1.789, + "step": 92000 + }, + { + "epoch": 1.4769097417294017, + "grad_norm": 1.0356537103652954, + "learning_rate": 7.977402078009164e-06, + "loss": 0.6488, + "step": 92010 + }, + { + "epoch": 1.4770702579495658, + "grad_norm": 1.0612620115280151, + "learning_rate": 7.972785397837492e-06, + "loss": 0.6614, + "step": 92020 + }, + { + "epoch": 1.47723077416973, + "grad_norm": 1.0417875051498413, + "learning_rate": 7.968169800496733e-06, + "loss": 0.6705, + "step": 92030 + }, + { + "epoch": 1.4773912903898938, + "grad_norm": 2.007277011871338, + "learning_rate": 7.963555286280409e-06, + "loss": 0.7531, + "step": 92040 + }, + { + "epoch": 1.477551806610058, + "grad_norm": 0.9217204451560974, + "learning_rate": 7.958941855481972e-06, + "loss": 0.7162, + "step": 92050 + }, + { + "epoch": 1.477712322830222, + "grad_norm": 0.7045002579689026, + "learning_rate": 7.954329508394811e-06, + "loss": 0.6967, + "step": 92060 + }, + { + "epoch": 1.4778728390503861, + "grad_norm": 0.955061674118042, + "learning_rate": 7.949718245312248e-06, + "loss": 0.8048, + "step": 92070 + }, + { + "epoch": 1.47803335527055, + "grad_norm": 0.9909802675247192, + "learning_rate": 7.94510806652753e-06, + "loss": 0.6934, + "step": 92080 + }, + { + "epoch": 1.4781938714907141, + "grad_norm": 0.9249708652496338, + "learning_rate": 7.940498972333842e-06, + "loss": 0.7742, + "step": 92090 + }, + { + "epoch": 1.4783543877108782, + "grad_norm": 1.0950417518615723, + "learning_rate": 7.935890963024284e-06, + "loss": 0.8163, + "step": 92100 + }, + { + "epoch": 1.4785149039310421, + "grad_norm": 1.1950844526290894, + "learning_rate": 7.931284038891903e-06, + "loss": 0.634, + "step": 92110 + }, + { + "epoch": 1.4786754201512062, + "grad_norm": 1.2673505544662476, + "learning_rate": 7.926678200229676e-06, + "loss": 0.7366, + "step": 92120 + }, + { + "epoch": 1.4788359363713703, + "grad_norm": 0.8782106041908264, + "learning_rate": 7.922073447330504e-06, + "loss": 0.6353, + "step": 92130 + }, + { + "epoch": 1.4789964525915344, + "grad_norm": 0.9132172465324402, + "learning_rate": 7.917469780487224e-06, + "loss": 0.7178, + "step": 92140 + }, + { + "epoch": 1.4791569688116986, + "grad_norm": 2.117964744567871, + "learning_rate": 7.912867199992602e-06, + "loss": 0.7638, + "step": 92150 + }, + { + "epoch": 1.4793174850318624, + "grad_norm": 1.4364209175109863, + "learning_rate": 7.908265706139334e-06, + "loss": 0.7899, + "step": 92160 + }, + { + "epoch": 1.4794780012520266, + "grad_norm": 0.8767213225364685, + "learning_rate": 7.903665299220058e-06, + "loss": 0.7187, + "step": 92170 + }, + { + "epoch": 1.4796385174721904, + "grad_norm": 1.3753732442855835, + "learning_rate": 7.899065979527314e-06, + "loss": 0.7812, + "step": 92180 + }, + { + "epoch": 1.4797990336923545, + "grad_norm": 1.015817642211914, + "learning_rate": 7.894467747353604e-06, + "loss": 0.7645, + "step": 92190 + }, + { + "epoch": 1.4799595499125187, + "grad_norm": 1.0355803966522217, + "learning_rate": 7.889870602991346e-06, + "loss": 0.6088, + "step": 92200 + }, + { + "epoch": 1.4801200661326828, + "grad_norm": 0.8718878626823425, + "learning_rate": 7.885274546732887e-06, + "loss": 0.8375, + "step": 92210 + }, + { + "epoch": 1.4802805823528469, + "grad_norm": 0.9139881730079651, + "learning_rate": 7.880679578870517e-06, + "loss": 0.7154, + "step": 92220 + }, + { + "epoch": 1.4804410985730108, + "grad_norm": 0.7667800784111023, + "learning_rate": 7.876085699696453e-06, + "loss": 0.6364, + "step": 92230 + }, + { + "epoch": 1.4806016147931749, + "grad_norm": 1.0576905012130737, + "learning_rate": 7.871492909502822e-06, + "loss": 0.5731, + "step": 92240 + }, + { + "epoch": 1.480762131013339, + "grad_norm": 1.311134934425354, + "learning_rate": 7.866901208581711e-06, + "loss": 0.7085, + "step": 92250 + }, + { + "epoch": 1.4809226472335029, + "grad_norm": 1.2861076593399048, + "learning_rate": 7.86231059722512e-06, + "loss": 0.7094, + "step": 92260 + }, + { + "epoch": 1.481083163453667, + "grad_norm": 0.6214507818222046, + "learning_rate": 7.857721075724988e-06, + "loss": 0.6077, + "step": 92270 + }, + { + "epoch": 1.481243679673831, + "grad_norm": 1.386151909828186, + "learning_rate": 7.85313264437318e-06, + "loss": 0.5988, + "step": 92280 + }, + { + "epoch": 1.4814041958939952, + "grad_norm": 1.0135585069656372, + "learning_rate": 7.848545303461494e-06, + "loss": 0.6684, + "step": 92290 + }, + { + "epoch": 1.481564712114159, + "grad_norm": 1.8247491121292114, + "learning_rate": 7.843959053281663e-06, + "loss": 0.7542, + "step": 92300 + }, + { + "epoch": 1.4817252283343232, + "grad_norm": 0.8039498925209045, + "learning_rate": 7.839373894125337e-06, + "loss": 0.7821, + "step": 92310 + }, + { + "epoch": 1.4818857445544873, + "grad_norm": 0.8170836567878723, + "learning_rate": 7.834789826284106e-06, + "loss": 0.7378, + "step": 92320 + }, + { + "epoch": 1.4820462607746512, + "grad_norm": 1.4713988304138184, + "learning_rate": 7.830206850049492e-06, + "loss": 0.7346, + "step": 92330 + }, + { + "epoch": 1.4822067769948153, + "grad_norm": 0.996407687664032, + "learning_rate": 7.82562496571295e-06, + "loss": 0.5881, + "step": 92340 + }, + { + "epoch": 1.4823672932149794, + "grad_norm": 1.529238224029541, + "learning_rate": 7.821044173565854e-06, + "loss": 0.7254, + "step": 92350 + }, + { + "epoch": 1.4825278094351435, + "grad_norm": 1.1604135036468506, + "learning_rate": 7.816464473899523e-06, + "loss": 0.6513, + "step": 92360 + }, + { + "epoch": 1.4826883256553074, + "grad_norm": 1.1996623277664185, + "learning_rate": 7.811885867005193e-06, + "loss": 0.75, + "step": 92370 + }, + { + "epoch": 1.4828488418754715, + "grad_norm": 0.8078352212905884, + "learning_rate": 7.807308353174034e-06, + "loss": 0.7297, + "step": 92380 + }, + { + "epoch": 1.4830093580956356, + "grad_norm": 1.1507418155670166, + "learning_rate": 7.802731932697152e-06, + "loss": 0.6143, + "step": 92390 + }, + { + "epoch": 1.4831698743157995, + "grad_norm": 1.006565809249878, + "learning_rate": 7.798156605865583e-06, + "loss": 0.6791, + "step": 92400 + }, + { + "epoch": 1.4833303905359636, + "grad_norm": 1.6044483184814453, + "learning_rate": 7.793582372970291e-06, + "loss": 0.5056, + "step": 92410 + }, + { + "epoch": 1.4834909067561277, + "grad_norm": 1.2055988311767578, + "learning_rate": 7.789009234302172e-06, + "loss": 0.7398, + "step": 92420 + }, + { + "epoch": 1.4836514229762918, + "grad_norm": 1.7691184282302856, + "learning_rate": 7.784437190152042e-06, + "loss": 0.6202, + "step": 92430 + }, + { + "epoch": 1.483811939196456, + "grad_norm": 0.8782910704612732, + "learning_rate": 7.779866240810662e-06, + "loss": 0.6342, + "step": 92440 + }, + { + "epoch": 1.4839724554166198, + "grad_norm": 1.5843807458877563, + "learning_rate": 7.775296386568718e-06, + "loss": 0.8432, + "step": 92450 + }, + { + "epoch": 1.484132971636784, + "grad_norm": 2.5742392539978027, + "learning_rate": 7.770727627716822e-06, + "loss": 0.7468, + "step": 92460 + }, + { + "epoch": 1.4842934878569478, + "grad_norm": 1.0406732559204102, + "learning_rate": 7.766159964545528e-06, + "loss": 0.6481, + "step": 92470 + }, + { + "epoch": 1.484454004077112, + "grad_norm": 1.2066880464553833, + "learning_rate": 7.761593397345313e-06, + "loss": 0.8277, + "step": 92480 + }, + { + "epoch": 1.484614520297276, + "grad_norm": 1.2809698581695557, + "learning_rate": 7.757027926406568e-06, + "loss": 0.5063, + "step": 92490 + }, + { + "epoch": 1.4847750365174401, + "grad_norm": 1.2903908491134644, + "learning_rate": 7.752463552019645e-06, + "loss": 0.7339, + "step": 92500 + }, + { + "epoch": 1.4849355527376042, + "grad_norm": 0.6806390881538391, + "learning_rate": 7.747900274474807e-06, + "loss": 0.7923, + "step": 92510 + }, + { + "epoch": 1.4850960689577681, + "grad_norm": 1.6836051940917969, + "learning_rate": 7.74333809406225e-06, + "loss": 0.8645, + "step": 92520 + }, + { + "epoch": 1.4852565851779322, + "grad_norm": 0.9911530017852783, + "learning_rate": 7.738777011072105e-06, + "loss": 0.7163, + "step": 92530 + }, + { + "epoch": 1.4854171013980964, + "grad_norm": 1.4174832105636597, + "learning_rate": 7.73421702579443e-06, + "loss": 0.6933, + "step": 92540 + }, + { + "epoch": 1.4855776176182602, + "grad_norm": 1.2754042148590088, + "learning_rate": 7.72965813851922e-06, + "loss": 0.6183, + "step": 92550 + }, + { + "epoch": 1.4857381338384243, + "grad_norm": 0.8126199841499329, + "learning_rate": 7.725100349536377e-06, + "loss": 0.6985, + "step": 92560 + }, + { + "epoch": 1.4858986500585885, + "grad_norm": 1.272567868232727, + "learning_rate": 7.720543659135757e-06, + "loss": 0.6664, + "step": 92570 + }, + { + "epoch": 1.4860591662787526, + "grad_norm": 0.995111882686615, + "learning_rate": 7.715988067607142e-06, + "loss": 0.6629, + "step": 92580 + }, + { + "epoch": 1.4862196824989165, + "grad_norm": 1.46920907497406, + "learning_rate": 7.71143357524024e-06, + "loss": 0.7341, + "step": 92590 + }, + { + "epoch": 1.4863801987190806, + "grad_norm": 1.5877832174301147, + "learning_rate": 7.706880182324685e-06, + "loss": 0.6918, + "step": 92600 + }, + { + "epoch": 1.4865407149392447, + "grad_norm": 0.7598055005073547, + "learning_rate": 7.702327889150063e-06, + "loss": 0.7138, + "step": 92610 + }, + { + "epoch": 1.4867012311594086, + "grad_norm": 0.6711224913597107, + "learning_rate": 7.697776696005848e-06, + "loss": 0.6295, + "step": 92620 + }, + { + "epoch": 1.4868617473795727, + "grad_norm": 0.9538777470588684, + "learning_rate": 7.693226603181486e-06, + "loss": 0.6437, + "step": 92630 + }, + { + "epoch": 1.4870222635997368, + "grad_norm": 1.2586243152618408, + "learning_rate": 7.688677610966327e-06, + "loss": 0.7481, + "step": 92640 + }, + { + "epoch": 1.4871827798199009, + "grad_norm": 0.8577752113342285, + "learning_rate": 7.684129719649669e-06, + "loss": 0.787, + "step": 92650 + }, + { + "epoch": 1.4873432960400648, + "grad_norm": 1.1414399147033691, + "learning_rate": 7.679582929520723e-06, + "loss": 0.7146, + "step": 92660 + }, + { + "epoch": 1.4875038122602289, + "grad_norm": 1.5150303840637207, + "learning_rate": 7.675037240868643e-06, + "loss": 0.711, + "step": 92670 + }, + { + "epoch": 1.487664328480393, + "grad_norm": 0.9849445223808289, + "learning_rate": 7.670492653982509e-06, + "loss": 0.7638, + "step": 92680 + }, + { + "epoch": 1.4878248447005569, + "grad_norm": 1.271337866783142, + "learning_rate": 7.665949169151329e-06, + "loss": 0.7453, + "step": 92690 + }, + { + "epoch": 1.487985360920721, + "grad_norm": 1.5071160793304443, + "learning_rate": 7.661406786664039e-06, + "loss": 0.7017, + "step": 92700 + }, + { + "epoch": 1.488145877140885, + "grad_norm": 1.1314489841461182, + "learning_rate": 7.65686550680951e-06, + "loss": 0.6973, + "step": 92710 + }, + { + "epoch": 1.4883063933610492, + "grad_norm": 1.3943064212799072, + "learning_rate": 7.65232532987655e-06, + "loss": 0.6949, + "step": 92720 + }, + { + "epoch": 1.4884669095812133, + "grad_norm": 1.5427837371826172, + "learning_rate": 7.647786256153871e-06, + "loss": 0.671, + "step": 92730 + }, + { + "epoch": 1.4886274258013772, + "grad_norm": 0.9159241914749146, + "learning_rate": 7.643248285930138e-06, + "loss": 0.7665, + "step": 92740 + }, + { + "epoch": 1.4887879420215413, + "grad_norm": 1.0416336059570312, + "learning_rate": 7.638711419493942e-06, + "loss": 0.651, + "step": 92750 + }, + { + "epoch": 1.4889484582417052, + "grad_norm": 0.8158416748046875, + "learning_rate": 7.634175657133797e-06, + "loss": 0.6574, + "step": 92760 + }, + { + "epoch": 1.4891089744618693, + "grad_norm": 1.1214104890823364, + "learning_rate": 7.629640999138155e-06, + "loss": 0.6826, + "step": 92770 + }, + { + "epoch": 1.4892694906820334, + "grad_norm": 0.8395774364471436, + "learning_rate": 7.62510744579539e-06, + "loss": 0.7815, + "step": 92780 + }, + { + "epoch": 1.4894300069021975, + "grad_norm": 1.2541799545288086, + "learning_rate": 7.620574997393812e-06, + "loss": 0.7203, + "step": 92790 + }, + { + "epoch": 1.4895905231223616, + "grad_norm": 0.6720486879348755, + "learning_rate": 7.616043654221667e-06, + "loss": 0.661, + "step": 92800 + }, + { + "epoch": 1.4897510393425255, + "grad_norm": 1.3466463088989258, + "learning_rate": 7.611513416567101e-06, + "loss": 0.6733, + "step": 92810 + }, + { + "epoch": 1.4899115555626896, + "grad_norm": 0.808355987071991, + "learning_rate": 7.606984284718224e-06, + "loss": 0.6439, + "step": 92820 + }, + { + "epoch": 1.4900720717828537, + "grad_norm": 0.6044198870658875, + "learning_rate": 7.602456258963059e-06, + "loss": 0.7581, + "step": 92830 + }, + { + "epoch": 1.4902325880030176, + "grad_norm": 0.7243287563323975, + "learning_rate": 7.597929339589563e-06, + "loss": 0.742, + "step": 92840 + }, + { + "epoch": 1.4903931042231817, + "grad_norm": 0.9044568538665771, + "learning_rate": 7.59340352688562e-06, + "loss": 0.64, + "step": 92850 + }, + { + "epoch": 1.4905536204433458, + "grad_norm": 0.8581441640853882, + "learning_rate": 7.588878821139056e-06, + "loss": 0.6858, + "step": 92860 + }, + { + "epoch": 1.49071413666351, + "grad_norm": 1.3391809463500977, + "learning_rate": 7.584355222637596e-06, + "loss": 0.6605, + "step": 92870 + }, + { + "epoch": 1.4908746528836738, + "grad_norm": 0.951670229434967, + "learning_rate": 7.579832731668926e-06, + "loss": 0.738, + "step": 92880 + }, + { + "epoch": 1.491035169103838, + "grad_norm": 1.0799212455749512, + "learning_rate": 7.575311348520648e-06, + "loss": 0.6776, + "step": 92890 + }, + { + "epoch": 1.491195685324002, + "grad_norm": 1.303659200668335, + "learning_rate": 7.570791073480296e-06, + "loss": 0.7815, + "step": 92900 + }, + { + "epoch": 1.491356201544166, + "grad_norm": 1.1690927743911743, + "learning_rate": 7.5662719068353314e-06, + "loss": 0.6737, + "step": 92910 + }, + { + "epoch": 1.49151671776433, + "grad_norm": 0.7879657745361328, + "learning_rate": 7.561753848873157e-06, + "loss": 0.7563, + "step": 92920 + }, + { + "epoch": 1.4916772339844941, + "grad_norm": 1.2197242975234985, + "learning_rate": 7.5572368998810764e-06, + "loss": 0.6532, + "step": 92930 + }, + { + "epoch": 1.4918377502046583, + "grad_norm": 0.753649115562439, + "learning_rate": 7.552721060146353e-06, + "loss": 0.6938, + "step": 92940 + }, + { + "epoch": 1.4919982664248224, + "grad_norm": 1.1883491277694702, + "learning_rate": 7.548206329956162e-06, + "loss": 0.6822, + "step": 92950 + }, + { + "epoch": 1.4921587826449862, + "grad_norm": 1.4623253345489502, + "learning_rate": 7.543692709597619e-06, + "loss": 0.7248, + "step": 92960 + }, + { + "epoch": 1.4923192988651504, + "grad_norm": 0.8820719122886658, + "learning_rate": 7.539180199357762e-06, + "loss": 0.7631, + "step": 92970 + }, + { + "epoch": 1.4924798150853142, + "grad_norm": 0.7019714117050171, + "learning_rate": 7.534668799523565e-06, + "loss": 0.7548, + "step": 92980 + }, + { + "epoch": 1.4926403313054784, + "grad_norm": 0.954365074634552, + "learning_rate": 7.530158510381915e-06, + "loss": 0.8056, + "step": 92990 + }, + { + "epoch": 1.4928008475256425, + "grad_norm": 0.9765733480453491, + "learning_rate": 7.525649332219648e-06, + "loss": 0.6921, + "step": 93000 + }, + { + "epoch": 1.4929613637458066, + "grad_norm": 0.9735218286514282, + "learning_rate": 7.5211412653235166e-06, + "loss": 0.6891, + "step": 93010 + }, + { + "epoch": 1.4931218799659707, + "grad_norm": 0.8509323000907898, + "learning_rate": 7.516634309980211e-06, + "loss": 0.6574, + "step": 93020 + }, + { + "epoch": 1.4932823961861346, + "grad_norm": 1.757307767868042, + "learning_rate": 7.512128466476345e-06, + "loss": 0.7657, + "step": 93030 + }, + { + "epoch": 1.4934429124062987, + "grad_norm": 1.1982083320617676, + "learning_rate": 7.507623735098476e-06, + "loss": 0.7586, + "step": 93040 + }, + { + "epoch": 1.4936034286264628, + "grad_norm": 1.143458366394043, + "learning_rate": 7.503120116133059e-06, + "loss": 0.8484, + "step": 93050 + }, + { + "epoch": 1.4937639448466267, + "grad_norm": 1.4062753915786743, + "learning_rate": 7.498617609866504e-06, + "loss": 0.6365, + "step": 93060 + }, + { + "epoch": 1.4939244610667908, + "grad_norm": 1.4539790153503418, + "learning_rate": 7.4941162165851485e-06, + "loss": 0.8172, + "step": 93070 + }, + { + "epoch": 1.4940849772869549, + "grad_norm": 0.9248805642127991, + "learning_rate": 7.48961593657525e-06, + "loss": 0.7147, + "step": 93080 + }, + { + "epoch": 1.494245493507119, + "grad_norm": 2.3752942085266113, + "learning_rate": 7.485116770123005e-06, + "loss": 0.7583, + "step": 93090 + }, + { + "epoch": 1.4944060097272829, + "grad_norm": 0.9518377184867859, + "learning_rate": 7.480618717514532e-06, + "loss": 0.5223, + "step": 93100 + }, + { + "epoch": 1.494566525947447, + "grad_norm": 1.081591248512268, + "learning_rate": 7.476121779035886e-06, + "loss": 0.6184, + "step": 93110 + }, + { + "epoch": 1.494727042167611, + "grad_norm": 1.0449568033218384, + "learning_rate": 7.471625954973033e-06, + "loss": 0.6894, + "step": 93120 + }, + { + "epoch": 1.494887558387775, + "grad_norm": 0.876492977142334, + "learning_rate": 7.467131245611889e-06, + "loss": 0.7586, + "step": 93130 + }, + { + "epoch": 1.495048074607939, + "grad_norm": 1.2545437812805176, + "learning_rate": 7.462637651238289e-06, + "loss": 0.6698, + "step": 93140 + }, + { + "epoch": 1.4952085908281032, + "grad_norm": 1.006011724472046, + "learning_rate": 7.458145172138001e-06, + "loss": 0.6682, + "step": 93150 + }, + { + "epoch": 1.4953691070482673, + "grad_norm": 1.0566887855529785, + "learning_rate": 7.453653808596719e-06, + "loss": 0.6122, + "step": 93160 + }, + { + "epoch": 1.4955296232684312, + "grad_norm": 0.794274091720581, + "learning_rate": 7.449163560900077e-06, + "loss": 0.6683, + "step": 93170 + }, + { + "epoch": 1.4956901394885953, + "grad_norm": 1.3437732458114624, + "learning_rate": 7.444674429333612e-06, + "loss": 0.7152, + "step": 93180 + }, + { + "epoch": 1.4958506557087594, + "grad_norm": 0.9398924112319946, + "learning_rate": 7.440186414182815e-06, + "loss": 0.7553, + "step": 93190 + }, + { + "epoch": 1.4960111719289233, + "grad_norm": 0.8876321911811829, + "learning_rate": 7.435699515733094e-06, + "loss": 0.5989, + "step": 93200 + }, + { + "epoch": 1.4961716881490874, + "grad_norm": 0.8571386337280273, + "learning_rate": 7.431213734269793e-06, + "loss": 0.669, + "step": 93210 + }, + { + "epoch": 1.4963322043692515, + "grad_norm": 0.8599319458007812, + "learning_rate": 7.426729070078181e-06, + "loss": 0.7373, + "step": 93220 + }, + { + "epoch": 1.4964927205894156, + "grad_norm": 1.0281751155853271, + "learning_rate": 7.422245523443461e-06, + "loss": 0.818, + "step": 93230 + }, + { + "epoch": 1.4966532368095797, + "grad_norm": 0.9178064465522766, + "learning_rate": 7.417763094650748e-06, + "loss": 0.7283, + "step": 93240 + }, + { + "epoch": 1.4968137530297436, + "grad_norm": 1.1372699737548828, + "learning_rate": 7.413281783985107e-06, + "loss": 0.7056, + "step": 93250 + }, + { + "epoch": 1.4969742692499077, + "grad_norm": 0.9302920699119568, + "learning_rate": 7.408801591731518e-06, + "loss": 0.7187, + "step": 93260 + }, + { + "epoch": 1.4971347854700716, + "grad_norm": 1.3444329500198364, + "learning_rate": 7.4043225181749e-06, + "loss": 0.6964, + "step": 93270 + }, + { + "epoch": 1.4972953016902357, + "grad_norm": 0.5933625102043152, + "learning_rate": 7.399844563600092e-06, + "loss": 0.7061, + "step": 93280 + }, + { + "epoch": 1.4974558179103998, + "grad_norm": 0.9724730253219604, + "learning_rate": 7.395367728291869e-06, + "loss": 0.7163, + "step": 93290 + }, + { + "epoch": 1.497616334130564, + "grad_norm": 1.205877661705017, + "learning_rate": 7.390892012534928e-06, + "loss": 0.5831, + "step": 93300 + }, + { + "epoch": 1.497776850350728, + "grad_norm": 1.0602452754974365, + "learning_rate": 7.386417416613903e-06, + "loss": 0.7433, + "step": 93310 + }, + { + "epoch": 1.497937366570892, + "grad_norm": 1.5986872911453247, + "learning_rate": 7.381943940813346e-06, + "loss": 0.7457, + "step": 93320 + }, + { + "epoch": 1.498097882791056, + "grad_norm": 1.4429795742034912, + "learning_rate": 7.377471585417747e-06, + "loss": 0.6102, + "step": 93330 + }, + { + "epoch": 1.4982583990112202, + "grad_norm": 1.0705498456954956, + "learning_rate": 7.3730003507115316e-06, + "loss": 0.6756, + "step": 93340 + }, + { + "epoch": 1.498418915231384, + "grad_norm": 1.0252116918563843, + "learning_rate": 7.368530236979023e-06, + "loss": 0.6045, + "step": 93350 + }, + { + "epoch": 1.4985794314515481, + "grad_norm": 0.9047755599021912, + "learning_rate": 7.364061244504508e-06, + "loss": 0.6915, + "step": 93360 + }, + { + "epoch": 1.4987399476717123, + "grad_norm": 0.9400933980941772, + "learning_rate": 7.359593373572185e-06, + "loss": 0.6444, + "step": 93370 + }, + { + "epoch": 1.4989004638918764, + "grad_norm": 1.4222099781036377, + "learning_rate": 7.3551266244661825e-06, + "loss": 0.6942, + "step": 93380 + }, + { + "epoch": 1.4990609801120403, + "grad_norm": 1.2404171228408813, + "learning_rate": 7.350660997470563e-06, + "loss": 0.7885, + "step": 93390 + }, + { + "epoch": 1.4992214963322044, + "grad_norm": 1.3270188570022583, + "learning_rate": 7.346196492869315e-06, + "loss": 0.6075, + "step": 93400 + }, + { + "epoch": 1.4993820125523685, + "grad_norm": 0.8023919463157654, + "learning_rate": 7.34173311094635e-06, + "loss": 0.61, + "step": 93410 + }, + { + "epoch": 1.4995425287725324, + "grad_norm": 0.865720272064209, + "learning_rate": 7.337270851985526e-06, + "loss": 0.6777, + "step": 93420 + }, + { + "epoch": 1.4997030449926965, + "grad_norm": 0.8895630240440369, + "learning_rate": 7.332809716270594e-06, + "loss": 0.6019, + "step": 93430 + }, + { + "epoch": 1.4998635612128606, + "grad_norm": 1.284072756767273, + "learning_rate": 7.328349704085271e-06, + "loss": 0.6434, + "step": 93440 + }, + { + "epoch": 1.5000240774330247, + "grad_norm": 1.2293554544448853, + "learning_rate": 7.323890815713186e-06, + "loss": 0.5488, + "step": 93450 + }, + { + "epoch": 1.5001845936531888, + "grad_norm": 0.8528238534927368, + "learning_rate": 7.319433051437896e-06, + "loss": 0.7049, + "step": 93460 + }, + { + "epoch": 1.5003451098733527, + "grad_norm": 1.7506749629974365, + "learning_rate": 7.314976411542887e-06, + "loss": 0.6365, + "step": 93470 + }, + { + "epoch": 1.5005056260935168, + "grad_norm": 0.8340057730674744, + "learning_rate": 7.310520896311587e-06, + "loss": 0.6212, + "step": 93480 + }, + { + "epoch": 1.5006661423136807, + "grad_norm": 1.1240174770355225, + "learning_rate": 7.306066506027326e-06, + "loss": 0.7425, + "step": 93490 + }, + { + "epoch": 1.5008266585338448, + "grad_norm": 0.9033777117729187, + "learning_rate": 7.301613240973379e-06, + "loss": 0.7398, + "step": 93500 + }, + { + "epoch": 1.500987174754009, + "grad_norm": 1.0103987455368042, + "learning_rate": 7.297161101432953e-06, + "loss": 0.7567, + "step": 93510 + }, + { + "epoch": 1.501147690974173, + "grad_norm": 0.7874979376792908, + "learning_rate": 7.292710087689175e-06, + "loss": 0.7739, + "step": 93520 + }, + { + "epoch": 1.501308207194337, + "grad_norm": 0.8659284114837646, + "learning_rate": 7.288260200025104e-06, + "loss": 0.6477, + "step": 93530 + }, + { + "epoch": 1.501468723414501, + "grad_norm": 1.3439648151397705, + "learning_rate": 7.283811438723737e-06, + "loss": 0.7137, + "step": 93540 + }, + { + "epoch": 1.501629239634665, + "grad_norm": 1.3601844310760498, + "learning_rate": 7.279363804067968e-06, + "loss": 0.598, + "step": 93550 + }, + { + "epoch": 1.501789755854829, + "grad_norm": 1.271753191947937, + "learning_rate": 7.274917296340653e-06, + "loss": 0.6814, + "step": 93560 + }, + { + "epoch": 1.501950272074993, + "grad_norm": 0.8762750625610352, + "learning_rate": 7.270471915824561e-06, + "loss": 0.6361, + "step": 93570 + }, + { + "epoch": 1.5021107882951572, + "grad_norm": 1.061976432800293, + "learning_rate": 7.266027662802394e-06, + "loss": 0.6468, + "step": 93580 + }, + { + "epoch": 1.5022713045153213, + "grad_norm": 0.7513346076011658, + "learning_rate": 7.261584537556781e-06, + "loss": 0.6593, + "step": 93590 + }, + { + "epoch": 1.5024318207354854, + "grad_norm": 1.0148073434829712, + "learning_rate": 7.257142540370285e-06, + "loss": 0.8245, + "step": 93600 + }, + { + "epoch": 1.5025923369556495, + "grad_norm": 0.7297426462173462, + "learning_rate": 7.2527016715253746e-06, + "loss": 0.7684, + "step": 93610 + }, + { + "epoch": 1.5027528531758134, + "grad_norm": 1.5049430131912231, + "learning_rate": 7.248261931304473e-06, + "loss": 0.8115, + "step": 93620 + }, + { + "epoch": 1.5029133693959773, + "grad_norm": 1.0038906335830688, + "learning_rate": 7.24382331998992e-06, + "loss": 0.6856, + "step": 93630 + }, + { + "epoch": 1.5030738856161414, + "grad_norm": 0.7474145293235779, + "learning_rate": 7.239385837863985e-06, + "loss": 0.6937, + "step": 93640 + }, + { + "epoch": 1.5032344018363055, + "grad_norm": 1.414183259010315, + "learning_rate": 7.234949485208867e-06, + "loss": 0.7845, + "step": 93650 + }, + { + "epoch": 1.5033949180564696, + "grad_norm": 0.9407339692115784, + "learning_rate": 7.230514262306689e-06, + "loss": 0.8498, + "step": 93660 + }, + { + "epoch": 1.5035554342766337, + "grad_norm": 0.6930261850357056, + "learning_rate": 7.22608016943952e-06, + "loss": 0.6721, + "step": 93670 + }, + { + "epoch": 1.5037159504967978, + "grad_norm": 1.176837682723999, + "learning_rate": 7.22164720688932e-06, + "loss": 0.6406, + "step": 93680 + }, + { + "epoch": 1.5038764667169617, + "grad_norm": 0.8986731767654419, + "learning_rate": 7.217215374938008e-06, + "loss": 0.6993, + "step": 93690 + }, + { + "epoch": 1.5040369829371258, + "grad_norm": 0.7944579124450684, + "learning_rate": 7.212784673867426e-06, + "loss": 0.7169, + "step": 93700 + }, + { + "epoch": 1.5041974991572897, + "grad_norm": 0.7468014359474182, + "learning_rate": 7.208355103959338e-06, + "loss": 0.742, + "step": 93710 + }, + { + "epoch": 1.5043580153774538, + "grad_norm": 1.1677024364471436, + "learning_rate": 7.203926665495437e-06, + "loss": 0.7859, + "step": 93720 + }, + { + "epoch": 1.504518531597618, + "grad_norm": 2.2621653079986572, + "learning_rate": 7.199499358757358e-06, + "loss": 0.7109, + "step": 93730 + }, + { + "epoch": 1.504679047817782, + "grad_norm": 1.2533336877822876, + "learning_rate": 7.1950731840266325e-06, + "loss": 0.6754, + "step": 93740 + }, + { + "epoch": 1.5048395640379462, + "grad_norm": 1.0001481771469116, + "learning_rate": 7.1906481415847495e-06, + "loss": 0.6578, + "step": 93750 + }, + { + "epoch": 1.50500008025811, + "grad_norm": 0.9269145727157593, + "learning_rate": 7.186224231713112e-06, + "loss": 0.6108, + "step": 93760 + }, + { + "epoch": 1.5051605964782742, + "grad_norm": 1.069209337234497, + "learning_rate": 7.181801454693058e-06, + "loss": 0.6423, + "step": 93770 + }, + { + "epoch": 1.505321112698438, + "grad_norm": 0.6730810403823853, + "learning_rate": 7.177379810805851e-06, + "loss": 0.8336, + "step": 93780 + }, + { + "epoch": 1.5054816289186022, + "grad_norm": 1.1676220893859863, + "learning_rate": 7.172959300332685e-06, + "loss": 0.6861, + "step": 93790 + }, + { + "epoch": 1.5056421451387663, + "grad_norm": 1.0335954427719116, + "learning_rate": 7.168539923554668e-06, + "loss": 0.7309, + "step": 93800 + }, + { + "epoch": 1.5058026613589304, + "grad_norm": 1.2717918157577515, + "learning_rate": 7.16412168075285e-06, + "loss": 0.6638, + "step": 93810 + }, + { + "epoch": 1.5059631775790945, + "grad_norm": 0.9834570288658142, + "learning_rate": 7.159704572208206e-06, + "loss": 0.5941, + "step": 93820 + }, + { + "epoch": 1.5061236937992584, + "grad_norm": 1.004155158996582, + "learning_rate": 7.155288598201642e-06, + "loss": 0.6041, + "step": 93830 + }, + { + "epoch": 1.5062842100194225, + "grad_norm": 0.8058645129203796, + "learning_rate": 7.150873759013984e-06, + "loss": 0.786, + "step": 93840 + }, + { + "epoch": 1.5064447262395864, + "grad_norm": 1.300429105758667, + "learning_rate": 7.1464600549259995e-06, + "loss": 0.7863, + "step": 93850 + }, + { + "epoch": 1.5066052424597505, + "grad_norm": 1.0221272706985474, + "learning_rate": 7.142047486218359e-06, + "loss": 0.6746, + "step": 93860 + }, + { + "epoch": 1.5067657586799146, + "grad_norm": 0.7472708821296692, + "learning_rate": 7.137636053171681e-06, + "loss": 0.7244, + "step": 93870 + }, + { + "epoch": 1.5069262749000787, + "grad_norm": 1.04812753200531, + "learning_rate": 7.133225756066511e-06, + "loss": 0.6989, + "step": 93880 + }, + { + "epoch": 1.5070867911202428, + "grad_norm": 0.940962016582489, + "learning_rate": 7.128816595183313e-06, + "loss": 0.6971, + "step": 93890 + }, + { + "epoch": 1.507247307340407, + "grad_norm": 1.729992389678955, + "learning_rate": 7.124408570802488e-06, + "loss": 0.6875, + "step": 93900 + }, + { + "epoch": 1.5074078235605708, + "grad_norm": 0.8357125520706177, + "learning_rate": 7.1200016832043584e-06, + "loss": 0.7104, + "step": 93910 + }, + { + "epoch": 1.5075683397807347, + "grad_norm": 1.3860191106796265, + "learning_rate": 7.115595932669175e-06, + "loss": 0.7095, + "step": 93920 + }, + { + "epoch": 1.5077288560008988, + "grad_norm": 1.2517054080963135, + "learning_rate": 7.111191319477123e-06, + "loss": 0.6548, + "step": 93930 + }, + { + "epoch": 1.507889372221063, + "grad_norm": 0.9785143136978149, + "learning_rate": 7.1067878439083026e-06, + "loss": 0.84, + "step": 93940 + }, + { + "epoch": 1.508049888441227, + "grad_norm": 0.8690537214279175, + "learning_rate": 7.102385506242756e-06, + "loss": 0.7045, + "step": 93950 + }, + { + "epoch": 1.5082104046613911, + "grad_norm": 1.4493356943130493, + "learning_rate": 7.097984306760441e-06, + "loss": 0.6313, + "step": 93960 + }, + { + "epoch": 1.5083709208815552, + "grad_norm": 1.1621726751327515, + "learning_rate": 7.0935842457412585e-06, + "loss": 0.6644, + "step": 93970 + }, + { + "epoch": 1.508531437101719, + "grad_norm": 0.9502441883087158, + "learning_rate": 7.0891853234650084e-06, + "loss": 0.6138, + "step": 93980 + }, + { + "epoch": 1.5086919533218832, + "grad_norm": 1.0141983032226562, + "learning_rate": 7.084787540211449e-06, + "loss": 0.7154, + "step": 93990 + }, + { + "epoch": 1.508852469542047, + "grad_norm": 1.913602590560913, + "learning_rate": 7.08039089626025e-06, + "loss": 0.5368, + "step": 94000 + }, + { + "epoch": 1.5090129857622112, + "grad_norm": 1.2442988157272339, + "learning_rate": 7.0759953918910105e-06, + "loss": 0.5523, + "step": 94010 + }, + { + "epoch": 1.5091735019823753, + "grad_norm": 1.0551707744598389, + "learning_rate": 7.071601027383262e-06, + "loss": 0.5621, + "step": 94020 + }, + { + "epoch": 1.5093340182025394, + "grad_norm": 0.5762712359428406, + "learning_rate": 7.06720780301646e-06, + "loss": 0.7286, + "step": 94030 + }, + { + "epoch": 1.5094945344227035, + "grad_norm": 0.9676625728607178, + "learning_rate": 7.062815719069993e-06, + "loss": 0.8245, + "step": 94040 + }, + { + "epoch": 1.5096550506428674, + "grad_norm": 1.1533796787261963, + "learning_rate": 7.058424775823158e-06, + "loss": 0.7966, + "step": 94050 + }, + { + "epoch": 1.5098155668630315, + "grad_norm": 1.0904985666275024, + "learning_rate": 7.0540349735552016e-06, + "loss": 0.6584, + "step": 94060 + }, + { + "epoch": 1.5099760830831954, + "grad_norm": 1.2967747449874878, + "learning_rate": 7.049646312545288e-06, + "loss": 0.7652, + "step": 94070 + }, + { + "epoch": 1.5101365993033595, + "grad_norm": 0.9130401611328125, + "learning_rate": 7.045258793072512e-06, + "loss": 0.6562, + "step": 94080 + }, + { + "epoch": 1.5102971155235236, + "grad_norm": 0.9921247363090515, + "learning_rate": 7.040872415415892e-06, + "loss": 0.79, + "step": 94090 + }, + { + "epoch": 1.5104576317436877, + "grad_norm": 0.9221395254135132, + "learning_rate": 7.0364871798543865e-06, + "loss": 0.7498, + "step": 94100 + }, + { + "epoch": 1.5106181479638519, + "grad_norm": 0.970058798789978, + "learning_rate": 7.0321030866668525e-06, + "loss": 0.8107, + "step": 94110 + }, + { + "epoch": 1.5107786641840157, + "grad_norm": 2.356335401535034, + "learning_rate": 7.027720136132101e-06, + "loss": 0.7468, + "step": 94120 + }, + { + "epoch": 1.5109391804041798, + "grad_norm": 1.3435343503952026, + "learning_rate": 7.023338328528864e-06, + "loss": 0.6568, + "step": 94130 + }, + { + "epoch": 1.5110996966243437, + "grad_norm": 0.8377407789230347, + "learning_rate": 7.018957664135794e-06, + "loss": 0.6608, + "step": 94140 + }, + { + "epoch": 1.5112602128445078, + "grad_norm": 1.088276982307434, + "learning_rate": 7.014578143231482e-06, + "loss": 0.6775, + "step": 94150 + }, + { + "epoch": 1.511420729064672, + "grad_norm": 0.9077087640762329, + "learning_rate": 7.010199766094433e-06, + "loss": 0.8224, + "step": 94160 + }, + { + "epoch": 1.511581245284836, + "grad_norm": 1.3928008079528809, + "learning_rate": 7.0058225330031e-06, + "loss": 0.6813, + "step": 94170 + }, + { + "epoch": 1.5117417615050002, + "grad_norm": 1.0596644878387451, + "learning_rate": 7.001446444235829e-06, + "loss": 0.6664, + "step": 94180 + }, + { + "epoch": 1.5119022777251643, + "grad_norm": 1.5277626514434814, + "learning_rate": 6.997071500070926e-06, + "loss": 0.6243, + "step": 94190 + }, + { + "epoch": 1.5120627939453282, + "grad_norm": 1.0388435125350952, + "learning_rate": 6.992697700786607e-06, + "loss": 0.7359, + "step": 94200 + }, + { + "epoch": 1.5122233101654923, + "grad_norm": 1.0975044965744019, + "learning_rate": 6.98832504666102e-06, + "loss": 0.6366, + "step": 94210 + }, + { + "epoch": 1.5123838263856562, + "grad_norm": 0.9619945287704468, + "learning_rate": 6.983953537972246e-06, + "loss": 0.66, + "step": 94220 + }, + { + "epoch": 1.5125443426058203, + "grad_norm": 0.8946345448493958, + "learning_rate": 6.979583174998291e-06, + "loss": 0.6578, + "step": 94230 + }, + { + "epoch": 1.5127048588259844, + "grad_norm": 1.3158599138259888, + "learning_rate": 6.975213958017068e-06, + "loss": 0.7402, + "step": 94240 + }, + { + "epoch": 1.5128653750461485, + "grad_norm": 1.5131139755249023, + "learning_rate": 6.970845887306443e-06, + "loss": 0.7239, + "step": 94250 + }, + { + "epoch": 1.5130258912663126, + "grad_norm": 0.7926183938980103, + "learning_rate": 6.966478963144199e-06, + "loss": 0.7258, + "step": 94260 + }, + { + "epoch": 1.5131864074864765, + "grad_norm": 0.8730524182319641, + "learning_rate": 6.9621131858080445e-06, + "loss": 0.8697, + "step": 94270 + }, + { + "epoch": 1.5133469237066406, + "grad_norm": 1.0796687602996826, + "learning_rate": 6.9577485555756245e-06, + "loss": 0.8465, + "step": 94280 + }, + { + "epoch": 1.5135074399268045, + "grad_norm": 1.20193350315094, + "learning_rate": 6.953385072724502e-06, + "loss": 0.7189, + "step": 94290 + }, + { + "epoch": 1.5136679561469686, + "grad_norm": 1.0594170093536377, + "learning_rate": 6.9490227375321625e-06, + "loss": 0.7313, + "step": 94300 + }, + { + "epoch": 1.5138284723671327, + "grad_norm": 1.4302589893341064, + "learning_rate": 6.944661550276027e-06, + "loss": 0.6347, + "step": 94310 + }, + { + "epoch": 1.5139889885872968, + "grad_norm": 0.8331817388534546, + "learning_rate": 6.940301511233441e-06, + "loss": 0.6664, + "step": 94320 + }, + { + "epoch": 1.514149504807461, + "grad_norm": 2.199718952178955, + "learning_rate": 6.935942620681682e-06, + "loss": 0.771, + "step": 94330 + }, + { + "epoch": 1.5143100210276248, + "grad_norm": 1.5920196771621704, + "learning_rate": 6.931584878897946e-06, + "loss": 0.7628, + "step": 94340 + }, + { + "epoch": 1.514470537247789, + "grad_norm": 0.8487111330032349, + "learning_rate": 6.927228286159368e-06, + "loss": 0.6601, + "step": 94350 + }, + { + "epoch": 1.5146310534679528, + "grad_norm": 0.9386990666389465, + "learning_rate": 6.922872842742987e-06, + "loss": 0.6632, + "step": 94360 + }, + { + "epoch": 1.514791569688117, + "grad_norm": 1.1088014841079712, + "learning_rate": 6.91851854892579e-06, + "loss": 0.7439, + "step": 94370 + }, + { + "epoch": 1.514952085908281, + "grad_norm": 0.8618733286857605, + "learning_rate": 6.914165404984688e-06, + "loss": 0.6556, + "step": 94380 + }, + { + "epoch": 1.5151126021284451, + "grad_norm": 1.2267581224441528, + "learning_rate": 6.909813411196512e-06, + "loss": 0.7401, + "step": 94390 + }, + { + "epoch": 1.5152731183486092, + "grad_norm": 0.8263702988624573, + "learning_rate": 6.905462567838025e-06, + "loss": 0.6408, + "step": 94400 + }, + { + "epoch": 1.5154336345687733, + "grad_norm": 1.3856083154678345, + "learning_rate": 6.901112875185922e-06, + "loss": 0.7582, + "step": 94410 + }, + { + "epoch": 1.5155941507889372, + "grad_norm": 1.3850421905517578, + "learning_rate": 6.8967643335168035e-06, + "loss": 0.6109, + "step": 94420 + }, + { + "epoch": 1.515754667009101, + "grad_norm": 0.998757004737854, + "learning_rate": 6.892416943107216e-06, + "loss": 0.6614, + "step": 94430 + }, + { + "epoch": 1.5159151832292652, + "grad_norm": 6.3705549240112305, + "learning_rate": 6.888070704233632e-06, + "loss": 0.6048, + "step": 94440 + }, + { + "epoch": 1.5160756994494293, + "grad_norm": 1.216201663017273, + "learning_rate": 6.883725617172443e-06, + "loss": 0.7422, + "step": 94450 + }, + { + "epoch": 1.5162362156695934, + "grad_norm": 1.3258029222488403, + "learning_rate": 6.879381682199976e-06, + "loss": 0.7039, + "step": 94460 + }, + { + "epoch": 1.5163967318897575, + "grad_norm": 0.916867196559906, + "learning_rate": 6.875038899592476e-06, + "loss": 0.7836, + "step": 94470 + }, + { + "epoch": 1.5165572481099217, + "grad_norm": 1.1915667057037354, + "learning_rate": 6.870697269626125e-06, + "loss": 0.7212, + "step": 94480 + }, + { + "epoch": 1.5167177643300855, + "grad_norm": 1.0247092247009277, + "learning_rate": 6.866356792577014e-06, + "loss": 0.629, + "step": 94490 + }, + { + "epoch": 1.5168782805502496, + "grad_norm": 0.7311623692512512, + "learning_rate": 6.8620174687211765e-06, + "loss": 0.7507, + "step": 94500 + }, + { + "epoch": 1.5170387967704135, + "grad_norm": 0.8896577954292297, + "learning_rate": 6.857679298334568e-06, + "loss": 0.7554, + "step": 94510 + }, + { + "epoch": 1.5171993129905776, + "grad_norm": 0.7422189116477966, + "learning_rate": 6.853342281693073e-06, + "loss": 0.6727, + "step": 94520 + }, + { + "epoch": 1.5173598292107418, + "grad_norm": 1.0071938037872314, + "learning_rate": 6.849006419072498e-06, + "loss": 0.722, + "step": 94530 + }, + { + "epoch": 1.5175203454309059, + "grad_norm": 1.2780505418777466, + "learning_rate": 6.844671710748579e-06, + "loss": 0.7646, + "step": 94540 + }, + { + "epoch": 1.51768086165107, + "grad_norm": 1.3160794973373413, + "learning_rate": 6.840338156996978e-06, + "loss": 0.8242, + "step": 94550 + }, + { + "epoch": 1.5178413778712339, + "grad_norm": 0.8219822645187378, + "learning_rate": 6.836005758093283e-06, + "loss": 0.6492, + "step": 94560 + }, + { + "epoch": 1.518001894091398, + "grad_norm": 1.1880205869674683, + "learning_rate": 6.831674514313013e-06, + "loss": 0.6379, + "step": 94570 + }, + { + "epoch": 1.5181624103115618, + "grad_norm": 0.692940354347229, + "learning_rate": 6.827344425931606e-06, + "loss": 0.5777, + "step": 94580 + }, + { + "epoch": 1.518322926531726, + "grad_norm": 0.7831379771232605, + "learning_rate": 6.823015493224436e-06, + "loss": 0.7033, + "step": 94590 + }, + { + "epoch": 1.51848344275189, + "grad_norm": 1.2214280366897583, + "learning_rate": 6.8186877164667875e-06, + "loss": 0.7437, + "step": 94600 + }, + { + "epoch": 1.5186439589720542, + "grad_norm": 1.0026614665985107, + "learning_rate": 6.8143610959338855e-06, + "loss": 0.6902, + "step": 94610 + }, + { + "epoch": 1.5188044751922183, + "grad_norm": 0.940542459487915, + "learning_rate": 6.81003563190088e-06, + "loss": 0.7991, + "step": 94620 + }, + { + "epoch": 1.5189649914123822, + "grad_norm": 1.1774017810821533, + "learning_rate": 6.805711324642841e-06, + "loss": 0.6861, + "step": 94630 + }, + { + "epoch": 1.5191255076325463, + "grad_norm": 0.853158712387085, + "learning_rate": 6.8013881744347755e-06, + "loss": 0.6586, + "step": 94640 + }, + { + "epoch": 1.5192860238527102, + "grad_norm": 1.7565158605575562, + "learning_rate": 6.797066181551606e-06, + "loss": 0.7231, + "step": 94650 + }, + { + "epoch": 1.5194465400728743, + "grad_norm": 1.1642566919326782, + "learning_rate": 6.792745346268192e-06, + "loss": 0.7142, + "step": 94660 + }, + { + "epoch": 1.5196070562930384, + "grad_norm": 1.350411295890808, + "learning_rate": 6.788425668859305e-06, + "loss": 0.6, + "step": 94670 + }, + { + "epoch": 1.5197675725132025, + "grad_norm": 1.187724232673645, + "learning_rate": 6.784107149599653e-06, + "loss": 0.7181, + "step": 94680 + }, + { + "epoch": 1.5199280887333666, + "grad_norm": 1.087381362915039, + "learning_rate": 6.779789788763868e-06, + "loss": 0.7826, + "step": 94690 + }, + { + "epoch": 1.5200886049535307, + "grad_norm": 1.2697843313217163, + "learning_rate": 6.775473586626513e-06, + "loss": 0.7681, + "step": 94700 + }, + { + "epoch": 1.5202491211736946, + "grad_norm": 1.2438044548034668, + "learning_rate": 6.77115854346207e-06, + "loss": 0.6536, + "step": 94710 + }, + { + "epoch": 1.5204096373938587, + "grad_norm": 0.8060094118118286, + "learning_rate": 6.766844659544949e-06, + "loss": 0.5576, + "step": 94720 + }, + { + "epoch": 1.5205701536140226, + "grad_norm": 1.4293670654296875, + "learning_rate": 6.762531935149499e-06, + "loss": 0.7328, + "step": 94730 + }, + { + "epoch": 1.5207306698341867, + "grad_norm": 1.198287010192871, + "learning_rate": 6.75822037054997e-06, + "loss": 0.6854, + "step": 94740 + }, + { + "epoch": 1.5208911860543508, + "grad_norm": 0.6638859510421753, + "learning_rate": 6.753909966020552e-06, + "loss": 0.7707, + "step": 94750 + }, + { + "epoch": 1.521051702274515, + "grad_norm": 1.4339216947555542, + "learning_rate": 6.749600721835369e-06, + "loss": 0.7739, + "step": 94760 + }, + { + "epoch": 1.521212218494679, + "grad_norm": 1.1399478912353516, + "learning_rate": 6.745292638268463e-06, + "loss": 0.7333, + "step": 94770 + }, + { + "epoch": 1.521372734714843, + "grad_norm": 1.0600394010543823, + "learning_rate": 6.7409857155937975e-06, + "loss": 0.7021, + "step": 94780 + }, + { + "epoch": 1.521533250935007, + "grad_norm": 1.0748134851455688, + "learning_rate": 6.736679954085281e-06, + "loss": 0.7112, + "step": 94790 + }, + { + "epoch": 1.521693767155171, + "grad_norm": 1.3786882162094116, + "learning_rate": 6.732375354016718e-06, + "loss": 0.6822, + "step": 94800 + }, + { + "epoch": 1.521854283375335, + "grad_norm": 0.8967278599739075, + "learning_rate": 6.7280719156618596e-06, + "loss": 0.7928, + "step": 94810 + }, + { + "epoch": 1.5220147995954991, + "grad_norm": 1.1697502136230469, + "learning_rate": 6.723769639294383e-06, + "loss": 0.8686, + "step": 94820 + }, + { + "epoch": 1.5221753158156632, + "grad_norm": 1.088137149810791, + "learning_rate": 6.719468525187889e-06, + "loss": 0.77, + "step": 94830 + }, + { + "epoch": 1.5223358320358273, + "grad_norm": 0.8632739782333374, + "learning_rate": 6.715168573615896e-06, + "loss": 0.7335, + "step": 94840 + }, + { + "epoch": 1.5224963482559912, + "grad_norm": 1.438153624534607, + "learning_rate": 6.710869784851873e-06, + "loss": 0.7262, + "step": 94850 + }, + { + "epoch": 1.5226568644761553, + "grad_norm": 2.5085701942443848, + "learning_rate": 6.706572159169175e-06, + "loss": 0.6348, + "step": 94860 + }, + { + "epoch": 1.5228173806963192, + "grad_norm": 1.0046192407608032, + "learning_rate": 6.702275696841117e-06, + "loss": 0.6921, + "step": 94870 + }, + { + "epoch": 1.5229778969164833, + "grad_norm": 0.967911958694458, + "learning_rate": 6.69798039814093e-06, + "loss": 0.7576, + "step": 94880 + }, + { + "epoch": 1.5231384131366474, + "grad_norm": 0.9876746535301208, + "learning_rate": 6.693686263341764e-06, + "loss": 0.6601, + "step": 94890 + }, + { + "epoch": 1.5232989293568115, + "grad_norm": 0.9237608313560486, + "learning_rate": 6.689393292716706e-06, + "loss": 0.694, + "step": 94900 + }, + { + "epoch": 1.5234594455769757, + "grad_norm": 1.7394388914108276, + "learning_rate": 6.68510148653877e-06, + "loss": 0.5389, + "step": 94910 + }, + { + "epoch": 1.5236199617971398, + "grad_norm": 1.1262009143829346, + "learning_rate": 6.680810845080876e-06, + "loss": 0.5511, + "step": 94920 + }, + { + "epoch": 1.5237804780173037, + "grad_norm": 0.9470945596694946, + "learning_rate": 6.676521368615887e-06, + "loss": 0.7655, + "step": 94930 + }, + { + "epoch": 1.5239409942374675, + "grad_norm": 0.8992155194282532, + "learning_rate": 6.6722330574165905e-06, + "loss": 0.6109, + "step": 94940 + }, + { + "epoch": 1.5241015104576316, + "grad_norm": 0.8325353264808655, + "learning_rate": 6.667945911755702e-06, + "loss": 0.6581, + "step": 94950 + }, + { + "epoch": 1.5242620266777958, + "grad_norm": 1.0010803937911987, + "learning_rate": 6.6636599319058525e-06, + "loss": 0.6533, + "step": 94960 + }, + { + "epoch": 1.5244225428979599, + "grad_norm": 0.8997653126716614, + "learning_rate": 6.659375118139608e-06, + "loss": 0.6771, + "step": 94970 + }, + { + "epoch": 1.524583059118124, + "grad_norm": 0.7771396636962891, + "learning_rate": 6.655091470729466e-06, + "loss": 0.7426, + "step": 94980 + }, + { + "epoch": 1.524743575338288, + "grad_norm": 0.8104279041290283, + "learning_rate": 6.650808989947824e-06, + "loss": 0.7506, + "step": 94990 + }, + { + "epoch": 1.524904091558452, + "grad_norm": 0.9993910193443298, + "learning_rate": 6.646527676067032e-06, + "loss": 0.7292, + "step": 95000 + }, + { + "epoch": 1.525064607778616, + "grad_norm": 0.6212171912193298, + "learning_rate": 6.642247529359358e-06, + "loss": 0.7187, + "step": 95010 + }, + { + "epoch": 1.52522512399878, + "grad_norm": 0.8726744651794434, + "learning_rate": 6.63796855009699e-06, + "loss": 0.7634, + "step": 95020 + }, + { + "epoch": 1.525385640218944, + "grad_norm": 0.9832454919815063, + "learning_rate": 6.633690738552048e-06, + "loss": 0.7242, + "step": 95030 + }, + { + "epoch": 1.5255461564391082, + "grad_norm": 1.263738989830017, + "learning_rate": 6.6294140949965835e-06, + "loss": 0.737, + "step": 95040 + }, + { + "epoch": 1.5257066726592723, + "grad_norm": 0.6863707900047302, + "learning_rate": 6.62513861970255e-06, + "loss": 0.757, + "step": 95050 + }, + { + "epoch": 1.5258671888794364, + "grad_norm": 0.9189426898956299, + "learning_rate": 6.620864312941852e-06, + "loss": 0.767, + "step": 95060 + }, + { + "epoch": 1.5260277050996003, + "grad_norm": 0.9342272281646729, + "learning_rate": 6.616591174986309e-06, + "loss": 0.727, + "step": 95070 + }, + { + "epoch": 1.5261882213197644, + "grad_norm": 1.4222513437271118, + "learning_rate": 6.612319206107667e-06, + "loss": 0.7857, + "step": 95080 + }, + { + "epoch": 1.5263487375399283, + "grad_norm": 0.6930843591690063, + "learning_rate": 6.608048406577599e-06, + "loss": 0.7948, + "step": 95090 + }, + { + "epoch": 1.5265092537600924, + "grad_norm": 0.826601505279541, + "learning_rate": 6.6037787766677104e-06, + "loss": 0.6615, + "step": 95100 + }, + { + "epoch": 1.5266697699802565, + "grad_norm": 0.8926905393600464, + "learning_rate": 6.599510316649507e-06, + "loss": 0.7434, + "step": 95110 + }, + { + "epoch": 1.5268302862004206, + "grad_norm": 0.707598865032196, + "learning_rate": 6.595243026794451e-06, + "loss": 0.6319, + "step": 95120 + }, + { + "epoch": 1.5269908024205847, + "grad_norm": 0.9380571246147156, + "learning_rate": 6.590976907373911e-06, + "loss": 0.6663, + "step": 95130 + }, + { + "epoch": 1.5271513186407486, + "grad_norm": 1.156502366065979, + "learning_rate": 6.586711958659192e-06, + "loss": 0.7365, + "step": 95140 + }, + { + "epoch": 1.5273118348609127, + "grad_norm": 0.987929105758667, + "learning_rate": 6.582448180921516e-06, + "loss": 0.8203, + "step": 95150 + }, + { + "epoch": 1.5274723510810766, + "grad_norm": 1.143363118171692, + "learning_rate": 6.5781855744320355e-06, + "loss": 0.6743, + "step": 95160 + }, + { + "epoch": 1.5276328673012407, + "grad_norm": 2.34745192527771, + "learning_rate": 6.573924139461826e-06, + "loss": 0.7741, + "step": 95170 + }, + { + "epoch": 1.5277933835214048, + "grad_norm": 1.7272083759307861, + "learning_rate": 6.569663876281892e-06, + "loss": 0.6997, + "step": 95180 + }, + { + "epoch": 1.527953899741569, + "grad_norm": 1.639284610748291, + "learning_rate": 6.565404785163162e-06, + "loss": 0.6355, + "step": 95190 + }, + { + "epoch": 1.528114415961733, + "grad_norm": 0.7415310740470886, + "learning_rate": 6.5611468663764885e-06, + "loss": 0.7391, + "step": 95200 + }, + { + "epoch": 1.5282749321818971, + "grad_norm": 1.7338364124298096, + "learning_rate": 6.556890120192646e-06, + "loss": 0.7133, + "step": 95210 + }, + { + "epoch": 1.528435448402061, + "grad_norm": 1.2570419311523438, + "learning_rate": 6.5526345468823495e-06, + "loss": 0.7234, + "step": 95220 + }, + { + "epoch": 1.528595964622225, + "grad_norm": 0.8877266645431519, + "learning_rate": 6.548380146716215e-06, + "loss": 0.6021, + "step": 95230 + }, + { + "epoch": 1.528756480842389, + "grad_norm": 0.8257899284362793, + "learning_rate": 6.544126919964802e-06, + "loss": 0.7518, + "step": 95240 + }, + { + "epoch": 1.5289169970625531, + "grad_norm": 0.8386667966842651, + "learning_rate": 6.5398748668985935e-06, + "loss": 0.712, + "step": 95250 + }, + { + "epoch": 1.5290775132827172, + "grad_norm": 1.2944273948669434, + "learning_rate": 6.535623987787992e-06, + "loss": 0.7151, + "step": 95260 + }, + { + "epoch": 1.5292380295028813, + "grad_norm": 1.1424680948257446, + "learning_rate": 6.531374282903332e-06, + "loss": 0.6867, + "step": 95270 + }, + { + "epoch": 1.5293985457230455, + "grad_norm": 1.3693952560424805, + "learning_rate": 6.527125752514865e-06, + "loss": 0.7934, + "step": 95280 + }, + { + "epoch": 1.5295590619432093, + "grad_norm": 0.9117676019668579, + "learning_rate": 6.522878396892784e-06, + "loss": 0.6522, + "step": 95290 + }, + { + "epoch": 1.5297195781633735, + "grad_norm": 0.9219553470611572, + "learning_rate": 6.5186322163071804e-06, + "loss": 0.6727, + "step": 95300 + }, + { + "epoch": 1.5298800943835373, + "grad_norm": 1.0022655725479126, + "learning_rate": 6.514387211028092e-06, + "loss": 0.7351, + "step": 95310 + }, + { + "epoch": 1.5300406106037014, + "grad_norm": 0.8385307192802429, + "learning_rate": 6.510143381325479e-06, + "loss": 0.6983, + "step": 95320 + }, + { + "epoch": 1.5302011268238656, + "grad_norm": 0.9860352873802185, + "learning_rate": 6.505900727469222e-06, + "loss": 0.7499, + "step": 95330 + }, + { + "epoch": 1.5303616430440297, + "grad_norm": 1.0047966241836548, + "learning_rate": 6.5016592497291275e-06, + "loss": 0.6482, + "step": 95340 + }, + { + "epoch": 1.5305221592641938, + "grad_norm": 1.0846621990203857, + "learning_rate": 6.497418948374942e-06, + "loss": 0.7027, + "step": 95350 + }, + { + "epoch": 1.5306826754843577, + "grad_norm": 0.8077511191368103, + "learning_rate": 6.493179823676302e-06, + "loss": 0.6685, + "step": 95360 + }, + { + "epoch": 1.5308431917045218, + "grad_norm": 1.3233903646469116, + "learning_rate": 6.488941875902802e-06, + "loss": 0.6095, + "step": 95370 + }, + { + "epoch": 1.5310037079246857, + "grad_norm": 2.800670862197876, + "learning_rate": 6.484705105323949e-06, + "loss": 0.6954, + "step": 95380 + }, + { + "epoch": 1.5311642241448498, + "grad_norm": 0.7440524101257324, + "learning_rate": 6.480469512209178e-06, + "loss": 0.6417, + "step": 95390 + }, + { + "epoch": 1.5313247403650139, + "grad_norm": 1.1320757865905762, + "learning_rate": 6.476235096827845e-06, + "loss": 0.6155, + "step": 95400 + }, + { + "epoch": 1.531485256585178, + "grad_norm": 0.7889968156814575, + "learning_rate": 6.472001859449248e-06, + "loss": 0.7072, + "step": 95410 + }, + { + "epoch": 1.531645772805342, + "grad_norm": 1.1891189813613892, + "learning_rate": 6.467769800342574e-06, + "loss": 0.594, + "step": 95420 + }, + { + "epoch": 1.531806289025506, + "grad_norm": 0.7172242999076843, + "learning_rate": 6.463538919776968e-06, + "loss": 0.758, + "step": 95430 + }, + { + "epoch": 1.53196680524567, + "grad_norm": 0.718910276889801, + "learning_rate": 6.459309218021492e-06, + "loss": 0.7707, + "step": 95440 + }, + { + "epoch": 1.532127321465834, + "grad_norm": 1.1767321825027466, + "learning_rate": 6.455080695345123e-06, + "loss": 0.7269, + "step": 95450 + }, + { + "epoch": 1.532287837685998, + "grad_norm": 0.8964455723762512, + "learning_rate": 6.450853352016775e-06, + "loss": 0.603, + "step": 95460 + }, + { + "epoch": 1.5324483539061622, + "grad_norm": 1.1583054065704346, + "learning_rate": 6.44662718830529e-06, + "loss": 0.6833, + "step": 95470 + }, + { + "epoch": 1.5326088701263263, + "grad_norm": 1.282348871231079, + "learning_rate": 6.442402204479412e-06, + "loss": 0.7106, + "step": 95480 + }, + { + "epoch": 1.5327693863464904, + "grad_norm": 1.1624956130981445, + "learning_rate": 6.438178400807829e-06, + "loss": 0.7831, + "step": 95490 + }, + { + "epoch": 1.5329299025666545, + "grad_norm": 2.2769265174865723, + "learning_rate": 6.433955777559153e-06, + "loss": 0.6261, + "step": 95500 + }, + { + "epoch": 1.5330904187868184, + "grad_norm": 0.7377486824989319, + "learning_rate": 6.429734335001919e-06, + "loss": 0.6399, + "step": 95510 + }, + { + "epoch": 1.5332509350069825, + "grad_norm": 0.7174076437950134, + "learning_rate": 6.425514073404585e-06, + "loss": 0.6304, + "step": 95520 + }, + { + "epoch": 1.5334114512271464, + "grad_norm": 0.9330926537513733, + "learning_rate": 6.4212949930355345e-06, + "loss": 0.787, + "step": 95530 + }, + { + "epoch": 1.5335719674473105, + "grad_norm": 1.1083508729934692, + "learning_rate": 6.417077094163084e-06, + "loss": 0.6777, + "step": 95540 + }, + { + "epoch": 1.5337324836674746, + "grad_norm": 0.7821795344352722, + "learning_rate": 6.412860377055452e-06, + "loss": 0.602, + "step": 95550 + }, + { + "epoch": 1.5338929998876387, + "grad_norm": 0.7994478344917297, + "learning_rate": 6.408644841980804e-06, + "loss": 0.6326, + "step": 95560 + }, + { + "epoch": 1.5340535161078028, + "grad_norm": 0.9544768333435059, + "learning_rate": 6.404430489207225e-06, + "loss": 0.7532, + "step": 95570 + }, + { + "epoch": 1.5342140323279667, + "grad_norm": 0.9657972455024719, + "learning_rate": 6.400217319002724e-06, + "loss": 0.6895, + "step": 95580 + }, + { + "epoch": 1.5343745485481308, + "grad_norm": 1.1149072647094727, + "learning_rate": 6.39600533163523e-06, + "loss": 0.7481, + "step": 95590 + }, + { + "epoch": 1.5345350647682947, + "grad_norm": 1.6058775186538696, + "learning_rate": 6.391794527372608e-06, + "loss": 0.6997, + "step": 95600 + }, + { + "epoch": 1.5346955809884588, + "grad_norm": 0.9543459415435791, + "learning_rate": 6.38758490648263e-06, + "loss": 0.6256, + "step": 95610 + }, + { + "epoch": 1.534856097208623, + "grad_norm": 0.9685391187667847, + "learning_rate": 6.3833764692330076e-06, + "loss": 0.7201, + "step": 95620 + }, + { + "epoch": 1.535016613428787, + "grad_norm": 0.4679991602897644, + "learning_rate": 6.379169215891373e-06, + "loss": 0.6968, + "step": 95630 + }, + { + "epoch": 1.5351771296489511, + "grad_norm": 0.9669966101646423, + "learning_rate": 6.374963146725282e-06, + "loss": 0.6953, + "step": 95640 + }, + { + "epoch": 1.535337645869115, + "grad_norm": 1.115641713142395, + "learning_rate": 6.370758262002219e-06, + "loss": 0.6605, + "step": 95650 + }, + { + "epoch": 1.5354981620892791, + "grad_norm": 0.7107287645339966, + "learning_rate": 6.366554561989594e-06, + "loss": 0.6923, + "step": 95660 + }, + { + "epoch": 1.535658678309443, + "grad_norm": 1.0269746780395508, + "learning_rate": 6.362352046954722e-06, + "loss": 0.7378, + "step": 95670 + }, + { + "epoch": 1.5358191945296071, + "grad_norm": 6.560839653015137, + "learning_rate": 6.3581507171648735e-06, + "loss": 0.7021, + "step": 95680 + }, + { + "epoch": 1.5359797107497712, + "grad_norm": 1.1319043636322021, + "learning_rate": 6.353950572887218e-06, + "loss": 0.6122, + "step": 95690 + }, + { + "epoch": 1.5361402269699354, + "grad_norm": 1.2415732145309448, + "learning_rate": 6.349751614388868e-06, + "loss": 0.6411, + "step": 95700 + }, + { + "epoch": 1.5363007431900995, + "grad_norm": 1.1359068155288696, + "learning_rate": 6.345553841936847e-06, + "loss": 0.7088, + "step": 95710 + }, + { + "epoch": 1.5364612594102636, + "grad_norm": 1.4422346353530884, + "learning_rate": 6.341357255798114e-06, + "loss": 0.739, + "step": 95720 + }, + { + "epoch": 1.5366217756304275, + "grad_norm": 1.1946992874145508, + "learning_rate": 6.337161856239548e-06, + "loss": 0.7436, + "step": 95730 + }, + { + "epoch": 1.5367822918505913, + "grad_norm": 0.8711658716201782, + "learning_rate": 6.332967643527945e-06, + "loss": 0.7333, + "step": 95740 + }, + { + "epoch": 1.5369428080707555, + "grad_norm": 1.1321640014648438, + "learning_rate": 6.328774617930034e-06, + "loss": 0.6998, + "step": 95750 + }, + { + "epoch": 1.5371033242909196, + "grad_norm": 1.287674069404602, + "learning_rate": 6.324582779712468e-06, + "loss": 0.7222, + "step": 95760 + }, + { + "epoch": 1.5372638405110837, + "grad_norm": 1.113749384880066, + "learning_rate": 6.320392129141822e-06, + "loss": 0.5983, + "step": 95770 + }, + { + "epoch": 1.5374243567312478, + "grad_norm": 1.4600059986114502, + "learning_rate": 6.316202666484599e-06, + "loss": 0.6867, + "step": 95780 + }, + { + "epoch": 1.5375848729514119, + "grad_norm": 1.0841296911239624, + "learning_rate": 6.3120143920072265e-06, + "loss": 0.6719, + "step": 95790 + }, + { + "epoch": 1.5377453891715758, + "grad_norm": 0.958541989326477, + "learning_rate": 6.30782730597605e-06, + "loss": 0.7727, + "step": 95800 + }, + { + "epoch": 1.5379059053917399, + "grad_norm": 0.9084130525588989, + "learning_rate": 6.303641408657343e-06, + "loss": 0.6788, + "step": 95810 + }, + { + "epoch": 1.5380664216119038, + "grad_norm": 1.1264322996139526, + "learning_rate": 6.2994567003173075e-06, + "loss": 0.7132, + "step": 95820 + }, + { + "epoch": 1.5382269378320679, + "grad_norm": 1.117488980293274, + "learning_rate": 6.295273181222064e-06, + "loss": 0.7185, + "step": 95830 + }, + { + "epoch": 1.538387454052232, + "grad_norm": 1.019230604171753, + "learning_rate": 6.291090851637662e-06, + "loss": 0.7664, + "step": 95840 + }, + { + "epoch": 1.538547970272396, + "grad_norm": 1.6009418964385986, + "learning_rate": 6.286909711830077e-06, + "loss": 0.6167, + "step": 95850 + }, + { + "epoch": 1.5387084864925602, + "grad_norm": 0.9175178408622742, + "learning_rate": 6.2827297620651955e-06, + "loss": 0.6948, + "step": 95860 + }, + { + "epoch": 1.538869002712724, + "grad_norm": 0.707904040813446, + "learning_rate": 6.278551002608837e-06, + "loss": 0.7761, + "step": 95870 + }, + { + "epoch": 1.5390295189328882, + "grad_norm": 1.082622766494751, + "learning_rate": 6.274373433726754e-06, + "loss": 0.7585, + "step": 95880 + }, + { + "epoch": 1.539190035153052, + "grad_norm": 0.9543943405151367, + "learning_rate": 6.270197055684612e-06, + "loss": 0.7458, + "step": 95890 + }, + { + "epoch": 1.5393505513732162, + "grad_norm": 1.155263066291809, + "learning_rate": 6.266021868748006e-06, + "loss": 0.6073, + "step": 95900 + }, + { + "epoch": 1.5395110675933803, + "grad_norm": 1.0996273756027222, + "learning_rate": 6.261847873182458e-06, + "loss": 0.8012, + "step": 95910 + }, + { + "epoch": 1.5396715838135444, + "grad_norm": 1.1886298656463623, + "learning_rate": 6.257675069253396e-06, + "loss": 0.6492, + "step": 95920 + }, + { + "epoch": 1.5398321000337085, + "grad_norm": 1.547167420387268, + "learning_rate": 6.253503457226192e-06, + "loss": 0.731, + "step": 95930 + }, + { + "epoch": 1.5399926162538724, + "grad_norm": 1.065981149673462, + "learning_rate": 6.249333037366139e-06, + "loss": 0.7049, + "step": 95940 + }, + { + "epoch": 1.5401531324740365, + "grad_norm": 0.8497262001037598, + "learning_rate": 6.245163809938451e-06, + "loss": 0.7034, + "step": 95950 + }, + { + "epoch": 1.5403136486942004, + "grad_norm": 1.0396335124969482, + "learning_rate": 6.240995775208264e-06, + "loss": 0.5676, + "step": 95960 + }, + { + "epoch": 1.5404741649143645, + "grad_norm": 0.7002021670341492, + "learning_rate": 6.236828933440647e-06, + "loss": 0.617, + "step": 95970 + }, + { + "epoch": 1.5406346811345286, + "grad_norm": 1.2019927501678467, + "learning_rate": 6.2326632849005754e-06, + "loss": 0.6615, + "step": 95980 + }, + { + "epoch": 1.5407951973546927, + "grad_norm": 0.8388737440109253, + "learning_rate": 6.2284988298529635e-06, + "loss": 0.6063, + "step": 95990 + }, + { + "epoch": 1.5409557135748568, + "grad_norm": 0.7940643429756165, + "learning_rate": 6.22433556856265e-06, + "loss": 0.7181, + "step": 96000 + }, + { + "epoch": 1.5409557135748568, + "eval_loss": 0.7709181904792786, + "eval_runtime": 1833.5912, + "eval_samples_per_second": 14.306, + "eval_steps_per_second": 1.788, + "step": 96000 + }, + { + "epoch": 1.541116229795021, + "grad_norm": 1.5589677095413208, + "learning_rate": 6.220173501294394e-06, + "loss": 0.7565, + "step": 96010 + }, + { + "epoch": 1.5412767460151848, + "grad_norm": 1.0076168775558472, + "learning_rate": 6.216012628312875e-06, + "loss": 0.7323, + "step": 96020 + }, + { + "epoch": 1.541437262235349, + "grad_norm": 1.352689266204834, + "learning_rate": 6.21185294988271e-06, + "loss": 0.7277, + "step": 96030 + }, + { + "epoch": 1.5415977784555128, + "grad_norm": 1.177486538887024, + "learning_rate": 6.207694466268415e-06, + "loss": 0.7475, + "step": 96040 + }, + { + "epoch": 1.541758294675677, + "grad_norm": 1.2768440246582031, + "learning_rate": 6.2035371777344534e-06, + "loss": 0.6708, + "step": 96050 + }, + { + "epoch": 1.541918810895841, + "grad_norm": 1.0815327167510986, + "learning_rate": 6.1993810845452e-06, + "loss": 0.6026, + "step": 96060 + }, + { + "epoch": 1.5420793271160051, + "grad_norm": 1.2433041334152222, + "learning_rate": 6.195226186964964e-06, + "loss": 0.7492, + "step": 96070 + }, + { + "epoch": 1.5422398433361693, + "grad_norm": 1.0301158428192139, + "learning_rate": 6.191072485257971e-06, + "loss": 0.6115, + "step": 96080 + }, + { + "epoch": 1.5424003595563331, + "grad_norm": 1.5495123863220215, + "learning_rate": 6.186919979688366e-06, + "loss": 0.5908, + "step": 96090 + }, + { + "epoch": 1.5425608757764973, + "grad_norm": 1.5521594285964966, + "learning_rate": 6.1827686705202396e-06, + "loss": 0.7896, + "step": 96100 + }, + { + "epoch": 1.5427213919966611, + "grad_norm": 1.4754029512405396, + "learning_rate": 6.178618558017571e-06, + "loss": 0.8479, + "step": 96110 + }, + { + "epoch": 1.5428819082168252, + "grad_norm": 1.0027962923049927, + "learning_rate": 6.174469642444291e-06, + "loss": 0.7111, + "step": 96120 + }, + { + "epoch": 1.5430424244369894, + "grad_norm": 2.039745330810547, + "learning_rate": 6.170321924064251e-06, + "loss": 0.7706, + "step": 96130 + }, + { + "epoch": 1.5432029406571535, + "grad_norm": 0.7502259612083435, + "learning_rate": 6.166175403141214e-06, + "loss": 0.6534, + "step": 96140 + }, + { + "epoch": 1.5433634568773176, + "grad_norm": 1.0869089365005493, + "learning_rate": 6.16203007993888e-06, + "loss": 0.7245, + "step": 96150 + }, + { + "epoch": 1.5435239730974815, + "grad_norm": 0.5228856801986694, + "learning_rate": 6.157885954720871e-06, + "loss": 0.6408, + "step": 96160 + }, + { + "epoch": 1.5436844893176456, + "grad_norm": 0.8776002526283264, + "learning_rate": 6.153743027750716e-06, + "loss": 0.636, + "step": 96170 + }, + { + "epoch": 1.5438450055378095, + "grad_norm": 0.9718644022941589, + "learning_rate": 6.14960129929189e-06, + "loss": 0.6626, + "step": 96180 + }, + { + "epoch": 1.5440055217579736, + "grad_norm": 1.1374571323394775, + "learning_rate": 6.14546076960778e-06, + "loss": 0.6227, + "step": 96190 + }, + { + "epoch": 1.5441660379781377, + "grad_norm": 0.8367916941642761, + "learning_rate": 6.1413214389616975e-06, + "loss": 0.732, + "step": 96200 + }, + { + "epoch": 1.5443265541983018, + "grad_norm": 1.0174307823181152, + "learning_rate": 6.1371833076168865e-06, + "loss": 0.6627, + "step": 96210 + }, + { + "epoch": 1.544487070418466, + "grad_norm": 0.9384397864341736, + "learning_rate": 6.133046375836507e-06, + "loss": 0.7836, + "step": 96220 + }, + { + "epoch": 1.54464758663863, + "grad_norm": 1.0757489204406738, + "learning_rate": 6.128910643883637e-06, + "loss": 0.6813, + "step": 96230 + }, + { + "epoch": 1.5448081028587939, + "grad_norm": 1.3071173429489136, + "learning_rate": 6.124776112021285e-06, + "loss": 0.7807, + "step": 96240 + }, + { + "epoch": 1.5449686190789578, + "grad_norm": 1.2994338274002075, + "learning_rate": 6.120642780512386e-06, + "loss": 0.7387, + "step": 96250 + }, + { + "epoch": 1.5451291352991219, + "grad_norm": 1.1915282011032104, + "learning_rate": 6.1165106496197975e-06, + "loss": 0.6581, + "step": 96260 + }, + { + "epoch": 1.545289651519286, + "grad_norm": 1.3432022333145142, + "learning_rate": 6.112379719606295e-06, + "loss": 0.8117, + "step": 96270 + }, + { + "epoch": 1.54545016773945, + "grad_norm": 1.1179530620574951, + "learning_rate": 6.108249990734591e-06, + "loss": 0.7124, + "step": 96280 + }, + { + "epoch": 1.5456106839596142, + "grad_norm": 1.1317886114120483, + "learning_rate": 6.104121463267298e-06, + "loss": 0.6988, + "step": 96290 + }, + { + "epoch": 1.5457712001797783, + "grad_norm": 1.446398138999939, + "learning_rate": 6.099994137466972e-06, + "loss": 0.6437, + "step": 96300 + }, + { + "epoch": 1.5459317163999422, + "grad_norm": 1.3315273523330688, + "learning_rate": 6.095868013596087e-06, + "loss": 0.797, + "step": 96310 + }, + { + "epoch": 1.5460922326201063, + "grad_norm": 0.8570568561553955, + "learning_rate": 6.09174309191704e-06, + "loss": 0.773, + "step": 96320 + }, + { + "epoch": 1.5462527488402702, + "grad_norm": 0.9525404572486877, + "learning_rate": 6.0876193726921525e-06, + "loss": 0.6443, + "step": 96330 + }, + { + "epoch": 1.5464132650604343, + "grad_norm": 1.676453948020935, + "learning_rate": 6.083496856183668e-06, + "loss": 0.6758, + "step": 96340 + }, + { + "epoch": 1.5465737812805984, + "grad_norm": 0.6908430457115173, + "learning_rate": 6.079375542653762e-06, + "loss": 0.7224, + "step": 96350 + }, + { + "epoch": 1.5467342975007625, + "grad_norm": 1.1334924697875977, + "learning_rate": 6.07525543236451e-06, + "loss": 0.697, + "step": 96360 + }, + { + "epoch": 1.5468948137209266, + "grad_norm": 0.7752397656440735, + "learning_rate": 6.071136525577939e-06, + "loss": 0.7007, + "step": 96370 + }, + { + "epoch": 1.5470553299410905, + "grad_norm": 1.219498872756958, + "learning_rate": 6.067018822555981e-06, + "loss": 0.7222, + "step": 96380 + }, + { + "epoch": 1.5472158461612546, + "grad_norm": 1.3226810693740845, + "learning_rate": 6.062902323560501e-06, + "loss": 0.6631, + "step": 96390 + }, + { + "epoch": 1.5473763623814185, + "grad_norm": 1.319010853767395, + "learning_rate": 6.058787028853285e-06, + "loss": 0.7056, + "step": 96400 + }, + { + "epoch": 1.5475368786015826, + "grad_norm": 1.0679357051849365, + "learning_rate": 6.0546729386960405e-06, + "loss": 0.6901, + "step": 96410 + }, + { + "epoch": 1.5476973948217467, + "grad_norm": 0.7753534317016602, + "learning_rate": 6.050560053350399e-06, + "loss": 0.6369, + "step": 96420 + }, + { + "epoch": 1.5478579110419108, + "grad_norm": 1.015478253364563, + "learning_rate": 6.046448373077915e-06, + "loss": 0.7363, + "step": 96430 + }, + { + "epoch": 1.548018427262075, + "grad_norm": 0.8781059980392456, + "learning_rate": 6.0423378981400685e-06, + "loss": 0.6133, + "step": 96440 + }, + { + "epoch": 1.5481789434822388, + "grad_norm": 0.7321709990501404, + "learning_rate": 6.038228628798262e-06, + "loss": 0.7135, + "step": 96450 + }, + { + "epoch": 1.548339459702403, + "grad_norm": 1.2963578701019287, + "learning_rate": 6.034120565313822e-06, + "loss": 0.7561, + "step": 96460 + }, + { + "epoch": 1.5484999759225668, + "grad_norm": 1.3952008485794067, + "learning_rate": 6.0300137079480015e-06, + "loss": 0.7742, + "step": 96470 + }, + { + "epoch": 1.548660492142731, + "grad_norm": 1.109747052192688, + "learning_rate": 6.02590805696196e-06, + "loss": 0.6787, + "step": 96480 + }, + { + "epoch": 1.548821008362895, + "grad_norm": 1.2445968389511108, + "learning_rate": 6.021803612616797e-06, + "loss": 0.7074, + "step": 96490 + }, + { + "epoch": 1.5489815245830592, + "grad_norm": 0.901457667350769, + "learning_rate": 6.017700375173535e-06, + "loss": 0.6109, + "step": 96500 + }, + { + "epoch": 1.5491420408032233, + "grad_norm": 1.3791085481643677, + "learning_rate": 6.013598344893115e-06, + "loss": 0.7311, + "step": 96510 + }, + { + "epoch": 1.5493025570233874, + "grad_norm": 0.7223373651504517, + "learning_rate": 6.009497522036403e-06, + "loss": 0.692, + "step": 96520 + }, + { + "epoch": 1.5494630732435513, + "grad_norm": 1.0241291522979736, + "learning_rate": 6.00539790686419e-06, + "loss": 0.6646, + "step": 96530 + }, + { + "epoch": 1.5496235894637151, + "grad_norm": 0.8829692006111145, + "learning_rate": 6.001299499637178e-06, + "loss": 0.8087, + "step": 96540 + }, + { + "epoch": 1.5497841056838793, + "grad_norm": 0.9883977770805359, + "learning_rate": 5.997202300616003e-06, + "loss": 0.6452, + "step": 96550 + }, + { + "epoch": 1.5499446219040434, + "grad_norm": 1.3348146677017212, + "learning_rate": 5.993106310061231e-06, + "loss": 0.6669, + "step": 96560 + }, + { + "epoch": 1.5501051381242075, + "grad_norm": 1.216088056564331, + "learning_rate": 5.989011528233337e-06, + "loss": 0.6856, + "step": 96570 + }, + { + "epoch": 1.5502656543443716, + "grad_norm": 0.8431750535964966, + "learning_rate": 5.984917955392727e-06, + "loss": 0.7404, + "step": 96580 + }, + { + "epoch": 1.5504261705645357, + "grad_norm": 1.723318099975586, + "learning_rate": 5.980825591799733e-06, + "loss": 0.6984, + "step": 96590 + }, + { + "epoch": 1.5505866867846996, + "grad_norm": 1.3489404916763306, + "learning_rate": 5.976734437714595e-06, + "loss": 0.7248, + "step": 96600 + }, + { + "epoch": 1.5507472030048637, + "grad_norm": 1.1284726858139038, + "learning_rate": 5.972644493397492e-06, + "loss": 0.7565, + "step": 96610 + }, + { + "epoch": 1.5509077192250276, + "grad_norm": 1.4440687894821167, + "learning_rate": 5.9685557591085185e-06, + "loss": 0.673, + "step": 96620 + }, + { + "epoch": 1.5510682354451917, + "grad_norm": 1.2232142686843872, + "learning_rate": 5.9644682351076955e-06, + "loss": 0.7047, + "step": 96630 + }, + { + "epoch": 1.5512287516653558, + "grad_norm": 1.1385897397994995, + "learning_rate": 5.9603819216549645e-06, + "loss": 0.7219, + "step": 96640 + }, + { + "epoch": 1.55138926788552, + "grad_norm": 1.3055347204208374, + "learning_rate": 5.956296819010193e-06, + "loss": 0.6827, + "step": 96650 + }, + { + "epoch": 1.551549784105684, + "grad_norm": 1.1966018676757812, + "learning_rate": 5.952212927433176e-06, + "loss": 0.7773, + "step": 96660 + }, + { + "epoch": 1.551710300325848, + "grad_norm": 1.2484195232391357, + "learning_rate": 5.948130247183611e-06, + "loss": 0.7012, + "step": 96670 + }, + { + "epoch": 1.551870816546012, + "grad_norm": 0.7916039824485779, + "learning_rate": 5.944048778521141e-06, + "loss": 0.6682, + "step": 96680 + }, + { + "epoch": 1.5520313327661759, + "grad_norm": 1.0051625967025757, + "learning_rate": 5.939968521705319e-06, + "loss": 0.7188, + "step": 96690 + }, + { + "epoch": 1.55219184898634, + "grad_norm": 0.795526385307312, + "learning_rate": 5.935889476995629e-06, + "loss": 0.7075, + "step": 96700 + }, + { + "epoch": 1.552352365206504, + "grad_norm": 0.7927150726318359, + "learning_rate": 5.9318116446514725e-06, + "loss": 0.7566, + "step": 96710 + }, + { + "epoch": 1.5525128814266682, + "grad_norm": 1.0442942380905151, + "learning_rate": 5.927735024932183e-06, + "loss": 0.6632, + "step": 96720 + }, + { + "epoch": 1.5526733976468323, + "grad_norm": 0.5959987640380859, + "learning_rate": 5.923659618096997e-06, + "loss": 0.793, + "step": 96730 + }, + { + "epoch": 1.5528339138669962, + "grad_norm": 1.2075074911117554, + "learning_rate": 5.9195854244050915e-06, + "loss": 0.7433, + "step": 96740 + }, + { + "epoch": 1.5529944300871603, + "grad_norm": 0.9847227334976196, + "learning_rate": 5.915512444115564e-06, + "loss": 0.8068, + "step": 96750 + }, + { + "epoch": 1.5531549463073242, + "grad_norm": 1.7829511165618896, + "learning_rate": 5.911440677487428e-06, + "loss": 0.7328, + "step": 96760 + }, + { + "epoch": 1.5533154625274883, + "grad_norm": 0.675298810005188, + "learning_rate": 5.9073701247796295e-06, + "loss": 0.6955, + "step": 96770 + }, + { + "epoch": 1.5534759787476524, + "grad_norm": 1.2171481847763062, + "learning_rate": 5.903300786251034e-06, + "loss": 0.6913, + "step": 96780 + }, + { + "epoch": 1.5536364949678165, + "grad_norm": 0.9263421893119812, + "learning_rate": 5.899232662160417e-06, + "loss": 0.716, + "step": 96790 + }, + { + "epoch": 1.5537970111879806, + "grad_norm": 1.2095612287521362, + "learning_rate": 5.8951657527664905e-06, + "loss": 0.6516, + "step": 96800 + }, + { + "epoch": 1.5539575274081447, + "grad_norm": 1.3175585269927979, + "learning_rate": 5.891100058327889e-06, + "loss": 0.7035, + "step": 96810 + }, + { + "epoch": 1.5541180436283086, + "grad_norm": 1.4978481531143188, + "learning_rate": 5.887035579103167e-06, + "loss": 0.6952, + "step": 96820 + }, + { + "epoch": 1.5542785598484727, + "grad_norm": 0.8979431986808777, + "learning_rate": 5.882972315350799e-06, + "loss": 0.6574, + "step": 96830 + }, + { + "epoch": 1.5544390760686366, + "grad_norm": 1.8231568336486816, + "learning_rate": 5.878910267329193e-06, + "loss": 0.6309, + "step": 96840 + }, + { + "epoch": 1.5545995922888007, + "grad_norm": 1.0432108640670776, + "learning_rate": 5.874849435296659e-06, + "loss": 0.7187, + "step": 96850 + }, + { + "epoch": 1.5547601085089648, + "grad_norm": 0.9460548162460327, + "learning_rate": 5.870789819511446e-06, + "loss": 0.644, + "step": 96860 + }, + { + "epoch": 1.554920624729129, + "grad_norm": 0.9104604721069336, + "learning_rate": 5.866731420231725e-06, + "loss": 0.7957, + "step": 96870 + }, + { + "epoch": 1.555081140949293, + "grad_norm": 0.8973115086555481, + "learning_rate": 5.862674237715582e-06, + "loss": 0.7082, + "step": 96880 + }, + { + "epoch": 1.555241657169457, + "grad_norm": 0.8968822360038757, + "learning_rate": 5.858618272221034e-06, + "loss": 0.698, + "step": 96890 + }, + { + "epoch": 1.555402173389621, + "grad_norm": 0.8874959945678711, + "learning_rate": 5.854563524006018e-06, + "loss": 0.6802, + "step": 96900 + }, + { + "epoch": 1.555562689609785, + "grad_norm": 0.8171746134757996, + "learning_rate": 5.850509993328393e-06, + "loss": 0.6081, + "step": 96910 + }, + { + "epoch": 1.555723205829949, + "grad_norm": 1.4024512767791748, + "learning_rate": 5.846457680445933e-06, + "loss": 0.7665, + "step": 96920 + }, + { + "epoch": 1.5558837220501132, + "grad_norm": 1.1539943218231201, + "learning_rate": 5.842406585616342e-06, + "loss": 0.6256, + "step": 96930 + }, + { + "epoch": 1.5560442382702773, + "grad_norm": 0.6711437702178955, + "learning_rate": 5.838356709097248e-06, + "loss": 0.7473, + "step": 96940 + }, + { + "epoch": 1.5562047544904414, + "grad_norm": 1.0537058115005493, + "learning_rate": 5.8343080511462e-06, + "loss": 0.6432, + "step": 96950 + }, + { + "epoch": 1.5563652707106053, + "grad_norm": 1.0357775688171387, + "learning_rate": 5.830260612020672e-06, + "loss": 0.7084, + "step": 96960 + }, + { + "epoch": 1.5565257869307694, + "grad_norm": 1.293861985206604, + "learning_rate": 5.826214391978049e-06, + "loss": 0.6097, + "step": 96970 + }, + { + "epoch": 1.5566863031509333, + "grad_norm": 1.3366825580596924, + "learning_rate": 5.822169391275664e-06, + "loss": 0.7586, + "step": 96980 + }, + { + "epoch": 1.5568468193710974, + "grad_norm": 0.7829299569129944, + "learning_rate": 5.818125610170733e-06, + "loss": 0.7351, + "step": 96990 + }, + { + "epoch": 1.5570073355912615, + "grad_norm": 1.3513447046279907, + "learning_rate": 5.8140830489204295e-06, + "loss": 0.6971, + "step": 97000 + }, + { + "epoch": 1.5571678518114256, + "grad_norm": 1.4970656633377075, + "learning_rate": 5.810041707781833e-06, + "loss": 0.7065, + "step": 97010 + }, + { + "epoch": 1.5573283680315897, + "grad_norm": 0.9260422587394714, + "learning_rate": 5.806001587011947e-06, + "loss": 0.7042, + "step": 97020 + }, + { + "epoch": 1.5574888842517538, + "grad_norm": 0.7821476459503174, + "learning_rate": 5.8019626868677065e-06, + "loss": 0.7441, + "step": 97030 + }, + { + "epoch": 1.5576494004719177, + "grad_norm": 0.9628801345825195, + "learning_rate": 5.797925007605959e-06, + "loss": 0.7716, + "step": 97040 + }, + { + "epoch": 1.5578099166920816, + "grad_norm": 1.088260531425476, + "learning_rate": 5.793888549483472e-06, + "loss": 0.6801, + "step": 97050 + }, + { + "epoch": 1.5579704329122457, + "grad_norm": 1.0744460821151733, + "learning_rate": 5.789853312756949e-06, + "loss": 0.7082, + "step": 97060 + }, + { + "epoch": 1.5581309491324098, + "grad_norm": 1.103729248046875, + "learning_rate": 5.785819297683001e-06, + "loss": 0.7696, + "step": 97070 + }, + { + "epoch": 1.558291465352574, + "grad_norm": 1.0244801044464111, + "learning_rate": 5.781786504518172e-06, + "loss": 0.6475, + "step": 97080 + }, + { + "epoch": 1.558451981572738, + "grad_norm": 1.0582550764083862, + "learning_rate": 5.777754933518928e-06, + "loss": 0.7267, + "step": 97090 + }, + { + "epoch": 1.5586124977929021, + "grad_norm": 1.084586262702942, + "learning_rate": 5.7737245849416394e-06, + "loss": 0.7864, + "step": 97100 + }, + { + "epoch": 1.558773014013066, + "grad_norm": 1.2951719760894775, + "learning_rate": 5.769695459042617e-06, + "loss": 0.7846, + "step": 97110 + }, + { + "epoch": 1.5589335302332301, + "grad_norm": 0.8919025659561157, + "learning_rate": 5.7656675560781e-06, + "loss": 0.6052, + "step": 97120 + }, + { + "epoch": 1.559094046453394, + "grad_norm": 0.9098137021064758, + "learning_rate": 5.761640876304228e-06, + "loss": 0.7784, + "step": 97130 + }, + { + "epoch": 1.559254562673558, + "grad_norm": 1.0485719442367554, + "learning_rate": 5.757615419977078e-06, + "loss": 0.6247, + "step": 97140 + }, + { + "epoch": 1.5594150788937222, + "grad_norm": 0.990791916847229, + "learning_rate": 5.753591187352655e-06, + "loss": 0.7177, + "step": 97150 + }, + { + "epoch": 1.5595755951138863, + "grad_norm": 1.0489506721496582, + "learning_rate": 5.749568178686859e-06, + "loss": 0.7073, + "step": 97160 + }, + { + "epoch": 1.5597361113340504, + "grad_norm": 1.504311442375183, + "learning_rate": 5.74554639423554e-06, + "loss": 0.7718, + "step": 97170 + }, + { + "epoch": 1.5598966275542143, + "grad_norm": 1.4457454681396484, + "learning_rate": 5.7415258342544575e-06, + "loss": 0.8352, + "step": 97180 + }, + { + "epoch": 1.5600571437743784, + "grad_norm": 1.3973602056503296, + "learning_rate": 5.737506498999296e-06, + "loss": 0.786, + "step": 97190 + }, + { + "epoch": 1.5602176599945423, + "grad_norm": 0.6620640158653259, + "learning_rate": 5.733488388725661e-06, + "loss": 0.8435, + "step": 97200 + }, + { + "epoch": 1.5603781762147064, + "grad_norm": 1.2424232959747314, + "learning_rate": 5.729471503689082e-06, + "loss": 0.61, + "step": 97210 + }, + { + "epoch": 1.5605386924348705, + "grad_norm": 1.2240614891052246, + "learning_rate": 5.725455844145017e-06, + "loss": 0.6256, + "step": 97220 + }, + { + "epoch": 1.5606992086550346, + "grad_norm": 1.7106454372406006, + "learning_rate": 5.721441410348821e-06, + "loss": 0.7427, + "step": 97230 + }, + { + "epoch": 1.5608597248751988, + "grad_norm": 1.4529247283935547, + "learning_rate": 5.717428202555799e-06, + "loss": 0.6947, + "step": 97240 + }, + { + "epoch": 1.5610202410953626, + "grad_norm": 1.5559368133544922, + "learning_rate": 5.713416221021167e-06, + "loss": 0.7678, + "step": 97250 + }, + { + "epoch": 1.5611807573155267, + "grad_norm": 1.3126370906829834, + "learning_rate": 5.709405466000062e-06, + "loss": 0.543, + "step": 97260 + }, + { + "epoch": 1.5613412735356906, + "grad_norm": 1.3102463483810425, + "learning_rate": 5.705395937747546e-06, + "loss": 0.7903, + "step": 97270 + }, + { + "epoch": 1.5615017897558547, + "grad_norm": 0.7198097705841064, + "learning_rate": 5.701387636518604e-06, + "loss": 0.625, + "step": 97280 + }, + { + "epoch": 1.5616623059760188, + "grad_norm": 0.8249397873878479, + "learning_rate": 5.697380562568133e-06, + "loss": 0.6842, + "step": 97290 + }, + { + "epoch": 1.561822822196183, + "grad_norm": 1.9049243927001953, + "learning_rate": 5.693374716150962e-06, + "loss": 0.7894, + "step": 97300 + }, + { + "epoch": 1.561983338416347, + "grad_norm": 1.0879281759262085, + "learning_rate": 5.689370097521843e-06, + "loss": 0.7823, + "step": 97310 + }, + { + "epoch": 1.5621438546365112, + "grad_norm": 1.0946059226989746, + "learning_rate": 5.685366706935441e-06, + "loss": 0.727, + "step": 97320 + }, + { + "epoch": 1.562304370856675, + "grad_norm": 0.8903605341911316, + "learning_rate": 5.6813645446463535e-06, + "loss": 0.588, + "step": 97330 + }, + { + "epoch": 1.562464887076839, + "grad_norm": 1.1653279066085815, + "learning_rate": 5.6773636109090976e-06, + "loss": 0.6963, + "step": 97340 + }, + { + "epoch": 1.562625403297003, + "grad_norm": 1.6072156429290771, + "learning_rate": 5.6733639059780965e-06, + "loss": 0.7368, + "step": 97350 + }, + { + "epoch": 1.5627859195171672, + "grad_norm": 1.1289881467819214, + "learning_rate": 5.669365430107715e-06, + "loss": 0.6641, + "step": 97360 + }, + { + "epoch": 1.5629464357373313, + "grad_norm": 1.3835160732269287, + "learning_rate": 5.665368183552234e-06, + "loss": 0.7721, + "step": 97370 + }, + { + "epoch": 1.5631069519574954, + "grad_norm": 0.9338240027427673, + "learning_rate": 5.661372166565853e-06, + "loss": 0.6503, + "step": 97380 + }, + { + "epoch": 1.5632674681776595, + "grad_norm": 1.222820520401001, + "learning_rate": 5.657377379402695e-06, + "loss": 0.7203, + "step": 97390 + }, + { + "epoch": 1.5634279843978234, + "grad_norm": 1.2370551824569702, + "learning_rate": 5.653383822316816e-06, + "loss": 0.7346, + "step": 97400 + }, + { + "epoch": 1.5635885006179875, + "grad_norm": 0.9498128890991211, + "learning_rate": 5.649391495562162e-06, + "loss": 0.6641, + "step": 97410 + }, + { + "epoch": 1.5637490168381514, + "grad_norm": 1.032470464706421, + "learning_rate": 5.645400399392633e-06, + "loss": 0.6937, + "step": 97420 + }, + { + "epoch": 1.5639095330583155, + "grad_norm": 1.4474594593048096, + "learning_rate": 5.641410534062042e-06, + "loss": 0.7301, + "step": 97430 + }, + { + "epoch": 1.5640700492784796, + "grad_norm": 1.361608624458313, + "learning_rate": 5.637421899824116e-06, + "loss": 0.8046, + "step": 97440 + }, + { + "epoch": 1.5642305654986437, + "grad_norm": 0.8220045566558838, + "learning_rate": 5.633434496932508e-06, + "loss": 0.7494, + "step": 97450 + }, + { + "epoch": 1.5643910817188078, + "grad_norm": 1.4294826984405518, + "learning_rate": 5.629448325640799e-06, + "loss": 0.678, + "step": 97460 + }, + { + "epoch": 1.5645515979389717, + "grad_norm": 1.7517986297607422, + "learning_rate": 5.625463386202487e-06, + "loss": 0.6954, + "step": 97470 + }, + { + "epoch": 1.5647121141591358, + "grad_norm": 1.29558265209198, + "learning_rate": 5.621479678870983e-06, + "loss": 0.8645, + "step": 97480 + }, + { + "epoch": 1.5648726303792997, + "grad_norm": 0.885337769985199, + "learning_rate": 5.61749720389963e-06, + "loss": 0.6681, + "step": 97490 + }, + { + "epoch": 1.5650331465994638, + "grad_norm": 1.6235837936401367, + "learning_rate": 5.613515961541688e-06, + "loss": 0.6004, + "step": 97500 + }, + { + "epoch": 1.565193662819628, + "grad_norm": 0.6421568989753723, + "learning_rate": 5.609535952050346e-06, + "loss": 0.7212, + "step": 97510 + }, + { + "epoch": 1.565354179039792, + "grad_norm": 1.005098581314087, + "learning_rate": 5.6055571756787075e-06, + "loss": 0.6467, + "step": 97520 + }, + { + "epoch": 1.5655146952599561, + "grad_norm": 0.9146221876144409, + "learning_rate": 5.601579632679807e-06, + "loss": 0.6902, + "step": 97530 + }, + { + "epoch": 1.5656752114801202, + "grad_norm": 1.7736579179763794, + "learning_rate": 5.597603323306574e-06, + "loss": 0.6637, + "step": 97540 + }, + { + "epoch": 1.5658357277002841, + "grad_norm": 1.0599167346954346, + "learning_rate": 5.593628247811891e-06, + "loss": 0.6253, + "step": 97550 + }, + { + "epoch": 1.565996243920448, + "grad_norm": 1.038567066192627, + "learning_rate": 5.58965440644855e-06, + "loss": 0.8145, + "step": 97560 + }, + { + "epoch": 1.5661567601406121, + "grad_norm": 0.9967148900032043, + "learning_rate": 5.585681799469258e-06, + "loss": 0.757, + "step": 97570 + }, + { + "epoch": 1.5663172763607762, + "grad_norm": 0.7308923602104187, + "learning_rate": 5.581710427126654e-06, + "loss": 0.6143, + "step": 97580 + }, + { + "epoch": 1.5664777925809403, + "grad_norm": 1.4728310108184814, + "learning_rate": 5.5777402896732945e-06, + "loss": 0.6756, + "step": 97590 + }, + { + "epoch": 1.5666383088011044, + "grad_norm": 0.846435010433197, + "learning_rate": 5.573771387361662e-06, + "loss": 0.7737, + "step": 97600 + }, + { + "epoch": 1.5667988250212685, + "grad_norm": 0.6947010159492493, + "learning_rate": 5.569803720444141e-06, + "loss": 0.7503, + "step": 97610 + }, + { + "epoch": 1.5669593412414324, + "grad_norm": 0.9343696236610413, + "learning_rate": 5.565837289173059e-06, + "loss": 0.814, + "step": 97620 + }, + { + "epoch": 1.5671198574615965, + "grad_norm": 1.426917552947998, + "learning_rate": 5.56187209380066e-06, + "loss": 0.6716, + "step": 97630 + }, + { + "epoch": 1.5672803736817604, + "grad_norm": 1.3558869361877441, + "learning_rate": 5.557908134579104e-06, + "loss": 0.628, + "step": 97640 + }, + { + "epoch": 1.5674408899019245, + "grad_norm": 1.0129715204238892, + "learning_rate": 5.5539454117604805e-06, + "loss": 0.7011, + "step": 97650 + }, + { + "epoch": 1.5676014061220886, + "grad_norm": 0.9918426275253296, + "learning_rate": 5.549983925596788e-06, + "loss": 0.6554, + "step": 97660 + }, + { + "epoch": 1.5677619223422528, + "grad_norm": 1.2205110788345337, + "learning_rate": 5.546023676339959e-06, + "loss": 0.667, + "step": 97670 + }, + { + "epoch": 1.5679224385624169, + "grad_norm": 0.9148264527320862, + "learning_rate": 5.542064664241842e-06, + "loss": 0.7586, + "step": 97680 + }, + { + "epoch": 1.5680829547825808, + "grad_norm": 1.2684255838394165, + "learning_rate": 5.538106889554206e-06, + "loss": 0.8175, + "step": 97690 + }, + { + "epoch": 1.5682434710027449, + "grad_norm": 0.8648200631141663, + "learning_rate": 5.5341503525287415e-06, + "loss": 0.6104, + "step": 97700 + }, + { + "epoch": 1.5684039872229087, + "grad_norm": 0.9461622834205627, + "learning_rate": 5.530195053417062e-06, + "loss": 0.7198, + "step": 97710 + }, + { + "epoch": 1.5685645034430729, + "grad_norm": 0.77037113904953, + "learning_rate": 5.526240992470707e-06, + "loss": 0.6101, + "step": 97720 + }, + { + "epoch": 1.568725019663237, + "grad_norm": 1.6395875215530396, + "learning_rate": 5.52228816994112e-06, + "loss": 0.8123, + "step": 97730 + }, + { + "epoch": 1.568885535883401, + "grad_norm": 2.019376277923584, + "learning_rate": 5.518336586079681e-06, + "loss": 0.6326, + "step": 97740 + }, + { + "epoch": 1.5690460521035652, + "grad_norm": 0.7839473485946655, + "learning_rate": 5.5143862411376925e-06, + "loss": 0.7274, + "step": 97750 + }, + { + "epoch": 1.569206568323729, + "grad_norm": 1.4567296504974365, + "learning_rate": 5.510437135366367e-06, + "loss": 0.6782, + "step": 97760 + }, + { + "epoch": 1.5693670845438932, + "grad_norm": 1.2002607583999634, + "learning_rate": 5.506489269016849e-06, + "loss": 0.7473, + "step": 97770 + }, + { + "epoch": 1.569527600764057, + "grad_norm": 1.0125631093978882, + "learning_rate": 5.502542642340203e-06, + "loss": 0.7365, + "step": 97780 + }, + { + "epoch": 1.5696881169842212, + "grad_norm": 0.9241368770599365, + "learning_rate": 5.498597255587401e-06, + "loss": 0.7405, + "step": 97790 + }, + { + "epoch": 1.5698486332043853, + "grad_norm": 0.9295409917831421, + "learning_rate": 5.494653109009354e-06, + "loss": 0.7035, + "step": 97800 + }, + { + "epoch": 1.5700091494245494, + "grad_norm": 1.381635308265686, + "learning_rate": 5.490710202856883e-06, + "loss": 0.6814, + "step": 97810 + }, + { + "epoch": 1.5701696656447135, + "grad_norm": 1.064382553100586, + "learning_rate": 5.486768537380735e-06, + "loss": 0.7009, + "step": 97820 + }, + { + "epoch": 1.5703301818648776, + "grad_norm": 0.9363704323768616, + "learning_rate": 5.482828112831575e-06, + "loss": 0.6551, + "step": 97830 + }, + { + "epoch": 1.5704906980850415, + "grad_norm": 1.0930026769638062, + "learning_rate": 5.478888929460002e-06, + "loss": 0.7907, + "step": 97840 + }, + { + "epoch": 1.5706512143052054, + "grad_norm": 0.625890851020813, + "learning_rate": 5.474950987516508e-06, + "loss": 0.647, + "step": 97850 + }, + { + "epoch": 1.5708117305253695, + "grad_norm": 1.0370651483535767, + "learning_rate": 5.471014287251533e-06, + "loss": 0.6625, + "step": 97860 + }, + { + "epoch": 1.5709722467455336, + "grad_norm": 1.3019437789916992, + "learning_rate": 5.467078828915423e-06, + "loss": 0.7117, + "step": 97870 + }, + { + "epoch": 1.5711327629656977, + "grad_norm": 0.7767857313156128, + "learning_rate": 5.463144612758453e-06, + "loss": 0.8048, + "step": 97880 + }, + { + "epoch": 1.5712932791858618, + "grad_norm": 1.1610350608825684, + "learning_rate": 5.459211639030817e-06, + "loss": 0.6574, + "step": 97890 + }, + { + "epoch": 1.571453795406026, + "grad_norm": 1.0455085039138794, + "learning_rate": 5.4552799079826355e-06, + "loss": 0.6782, + "step": 97900 + }, + { + "epoch": 1.5716143116261898, + "grad_norm": 1.25421142578125, + "learning_rate": 5.45134941986393e-06, + "loss": 0.6734, + "step": 97910 + }, + { + "epoch": 1.571774827846354, + "grad_norm": 0.8534995913505554, + "learning_rate": 5.447420174924659e-06, + "loss": 0.7084, + "step": 97920 + }, + { + "epoch": 1.5719353440665178, + "grad_norm": 1.1032832860946655, + "learning_rate": 5.443492173414705e-06, + "loss": 0.7354, + "step": 97930 + }, + { + "epoch": 1.572095860286682, + "grad_norm": 1.3350163698196411, + "learning_rate": 5.439565415583864e-06, + "loss": 0.6891, + "step": 97940 + }, + { + "epoch": 1.572256376506846, + "grad_norm": 0.8892959356307983, + "learning_rate": 5.435639901681852e-06, + "loss": 0.6523, + "step": 97950 + }, + { + "epoch": 1.5724168927270101, + "grad_norm": 0.6831337213516235, + "learning_rate": 5.43171563195832e-06, + "loss": 0.675, + "step": 97960 + }, + { + "epoch": 1.5725774089471742, + "grad_norm": 1.1413456201553345, + "learning_rate": 5.427792606662813e-06, + "loss": 0.7406, + "step": 97970 + }, + { + "epoch": 1.5727379251673381, + "grad_norm": 1.1024709939956665, + "learning_rate": 5.423870826044819e-06, + "loss": 0.6895, + "step": 97980 + }, + { + "epoch": 1.5728984413875022, + "grad_norm": 1.0822036266326904, + "learning_rate": 5.41995029035374e-06, + "loss": 0.6825, + "step": 97990 + }, + { + "epoch": 1.5730589576076661, + "grad_norm": 1.1651114225387573, + "learning_rate": 5.4160309998389e-06, + "loss": 0.5978, + "step": 98000 + }, + { + "epoch": 1.5732194738278302, + "grad_norm": 0.83476322889328, + "learning_rate": 5.41211295474954e-06, + "loss": 0.7711, + "step": 98010 + }, + { + "epoch": 1.5733799900479943, + "grad_norm": 0.8217382431030273, + "learning_rate": 5.4081961553348295e-06, + "loss": 0.7059, + "step": 98020 + }, + { + "epoch": 1.5735405062681584, + "grad_norm": 2.6878912448883057, + "learning_rate": 5.404280601843856e-06, + "loss": 0.6703, + "step": 98030 + }, + { + "epoch": 1.5737010224883226, + "grad_norm": 1.1217968463897705, + "learning_rate": 5.400366294525616e-06, + "loss": 0.7818, + "step": 98040 + }, + { + "epoch": 1.5738615387084864, + "grad_norm": 1.1809090375900269, + "learning_rate": 5.39645323362904e-06, + "loss": 0.6845, + "step": 98050 + }, + { + "epoch": 1.5740220549286505, + "grad_norm": 1.1794484853744507, + "learning_rate": 5.392541419402977e-06, + "loss": 0.6802, + "step": 98060 + }, + { + "epoch": 1.5741825711488144, + "grad_norm": 0.8455941081047058, + "learning_rate": 5.388630852096197e-06, + "loss": 0.7427, + "step": 98070 + }, + { + "epoch": 1.5743430873689785, + "grad_norm": 1.0480575561523438, + "learning_rate": 5.3847215319573875e-06, + "loss": 0.7075, + "step": 98080 + }, + { + "epoch": 1.5745036035891427, + "grad_norm": 2.447399377822876, + "learning_rate": 5.380813459235165e-06, + "loss": 0.7186, + "step": 98090 + }, + { + "epoch": 1.5746641198093068, + "grad_norm": 1.1950994729995728, + "learning_rate": 5.3769066341780474e-06, + "loss": 0.6383, + "step": 98100 + }, + { + "epoch": 1.5748246360294709, + "grad_norm": 0.9355162978172302, + "learning_rate": 5.373001057034491e-06, + "loss": 0.5546, + "step": 98110 + }, + { + "epoch": 1.574985152249635, + "grad_norm": 0.9891206622123718, + "learning_rate": 5.369096728052869e-06, + "loss": 0.6994, + "step": 98120 + }, + { + "epoch": 1.5751456684697989, + "grad_norm": 0.9058851003646851, + "learning_rate": 5.365193647481473e-06, + "loss": 0.7393, + "step": 98130 + }, + { + "epoch": 1.575306184689963, + "grad_norm": 1.1164923906326294, + "learning_rate": 5.361291815568517e-06, + "loss": 0.7679, + "step": 98140 + }, + { + "epoch": 1.5754667009101269, + "grad_norm": 0.9573028087615967, + "learning_rate": 5.357391232562142e-06, + "loss": 0.6088, + "step": 98150 + }, + { + "epoch": 1.575627217130291, + "grad_norm": 1.109288215637207, + "learning_rate": 5.353491898710386e-06, + "loss": 0.6414, + "step": 98160 + }, + { + "epoch": 1.575787733350455, + "grad_norm": 1.1295970678329468, + "learning_rate": 5.34959381426123e-06, + "loss": 0.7001, + "step": 98170 + }, + { + "epoch": 1.5759482495706192, + "grad_norm": 1.137853980064392, + "learning_rate": 5.345696979462575e-06, + "loss": 0.7815, + "step": 98180 + }, + { + "epoch": 1.5761087657907833, + "grad_norm": 1.4038234949111938, + "learning_rate": 5.341801394562229e-06, + "loss": 0.6146, + "step": 98190 + }, + { + "epoch": 1.5762692820109472, + "grad_norm": 0.9394376277923584, + "learning_rate": 5.337907059807937e-06, + "loss": 0.6697, + "step": 98200 + }, + { + "epoch": 1.5764297982311113, + "grad_norm": 0.9429643750190735, + "learning_rate": 5.3340139754473465e-06, + "loss": 0.7549, + "step": 98210 + }, + { + "epoch": 1.5765903144512752, + "grad_norm": 1.023017406463623, + "learning_rate": 5.3301221417280415e-06, + "loss": 0.823, + "step": 98220 + }, + { + "epoch": 1.5767508306714393, + "grad_norm": 1.1312057971954346, + "learning_rate": 5.3262315588975265e-06, + "loss": 0.8028, + "step": 98230 + }, + { + "epoch": 1.5769113468916034, + "grad_norm": 1.1367294788360596, + "learning_rate": 5.322342227203203e-06, + "loss": 0.7476, + "step": 98240 + }, + { + "epoch": 1.5770718631117675, + "grad_norm": 1.1983237266540527, + "learning_rate": 5.318454146892418e-06, + "loss": 0.6732, + "step": 98250 + }, + { + "epoch": 1.5772323793319316, + "grad_norm": 1.5717086791992188, + "learning_rate": 5.314567318212429e-06, + "loss": 0.7355, + "step": 98260 + }, + { + "epoch": 1.5773928955520955, + "grad_norm": 1.7864898443222046, + "learning_rate": 5.310681741410417e-06, + "loss": 0.7486, + "step": 98270 + }, + { + "epoch": 1.5775534117722596, + "grad_norm": 2.470780611038208, + "learning_rate": 5.306797416733484e-06, + "loss": 0.6788, + "step": 98280 + }, + { + "epoch": 1.5777139279924235, + "grad_norm": 1.3563166856765747, + "learning_rate": 5.3029143444286444e-06, + "loss": 0.5875, + "step": 98290 + }, + { + "epoch": 1.5778744442125876, + "grad_norm": 4.863795280456543, + "learning_rate": 5.299032524742845e-06, + "loss": 0.7232, + "step": 98300 + }, + { + "epoch": 1.5780349604327517, + "grad_norm": 1.050595998764038, + "learning_rate": 5.295151957922942e-06, + "loss": 0.7627, + "step": 98310 + }, + { + "epoch": 1.5781954766529158, + "grad_norm": 1.2059897184371948, + "learning_rate": 5.29127264421572e-06, + "loss": 0.7103, + "step": 98320 + }, + { + "epoch": 1.57835599287308, + "grad_norm": 1.0634464025497437, + "learning_rate": 5.28739458386788e-06, + "loss": 0.7728, + "step": 98330 + }, + { + "epoch": 1.578516509093244, + "grad_norm": 1.0785566568374634, + "learning_rate": 5.2835177771260486e-06, + "loss": 0.8082, + "step": 98340 + }, + { + "epoch": 1.578677025313408, + "grad_norm": 1.5990065336227417, + "learning_rate": 5.2796422242367585e-06, + "loss": 0.6619, + "step": 98350 + }, + { + "epoch": 1.5788375415335718, + "grad_norm": 1.0623706579208374, + "learning_rate": 5.275767925446473e-06, + "loss": 0.7744, + "step": 98360 + }, + { + "epoch": 1.578998057753736, + "grad_norm": 1.1670621633529663, + "learning_rate": 5.2718948810015795e-06, + "loss": 0.7412, + "step": 98370 + }, + { + "epoch": 1.5791585739739, + "grad_norm": 1.3134040832519531, + "learning_rate": 5.2680230911483775e-06, + "loss": 0.7384, + "step": 98380 + }, + { + "epoch": 1.5793190901940641, + "grad_norm": 1.1450364589691162, + "learning_rate": 5.264152556133095e-06, + "loss": 0.6637, + "step": 98390 + }, + { + "epoch": 1.5794796064142282, + "grad_norm": 0.9167094826698303, + "learning_rate": 5.260283276201877e-06, + "loss": 0.779, + "step": 98400 + }, + { + "epoch": 1.5796401226343924, + "grad_norm": 1.2416625022888184, + "learning_rate": 5.256415251600777e-06, + "loss": 0.7051, + "step": 98410 + }, + { + "epoch": 1.5798006388545562, + "grad_norm": 1.0929553508758545, + "learning_rate": 5.252548482575781e-06, + "loss": 0.7587, + "step": 98420 + }, + { + "epoch": 1.5799611550747203, + "grad_norm": 1.078503966331482, + "learning_rate": 5.248682969372798e-06, + "loss": 0.7209, + "step": 98430 + }, + { + "epoch": 1.5801216712948842, + "grad_norm": 1.0203568935394287, + "learning_rate": 5.244818712237651e-06, + "loss": 0.7116, + "step": 98440 + }, + { + "epoch": 1.5802821875150483, + "grad_norm": 1.0900335311889648, + "learning_rate": 5.240955711416079e-06, + "loss": 0.7153, + "step": 98450 + }, + { + "epoch": 1.5804427037352125, + "grad_norm": 0.6429615616798401, + "learning_rate": 5.237093967153761e-06, + "loss": 0.6576, + "step": 98460 + }, + { + "epoch": 1.5806032199553766, + "grad_norm": 0.6605288982391357, + "learning_rate": 5.233233479696259e-06, + "loss": 0.5987, + "step": 98470 + }, + { + "epoch": 1.5807637361755407, + "grad_norm": 0.509242057800293, + "learning_rate": 5.229374249289093e-06, + "loss": 0.6778, + "step": 98480 + }, + { + "epoch": 1.5809242523957046, + "grad_norm": 2.1368038654327393, + "learning_rate": 5.225516276177681e-06, + "loss": 0.7294, + "step": 98490 + }, + { + "epoch": 1.5810847686158687, + "grad_norm": 1.007791519165039, + "learning_rate": 5.221659560607367e-06, + "loss": 0.7432, + "step": 98500 + }, + { + "epoch": 1.5812452848360325, + "grad_norm": 1.2488136291503906, + "learning_rate": 5.21780410282342e-06, + "loss": 0.711, + "step": 98510 + }, + { + "epoch": 1.5814058010561967, + "grad_norm": 0.9963626861572266, + "learning_rate": 5.213949903071022e-06, + "loss": 0.7563, + "step": 98520 + }, + { + "epoch": 1.5815663172763608, + "grad_norm": 0.8385132551193237, + "learning_rate": 5.210096961595287e-06, + "loss": 0.7527, + "step": 98530 + }, + { + "epoch": 1.5817268334965249, + "grad_norm": 0.8084752559661865, + "learning_rate": 5.2062452786412196e-06, + "loss": 0.7181, + "step": 98540 + }, + { + "epoch": 1.581887349716689, + "grad_norm": 1.283338189125061, + "learning_rate": 5.2023948544537764e-06, + "loss": 0.7609, + "step": 98550 + }, + { + "epoch": 1.5820478659368529, + "grad_norm": 1.3510781526565552, + "learning_rate": 5.198545689277823e-06, + "loss": 0.7053, + "step": 98560 + }, + { + "epoch": 1.582208382157017, + "grad_norm": 1.0810467004776, + "learning_rate": 5.194697783358138e-06, + "loss": 0.7876, + "step": 98570 + }, + { + "epoch": 1.5823688983771809, + "grad_norm": 0.8571422100067139, + "learning_rate": 5.1908511369394285e-06, + "loss": 0.6234, + "step": 98580 + }, + { + "epoch": 1.582529414597345, + "grad_norm": 0.9692324995994568, + "learning_rate": 5.187005750266327e-06, + "loss": 0.6891, + "step": 98590 + }, + { + "epoch": 1.582689930817509, + "grad_norm": 1.049586296081543, + "learning_rate": 5.183161623583363e-06, + "loss": 0.6199, + "step": 98600 + }, + { + "epoch": 1.5828504470376732, + "grad_norm": 0.5550608038902283, + "learning_rate": 5.179318757135004e-06, + "loss": 0.732, + "step": 98610 + }, + { + "epoch": 1.5830109632578373, + "grad_norm": 0.8345915079116821, + "learning_rate": 5.1754771511656395e-06, + "loss": 0.5839, + "step": 98620 + }, + { + "epoch": 1.5831714794780014, + "grad_norm": 1.018566608428955, + "learning_rate": 5.171636805919569e-06, + "loss": 0.6755, + "step": 98630 + }, + { + "epoch": 1.5833319956981653, + "grad_norm": 1.1858022212982178, + "learning_rate": 5.1677977216410155e-06, + "loss": 0.7223, + "step": 98640 + }, + { + "epoch": 1.5834925119183292, + "grad_norm": 1.1055018901824951, + "learning_rate": 5.163959898574131e-06, + "loss": 0.696, + "step": 98650 + }, + { + "epoch": 1.5836530281384933, + "grad_norm": 0.9704524874687195, + "learning_rate": 5.160123336962966e-06, + "loss": 0.7629, + "step": 98660 + }, + { + "epoch": 1.5838135443586574, + "grad_norm": 0.80201655626297, + "learning_rate": 5.156288037051507e-06, + "loss": 0.69, + "step": 98670 + }, + { + "epoch": 1.5839740605788215, + "grad_norm": 1.425971508026123, + "learning_rate": 5.152453999083659e-06, + "loss": 0.6745, + "step": 98680 + }, + { + "epoch": 1.5841345767989856, + "grad_norm": 0.7843067646026611, + "learning_rate": 5.1486212233032435e-06, + "loss": 0.7018, + "step": 98690 + }, + { + "epoch": 1.5842950930191497, + "grad_norm": 1.3137435913085938, + "learning_rate": 5.144789709954001e-06, + "loss": 0.8224, + "step": 98700 + }, + { + "epoch": 1.5844556092393136, + "grad_norm": 1.0077139139175415, + "learning_rate": 5.140959459279604e-06, + "loss": 0.6917, + "step": 98710 + }, + { + "epoch": 1.5846161254594777, + "grad_norm": 0.9486405849456787, + "learning_rate": 5.137130471523616e-06, + "loss": 0.7868, + "step": 98720 + }, + { + "epoch": 1.5847766416796416, + "grad_norm": 2.4560341835021973, + "learning_rate": 5.133302746929547e-06, + "loss": 0.6621, + "step": 98730 + }, + { + "epoch": 1.5849371578998057, + "grad_norm": 0.823319137096405, + "learning_rate": 5.129476285740817e-06, + "loss": 0.7967, + "step": 98740 + }, + { + "epoch": 1.5850976741199698, + "grad_norm": 0.8278446197509766, + "learning_rate": 5.125651088200767e-06, + "loss": 0.7839, + "step": 98750 + }, + { + "epoch": 1.585258190340134, + "grad_norm": 1.955122947692871, + "learning_rate": 5.121827154552658e-06, + "loss": 0.6543, + "step": 98760 + }, + { + "epoch": 1.585418706560298, + "grad_norm": 1.1418437957763672, + "learning_rate": 5.118004485039677e-06, + "loss": 0.7755, + "step": 98770 + }, + { + "epoch": 1.585579222780462, + "grad_norm": 1.1230945587158203, + "learning_rate": 5.1141830799049076e-06, + "loss": 0.7262, + "step": 98780 + }, + { + "epoch": 1.585739739000626, + "grad_norm": 1.4702352285385132, + "learning_rate": 5.110362939391375e-06, + "loss": 0.7518, + "step": 98790 + }, + { + "epoch": 1.58590025522079, + "grad_norm": 1.014237642288208, + "learning_rate": 5.106544063742022e-06, + "loss": 0.7068, + "step": 98800 + }, + { + "epoch": 1.586060771440954, + "grad_norm": 0.8970372676849365, + "learning_rate": 5.102726453199702e-06, + "loss": 0.7058, + "step": 98810 + }, + { + "epoch": 1.5862212876611181, + "grad_norm": 1.5351471900939941, + "learning_rate": 5.098910108007193e-06, + "loss": 0.7235, + "step": 98820 + }, + { + "epoch": 1.5863818038812822, + "grad_norm": 0.8289908170700073, + "learning_rate": 5.095095028407196e-06, + "loss": 0.6792, + "step": 98830 + }, + { + "epoch": 1.5865423201014464, + "grad_norm": 0.8180863261222839, + "learning_rate": 5.091281214642324e-06, + "loss": 0.6563, + "step": 98840 + }, + { + "epoch": 1.5867028363216102, + "grad_norm": 0.8015263676643372, + "learning_rate": 5.087468666955114e-06, + "loss": 0.6285, + "step": 98850 + }, + { + "epoch": 1.5868633525417744, + "grad_norm": 1.485270619392395, + "learning_rate": 5.083657385588031e-06, + "loss": 0.7291, + "step": 98860 + }, + { + "epoch": 1.5870238687619382, + "grad_norm": 1.1467937231063843, + "learning_rate": 5.079847370783433e-06, + "loss": 0.7132, + "step": 98870 + }, + { + "epoch": 1.5871843849821023, + "grad_norm": 0.8940847516059875, + "learning_rate": 5.076038622783621e-06, + "loss": 0.6916, + "step": 98880 + }, + { + "epoch": 1.5873449012022665, + "grad_norm": 1.0093660354614258, + "learning_rate": 5.072231141830814e-06, + "loss": 0.7186, + "step": 98890 + }, + { + "epoch": 1.5875054174224306, + "grad_norm": 1.1714262962341309, + "learning_rate": 5.068424928167139e-06, + "loss": 0.8063, + "step": 98900 + }, + { + "epoch": 1.5876659336425947, + "grad_norm": 1.4631716012954712, + "learning_rate": 5.064619982034654e-06, + "loss": 0.7065, + "step": 98910 + }, + { + "epoch": 1.5878264498627588, + "grad_norm": 1.1297879219055176, + "learning_rate": 5.060816303675331e-06, + "loss": 0.641, + "step": 98920 + }, + { + "epoch": 1.5879869660829227, + "grad_norm": 1.0831282138824463, + "learning_rate": 5.057013893331058e-06, + "loss": 0.7623, + "step": 98930 + }, + { + "epoch": 1.5881474823030868, + "grad_norm": 0.879568338394165, + "learning_rate": 5.053212751243649e-06, + "loss": 0.753, + "step": 98940 + }, + { + "epoch": 1.5883079985232507, + "grad_norm": 1.1364322900772095, + "learning_rate": 5.049412877654833e-06, + "loss": 0.7627, + "step": 98950 + }, + { + "epoch": 1.5884685147434148, + "grad_norm": 1.5245214700698853, + "learning_rate": 5.045614272806265e-06, + "loss": 0.6396, + "step": 98960 + }, + { + "epoch": 1.5886290309635789, + "grad_norm": 0.8061221837997437, + "learning_rate": 5.041816936939503e-06, + "loss": 0.6633, + "step": 98970 + }, + { + "epoch": 1.588789547183743, + "grad_norm": 0.948987603187561, + "learning_rate": 5.038020870296043e-06, + "loss": 0.7115, + "step": 98980 + }, + { + "epoch": 1.588950063403907, + "grad_norm": 1.20833420753479, + "learning_rate": 5.034226073117288e-06, + "loss": 0.671, + "step": 98990 + }, + { + "epoch": 1.589110579624071, + "grad_norm": 0.9390049576759338, + "learning_rate": 5.0304325456445675e-06, + "loss": 0.8106, + "step": 99000 + }, + { + "epoch": 1.589271095844235, + "grad_norm": 0.8340718150138855, + "learning_rate": 5.026640288119131e-06, + "loss": 0.597, + "step": 99010 + }, + { + "epoch": 1.589431612064399, + "grad_norm": 2.426023006439209, + "learning_rate": 5.022849300782145e-06, + "loss": 0.6082, + "step": 99020 + }, + { + "epoch": 1.589592128284563, + "grad_norm": 1.0518980026245117, + "learning_rate": 5.019059583874683e-06, + "loss": 0.7681, + "step": 99030 + }, + { + "epoch": 1.5897526445047272, + "grad_norm": 1.0896880626678467, + "learning_rate": 5.015271137637753e-06, + "loss": 0.7814, + "step": 99040 + }, + { + "epoch": 1.5899131607248913, + "grad_norm": 1.1026703119277954, + "learning_rate": 5.011483962312283e-06, + "loss": 0.761, + "step": 99050 + }, + { + "epoch": 1.5900736769450554, + "grad_norm": 1.3033891916275024, + "learning_rate": 5.007698058139112e-06, + "loss": 0.7377, + "step": 99060 + }, + { + "epoch": 1.5902341931652193, + "grad_norm": 3.0550146102905273, + "learning_rate": 5.003913425359e-06, + "loss": 0.7088, + "step": 99070 + }, + { + "epoch": 1.5903947093853834, + "grad_norm": 0.8024730682373047, + "learning_rate": 5.000130064212633e-06, + "loss": 0.7307, + "step": 99080 + }, + { + "epoch": 1.5905552256055473, + "grad_norm": 1.0000840425491333, + "learning_rate": 4.99634797494061e-06, + "loss": 0.8848, + "step": 99090 + }, + { + "epoch": 1.5907157418257114, + "grad_norm": 1.058182954788208, + "learning_rate": 4.99256715778344e-06, + "loss": 0.716, + "step": 99100 + }, + { + "epoch": 1.5908762580458755, + "grad_norm": 0.9335851669311523, + "learning_rate": 4.9887876129815654e-06, + "loss": 0.6675, + "step": 99110 + }, + { + "epoch": 1.5910367742660396, + "grad_norm": 1.2600529193878174, + "learning_rate": 4.985009340775349e-06, + "loss": 0.6918, + "step": 99120 + }, + { + "epoch": 1.5911972904862037, + "grad_norm": 0.8435268998146057, + "learning_rate": 4.981232341405059e-06, + "loss": 0.7498, + "step": 99130 + }, + { + "epoch": 1.5913578067063678, + "grad_norm": 0.9015218019485474, + "learning_rate": 4.977456615110893e-06, + "loss": 0.7068, + "step": 99140 + }, + { + "epoch": 1.5915183229265317, + "grad_norm": 1.0967084169387817, + "learning_rate": 4.9736821621329755e-06, + "loss": 0.6135, + "step": 99150 + }, + { + "epoch": 1.5916788391466956, + "grad_norm": 1.282639980316162, + "learning_rate": 4.9699089827113216e-06, + "loss": 0.7117, + "step": 99160 + }, + { + "epoch": 1.5918393553668597, + "grad_norm": 1.3498181104660034, + "learning_rate": 4.966137077085892e-06, + "loss": 0.7787, + "step": 99170 + }, + { + "epoch": 1.5919998715870238, + "grad_norm": 0.686840295791626, + "learning_rate": 4.962366445496558e-06, + "loss": 0.6725, + "step": 99180 + }, + { + "epoch": 1.592160387807188, + "grad_norm": 1.1152071952819824, + "learning_rate": 4.958597088183109e-06, + "loss": 0.6924, + "step": 99190 + }, + { + "epoch": 1.592320904027352, + "grad_norm": 1.101924180984497, + "learning_rate": 4.954829005385253e-06, + "loss": 0.754, + "step": 99200 + }, + { + "epoch": 1.5924814202475162, + "grad_norm": 0.7478103637695312, + "learning_rate": 4.951062197342626e-06, + "loss": 0.6421, + "step": 99210 + }, + { + "epoch": 1.59264193646768, + "grad_norm": 0.7739139199256897, + "learning_rate": 4.947296664294762e-06, + "loss": 0.6555, + "step": 99220 + }, + { + "epoch": 1.5928024526878442, + "grad_norm": 1.007941484451294, + "learning_rate": 4.943532406481133e-06, + "loss": 0.6047, + "step": 99230 + }, + { + "epoch": 1.592962968908008, + "grad_norm": 1.0075033903121948, + "learning_rate": 4.939769424141125e-06, + "loss": 0.7101, + "step": 99240 + }, + { + "epoch": 1.5931234851281721, + "grad_norm": 0.8666913509368896, + "learning_rate": 4.936007717514038e-06, + "loss": 0.725, + "step": 99250 + }, + { + "epoch": 1.5932840013483363, + "grad_norm": 1.2565206289291382, + "learning_rate": 4.932247286839095e-06, + "loss": 0.7142, + "step": 99260 + }, + { + "epoch": 1.5934445175685004, + "grad_norm": 0.6425206065177917, + "learning_rate": 4.928488132355446e-06, + "loss": 0.6857, + "step": 99270 + }, + { + "epoch": 1.5936050337886645, + "grad_norm": 1.0815430879592896, + "learning_rate": 4.92473025430214e-06, + "loss": 0.758, + "step": 99280 + }, + { + "epoch": 1.5937655500088284, + "grad_norm": 0.8757620453834534, + "learning_rate": 4.920973652918156e-06, + "loss": 0.6131, + "step": 99290 + }, + { + "epoch": 1.5939260662289925, + "grad_norm": 0.9806878566741943, + "learning_rate": 4.9172183284424e-06, + "loss": 0.6532, + "step": 99300 + }, + { + "epoch": 1.5940865824491564, + "grad_norm": 1.6780285835266113, + "learning_rate": 4.91346428111368e-06, + "loss": 0.6579, + "step": 99310 + }, + { + "epoch": 1.5942470986693205, + "grad_norm": 1.2781504392623901, + "learning_rate": 4.9097115111707395e-06, + "loss": 0.8279, + "step": 99320 + }, + { + "epoch": 1.5944076148894846, + "grad_norm": 0.9321437478065491, + "learning_rate": 4.905960018852235e-06, + "loss": 0.8011, + "step": 99330 + }, + { + "epoch": 1.5945681311096487, + "grad_norm": 1.576486349105835, + "learning_rate": 4.902209804396726e-06, + "loss": 0.6618, + "step": 99340 + }, + { + "epoch": 1.5947286473298128, + "grad_norm": 0.8881083130836487, + "learning_rate": 4.898460868042709e-06, + "loss": 0.5438, + "step": 99350 + }, + { + "epoch": 1.5948891635499767, + "grad_norm": 1.3968174457550049, + "learning_rate": 4.894713210028601e-06, + "loss": 0.7062, + "step": 99360 + }, + { + "epoch": 1.5950496797701408, + "grad_norm": 1.5975170135498047, + "learning_rate": 4.8909668305927244e-06, + "loss": 0.7126, + "step": 99370 + }, + { + "epoch": 1.5952101959903047, + "grad_norm": 0.9760381579399109, + "learning_rate": 4.887221729973329e-06, + "loss": 0.7216, + "step": 99380 + }, + { + "epoch": 1.5953707122104688, + "grad_norm": 1.0473971366882324, + "learning_rate": 4.883477908408582e-06, + "loss": 0.6592, + "step": 99390 + }, + { + "epoch": 1.5955312284306329, + "grad_norm": 1.319753885269165, + "learning_rate": 4.879735366136575e-06, + "loss": 0.6048, + "step": 99400 + }, + { + "epoch": 1.595691744650797, + "grad_norm": 1.2574800252914429, + "learning_rate": 4.875994103395298e-06, + "loss": 0.5859, + "step": 99410 + }, + { + "epoch": 1.595852260870961, + "grad_norm": 1.012681007385254, + "learning_rate": 4.872254120422678e-06, + "loss": 0.6426, + "step": 99420 + }, + { + "epoch": 1.5960127770911252, + "grad_norm": 1.4065415859222412, + "learning_rate": 4.868515417456562e-06, + "loss": 0.7083, + "step": 99430 + }, + { + "epoch": 1.596173293311289, + "grad_norm": 1.409090280532837, + "learning_rate": 4.864777994734704e-06, + "loss": 0.754, + "step": 99440 + }, + { + "epoch": 1.5963338095314532, + "grad_norm": 1.3372480869293213, + "learning_rate": 4.861041852494783e-06, + "loss": 0.6789, + "step": 99450 + }, + { + "epoch": 1.596494325751617, + "grad_norm": 0.8875026106834412, + "learning_rate": 4.857306990974397e-06, + "loss": 0.7218, + "step": 99460 + }, + { + "epoch": 1.5966548419717812, + "grad_norm": 1.060035228729248, + "learning_rate": 4.853573410411061e-06, + "loss": 0.7805, + "step": 99470 + }, + { + "epoch": 1.5968153581919453, + "grad_norm": 0.9655343890190125, + "learning_rate": 4.849841111042214e-06, + "loss": 0.7509, + "step": 99480 + }, + { + "epoch": 1.5969758744121094, + "grad_norm": 1.3266147375106812, + "learning_rate": 4.8461100931051966e-06, + "loss": 0.7111, + "step": 99490 + }, + { + "epoch": 1.5971363906322735, + "grad_norm": 1.058754324913025, + "learning_rate": 4.842380356837287e-06, + "loss": 0.714, + "step": 99500 + }, + { + "epoch": 1.5972969068524374, + "grad_norm": 1.5017905235290527, + "learning_rate": 4.838651902475672e-06, + "loss": 0.6586, + "step": 99510 + }, + { + "epoch": 1.5974574230726015, + "grad_norm": 1.14238703250885, + "learning_rate": 4.834924730257462e-06, + "loss": 0.8167, + "step": 99520 + }, + { + "epoch": 1.5976179392927654, + "grad_norm": 1.5302749872207642, + "learning_rate": 4.8311988404196816e-06, + "loss": 0.7186, + "step": 99530 + }, + { + "epoch": 1.5977784555129295, + "grad_norm": 1.5172010660171509, + "learning_rate": 4.827474233199278e-06, + "loss": 0.862, + "step": 99540 + }, + { + "epoch": 1.5979389717330936, + "grad_norm": 1.0152384042739868, + "learning_rate": 4.823750908833111e-06, + "loss": 0.6673, + "step": 99550 + }, + { + "epoch": 1.5980994879532577, + "grad_norm": 1.6900079250335693, + "learning_rate": 4.820028867557963e-06, + "loss": 0.7257, + "step": 99560 + }, + { + "epoch": 1.5982600041734218, + "grad_norm": 1.0331071615219116, + "learning_rate": 4.816308109610534e-06, + "loss": 0.7211, + "step": 99570 + }, + { + "epoch": 1.5984205203935857, + "grad_norm": 1.0426617860794067, + "learning_rate": 4.812588635227453e-06, + "loss": 0.7801, + "step": 99580 + }, + { + "epoch": 1.5985810366137498, + "grad_norm": 0.7314715385437012, + "learning_rate": 4.808870444645239e-06, + "loss": 0.6642, + "step": 99590 + }, + { + "epoch": 1.5987415528339137, + "grad_norm": 0.9289103746414185, + "learning_rate": 4.805153538100351e-06, + "loss": 0.7115, + "step": 99600 + }, + { + "epoch": 1.5989020690540778, + "grad_norm": 1.105062484741211, + "learning_rate": 4.8014379158291715e-06, + "loss": 0.6671, + "step": 99610 + }, + { + "epoch": 1.599062585274242, + "grad_norm": 1.0984423160552979, + "learning_rate": 4.797723578067983e-06, + "loss": 0.5944, + "step": 99620 + }, + { + "epoch": 1.599223101494406, + "grad_norm": 1.5823354721069336, + "learning_rate": 4.794010525053002e-06, + "loss": 0.7634, + "step": 99630 + }, + { + "epoch": 1.5993836177145702, + "grad_norm": 1.3555736541748047, + "learning_rate": 4.7902987570203535e-06, + "loss": 0.6112, + "step": 99640 + }, + { + "epoch": 1.5995441339347343, + "grad_norm": 1.1449583768844604, + "learning_rate": 4.786588274206091e-06, + "loss": 0.7717, + "step": 99650 + }, + { + "epoch": 1.5997046501548982, + "grad_norm": 1.2124940156936646, + "learning_rate": 4.782879076846169e-06, + "loss": 0.6831, + "step": 99660 + }, + { + "epoch": 1.599865166375062, + "grad_norm": 1.0184470415115356, + "learning_rate": 4.7791711651764745e-06, + "loss": 0.7037, + "step": 99670 + }, + { + "epoch": 1.6000256825952262, + "grad_norm": 1.242661476135254, + "learning_rate": 4.775464539432808e-06, + "loss": 0.6097, + "step": 99680 + }, + { + "epoch": 1.6001861988153903, + "grad_norm": 1.1193463802337646, + "learning_rate": 4.771759199850892e-06, + "loss": 0.7883, + "step": 99690 + }, + { + "epoch": 1.6003467150355544, + "grad_norm": 0.8020848035812378, + "learning_rate": 4.768055146666364e-06, + "loss": 0.7072, + "step": 99700 + }, + { + "epoch": 1.6005072312557185, + "grad_norm": 0.9811034798622131, + "learning_rate": 4.764352380114787e-06, + "loss": 0.6382, + "step": 99710 + }, + { + "epoch": 1.6006677474758826, + "grad_norm": 0.8102268576622009, + "learning_rate": 4.760650900431618e-06, + "loss": 0.7292, + "step": 99720 + }, + { + "epoch": 1.6008282636960465, + "grad_norm": 1.1230276823043823, + "learning_rate": 4.756950707852259e-06, + "loss": 0.6729, + "step": 99730 + }, + { + "epoch": 1.6009887799162106, + "grad_norm": 1.2580809593200684, + "learning_rate": 4.753251802612021e-06, + "loss": 0.6582, + "step": 99740 + }, + { + "epoch": 1.6011492961363745, + "grad_norm": 1.0702877044677734, + "learning_rate": 4.749554184946134e-06, + "loss": 0.6437, + "step": 99750 + }, + { + "epoch": 1.6013098123565386, + "grad_norm": 0.8230314254760742, + "learning_rate": 4.745857855089739e-06, + "loss": 0.8256, + "step": 99760 + }, + { + "epoch": 1.6014703285767027, + "grad_norm": 1.0614745616912842, + "learning_rate": 4.742162813277915e-06, + "loss": 0.6953, + "step": 99770 + }, + { + "epoch": 1.6016308447968668, + "grad_norm": 1.2326563596725464, + "learning_rate": 4.738469059745626e-06, + "loss": 0.6362, + "step": 99780 + }, + { + "epoch": 1.601791361017031, + "grad_norm": 5.584920883178711, + "learning_rate": 4.7347765947277804e-06, + "loss": 0.684, + "step": 99790 + }, + { + "epoch": 1.6019518772371948, + "grad_norm": 0.9775304794311523, + "learning_rate": 4.7310854184592e-06, + "loss": 0.7769, + "step": 99800 + }, + { + "epoch": 1.602112393457359, + "grad_norm": 1.2598379850387573, + "learning_rate": 4.7273955311746224e-06, + "loss": 0.7292, + "step": 99810 + }, + { + "epoch": 1.6022729096775228, + "grad_norm": 0.9813770055770874, + "learning_rate": 4.723706933108699e-06, + "loss": 0.6898, + "step": 99820 + }, + { + "epoch": 1.602433425897687, + "grad_norm": 1.024337887763977, + "learning_rate": 4.720019624496011e-06, + "loss": 0.6894, + "step": 99830 + }, + { + "epoch": 1.602593942117851, + "grad_norm": 0.9774400591850281, + "learning_rate": 4.716333605571038e-06, + "loss": 0.6318, + "step": 99840 + }, + { + "epoch": 1.602754458338015, + "grad_norm": 1.0224798917770386, + "learning_rate": 4.712648876568193e-06, + "loss": 0.6632, + "step": 99850 + }, + { + "epoch": 1.6029149745581792, + "grad_norm": 1.2869518995285034, + "learning_rate": 4.708965437721808e-06, + "loss": 0.7777, + "step": 99860 + }, + { + "epoch": 1.603075490778343, + "grad_norm": 1.3323580026626587, + "learning_rate": 4.705283289266121e-06, + "loss": 0.671, + "step": 99870 + }, + { + "epoch": 1.6032360069985072, + "grad_norm": 1.3208141326904297, + "learning_rate": 4.7016024314353004e-06, + "loss": 0.8169, + "step": 99880 + }, + { + "epoch": 1.603396523218671, + "grad_norm": 1.089634895324707, + "learning_rate": 4.697922864463427e-06, + "loss": 0.7089, + "step": 99890 + }, + { + "epoch": 1.6035570394388352, + "grad_norm": 0.9406909942626953, + "learning_rate": 4.694244588584504e-06, + "loss": 0.5877, + "step": 99900 + }, + { + "epoch": 1.6037175556589993, + "grad_norm": 0.863336443901062, + "learning_rate": 4.6905676040324365e-06, + "loss": 0.7237, + "step": 99910 + }, + { + "epoch": 1.6038780718791634, + "grad_norm": 0.8546100854873657, + "learning_rate": 4.686891911041063e-06, + "loss": 0.7232, + "step": 99920 + }, + { + "epoch": 1.6040385880993275, + "grad_norm": 1.0182278156280518, + "learning_rate": 4.68321750984414e-06, + "loss": 0.7479, + "step": 99930 + }, + { + "epoch": 1.6041991043194916, + "grad_norm": 1.1701698303222656, + "learning_rate": 4.679544400675334e-06, + "loss": 0.7329, + "step": 99940 + }, + { + "epoch": 1.6043596205396555, + "grad_norm": 0.8611049652099609, + "learning_rate": 4.6758725837682374e-06, + "loss": 0.7147, + "step": 99950 + }, + { + "epoch": 1.6045201367598194, + "grad_norm": 0.8746853470802307, + "learning_rate": 4.672202059356359e-06, + "loss": 0.6564, + "step": 99960 + }, + { + "epoch": 1.6046806529799835, + "grad_norm": 1.279219388961792, + "learning_rate": 4.668532827673114e-06, + "loss": 0.6292, + "step": 99970 + }, + { + "epoch": 1.6048411692001476, + "grad_norm": 0.8608855605125427, + "learning_rate": 4.6648648889518445e-06, + "loss": 0.8092, + "step": 99980 + }, + { + "epoch": 1.6050016854203117, + "grad_norm": 1.4731543064117432, + "learning_rate": 4.661198243425813e-06, + "loss": 0.6338, + "step": 99990 + }, + { + "epoch": 1.6051622016404758, + "grad_norm": 1.196329116821289, + "learning_rate": 4.657532891328198e-06, + "loss": 0.7618, + "step": 100000 + }, + { + "epoch": 1.6051622016404758, + "eval_loss": 0.7700876593589783, + "eval_runtime": 1833.9926, + "eval_samples_per_second": 14.303, + "eval_steps_per_second": 1.788, + "step": 100000 + }, + { + "epoch": 1.60532271786064, + "grad_norm": 1.0522372722625732, + "learning_rate": 4.653868832892092e-06, + "loss": 0.6266, + "step": 100010 + }, + { + "epoch": 1.6054832340808038, + "grad_norm": 1.413068413734436, + "learning_rate": 4.650206068350515e-06, + "loss": 0.8071, + "step": 100020 + }, + { + "epoch": 1.605643750300968, + "grad_norm": 0.8338518738746643, + "learning_rate": 4.646544597936387e-06, + "loss": 0.7734, + "step": 100030 + }, + { + "epoch": 1.6058042665211318, + "grad_norm": 0.9577062726020813, + "learning_rate": 4.642884421882557e-06, + "loss": 0.6547, + "step": 100040 + }, + { + "epoch": 1.605964782741296, + "grad_norm": 0.9507196545600891, + "learning_rate": 4.639225540421793e-06, + "loss": 0.7081, + "step": 100050 + }, + { + "epoch": 1.60612529896146, + "grad_norm": 1.12337064743042, + "learning_rate": 4.635567953786782e-06, + "loss": 0.8417, + "step": 100060 + }, + { + "epoch": 1.6062858151816242, + "grad_norm": 1.7995468378067017, + "learning_rate": 4.63191166221012e-06, + "loss": 0.7203, + "step": 100070 + }, + { + "epoch": 1.6064463314017883, + "grad_norm": 0.8575416207313538, + "learning_rate": 4.628256665924327e-06, + "loss": 0.7114, + "step": 100080 + }, + { + "epoch": 1.6066068476219522, + "grad_norm": 1.2612031698226929, + "learning_rate": 4.624602965161842e-06, + "loss": 0.6949, + "step": 100090 + }, + { + "epoch": 1.6067673638421163, + "grad_norm": 1.8471190929412842, + "learning_rate": 4.620950560155016e-06, + "loss": 0.6535, + "step": 100100 + }, + { + "epoch": 1.6069278800622802, + "grad_norm": 1.1186052560806274, + "learning_rate": 4.617299451136128e-06, + "loss": 0.6605, + "step": 100110 + }, + { + "epoch": 1.6070883962824443, + "grad_norm": 1.0011647939682007, + "learning_rate": 4.613649638337353e-06, + "loss": 0.7791, + "step": 100120 + }, + { + "epoch": 1.6072489125026084, + "grad_norm": 1.2796622514724731, + "learning_rate": 4.610001121990806e-06, + "loss": 0.621, + "step": 100130 + }, + { + "epoch": 1.6074094287227725, + "grad_norm": 0.9424450397491455, + "learning_rate": 4.606353902328511e-06, + "loss": 0.7136, + "step": 100140 + }, + { + "epoch": 1.6075699449429366, + "grad_norm": 0.8921444416046143, + "learning_rate": 4.60270797958241e-06, + "loss": 0.7347, + "step": 100150 + }, + { + "epoch": 1.6077304611631005, + "grad_norm": 1.0023716688156128, + "learning_rate": 4.599063353984362e-06, + "loss": 0.7541, + "step": 100160 + }, + { + "epoch": 1.6078909773832646, + "grad_norm": 1.4474730491638184, + "learning_rate": 4.595420025766142e-06, + "loss": 0.8036, + "step": 100170 + }, + { + "epoch": 1.6080514936034285, + "grad_norm": 1.1066737174987793, + "learning_rate": 4.591777995159446e-06, + "loss": 0.5894, + "step": 100180 + }, + { + "epoch": 1.6082120098235926, + "grad_norm": 1.5988589525222778, + "learning_rate": 4.588137262395884e-06, + "loss": 0.6521, + "step": 100190 + }, + { + "epoch": 1.6083725260437567, + "grad_norm": 1.0305598974227905, + "learning_rate": 4.584497827706988e-06, + "loss": 0.6719, + "step": 100200 + }, + { + "epoch": 1.6085330422639208, + "grad_norm": 0.9771848917007446, + "learning_rate": 4.58085969132421e-06, + "loss": 0.7098, + "step": 100210 + }, + { + "epoch": 1.608693558484085, + "grad_norm": 2.507660150527954, + "learning_rate": 4.577222853478902e-06, + "loss": 0.7159, + "step": 100220 + }, + { + "epoch": 1.608854074704249, + "grad_norm": 1.1889804601669312, + "learning_rate": 4.5735873144023475e-06, + "loss": 0.6647, + "step": 100230 + }, + { + "epoch": 1.609014590924413, + "grad_norm": 0.7319830060005188, + "learning_rate": 4.569953074325753e-06, + "loss": 0.6884, + "step": 100240 + }, + { + "epoch": 1.609175107144577, + "grad_norm": 0.892335832118988, + "learning_rate": 4.566320133480229e-06, + "loss": 0.7616, + "step": 100250 + }, + { + "epoch": 1.609335623364741, + "grad_norm": 0.8316525816917419, + "learning_rate": 4.562688492096809e-06, + "loss": 0.7136, + "step": 100260 + }, + { + "epoch": 1.609496139584905, + "grad_norm": 1.4154915809631348, + "learning_rate": 4.559058150406453e-06, + "loss": 0.6943, + "step": 100270 + }, + { + "epoch": 1.6096566558050691, + "grad_norm": 1.184730052947998, + "learning_rate": 4.5554291086400195e-06, + "loss": 0.7405, + "step": 100280 + }, + { + "epoch": 1.6098171720252332, + "grad_norm": 2.054452419281006, + "learning_rate": 4.551801367028294e-06, + "loss": 0.8209, + "step": 100290 + }, + { + "epoch": 1.6099776882453973, + "grad_norm": 1.295472502708435, + "learning_rate": 4.548174925801985e-06, + "loss": 0.6899, + "step": 100300 + }, + { + "epoch": 1.6101382044655612, + "grad_norm": 2.193364381790161, + "learning_rate": 4.54454978519171e-06, + "loss": 0.7297, + "step": 100310 + }, + { + "epoch": 1.6102987206857253, + "grad_norm": 1.5480071306228638, + "learning_rate": 4.540925945428007e-06, + "loss": 0.7417, + "step": 100320 + }, + { + "epoch": 1.6104592369058892, + "grad_norm": 1.395808219909668, + "learning_rate": 4.537303406741336e-06, + "loss": 0.643, + "step": 100330 + }, + { + "epoch": 1.6106197531260533, + "grad_norm": 1.0432004928588867, + "learning_rate": 4.53368216936206e-06, + "loss": 0.7513, + "step": 100340 + }, + { + "epoch": 1.6107802693462174, + "grad_norm": 1.5045323371887207, + "learning_rate": 4.530062233520474e-06, + "loss": 0.7994, + "step": 100350 + }, + { + "epoch": 1.6109407855663815, + "grad_norm": 1.197286605834961, + "learning_rate": 4.526443599446781e-06, + "loss": 0.6753, + "step": 100360 + }, + { + "epoch": 1.6111013017865456, + "grad_norm": 0.8020853996276855, + "learning_rate": 4.522826267371108e-06, + "loss": 0.6867, + "step": 100370 + }, + { + "epoch": 1.6112618180067095, + "grad_norm": 0.9200521111488342, + "learning_rate": 4.5192102375234965e-06, + "loss": 0.6569, + "step": 100380 + }, + { + "epoch": 1.6114223342268736, + "grad_norm": 0.9097069501876831, + "learning_rate": 4.5155955101339104e-06, + "loss": 0.6593, + "step": 100390 + }, + { + "epoch": 1.6115828504470375, + "grad_norm": 1.356813907623291, + "learning_rate": 4.511982085432212e-06, + "loss": 0.7922, + "step": 100400 + }, + { + "epoch": 1.6117433666672016, + "grad_norm": 0.8484895825386047, + "learning_rate": 4.508369963648202e-06, + "loss": 0.8389, + "step": 100410 + }, + { + "epoch": 1.6119038828873657, + "grad_norm": 1.389588475227356, + "learning_rate": 4.5047591450115865e-06, + "loss": 0.6112, + "step": 100420 + }, + { + "epoch": 1.6120643991075299, + "grad_norm": 0.8653028011322021, + "learning_rate": 4.501149629751994e-06, + "loss": 0.7071, + "step": 100430 + }, + { + "epoch": 1.612224915327694, + "grad_norm": 1.3777717351913452, + "learning_rate": 4.497541418098969e-06, + "loss": 0.8109, + "step": 100440 + }, + { + "epoch": 1.612385431547858, + "grad_norm": 1.0270779132843018, + "learning_rate": 4.493934510281975e-06, + "loss": 0.6293, + "step": 100450 + }, + { + "epoch": 1.612545947768022, + "grad_norm": 1.0106592178344727, + "learning_rate": 4.490328906530391e-06, + "loss": 0.707, + "step": 100460 + }, + { + "epoch": 1.6127064639881858, + "grad_norm": 1.1433676481246948, + "learning_rate": 4.486724607073506e-06, + "loss": 0.59, + "step": 100470 + }, + { + "epoch": 1.61286698020835, + "grad_norm": 1.1009917259216309, + "learning_rate": 4.483121612140534e-06, + "loss": 0.7414, + "step": 100480 + }, + { + "epoch": 1.613027496428514, + "grad_norm": 1.2331568002700806, + "learning_rate": 4.479519921960606e-06, + "loss": 0.746, + "step": 100490 + }, + { + "epoch": 1.6131880126486782, + "grad_norm": 1.0284851789474487, + "learning_rate": 4.475919536762765e-06, + "loss": 0.7215, + "step": 100500 + }, + { + "epoch": 1.6133485288688423, + "grad_norm": 0.9880824685096741, + "learning_rate": 4.4723204567759805e-06, + "loss": 0.8104, + "step": 100510 + }, + { + "epoch": 1.6135090450890064, + "grad_norm": 1.1608130931854248, + "learning_rate": 4.468722682229138e-06, + "loss": 0.8316, + "step": 100520 + }, + { + "epoch": 1.6136695613091703, + "grad_norm": 1.183912754058838, + "learning_rate": 4.465126213351015e-06, + "loss": 0.7516, + "step": 100530 + }, + { + "epoch": 1.6138300775293344, + "grad_norm": 1.6752601861953735, + "learning_rate": 4.461531050370341e-06, + "loss": 0.6526, + "step": 100540 + }, + { + "epoch": 1.6139905937494983, + "grad_norm": 1.109244704246521, + "learning_rate": 4.45793719351574e-06, + "loss": 0.7364, + "step": 100550 + }, + { + "epoch": 1.6141511099696624, + "grad_norm": 1.0165975093841553, + "learning_rate": 4.454344643015768e-06, + "loss": 0.7321, + "step": 100560 + }, + { + "epoch": 1.6143116261898265, + "grad_norm": 1.2159608602523804, + "learning_rate": 4.450753399098883e-06, + "loss": 0.7017, + "step": 100570 + }, + { + "epoch": 1.6144721424099906, + "grad_norm": 1.3469719886779785, + "learning_rate": 4.447163461993478e-06, + "loss": 0.7458, + "step": 100580 + }, + { + "epoch": 1.6146326586301547, + "grad_norm": 1.418602466583252, + "learning_rate": 4.443574831927838e-06, + "loss": 0.7417, + "step": 100590 + }, + { + "epoch": 1.6147931748503186, + "grad_norm": 0.8007618188858032, + "learning_rate": 4.439987509130183e-06, + "loss": 0.7119, + "step": 100600 + }, + { + "epoch": 1.6149536910704827, + "grad_norm": 1.0355859994888306, + "learning_rate": 4.436401493828649e-06, + "loss": 0.7347, + "step": 100610 + }, + { + "epoch": 1.6151142072906466, + "grad_norm": 1.1427886486053467, + "learning_rate": 4.4328167862512815e-06, + "loss": 0.6831, + "step": 100620 + }, + { + "epoch": 1.6152747235108107, + "grad_norm": 0.8363175392150879, + "learning_rate": 4.429233386626053e-06, + "loss": 0.6099, + "step": 100630 + }, + { + "epoch": 1.6154352397309748, + "grad_norm": 0.9555826187133789, + "learning_rate": 4.4256512951808465e-06, + "loss": 0.6801, + "step": 100640 + }, + { + "epoch": 1.615595755951139, + "grad_norm": 1.7636961936950684, + "learning_rate": 4.422070512143453e-06, + "loss": 0.6379, + "step": 100650 + }, + { + "epoch": 1.615756272171303, + "grad_norm": 0.8306326270103455, + "learning_rate": 4.418491037741593e-06, + "loss": 0.6654, + "step": 100660 + }, + { + "epoch": 1.615916788391467, + "grad_norm": 0.8593183755874634, + "learning_rate": 4.414912872202903e-06, + "loss": 0.8674, + "step": 100670 + }, + { + "epoch": 1.616077304611631, + "grad_norm": 0.944892942905426, + "learning_rate": 4.411336015754933e-06, + "loss": 0.6562, + "step": 100680 + }, + { + "epoch": 1.616237820831795, + "grad_norm": 1.020197868347168, + "learning_rate": 4.40776046862515e-06, + "loss": 0.6743, + "step": 100690 + }, + { + "epoch": 1.616398337051959, + "grad_norm": 0.698323130607605, + "learning_rate": 4.404186231040935e-06, + "loss": 0.6518, + "step": 100700 + }, + { + "epoch": 1.6165588532721231, + "grad_norm": 1.0419644117355347, + "learning_rate": 4.400613303229592e-06, + "loss": 0.7085, + "step": 100710 + }, + { + "epoch": 1.6167193694922872, + "grad_norm": 0.8853646516799927, + "learning_rate": 4.397041685418335e-06, + "loss": 0.6845, + "step": 100720 + }, + { + "epoch": 1.6168798857124513, + "grad_norm": 1.2374937534332275, + "learning_rate": 4.393471377834307e-06, + "loss": 0.7584, + "step": 100730 + }, + { + "epoch": 1.6170404019326154, + "grad_norm": 0.8966052532196045, + "learning_rate": 4.389902380704544e-06, + "loss": 0.8093, + "step": 100740 + }, + { + "epoch": 1.6172009181527793, + "grad_norm": 1.2711752653121948, + "learning_rate": 4.386334694256022e-06, + "loss": 0.6483, + "step": 100750 + }, + { + "epoch": 1.6173614343729434, + "grad_norm": 1.0064750909805298, + "learning_rate": 4.3827683187156245e-06, + "loss": 0.6841, + "step": 100760 + }, + { + "epoch": 1.6175219505931073, + "grad_norm": 1.171547770500183, + "learning_rate": 4.379203254310149e-06, + "loss": 0.7404, + "step": 100770 + }, + { + "epoch": 1.6176824668132714, + "grad_norm": 0.9238327741622925, + "learning_rate": 4.375639501266316e-06, + "loss": 0.683, + "step": 100780 + }, + { + "epoch": 1.6178429830334355, + "grad_norm": 1.239418387413025, + "learning_rate": 4.372077059810759e-06, + "loss": 0.7321, + "step": 100790 + }, + { + "epoch": 1.6180034992535997, + "grad_norm": 1.2559711933135986, + "learning_rate": 4.3685159301700255e-06, + "loss": 0.6181, + "step": 100800 + }, + { + "epoch": 1.6181640154737638, + "grad_norm": 1.5403673648834229, + "learning_rate": 4.364956112570584e-06, + "loss": 0.5993, + "step": 100810 + }, + { + "epoch": 1.6183245316939276, + "grad_norm": 0.975688636302948, + "learning_rate": 4.361397607238818e-06, + "loss": 0.69, + "step": 100820 + }, + { + "epoch": 1.6184850479140918, + "grad_norm": 1.4351831674575806, + "learning_rate": 4.357840414401035e-06, + "loss": 0.6299, + "step": 100830 + }, + { + "epoch": 1.6186455641342556, + "grad_norm": 0.8853704333305359, + "learning_rate": 4.35428453428344e-06, + "loss": 0.5936, + "step": 100840 + }, + { + "epoch": 1.6188060803544198, + "grad_norm": 0.9934795498847961, + "learning_rate": 4.350729967112169e-06, + "loss": 0.6908, + "step": 100850 + }, + { + "epoch": 1.6189665965745839, + "grad_norm": 1.1305125951766968, + "learning_rate": 4.347176713113271e-06, + "loss": 0.6979, + "step": 100860 + }, + { + "epoch": 1.619127112794748, + "grad_norm": 0.9508997201919556, + "learning_rate": 4.3436247725127165e-06, + "loss": 0.7652, + "step": 100870 + }, + { + "epoch": 1.619287629014912, + "grad_norm": 0.7611475586891174, + "learning_rate": 4.340074145536385e-06, + "loss": 0.7149, + "step": 100880 + }, + { + "epoch": 1.619448145235076, + "grad_norm": 1.017810344696045, + "learning_rate": 4.336524832410085e-06, + "loss": 0.7486, + "step": 100890 + }, + { + "epoch": 1.61960866145524, + "grad_norm": 1.1585685014724731, + "learning_rate": 4.332976833359515e-06, + "loss": 0.6544, + "step": 100900 + }, + { + "epoch": 1.619769177675404, + "grad_norm": 0.9765130877494812, + "learning_rate": 4.329430148610317e-06, + "loss": 0.6439, + "step": 100910 + }, + { + "epoch": 1.619929693895568, + "grad_norm": 0.9854376912117004, + "learning_rate": 4.325884778388037e-06, + "loss": 0.7222, + "step": 100920 + }, + { + "epoch": 1.6200902101157322, + "grad_norm": 1.307620644569397, + "learning_rate": 4.322340722918139e-06, + "loss": 0.7671, + "step": 100930 + }, + { + "epoch": 1.6202507263358963, + "grad_norm": 0.9287770986557007, + "learning_rate": 4.3187979824260065e-06, + "loss": 0.7098, + "step": 100940 + }, + { + "epoch": 1.6204112425560604, + "grad_norm": 0.8236685395240784, + "learning_rate": 4.3152565571369405e-06, + "loss": 0.5879, + "step": 100950 + }, + { + "epoch": 1.6205717587762245, + "grad_norm": 0.8006654977798462, + "learning_rate": 4.311716447276146e-06, + "loss": 0.63, + "step": 100960 + }, + { + "epoch": 1.6207322749963884, + "grad_norm": 0.9340654611587524, + "learning_rate": 4.308177653068757e-06, + "loss": 0.7028, + "step": 100970 + }, + { + "epoch": 1.6208927912165523, + "grad_norm": 0.6653180122375488, + "learning_rate": 4.304640174739821e-06, + "loss": 0.7462, + "step": 100980 + }, + { + "epoch": 1.6210533074367164, + "grad_norm": 0.9342182874679565, + "learning_rate": 4.3011040125143006e-06, + "loss": 0.7272, + "step": 100990 + }, + { + "epoch": 1.6212138236568805, + "grad_norm": 1.005603313446045, + "learning_rate": 4.297569166617074e-06, + "loss": 0.6213, + "step": 101000 + }, + { + "epoch": 1.6213743398770446, + "grad_norm": 1.2557568550109863, + "learning_rate": 4.294035637272939e-06, + "loss": 0.6956, + "step": 101010 + }, + { + "epoch": 1.6215348560972087, + "grad_norm": 0.9349071979522705, + "learning_rate": 4.29050342470661e-06, + "loss": 0.7398, + "step": 101020 + }, + { + "epoch": 1.6216953723173728, + "grad_norm": 1.2664916515350342, + "learning_rate": 4.286972529142705e-06, + "loss": 0.6955, + "step": 101030 + }, + { + "epoch": 1.6218558885375367, + "grad_norm": 1.259757399559021, + "learning_rate": 4.2834429508057745e-06, + "loss": 0.7315, + "step": 101040 + }, + { + "epoch": 1.6220164047577008, + "grad_norm": 0.8353701829910278, + "learning_rate": 4.279914689920278e-06, + "loss": 0.7824, + "step": 101050 + }, + { + "epoch": 1.6221769209778647, + "grad_norm": 0.7776173949241638, + "learning_rate": 4.276387746710594e-06, + "loss": 0.6742, + "step": 101060 + }, + { + "epoch": 1.6223374371980288, + "grad_norm": 0.7724810838699341, + "learning_rate": 4.272862121401014e-06, + "loss": 0.6966, + "step": 101070 + }, + { + "epoch": 1.622497953418193, + "grad_norm": 1.267045021057129, + "learning_rate": 4.269337814215751e-06, + "loss": 0.7414, + "step": 101080 + }, + { + "epoch": 1.622658469638357, + "grad_norm": 0.6631287932395935, + "learning_rate": 4.2658148253789225e-06, + "loss": 0.6679, + "step": 101090 + }, + { + "epoch": 1.6228189858585211, + "grad_norm": 0.8670876026153564, + "learning_rate": 4.262293155114572e-06, + "loss": 0.6788, + "step": 101100 + }, + { + "epoch": 1.622979502078685, + "grad_norm": 1.300228238105774, + "learning_rate": 4.25877280364666e-06, + "loss": 0.627, + "step": 101110 + }, + { + "epoch": 1.6231400182988491, + "grad_norm": 1.2741615772247314, + "learning_rate": 4.2552537711990585e-06, + "loss": 0.761, + "step": 101120 + }, + { + "epoch": 1.623300534519013, + "grad_norm": 1.2264844179153442, + "learning_rate": 4.251736057995559e-06, + "loss": 0.5689, + "step": 101130 + }, + { + "epoch": 1.6234610507391771, + "grad_norm": 1.0804647207260132, + "learning_rate": 4.248219664259873e-06, + "loss": 0.6414, + "step": 101140 + }, + { + "epoch": 1.6236215669593412, + "grad_norm": 0.8038386106491089, + "learning_rate": 4.2447045902156064e-06, + "loss": 0.6898, + "step": 101150 + }, + { + "epoch": 1.6237820831795053, + "grad_norm": 1.4421110153198242, + "learning_rate": 4.241190836086309e-06, + "loss": 0.7228, + "step": 101160 + }, + { + "epoch": 1.6239425993996695, + "grad_norm": 1.2448352575302124, + "learning_rate": 4.237678402095432e-06, + "loss": 0.7353, + "step": 101170 + }, + { + "epoch": 1.6241031156198333, + "grad_norm": 1.1443517208099365, + "learning_rate": 4.234167288466345e-06, + "loss": 0.8282, + "step": 101180 + }, + { + "epoch": 1.6242636318399974, + "grad_norm": 0.7843833565711975, + "learning_rate": 4.230657495422338e-06, + "loss": 0.7047, + "step": 101190 + }, + { + "epoch": 1.6244241480601613, + "grad_norm": 1.671467900276184, + "learning_rate": 4.227149023186616e-06, + "loss": 0.6405, + "step": 101200 + }, + { + "epoch": 1.6245846642803254, + "grad_norm": 1.384752869606018, + "learning_rate": 4.223641871982284e-06, + "loss": 0.6639, + "step": 101210 + }, + { + "epoch": 1.6247451805004895, + "grad_norm": 1.5453277826309204, + "learning_rate": 4.220136042032385e-06, + "loss": 0.6797, + "step": 101220 + }, + { + "epoch": 1.6249056967206537, + "grad_norm": 0.8407182693481445, + "learning_rate": 4.21663153355987e-06, + "loss": 0.7981, + "step": 101230 + }, + { + "epoch": 1.6250662129408178, + "grad_norm": 0.7532543540000916, + "learning_rate": 4.2131283467876005e-06, + "loss": 0.7989, + "step": 101240 + }, + { + "epoch": 1.6252267291609819, + "grad_norm": 0.9047394394874573, + "learning_rate": 4.209626481938364e-06, + "loss": 0.7035, + "step": 101250 + }, + { + "epoch": 1.6253872453811458, + "grad_norm": 1.1244343519210815, + "learning_rate": 4.2061259392348564e-06, + "loss": 0.6499, + "step": 101260 + }, + { + "epoch": 1.6255477616013096, + "grad_norm": 1.0557782649993896, + "learning_rate": 4.202626718899694e-06, + "loss": 0.6786, + "step": 101270 + }, + { + "epoch": 1.6257082778214738, + "grad_norm": 1.434529423713684, + "learning_rate": 4.199128821155401e-06, + "loss": 0.6523, + "step": 101280 + }, + { + "epoch": 1.6258687940416379, + "grad_norm": 1.3462060689926147, + "learning_rate": 4.195632246224426e-06, + "loss": 0.754, + "step": 101290 + }, + { + "epoch": 1.626029310261802, + "grad_norm": 1.3311606645584106, + "learning_rate": 4.192136994329129e-06, + "loss": 0.716, + "step": 101300 + }, + { + "epoch": 1.626189826481966, + "grad_norm": 1.8419970273971558, + "learning_rate": 4.188643065691791e-06, + "loss": 0.7978, + "step": 101310 + }, + { + "epoch": 1.6263503427021302, + "grad_norm": 1.0429348945617676, + "learning_rate": 4.185150460534604e-06, + "loss": 0.7616, + "step": 101320 + }, + { + "epoch": 1.626510858922294, + "grad_norm": 0.8841722011566162, + "learning_rate": 4.181659179079677e-06, + "loss": 0.7489, + "step": 101330 + }, + { + "epoch": 1.6266713751424582, + "grad_norm": 1.2398402690887451, + "learning_rate": 4.178169221549036e-06, + "loss": 0.6625, + "step": 101340 + }, + { + "epoch": 1.626831891362622, + "grad_norm": 0.917366623878479, + "learning_rate": 4.174680588164617e-06, + "loss": 0.7845, + "step": 101350 + }, + { + "epoch": 1.6269924075827862, + "grad_norm": 0.976901113986969, + "learning_rate": 4.171193279148289e-06, + "loss": 0.6365, + "step": 101360 + }, + { + "epoch": 1.6271529238029503, + "grad_norm": 0.9884894490242004, + "learning_rate": 4.16770729472181e-06, + "loss": 0.7316, + "step": 101370 + }, + { + "epoch": 1.6273134400231144, + "grad_norm": 1.6998568773269653, + "learning_rate": 4.1642226351068735e-06, + "loss": 0.6064, + "step": 101380 + }, + { + "epoch": 1.6274739562432785, + "grad_norm": 0.9414958953857422, + "learning_rate": 4.160739300525085e-06, + "loss": 0.7435, + "step": 101390 + }, + { + "epoch": 1.6276344724634424, + "grad_norm": 1.2846978902816772, + "learning_rate": 4.157257291197961e-06, + "loss": 0.6535, + "step": 101400 + }, + { + "epoch": 1.6277949886836065, + "grad_norm": 0.8501694798469543, + "learning_rate": 4.15377660734694e-06, + "loss": 0.7345, + "step": 101410 + }, + { + "epoch": 1.6279555049037704, + "grad_norm": 1.1019573211669922, + "learning_rate": 4.150297249193372e-06, + "loss": 0.7069, + "step": 101420 + }, + { + "epoch": 1.6281160211239345, + "grad_norm": 1.1670657396316528, + "learning_rate": 4.146819216958522e-06, + "loss": 0.6887, + "step": 101430 + }, + { + "epoch": 1.6282765373440986, + "grad_norm": 0.701581597328186, + "learning_rate": 4.143342510863574e-06, + "loss": 0.7486, + "step": 101440 + }, + { + "epoch": 1.6284370535642627, + "grad_norm": 1.0081695318222046, + "learning_rate": 4.139867131129632e-06, + "loss": 0.6747, + "step": 101450 + }, + { + "epoch": 1.6285975697844268, + "grad_norm": 2.488675355911255, + "learning_rate": 4.1363930779777014e-06, + "loss": 0.6179, + "step": 101460 + }, + { + "epoch": 1.6287580860045907, + "grad_norm": 0.9646506905555725, + "learning_rate": 4.132920351628711e-06, + "loss": 0.5817, + "step": 101470 + }, + { + "epoch": 1.6289186022247548, + "grad_norm": 1.0796623229980469, + "learning_rate": 4.129448952303508e-06, + "loss": 0.8278, + "step": 101480 + }, + { + "epoch": 1.6290791184449187, + "grad_norm": 1.2340179681777954, + "learning_rate": 4.125978880222856e-06, + "loss": 0.6801, + "step": 101490 + }, + { + "epoch": 1.6292396346650828, + "grad_norm": 0.961006224155426, + "learning_rate": 4.122510135607429e-06, + "loss": 0.7537, + "step": 101500 + }, + { + "epoch": 1.629400150885247, + "grad_norm": 0.9525434374809265, + "learning_rate": 4.119042718677826e-06, + "loss": 0.6662, + "step": 101510 + }, + { + "epoch": 1.629560667105411, + "grad_norm": 1.5672835111618042, + "learning_rate": 4.115576629654539e-06, + "loss": 0.7684, + "step": 101520 + }, + { + "epoch": 1.6297211833255751, + "grad_norm": 1.2925992012023926, + "learning_rate": 4.112111868758001e-06, + "loss": 0.736, + "step": 101530 + }, + { + "epoch": 1.6298816995457392, + "grad_norm": 1.4051449298858643, + "learning_rate": 4.108648436208548e-06, + "loss": 0.7109, + "step": 101540 + }, + { + "epoch": 1.6300422157659031, + "grad_norm": 1.2346370220184326, + "learning_rate": 4.105186332226435e-06, + "loss": 0.6812, + "step": 101550 + }, + { + "epoch": 1.6302027319860672, + "grad_norm": 1.4987529516220093, + "learning_rate": 4.101725557031829e-06, + "loss": 0.7582, + "step": 101560 + }, + { + "epoch": 1.6303632482062311, + "grad_norm": 0.744692862033844, + "learning_rate": 4.09826611084482e-06, + "loss": 0.7866, + "step": 101570 + }, + { + "epoch": 1.6305237644263952, + "grad_norm": 1.114614486694336, + "learning_rate": 4.094807993885411e-06, + "loss": 0.6916, + "step": 101580 + }, + { + "epoch": 1.6306842806465593, + "grad_norm": 1.0797417163848877, + "learning_rate": 4.091351206373506e-06, + "loss": 0.7056, + "step": 101590 + }, + { + "epoch": 1.6308447968667235, + "grad_norm": 0.8819329142570496, + "learning_rate": 4.087895748528941e-06, + "loss": 0.6664, + "step": 101600 + }, + { + "epoch": 1.6310053130868876, + "grad_norm": 0.9526052474975586, + "learning_rate": 4.0844416205714694e-06, + "loss": 0.6915, + "step": 101610 + }, + { + "epoch": 1.6311658293070515, + "grad_norm": 1.1772035360336304, + "learning_rate": 4.080988822720744e-06, + "loss": 0.7309, + "step": 101620 + }, + { + "epoch": 1.6313263455272156, + "grad_norm": 1.305275321006775, + "learning_rate": 4.077537355196351e-06, + "loss": 0.7013, + "step": 101630 + }, + { + "epoch": 1.6314868617473794, + "grad_norm": 1.178715705871582, + "learning_rate": 4.0740872182177846e-06, + "loss": 0.6158, + "step": 101640 + }, + { + "epoch": 1.6316473779675436, + "grad_norm": 0.6525168418884277, + "learning_rate": 4.070638412004441e-06, + "loss": 0.719, + "step": 101650 + }, + { + "epoch": 1.6318078941877077, + "grad_norm": 1.0837351083755493, + "learning_rate": 4.0671909367756526e-06, + "loss": 0.7209, + "step": 101660 + }, + { + "epoch": 1.6319684104078718, + "grad_norm": 1.168613076210022, + "learning_rate": 4.06374479275066e-06, + "loss": 0.6462, + "step": 101670 + }, + { + "epoch": 1.6321289266280359, + "grad_norm": 1.1871064901351929, + "learning_rate": 4.060299980148613e-06, + "loss": 0.6892, + "step": 101680 + }, + { + "epoch": 1.6322894428481998, + "grad_norm": 1.1500526666641235, + "learning_rate": 4.0568564991885825e-06, + "loss": 0.7377, + "step": 101690 + }, + { + "epoch": 1.6324499590683639, + "grad_norm": 0.9837421774864197, + "learning_rate": 4.0534143500895625e-06, + "loss": 0.682, + "step": 101700 + }, + { + "epoch": 1.6326104752885278, + "grad_norm": 1.6655596494674683, + "learning_rate": 4.049973533070439e-06, + "loss": 0.7259, + "step": 101710 + }, + { + "epoch": 1.6327709915086919, + "grad_norm": 0.8976698517799377, + "learning_rate": 4.046534048350034e-06, + "loss": 0.7637, + "step": 101720 + }, + { + "epoch": 1.632931507728856, + "grad_norm": 1.774614930152893, + "learning_rate": 4.043095896147081e-06, + "loss": 0.7095, + "step": 101730 + }, + { + "epoch": 1.63309202394902, + "grad_norm": 1.0810563564300537, + "learning_rate": 4.039659076680225e-06, + "loss": 0.6945, + "step": 101740 + }, + { + "epoch": 1.6332525401691842, + "grad_norm": 2.0109362602233887, + "learning_rate": 4.036223590168026e-06, + "loss": 0.6382, + "step": 101750 + }, + { + "epoch": 1.6334130563893483, + "grad_norm": 1.114881992340088, + "learning_rate": 4.032789436828966e-06, + "loss": 0.7236, + "step": 101760 + }, + { + "epoch": 1.6335735726095122, + "grad_norm": 1.1817245483398438, + "learning_rate": 4.029356616881428e-06, + "loss": 0.6834, + "step": 101770 + }, + { + "epoch": 1.633734088829676, + "grad_norm": 0.7630261778831482, + "learning_rate": 4.025925130543726e-06, + "loss": 0.6107, + "step": 101780 + }, + { + "epoch": 1.6338946050498402, + "grad_norm": 4.952128887176514, + "learning_rate": 4.022494978034077e-06, + "loss": 0.6842, + "step": 101790 + }, + { + "epoch": 1.6340551212700043, + "grad_norm": 0.8797298669815063, + "learning_rate": 4.019066159570622e-06, + "loss": 0.6347, + "step": 101800 + }, + { + "epoch": 1.6342156374901684, + "grad_norm": 1.0948957204818726, + "learning_rate": 4.015638675371416e-06, + "loss": 0.6599, + "step": 101810 + }, + { + "epoch": 1.6343761537103325, + "grad_norm": 0.7551016807556152, + "learning_rate": 4.012212525654424e-06, + "loss": 0.6784, + "step": 101820 + }, + { + "epoch": 1.6345366699304966, + "grad_norm": 1.3564074039459229, + "learning_rate": 4.008787710637535e-06, + "loss": 0.6916, + "step": 101830 + }, + { + "epoch": 1.6346971861506605, + "grad_norm": 1.0104743242263794, + "learning_rate": 4.005364230538538e-06, + "loss": 0.6225, + "step": 101840 + }, + { + "epoch": 1.6348577023708246, + "grad_norm": 1.022778868675232, + "learning_rate": 4.001942085575147e-06, + "loss": 0.679, + "step": 101850 + }, + { + "epoch": 1.6350182185909885, + "grad_norm": 1.0453022718429565, + "learning_rate": 3.998521275964995e-06, + "loss": 0.6252, + "step": 101860 + }, + { + "epoch": 1.6351787348111526, + "grad_norm": 1.2378523349761963, + "learning_rate": 3.995101801925622e-06, + "loss": 0.6557, + "step": 101870 + }, + { + "epoch": 1.6353392510313167, + "grad_norm": 1.0765061378479004, + "learning_rate": 3.991683663674489e-06, + "loss": 0.6119, + "step": 101880 + }, + { + "epoch": 1.6354997672514808, + "grad_norm": 1.0406166315078735, + "learning_rate": 3.988266861428974e-06, + "loss": 0.6416, + "step": 101890 + }, + { + "epoch": 1.635660283471645, + "grad_norm": 1.2416136264801025, + "learning_rate": 3.984851395406355e-06, + "loss": 0.6509, + "step": 101900 + }, + { + "epoch": 1.6358207996918088, + "grad_norm": 1.535516381263733, + "learning_rate": 3.9814372658238425e-06, + "loss": 0.7878, + "step": 101910 + }, + { + "epoch": 1.635981315911973, + "grad_norm": 1.2537963390350342, + "learning_rate": 3.978024472898551e-06, + "loss": 0.7474, + "step": 101920 + }, + { + "epoch": 1.6361418321321368, + "grad_norm": 0.9886354804039001, + "learning_rate": 3.974613016847517e-06, + "loss": 0.6914, + "step": 101930 + }, + { + "epoch": 1.636302348352301, + "grad_norm": 2.3911898136138916, + "learning_rate": 3.9712028978876905e-06, + "loss": 0.676, + "step": 101940 + }, + { + "epoch": 1.636462864572465, + "grad_norm": 1.1679902076721191, + "learning_rate": 3.9677941162359315e-06, + "loss": 0.7262, + "step": 101950 + }, + { + "epoch": 1.6366233807926291, + "grad_norm": 1.0337005853652954, + "learning_rate": 3.964386672109019e-06, + "loss": 0.635, + "step": 101960 + }, + { + "epoch": 1.6367838970127933, + "grad_norm": 1.2234231233596802, + "learning_rate": 3.960980565723649e-06, + "loss": 0.7739, + "step": 101970 + }, + { + "epoch": 1.6369444132329571, + "grad_norm": 1.467301607131958, + "learning_rate": 3.957575797296436e-06, + "loss": 0.6614, + "step": 101980 + }, + { + "epoch": 1.6371049294531212, + "grad_norm": 1.1104271411895752, + "learning_rate": 3.954172367043888e-06, + "loss": 0.7175, + "step": 101990 + }, + { + "epoch": 1.6372654456732851, + "grad_norm": 1.6040699481964111, + "learning_rate": 3.95077027518245e-06, + "loss": 0.6549, + "step": 102000 + }, + { + "epoch": 1.6374259618934492, + "grad_norm": 1.0872548818588257, + "learning_rate": 3.947369521928479e-06, + "loss": 0.6131, + "step": 102010 + }, + { + "epoch": 1.6375864781136134, + "grad_norm": 1.0742864608764648, + "learning_rate": 3.943970107498237e-06, + "loss": 0.6978, + "step": 102020 + }, + { + "epoch": 1.6377469943337775, + "grad_norm": 1.0091798305511475, + "learning_rate": 3.940572032107911e-06, + "loss": 0.6323, + "step": 102030 + }, + { + "epoch": 1.6379075105539416, + "grad_norm": 0.7273944616317749, + "learning_rate": 3.9371752959736e-06, + "loss": 0.6791, + "step": 102040 + }, + { + "epoch": 1.6380680267741057, + "grad_norm": 1.6649748086929321, + "learning_rate": 3.93377989931131e-06, + "loss": 0.6759, + "step": 102050 + }, + { + "epoch": 1.6382285429942696, + "grad_norm": 0.9947006106376648, + "learning_rate": 3.930385842336976e-06, + "loss": 0.6536, + "step": 102060 + }, + { + "epoch": 1.6383890592144337, + "grad_norm": 1.4443129301071167, + "learning_rate": 3.926993125266432e-06, + "loss": 0.6551, + "step": 102070 + }, + { + "epoch": 1.6385495754345976, + "grad_norm": 1.3427696228027344, + "learning_rate": 3.923601748315448e-06, + "loss": 0.626, + "step": 102080 + }, + { + "epoch": 1.6387100916547617, + "grad_norm": 1.0607777833938599, + "learning_rate": 3.920211711699681e-06, + "loss": 0.7463, + "step": 102090 + }, + { + "epoch": 1.6388706078749258, + "grad_norm": 0.8331270813941956, + "learning_rate": 3.916823015634721e-06, + "loss": 0.6639, + "step": 102100 + }, + { + "epoch": 1.6390311240950899, + "grad_norm": 1.2586463689804077, + "learning_rate": 3.913435660336074e-06, + "loss": 0.5728, + "step": 102110 + }, + { + "epoch": 1.639191640315254, + "grad_norm": 1.115053415298462, + "learning_rate": 3.910049646019154e-06, + "loss": 0.7785, + "step": 102120 + }, + { + "epoch": 1.6393521565354179, + "grad_norm": 1.101363182067871, + "learning_rate": 3.906664972899288e-06, + "loss": 0.7472, + "step": 102130 + }, + { + "epoch": 1.639512672755582, + "grad_norm": 1.9593722820281982, + "learning_rate": 3.903281641191734e-06, + "loss": 0.6648, + "step": 102140 + }, + { + "epoch": 1.6396731889757459, + "grad_norm": 1.088282823562622, + "learning_rate": 3.899899651111632e-06, + "loss": 0.7108, + "step": 102150 + }, + { + "epoch": 1.63983370519591, + "grad_norm": 1.0463907718658447, + "learning_rate": 3.896519002874072e-06, + "loss": 0.7272, + "step": 102160 + }, + { + "epoch": 1.639994221416074, + "grad_norm": 1.0761220455169678, + "learning_rate": 3.893139696694034e-06, + "loss": 0.5913, + "step": 102170 + }, + { + "epoch": 1.6401547376362382, + "grad_norm": 1.748165249824524, + "learning_rate": 3.889761732786429e-06, + "loss": 0.6398, + "step": 102180 + }, + { + "epoch": 1.6403152538564023, + "grad_norm": 0.6716709733009338, + "learning_rate": 3.886385111366075e-06, + "loss": 0.7013, + "step": 102190 + }, + { + "epoch": 1.6404757700765662, + "grad_norm": 1.0028717517852783, + "learning_rate": 3.8830098326477075e-06, + "loss": 0.7993, + "step": 102200 + }, + { + "epoch": 1.6406362862967303, + "grad_norm": 1.168299674987793, + "learning_rate": 3.879635896845965e-06, + "loss": 0.6935, + "step": 102210 + }, + { + "epoch": 1.6407968025168942, + "grad_norm": 1.551013708114624, + "learning_rate": 3.876263304175415e-06, + "loss": 0.5974, + "step": 102220 + }, + { + "epoch": 1.6409573187370583, + "grad_norm": 2.0689964294433594, + "learning_rate": 3.872892054850535e-06, + "loss": 0.732, + "step": 102230 + }, + { + "epoch": 1.6411178349572224, + "grad_norm": 1.163724422454834, + "learning_rate": 3.869522149085716e-06, + "loss": 0.664, + "step": 102240 + }, + { + "epoch": 1.6412783511773865, + "grad_norm": 0.7762517929077148, + "learning_rate": 3.866153587095267e-06, + "loss": 0.7415, + "step": 102250 + }, + { + "epoch": 1.6414388673975506, + "grad_norm": 0.7750906348228455, + "learning_rate": 3.862786369093413e-06, + "loss": 0.7939, + "step": 102260 + }, + { + "epoch": 1.6415993836177147, + "grad_norm": 1.3900455236434937, + "learning_rate": 3.859420495294275e-06, + "loss": 0.5996, + "step": 102270 + }, + { + "epoch": 1.6417598998378786, + "grad_norm": 0.8278480172157288, + "learning_rate": 3.856055965911912e-06, + "loss": 0.6401, + "step": 102280 + }, + { + "epoch": 1.6419204160580425, + "grad_norm": 2.0453476905822754, + "learning_rate": 3.852692781160286e-06, + "loss": 0.6962, + "step": 102290 + }, + { + "epoch": 1.6420809322782066, + "grad_norm": 1.7369272708892822, + "learning_rate": 3.849330941253279e-06, + "loss": 0.6934, + "step": 102300 + }, + { + "epoch": 1.6422414484983707, + "grad_norm": 0.6906320452690125, + "learning_rate": 3.845970446404682e-06, + "loss": 0.6873, + "step": 102310 + }, + { + "epoch": 1.6424019647185348, + "grad_norm": 0.9708258509635925, + "learning_rate": 3.842611296828208e-06, + "loss": 0.6817, + "step": 102320 + }, + { + "epoch": 1.642562480938699, + "grad_norm": 0.8765339851379395, + "learning_rate": 3.8392534927374685e-06, + "loss": 0.7256, + "step": 102330 + }, + { + "epoch": 1.642722997158863, + "grad_norm": 0.8967892527580261, + "learning_rate": 3.835897034346006e-06, + "loss": 0.6619, + "step": 102340 + }, + { + "epoch": 1.642883513379027, + "grad_norm": 0.6677783727645874, + "learning_rate": 3.832541921867272e-06, + "loss": 0.7402, + "step": 102350 + }, + { + "epoch": 1.643044029599191, + "grad_norm": 1.2671105861663818, + "learning_rate": 3.829188155514632e-06, + "loss": 0.6754, + "step": 102360 + }, + { + "epoch": 1.643204545819355, + "grad_norm": 1.433869481086731, + "learning_rate": 3.825835735501363e-06, + "loss": 0.6931, + "step": 102370 + }, + { + "epoch": 1.643365062039519, + "grad_norm": 0.8281867504119873, + "learning_rate": 3.8224846620406625e-06, + "loss": 0.6474, + "step": 102380 + }, + { + "epoch": 1.6435255782596832, + "grad_norm": 1.2053990364074707, + "learning_rate": 3.819134935345644e-06, + "loss": 0.6123, + "step": 102390 + }, + { + "epoch": 1.6436860944798473, + "grad_norm": 0.9312882423400879, + "learning_rate": 3.81578655562932e-06, + "loss": 0.8208, + "step": 102400 + }, + { + "epoch": 1.6438466107000114, + "grad_norm": 0.7206059694290161, + "learning_rate": 3.81243952310463e-06, + "loss": 0.6481, + "step": 102410 + }, + { + "epoch": 1.6440071269201753, + "grad_norm": 0.8810237646102905, + "learning_rate": 3.809093837984429e-06, + "loss": 0.672, + "step": 102420 + }, + { + "epoch": 1.6441676431403394, + "grad_norm": 0.9717005491256714, + "learning_rate": 3.8057495004814843e-06, + "loss": 0.6377, + "step": 102430 + }, + { + "epoch": 1.6443281593605032, + "grad_norm": 0.9455322623252869, + "learning_rate": 3.8024065108084704e-06, + "loss": 0.6025, + "step": 102440 + }, + { + "epoch": 1.6444886755806674, + "grad_norm": 0.8322290182113647, + "learning_rate": 3.7990648691779927e-06, + "loss": 0.7631, + "step": 102450 + }, + { + "epoch": 1.6446491918008315, + "grad_norm": 0.9288406372070312, + "learning_rate": 3.795724575802545e-06, + "loss": 0.7051, + "step": 102460 + }, + { + "epoch": 1.6448097080209956, + "grad_norm": 0.6787111163139343, + "learning_rate": 3.79238563089456e-06, + "loss": 0.6245, + "step": 102470 + }, + { + "epoch": 1.6449702242411597, + "grad_norm": 1.043344259262085, + "learning_rate": 3.78904803466637e-06, + "loss": 0.7517, + "step": 102480 + }, + { + "epoch": 1.6451307404613236, + "grad_norm": 1.0212100744247437, + "learning_rate": 3.7857117873302316e-06, + "loss": 0.6925, + "step": 102490 + }, + { + "epoch": 1.6452912566814877, + "grad_norm": 0.9240586161613464, + "learning_rate": 3.7823768890983074e-06, + "loss": 0.7204, + "step": 102500 + }, + { + "epoch": 1.6454517729016516, + "grad_norm": 1.5579814910888672, + "learning_rate": 3.779043340182686e-06, + "loss": 0.6964, + "step": 102510 + }, + { + "epoch": 1.6456122891218157, + "grad_norm": 10.563342094421387, + "learning_rate": 3.775711140795346e-06, + "loss": 0.6706, + "step": 102520 + }, + { + "epoch": 1.6457728053419798, + "grad_norm": 0.8711618185043335, + "learning_rate": 3.772380291148203e-06, + "loss": 0.836, + "step": 102530 + }, + { + "epoch": 1.645933321562144, + "grad_norm": 0.8658226132392883, + "learning_rate": 3.7690507914530825e-06, + "loss": 0.712, + "step": 102540 + }, + { + "epoch": 1.646093837782308, + "grad_norm": 1.2478007078170776, + "learning_rate": 3.765722641921715e-06, + "loss": 0.7431, + "step": 102550 + }, + { + "epoch": 1.646254354002472, + "grad_norm": 0.9335256814956665, + "learning_rate": 3.762395842765759e-06, + "loss": 0.7362, + "step": 102560 + }, + { + "epoch": 1.646414870222636, + "grad_norm": 0.9338676929473877, + "learning_rate": 3.759070394196773e-06, + "loss": 0.6485, + "step": 102570 + }, + { + "epoch": 1.6465753864427999, + "grad_norm": 1.6933989524841309, + "learning_rate": 3.755746296426238e-06, + "loss": 0.7447, + "step": 102580 + }, + { + "epoch": 1.646735902662964, + "grad_norm": 1.0098872184753418, + "learning_rate": 3.7524235496655486e-06, + "loss": 0.7283, + "step": 102590 + }, + { + "epoch": 1.646896418883128, + "grad_norm": 1.0197409391403198, + "learning_rate": 3.7491021541260108e-06, + "loss": 0.6905, + "step": 102600 + }, + { + "epoch": 1.6470569351032922, + "grad_norm": 2.364086151123047, + "learning_rate": 3.74578211001885e-06, + "loss": 0.8328, + "step": 102610 + }, + { + "epoch": 1.6472174513234563, + "grad_norm": 0.8178723454475403, + "learning_rate": 3.742463417555192e-06, + "loss": 0.6627, + "step": 102620 + }, + { + "epoch": 1.6473779675436204, + "grad_norm": 0.9468794465065002, + "learning_rate": 3.739146076946093e-06, + "loss": 0.6853, + "step": 102630 + }, + { + "epoch": 1.6475384837637843, + "grad_norm": 0.9919754266738892, + "learning_rate": 3.7358300884025145e-06, + "loss": 0.702, + "step": 102640 + }, + { + "epoch": 1.6476989999839484, + "grad_norm": 0.8066470623016357, + "learning_rate": 3.732515452135335e-06, + "loss": 0.6962, + "step": 102650 + }, + { + "epoch": 1.6478595162041123, + "grad_norm": 1.3022615909576416, + "learning_rate": 3.7292021683553448e-06, + "loss": 0.7677, + "step": 102660 + }, + { + "epoch": 1.6480200324242764, + "grad_norm": 1.1232656240463257, + "learning_rate": 3.7258902372732497e-06, + "loss": 0.7486, + "step": 102670 + }, + { + "epoch": 1.6481805486444405, + "grad_norm": 1.3812593221664429, + "learning_rate": 3.7225796590996676e-06, + "loss": 0.7089, + "step": 102680 + }, + { + "epoch": 1.6483410648646046, + "grad_norm": 1.1212983131408691, + "learning_rate": 3.7192704340451356e-06, + "loss": 0.6018, + "step": 102690 + }, + { + "epoch": 1.6485015810847687, + "grad_norm": 1.1919491291046143, + "learning_rate": 3.7159625623201022e-06, + "loss": 0.7411, + "step": 102700 + }, + { + "epoch": 1.6486620973049326, + "grad_norm": 1.01421320438385, + "learning_rate": 3.712656044134921e-06, + "loss": 0.709, + "step": 102710 + }, + { + "epoch": 1.6488226135250967, + "grad_norm": 0.6814343333244324, + "learning_rate": 3.709350879699872e-06, + "loss": 0.6568, + "step": 102720 + }, + { + "epoch": 1.6489831297452606, + "grad_norm": 0.866121232509613, + "learning_rate": 3.7060470692251436e-06, + "loss": 0.5784, + "step": 102730 + }, + { + "epoch": 1.6491436459654247, + "grad_norm": 1.8982094526290894, + "learning_rate": 3.702744612920836e-06, + "loss": 0.6541, + "step": 102740 + }, + { + "epoch": 1.6493041621855888, + "grad_norm": 1.4963821172714233, + "learning_rate": 3.6994435109969724e-06, + "loss": 0.6783, + "step": 102750 + }, + { + "epoch": 1.649464678405753, + "grad_norm": 1.5543245077133179, + "learning_rate": 3.6961437636634844e-06, + "loss": 0.6638, + "step": 102760 + }, + { + "epoch": 1.649625194625917, + "grad_norm": 1.030033826828003, + "learning_rate": 3.6928453711302046e-06, + "loss": 0.7551, + "step": 102770 + }, + { + "epoch": 1.649785710846081, + "grad_norm": 1.3141826391220093, + "learning_rate": 3.6895483336069015e-06, + "loss": 0.7317, + "step": 102780 + }, + { + "epoch": 1.649946227066245, + "grad_norm": 0.9394165873527527, + "learning_rate": 3.686252651303243e-06, + "loss": 0.8288, + "step": 102790 + }, + { + "epoch": 1.650106743286409, + "grad_norm": 1.2114794254302979, + "learning_rate": 3.6829583244288174e-06, + "loss": 0.7386, + "step": 102800 + }, + { + "epoch": 1.650267259506573, + "grad_norm": 1.0309066772460938, + "learning_rate": 3.6796653531931236e-06, + "loss": 0.6013, + "step": 102810 + }, + { + "epoch": 1.6504277757267372, + "grad_norm": 1.1972249746322632, + "learning_rate": 3.676373737805583e-06, + "loss": 0.6365, + "step": 102820 + }, + { + "epoch": 1.6505882919469013, + "grad_norm": 1.0429399013519287, + "learning_rate": 3.6730834784755087e-06, + "loss": 0.8731, + "step": 102830 + }, + { + "epoch": 1.6507488081670654, + "grad_norm": 0.804789662361145, + "learning_rate": 3.66979457541215e-06, + "loss": 0.6122, + "step": 102840 + }, + { + "epoch": 1.6509093243872295, + "grad_norm": 0.7787913084030151, + "learning_rate": 3.666507028824659e-06, + "loss": 0.5165, + "step": 102850 + }, + { + "epoch": 1.6510698406073934, + "grad_norm": 1.1823424100875854, + "learning_rate": 3.6632208389221075e-06, + "loss": 0.7241, + "step": 102860 + }, + { + "epoch": 1.6512303568275575, + "grad_norm": 0.9201653599739075, + "learning_rate": 3.659936005913475e-06, + "loss": 0.7156, + "step": 102870 + }, + { + "epoch": 1.6513908730477214, + "grad_norm": 0.9877551198005676, + "learning_rate": 3.656652530007665e-06, + "loss": 0.6782, + "step": 102880 + }, + { + "epoch": 1.6515513892678855, + "grad_norm": 0.9036855101585388, + "learning_rate": 3.6533704114134787e-06, + "loss": 0.803, + "step": 102890 + }, + { + "epoch": 1.6517119054880496, + "grad_norm": 1.3079116344451904, + "learning_rate": 3.6500896503396386e-06, + "loss": 0.7874, + "step": 102900 + }, + { + "epoch": 1.6518724217082137, + "grad_norm": 1.0587184429168701, + "learning_rate": 3.646810246994786e-06, + "loss": 0.7375, + "step": 102910 + }, + { + "epoch": 1.6520329379283778, + "grad_norm": 0.7894306182861328, + "learning_rate": 3.6435322015874713e-06, + "loss": 0.7497, + "step": 102920 + }, + { + "epoch": 1.6521934541485417, + "grad_norm": 1.6898705959320068, + "learning_rate": 3.64025551432616e-06, + "loss": 0.6478, + "step": 102930 + }, + { + "epoch": 1.6523539703687058, + "grad_norm": 2.018232822418213, + "learning_rate": 3.636980185419228e-06, + "loss": 0.6709, + "step": 102940 + }, + { + "epoch": 1.6525144865888697, + "grad_norm": 0.9598966240882874, + "learning_rate": 3.633706215074975e-06, + "loss": 0.655, + "step": 102950 + }, + { + "epoch": 1.6526750028090338, + "grad_norm": 1.707701325416565, + "learning_rate": 3.630433603501593e-06, + "loss": 0.6704, + "step": 102960 + }, + { + "epoch": 1.652835519029198, + "grad_norm": 0.8583108186721802, + "learning_rate": 3.6271623509072068e-06, + "loss": 0.7066, + "step": 102970 + }, + { + "epoch": 1.652996035249362, + "grad_norm": 1.0051559209823608, + "learning_rate": 3.62389245749985e-06, + "loss": 0.7421, + "step": 102980 + }, + { + "epoch": 1.6531565514695261, + "grad_norm": 0.9753533601760864, + "learning_rate": 3.6206239234874676e-06, + "loss": 0.6517, + "step": 102990 + }, + { + "epoch": 1.65331706768969, + "grad_norm": 1.2734466791152954, + "learning_rate": 3.6173567490779186e-06, + "loss": 0.653, + "step": 103000 + }, + { + "epoch": 1.653477583909854, + "grad_norm": 0.9699164628982544, + "learning_rate": 3.6140909344789838e-06, + "loss": 0.6663, + "step": 103010 + }, + { + "epoch": 1.653638100130018, + "grad_norm": 1.125407099723816, + "learning_rate": 3.6108264798983354e-06, + "loss": 0.7006, + "step": 103020 + }, + { + "epoch": 1.653798616350182, + "grad_norm": 1.0042510032653809, + "learning_rate": 3.607563385543583e-06, + "loss": 0.6596, + "step": 103030 + }, + { + "epoch": 1.6539591325703462, + "grad_norm": 1.2395191192626953, + "learning_rate": 3.604301651622238e-06, + "loss": 0.5639, + "step": 103040 + }, + { + "epoch": 1.6541196487905103, + "grad_norm": 0.8098469972610474, + "learning_rate": 3.601041278341727e-06, + "loss": 0.6802, + "step": 103050 + }, + { + "epoch": 1.6542801650106744, + "grad_norm": 0.8236487507820129, + "learning_rate": 3.597782265909394e-06, + "loss": 0.7265, + "step": 103060 + }, + { + "epoch": 1.6544406812308385, + "grad_norm": 1.6694056987762451, + "learning_rate": 3.5945246145324934e-06, + "loss": 0.698, + "step": 103070 + }, + { + "epoch": 1.6546011974510024, + "grad_norm": 0.9407623410224915, + "learning_rate": 3.591268324418187e-06, + "loss": 0.6213, + "step": 103080 + }, + { + "epoch": 1.6547617136711663, + "grad_norm": 1.1981956958770752, + "learning_rate": 3.588013395773557e-06, + "loss": 0.6814, + "step": 103090 + }, + { + "epoch": 1.6549222298913304, + "grad_norm": 0.8547660708427429, + "learning_rate": 3.5847598288056007e-06, + "loss": 0.7379, + "step": 103100 + }, + { + "epoch": 1.6550827461114945, + "grad_norm": 1.8802191019058228, + "learning_rate": 3.5815076237212225e-06, + "loss": 0.737, + "step": 103110 + }, + { + "epoch": 1.6552432623316586, + "grad_norm": 0.9299526810646057, + "learning_rate": 3.578256780727246e-06, + "loss": 0.5689, + "step": 103120 + }, + { + "epoch": 1.6554037785518227, + "grad_norm": 0.8649690747261047, + "learning_rate": 3.5750073000304113e-06, + "loss": 0.6202, + "step": 103130 + }, + { + "epoch": 1.6555642947719869, + "grad_norm": 1.6156259775161743, + "learning_rate": 3.571759181837356e-06, + "loss": 0.7023, + "step": 103140 + }, + { + "epoch": 1.6557248109921507, + "grad_norm": 1.367833137512207, + "learning_rate": 3.568512426354645e-06, + "loss": 0.6376, + "step": 103150 + }, + { + "epoch": 1.6558853272123149, + "grad_norm": 1.1300081014633179, + "learning_rate": 3.565267033788755e-06, + "loss": 0.7542, + "step": 103160 + }, + { + "epoch": 1.6560458434324787, + "grad_norm": 1.5963810682296753, + "learning_rate": 3.5620230043460714e-06, + "loss": 0.7382, + "step": 103170 + }, + { + "epoch": 1.6562063596526428, + "grad_norm": 0.6695784330368042, + "learning_rate": 3.558780338232895e-06, + "loss": 0.7559, + "step": 103180 + }, + { + "epoch": 1.656366875872807, + "grad_norm": 1.004593849182129, + "learning_rate": 3.555539035655442e-06, + "loss": 0.6211, + "step": 103190 + }, + { + "epoch": 1.656527392092971, + "grad_norm": 1.223706603050232, + "learning_rate": 3.5522990968198415e-06, + "loss": 0.6244, + "step": 103200 + }, + { + "epoch": 1.6566879083131352, + "grad_norm": 0.9008594751358032, + "learning_rate": 3.549060521932135e-06, + "loss": 0.7712, + "step": 103210 + }, + { + "epoch": 1.656848424533299, + "grad_norm": 1.8231990337371826, + "learning_rate": 3.545823311198271e-06, + "loss": 0.739, + "step": 103220 + }, + { + "epoch": 1.6570089407534632, + "grad_norm": 1.1451886892318726, + "learning_rate": 3.542587464824129e-06, + "loss": 0.7166, + "step": 103230 + }, + { + "epoch": 1.657169456973627, + "grad_norm": 1.2296327352523804, + "learning_rate": 3.539352983015476e-06, + "loss": 0.6266, + "step": 103240 + }, + { + "epoch": 1.6573299731937912, + "grad_norm": 0.9551473259925842, + "learning_rate": 3.5361198659780133e-06, + "loss": 0.7609, + "step": 103250 + }, + { + "epoch": 1.6574904894139553, + "grad_norm": 2.1332149505615234, + "learning_rate": 3.532888113917346e-06, + "loss": 0.8159, + "step": 103260 + }, + { + "epoch": 1.6576510056341194, + "grad_norm": 0.6775693297386169, + "learning_rate": 3.5296577270389956e-06, + "loss": 0.7233, + "step": 103270 + }, + { + "epoch": 1.6578115218542835, + "grad_norm": 1.3187305927276611, + "learning_rate": 3.526428705548396e-06, + "loss": 0.6884, + "step": 103280 + }, + { + "epoch": 1.6579720380744474, + "grad_norm": 0.7238060235977173, + "learning_rate": 3.5232010496508926e-06, + "loss": 0.7036, + "step": 103290 + }, + { + "epoch": 1.6581325542946115, + "grad_norm": 1.3104130029678345, + "learning_rate": 3.5199747595517473e-06, + "loss": 0.6927, + "step": 103300 + }, + { + "epoch": 1.6582930705147754, + "grad_norm": 0.8363596796989441, + "learning_rate": 3.5167498354561317e-06, + "loss": 0.7062, + "step": 103310 + }, + { + "epoch": 1.6584535867349395, + "grad_norm": 0.9787256717681885, + "learning_rate": 3.5135262775691374e-06, + "loss": 0.7403, + "step": 103320 + }, + { + "epoch": 1.6586141029551036, + "grad_norm": 1.0274900197982788, + "learning_rate": 3.510304086095756e-06, + "loss": 0.6596, + "step": 103330 + }, + { + "epoch": 1.6587746191752677, + "grad_norm": 1.2043911218643188, + "learning_rate": 3.507083261240901e-06, + "loss": 0.7095, + "step": 103340 + }, + { + "epoch": 1.6589351353954318, + "grad_norm": 0.6376013159751892, + "learning_rate": 3.5038638032094006e-06, + "loss": 0.6339, + "step": 103350 + }, + { + "epoch": 1.659095651615596, + "grad_norm": 0.9426864385604858, + "learning_rate": 3.500645712205994e-06, + "loss": 0.7578, + "step": 103360 + }, + { + "epoch": 1.6592561678357598, + "grad_norm": 1.268343210220337, + "learning_rate": 3.4974289884353308e-06, + "loss": 0.6645, + "step": 103370 + }, + { + "epoch": 1.6594166840559237, + "grad_norm": 3.8649652004241943, + "learning_rate": 3.494213632101981e-06, + "loss": 0.7363, + "step": 103380 + }, + { + "epoch": 1.6595772002760878, + "grad_norm": 0.7890548706054688, + "learning_rate": 3.4909996434104143e-06, + "loss": 0.6337, + "step": 103390 + }, + { + "epoch": 1.659737716496252, + "grad_norm": 1.3029625415802002, + "learning_rate": 3.4877870225650227e-06, + "loss": 0.6777, + "step": 103400 + }, + { + "epoch": 1.659898232716416, + "grad_norm": 1.295040488243103, + "learning_rate": 3.4845757697701153e-06, + "loss": 0.7152, + "step": 103410 + }, + { + "epoch": 1.6600587489365801, + "grad_norm": 1.168911099433899, + "learning_rate": 3.4813658852299064e-06, + "loss": 0.7467, + "step": 103420 + }, + { + "epoch": 1.6602192651567442, + "grad_norm": 1.1753902435302734, + "learning_rate": 3.478157369148524e-06, + "loss": 0.7044, + "step": 103430 + }, + { + "epoch": 1.6603797813769081, + "grad_norm": 1.0399466753005981, + "learning_rate": 3.474950221730011e-06, + "loss": 0.7282, + "step": 103440 + }, + { + "epoch": 1.6605402975970722, + "grad_norm": 0.9102397561073303, + "learning_rate": 3.471744443178335e-06, + "loss": 0.6909, + "step": 103450 + }, + { + "epoch": 1.660700813817236, + "grad_norm": 0.7928534150123596, + "learning_rate": 3.4685400336973455e-06, + "loss": 0.6473, + "step": 103460 + }, + { + "epoch": 1.6608613300374002, + "grad_norm": 1.5362545251846313, + "learning_rate": 3.4653369934908335e-06, + "loss": 0.636, + "step": 103470 + }, + { + "epoch": 1.6610218462575643, + "grad_norm": 0.9220024347305298, + "learning_rate": 3.4621353227624913e-06, + "loss": 0.7509, + "step": 103480 + }, + { + "epoch": 1.6611823624777284, + "grad_norm": 1.5755798816680908, + "learning_rate": 3.4589350217159312e-06, + "loss": 0.7005, + "step": 103490 + }, + { + "epoch": 1.6613428786978925, + "grad_norm": 1.4324681758880615, + "learning_rate": 3.4557360905546677e-06, + "loss": 0.7207, + "step": 103500 + }, + { + "epoch": 1.6615033949180564, + "grad_norm": 1.9859838485717773, + "learning_rate": 3.4525385294821467e-06, + "loss": 0.7927, + "step": 103510 + }, + { + "epoch": 1.6616639111382205, + "grad_norm": 0.924963116645813, + "learning_rate": 3.449342338701694e-06, + "loss": 0.6616, + "step": 103520 + }, + { + "epoch": 1.6618244273583844, + "grad_norm": 1.1080645322799683, + "learning_rate": 3.4461475184165815e-06, + "loss": 0.7699, + "step": 103530 + }, + { + "epoch": 1.6619849435785485, + "grad_norm": 0.7685850858688354, + "learning_rate": 3.442954068829979e-06, + "loss": 0.7939, + "step": 103540 + }, + { + "epoch": 1.6621454597987126, + "grad_norm": 2.2032899856567383, + "learning_rate": 3.4397619901449685e-06, + "loss": 0.7662, + "step": 103550 + }, + { + "epoch": 1.6623059760188768, + "grad_norm": 1.2258597612380981, + "learning_rate": 3.4365712825645518e-06, + "loss": 0.7508, + "step": 103560 + }, + { + "epoch": 1.6624664922390409, + "grad_norm": 0.7287399172782898, + "learning_rate": 3.433381946291642e-06, + "loss": 0.7787, + "step": 103570 + }, + { + "epoch": 1.662627008459205, + "grad_norm": 0.9202183485031128, + "learning_rate": 3.4301939815290533e-06, + "loss": 0.7283, + "step": 103580 + }, + { + "epoch": 1.6627875246793689, + "grad_norm": 1.0067704916000366, + "learning_rate": 3.4270073884795224e-06, + "loss": 0.7142, + "step": 103590 + }, + { + "epoch": 1.6629480408995327, + "grad_norm": 1.0113486051559448, + "learning_rate": 3.4238221673457027e-06, + "loss": 0.7147, + "step": 103600 + }, + { + "epoch": 1.6631085571196969, + "grad_norm": 0.9801037907600403, + "learning_rate": 3.420638318330155e-06, + "loss": 0.8169, + "step": 103610 + }, + { + "epoch": 1.663269073339861, + "grad_norm": 0.9925256371498108, + "learning_rate": 3.41745584163535e-06, + "loss": 0.6955, + "step": 103620 + }, + { + "epoch": 1.663429589560025, + "grad_norm": 0.8833163380622864, + "learning_rate": 3.4142747374636826e-06, + "loss": 0.5861, + "step": 103630 + }, + { + "epoch": 1.6635901057801892, + "grad_norm": 0.7037060856819153, + "learning_rate": 3.4110950060174422e-06, + "loss": 0.6323, + "step": 103640 + }, + { + "epoch": 1.6637506220003533, + "grad_norm": 1.072048544883728, + "learning_rate": 3.4079166474988435e-06, + "loss": 0.6678, + "step": 103650 + }, + { + "epoch": 1.6639111382205172, + "grad_norm": 1.0169063806533813, + "learning_rate": 3.4047396621100155e-06, + "loss": 0.6742, + "step": 103660 + }, + { + "epoch": 1.6640716544406813, + "grad_norm": 0.9338902235031128, + "learning_rate": 3.4015640500529938e-06, + "loss": 0.6423, + "step": 103670 + }, + { + "epoch": 1.6642321706608452, + "grad_norm": 1.190576434135437, + "learning_rate": 3.398389811529726e-06, + "loss": 0.779, + "step": 103680 + }, + { + "epoch": 1.6643926868810093, + "grad_norm": 1.1148391962051392, + "learning_rate": 3.395216946742086e-06, + "loss": 0.6984, + "step": 103690 + }, + { + "epoch": 1.6645532031011734, + "grad_norm": 1.044571042060852, + "learning_rate": 3.392045455891832e-06, + "loss": 0.7132, + "step": 103700 + }, + { + "epoch": 1.6647137193213375, + "grad_norm": 0.5501446723937988, + "learning_rate": 3.388875339180664e-06, + "loss": 0.7075, + "step": 103710 + }, + { + "epoch": 1.6648742355415016, + "grad_norm": 1.5244275331497192, + "learning_rate": 3.385706596810179e-06, + "loss": 0.7789, + "step": 103720 + }, + { + "epoch": 1.6650347517616655, + "grad_norm": 1.6002178192138672, + "learning_rate": 3.382539228981893e-06, + "loss": 0.6755, + "step": 103730 + }, + { + "epoch": 1.6651952679818296, + "grad_norm": 1.055626630783081, + "learning_rate": 3.379373235897229e-06, + "loss": 0.6969, + "step": 103740 + }, + { + "epoch": 1.6653557842019935, + "grad_norm": 1.062893033027649, + "learning_rate": 3.376208617757526e-06, + "loss": 0.6757, + "step": 103750 + }, + { + "epoch": 1.6655163004221576, + "grad_norm": 0.8559869527816772, + "learning_rate": 3.3730453747640447e-06, + "loss": 0.6672, + "step": 103760 + }, + { + "epoch": 1.6656768166423217, + "grad_norm": 1.9518778324127197, + "learning_rate": 3.3698835071179325e-06, + "loss": 0.5899, + "step": 103770 + }, + { + "epoch": 1.6658373328624858, + "grad_norm": 0.9203382730484009, + "learning_rate": 3.3667230150202734e-06, + "loss": 0.7252, + "step": 103780 + }, + { + "epoch": 1.66599784908265, + "grad_norm": 0.7914001941680908, + "learning_rate": 3.3635638986720563e-06, + "loss": 0.7017, + "step": 103790 + }, + { + "epoch": 1.6661583653028138, + "grad_norm": 1.2100111246109009, + "learning_rate": 3.3604061582741818e-06, + "loss": 0.7324, + "step": 103800 + }, + { + "epoch": 1.666318881522978, + "grad_norm": 0.9470927715301514, + "learning_rate": 3.3572497940274644e-06, + "loss": 0.6378, + "step": 103810 + }, + { + "epoch": 1.6664793977431418, + "grad_norm": 0.8751764297485352, + "learning_rate": 3.3540948061326295e-06, + "loss": 0.7146, + "step": 103820 + }, + { + "epoch": 1.666639913963306, + "grad_norm": 0.9766260385513306, + "learning_rate": 3.350941194790319e-06, + "loss": 0.7112, + "step": 103830 + }, + { + "epoch": 1.66680043018347, + "grad_norm": 0.7696958780288696, + "learning_rate": 3.3477889602010786e-06, + "loss": 0.7037, + "step": 103840 + }, + { + "epoch": 1.6669609464036341, + "grad_norm": 1.08664870262146, + "learning_rate": 3.3446381025653755e-06, + "loss": 0.7486, + "step": 103850 + }, + { + "epoch": 1.6671214626237982, + "grad_norm": 1.5055010318756104, + "learning_rate": 3.3414886220835913e-06, + "loss": 0.5338, + "step": 103860 + }, + { + "epoch": 1.6672819788439623, + "grad_norm": 1.241867184638977, + "learning_rate": 3.3383405189560013e-06, + "loss": 0.7176, + "step": 103870 + }, + { + "epoch": 1.6674424950641262, + "grad_norm": 0.8468398451805115, + "learning_rate": 3.3351937933828154e-06, + "loss": 0.6612, + "step": 103880 + }, + { + "epoch": 1.6676030112842901, + "grad_norm": 0.868105947971344, + "learning_rate": 3.3320484455641425e-06, + "loss": 0.6729, + "step": 103890 + }, + { + "epoch": 1.6677635275044542, + "grad_norm": 1.1523956060409546, + "learning_rate": 3.328904475700012e-06, + "loss": 0.7135, + "step": 103900 + }, + { + "epoch": 1.6679240437246183, + "grad_norm": 1.2564455270767212, + "learning_rate": 3.325761883990361e-06, + "loss": 0.7206, + "step": 103910 + }, + { + "epoch": 1.6680845599447824, + "grad_norm": 1.1050559282302856, + "learning_rate": 3.3226206706350375e-06, + "loss": 0.7048, + "step": 103920 + }, + { + "epoch": 1.6682450761649465, + "grad_norm": 0.7572906017303467, + "learning_rate": 3.3194808358338046e-06, + "loss": 0.7726, + "step": 103930 + }, + { + "epoch": 1.6684055923851107, + "grad_norm": 1.5435724258422852, + "learning_rate": 3.316342379786347e-06, + "loss": 0.6642, + "step": 103940 + }, + { + "epoch": 1.6685661086052745, + "grad_norm": 0.9849409461021423, + "learning_rate": 3.3132053026922375e-06, + "loss": 0.7206, + "step": 103950 + }, + { + "epoch": 1.6687266248254387, + "grad_norm": 1.1737966537475586, + "learning_rate": 3.3100696047509842e-06, + "loss": 0.805, + "step": 103960 + }, + { + "epoch": 1.6688871410456025, + "grad_norm": 0.9143280386924744, + "learning_rate": 3.306935286161994e-06, + "loss": 0.7178, + "step": 103970 + }, + { + "epoch": 1.6690476572657666, + "grad_norm": 1.2389432191848755, + "learning_rate": 3.303802347124596e-06, + "loss": 0.7027, + "step": 103980 + }, + { + "epoch": 1.6692081734859308, + "grad_norm": 0.7480961680412292, + "learning_rate": 3.3006707878380256e-06, + "loss": 0.6439, + "step": 103990 + }, + { + "epoch": 1.6693686897060949, + "grad_norm": 1.1900150775909424, + "learning_rate": 3.297540608501429e-06, + "loss": 0.6788, + "step": 104000 + }, + { + "epoch": 1.6693686897060949, + "eval_loss": 0.7696548700332642, + "eval_runtime": 1824.8737, + "eval_samples_per_second": 14.374, + "eval_steps_per_second": 1.797, + "step": 104000 + }, + { + "epoch": 1.669529205926259, + "grad_norm": 0.6654484868049622, + "learning_rate": 3.294411809313877e-06, + "loss": 0.7039, + "step": 104010 + }, + { + "epoch": 1.6696897221464229, + "grad_norm": 1.4772332906723022, + "learning_rate": 3.291284390474328e-06, + "loss": 0.719, + "step": 104020 + }, + { + "epoch": 1.669850238366587, + "grad_norm": 1.0286508798599243, + "learning_rate": 3.288158352181675e-06, + "loss": 0.6894, + "step": 104030 + }, + { + "epoch": 1.6700107545867509, + "grad_norm": 1.3850287199020386, + "learning_rate": 3.2850336946347148e-06, + "loss": 0.6228, + "step": 104040 + }, + { + "epoch": 1.670171270806915, + "grad_norm": 0.7672808170318604, + "learning_rate": 3.281910418032158e-06, + "loss": 0.6748, + "step": 104050 + }, + { + "epoch": 1.670331787027079, + "grad_norm": 1.0208971500396729, + "learning_rate": 3.278788522572626e-06, + "loss": 0.751, + "step": 104060 + }, + { + "epoch": 1.6704923032472432, + "grad_norm": 0.9065760970115662, + "learning_rate": 3.2756680084546575e-06, + "loss": 0.7315, + "step": 104070 + }, + { + "epoch": 1.6706528194674073, + "grad_norm": 0.72398841381073, + "learning_rate": 3.272548875876691e-06, + "loss": 0.594, + "step": 104080 + }, + { + "epoch": 1.6708133356875712, + "grad_norm": 0.9484813809394836, + "learning_rate": 3.2694311250370896e-06, + "loss": 0.6636, + "step": 104090 + }, + { + "epoch": 1.6709738519077353, + "grad_norm": 1.2267762422561646, + "learning_rate": 3.2663147561341227e-06, + "loss": 0.7002, + "step": 104100 + }, + { + "epoch": 1.6711343681278992, + "grad_norm": 1.0155813694000244, + "learning_rate": 3.263199769365971e-06, + "loss": 0.6772, + "step": 104110 + }, + { + "epoch": 1.6712948843480633, + "grad_norm": 0.9235029816627502, + "learning_rate": 3.260086164930734e-06, + "loss": 0.7322, + "step": 104120 + }, + { + "epoch": 1.6714554005682274, + "grad_norm": 1.0132856369018555, + "learning_rate": 3.2569739430264227e-06, + "loss": 0.7628, + "step": 104130 + }, + { + "epoch": 1.6716159167883915, + "grad_norm": 1.0801036357879639, + "learning_rate": 3.2538631038509432e-06, + "loss": 0.8177, + "step": 104140 + }, + { + "epoch": 1.6717764330085556, + "grad_norm": 1.0521180629730225, + "learning_rate": 3.2507536476021366e-06, + "loss": 0.67, + "step": 104150 + }, + { + "epoch": 1.6719369492287197, + "grad_norm": 0.7403601408004761, + "learning_rate": 3.2476455744777397e-06, + "loss": 0.6752, + "step": 104160 + }, + { + "epoch": 1.6720974654488836, + "grad_norm": 1.0298123359680176, + "learning_rate": 3.2445388846754106e-06, + "loss": 0.6992, + "step": 104170 + }, + { + "epoch": 1.6722579816690477, + "grad_norm": 1.0933928489685059, + "learning_rate": 3.2414335783927192e-06, + "loss": 0.5908, + "step": 104180 + }, + { + "epoch": 1.6724184978892116, + "grad_norm": 0.984451174736023, + "learning_rate": 3.2383296558271464e-06, + "loss": 0.7338, + "step": 104190 + }, + { + "epoch": 1.6725790141093757, + "grad_norm": 1.0478233098983765, + "learning_rate": 3.2352271171760735e-06, + "loss": 0.6994, + "step": 104200 + }, + { + "epoch": 1.6727395303295398, + "grad_norm": 0.9793069362640381, + "learning_rate": 3.232125962636809e-06, + "loss": 0.6837, + "step": 104210 + }, + { + "epoch": 1.672900046549704, + "grad_norm": 1.161554217338562, + "learning_rate": 3.22902619240657e-06, + "loss": 0.8299, + "step": 104220 + }, + { + "epoch": 1.673060562769868, + "grad_norm": 1.378565788269043, + "learning_rate": 3.2259278066824826e-06, + "loss": 0.6365, + "step": 104230 + }, + { + "epoch": 1.673221078990032, + "grad_norm": 0.7566215395927429, + "learning_rate": 3.2228308056615862e-06, + "loss": 0.7018, + "step": 104240 + }, + { + "epoch": 1.673381595210196, + "grad_norm": 0.9232525825500488, + "learning_rate": 3.2197351895408317e-06, + "loss": 0.7642, + "step": 104250 + }, + { + "epoch": 1.67354211143036, + "grad_norm": 0.5903162956237793, + "learning_rate": 3.2166409585170865e-06, + "loss": 0.7232, + "step": 104260 + }, + { + "epoch": 1.673702627650524, + "grad_norm": 1.1045881509780884, + "learning_rate": 3.2135481127871154e-06, + "loss": 0.5837, + "step": 104270 + }, + { + "epoch": 1.6738631438706881, + "grad_norm": 1.2726801633834839, + "learning_rate": 3.210456652547614e-06, + "loss": 0.721, + "step": 104280 + }, + { + "epoch": 1.6740236600908522, + "grad_norm": 1.0955790281295776, + "learning_rate": 3.2073665779951745e-06, + "loss": 0.8018, + "step": 104290 + }, + { + "epoch": 1.6741841763110163, + "grad_norm": 0.8553909659385681, + "learning_rate": 3.20427788932631e-06, + "loss": 0.7508, + "step": 104300 + }, + { + "epoch": 1.6743446925311802, + "grad_norm": 1.3676689863204956, + "learning_rate": 3.201190586737446e-06, + "loss": 0.7018, + "step": 104310 + }, + { + "epoch": 1.6745052087513443, + "grad_norm": 0.8631833791732788, + "learning_rate": 3.19810467042492e-06, + "loss": 0.6865, + "step": 104320 + }, + { + "epoch": 1.6746657249715082, + "grad_norm": 0.6454166173934937, + "learning_rate": 3.1950201405849674e-06, + "loss": 0.7828, + "step": 104330 + }, + { + "epoch": 1.6748262411916723, + "grad_norm": 0.9212415218353271, + "learning_rate": 3.19193699741375e-06, + "loss": 0.5917, + "step": 104340 + }, + { + "epoch": 1.6749867574118364, + "grad_norm": 1.3487021923065186, + "learning_rate": 3.1888552411073413e-06, + "loss": 0.7912, + "step": 104350 + }, + { + "epoch": 1.6751472736320006, + "grad_norm": 1.8244234323501587, + "learning_rate": 3.1857748718617214e-06, + "loss": 0.7126, + "step": 104360 + }, + { + "epoch": 1.6753077898521647, + "grad_norm": 1.1026674509048462, + "learning_rate": 3.1826958898727805e-06, + "loss": 0.6444, + "step": 104370 + }, + { + "epoch": 1.6754683060723288, + "grad_norm": 1.221464991569519, + "learning_rate": 3.179618295336334e-06, + "loss": 0.5848, + "step": 104380 + }, + { + "epoch": 1.6756288222924927, + "grad_norm": 1.6299688816070557, + "learning_rate": 3.176542088448084e-06, + "loss": 0.6991, + "step": 104390 + }, + { + "epoch": 1.6757893385126565, + "grad_norm": 0.7261486053466797, + "learning_rate": 3.1734672694036683e-06, + "loss": 0.7523, + "step": 104400 + }, + { + "epoch": 1.6759498547328207, + "grad_norm": 1.4414830207824707, + "learning_rate": 3.1703938383986248e-06, + "loss": 0.7057, + "step": 104410 + }, + { + "epoch": 1.6761103709529848, + "grad_norm": 0.7268303632736206, + "learning_rate": 3.167321795628406e-06, + "loss": 0.6683, + "step": 104420 + }, + { + "epoch": 1.6762708871731489, + "grad_norm": 1.3552714586257935, + "learning_rate": 3.164251141288374e-06, + "loss": 0.7332, + "step": 104430 + }, + { + "epoch": 1.676431403393313, + "grad_norm": 1.171050786972046, + "learning_rate": 3.1611818755738097e-06, + "loss": 0.8315, + "step": 104440 + }, + { + "epoch": 1.676591919613477, + "grad_norm": 1.3489910364151, + "learning_rate": 3.1581139986798956e-06, + "loss": 0.7053, + "step": 104450 + }, + { + "epoch": 1.676752435833641, + "grad_norm": 2.049694538116455, + "learning_rate": 3.155047510801734e-06, + "loss": 0.5537, + "step": 104460 + }, + { + "epoch": 1.676912952053805, + "grad_norm": 1.0956165790557861, + "learning_rate": 3.151982412134333e-06, + "loss": 0.6944, + "step": 104470 + }, + { + "epoch": 1.677073468273969, + "grad_norm": 1.301288366317749, + "learning_rate": 3.1489187028726162e-06, + "loss": 0.6329, + "step": 104480 + }, + { + "epoch": 1.677233984494133, + "grad_norm": 1.1257143020629883, + "learning_rate": 3.1458563832114233e-06, + "loss": 0.6806, + "step": 104490 + }, + { + "epoch": 1.6773945007142972, + "grad_norm": 1.2325403690338135, + "learning_rate": 3.1427954533454875e-06, + "loss": 0.6635, + "step": 104500 + }, + { + "epoch": 1.6775550169344613, + "grad_norm": 0.8864719271659851, + "learning_rate": 3.1397359134694713e-06, + "loss": 0.6658, + "step": 104510 + }, + { + "epoch": 1.6777155331546254, + "grad_norm": 0.7991852164268494, + "learning_rate": 3.1366777637779425e-06, + "loss": 0.7705, + "step": 104520 + }, + { + "epoch": 1.6778760493747893, + "grad_norm": 1.164412021636963, + "learning_rate": 3.133621004465387e-06, + "loss": 0.784, + "step": 104530 + }, + { + "epoch": 1.6780365655949534, + "grad_norm": 0.7361593246459961, + "learning_rate": 3.1305656357261903e-06, + "loss": 0.7185, + "step": 104540 + }, + { + "epoch": 1.6781970818151173, + "grad_norm": 0.923835813999176, + "learning_rate": 3.1275116577546614e-06, + "loss": 0.6475, + "step": 104550 + }, + { + "epoch": 1.6783575980352814, + "grad_norm": 1.3355457782745361, + "learning_rate": 3.124459070745009e-06, + "loss": 0.5015, + "step": 104560 + }, + { + "epoch": 1.6785181142554455, + "grad_norm": 1.2548221349716187, + "learning_rate": 3.1214078748913716e-06, + "loss": 0.6849, + "step": 104570 + }, + { + "epoch": 1.6786786304756096, + "grad_norm": 1.5230014324188232, + "learning_rate": 3.1183580703877723e-06, + "loss": 0.7971, + "step": 104580 + }, + { + "epoch": 1.6788391466957737, + "grad_norm": 0.9869030117988586, + "learning_rate": 3.115309657428167e-06, + "loss": 0.7273, + "step": 104590 + }, + { + "epoch": 1.6789996629159376, + "grad_norm": 0.9814349412918091, + "learning_rate": 3.11226263620642e-06, + "loss": 0.7411, + "step": 104600 + }, + { + "epoch": 1.6791601791361017, + "grad_norm": 1.000830888748169, + "learning_rate": 3.1092170069162985e-06, + "loss": 0.8606, + "step": 104610 + }, + { + "epoch": 1.6793206953562656, + "grad_norm": 2.1208364963531494, + "learning_rate": 3.106172769751492e-06, + "loss": 0.6863, + "step": 104620 + }, + { + "epoch": 1.6794812115764297, + "grad_norm": 1.062882661819458, + "learning_rate": 3.103129924905598e-06, + "loss": 0.7373, + "step": 104630 + }, + { + "epoch": 1.6796417277965938, + "grad_norm": 1.0899945497512817, + "learning_rate": 3.100088472572113e-06, + "loss": 0.7137, + "step": 104640 + }, + { + "epoch": 1.679802244016758, + "grad_norm": 1.6699237823486328, + "learning_rate": 3.0970484129444617e-06, + "loss": 0.6622, + "step": 104650 + }, + { + "epoch": 1.679962760236922, + "grad_norm": 1.1438051462173462, + "learning_rate": 3.0940097462159755e-06, + "loss": 0.6999, + "step": 104660 + }, + { + "epoch": 1.6801232764570861, + "grad_norm": 0.9617446064949036, + "learning_rate": 3.090972472579892e-06, + "loss": 0.5891, + "step": 104670 + }, + { + "epoch": 1.68028379267725, + "grad_norm": 0.9572314023971558, + "learning_rate": 3.087936592229368e-06, + "loss": 0.7079, + "step": 104680 + }, + { + "epoch": 1.680444308897414, + "grad_norm": 1.108169436454773, + "learning_rate": 3.0849021053574706e-06, + "loss": 0.7529, + "step": 104690 + }, + { + "epoch": 1.680604825117578, + "grad_norm": 0.8860900402069092, + "learning_rate": 3.081869012157165e-06, + "loss": 0.6994, + "step": 104700 + }, + { + "epoch": 1.6807653413377421, + "grad_norm": 1.165071964263916, + "learning_rate": 3.078837312821345e-06, + "loss": 0.7556, + "step": 104710 + }, + { + "epoch": 1.6809258575579062, + "grad_norm": 0.9576390981674194, + "learning_rate": 3.075807007542808e-06, + "loss": 0.5884, + "step": 104720 + }, + { + "epoch": 1.6810863737780704, + "grad_norm": 1.0816198587417603, + "learning_rate": 3.0727780965142614e-06, + "loss": 0.6841, + "step": 104730 + }, + { + "epoch": 1.6812468899982345, + "grad_norm": 1.057433843612671, + "learning_rate": 3.0697505799283322e-06, + "loss": 0.7279, + "step": 104740 + }, + { + "epoch": 1.6814074062183983, + "grad_norm": 0.9690642356872559, + "learning_rate": 3.0667244579775488e-06, + "loss": 0.7531, + "step": 104750 + }, + { + "epoch": 1.6815679224385625, + "grad_norm": 1.0878876447677612, + "learning_rate": 3.0636997308543527e-06, + "loss": 0.6765, + "step": 104760 + }, + { + "epoch": 1.6817284386587263, + "grad_norm": 1.3860126733779907, + "learning_rate": 3.0606763987511e-06, + "loss": 0.9312, + "step": 104770 + }, + { + "epoch": 1.6818889548788905, + "grad_norm": 1.8416825532913208, + "learning_rate": 3.057654461860057e-06, + "loss": 0.7819, + "step": 104780 + }, + { + "epoch": 1.6820494710990546, + "grad_norm": 1.6423465013504028, + "learning_rate": 3.054633920373401e-06, + "loss": 0.607, + "step": 104790 + }, + { + "epoch": 1.6822099873192187, + "grad_norm": 1.0361770391464233, + "learning_rate": 3.051614774483225e-06, + "loss": 0.7167, + "step": 104800 + }, + { + "epoch": 1.6823705035393828, + "grad_norm": 1.0738285779953003, + "learning_rate": 3.048597024381522e-06, + "loss": 0.7331, + "step": 104810 + }, + { + "epoch": 1.6825310197595467, + "grad_norm": 1.0575509071350098, + "learning_rate": 3.045580670260212e-06, + "loss": 0.5975, + "step": 104820 + }, + { + "epoch": 1.6826915359797108, + "grad_norm": 1.2243037223815918, + "learning_rate": 3.042565712311107e-06, + "loss": 0.7591, + "step": 104830 + }, + { + "epoch": 1.6828520521998747, + "grad_norm": 1.0767914056777954, + "learning_rate": 3.0395521507259466e-06, + "loss": 0.8706, + "step": 104840 + }, + { + "epoch": 1.6830125684200388, + "grad_norm": 1.30801522731781, + "learning_rate": 3.036539985696374e-06, + "loss": 0.7771, + "step": 104850 + }, + { + "epoch": 1.6831730846402029, + "grad_norm": 1.1379749774932861, + "learning_rate": 3.033529217413947e-06, + "loss": 0.7741, + "step": 104860 + }, + { + "epoch": 1.683333600860367, + "grad_norm": 1.0097076892852783, + "learning_rate": 3.030519846070129e-06, + "loss": 0.7214, + "step": 104870 + }, + { + "epoch": 1.683494117080531, + "grad_norm": 0.6674444675445557, + "learning_rate": 3.027511871856309e-06, + "loss": 0.7227, + "step": 104880 + }, + { + "epoch": 1.683654633300695, + "grad_norm": 0.8193458318710327, + "learning_rate": 3.0245052949637617e-06, + "loss": 0.6748, + "step": 104890 + }, + { + "epoch": 1.683815149520859, + "grad_norm": 1.5583178997039795, + "learning_rate": 3.0215001155836975e-06, + "loss": 0.6146, + "step": 104900 + }, + { + "epoch": 1.683975665741023, + "grad_norm": 0.8207075595855713, + "learning_rate": 3.0184963339072224e-06, + "loss": 0.6221, + "step": 104910 + }, + { + "epoch": 1.684136181961187, + "grad_norm": 1.4945178031921387, + "learning_rate": 3.0154939501253613e-06, + "loss": 0.6961, + "step": 104920 + }, + { + "epoch": 1.6842966981813512, + "grad_norm": 0.8524873852729797, + "learning_rate": 3.0124929644290504e-06, + "loss": 0.7028, + "step": 104930 + }, + { + "epoch": 1.6844572144015153, + "grad_norm": 2.765005111694336, + "learning_rate": 3.00949337700914e-06, + "loss": 0.7707, + "step": 104940 + }, + { + "epoch": 1.6846177306216794, + "grad_norm": 0.7927536964416504, + "learning_rate": 3.006495188056371e-06, + "loss": 0.7453, + "step": 104950 + }, + { + "epoch": 1.6847782468418435, + "grad_norm": 0.9341903328895569, + "learning_rate": 3.003498397761423e-06, + "loss": 0.7279, + "step": 104960 + }, + { + "epoch": 1.6849387630620074, + "grad_norm": 1.215414047241211, + "learning_rate": 3.0005030063148697e-06, + "loss": 0.6641, + "step": 104970 + }, + { + "epoch": 1.6850992792821715, + "grad_norm": 0.8195786476135254, + "learning_rate": 2.9975090139071986e-06, + "loss": 0.6927, + "step": 104980 + }, + { + "epoch": 1.6852597955023354, + "grad_norm": 0.9357839822769165, + "learning_rate": 2.9945164207288157e-06, + "loss": 0.7519, + "step": 104990 + }, + { + "epoch": 1.6854203117224995, + "grad_norm": 0.8392741084098816, + "learning_rate": 2.991525226970032e-06, + "loss": 0.6684, + "step": 105000 + }, + { + "epoch": 1.6855808279426636, + "grad_norm": 0.8452712297439575, + "learning_rate": 2.988535432821063e-06, + "loss": 0.6115, + "step": 105010 + }, + { + "epoch": 1.6857413441628277, + "grad_norm": 1.018808364868164, + "learning_rate": 2.9855470384720474e-06, + "loss": 0.7811, + "step": 105020 + }, + { + "epoch": 1.6859018603829918, + "grad_norm": 0.8965296745300293, + "learning_rate": 2.982560044113028e-06, + "loss": 0.7162, + "step": 105030 + }, + { + "epoch": 1.6860623766031557, + "grad_norm": 1.208753228187561, + "learning_rate": 2.979574449933961e-06, + "loss": 0.6922, + "step": 105040 + }, + { + "epoch": 1.6862228928233198, + "grad_norm": 0.854137122631073, + "learning_rate": 2.9765902561247114e-06, + "loss": 0.6642, + "step": 105050 + }, + { + "epoch": 1.6863834090434837, + "grad_norm": 1.712454080581665, + "learning_rate": 2.9736074628750583e-06, + "loss": 0.6395, + "step": 105060 + }, + { + "epoch": 1.6865439252636478, + "grad_norm": 1.1735680103302002, + "learning_rate": 2.9706260703746853e-06, + "loss": 0.6388, + "step": 105070 + }, + { + "epoch": 1.686704441483812, + "grad_norm": 0.8702462911605835, + "learning_rate": 2.9676460788131993e-06, + "loss": 0.6104, + "step": 105080 + }, + { + "epoch": 1.686864957703976, + "grad_norm": 1.1367392539978027, + "learning_rate": 2.9646674883801018e-06, + "loss": 0.7622, + "step": 105090 + }, + { + "epoch": 1.6870254739241402, + "grad_norm": 1.101873755455017, + "learning_rate": 2.9616902992648215e-06, + "loss": 0.7934, + "step": 105100 + }, + { + "epoch": 1.687185990144304, + "grad_norm": 0.7002279162406921, + "learning_rate": 2.9587145116566906e-06, + "loss": 0.7199, + "step": 105110 + }, + { + "epoch": 1.6873465063644681, + "grad_norm": 1.2798588275909424, + "learning_rate": 2.9557401257449406e-06, + "loss": 0.6822, + "step": 105120 + }, + { + "epoch": 1.687507022584632, + "grad_norm": 0.8857151865959167, + "learning_rate": 2.9527671417187346e-06, + "loss": 0.6632, + "step": 105130 + }, + { + "epoch": 1.6876675388047961, + "grad_norm": 1.4892016649246216, + "learning_rate": 2.949795559767132e-06, + "loss": 0.7026, + "step": 105140 + }, + { + "epoch": 1.6878280550249602, + "grad_norm": 1.0909379720687866, + "learning_rate": 2.94682538007911e-06, + "loss": 0.7007, + "step": 105150 + }, + { + "epoch": 1.6879885712451244, + "grad_norm": 1.5153040885925293, + "learning_rate": 2.943856602843556e-06, + "loss": 0.7164, + "step": 105160 + }, + { + "epoch": 1.6881490874652885, + "grad_norm": 0.8662139773368835, + "learning_rate": 2.940889228249266e-06, + "loss": 0.6303, + "step": 105170 + }, + { + "epoch": 1.6883096036854526, + "grad_norm": 1.0522719621658325, + "learning_rate": 2.937923256484948e-06, + "loss": 0.6178, + "step": 105180 + }, + { + "epoch": 1.6884701199056165, + "grad_norm": 1.1933393478393555, + "learning_rate": 2.9349586877392226e-06, + "loss": 0.6898, + "step": 105190 + }, + { + "epoch": 1.6886306361257803, + "grad_norm": 1.0950733423233032, + "learning_rate": 2.931995522200612e-06, + "loss": 0.702, + "step": 105200 + }, + { + "epoch": 1.6887911523459445, + "grad_norm": 0.7301512360572815, + "learning_rate": 2.929033760057559e-06, + "loss": 0.6172, + "step": 105210 + }, + { + "epoch": 1.6889516685661086, + "grad_norm": 1.4002459049224854, + "learning_rate": 2.9260734014984164e-06, + "loss": 0.7529, + "step": 105220 + }, + { + "epoch": 1.6891121847862727, + "grad_norm": 1.8569213151931763, + "learning_rate": 2.9231144467114447e-06, + "loss": 0.6283, + "step": 105230 + }, + { + "epoch": 1.6892727010064368, + "grad_norm": 1.44053316116333, + "learning_rate": 2.920156895884818e-06, + "loss": 0.6853, + "step": 105240 + }, + { + "epoch": 1.689433217226601, + "grad_norm": 1.173714518547058, + "learning_rate": 2.9172007492066196e-06, + "loss": 0.7275, + "step": 105250 + }, + { + "epoch": 1.6895937334467648, + "grad_norm": 1.2730343341827393, + "learning_rate": 2.914246006864835e-06, + "loss": 0.6203, + "step": 105260 + }, + { + "epoch": 1.6897542496669289, + "grad_norm": 1.2328218221664429, + "learning_rate": 2.9112926690473775e-06, + "loss": 0.7058, + "step": 105270 + }, + { + "epoch": 1.6899147658870928, + "grad_norm": 1.3164722919464111, + "learning_rate": 2.9083407359420555e-06, + "loss": 0.6787, + "step": 105280 + }, + { + "epoch": 1.6900752821072569, + "grad_norm": 1.317012071609497, + "learning_rate": 2.9053902077365997e-06, + "loss": 0.7479, + "step": 105290 + }, + { + "epoch": 1.690235798327421, + "grad_norm": 0.7620493769645691, + "learning_rate": 2.9024410846186427e-06, + "loss": 0.7507, + "step": 105300 + }, + { + "epoch": 1.690396314547585, + "grad_norm": 0.706174910068512, + "learning_rate": 2.8994933667757405e-06, + "loss": 0.7103, + "step": 105310 + }, + { + "epoch": 1.6905568307677492, + "grad_norm": 0.7225069403648376, + "learning_rate": 2.8965470543953348e-06, + "loss": 0.6998, + "step": 105320 + }, + { + "epoch": 1.690717346987913, + "grad_norm": 0.9382842779159546, + "learning_rate": 2.893602147664806e-06, + "loss": 0.6469, + "step": 105330 + }, + { + "epoch": 1.6908778632080772, + "grad_norm": 1.307315707206726, + "learning_rate": 2.890658646771427e-06, + "loss": 0.7582, + "step": 105340 + }, + { + "epoch": 1.691038379428241, + "grad_norm": 1.649861216545105, + "learning_rate": 2.887716551902392e-06, + "loss": 0.7045, + "step": 105350 + }, + { + "epoch": 1.6911988956484052, + "grad_norm": 0.9390506148338318, + "learning_rate": 2.8847758632447956e-06, + "loss": 0.5956, + "step": 105360 + }, + { + "epoch": 1.6913594118685693, + "grad_norm": 1.2636363506317139, + "learning_rate": 2.881836580985653e-06, + "loss": 0.7023, + "step": 105370 + }, + { + "epoch": 1.6915199280887334, + "grad_norm": 0.8914257884025574, + "learning_rate": 2.878898705311886e-06, + "loss": 0.6037, + "step": 105380 + }, + { + "epoch": 1.6916804443088975, + "grad_norm": 0.7186816334724426, + "learning_rate": 2.8759622364103204e-06, + "loss": 0.7827, + "step": 105390 + }, + { + "epoch": 1.6918409605290614, + "grad_norm": 0.9006560444831848, + "learning_rate": 2.873027174467702e-06, + "loss": 0.6949, + "step": 105400 + }, + { + "epoch": 1.6920014767492255, + "grad_norm": 0.9993123412132263, + "learning_rate": 2.870093519670683e-06, + "loss": 0.5939, + "step": 105410 + }, + { + "epoch": 1.6921619929693894, + "grad_norm": 1.1709775924682617, + "learning_rate": 2.867161272205826e-06, + "loss": 0.6645, + "step": 105420 + }, + { + "epoch": 1.6923225091895535, + "grad_norm": 1.217893362045288, + "learning_rate": 2.864230432259607e-06, + "loss": 0.7971, + "step": 105430 + }, + { + "epoch": 1.6924830254097176, + "grad_norm": 1.004776120185852, + "learning_rate": 2.8613010000184126e-06, + "loss": 0.8101, + "step": 105440 + }, + { + "epoch": 1.6926435416298817, + "grad_norm": 1.035390853881836, + "learning_rate": 2.8583729756685275e-06, + "loss": 0.6467, + "step": 105450 + }, + { + "epoch": 1.6928040578500458, + "grad_norm": 1.021192193031311, + "learning_rate": 2.855446359396163e-06, + "loss": 0.6433, + "step": 105460 + }, + { + "epoch": 1.69296457407021, + "grad_norm": 1.1577949523925781, + "learning_rate": 2.8525211513874383e-06, + "loss": 0.6289, + "step": 105470 + }, + { + "epoch": 1.6931250902903738, + "grad_norm": 1.083909273147583, + "learning_rate": 2.8495973518283733e-06, + "loss": 0.6734, + "step": 105480 + }, + { + "epoch": 1.693285606510538, + "grad_norm": 1.1775319576263428, + "learning_rate": 2.8466749609049077e-06, + "loss": 0.7008, + "step": 105490 + }, + { + "epoch": 1.6934461227307018, + "grad_norm": 1.0225157737731934, + "learning_rate": 2.8437539788028906e-06, + "loss": 0.7638, + "step": 105500 + }, + { + "epoch": 1.693606638950866, + "grad_norm": 1.2374978065490723, + "learning_rate": 2.840834405708073e-06, + "loss": 0.8162, + "step": 105510 + }, + { + "epoch": 1.69376715517103, + "grad_norm": 0.9367249011993408, + "learning_rate": 2.8379162418061267e-06, + "loss": 0.6872, + "step": 105520 + }, + { + "epoch": 1.6939276713911942, + "grad_norm": 1.1848043203353882, + "learning_rate": 2.8349994872826273e-06, + "loss": 0.6838, + "step": 105530 + }, + { + "epoch": 1.6940881876113583, + "grad_norm": 1.176611304283142, + "learning_rate": 2.8320841423230688e-06, + "loss": 0.7725, + "step": 105540 + }, + { + "epoch": 1.6942487038315222, + "grad_norm": 0.9198991060256958, + "learning_rate": 2.829170207112844e-06, + "loss": 0.6994, + "step": 105550 + }, + { + "epoch": 1.6944092200516863, + "grad_norm": 1.23841392993927, + "learning_rate": 2.826257681837269e-06, + "loss": 0.6915, + "step": 105560 + }, + { + "epoch": 1.6945697362718501, + "grad_norm": 1.2866021394729614, + "learning_rate": 2.8233465666815535e-06, + "loss": 0.7439, + "step": 105570 + }, + { + "epoch": 1.6947302524920143, + "grad_norm": 0.8914940357208252, + "learning_rate": 2.820436861830836e-06, + "loss": 0.6706, + "step": 105580 + }, + { + "epoch": 1.6948907687121784, + "grad_norm": 0.8783137798309326, + "learning_rate": 2.8175285674701483e-06, + "loss": 0.6353, + "step": 105590 + }, + { + "epoch": 1.6950512849323425, + "grad_norm": 0.8666733503341675, + "learning_rate": 2.814621683784449e-06, + "loss": 0.6126, + "step": 105600 + }, + { + "epoch": 1.6952118011525066, + "grad_norm": 0.7092216610908508, + "learning_rate": 2.8117162109585955e-06, + "loss": 0.7464, + "step": 105610 + }, + { + "epoch": 1.6953723173726705, + "grad_norm": 0.9214487671852112, + "learning_rate": 2.808812149177359e-06, + "loss": 0.6558, + "step": 105620 + }, + { + "epoch": 1.6955328335928346, + "grad_norm": 0.8518943786621094, + "learning_rate": 2.8059094986254255e-06, + "loss": 0.7129, + "step": 105630 + }, + { + "epoch": 1.6956933498129985, + "grad_norm": 0.9947484135627747, + "learning_rate": 2.803008259487375e-06, + "loss": 0.7075, + "step": 105640 + }, + { + "epoch": 1.6958538660331626, + "grad_norm": 1.7062394618988037, + "learning_rate": 2.800108431947718e-06, + "loss": 0.5204, + "step": 105650 + }, + { + "epoch": 1.6960143822533267, + "grad_norm": 0.8745878338813782, + "learning_rate": 2.7972100161908653e-06, + "loss": 0.6826, + "step": 105660 + }, + { + "epoch": 1.6961748984734908, + "grad_norm": 1.083156704902649, + "learning_rate": 2.794313012401137e-06, + "loss": 0.6107, + "step": 105670 + }, + { + "epoch": 1.696335414693655, + "grad_norm": 1.2760143280029297, + "learning_rate": 2.791417420762768e-06, + "loss": 0.6389, + "step": 105680 + }, + { + "epoch": 1.696495930913819, + "grad_norm": 1.6136713027954102, + "learning_rate": 2.788523241459898e-06, + "loss": 0.6229, + "step": 105690 + }, + { + "epoch": 1.696656447133983, + "grad_norm": 2.0694515705108643, + "learning_rate": 2.7856304746765814e-06, + "loss": 0.633, + "step": 105700 + }, + { + "epoch": 1.6968169633541468, + "grad_norm": 1.1101418733596802, + "learning_rate": 2.7827391205967835e-06, + "loss": 0.75, + "step": 105710 + }, + { + "epoch": 1.6969774795743109, + "grad_norm": 0.9188281893730164, + "learning_rate": 2.779849179404376e-06, + "loss": 0.6831, + "step": 105720 + }, + { + "epoch": 1.697137995794475, + "grad_norm": 1.226879358291626, + "learning_rate": 2.7769606512831393e-06, + "loss": 0.6259, + "step": 105730 + }, + { + "epoch": 1.697298512014639, + "grad_norm": 1.2669289112091064, + "learning_rate": 2.774073536416777e-06, + "loss": 0.7963, + "step": 105740 + }, + { + "epoch": 1.6974590282348032, + "grad_norm": 0.9723467826843262, + "learning_rate": 2.7711878349888783e-06, + "loss": 0.7326, + "step": 105750 + }, + { + "epoch": 1.6976195444549673, + "grad_norm": 0.9452223181724548, + "learning_rate": 2.7683035471829626e-06, + "loss": 0.8142, + "step": 105760 + }, + { + "epoch": 1.6977800606751312, + "grad_norm": 1.2559179067611694, + "learning_rate": 2.765420673182456e-06, + "loss": 0.6252, + "step": 105770 + }, + { + "epoch": 1.6979405768952953, + "grad_norm": 1.2226539850234985, + "learning_rate": 2.7625392131706895e-06, + "loss": 0.6815, + "step": 105780 + }, + { + "epoch": 1.6981010931154592, + "grad_norm": 1.227070927619934, + "learning_rate": 2.759659167330908e-06, + "loss": 0.7128, + "step": 105790 + }, + { + "epoch": 1.6982616093356233, + "grad_norm": 0.833321213722229, + "learning_rate": 2.756780535846268e-06, + "loss": 0.8204, + "step": 105800 + }, + { + "epoch": 1.6984221255557874, + "grad_norm": 0.966153085231781, + "learning_rate": 2.7539033188998365e-06, + "loss": 0.6922, + "step": 105810 + }, + { + "epoch": 1.6985826417759515, + "grad_norm": 0.7233078479766846, + "learning_rate": 2.7510275166745756e-06, + "loss": 0.7149, + "step": 105820 + }, + { + "epoch": 1.6987431579961156, + "grad_norm": 1.0138888359069824, + "learning_rate": 2.7481531293533787e-06, + "loss": 0.7359, + "step": 105830 + }, + { + "epoch": 1.6989036742162795, + "grad_norm": 0.9765477776527405, + "learning_rate": 2.7452801571190345e-06, + "loss": 0.654, + "step": 105840 + }, + { + "epoch": 1.6990641904364436, + "grad_norm": 1.1986340284347534, + "learning_rate": 2.7424086001542532e-06, + "loss": 0.531, + "step": 105850 + }, + { + "epoch": 1.6992247066566075, + "grad_norm": 0.9768362641334534, + "learning_rate": 2.739538458641644e-06, + "loss": 0.5902, + "step": 105860 + }, + { + "epoch": 1.6993852228767716, + "grad_norm": 0.8864530920982361, + "learning_rate": 2.7366697327637386e-06, + "loss": 0.7736, + "step": 105870 + }, + { + "epoch": 1.6995457390969357, + "grad_norm": 1.4961435794830322, + "learning_rate": 2.7338024227029633e-06, + "loss": 0.7359, + "step": 105880 + }, + { + "epoch": 1.6997062553170998, + "grad_norm": 0.7244638204574585, + "learning_rate": 2.7309365286416617e-06, + "loss": 0.7921, + "step": 105890 + }, + { + "epoch": 1.699866771537264, + "grad_norm": 1.1333788633346558, + "learning_rate": 2.7280720507620927e-06, + "loss": 0.6871, + "step": 105900 + }, + { + "epoch": 1.7000272877574278, + "grad_norm": 1.0092835426330566, + "learning_rate": 2.7252089892464168e-06, + "loss": 0.6984, + "step": 105910 + }, + { + "epoch": 1.700187803977592, + "grad_norm": 0.8814666271209717, + "learning_rate": 2.72234734427671e-06, + "loss": 0.7593, + "step": 105920 + }, + { + "epoch": 1.7003483201977558, + "grad_norm": 1.3522427082061768, + "learning_rate": 2.719487116034955e-06, + "loss": 0.7542, + "step": 105930 + }, + { + "epoch": 1.70050883641792, + "grad_norm": 0.9694757461547852, + "learning_rate": 2.716628304703053e-06, + "loss": 0.6724, + "step": 105940 + }, + { + "epoch": 1.700669352638084, + "grad_norm": 1.3869454860687256, + "learning_rate": 2.713770910462793e-06, + "loss": 0.6458, + "step": 105950 + }, + { + "epoch": 1.7008298688582482, + "grad_norm": 0.8784160614013672, + "learning_rate": 2.7109149334958976e-06, + "loss": 0.6607, + "step": 105960 + }, + { + "epoch": 1.7009903850784123, + "grad_norm": 1.039460301399231, + "learning_rate": 2.7080603739839894e-06, + "loss": 0.694, + "step": 105970 + }, + { + "epoch": 1.7011509012985764, + "grad_norm": 1.2839628458023071, + "learning_rate": 2.7052072321086e-06, + "loss": 0.6799, + "step": 105980 + }, + { + "epoch": 1.7013114175187403, + "grad_norm": 1.0725734233856201, + "learning_rate": 2.702355508051174e-06, + "loss": 0.6547, + "step": 105990 + }, + { + "epoch": 1.7014719337389042, + "grad_norm": 0.9432069063186646, + "learning_rate": 2.699505201993069e-06, + "loss": 0.7334, + "step": 106000 + }, + { + "epoch": 1.7016324499590683, + "grad_norm": 1.0609524250030518, + "learning_rate": 2.6966563141155366e-06, + "loss": 0.5769, + "step": 106010 + }, + { + "epoch": 1.7017929661792324, + "grad_norm": 1.3000565767288208, + "learning_rate": 2.693808844599757e-06, + "loss": 0.8063, + "step": 106020 + }, + { + "epoch": 1.7019534823993965, + "grad_norm": 1.0658575296401978, + "learning_rate": 2.6909627936268084e-06, + "loss": 0.6675, + "step": 106030 + }, + { + "epoch": 1.7021139986195606, + "grad_norm": 1.0435214042663574, + "learning_rate": 2.6881181613776868e-06, + "loss": 0.8546, + "step": 106040 + }, + { + "epoch": 1.7022745148397247, + "grad_norm": 1.4672831296920776, + "learning_rate": 2.6852749480332923e-06, + "loss": 0.7639, + "step": 106050 + }, + { + "epoch": 1.7024350310598886, + "grad_norm": 1.4124571084976196, + "learning_rate": 2.682433153774441e-06, + "loss": 0.7962, + "step": 106060 + }, + { + "epoch": 1.7025955472800527, + "grad_norm": 1.1907291412353516, + "learning_rate": 2.6795927787818474e-06, + "loss": 0.7658, + "step": 106070 + }, + { + "epoch": 1.7027560635002166, + "grad_norm": 1.135750651359558, + "learning_rate": 2.6767538232361446e-06, + "loss": 0.7948, + "step": 106080 + }, + { + "epoch": 1.7029165797203807, + "grad_norm": 1.407707691192627, + "learning_rate": 2.6739162873178765e-06, + "loss": 0.7755, + "step": 106090 + }, + { + "epoch": 1.7030770959405448, + "grad_norm": 0.9089909791946411, + "learning_rate": 2.671080171207488e-06, + "loss": 0.7056, + "step": 106100 + }, + { + "epoch": 1.703237612160709, + "grad_norm": 1.6379035711288452, + "learning_rate": 2.668245475085346e-06, + "loss": 0.7824, + "step": 106110 + }, + { + "epoch": 1.703398128380873, + "grad_norm": 0.6578342318534851, + "learning_rate": 2.66541219913172e-06, + "loss": 0.618, + "step": 106120 + }, + { + "epoch": 1.703558644601037, + "grad_norm": 1.8058104515075684, + "learning_rate": 2.6625803435267855e-06, + "loss": 0.754, + "step": 106130 + }, + { + "epoch": 1.703719160821201, + "grad_norm": 0.9591531157493591, + "learning_rate": 2.6597499084506316e-06, + "loss": 0.7521, + "step": 106140 + }, + { + "epoch": 1.703879677041365, + "grad_norm": 1.0128488540649414, + "learning_rate": 2.6569208940832618e-06, + "loss": 0.6238, + "step": 106150 + }, + { + "epoch": 1.704040193261529, + "grad_norm": 1.430046796798706, + "learning_rate": 2.6540933006045795e-06, + "loss": 0.8138, + "step": 106160 + }, + { + "epoch": 1.704200709481693, + "grad_norm": 0.9634451270103455, + "learning_rate": 2.6512671281944094e-06, + "loss": 0.5926, + "step": 106170 + }, + { + "epoch": 1.7043612257018572, + "grad_norm": 1.0537300109863281, + "learning_rate": 2.648442377032473e-06, + "loss": 0.6252, + "step": 106180 + }, + { + "epoch": 1.7045217419220213, + "grad_norm": 0.9580572843551636, + "learning_rate": 2.645619047298417e-06, + "loss": 0.6579, + "step": 106190 + }, + { + "epoch": 1.7046822581421852, + "grad_norm": 1.9924811124801636, + "learning_rate": 2.6427971391717794e-06, + "loss": 0.6755, + "step": 106200 + }, + { + "epoch": 1.7048427743623493, + "grad_norm": 0.9517027139663696, + "learning_rate": 2.6399766528320186e-06, + "loss": 0.7083, + "step": 106210 + }, + { + "epoch": 1.7050032905825132, + "grad_norm": 1.5287823677062988, + "learning_rate": 2.6371575884585026e-06, + "loss": 0.6715, + "step": 106220 + }, + { + "epoch": 1.7051638068026773, + "grad_norm": 1.1722556352615356, + "learning_rate": 2.63433994623051e-06, + "loss": 0.7076, + "step": 106230 + }, + { + "epoch": 1.7053243230228414, + "grad_norm": 0.5873991250991821, + "learning_rate": 2.6315237263272203e-06, + "loss": 0.588, + "step": 106240 + }, + { + "epoch": 1.7054848392430055, + "grad_norm": 1.1565353870391846, + "learning_rate": 2.6287089289277396e-06, + "loss": 0.6417, + "step": 106250 + }, + { + "epoch": 1.7056453554631696, + "grad_norm": 0.7922542095184326, + "learning_rate": 2.625895554211058e-06, + "loss": 0.7631, + "step": 106260 + }, + { + "epoch": 1.7058058716833338, + "grad_norm": 1.2978876829147339, + "learning_rate": 2.623083602356097e-06, + "loss": 0.6821, + "step": 106270 + }, + { + "epoch": 1.7059663879034976, + "grad_norm": 1.2551754713058472, + "learning_rate": 2.620273073541679e-06, + "loss": 0.7367, + "step": 106280 + }, + { + "epoch": 1.7061269041236617, + "grad_norm": 1.2292811870574951, + "learning_rate": 2.61746396794654e-06, + "loss": 0.7481, + "step": 106290 + }, + { + "epoch": 1.7062874203438256, + "grad_norm": 0.9667523503303528, + "learning_rate": 2.61465628574932e-06, + "loss": 0.6733, + "step": 106300 + }, + { + "epoch": 1.7064479365639897, + "grad_norm": 1.148815393447876, + "learning_rate": 2.6118500271285707e-06, + "loss": 0.6827, + "step": 106310 + }, + { + "epoch": 1.7066084527841539, + "grad_norm": 0.9811447262763977, + "learning_rate": 2.6090451922627547e-06, + "loss": 0.6993, + "step": 106320 + }, + { + "epoch": 1.706768969004318, + "grad_norm": 0.9222663640975952, + "learning_rate": 2.6062417813302433e-06, + "loss": 0.6897, + "step": 106330 + }, + { + "epoch": 1.706929485224482, + "grad_norm": 1.2775770425796509, + "learning_rate": 2.603439794509316e-06, + "loss": 0.7619, + "step": 106340 + }, + { + "epoch": 1.707090001444646, + "grad_norm": 1.2058501243591309, + "learning_rate": 2.6006392319781636e-06, + "loss": 0.6198, + "step": 106350 + }, + { + "epoch": 1.70725051766481, + "grad_norm": 0.7686787843704224, + "learning_rate": 2.5978400939148883e-06, + "loss": 0.7003, + "step": 106360 + }, + { + "epoch": 1.707411033884974, + "grad_norm": 0.6733344197273254, + "learning_rate": 2.5950423804974948e-06, + "loss": 0.7343, + "step": 106370 + }, + { + "epoch": 1.707571550105138, + "grad_norm": 1.6990734338760376, + "learning_rate": 2.5922460919038993e-06, + "loss": 0.7097, + "step": 106380 + }, + { + "epoch": 1.7077320663253022, + "grad_norm": 1.0928572416305542, + "learning_rate": 2.5894512283119314e-06, + "loss": 0.6107, + "step": 106390 + }, + { + "epoch": 1.7078925825454663, + "grad_norm": 0.9840784072875977, + "learning_rate": 2.58665778989933e-06, + "loss": 0.6774, + "step": 106400 + }, + { + "epoch": 1.7080530987656304, + "grad_norm": 1.38318932056427, + "learning_rate": 2.5838657768437413e-06, + "loss": 0.7116, + "step": 106410 + }, + { + "epoch": 1.7082136149857943, + "grad_norm": 1.111141562461853, + "learning_rate": 2.581075189322718e-06, + "loss": 0.6591, + "step": 106420 + }, + { + "epoch": 1.7083741312059584, + "grad_norm": 1.26133131980896, + "learning_rate": 2.5782860275137295e-06, + "loss": 0.7105, + "step": 106430 + }, + { + "epoch": 1.7085346474261223, + "grad_norm": 1.0775426626205444, + "learning_rate": 2.57549829159415e-06, + "loss": 0.7596, + "step": 106440 + }, + { + "epoch": 1.7086951636462864, + "grad_norm": 0.8852244019508362, + "learning_rate": 2.5727119817412577e-06, + "loss": 0.6282, + "step": 106450 + }, + { + "epoch": 1.7088556798664505, + "grad_norm": 0.8114358186721802, + "learning_rate": 2.5699270981322464e-06, + "loss": 0.7545, + "step": 106460 + }, + { + "epoch": 1.7090161960866146, + "grad_norm": 1.1319184303283691, + "learning_rate": 2.5671436409442222e-06, + "loss": 0.7037, + "step": 106470 + }, + { + "epoch": 1.7091767123067787, + "grad_norm": 1.0011742115020752, + "learning_rate": 2.5643616103541934e-06, + "loss": 0.7878, + "step": 106480 + }, + { + "epoch": 1.7093372285269428, + "grad_norm": 0.6891486644744873, + "learning_rate": 2.5615810065390824e-06, + "loss": 0.619, + "step": 106490 + }, + { + "epoch": 1.7094977447471067, + "grad_norm": 1.0071430206298828, + "learning_rate": 2.558801829675725e-06, + "loss": 0.6153, + "step": 106500 + }, + { + "epoch": 1.7096582609672706, + "grad_norm": 1.0513358116149902, + "learning_rate": 2.55602407994085e-06, + "loss": 0.6891, + "step": 106510 + }, + { + "epoch": 1.7098187771874347, + "grad_norm": 1.1413711309432983, + "learning_rate": 2.5532477575111097e-06, + "loss": 0.7113, + "step": 106520 + }, + { + "epoch": 1.7099792934075988, + "grad_norm": 1.1527161598205566, + "learning_rate": 2.5504728625630664e-06, + "loss": 0.6743, + "step": 106530 + }, + { + "epoch": 1.710139809627763, + "grad_norm": 0.9076132774353027, + "learning_rate": 2.5476993952731785e-06, + "loss": 0.7608, + "step": 106540 + }, + { + "epoch": 1.710300325847927, + "grad_norm": 0.9928293824195862, + "learning_rate": 2.5449273558178304e-06, + "loss": 0.6676, + "step": 106550 + }, + { + "epoch": 1.7104608420680911, + "grad_norm": 1.377168893814087, + "learning_rate": 2.5421567443733084e-06, + "loss": 0.7703, + "step": 106560 + }, + { + "epoch": 1.710621358288255, + "grad_norm": 0.9292163848876953, + "learning_rate": 2.5393875611158e-06, + "loss": 0.6916, + "step": 106570 + }, + { + "epoch": 1.7107818745084191, + "grad_norm": 0.8661201000213623, + "learning_rate": 2.536619806221413e-06, + "loss": 0.7022, + "step": 106580 + }, + { + "epoch": 1.710942390728583, + "grad_norm": 0.9522826671600342, + "learning_rate": 2.533853479866158e-06, + "loss": 0.7601, + "step": 106590 + }, + { + "epoch": 1.7111029069487471, + "grad_norm": 0.6758924722671509, + "learning_rate": 2.531088582225957e-06, + "loss": 0.6574, + "step": 106600 + }, + { + "epoch": 1.7112634231689112, + "grad_norm": 1.5748566389083862, + "learning_rate": 2.5283251134766457e-06, + "loss": 0.8004, + "step": 106610 + }, + { + "epoch": 1.7114239393890753, + "grad_norm": 1.0768392086029053, + "learning_rate": 2.5255630737939657e-06, + "loss": 0.6533, + "step": 106620 + }, + { + "epoch": 1.7115844556092394, + "grad_norm": 1.5689536333084106, + "learning_rate": 2.5228024633535574e-06, + "loss": 0.6312, + "step": 106630 + }, + { + "epoch": 1.7117449718294033, + "grad_norm": 0.8988999724388123, + "learning_rate": 2.5200432823309863e-06, + "loss": 0.7147, + "step": 106640 + }, + { + "epoch": 1.7119054880495674, + "grad_norm": 1.267966628074646, + "learning_rate": 2.517285530901717e-06, + "loss": 0.7017, + "step": 106650 + }, + { + "epoch": 1.7120660042697313, + "grad_norm": 1.4728673696517944, + "learning_rate": 2.514529209241129e-06, + "loss": 0.748, + "step": 106660 + }, + { + "epoch": 1.7122265204898954, + "grad_norm": 1.356523871421814, + "learning_rate": 2.5117743175245067e-06, + "loss": 0.6729, + "step": 106670 + }, + { + "epoch": 1.7123870367100595, + "grad_norm": 0.9547615051269531, + "learning_rate": 2.5090208559270496e-06, + "loss": 0.7207, + "step": 106680 + }, + { + "epoch": 1.7125475529302236, + "grad_norm": 0.9338287711143494, + "learning_rate": 2.506268824623853e-06, + "loss": 0.6426, + "step": 106690 + }, + { + "epoch": 1.7127080691503878, + "grad_norm": 0.8339183926582336, + "learning_rate": 2.503518223789933e-06, + "loss": 0.664, + "step": 106700 + }, + { + "epoch": 1.7128685853705516, + "grad_norm": 2.4556407928466797, + "learning_rate": 2.500769053600213e-06, + "loss": 0.7277, + "step": 106710 + }, + { + "epoch": 1.7130291015907158, + "grad_norm": 0.7300341725349426, + "learning_rate": 2.498021314229526e-06, + "loss": 0.764, + "step": 106720 + }, + { + "epoch": 1.7131896178108796, + "grad_norm": 1.323211908340454, + "learning_rate": 2.4952750058526097e-06, + "loss": 0.6558, + "step": 106730 + }, + { + "epoch": 1.7133501340310437, + "grad_norm": 1.2503255605697632, + "learning_rate": 2.4925301286441134e-06, + "loss": 0.7417, + "step": 106740 + }, + { + "epoch": 1.7135106502512079, + "grad_norm": 1.104594111442566, + "learning_rate": 2.4897866827786e-06, + "loss": 0.7403, + "step": 106750 + }, + { + "epoch": 1.713671166471372, + "grad_norm": 1.1431865692138672, + "learning_rate": 2.4870446684305273e-06, + "loss": 0.7008, + "step": 106760 + }, + { + "epoch": 1.713831682691536, + "grad_norm": 1.2687829732894897, + "learning_rate": 2.484304085774275e-06, + "loss": 0.708, + "step": 106770 + }, + { + "epoch": 1.7139921989117002, + "grad_norm": 1.0567119121551514, + "learning_rate": 2.4815649349841317e-06, + "loss": 0.6611, + "step": 106780 + }, + { + "epoch": 1.714152715131864, + "grad_norm": 1.7905833721160889, + "learning_rate": 2.4788272162342863e-06, + "loss": 0.6684, + "step": 106790 + }, + { + "epoch": 1.7143132313520282, + "grad_norm": 0.9548923969268799, + "learning_rate": 2.476090929698846e-06, + "loss": 0.6956, + "step": 106800 + }, + { + "epoch": 1.714473747572192, + "grad_norm": 1.3525317907333374, + "learning_rate": 2.473356075551825e-06, + "loss": 0.7391, + "step": 106810 + }, + { + "epoch": 1.7146342637923562, + "grad_norm": 1.394050121307373, + "learning_rate": 2.4706226539671333e-06, + "loss": 0.7006, + "step": 106820 + }, + { + "epoch": 1.7147947800125203, + "grad_norm": 1.23042631149292, + "learning_rate": 2.467890665118608e-06, + "loss": 0.6857, + "step": 106830 + }, + { + "epoch": 1.7149552962326844, + "grad_norm": 1.3576046228408813, + "learning_rate": 2.465160109179987e-06, + "loss": 0.6846, + "step": 106840 + }, + { + "epoch": 1.7151158124528485, + "grad_norm": 1.096462368965149, + "learning_rate": 2.462430986324915e-06, + "loss": 0.7148, + "step": 106850 + }, + { + "epoch": 1.7152763286730124, + "grad_norm": 1.052755355834961, + "learning_rate": 2.4597032967269533e-06, + "loss": 0.7885, + "step": 106860 + }, + { + "epoch": 1.7154368448931765, + "grad_norm": 0.6980003714561462, + "learning_rate": 2.456977040559566e-06, + "loss": 0.7094, + "step": 106870 + }, + { + "epoch": 1.7155973611133404, + "grad_norm": 0.9329773783683777, + "learning_rate": 2.45425221799612e-06, + "loss": 0.7053, + "step": 106880 + }, + { + "epoch": 1.7157578773335045, + "grad_norm": 0.9596794843673706, + "learning_rate": 2.4515288292099037e-06, + "loss": 0.7609, + "step": 106890 + }, + { + "epoch": 1.7159183935536686, + "grad_norm": 1.709786057472229, + "learning_rate": 2.4488068743741073e-06, + "loss": 0.7321, + "step": 106900 + }, + { + "epoch": 1.7160789097738327, + "grad_norm": 1.1740326881408691, + "learning_rate": 2.4460863536618335e-06, + "loss": 0.7081, + "step": 106910 + }, + { + "epoch": 1.7162394259939968, + "grad_norm": 1.2074812650680542, + "learning_rate": 2.443367267246088e-06, + "loss": 0.7431, + "step": 106920 + }, + { + "epoch": 1.7163999422141607, + "grad_norm": 0.9357895851135254, + "learning_rate": 2.440649615299789e-06, + "loss": 0.6114, + "step": 106930 + }, + { + "epoch": 1.7165604584343248, + "grad_norm": 0.9113976359367371, + "learning_rate": 2.4379333979957676e-06, + "loss": 0.6396, + "step": 106940 + }, + { + "epoch": 1.7167209746544887, + "grad_norm": 1.0857701301574707, + "learning_rate": 2.4352186155067543e-06, + "loss": 0.7695, + "step": 106950 + }, + { + "epoch": 1.7168814908746528, + "grad_norm": 1.0648707151412964, + "learning_rate": 2.4325052680053944e-06, + "loss": 0.6392, + "step": 106960 + }, + { + "epoch": 1.717042007094817, + "grad_norm": 2.649482250213623, + "learning_rate": 2.429793355664242e-06, + "loss": 0.6819, + "step": 106970 + }, + { + "epoch": 1.717202523314981, + "grad_norm": 1.1833131313323975, + "learning_rate": 2.4270828786557593e-06, + "loss": 0.6541, + "step": 106980 + }, + { + "epoch": 1.7173630395351451, + "grad_norm": 1.8262126445770264, + "learning_rate": 2.424373837152322e-06, + "loss": 0.8386, + "step": 106990 + }, + { + "epoch": 1.7175235557553092, + "grad_norm": 0.8530029058456421, + "learning_rate": 2.421666231326197e-06, + "loss": 0.7912, + "step": 107000 + }, + { + "epoch": 1.7176840719754731, + "grad_norm": 0.9565955400466919, + "learning_rate": 2.418960061349576e-06, + "loss": 0.6447, + "step": 107010 + }, + { + "epoch": 1.717844588195637, + "grad_norm": 1.0143616199493408, + "learning_rate": 2.4162553273945622e-06, + "loss": 0.7966, + "step": 107020 + }, + { + "epoch": 1.7180051044158011, + "grad_norm": 0.8695499897003174, + "learning_rate": 2.4135520296331538e-06, + "loss": 0.69, + "step": 107030 + }, + { + "epoch": 1.7181656206359652, + "grad_norm": 1.6830307245254517, + "learning_rate": 2.4108501682372693e-06, + "loss": 0.7465, + "step": 107040 + }, + { + "epoch": 1.7183261368561293, + "grad_norm": 0.9729588031768799, + "learning_rate": 2.4081497433787266e-06, + "loss": 0.7214, + "step": 107050 + }, + { + "epoch": 1.7184866530762934, + "grad_norm": 0.9892818927764893, + "learning_rate": 2.405450755229266e-06, + "loss": 0.7385, + "step": 107060 + }, + { + "epoch": 1.7186471692964576, + "grad_norm": 0.8580160737037659, + "learning_rate": 2.4027532039605145e-06, + "loss": 0.6898, + "step": 107070 + }, + { + "epoch": 1.7188076855166214, + "grad_norm": 1.0387418270111084, + "learning_rate": 2.400057089744029e-06, + "loss": 0.6102, + "step": 107080 + }, + { + "epoch": 1.7189682017367856, + "grad_norm": 1.5510481595993042, + "learning_rate": 2.3973624127512637e-06, + "loss": 0.7473, + "step": 107090 + }, + { + "epoch": 1.7191287179569494, + "grad_norm": 1.8349902629852295, + "learning_rate": 2.3946691731535853e-06, + "loss": 0.6572, + "step": 107100 + }, + { + "epoch": 1.7192892341771135, + "grad_norm": 1.0749118328094482, + "learning_rate": 2.3919773711222644e-06, + "loss": 0.6425, + "step": 107110 + }, + { + "epoch": 1.7194497503972777, + "grad_norm": 1.4322324991226196, + "learning_rate": 2.3892870068284946e-06, + "loss": 0.7554, + "step": 107120 + }, + { + "epoch": 1.7196102666174418, + "grad_norm": 1.7059340476989746, + "learning_rate": 2.3865980804433556e-06, + "loss": 0.664, + "step": 107130 + }, + { + "epoch": 1.7197707828376059, + "grad_norm": 2.4694905281066895, + "learning_rate": 2.38391059213785e-06, + "loss": 0.7689, + "step": 107140 + }, + { + "epoch": 1.7199312990577698, + "grad_norm": 0.8062805533409119, + "learning_rate": 2.381224542082888e-06, + "loss": 0.6983, + "step": 107150 + }, + { + "epoch": 1.7200918152779339, + "grad_norm": 1.014744520187378, + "learning_rate": 2.378539930449289e-06, + "loss": 0.6539, + "step": 107160 + }, + { + "epoch": 1.7202523314980978, + "grad_norm": 1.0272878408432007, + "learning_rate": 2.375856757407774e-06, + "loss": 0.7743, + "step": 107170 + }, + { + "epoch": 1.7204128477182619, + "grad_norm": 1.1135367155075073, + "learning_rate": 2.3731750231289816e-06, + "loss": 0.7207, + "step": 107180 + }, + { + "epoch": 1.720573363938426, + "grad_norm": 1.0732228755950928, + "learning_rate": 2.370494727783451e-06, + "loss": 0.7185, + "step": 107190 + }, + { + "epoch": 1.72073388015859, + "grad_norm": 1.1208298206329346, + "learning_rate": 2.3678158715416315e-06, + "loss": 0.6398, + "step": 107200 + }, + { + "epoch": 1.7208943963787542, + "grad_norm": 1.0646520853042603, + "learning_rate": 2.3651384545738843e-06, + "loss": 0.7269, + "step": 107210 + }, + { + "epoch": 1.721054912598918, + "grad_norm": 0.6333352327346802, + "learning_rate": 2.362462477050481e-06, + "loss": 0.7582, + "step": 107220 + }, + { + "epoch": 1.7212154288190822, + "grad_norm": 1.1758897304534912, + "learning_rate": 2.3597879391415915e-06, + "loss": 0.7015, + "step": 107230 + }, + { + "epoch": 1.721375945039246, + "grad_norm": 0.9956314563751221, + "learning_rate": 2.357114841017313e-06, + "loss": 0.7225, + "step": 107240 + }, + { + "epoch": 1.7215364612594102, + "grad_norm": 0.9138615727424622, + "learning_rate": 2.3544431828476238e-06, + "loss": 0.6412, + "step": 107250 + }, + { + "epoch": 1.7216969774795743, + "grad_norm": 1.4856178760528564, + "learning_rate": 2.351772964802432e-06, + "loss": 0.697, + "step": 107260 + }, + { + "epoch": 1.7218574936997384, + "grad_norm": 1.111386775970459, + "learning_rate": 2.3491041870515484e-06, + "loss": 0.7622, + "step": 107270 + }, + { + "epoch": 1.7220180099199025, + "grad_norm": 1.248085618019104, + "learning_rate": 2.346436849764694e-06, + "loss": 0.7949, + "step": 107280 + }, + { + "epoch": 1.7221785261400666, + "grad_norm": 1.157657504081726, + "learning_rate": 2.3437709531114906e-06, + "loss": 0.7975, + "step": 107290 + }, + { + "epoch": 1.7223390423602305, + "grad_norm": 1.3773448467254639, + "learning_rate": 2.341106497261475e-06, + "loss": 0.6245, + "step": 107300 + }, + { + "epoch": 1.7224995585803944, + "grad_norm": 0.7386329770088196, + "learning_rate": 2.338443482384098e-06, + "loss": 0.599, + "step": 107310 + }, + { + "epoch": 1.7226600748005585, + "grad_norm": 0.8308225870132446, + "learning_rate": 2.3357819086486987e-06, + "loss": 0.761, + "step": 107320 + }, + { + "epoch": 1.7228205910207226, + "grad_norm": 0.9085363149642944, + "learning_rate": 2.3331217762245475e-06, + "loss": 0.8348, + "step": 107330 + }, + { + "epoch": 1.7229811072408867, + "grad_norm": 1.1462721824645996, + "learning_rate": 2.3304630852808094e-06, + "loss": 0.7942, + "step": 107340 + }, + { + "epoch": 1.7231416234610508, + "grad_norm": 1.4759037494659424, + "learning_rate": 2.327805835986563e-06, + "loss": 0.744, + "step": 107350 + }, + { + "epoch": 1.723302139681215, + "grad_norm": 0.7970119714736938, + "learning_rate": 2.325150028510792e-06, + "loss": 0.6814, + "step": 107360 + }, + { + "epoch": 1.7234626559013788, + "grad_norm": 1.6624056100845337, + "learning_rate": 2.322495663022395e-06, + "loss": 0.7974, + "step": 107370 + }, + { + "epoch": 1.723623172121543, + "grad_norm": 1.0660353899002075, + "learning_rate": 2.3198427396901675e-06, + "loss": 0.6746, + "step": 107380 + }, + { + "epoch": 1.7237836883417068, + "grad_norm": 1.548519253730774, + "learning_rate": 2.3171912586828245e-06, + "loss": 0.6291, + "step": 107390 + }, + { + "epoch": 1.723944204561871, + "grad_norm": 1.1539573669433594, + "learning_rate": 2.314541220168978e-06, + "loss": 0.718, + "step": 107400 + }, + { + "epoch": 1.724104720782035, + "grad_norm": 0.9821314215660095, + "learning_rate": 2.3118926243171636e-06, + "loss": 0.6162, + "step": 107410 + }, + { + "epoch": 1.7242652370021991, + "grad_norm": 0.9319698214530945, + "learning_rate": 2.3092454712958124e-06, + "loss": 0.7823, + "step": 107420 + }, + { + "epoch": 1.7244257532223632, + "grad_norm": 0.975992739200592, + "learning_rate": 2.3065997612732706e-06, + "loss": 0.7378, + "step": 107430 + }, + { + "epoch": 1.7245862694425271, + "grad_norm": 1.3729342222213745, + "learning_rate": 2.3039554944177838e-06, + "loss": 0.6924, + "step": 107440 + }, + { + "epoch": 1.7247467856626912, + "grad_norm": 0.6655306220054626, + "learning_rate": 2.301312670897515e-06, + "loss": 0.8251, + "step": 107450 + }, + { + "epoch": 1.7249073018828551, + "grad_norm": 0.7643269300460815, + "learning_rate": 2.298671290880533e-06, + "loss": 0.6969, + "step": 107460 + }, + { + "epoch": 1.7250678181030192, + "grad_norm": 2.5663955211639404, + "learning_rate": 2.2960313545348138e-06, + "loss": 0.6452, + "step": 107470 + }, + { + "epoch": 1.7252283343231833, + "grad_norm": 1.128574252128601, + "learning_rate": 2.29339286202824e-06, + "loss": 0.6516, + "step": 107480 + }, + { + "epoch": 1.7253888505433475, + "grad_norm": 1.1915535926818848, + "learning_rate": 2.2907558135286086e-06, + "loss": 0.7697, + "step": 107490 + }, + { + "epoch": 1.7255493667635116, + "grad_norm": 0.9546256065368652, + "learning_rate": 2.28812020920362e-06, + "loss": 0.6969, + "step": 107500 + }, + { + "epoch": 1.7257098829836754, + "grad_norm": 0.6546581387519836, + "learning_rate": 2.285486049220878e-06, + "loss": 0.6241, + "step": 107510 + }, + { + "epoch": 1.7258703992038396, + "grad_norm": 0.8753691911697388, + "learning_rate": 2.282853333747903e-06, + "loss": 0.7491, + "step": 107520 + }, + { + "epoch": 1.7260309154240034, + "grad_norm": 1.4050437211990356, + "learning_rate": 2.280222062952117e-06, + "loss": 0.7151, + "step": 107530 + }, + { + "epoch": 1.7261914316441676, + "grad_norm": 1.0456633567810059, + "learning_rate": 2.2775922370008586e-06, + "loss": 0.6461, + "step": 107540 + }, + { + "epoch": 1.7263519478643317, + "grad_norm": 1.799246072769165, + "learning_rate": 2.274963856061366e-06, + "loss": 0.728, + "step": 107550 + }, + { + "epoch": 1.7265124640844958, + "grad_norm": 0.9543387293815613, + "learning_rate": 2.272336920300791e-06, + "loss": 0.7649, + "step": 107560 + }, + { + "epoch": 1.7266729803046599, + "grad_norm": 0.6573004126548767, + "learning_rate": 2.26971142988619e-06, + "loss": 0.7891, + "step": 107570 + }, + { + "epoch": 1.726833496524824, + "grad_norm": 1.3486506938934326, + "learning_rate": 2.2670873849845257e-06, + "loss": 0.6563, + "step": 107580 + }, + { + "epoch": 1.7269940127449879, + "grad_norm": 2.131782054901123, + "learning_rate": 2.264464785762679e-06, + "loss": 0.7631, + "step": 107590 + }, + { + "epoch": 1.727154528965152, + "grad_norm": 0.8029800653457642, + "learning_rate": 2.2618436323874252e-06, + "loss": 0.5977, + "step": 107600 + }, + { + "epoch": 1.7273150451853159, + "grad_norm": 0.7117334604263306, + "learning_rate": 2.2592239250254637e-06, + "loss": 0.7167, + "step": 107610 + }, + { + "epoch": 1.72747556140548, + "grad_norm": 1.3803801536560059, + "learning_rate": 2.256605663843378e-06, + "loss": 0.7713, + "step": 107620 + }, + { + "epoch": 1.727636077625644, + "grad_norm": 1.142201542854309, + "learning_rate": 2.2539888490076847e-06, + "loss": 0.6276, + "step": 107630 + }, + { + "epoch": 1.7277965938458082, + "grad_norm": 1.1521316766738892, + "learning_rate": 2.2513734806847957e-06, + "loss": 0.7024, + "step": 107640 + }, + { + "epoch": 1.7279571100659723, + "grad_norm": 1.0749139785766602, + "learning_rate": 2.2487595590410306e-06, + "loss": 0.7476, + "step": 107650 + }, + { + "epoch": 1.7281176262861362, + "grad_norm": 1.0332587957382202, + "learning_rate": 2.2461470842426196e-06, + "loss": 0.8092, + "step": 107660 + }, + { + "epoch": 1.7282781425063003, + "grad_norm": 1.1755651235580444, + "learning_rate": 2.2435360564557055e-06, + "loss": 0.7002, + "step": 107670 + }, + { + "epoch": 1.7284386587264642, + "grad_norm": 1.3535139560699463, + "learning_rate": 2.2409264758463363e-06, + "loss": 0.8712, + "step": 107680 + }, + { + "epoch": 1.7285991749466283, + "grad_norm": 1.3937257528305054, + "learning_rate": 2.238318342580453e-06, + "loss": 0.725, + "step": 107690 + }, + { + "epoch": 1.7287596911667924, + "grad_norm": 1.091539978981018, + "learning_rate": 2.23571165682393e-06, + "loss": 0.718, + "step": 107700 + }, + { + "epoch": 1.7289202073869565, + "grad_norm": 0.7995449304580688, + "learning_rate": 2.233106418742528e-06, + "loss": 0.6609, + "step": 107710 + }, + { + "epoch": 1.7290807236071206, + "grad_norm": 0.9877787828445435, + "learning_rate": 2.2305026285019347e-06, + "loss": 0.7389, + "step": 107720 + }, + { + "epoch": 1.7292412398272845, + "grad_norm": 1.0896902084350586, + "learning_rate": 2.2279002862677283e-06, + "loss": 0.7466, + "step": 107730 + }, + { + "epoch": 1.7294017560474486, + "grad_norm": 0.7262330055236816, + "learning_rate": 2.2252993922054097e-06, + "loss": 0.7403, + "step": 107740 + }, + { + "epoch": 1.7295622722676125, + "grad_norm": 1.6945639848709106, + "learning_rate": 2.2226999464803693e-06, + "loss": 0.6728, + "step": 107750 + }, + { + "epoch": 1.7297227884877766, + "grad_norm": 0.8933656215667725, + "learning_rate": 2.2201019492579274e-06, + "loss": 0.6143, + "step": 107760 + }, + { + "epoch": 1.7298833047079407, + "grad_norm": 1.3654866218566895, + "learning_rate": 2.217505400703293e-06, + "loss": 0.818, + "step": 107770 + }, + { + "epoch": 1.7300438209281048, + "grad_norm": 1.2226324081420898, + "learning_rate": 2.2149103009815984e-06, + "loss": 0.7099, + "step": 107780 + }, + { + "epoch": 1.730204337148269, + "grad_norm": 1.2680585384368896, + "learning_rate": 2.212316650257873e-06, + "loss": 0.7309, + "step": 107790 + }, + { + "epoch": 1.730364853368433, + "grad_norm": 0.9034165740013123, + "learning_rate": 2.2097244486970564e-06, + "loss": 0.5436, + "step": 107800 + }, + { + "epoch": 1.730525369588597, + "grad_norm": 1.259808897972107, + "learning_rate": 2.207133696464006e-06, + "loss": 0.8597, + "step": 107810 + }, + { + "epoch": 1.7306858858087608, + "grad_norm": 0.9388735294342041, + "learning_rate": 2.2045443937234676e-06, + "loss": 0.7438, + "step": 107820 + }, + { + "epoch": 1.730846402028925, + "grad_norm": 1.3131312131881714, + "learning_rate": 2.2019565406401097e-06, + "loss": 0.6092, + "step": 107830 + }, + { + "epoch": 1.731006918249089, + "grad_norm": 1.0623984336853027, + "learning_rate": 2.1993701373785035e-06, + "loss": 0.6626, + "step": 107840 + }, + { + "epoch": 1.7311674344692531, + "grad_norm": 1.6586072444915771, + "learning_rate": 2.1967851841031313e-06, + "loss": 0.6817, + "step": 107850 + }, + { + "epoch": 1.7313279506894172, + "grad_norm": 1.1007819175720215, + "learning_rate": 2.1942016809783805e-06, + "loss": 0.7223, + "step": 107860 + }, + { + "epoch": 1.7314884669095814, + "grad_norm": 0.9647361040115356, + "learning_rate": 2.1916196281685506e-06, + "loss": 0.6899, + "step": 107870 + }, + { + "epoch": 1.7316489831297452, + "grad_norm": 1.0033457279205322, + "learning_rate": 2.1890390258378385e-06, + "loss": 0.6723, + "step": 107880 + }, + { + "epoch": 1.7318094993499094, + "grad_norm": 1.0126146078109741, + "learning_rate": 2.186459874150354e-06, + "loss": 0.6278, + "step": 107890 + }, + { + "epoch": 1.7319700155700732, + "grad_norm": 0.9837015271186829, + "learning_rate": 2.1838821732701213e-06, + "loss": 0.6922, + "step": 107900 + }, + { + "epoch": 1.7321305317902373, + "grad_norm": 0.9219611883163452, + "learning_rate": 2.1813059233610653e-06, + "loss": 0.7257, + "step": 107910 + }, + { + "epoch": 1.7322910480104015, + "grad_norm": 1.092848300933838, + "learning_rate": 2.1787311245870216e-06, + "loss": 0.684, + "step": 107920 + }, + { + "epoch": 1.7324515642305656, + "grad_norm": 1.146929383277893, + "learning_rate": 2.1761577771117343e-06, + "loss": 0.7461, + "step": 107930 + }, + { + "epoch": 1.7326120804507297, + "grad_norm": 1.4510302543640137, + "learning_rate": 2.173585881098847e-06, + "loss": 0.6808, + "step": 107940 + }, + { + "epoch": 1.7327725966708936, + "grad_norm": 1.408591866493225, + "learning_rate": 2.1710154367119178e-06, + "loss": 0.7483, + "step": 107950 + }, + { + "epoch": 1.7329331128910577, + "grad_norm": 1.8268483877182007, + "learning_rate": 2.1684464441144136e-06, + "loss": 0.8381, + "step": 107960 + }, + { + "epoch": 1.7330936291112216, + "grad_norm": 1.042656660079956, + "learning_rate": 2.165878903469712e-06, + "loss": 0.633, + "step": 107970 + }, + { + "epoch": 1.7332541453313857, + "grad_norm": 1.0487940311431885, + "learning_rate": 2.1633128149410877e-06, + "loss": 0.7353, + "step": 107980 + }, + { + "epoch": 1.7334146615515498, + "grad_norm": 1.1192190647125244, + "learning_rate": 2.160748178691732e-06, + "loss": 0.716, + "step": 107990 + }, + { + "epoch": 1.7335751777717139, + "grad_norm": 0.812338650226593, + "learning_rate": 2.1581849948847377e-06, + "loss": 0.6943, + "step": 108000 + }, + { + "epoch": 1.7335751777717139, + "eval_loss": 0.7692725658416748, + "eval_runtime": 1834.3904, + "eval_samples_per_second": 14.3, + "eval_steps_per_second": 1.788, + "step": 108000 + }, + { + "epoch": 1.733735693991878, + "grad_norm": 1.4751245975494385, + "learning_rate": 2.1556232636831096e-06, + "loss": 0.7732, + "step": 108010 + }, + { + "epoch": 1.7338962102120419, + "grad_norm": 1.2007296085357666, + "learning_rate": 2.1530629852497596e-06, + "loss": 0.7945, + "step": 108020 + }, + { + "epoch": 1.734056726432206, + "grad_norm": 0.7894991636276245, + "learning_rate": 2.1505041597475044e-06, + "loss": 0.725, + "step": 108030 + }, + { + "epoch": 1.7342172426523699, + "grad_norm": 1.1872199773788452, + "learning_rate": 2.1479467873390726e-06, + "loss": 0.6064, + "step": 108040 + }, + { + "epoch": 1.734377758872534, + "grad_norm": 0.92991042137146, + "learning_rate": 2.1453908681871025e-06, + "loss": 0.7268, + "step": 108050 + }, + { + "epoch": 1.734538275092698, + "grad_norm": 2.8445518016815186, + "learning_rate": 2.1428364024541263e-06, + "loss": 0.6894, + "step": 108060 + }, + { + "epoch": 1.7346987913128622, + "grad_norm": 0.846873939037323, + "learning_rate": 2.1402833903025942e-06, + "loss": 0.7214, + "step": 108070 + }, + { + "epoch": 1.7348593075330263, + "grad_norm": 0.7632349133491516, + "learning_rate": 2.137731831894868e-06, + "loss": 0.6753, + "step": 108080 + }, + { + "epoch": 1.7350198237531904, + "grad_norm": 2.046767473220825, + "learning_rate": 2.135181727393207e-06, + "loss": 0.7148, + "step": 108090 + }, + { + "epoch": 1.7351803399733543, + "grad_norm": 0.8472796678543091, + "learning_rate": 2.1326330769597864e-06, + "loss": 0.7148, + "step": 108100 + }, + { + "epoch": 1.7353408561935182, + "grad_norm": 0.8998971581459045, + "learning_rate": 2.130085880756685e-06, + "loss": 0.7357, + "step": 108110 + }, + { + "epoch": 1.7355013724136823, + "grad_norm": 0.7675761580467224, + "learning_rate": 2.1275401389458936e-06, + "loss": 0.6, + "step": 108120 + }, + { + "epoch": 1.7356618886338464, + "grad_norm": 0.48794853687286377, + "learning_rate": 2.124995851689296e-06, + "loss": 0.7136, + "step": 108130 + }, + { + "epoch": 1.7358224048540105, + "grad_norm": 0.8870771527290344, + "learning_rate": 2.1224530191486987e-06, + "loss": 0.7293, + "step": 108140 + }, + { + "epoch": 1.7359829210741746, + "grad_norm": 1.318953514099121, + "learning_rate": 2.119911641485814e-06, + "loss": 0.7241, + "step": 108150 + }, + { + "epoch": 1.7361434372943387, + "grad_norm": 1.4552632570266724, + "learning_rate": 2.1173717188622545e-06, + "loss": 0.6914, + "step": 108160 + }, + { + "epoch": 1.7363039535145026, + "grad_norm": 1.6371572017669678, + "learning_rate": 2.114833251439549e-06, + "loss": 0.607, + "step": 108170 + }, + { + "epoch": 1.7364644697346667, + "grad_norm": 0.8450198173522949, + "learning_rate": 2.1122962393791244e-06, + "loss": 0.7343, + "step": 108180 + }, + { + "epoch": 1.7366249859548306, + "grad_norm": 1.044085144996643, + "learning_rate": 2.109760682842321e-06, + "loss": 0.6515, + "step": 108190 + }, + { + "epoch": 1.7367855021749947, + "grad_norm": 1.081115961074829, + "learning_rate": 2.1072265819903874e-06, + "loss": 0.845, + "step": 108200 + }, + { + "epoch": 1.7369460183951588, + "grad_norm": 0.911246120929718, + "learning_rate": 2.1046939369844754e-06, + "loss": 0.6945, + "step": 108210 + }, + { + "epoch": 1.737106534615323, + "grad_norm": 0.8848686814308167, + "learning_rate": 2.102162747985648e-06, + "loss": 0.884, + "step": 108220 + }, + { + "epoch": 1.737267050835487, + "grad_norm": 1.1671992540359497, + "learning_rate": 2.0996330151548726e-06, + "loss": 0.5792, + "step": 108230 + }, + { + "epoch": 1.737427567055651, + "grad_norm": 0.9435145854949951, + "learning_rate": 2.097104738653033e-06, + "loss": 0.6163, + "step": 108240 + }, + { + "epoch": 1.737588083275815, + "grad_norm": 2.4554507732391357, + "learning_rate": 2.094577918640897e-06, + "loss": 0.7177, + "step": 108250 + }, + { + "epoch": 1.737748599495979, + "grad_norm": 0.6473426222801208, + "learning_rate": 2.092052555279167e-06, + "loss": 0.6528, + "step": 108260 + }, + { + "epoch": 1.737909115716143, + "grad_norm": 0.8149453401565552, + "learning_rate": 2.0895286487284392e-06, + "loss": 0.6654, + "step": 108270 + }, + { + "epoch": 1.7380696319363071, + "grad_norm": 1.0626251697540283, + "learning_rate": 2.087006199149216e-06, + "loss": 0.6685, + "step": 108280 + }, + { + "epoch": 1.7382301481564713, + "grad_norm": 0.8237874507904053, + "learning_rate": 2.0844852067019138e-06, + "loss": 0.696, + "step": 108290 + }, + { + "epoch": 1.7383906643766354, + "grad_norm": 2.3913092613220215, + "learning_rate": 2.0819656715468566e-06, + "loss": 0.6828, + "step": 108300 + }, + { + "epoch": 1.7385511805967995, + "grad_norm": 1.1913564205169678, + "learning_rate": 2.0794475938442637e-06, + "loss": 0.5991, + "step": 108310 + }, + { + "epoch": 1.7387116968169634, + "grad_norm": 1.817394495010376, + "learning_rate": 2.0769309737542735e-06, + "loss": 0.7124, + "step": 108320 + }, + { + "epoch": 1.7388722130371272, + "grad_norm": 0.9805240631103516, + "learning_rate": 2.074415811436928e-06, + "loss": 0.6722, + "step": 108330 + }, + { + "epoch": 1.7390327292572914, + "grad_norm": 0.9130823612213135, + "learning_rate": 2.071902107052176e-06, + "loss": 0.6807, + "step": 108340 + }, + { + "epoch": 1.7391932454774555, + "grad_norm": 1.1028456687927246, + "learning_rate": 2.0693898607598793e-06, + "loss": 0.6767, + "step": 108350 + }, + { + "epoch": 1.7393537616976196, + "grad_norm": 1.0497663021087646, + "learning_rate": 2.0668790727197963e-06, + "loss": 0.7298, + "step": 108360 + }, + { + "epoch": 1.7395142779177837, + "grad_norm": 1.2643764019012451, + "learning_rate": 2.064369743091604e-06, + "loss": 0.6879, + "step": 108370 + }, + { + "epoch": 1.7396747941379478, + "grad_norm": 0.7281783819198608, + "learning_rate": 2.061861872034876e-06, + "loss": 0.681, + "step": 108380 + }, + { + "epoch": 1.7398353103581117, + "grad_norm": 2.408787250518799, + "learning_rate": 2.0593554597090975e-06, + "loss": 0.7351, + "step": 108390 + }, + { + "epoch": 1.7399958265782758, + "grad_norm": 0.9040789008140564, + "learning_rate": 2.0568505062736636e-06, + "loss": 0.6729, + "step": 108400 + }, + { + "epoch": 1.7401563427984397, + "grad_norm": 1.240679383277893, + "learning_rate": 2.054347011887875e-06, + "loss": 0.6417, + "step": 108410 + }, + { + "epoch": 1.7403168590186038, + "grad_norm": 1.6355574131011963, + "learning_rate": 2.05184497671094e-06, + "loss": 0.6511, + "step": 108420 + }, + { + "epoch": 1.7404773752387679, + "grad_norm": 0.9299646615982056, + "learning_rate": 2.049344400901976e-06, + "loss": 0.7634, + "step": 108430 + }, + { + "epoch": 1.740637891458932, + "grad_norm": 1.4171212911605835, + "learning_rate": 2.0468452846199997e-06, + "loss": 0.7351, + "step": 108440 + }, + { + "epoch": 1.740798407679096, + "grad_norm": 0.8370511531829834, + "learning_rate": 2.0443476280239376e-06, + "loss": 0.5338, + "step": 108450 + }, + { + "epoch": 1.74095892389926, + "grad_norm": 0.9849986433982849, + "learning_rate": 2.0418514312726345e-06, + "loss": 0.7398, + "step": 108460 + }, + { + "epoch": 1.741119440119424, + "grad_norm": 1.3277363777160645, + "learning_rate": 2.0393566945248273e-06, + "loss": 0.6437, + "step": 108470 + }, + { + "epoch": 1.741279956339588, + "grad_norm": 0.9897245764732361, + "learning_rate": 2.0368634179391695e-06, + "loss": 0.6785, + "step": 108480 + }, + { + "epoch": 1.741440472559752, + "grad_norm": 1.1728770732879639, + "learning_rate": 2.034371601674223e-06, + "loss": 0.6274, + "step": 108490 + }, + { + "epoch": 1.7416009887799162, + "grad_norm": 1.4050183296203613, + "learning_rate": 2.0318812458884453e-06, + "loss": 0.7705, + "step": 108500 + }, + { + "epoch": 1.7417615050000803, + "grad_norm": 1.142581582069397, + "learning_rate": 2.0293923507402085e-06, + "loss": 0.7467, + "step": 108510 + }, + { + "epoch": 1.7419220212202444, + "grad_norm": 1.4801418781280518, + "learning_rate": 2.026904916387795e-06, + "loss": 0.9204, + "step": 108520 + }, + { + "epoch": 1.7420825374404083, + "grad_norm": 1.0123482942581177, + "learning_rate": 2.0244189429893895e-06, + "loss": 0.6818, + "step": 108530 + }, + { + "epoch": 1.7422430536605724, + "grad_norm": 1.1996128559112549, + "learning_rate": 2.02193443070309e-06, + "loss": 0.7812, + "step": 108540 + }, + { + "epoch": 1.7424035698807363, + "grad_norm": 0.9477663040161133, + "learning_rate": 2.019451379686893e-06, + "loss": 0.636, + "step": 108550 + }, + { + "epoch": 1.7425640861009004, + "grad_norm": 1.127278208732605, + "learning_rate": 2.016969790098705e-06, + "loss": 0.6113, + "step": 108560 + }, + { + "epoch": 1.7427246023210645, + "grad_norm": 0.8949620127677917, + "learning_rate": 2.0144896620963414e-06, + "loss": 0.6352, + "step": 108570 + }, + { + "epoch": 1.7428851185412286, + "grad_norm": 1.2200998067855835, + "learning_rate": 2.012010995837521e-06, + "loss": 0.8, + "step": 108580 + }, + { + "epoch": 1.7430456347613927, + "grad_norm": 1.5503236055374146, + "learning_rate": 2.0095337914798783e-06, + "loss": 0.8338, + "step": 108590 + }, + { + "epoch": 1.7432061509815568, + "grad_norm": 3.3516950607299805, + "learning_rate": 2.007058049180946e-06, + "loss": 0.6746, + "step": 108600 + }, + { + "epoch": 1.7433666672017207, + "grad_norm": 1.16060209274292, + "learning_rate": 2.0045837690981674e-06, + "loss": 0.7125, + "step": 108610 + }, + { + "epoch": 1.7435271834218846, + "grad_norm": 1.047214388847351, + "learning_rate": 2.0021109513888943e-06, + "loss": 0.6904, + "step": 108620 + }, + { + "epoch": 1.7436876996420487, + "grad_norm": 0.7212704420089722, + "learning_rate": 1.999639596210376e-06, + "loss": 0.7307, + "step": 108630 + }, + { + "epoch": 1.7438482158622128, + "grad_norm": 0.8188061118125916, + "learning_rate": 1.997169703719784e-06, + "loss": 0.7323, + "step": 108640 + }, + { + "epoch": 1.744008732082377, + "grad_norm": 1.3952628374099731, + "learning_rate": 1.9947012740741843e-06, + "loss": 0.7839, + "step": 108650 + }, + { + "epoch": 1.744169248302541, + "grad_norm": 1.278896689414978, + "learning_rate": 1.9922343074305543e-06, + "loss": 0.6531, + "step": 108660 + }, + { + "epoch": 1.7443297645227052, + "grad_norm": 1.4791476726531982, + "learning_rate": 1.989768803945782e-06, + "loss": 0.7535, + "step": 108670 + }, + { + "epoch": 1.744490280742869, + "grad_norm": 1.0770834684371948, + "learning_rate": 1.9873047637766623e-06, + "loss": 0.6702, + "step": 108680 + }, + { + "epoch": 1.7446507969630332, + "grad_norm": 0.8466160297393799, + "learning_rate": 1.9848421870798827e-06, + "loss": 0.651, + "step": 108690 + }, + { + "epoch": 1.744811313183197, + "grad_norm": 0.9116744995117188, + "learning_rate": 1.9823810740120575e-06, + "loss": 0.6916, + "step": 108700 + }, + { + "epoch": 1.7449718294033612, + "grad_norm": 1.5113646984100342, + "learning_rate": 1.9799214247296944e-06, + "loss": 0.7391, + "step": 108710 + }, + { + "epoch": 1.7451323456235253, + "grad_norm": 1.331621527671814, + "learning_rate": 1.9774632393892157e-06, + "loss": 0.7398, + "step": 108720 + }, + { + "epoch": 1.7452928618436894, + "grad_norm": 1.3487427234649658, + "learning_rate": 1.975006518146946e-06, + "loss": 0.8799, + "step": 108730 + }, + { + "epoch": 1.7454533780638535, + "grad_norm": 1.3185064792633057, + "learning_rate": 1.972551261159117e-06, + "loss": 0.7906, + "step": 108740 + }, + { + "epoch": 1.7456138942840174, + "grad_norm": 1.531561017036438, + "learning_rate": 1.9700974685818775e-06, + "loss": 0.6368, + "step": 108750 + }, + { + "epoch": 1.7457744105041815, + "grad_norm": 0.8419125080108643, + "learning_rate": 1.9676451405712613e-06, + "loss": 0.6212, + "step": 108760 + }, + { + "epoch": 1.7459349267243454, + "grad_norm": 1.0375685691833496, + "learning_rate": 1.9651942772832276e-06, + "loss": 0.6675, + "step": 108770 + }, + { + "epoch": 1.7460954429445095, + "grad_norm": 1.0234583616256714, + "learning_rate": 1.9627448788736376e-06, + "loss": 0.6629, + "step": 108780 + }, + { + "epoch": 1.7462559591646736, + "grad_norm": 0.8823028802871704, + "learning_rate": 1.9602969454982577e-06, + "loss": 0.6602, + "step": 108790 + }, + { + "epoch": 1.7464164753848377, + "grad_norm": 1.0167672634124756, + "learning_rate": 1.957850477312764e-06, + "loss": 0.6804, + "step": 108800 + }, + { + "epoch": 1.7465769916050018, + "grad_norm": 1.327968955039978, + "learning_rate": 1.955405474472738e-06, + "loss": 0.6835, + "step": 108810 + }, + { + "epoch": 1.7467375078251657, + "grad_norm": 1.043700933456421, + "learning_rate": 1.9529619371336627e-06, + "loss": 0.597, + "step": 108820 + }, + { + "epoch": 1.7468980240453298, + "grad_norm": 1.04363214969635, + "learning_rate": 1.9505198654509364e-06, + "loss": 0.7302, + "step": 108830 + }, + { + "epoch": 1.7470585402654937, + "grad_norm": 0.5976032018661499, + "learning_rate": 1.9480792595798604e-06, + "loss": 0.7408, + "step": 108840 + }, + { + "epoch": 1.7472190564856578, + "grad_norm": 1.888870358467102, + "learning_rate": 1.945640119675643e-06, + "loss": 0.6466, + "step": 108850 + }, + { + "epoch": 1.747379572705822, + "grad_norm": 0.9266118407249451, + "learning_rate": 1.9432024458934022e-06, + "loss": 0.8236, + "step": 108860 + }, + { + "epoch": 1.747540088925986, + "grad_norm": 1.3357903957366943, + "learning_rate": 1.940766238388153e-06, + "loss": 0.7723, + "step": 108870 + }, + { + "epoch": 1.74770060514615, + "grad_norm": 0.9293719530105591, + "learning_rate": 1.9383314973148243e-06, + "loss": 0.6876, + "step": 108880 + }, + { + "epoch": 1.7478611213663142, + "grad_norm": 0.7886795997619629, + "learning_rate": 1.9358982228282556e-06, + "loss": 0.6636, + "step": 108890 + }, + { + "epoch": 1.748021637586478, + "grad_norm": 0.8070201277732849, + "learning_rate": 1.9334664150831876e-06, + "loss": 0.6642, + "step": 108900 + }, + { + "epoch": 1.7481821538066422, + "grad_norm": 1.284803032875061, + "learning_rate": 1.9310360742342686e-06, + "loss": 0.8577, + "step": 108910 + }, + { + "epoch": 1.748342670026806, + "grad_norm": 1.0780497789382935, + "learning_rate": 1.9286072004360554e-06, + "loss": 0.6421, + "step": 108920 + }, + { + "epoch": 1.7485031862469702, + "grad_norm": 1.067753791809082, + "learning_rate": 1.926179793843014e-06, + "loss": 0.7303, + "step": 108930 + }, + { + "epoch": 1.7486637024671343, + "grad_norm": 1.6384843587875366, + "learning_rate": 1.9237538546095e-06, + "loss": 0.6475, + "step": 108940 + }, + { + "epoch": 1.7488242186872984, + "grad_norm": 1.2835144996643066, + "learning_rate": 1.9213293828898e-06, + "loss": 0.6785, + "step": 108950 + }, + { + "epoch": 1.7489847349074625, + "grad_norm": 0.8466501235961914, + "learning_rate": 1.9189063788380925e-06, + "loss": 0.7089, + "step": 108960 + }, + { + "epoch": 1.7491452511276264, + "grad_norm": 1.2665727138519287, + "learning_rate": 1.916484842608468e-06, + "loss": 0.6995, + "step": 108970 + }, + { + "epoch": 1.7493057673477905, + "grad_norm": 1.6020736694335938, + "learning_rate": 1.9140647743549177e-06, + "loss": 0.6988, + "step": 108980 + }, + { + "epoch": 1.7494662835679544, + "grad_norm": 1.5264266729354858, + "learning_rate": 1.911646174231355e-06, + "loss": 0.6694, + "step": 108990 + }, + { + "epoch": 1.7496267997881185, + "grad_norm": 0.813651978969574, + "learning_rate": 1.909229042391575e-06, + "loss": 0.8316, + "step": 109000 + }, + { + "epoch": 1.7497873160082826, + "grad_norm": 1.5013858079910278, + "learning_rate": 1.9068133789892971e-06, + "loss": 0.8125, + "step": 109010 + }, + { + "epoch": 1.7499478322284467, + "grad_norm": 1.328124761581421, + "learning_rate": 1.904399184178149e-06, + "loss": 0.7998, + "step": 109020 + }, + { + "epoch": 1.7501083484486109, + "grad_norm": 1.4619064331054688, + "learning_rate": 1.9019864581116514e-06, + "loss": 0.7419, + "step": 109030 + }, + { + "epoch": 1.7502688646687747, + "grad_norm": 1.6112982034683228, + "learning_rate": 1.899575200943246e-06, + "loss": 0.7398, + "step": 109040 + }, + { + "epoch": 1.7504293808889388, + "grad_norm": 1.187006950378418, + "learning_rate": 1.8971654128262767e-06, + "loss": 0.6667, + "step": 109050 + }, + { + "epoch": 1.7505898971091027, + "grad_norm": 1.1515125036239624, + "learning_rate": 1.894757093913982e-06, + "loss": 0.6759, + "step": 109060 + }, + { + "epoch": 1.7507504133292668, + "grad_norm": 0.9163339734077454, + "learning_rate": 1.8923502443595225e-06, + "loss": 0.7548, + "step": 109070 + }, + { + "epoch": 1.750910929549431, + "grad_norm": 0.750000536441803, + "learning_rate": 1.8899448643159566e-06, + "loss": 0.7064, + "step": 109080 + }, + { + "epoch": 1.751071445769595, + "grad_norm": 2.231701374053955, + "learning_rate": 1.8875409539362593e-06, + "loss": 0.7267, + "step": 109090 + }, + { + "epoch": 1.7512319619897592, + "grad_norm": 1.461351752281189, + "learning_rate": 1.8851385133732967e-06, + "loss": 0.7517, + "step": 109100 + }, + { + "epoch": 1.7513924782099233, + "grad_norm": 2.0161075592041016, + "learning_rate": 1.8827375427798611e-06, + "loss": 0.7484, + "step": 109110 + }, + { + "epoch": 1.7515529944300872, + "grad_norm": 0.8975949287414551, + "learning_rate": 1.8803380423086297e-06, + "loss": 0.7426, + "step": 109120 + }, + { + "epoch": 1.751713510650251, + "grad_norm": 1.1198786497116089, + "learning_rate": 1.8779400121122005e-06, + "loss": 0.7485, + "step": 109130 + }, + { + "epoch": 1.7518740268704152, + "grad_norm": 1.0387654304504395, + "learning_rate": 1.8755434523430703e-06, + "loss": 0.6028, + "step": 109140 + }, + { + "epoch": 1.7520345430905793, + "grad_norm": 1.077675223350525, + "learning_rate": 1.873148363153654e-06, + "loss": 0.5603, + "step": 109150 + }, + { + "epoch": 1.7521950593107434, + "grad_norm": 1.462687373161316, + "learning_rate": 1.870754744696257e-06, + "loss": 0.7101, + "step": 109160 + }, + { + "epoch": 1.7523555755309075, + "grad_norm": 1.0092108249664307, + "learning_rate": 1.868362597123105e-06, + "loss": 0.6342, + "step": 109170 + }, + { + "epoch": 1.7525160917510716, + "grad_norm": 0.8390539884567261, + "learning_rate": 1.8659719205863291e-06, + "loss": 0.6851, + "step": 109180 + }, + { + "epoch": 1.7526766079712355, + "grad_norm": 1.2798349857330322, + "learning_rate": 1.8635827152379493e-06, + "loss": 0.6982, + "step": 109190 + }, + { + "epoch": 1.7528371241913996, + "grad_norm": 1.6108051538467407, + "learning_rate": 1.8611949812299135e-06, + "loss": 0.6973, + "step": 109200 + }, + { + "epoch": 1.7529976404115635, + "grad_norm": 1.0742954015731812, + "learning_rate": 1.858808718714064e-06, + "loss": 0.7524, + "step": 109210 + }, + { + "epoch": 1.7531581566317276, + "grad_norm": 1.35643470287323, + "learning_rate": 1.8564239278421547e-06, + "loss": 0.7686, + "step": 109220 + }, + { + "epoch": 1.7533186728518917, + "grad_norm": 0.6917544603347778, + "learning_rate": 1.8540406087658475e-06, + "loss": 0.6659, + "step": 109230 + }, + { + "epoch": 1.7534791890720558, + "grad_norm": 0.8550674915313721, + "learning_rate": 1.851658761636707e-06, + "loss": 0.6675, + "step": 109240 + }, + { + "epoch": 1.75363970529222, + "grad_norm": 0.7478025555610657, + "learning_rate": 1.8492783866061986e-06, + "loss": 0.724, + "step": 109250 + }, + { + "epoch": 1.7538002215123838, + "grad_norm": 1.0778498649597168, + "learning_rate": 1.8468994838257036e-06, + "loss": 0.7441, + "step": 109260 + }, + { + "epoch": 1.753960737732548, + "grad_norm": 1.2566908597946167, + "learning_rate": 1.844522053446504e-06, + "loss": 0.632, + "step": 109270 + }, + { + "epoch": 1.7541212539527118, + "grad_norm": 1.7333966493606567, + "learning_rate": 1.8421460956197955e-06, + "loss": 0.5901, + "step": 109280 + }, + { + "epoch": 1.754281770172876, + "grad_norm": 1.6230868101119995, + "learning_rate": 1.839771610496674e-06, + "loss": 0.6793, + "step": 109290 + }, + { + "epoch": 1.75444228639304, + "grad_norm": 0.9590969085693359, + "learning_rate": 1.8373985982281432e-06, + "loss": 0.8321, + "step": 109300 + }, + { + "epoch": 1.7546028026132041, + "grad_norm": 1.0236233472824097, + "learning_rate": 1.835027058965108e-06, + "loss": 0.7409, + "step": 109310 + }, + { + "epoch": 1.7547633188333682, + "grad_norm": 1.367497444152832, + "learning_rate": 1.8326569928583865e-06, + "loss": 0.6956, + "step": 109320 + }, + { + "epoch": 1.754923835053532, + "grad_norm": 1.388262152671814, + "learning_rate": 1.8302884000587023e-06, + "loss": 0.7128, + "step": 109330 + }, + { + "epoch": 1.7550843512736962, + "grad_norm": 1.5743850469589233, + "learning_rate": 1.8279212807166824e-06, + "loss": 0.6668, + "step": 109340 + }, + { + "epoch": 1.75524486749386, + "grad_norm": 1.2251794338226318, + "learning_rate": 1.8255556349828646e-06, + "loss": 0.6699, + "step": 109350 + }, + { + "epoch": 1.7554053837140242, + "grad_norm": 0.872451901435852, + "learning_rate": 1.8231914630076868e-06, + "loss": 0.5489, + "step": 109360 + }, + { + "epoch": 1.7555658999341883, + "grad_norm": 1.236918330192566, + "learning_rate": 1.8208287649415012e-06, + "loss": 0.656, + "step": 109370 + }, + { + "epoch": 1.7557264161543524, + "grad_norm": 1.1126619577407837, + "learning_rate": 1.8184675409345541e-06, + "loss": 0.708, + "step": 109380 + }, + { + "epoch": 1.7558869323745165, + "grad_norm": 0.8876074552536011, + "learning_rate": 1.8161077911370117e-06, + "loss": 0.7608, + "step": 109390 + }, + { + "epoch": 1.7560474485946806, + "grad_norm": 1.1358940601348877, + "learning_rate": 1.8137495156989371e-06, + "loss": 0.7644, + "step": 109400 + }, + { + "epoch": 1.7562079648148445, + "grad_norm": 1.5242472887039185, + "learning_rate": 1.8113927147703046e-06, + "loss": 0.6438, + "step": 109410 + }, + { + "epoch": 1.7563684810350084, + "grad_norm": 1.311110019683838, + "learning_rate": 1.809037388500992e-06, + "loss": 0.8219, + "step": 109420 + }, + { + "epoch": 1.7565289972551725, + "grad_norm": 0.9095600843429565, + "learning_rate": 1.806683537040782e-06, + "loss": 0.7566, + "step": 109430 + }, + { + "epoch": 1.7566895134753366, + "grad_norm": 0.8841552138328552, + "learning_rate": 1.8043311605393715e-06, + "loss": 0.7135, + "step": 109440 + }, + { + "epoch": 1.7568500296955007, + "grad_norm": 1.146503210067749, + "learning_rate": 1.801980259146352e-06, + "loss": 0.6836, + "step": 109450 + }, + { + "epoch": 1.7570105459156649, + "grad_norm": 0.7458778619766235, + "learning_rate": 1.7996308330112287e-06, + "loss": 0.7043, + "step": 109460 + }, + { + "epoch": 1.757171062135829, + "grad_norm": 0.7422993183135986, + "learning_rate": 1.7972828822834158e-06, + "loss": 0.7179, + "step": 109470 + }, + { + "epoch": 1.7573315783559929, + "grad_norm": 0.9017037153244019, + "learning_rate": 1.7949364071122215e-06, + "loss": 0.6823, + "step": 109480 + }, + { + "epoch": 1.757492094576157, + "grad_norm": 1.0317039489746094, + "learning_rate": 1.7925914076468791e-06, + "loss": 0.6868, + "step": 109490 + }, + { + "epoch": 1.7576526107963208, + "grad_norm": 0.8065909147262573, + "learning_rate": 1.7902478840365028e-06, + "loss": 0.7867, + "step": 109500 + }, + { + "epoch": 1.757813127016485, + "grad_norm": 1.2590199708938599, + "learning_rate": 1.7879058364301344e-06, + "loss": 0.7871, + "step": 109510 + }, + { + "epoch": 1.757973643236649, + "grad_norm": 1.1093286275863647, + "learning_rate": 1.7855652649767157e-06, + "loss": 0.7511, + "step": 109520 + }, + { + "epoch": 1.7581341594568132, + "grad_norm": 1.7245713472366333, + "learning_rate": 1.783226169825089e-06, + "loss": 0.7708, + "step": 109530 + }, + { + "epoch": 1.7582946756769773, + "grad_norm": 1.4266351461410522, + "learning_rate": 1.7808885511240103e-06, + "loss": 0.7359, + "step": 109540 + }, + { + "epoch": 1.7584551918971412, + "grad_norm": 1.5629258155822754, + "learning_rate": 1.778552409022141e-06, + "loss": 0.906, + "step": 109550 + }, + { + "epoch": 1.7586157081173053, + "grad_norm": 1.3170117139816284, + "learning_rate": 1.7762177436680377e-06, + "loss": 0.6692, + "step": 109560 + }, + { + "epoch": 1.7587762243374692, + "grad_norm": 0.7575194239616394, + "learning_rate": 1.7738845552101786e-06, + "loss": 0.8622, + "step": 109570 + }, + { + "epoch": 1.7589367405576333, + "grad_norm": 1.038957118988037, + "learning_rate": 1.7715528437969365e-06, + "loss": 0.6808, + "step": 109580 + }, + { + "epoch": 1.7590972567777974, + "grad_norm": 1.960207462310791, + "learning_rate": 1.7692226095765958e-06, + "loss": 0.6111, + "step": 109590 + }, + { + "epoch": 1.7592577729979615, + "grad_norm": 0.7603963613510132, + "learning_rate": 1.766893852697346e-06, + "loss": 0.5934, + "step": 109600 + }, + { + "epoch": 1.7594182892181256, + "grad_norm": 0.9084694981575012, + "learning_rate": 1.7645665733072885e-06, + "loss": 0.8144, + "step": 109610 + }, + { + "epoch": 1.7595788054382895, + "grad_norm": 1.7144395112991333, + "learning_rate": 1.7622407715544132e-06, + "loss": 0.7924, + "step": 109620 + }, + { + "epoch": 1.7597393216584536, + "grad_norm": 0.752878725528717, + "learning_rate": 1.7599164475866348e-06, + "loss": 0.5971, + "step": 109630 + }, + { + "epoch": 1.7598998378786175, + "grad_norm": 0.8267036080360413, + "learning_rate": 1.757593601551763e-06, + "loss": 0.7025, + "step": 109640 + }, + { + "epoch": 1.7600603540987816, + "grad_norm": 1.9707118272781372, + "learning_rate": 1.7552722335975213e-06, + "loss": 0.7303, + "step": 109650 + }, + { + "epoch": 1.7602208703189457, + "grad_norm": 1.0089399814605713, + "learning_rate": 1.7529523438715306e-06, + "loss": 0.771, + "step": 109660 + }, + { + "epoch": 1.7603813865391098, + "grad_norm": 1.3322983980178833, + "learning_rate": 1.750633932521331e-06, + "loss": 0.6952, + "step": 109670 + }, + { + "epoch": 1.760541902759274, + "grad_norm": 1.127624273300171, + "learning_rate": 1.7483169996943466e-06, + "loss": 0.5835, + "step": 109680 + }, + { + "epoch": 1.760702418979438, + "grad_norm": 0.7915677428245544, + "learning_rate": 1.7460015455379287e-06, + "loss": 0.6816, + "step": 109690 + }, + { + "epoch": 1.760862935199602, + "grad_norm": 0.8999337553977966, + "learning_rate": 1.743687570199326e-06, + "loss": 0.8832, + "step": 109700 + }, + { + "epoch": 1.761023451419766, + "grad_norm": 0.663137674331665, + "learning_rate": 1.7413750738256907e-06, + "loss": 0.627, + "step": 109710 + }, + { + "epoch": 1.76118396763993, + "grad_norm": 0.9974910020828247, + "learning_rate": 1.739064056564088e-06, + "loss": 0.6948, + "step": 109720 + }, + { + "epoch": 1.761344483860094, + "grad_norm": 1.0844378471374512, + "learning_rate": 1.7367545185614842e-06, + "loss": 0.6798, + "step": 109730 + }, + { + "epoch": 1.7615050000802581, + "grad_norm": 1.1090716123580933, + "learning_rate": 1.7344464599647559e-06, + "loss": 0.7562, + "step": 109740 + }, + { + "epoch": 1.7616655163004222, + "grad_norm": 0.9068195819854736, + "learning_rate": 1.7321398809206718e-06, + "loss": 0.653, + "step": 109750 + }, + { + "epoch": 1.7618260325205863, + "grad_norm": 1.068818211555481, + "learning_rate": 1.7298347815759259e-06, + "loss": 0.7825, + "step": 109760 + }, + { + "epoch": 1.7619865487407502, + "grad_norm": 1.1966149806976318, + "learning_rate": 1.7275311620771034e-06, + "loss": 0.7161, + "step": 109770 + }, + { + "epoch": 1.7621470649609143, + "grad_norm": 0.8173187971115112, + "learning_rate": 1.7252290225707042e-06, + "loss": 0.7127, + "step": 109780 + }, + { + "epoch": 1.7623075811810782, + "grad_norm": 1.241186261177063, + "learning_rate": 1.72292836320313e-06, + "loss": 0.7589, + "step": 109790 + }, + { + "epoch": 1.7624680974012423, + "grad_norm": 1.471279501914978, + "learning_rate": 1.7206291841206923e-06, + "loss": 0.7111, + "step": 109800 + }, + { + "epoch": 1.7626286136214064, + "grad_norm": 0.5959886908531189, + "learning_rate": 1.7183314854696015e-06, + "loss": 0.6393, + "step": 109810 + }, + { + "epoch": 1.7627891298415705, + "grad_norm": 1.4067096710205078, + "learning_rate": 1.7160352673959768e-06, + "loss": 0.7275, + "step": 109820 + }, + { + "epoch": 1.7629496460617347, + "grad_norm": 1.4469263553619385, + "learning_rate": 1.713740530045846e-06, + "loss": 0.7152, + "step": 109830 + }, + { + "epoch": 1.7631101622818985, + "grad_norm": 1.1093748807907104, + "learning_rate": 1.7114472735651394e-06, + "loss": 0.756, + "step": 109840 + }, + { + "epoch": 1.7632706785020626, + "grad_norm": 0.5182979106903076, + "learning_rate": 1.7091554980996988e-06, + "loss": 0.5637, + "step": 109850 + }, + { + "epoch": 1.7634311947222265, + "grad_norm": 1.0988868474960327, + "learning_rate": 1.7068652037952686e-06, + "loss": 0.7575, + "step": 109860 + }, + { + "epoch": 1.7635917109423906, + "grad_norm": 0.7137137651443481, + "learning_rate": 1.7045763907974882e-06, + "loss": 0.6842, + "step": 109870 + }, + { + "epoch": 1.7637522271625548, + "grad_norm": 0.8985896706581116, + "learning_rate": 1.7022890592519214e-06, + "loss": 0.7166, + "step": 109880 + }, + { + "epoch": 1.7639127433827189, + "grad_norm": 1.1578017473220825, + "learning_rate": 1.700003209304027e-06, + "loss": 0.6466, + "step": 109890 + }, + { + "epoch": 1.764073259602883, + "grad_norm": 1.1427186727523804, + "learning_rate": 1.6977188410991718e-06, + "loss": 0.8038, + "step": 109900 + }, + { + "epoch": 1.764233775823047, + "grad_norm": 1.3143864870071411, + "learning_rate": 1.695435954782626e-06, + "loss": 0.8362, + "step": 109910 + }, + { + "epoch": 1.764394292043211, + "grad_norm": 0.9965969324111938, + "learning_rate": 1.693154550499576e-06, + "loss": 0.6973, + "step": 109920 + }, + { + "epoch": 1.7645548082633749, + "grad_norm": 1.4099326133728027, + "learning_rate": 1.6908746283950922e-06, + "loss": 0.676, + "step": 109930 + }, + { + "epoch": 1.764715324483539, + "grad_norm": 2.6983439922332764, + "learning_rate": 1.6885961886141721e-06, + "loss": 0.8236, + "step": 109940 + }, + { + "epoch": 1.764875840703703, + "grad_norm": 0.754199206829071, + "learning_rate": 1.6863192313017113e-06, + "loss": 0.6678, + "step": 109950 + }, + { + "epoch": 1.7650363569238672, + "grad_norm": 1.3870471715927124, + "learning_rate": 1.6840437566025102e-06, + "loss": 0.7066, + "step": 109960 + }, + { + "epoch": 1.7651968731440313, + "grad_norm": 0.8552031517028809, + "learning_rate": 1.6817697646612756e-06, + "loss": 0.7837, + "step": 109970 + }, + { + "epoch": 1.7653573893641954, + "grad_norm": 1.2125217914581299, + "learning_rate": 1.6794972556226197e-06, + "loss": 0.771, + "step": 109980 + }, + { + "epoch": 1.7655179055843593, + "grad_norm": 1.1718653440475464, + "learning_rate": 1.6772262296310598e-06, + "loss": 0.6388, + "step": 109990 + }, + { + "epoch": 1.7656784218045234, + "grad_norm": 1.2845113277435303, + "learning_rate": 1.6749566868310251e-06, + "loss": 0.7184, + "step": 110000 + }, + { + "epoch": 1.7658389380246873, + "grad_norm": 0.9567359685897827, + "learning_rate": 1.672688627366839e-06, + "loss": 0.7243, + "step": 110010 + }, + { + "epoch": 1.7659994542448514, + "grad_norm": 1.6762229204177856, + "learning_rate": 1.6704220513827363e-06, + "loss": 0.6697, + "step": 110020 + }, + { + "epoch": 1.7661599704650155, + "grad_norm": 1.0408070087432861, + "learning_rate": 1.6681569590228625e-06, + "loss": 0.717, + "step": 110030 + }, + { + "epoch": 1.7663204866851796, + "grad_norm": 0.9812464714050293, + "learning_rate": 1.6658933504312607e-06, + "loss": 0.7166, + "step": 110040 + }, + { + "epoch": 1.7664810029053437, + "grad_norm": 0.5312385559082031, + "learning_rate": 1.6636312257518854e-06, + "loss": 0.7443, + "step": 110050 + }, + { + "epoch": 1.7666415191255076, + "grad_norm": 1.0681425333023071, + "learning_rate": 1.661370585128591e-06, + "loss": 0.7835, + "step": 110060 + }, + { + "epoch": 1.7668020353456717, + "grad_norm": 0.883055567741394, + "learning_rate": 1.6591114287051457e-06, + "loss": 0.6307, + "step": 110070 + }, + { + "epoch": 1.7669625515658356, + "grad_norm": 1.202120304107666, + "learning_rate": 1.6568537566252178e-06, + "loss": 0.7037, + "step": 110080 + }, + { + "epoch": 1.7671230677859997, + "grad_norm": 1.2318472862243652, + "learning_rate": 1.6545975690323789e-06, + "loss": 0.7353, + "step": 110090 + }, + { + "epoch": 1.7672835840061638, + "grad_norm": 1.4176864624023438, + "learning_rate": 1.6523428660701111e-06, + "loss": 0.792, + "step": 110100 + }, + { + "epoch": 1.767444100226328, + "grad_norm": 2.7988083362579346, + "learning_rate": 1.650089647881803e-06, + "loss": 0.751, + "step": 110110 + }, + { + "epoch": 1.767604616446492, + "grad_norm": 1.2216877937316895, + "learning_rate": 1.647837914610742e-06, + "loss": 0.8106, + "step": 110120 + }, + { + "epoch": 1.767765132666656, + "grad_norm": 1.2210838794708252, + "learning_rate": 1.6455876664001224e-06, + "loss": 0.7619, + "step": 110130 + }, + { + "epoch": 1.76792564888682, + "grad_norm": 1.7509902715682983, + "learning_rate": 1.6433389033930546e-06, + "loss": 0.6121, + "step": 110140 + }, + { + "epoch": 1.768086165106984, + "grad_norm": 0.9316558241844177, + "learning_rate": 1.641091625732541e-06, + "loss": 0.6801, + "step": 110150 + }, + { + "epoch": 1.768246681327148, + "grad_norm": 1.1638891696929932, + "learning_rate": 1.6388458335614975e-06, + "loss": 0.7519, + "step": 110160 + }, + { + "epoch": 1.7684071975473121, + "grad_norm": 0.873062014579773, + "learning_rate": 1.6366015270227463e-06, + "loss": 0.7628, + "step": 110170 + }, + { + "epoch": 1.7685677137674762, + "grad_norm": 1.271787405014038, + "learning_rate": 1.6343587062590065e-06, + "loss": 0.7734, + "step": 110180 + }, + { + "epoch": 1.7687282299876403, + "grad_norm": 1.0471971035003662, + "learning_rate": 1.6321173714129085e-06, + "loss": 0.7087, + "step": 110190 + }, + { + "epoch": 1.7688887462078045, + "grad_norm": 1.053369402885437, + "learning_rate": 1.6298775226269908e-06, + "loss": 0.797, + "step": 110200 + }, + { + "epoch": 1.7690492624279683, + "grad_norm": 0.8547409176826477, + "learning_rate": 1.6276391600436953e-06, + "loss": 0.7393, + "step": 110210 + }, + { + "epoch": 1.7692097786481324, + "grad_norm": 0.934246301651001, + "learning_rate": 1.6254022838053662e-06, + "loss": 0.8299, + "step": 110220 + }, + { + "epoch": 1.7693702948682963, + "grad_norm": 0.7083905339241028, + "learning_rate": 1.6231668940542622e-06, + "loss": 0.7457, + "step": 110230 + }, + { + "epoch": 1.7695308110884604, + "grad_norm": 1.4613455533981323, + "learning_rate": 1.6209329909325304e-06, + "loss": 0.8083, + "step": 110240 + }, + { + "epoch": 1.7696913273086246, + "grad_norm": 1.204819917678833, + "learning_rate": 1.6187005745822382e-06, + "loss": 0.7413, + "step": 110250 + }, + { + "epoch": 1.7698518435287887, + "grad_norm": 1.0559204816818237, + "learning_rate": 1.6164696451453576e-06, + "loss": 0.7806, + "step": 110260 + }, + { + "epoch": 1.7700123597489528, + "grad_norm": 0.779460072517395, + "learning_rate": 1.6142402027637587e-06, + "loss": 0.7047, + "step": 110270 + }, + { + "epoch": 1.7701728759691167, + "grad_norm": 1.4909573793411255, + "learning_rate": 1.6120122475792226e-06, + "loss": 0.6438, + "step": 110280 + }, + { + "epoch": 1.7703333921892808, + "grad_norm": 0.9780411720275879, + "learning_rate": 1.6097857797334331e-06, + "loss": 0.6567, + "step": 110290 + }, + { + "epoch": 1.7704939084094446, + "grad_norm": 0.7889228463172913, + "learning_rate": 1.6075607993679853e-06, + "loss": 0.6537, + "step": 110300 + }, + { + "epoch": 1.7706544246296088, + "grad_norm": 0.9794952869415283, + "learning_rate": 1.6053373066243689e-06, + "loss": 0.7342, + "step": 110310 + }, + { + "epoch": 1.7708149408497729, + "grad_norm": 0.8691179156303406, + "learning_rate": 1.6031153016439842e-06, + "loss": 0.7806, + "step": 110320 + }, + { + "epoch": 1.770975457069937, + "grad_norm": 1.4564919471740723, + "learning_rate": 1.6008947845681439e-06, + "loss": 0.617, + "step": 110330 + }, + { + "epoch": 1.771135973290101, + "grad_norm": 1.1302490234375, + "learning_rate": 1.5986757555380538e-06, + "loss": 0.8379, + "step": 110340 + }, + { + "epoch": 1.771296489510265, + "grad_norm": 1.0628968477249146, + "learning_rate": 1.5964582146948376e-06, + "loss": 0.6839, + "step": 110350 + }, + { + "epoch": 1.771457005730429, + "grad_norm": 1.1025828123092651, + "learning_rate": 1.594242162179513e-06, + "loss": 0.5618, + "step": 110360 + }, + { + "epoch": 1.771617521950593, + "grad_norm": 1.301520586013794, + "learning_rate": 1.5920275981330084e-06, + "loss": 0.6236, + "step": 110370 + }, + { + "epoch": 1.771778038170757, + "grad_norm": 1.024350643157959, + "learning_rate": 1.5898145226961564e-06, + "loss": 0.6775, + "step": 110380 + }, + { + "epoch": 1.7719385543909212, + "grad_norm": 1.6834362745285034, + "learning_rate": 1.5876029360096995e-06, + "loss": 0.7089, + "step": 110390 + }, + { + "epoch": 1.7720990706110853, + "grad_norm": 1.1914173364639282, + "learning_rate": 1.5853928382142752e-06, + "loss": 0.681, + "step": 110400 + }, + { + "epoch": 1.7722595868312494, + "grad_norm": 2.5919642448425293, + "learning_rate": 1.583184229450438e-06, + "loss": 0.791, + "step": 110410 + }, + { + "epoch": 1.7724201030514135, + "grad_norm": 0.830954909324646, + "learning_rate": 1.5809771098586473e-06, + "loss": 0.5042, + "step": 110420 + }, + { + "epoch": 1.7725806192715774, + "grad_norm": 1.3833553791046143, + "learning_rate": 1.5787714795792496e-06, + "loss": 0.5854, + "step": 110430 + }, + { + "epoch": 1.7727411354917413, + "grad_norm": 1.4628660678863525, + "learning_rate": 1.5765673387525186e-06, + "loss": 0.6328, + "step": 110440 + }, + { + "epoch": 1.7729016517119054, + "grad_norm": 2.6454625129699707, + "learning_rate": 1.574364687518623e-06, + "loss": 0.7314, + "step": 110450 + }, + { + "epoch": 1.7730621679320695, + "grad_norm": 0.6209930181503296, + "learning_rate": 1.5721635260176392e-06, + "loss": 0.6868, + "step": 110460 + }, + { + "epoch": 1.7732226841522336, + "grad_norm": 4.788200855255127, + "learning_rate": 1.5699638543895472e-06, + "loss": 0.6948, + "step": 110470 + }, + { + "epoch": 1.7733832003723977, + "grad_norm": 1.1068910360336304, + "learning_rate": 1.567765672774238e-06, + "loss": 0.5925, + "step": 110480 + }, + { + "epoch": 1.7735437165925618, + "grad_norm": 1.2620198726654053, + "learning_rate": 1.5655689813114937e-06, + "loss": 0.7125, + "step": 110490 + }, + { + "epoch": 1.7737042328127257, + "grad_norm": 1.1301212310791016, + "learning_rate": 1.563373780141017e-06, + "loss": 0.7513, + "step": 110500 + }, + { + "epoch": 1.7738647490328898, + "grad_norm": 1.7671583890914917, + "learning_rate": 1.561180069402407e-06, + "loss": 0.6427, + "step": 110510 + }, + { + "epoch": 1.7740252652530537, + "grad_norm": 1.4677751064300537, + "learning_rate": 1.5589878492351746e-06, + "loss": 0.8554, + "step": 110520 + }, + { + "epoch": 1.7741857814732178, + "grad_norm": 0.9127177000045776, + "learning_rate": 1.55679711977873e-06, + "loss": 0.7197, + "step": 110530 + }, + { + "epoch": 1.774346297693382, + "grad_norm": 1.3710448741912842, + "learning_rate": 1.554607881172393e-06, + "loss": 0.8233, + "step": 110540 + }, + { + "epoch": 1.774506813913546, + "grad_norm": 0.9031721949577332, + "learning_rate": 1.5524201335553851e-06, + "loss": 0.6714, + "step": 110550 + }, + { + "epoch": 1.7746673301337101, + "grad_norm": 1.0964082479476929, + "learning_rate": 1.5502338770668313e-06, + "loss": 0.6388, + "step": 110560 + }, + { + "epoch": 1.774827846353874, + "grad_norm": 1.0427390336990356, + "learning_rate": 1.5480491118457647e-06, + "loss": 0.7608, + "step": 110570 + }, + { + "epoch": 1.7749883625740381, + "grad_norm": 1.3422971963882446, + "learning_rate": 1.545865838031127e-06, + "loss": 0.7322, + "step": 110580 + }, + { + "epoch": 1.775148878794202, + "grad_norm": 1.2259173393249512, + "learning_rate": 1.5436840557617598e-06, + "loss": 0.7224, + "step": 110590 + }, + { + "epoch": 1.7753093950143661, + "grad_norm": 2.0779001712799072, + "learning_rate": 1.5415037651764136e-06, + "loss": 0.7182, + "step": 110600 + }, + { + "epoch": 1.7754699112345302, + "grad_norm": 1.218536615371704, + "learning_rate": 1.5393249664137383e-06, + "loss": 0.7084, + "step": 110610 + }, + { + "epoch": 1.7756304274546943, + "grad_norm": 0.861237645149231, + "learning_rate": 1.5371476596123007e-06, + "loss": 0.6269, + "step": 110620 + }, + { + "epoch": 1.7757909436748585, + "grad_norm": 1.4332654476165771, + "learning_rate": 1.5349718449105543e-06, + "loss": 0.6839, + "step": 110630 + }, + { + "epoch": 1.7759514598950223, + "grad_norm": 0.8964446783065796, + "learning_rate": 1.5327975224468716e-06, + "loss": 0.6879, + "step": 110640 + }, + { + "epoch": 1.7761119761151865, + "grad_norm": 0.857674777507782, + "learning_rate": 1.5306246923595308e-06, + "loss": 0.7402, + "step": 110650 + }, + { + "epoch": 1.7762724923353503, + "grad_norm": 1.1934404373168945, + "learning_rate": 1.5284533547867048e-06, + "loss": 0.6994, + "step": 110660 + }, + { + "epoch": 1.7764330085555144, + "grad_norm": 1.2469979524612427, + "learning_rate": 1.5262835098664828e-06, + "loss": 0.7926, + "step": 110670 + }, + { + "epoch": 1.7765935247756786, + "grad_norm": 0.8425871133804321, + "learning_rate": 1.5241151577368551e-06, + "loss": 0.8327, + "step": 110680 + }, + { + "epoch": 1.7767540409958427, + "grad_norm": 1.254449725151062, + "learning_rate": 1.521948298535711e-06, + "loss": 0.7071, + "step": 110690 + }, + { + "epoch": 1.7769145572160068, + "grad_norm": 1.1684662103652954, + "learning_rate": 1.5197829324008539e-06, + "loss": 0.813, + "step": 110700 + }, + { + "epoch": 1.7770750734361709, + "grad_norm": 0.9424136877059937, + "learning_rate": 1.517619059469988e-06, + "loss": 0.7606, + "step": 110710 + }, + { + "epoch": 1.7772355896563348, + "grad_norm": 1.1186928749084473, + "learning_rate": 1.5154566798807224e-06, + "loss": 0.7337, + "step": 110720 + }, + { + "epoch": 1.7773961058764987, + "grad_norm": 1.4662104845046997, + "learning_rate": 1.513295793770575e-06, + "loss": 0.6986, + "step": 110730 + }, + { + "epoch": 1.7775566220966628, + "grad_norm": 1.227920651435852, + "learning_rate": 1.511136401276958e-06, + "loss": 0.5788, + "step": 110740 + }, + { + "epoch": 1.7777171383168269, + "grad_norm": 0.8073061108589172, + "learning_rate": 1.5089785025372006e-06, + "loss": 0.7477, + "step": 110750 + }, + { + "epoch": 1.777877654536991, + "grad_norm": 1.2839531898498535, + "learning_rate": 1.5068220976885316e-06, + "loss": 0.6163, + "step": 110760 + }, + { + "epoch": 1.778038170757155, + "grad_norm": 1.1500049829483032, + "learning_rate": 1.5046671868680861e-06, + "loss": 0.7306, + "step": 110770 + }, + { + "epoch": 1.7781986869773192, + "grad_norm": 0.979301929473877, + "learning_rate": 1.5025137702129072e-06, + "loss": 0.7287, + "step": 110780 + }, + { + "epoch": 1.778359203197483, + "grad_norm": 1.3958507776260376, + "learning_rate": 1.5003618478599352e-06, + "loss": 0.7706, + "step": 110790 + }, + { + "epoch": 1.7785197194176472, + "grad_norm": 1.2195500135421753, + "learning_rate": 1.498211419946019e-06, + "loss": 0.6528, + "step": 110800 + }, + { + "epoch": 1.778680235637811, + "grad_norm": 1.447446346282959, + "learning_rate": 1.496062486607916e-06, + "loss": 0.8141, + "step": 110810 + }, + { + "epoch": 1.7788407518579752, + "grad_norm": 0.8556258678436279, + "learning_rate": 1.4939150479822834e-06, + "loss": 0.7848, + "step": 110820 + }, + { + "epoch": 1.7790012680781393, + "grad_norm": 1.3070250749588013, + "learning_rate": 1.4917691042056897e-06, + "loss": 0.7651, + "step": 110830 + }, + { + "epoch": 1.7791617842983034, + "grad_norm": 1.2173051834106445, + "learning_rate": 1.4896246554145981e-06, + "loss": 0.8039, + "step": 110840 + }, + { + "epoch": 1.7793223005184675, + "grad_norm": 1.0477969646453857, + "learning_rate": 1.4874817017453885e-06, + "loss": 0.7485, + "step": 110850 + }, + { + "epoch": 1.7794828167386314, + "grad_norm": 0.9922446608543396, + "learning_rate": 1.4853402433343406e-06, + "loss": 0.6107, + "step": 110860 + }, + { + "epoch": 1.7796433329587955, + "grad_norm": 0.7710512280464172, + "learning_rate": 1.4832002803176342e-06, + "loss": 0.811, + "step": 110870 + }, + { + "epoch": 1.7798038491789594, + "grad_norm": 1.4085322618484497, + "learning_rate": 1.4810618128313608e-06, + "loss": 0.668, + "step": 110880 + }, + { + "epoch": 1.7799643653991235, + "grad_norm": 1.2425822019577026, + "learning_rate": 1.4789248410115113e-06, + "loss": 0.6602, + "step": 110890 + }, + { + "epoch": 1.7801248816192876, + "grad_norm": 1.2735191583633423, + "learning_rate": 1.4767893649939885e-06, + "loss": 0.721, + "step": 110900 + }, + { + "epoch": 1.7802853978394517, + "grad_norm": 0.8622840642929077, + "learning_rate": 1.4746553849145944e-06, + "loss": 0.7363, + "step": 110910 + }, + { + "epoch": 1.7804459140596158, + "grad_norm": 1.3667668104171753, + "learning_rate": 1.472522900909043e-06, + "loss": 0.6303, + "step": 110920 + }, + { + "epoch": 1.7806064302797797, + "grad_norm": 0.9255276918411255, + "learning_rate": 1.4703919131129367e-06, + "loss": 0.7173, + "step": 110930 + }, + { + "epoch": 1.7807669464999438, + "grad_norm": 5.098833084106445, + "learning_rate": 1.4682624216618035e-06, + "loss": 0.7159, + "step": 110940 + }, + { + "epoch": 1.7809274627201077, + "grad_norm": 1.7357770204544067, + "learning_rate": 1.4661344266910599e-06, + "loss": 0.7086, + "step": 110950 + }, + { + "epoch": 1.7810879789402718, + "grad_norm": 2.416666269302368, + "learning_rate": 1.4640079283360392e-06, + "loss": 0.6524, + "step": 110960 + }, + { + "epoch": 1.781248495160436, + "grad_norm": 1.8073519468307495, + "learning_rate": 1.4618829267319695e-06, + "loss": 0.6668, + "step": 110970 + }, + { + "epoch": 1.7814090113806, + "grad_norm": 1.3399847745895386, + "learning_rate": 1.4597594220139982e-06, + "loss": 0.6945, + "step": 110980 + }, + { + "epoch": 1.7815695276007641, + "grad_norm": 4.503965854644775, + "learning_rate": 1.4576374143171535e-06, + "loss": 0.7242, + "step": 110990 + }, + { + "epoch": 1.7817300438209283, + "grad_norm": 1.4964460134506226, + "learning_rate": 1.4555169037763911e-06, + "loss": 0.6499, + "step": 111000 + }, + { + "epoch": 1.7818905600410921, + "grad_norm": 0.8516881465911865, + "learning_rate": 1.4533978905265615e-06, + "loss": 0.6463, + "step": 111010 + }, + { + "epoch": 1.7820510762612563, + "grad_norm": 0.9055961966514587, + "learning_rate": 1.4512803747024213e-06, + "loss": 0.5795, + "step": 111020 + }, + { + "epoch": 1.7822115924814201, + "grad_norm": 0.6320428848266602, + "learning_rate": 1.4491643564386315e-06, + "loss": 0.71, + "step": 111030 + }, + { + "epoch": 1.7823721087015842, + "grad_norm": 0.9674670696258545, + "learning_rate": 1.4470498358697653e-06, + "loss": 0.6785, + "step": 111040 + }, + { + "epoch": 1.7825326249217484, + "grad_norm": 1.9900233745574951, + "learning_rate": 1.444936813130282e-06, + "loss": 0.6383, + "step": 111050 + }, + { + "epoch": 1.7826931411419125, + "grad_norm": 1.6300126314163208, + "learning_rate": 1.442825288354563e-06, + "loss": 0.762, + "step": 111060 + }, + { + "epoch": 1.7828536573620766, + "grad_norm": 0.9427264928817749, + "learning_rate": 1.4407152616768892e-06, + "loss": 0.6491, + "step": 111070 + }, + { + "epoch": 1.7830141735822405, + "grad_norm": 1.0887260437011719, + "learning_rate": 1.4386067332314484e-06, + "loss": 0.6973, + "step": 111080 + }, + { + "epoch": 1.7831746898024046, + "grad_norm": 0.8432139158248901, + "learning_rate": 1.4364997031523247e-06, + "loss": 0.6565, + "step": 111090 + }, + { + "epoch": 1.7833352060225685, + "grad_norm": 0.8156108856201172, + "learning_rate": 1.4343941715735193e-06, + "loss": 0.6877, + "step": 111100 + }, + { + "epoch": 1.7834957222427326, + "grad_norm": 0.719372570514679, + "learning_rate": 1.4322901386289277e-06, + "loss": 0.6155, + "step": 111110 + }, + { + "epoch": 1.7836562384628967, + "grad_norm": 1.034720778465271, + "learning_rate": 1.4301876044523544e-06, + "loss": 0.7766, + "step": 111120 + }, + { + "epoch": 1.7838167546830608, + "grad_norm": 1.0004373788833618, + "learning_rate": 1.4280865691775087e-06, + "loss": 0.7222, + "step": 111130 + }, + { + "epoch": 1.7839772709032249, + "grad_norm": 1.1814959049224854, + "learning_rate": 1.4259870329380032e-06, + "loss": 0.7151, + "step": 111140 + }, + { + "epoch": 1.7841377871233888, + "grad_norm": 1.1015406847000122, + "learning_rate": 1.423888995867359e-06, + "loss": 0.603, + "step": 111150 + }, + { + "epoch": 1.7842983033435529, + "grad_norm": 0.9249890446662903, + "learning_rate": 1.4217924580989945e-06, + "loss": 0.7747, + "step": 111160 + }, + { + "epoch": 1.7844588195637168, + "grad_norm": 0.9021983742713928, + "learning_rate": 1.4196974197662444e-06, + "loss": 0.7083, + "step": 111170 + }, + { + "epoch": 1.7846193357838809, + "grad_norm": 1.5558667182922363, + "learning_rate": 1.417603881002333e-06, + "loss": 0.7217, + "step": 111180 + }, + { + "epoch": 1.784779852004045, + "grad_norm": 1.0393030643463135, + "learning_rate": 1.415511841940398e-06, + "loss": 0.7245, + "step": 111190 + }, + { + "epoch": 1.784940368224209, + "grad_norm": 1.0849003791809082, + "learning_rate": 1.4134213027134858e-06, + "loss": 0.7268, + "step": 111200 + }, + { + "epoch": 1.7851008844443732, + "grad_norm": 0.7076601386070251, + "learning_rate": 1.4113322634545373e-06, + "loss": 0.6929, + "step": 111210 + }, + { + "epoch": 1.7852614006645373, + "grad_norm": 1.1387109756469727, + "learning_rate": 1.4092447242964046e-06, + "loss": 0.7426, + "step": 111220 + }, + { + "epoch": 1.7854219168847012, + "grad_norm": 1.0674664974212646, + "learning_rate": 1.4071586853718456e-06, + "loss": 0.7032, + "step": 111230 + }, + { + "epoch": 1.785582433104865, + "grad_norm": 0.7723910212516785, + "learning_rate": 1.4050741468135175e-06, + "loss": 0.6361, + "step": 111240 + }, + { + "epoch": 1.7857429493250292, + "grad_norm": 0.9037462472915649, + "learning_rate": 1.4029911087539899e-06, + "loss": 0.7088, + "step": 111250 + }, + { + "epoch": 1.7859034655451933, + "grad_norm": 2.1440441608428955, + "learning_rate": 1.4009095713257258e-06, + "loss": 0.7241, + "step": 111260 + }, + { + "epoch": 1.7860639817653574, + "grad_norm": 1.7499427795410156, + "learning_rate": 1.3988295346610974e-06, + "loss": 0.6901, + "step": 111270 + }, + { + "epoch": 1.7862244979855215, + "grad_norm": 1.128433108329773, + "learning_rate": 1.3967509988923848e-06, + "loss": 0.5655, + "step": 111280 + }, + { + "epoch": 1.7863850142056856, + "grad_norm": 1.071362853050232, + "learning_rate": 1.3946739641517743e-06, + "loss": 0.6272, + "step": 111290 + }, + { + "epoch": 1.7865455304258495, + "grad_norm": 1.2449549436569214, + "learning_rate": 1.3925984305713485e-06, + "loss": 0.6565, + "step": 111300 + }, + { + "epoch": 1.7867060466460136, + "grad_norm": 0.846024215221405, + "learning_rate": 1.3905243982831024e-06, + "loss": 0.7537, + "step": 111310 + }, + { + "epoch": 1.7868665628661775, + "grad_norm": 1.060167908668518, + "learning_rate": 1.3884518674189329e-06, + "loss": 0.8125, + "step": 111320 + }, + { + "epoch": 1.7870270790863416, + "grad_norm": 1.1372904777526855, + "learning_rate": 1.3863808381106347e-06, + "loss": 0.6956, + "step": 111330 + }, + { + "epoch": 1.7871875953065057, + "grad_norm": 1.183023452758789, + "learning_rate": 1.3843113104899192e-06, + "loss": 0.8013, + "step": 111340 + }, + { + "epoch": 1.7873481115266698, + "grad_norm": 1.396918535232544, + "learning_rate": 1.3822432846883949e-06, + "loss": 0.6272, + "step": 111350 + }, + { + "epoch": 1.787508627746834, + "grad_norm": 1.3716000318527222, + "learning_rate": 1.3801767608375787e-06, + "loss": 0.6619, + "step": 111360 + }, + { + "epoch": 1.7876691439669978, + "grad_norm": 0.7991777062416077, + "learning_rate": 1.3781117390688825e-06, + "loss": 0.7237, + "step": 111370 + }, + { + "epoch": 1.787829660187162, + "grad_norm": 1.4383654594421387, + "learning_rate": 1.3760482195136344e-06, + "loss": 0.7077, + "step": 111380 + }, + { + "epoch": 1.7879901764073258, + "grad_norm": 0.9295501112937927, + "learning_rate": 1.3739862023030597e-06, + "loss": 0.7007, + "step": 111390 + }, + { + "epoch": 1.78815069262749, + "grad_norm": 1.255152940750122, + "learning_rate": 1.3719256875682929e-06, + "loss": 0.7283, + "step": 111400 + }, + { + "epoch": 1.788311208847654, + "grad_norm": 0.8812426328659058, + "learning_rate": 1.3698666754403677e-06, + "loss": 0.6393, + "step": 111410 + }, + { + "epoch": 1.7884717250678182, + "grad_norm": 0.8412447571754456, + "learning_rate": 1.3678091660502323e-06, + "loss": 0.719, + "step": 111420 + }, + { + "epoch": 1.7886322412879823, + "grad_norm": 1.6532554626464844, + "learning_rate": 1.3657531595287238e-06, + "loss": 0.6684, + "step": 111430 + }, + { + "epoch": 1.7887927575081461, + "grad_norm": 2.188028573989868, + "learning_rate": 1.3636986560065956e-06, + "loss": 0.7706, + "step": 111440 + }, + { + "epoch": 1.7889532737283103, + "grad_norm": 1.2543506622314453, + "learning_rate": 1.3616456556145018e-06, + "loss": 0.6687, + "step": 111450 + }, + { + "epoch": 1.7891137899484741, + "grad_norm": 0.9925280213356018, + "learning_rate": 1.359594158482999e-06, + "loss": 0.6814, + "step": 111460 + }, + { + "epoch": 1.7892743061686383, + "grad_norm": 1.076520323753357, + "learning_rate": 1.3575441647425553e-06, + "loss": 0.6673, + "step": 111470 + }, + { + "epoch": 1.7894348223888024, + "grad_norm": 1.852329969406128, + "learning_rate": 1.3554956745235381e-06, + "loss": 0.7474, + "step": 111480 + }, + { + "epoch": 1.7895953386089665, + "grad_norm": 1.1331628561019897, + "learning_rate": 1.353448687956213e-06, + "loss": 0.7309, + "step": 111490 + }, + { + "epoch": 1.7897558548291306, + "grad_norm": 1.0037798881530762, + "learning_rate": 1.3514032051707593e-06, + "loss": 0.7448, + "step": 111500 + }, + { + "epoch": 1.7899163710492947, + "grad_norm": 1.3438069820404053, + "learning_rate": 1.3493592262972616e-06, + "loss": 0.625, + "step": 111510 + }, + { + "epoch": 1.7900768872694586, + "grad_norm": 0.9738750457763672, + "learning_rate": 1.3473167514656992e-06, + "loss": 0.8127, + "step": 111520 + }, + { + "epoch": 1.7902374034896227, + "grad_norm": 1.4779982566833496, + "learning_rate": 1.3452757808059657e-06, + "loss": 0.784, + "step": 111530 + }, + { + "epoch": 1.7903979197097866, + "grad_norm": 1.538784384727478, + "learning_rate": 1.3432363144478544e-06, + "loss": 0.7576, + "step": 111540 + }, + { + "epoch": 1.7905584359299507, + "grad_norm": 1.0519766807556152, + "learning_rate": 1.3411983525210613e-06, + "loss": 0.8641, + "step": 111550 + }, + { + "epoch": 1.7907189521501148, + "grad_norm": 0.9087038040161133, + "learning_rate": 1.3391618951551887e-06, + "loss": 0.7057, + "step": 111560 + }, + { + "epoch": 1.790879468370279, + "grad_norm": 2.657480001449585, + "learning_rate": 1.3371269424797467e-06, + "loss": 0.7337, + "step": 111570 + }, + { + "epoch": 1.791039984590443, + "grad_norm": 1.0584148168563843, + "learning_rate": 1.3350934946241428e-06, + "loss": 0.722, + "step": 111580 + }, + { + "epoch": 1.7912005008106069, + "grad_norm": 1.467778205871582, + "learning_rate": 1.333061551717693e-06, + "loss": 0.7983, + "step": 111590 + }, + { + "epoch": 1.791361017030771, + "grad_norm": 0.6631199717521667, + "learning_rate": 1.331031113889622e-06, + "loss": 0.7103, + "step": 111600 + }, + { + "epoch": 1.7915215332509349, + "grad_norm": 1.2661128044128418, + "learning_rate": 1.3290021812690457e-06, + "loss": 0.7881, + "step": 111610 + }, + { + "epoch": 1.791682049471099, + "grad_norm": 0.8121427893638611, + "learning_rate": 1.326974753984997e-06, + "loss": 0.6762, + "step": 111620 + }, + { + "epoch": 1.791842565691263, + "grad_norm": 0.7901380062103271, + "learning_rate": 1.324948832166406e-06, + "loss": 0.785, + "step": 111630 + }, + { + "epoch": 1.7920030819114272, + "grad_norm": 1.722050666809082, + "learning_rate": 1.3229244159421118e-06, + "loss": 0.6639, + "step": 111640 + }, + { + "epoch": 1.7921635981315913, + "grad_norm": 1.2145295143127441, + "learning_rate": 1.3209015054408525e-06, + "loss": 0.7626, + "step": 111650 + }, + { + "epoch": 1.7923241143517552, + "grad_norm": 1.221474528312683, + "learning_rate": 1.3188801007912783e-06, + "loss": 0.6611, + "step": 111660 + }, + { + "epoch": 1.7924846305719193, + "grad_norm": 1.7251040935516357, + "learning_rate": 1.3168602021219362e-06, + "loss": 0.6819, + "step": 111670 + }, + { + "epoch": 1.7926451467920832, + "grad_norm": 1.0942635536193848, + "learning_rate": 1.3148418095612763e-06, + "loss": 0.5891, + "step": 111680 + }, + { + "epoch": 1.7928056630122473, + "grad_norm": 1.1817456483840942, + "learning_rate": 1.31282492323766e-06, + "loss": 0.6438, + "step": 111690 + }, + { + "epoch": 1.7929661792324114, + "grad_norm": 1.6600409746170044, + "learning_rate": 1.310809543279351e-06, + "loss": 0.7617, + "step": 111700 + }, + { + "epoch": 1.7931266954525755, + "grad_norm": 1.6383087635040283, + "learning_rate": 1.308795669814511e-06, + "loss": 0.7113, + "step": 111710 + }, + { + "epoch": 1.7932872116727396, + "grad_norm": 1.5314157009124756, + "learning_rate": 1.306783302971215e-06, + "loss": 0.7128, + "step": 111720 + }, + { + "epoch": 1.7934477278929037, + "grad_norm": 1.4054149389266968, + "learning_rate": 1.3047724428774389e-06, + "loss": 0.7873, + "step": 111730 + }, + { + "epoch": 1.7936082441130676, + "grad_norm": 1.0308873653411865, + "learning_rate": 1.3027630896610549e-06, + "loss": 0.608, + "step": 111740 + }, + { + "epoch": 1.7937687603332315, + "grad_norm": 0.9964166879653931, + "learning_rate": 1.3007552434498472e-06, + "loss": 0.6804, + "step": 111750 + }, + { + "epoch": 1.7939292765533956, + "grad_norm": 1.0951793193817139, + "learning_rate": 1.298748904371508e-06, + "loss": 0.6457, + "step": 111760 + }, + { + "epoch": 1.7940897927735597, + "grad_norm": 1.3828377723693848, + "learning_rate": 1.2967440725536267e-06, + "loss": 0.6547, + "step": 111770 + }, + { + "epoch": 1.7942503089937238, + "grad_norm": 1.2096657752990723, + "learning_rate": 1.2947407481236989e-06, + "loss": 0.6357, + "step": 111780 + }, + { + "epoch": 1.794410825213888, + "grad_norm": 1.120471477508545, + "learning_rate": 1.2927389312091255e-06, + "loss": 0.8041, + "step": 111790 + }, + { + "epoch": 1.794571341434052, + "grad_norm": 0.9736739993095398, + "learning_rate": 1.290738621937207e-06, + "loss": 0.66, + "step": 111800 + }, + { + "epoch": 1.794731857654216, + "grad_norm": 0.9692931771278381, + "learning_rate": 1.2887398204351503e-06, + "loss": 0.802, + "step": 111810 + }, + { + "epoch": 1.79489237387438, + "grad_norm": 0.8037682175636292, + "learning_rate": 1.2867425268300703e-06, + "loss": 0.6508, + "step": 111820 + }, + { + "epoch": 1.795052890094544, + "grad_norm": 0.8227572441101074, + "learning_rate": 1.284746741248985e-06, + "loss": 0.7345, + "step": 111830 + }, + { + "epoch": 1.795213406314708, + "grad_norm": 1.660598635673523, + "learning_rate": 1.282752463818812e-06, + "loss": 0.7171, + "step": 111840 + }, + { + "epoch": 1.7953739225348722, + "grad_norm": 0.8868923187255859, + "learning_rate": 1.2807596946663752e-06, + "loss": 0.7275, + "step": 111850 + }, + { + "epoch": 1.7955344387550363, + "grad_norm": 0.8978890776634216, + "learning_rate": 1.2787684339184036e-06, + "loss": 0.6943, + "step": 111860 + }, + { + "epoch": 1.7956949549752004, + "grad_norm": 0.912710428237915, + "learning_rate": 1.2767786817015348e-06, + "loss": 0.7193, + "step": 111870 + }, + { + "epoch": 1.7958554711953643, + "grad_norm": 1.3158464431762695, + "learning_rate": 1.2747904381422982e-06, + "loss": 0.6691, + "step": 111880 + }, + { + "epoch": 1.7960159874155284, + "grad_norm": 1.18837308883667, + "learning_rate": 1.2728037033671343e-06, + "loss": 0.6524, + "step": 111890 + }, + { + "epoch": 1.7961765036356923, + "grad_norm": 1.0804314613342285, + "learning_rate": 1.2708184775023917e-06, + "loss": 0.7019, + "step": 111900 + }, + { + "epoch": 1.7963370198558564, + "grad_norm": 0.8482969403266907, + "learning_rate": 1.2688347606743173e-06, + "loss": 0.6456, + "step": 111910 + }, + { + "epoch": 1.7964975360760205, + "grad_norm": 1.7941862344741821, + "learning_rate": 1.2668525530090652e-06, + "loss": 0.8171, + "step": 111920 + }, + { + "epoch": 1.7966580522961846, + "grad_norm": 1.6941581964492798, + "learning_rate": 1.2648718546326903e-06, + "loss": 0.8651, + "step": 111930 + }, + { + "epoch": 1.7968185685163487, + "grad_norm": 1.13310706615448, + "learning_rate": 1.2628926656711559e-06, + "loss": 0.524, + "step": 111940 + }, + { + "epoch": 1.7969790847365126, + "grad_norm": 0.9679355621337891, + "learning_rate": 1.260914986250325e-06, + "loss": 0.6549, + "step": 111950 + }, + { + "epoch": 1.7971396009566767, + "grad_norm": 0.7853065729141235, + "learning_rate": 1.2589388164959665e-06, + "loss": 0.8063, + "step": 111960 + }, + { + "epoch": 1.7973001171768406, + "grad_norm": 0.7299247980117798, + "learning_rate": 1.256964156533752e-06, + "loss": 0.6756, + "step": 111970 + }, + { + "epoch": 1.7974606333970047, + "grad_norm": 1.4581938982009888, + "learning_rate": 1.2549910064892645e-06, + "loss": 0.6623, + "step": 111980 + }, + { + "epoch": 1.7976211496171688, + "grad_norm": 1.185529112815857, + "learning_rate": 1.2530193664879757e-06, + "loss": 0.6122, + "step": 111990 + }, + { + "epoch": 1.797781665837333, + "grad_norm": 1.1488416194915771, + "learning_rate": 1.2510492366552745e-06, + "loss": 0.6832, + "step": 112000 + }, + { + "epoch": 1.797781665837333, + "eval_loss": 0.7689330577850342, + "eval_runtime": 1833.7289, + "eval_samples_per_second": 14.305, + "eval_steps_per_second": 1.788, + "step": 112000 + }, + { + "epoch": 1.797942182057497, + "grad_norm": 0.7656244039535522, + "learning_rate": 1.2490806171164494e-06, + "loss": 0.7296, + "step": 112010 + }, + { + "epoch": 1.7981026982776611, + "grad_norm": 0.7751190066337585, + "learning_rate": 1.2471135079966944e-06, + "loss": 0.7846, + "step": 112020 + }, + { + "epoch": 1.798263214497825, + "grad_norm": 1.0139992237091064, + "learning_rate": 1.2451479094211043e-06, + "loss": 0.795, + "step": 112030 + }, + { + "epoch": 1.7984237307179889, + "grad_norm": 1.246437668800354, + "learning_rate": 1.2431838215146846e-06, + "loss": 0.7414, + "step": 112040 + }, + { + "epoch": 1.798584246938153, + "grad_norm": 1.2143282890319824, + "learning_rate": 1.2412212444023325e-06, + "loss": 0.7464, + "step": 112050 + }, + { + "epoch": 1.798744763158317, + "grad_norm": 0.9224913716316223, + "learning_rate": 1.2392601782088564e-06, + "loss": 0.7639, + "step": 112060 + }, + { + "epoch": 1.7989052793784812, + "grad_norm": 1.0620042085647583, + "learning_rate": 1.2373006230589734e-06, + "loss": 0.7904, + "step": 112070 + }, + { + "epoch": 1.7990657955986453, + "grad_norm": 0.7313375473022461, + "learning_rate": 1.2353425790772977e-06, + "loss": 0.7439, + "step": 112080 + }, + { + "epoch": 1.7992263118188094, + "grad_norm": 1.2852553129196167, + "learning_rate": 1.2333860463883518e-06, + "loss": 0.7483, + "step": 112090 + }, + { + "epoch": 1.7993868280389733, + "grad_norm": 0.8419426679611206, + "learning_rate": 1.2314310251165585e-06, + "loss": 0.6863, + "step": 112100 + }, + { + "epoch": 1.7995473442591374, + "grad_norm": 0.9245090484619141, + "learning_rate": 1.2294775153862404e-06, + "loss": 0.6687, + "step": 112110 + }, + { + "epoch": 1.7997078604793013, + "grad_norm": 1.9231232404708862, + "learning_rate": 1.227525517321637e-06, + "loss": 0.7028, + "step": 112120 + }, + { + "epoch": 1.7998683766994654, + "grad_norm": 1.0383647680282593, + "learning_rate": 1.2255750310468773e-06, + "loss": 0.5934, + "step": 112130 + }, + { + "epoch": 1.8000288929196295, + "grad_norm": 1.313997507095337, + "learning_rate": 1.2236260566860057e-06, + "loss": 0.6222, + "step": 112140 + }, + { + "epoch": 1.8001894091397936, + "grad_norm": 0.9353392124176025, + "learning_rate": 1.2216785943629627e-06, + "loss": 0.8178, + "step": 112150 + }, + { + "epoch": 1.8003499253599577, + "grad_norm": 1.5193140506744385, + "learning_rate": 1.2197326442015962e-06, + "loss": 0.7096, + "step": 112160 + }, + { + "epoch": 1.8005104415801216, + "grad_norm": 1.0079597234725952, + "learning_rate": 1.2177882063256624e-06, + "loss": 0.7738, + "step": 112170 + }, + { + "epoch": 1.8006709578002857, + "grad_norm": 1.217673897743225, + "learning_rate": 1.2158452808588071e-06, + "loss": 0.6683, + "step": 112180 + }, + { + "epoch": 1.8008314740204496, + "grad_norm": 1.1003901958465576, + "learning_rate": 1.2139038679245929e-06, + "loss": 0.5981, + "step": 112190 + }, + { + "epoch": 1.8009919902406137, + "grad_norm": 1.0216776132583618, + "learning_rate": 1.2119639676464816e-06, + "loss": 0.7425, + "step": 112200 + }, + { + "epoch": 1.8011525064607778, + "grad_norm": 1.1689356565475464, + "learning_rate": 1.2100255801478417e-06, + "loss": 0.7359, + "step": 112210 + }, + { + "epoch": 1.801313022680942, + "grad_norm": 0.8483970761299133, + "learning_rate": 1.2080887055519408e-06, + "loss": 0.627, + "step": 112220 + }, + { + "epoch": 1.801473538901106, + "grad_norm": 0.777038037776947, + "learning_rate": 1.2061533439819583e-06, + "loss": 0.7124, + "step": 112230 + }, + { + "epoch": 1.80163405512127, + "grad_norm": 1.8126378059387207, + "learning_rate": 1.2042194955609653e-06, + "loss": 0.5706, + "step": 112240 + }, + { + "epoch": 1.801794571341434, + "grad_norm": 0.8730329275131226, + "learning_rate": 1.202287160411944e-06, + "loss": 0.6246, + "step": 112250 + }, + { + "epoch": 1.801955087561598, + "grad_norm": 1.3790228366851807, + "learning_rate": 1.2003563386577793e-06, + "loss": 0.6455, + "step": 112260 + }, + { + "epoch": 1.802115603781762, + "grad_norm": 0.9599780440330505, + "learning_rate": 1.198427030421262e-06, + "loss": 0.6984, + "step": 112270 + }, + { + "epoch": 1.8022761200019262, + "grad_norm": 0.912298858165741, + "learning_rate": 1.1964992358250882e-06, + "loss": 0.6848, + "step": 112280 + }, + { + "epoch": 1.8024366362220903, + "grad_norm": 1.1410623788833618, + "learning_rate": 1.1945729549918488e-06, + "loss": 0.7747, + "step": 112290 + }, + { + "epoch": 1.8025971524422544, + "grad_norm": 2.3491339683532715, + "learning_rate": 1.192648188044046e-06, + "loss": 0.7013, + "step": 112300 + }, + { + "epoch": 1.8027576686624185, + "grad_norm": 1.006022572517395, + "learning_rate": 1.1907249351040817e-06, + "loss": 0.8638, + "step": 112310 + }, + { + "epoch": 1.8029181848825824, + "grad_norm": 0.884968638420105, + "learning_rate": 1.1888031962942663e-06, + "loss": 0.6714, + "step": 112320 + }, + { + "epoch": 1.8030787011027465, + "grad_norm": 1.0885484218597412, + "learning_rate": 1.1868829717368106e-06, + "loss": 0.8688, + "step": 112330 + }, + { + "epoch": 1.8032392173229104, + "grad_norm": 1.1178267002105713, + "learning_rate": 1.1849642615538282e-06, + "loss": 0.7656, + "step": 112340 + }, + { + "epoch": 1.8033997335430745, + "grad_norm": 0.7689014077186584, + "learning_rate": 1.1830470658673432e-06, + "loss": 0.6389, + "step": 112350 + }, + { + "epoch": 1.8035602497632386, + "grad_norm": 0.8013586401939392, + "learning_rate": 1.1811313847992695e-06, + "loss": 0.7162, + "step": 112360 + }, + { + "epoch": 1.8037207659834027, + "grad_norm": 0.5813196301460266, + "learning_rate": 1.1792172184714372e-06, + "loss": 0.6929, + "step": 112370 + }, + { + "epoch": 1.8038812822035668, + "grad_norm": 0.9638510942459106, + "learning_rate": 1.1773045670055744e-06, + "loss": 0.7202, + "step": 112380 + }, + { + "epoch": 1.8040417984237307, + "grad_norm": 0.9914662837982178, + "learning_rate": 1.1753934305233165e-06, + "loss": 0.7103, + "step": 112390 + }, + { + "epoch": 1.8042023146438948, + "grad_norm": 0.9228740930557251, + "learning_rate": 1.173483809146203e-06, + "loss": 0.7548, + "step": 112400 + }, + { + "epoch": 1.8043628308640587, + "grad_norm": 0.7909543514251709, + "learning_rate": 1.1715757029956725e-06, + "loss": 0.6899, + "step": 112410 + }, + { + "epoch": 1.8045233470842228, + "grad_norm": 1.2987627983093262, + "learning_rate": 1.1696691121930642e-06, + "loss": 0.6955, + "step": 112420 + }, + { + "epoch": 1.804683863304387, + "grad_norm": 0.9791368842124939, + "learning_rate": 1.1677640368596337e-06, + "loss": 0.7424, + "step": 112430 + }, + { + "epoch": 1.804844379524551, + "grad_norm": 1.2100024223327637, + "learning_rate": 1.1658604771165289e-06, + "loss": 0.5429, + "step": 112440 + }, + { + "epoch": 1.8050048957447151, + "grad_norm": 1.0630601644515991, + "learning_rate": 1.1639584330848025e-06, + "loss": 0.6989, + "step": 112450 + }, + { + "epoch": 1.805165411964879, + "grad_norm": 0.9820070266723633, + "learning_rate": 1.162057904885419e-06, + "loss": 0.7214, + "step": 112460 + }, + { + "epoch": 1.8053259281850431, + "grad_norm": 1.4103347063064575, + "learning_rate": 1.1601588926392376e-06, + "loss": 0.6844, + "step": 112470 + }, + { + "epoch": 1.805486444405207, + "grad_norm": 2.0216329097747803, + "learning_rate": 1.1582613964670253e-06, + "loss": 0.7488, + "step": 112480 + }, + { + "epoch": 1.8056469606253711, + "grad_norm": 1.0930004119873047, + "learning_rate": 1.1563654164894522e-06, + "loss": 0.7844, + "step": 112490 + }, + { + "epoch": 1.8058074768455352, + "grad_norm": 0.9885354042053223, + "learning_rate": 1.1544709528270915e-06, + "loss": 0.6906, + "step": 112500 + }, + { + "epoch": 1.8059679930656993, + "grad_norm": 0.906060516834259, + "learning_rate": 1.1525780056004188e-06, + "loss": 0.7603, + "step": 112510 + }, + { + "epoch": 1.8061285092858634, + "grad_norm": 1.2918128967285156, + "learning_rate": 1.150686574929813e-06, + "loss": 0.7089, + "step": 112520 + }, + { + "epoch": 1.8062890255060275, + "grad_norm": 0.6986578106880188, + "learning_rate": 1.148796660935561e-06, + "loss": 0.6394, + "step": 112530 + }, + { + "epoch": 1.8064495417261914, + "grad_norm": 1.1964185237884521, + "learning_rate": 1.1469082637378476e-06, + "loss": 0.7515, + "step": 112540 + }, + { + "epoch": 1.8066100579463553, + "grad_norm": 1.0079114437103271, + "learning_rate": 1.1450213834567653e-06, + "loss": 0.7149, + "step": 112550 + }, + { + "epoch": 1.8067705741665194, + "grad_norm": 0.8196514844894409, + "learning_rate": 1.1431360202123098e-06, + "loss": 0.6503, + "step": 112560 + }, + { + "epoch": 1.8069310903866835, + "grad_norm": 1.2185271978378296, + "learning_rate": 1.141252174124377e-06, + "loss": 0.6045, + "step": 112570 + }, + { + "epoch": 1.8070916066068476, + "grad_norm": 1.5939403772354126, + "learning_rate": 1.1393698453127682e-06, + "loss": 0.7307, + "step": 112580 + }, + { + "epoch": 1.8072521228270118, + "grad_norm": 1.294407606124878, + "learning_rate": 1.1374890338971905e-06, + "loss": 0.7657, + "step": 112590 + }, + { + "epoch": 1.8074126390471759, + "grad_norm": 1.0162162780761719, + "learning_rate": 1.1356097399972537e-06, + "loss": 0.6996, + "step": 112600 + }, + { + "epoch": 1.8075731552673397, + "grad_norm": 1.0570377111434937, + "learning_rate": 1.1337319637324678e-06, + "loss": 0.7403, + "step": 112610 + }, + { + "epoch": 1.8077336714875039, + "grad_norm": 1.138185739517212, + "learning_rate": 1.1318557052222456e-06, + "loss": 0.8412, + "step": 112620 + }, + { + "epoch": 1.8078941877076677, + "grad_norm": 1.2472827434539795, + "learning_rate": 1.1299809645859083e-06, + "loss": 0.7056, + "step": 112630 + }, + { + "epoch": 1.8080547039278319, + "grad_norm": 0.9789376854896545, + "learning_rate": 1.12810774194268e-06, + "loss": 0.6336, + "step": 112640 + }, + { + "epoch": 1.808215220147996, + "grad_norm": 0.919017493724823, + "learning_rate": 1.1262360374116848e-06, + "loss": 0.707, + "step": 112650 + }, + { + "epoch": 1.80837573636816, + "grad_norm": 0.8932560086250305, + "learning_rate": 1.1243658511119582e-06, + "loss": 0.6826, + "step": 112660 + }, + { + "epoch": 1.8085362525883242, + "grad_norm": 0.8848891258239746, + "learning_rate": 1.1224971831624214e-06, + "loss": 0.7239, + "step": 112670 + }, + { + "epoch": 1.808696768808488, + "grad_norm": 1.6353609561920166, + "learning_rate": 1.1206300336819186e-06, + "loss": 0.5237, + "step": 112680 + }, + { + "epoch": 1.8088572850286522, + "grad_norm": 0.6457672715187073, + "learning_rate": 1.118764402789188e-06, + "loss": 0.611, + "step": 112690 + }, + { + "epoch": 1.809017801248816, + "grad_norm": 0.7307332158088684, + "learning_rate": 1.1169002906028736e-06, + "loss": 0.6172, + "step": 112700 + }, + { + "epoch": 1.8091783174689802, + "grad_norm": 1.4603779315948486, + "learning_rate": 1.1150376972415193e-06, + "loss": 0.6593, + "step": 112710 + }, + { + "epoch": 1.8093388336891443, + "grad_norm": 1.5284911394119263, + "learning_rate": 1.1131766228235779e-06, + "loss": 0.7398, + "step": 112720 + }, + { + "epoch": 1.8094993499093084, + "grad_norm": 1.024592399597168, + "learning_rate": 1.1113170674674045e-06, + "loss": 0.6446, + "step": 112730 + }, + { + "epoch": 1.8096598661294725, + "grad_norm": 1.2047290802001953, + "learning_rate": 1.1094590312912517e-06, + "loss": 0.7058, + "step": 112740 + }, + { + "epoch": 1.8098203823496364, + "grad_norm": 1.5047470331192017, + "learning_rate": 1.107602514413278e-06, + "loss": 0.6347, + "step": 112750 + }, + { + "epoch": 1.8099808985698005, + "grad_norm": 0.6860899329185486, + "learning_rate": 1.1057475169515524e-06, + "loss": 0.7368, + "step": 112760 + }, + { + "epoch": 1.8101414147899644, + "grad_norm": 1.840334415435791, + "learning_rate": 1.1038940390240422e-06, + "loss": 0.748, + "step": 112770 + }, + { + "epoch": 1.8103019310101285, + "grad_norm": 1.2871838808059692, + "learning_rate": 1.1020420807486109e-06, + "loss": 0.7152, + "step": 112780 + }, + { + "epoch": 1.8104624472302926, + "grad_norm": 1.3751678466796875, + "learning_rate": 1.1001916422430426e-06, + "loss": 0.7081, + "step": 112790 + }, + { + "epoch": 1.8106229634504567, + "grad_norm": 1.2766056060791016, + "learning_rate": 1.0983427236250038e-06, + "loss": 0.7002, + "step": 112800 + }, + { + "epoch": 1.8107834796706208, + "grad_norm": 0.8936429619789124, + "learning_rate": 1.0964953250120786e-06, + "loss": 0.7206, + "step": 112810 + }, + { + "epoch": 1.810943995890785, + "grad_norm": 1.034103512763977, + "learning_rate": 1.0946494465217533e-06, + "loss": 0.7128, + "step": 112820 + }, + { + "epoch": 1.8111045121109488, + "grad_norm": 0.8739681839942932, + "learning_rate": 1.092805088271412e-06, + "loss": 0.714, + "step": 112830 + }, + { + "epoch": 1.811265028331113, + "grad_norm": 0.8881606459617615, + "learning_rate": 1.090962250378344e-06, + "loss": 0.6653, + "step": 112840 + }, + { + "epoch": 1.8114255445512768, + "grad_norm": 2.1207611560821533, + "learning_rate": 1.0891209329597502e-06, + "loss": 0.7123, + "step": 112850 + }, + { + "epoch": 1.811586060771441, + "grad_norm": 1.0771410465240479, + "learning_rate": 1.08728113613272e-06, + "loss": 0.7178, + "step": 112860 + }, + { + "epoch": 1.811746576991605, + "grad_norm": 0.7113217115402222, + "learning_rate": 1.0854428600142575e-06, + "loss": 0.7091, + "step": 112870 + }, + { + "epoch": 1.8119070932117691, + "grad_norm": 1.0920683145523071, + "learning_rate": 1.0836061047212604e-06, + "loss": 0.762, + "step": 112880 + }, + { + "epoch": 1.8120676094319332, + "grad_norm": 0.869250476360321, + "learning_rate": 1.0817708703705436e-06, + "loss": 0.7595, + "step": 112890 + }, + { + "epoch": 1.8122281256520971, + "grad_norm": 0.8466544151306152, + "learning_rate": 1.0799371570788114e-06, + "loss": 0.6148, + "step": 112900 + }, + { + "epoch": 1.8123886418722612, + "grad_norm": 1.429267406463623, + "learning_rate": 1.078104964962684e-06, + "loss": 0.6079, + "step": 112910 + }, + { + "epoch": 1.8125491580924251, + "grad_norm": 0.9573989510536194, + "learning_rate": 1.076274294138671e-06, + "loss": 0.7856, + "step": 112920 + }, + { + "epoch": 1.8127096743125892, + "grad_norm": 1.1387264728546143, + "learning_rate": 1.0744451447231935e-06, + "loss": 0.7027, + "step": 112930 + }, + { + "epoch": 1.8128701905327533, + "grad_norm": 1.581314206123352, + "learning_rate": 1.0726175168325774e-06, + "loss": 0.7257, + "step": 112940 + }, + { + "epoch": 1.8130307067529174, + "grad_norm": 0.8074445724487305, + "learning_rate": 1.0707914105830468e-06, + "loss": 0.6532, + "step": 112950 + }, + { + "epoch": 1.8131912229730816, + "grad_norm": 0.9649646878242493, + "learning_rate": 1.0689668260907337e-06, + "loss": 0.5956, + "step": 112960 + }, + { + "epoch": 1.8133517391932454, + "grad_norm": 0.9716358184814453, + "learning_rate": 1.0671437634716702e-06, + "loss": 0.8236, + "step": 112970 + }, + { + "epoch": 1.8135122554134095, + "grad_norm": 0.7445030808448792, + "learning_rate": 1.0653222228417914e-06, + "loss": 0.7109, + "step": 112980 + }, + { + "epoch": 1.8136727716335734, + "grad_norm": 1.5438563823699951, + "learning_rate": 1.0635022043169352e-06, + "loss": 0.63, + "step": 112990 + }, + { + "epoch": 1.8138332878537375, + "grad_norm": 1.0874285697937012, + "learning_rate": 1.061683708012845e-06, + "loss": 0.643, + "step": 113000 + }, + { + "epoch": 1.8139938040739017, + "grad_norm": 1.030748963356018, + "learning_rate": 1.0598667340451702e-06, + "loss": 0.637, + "step": 113010 + }, + { + "epoch": 1.8141543202940658, + "grad_norm": 1.0798931121826172, + "learning_rate": 1.0580512825294542e-06, + "loss": 0.7461, + "step": 113020 + }, + { + "epoch": 1.8143148365142299, + "grad_norm": 1.127659559249878, + "learning_rate": 1.056237353581152e-06, + "loss": 0.7116, + "step": 113030 + }, + { + "epoch": 1.814475352734394, + "grad_norm": 0.7622491717338562, + "learning_rate": 1.0544249473156214e-06, + "loss": 0.628, + "step": 113040 + }, + { + "epoch": 1.8146358689545579, + "grad_norm": 1.0182750225067139, + "learning_rate": 1.0526140638481175e-06, + "loss": 0.7068, + "step": 113050 + }, + { + "epoch": 1.8147963851747217, + "grad_norm": 0.963988721370697, + "learning_rate": 1.0508047032937978e-06, + "loss": 0.7126, + "step": 113060 + }, + { + "epoch": 1.8149569013948859, + "grad_norm": 1.671696424484253, + "learning_rate": 1.0489968657677346e-06, + "loss": 0.6831, + "step": 113070 + }, + { + "epoch": 1.81511741761505, + "grad_norm": 0.9879150986671448, + "learning_rate": 1.0471905513848912e-06, + "loss": 0.6799, + "step": 113080 + }, + { + "epoch": 1.815277933835214, + "grad_norm": 0.7614034414291382, + "learning_rate": 1.0453857602601424e-06, + "loss": 0.7952, + "step": 113090 + }, + { + "epoch": 1.8154384500553782, + "grad_norm": 1.1233183145523071, + "learning_rate": 1.0435824925082577e-06, + "loss": 0.7717, + "step": 113100 + }, + { + "epoch": 1.8155989662755423, + "grad_norm": 1.789556860923767, + "learning_rate": 1.0417807482439173e-06, + "loss": 0.7045, + "step": 113110 + }, + { + "epoch": 1.8157594824957062, + "grad_norm": 4.744681358337402, + "learning_rate": 1.039980527581702e-06, + "loss": 0.803, + "step": 113120 + }, + { + "epoch": 1.8159199987158703, + "grad_norm": 0.9249565601348877, + "learning_rate": 1.0381818306360953e-06, + "loss": 0.739, + "step": 113130 + }, + { + "epoch": 1.8160805149360342, + "grad_norm": 1.0201424360275269, + "learning_rate": 1.0363846575214776e-06, + "loss": 0.6813, + "step": 113140 + }, + { + "epoch": 1.8162410311561983, + "grad_norm": 0.8860863447189331, + "learning_rate": 1.0345890083521465e-06, + "loss": 0.6708, + "step": 113150 + }, + { + "epoch": 1.8164015473763624, + "grad_norm": 1.0324773788452148, + "learning_rate": 1.0327948832422913e-06, + "loss": 0.7274, + "step": 113160 + }, + { + "epoch": 1.8165620635965265, + "grad_norm": 0.9514585733413696, + "learning_rate": 1.0310022823060095e-06, + "loss": 0.6978, + "step": 113170 + }, + { + "epoch": 1.8167225798166906, + "grad_norm": 0.9165278077125549, + "learning_rate": 1.0292112056572962e-06, + "loss": 0.6398, + "step": 113180 + }, + { + "epoch": 1.8168830960368545, + "grad_norm": 1.6343320608139038, + "learning_rate": 1.0274216534100573e-06, + "loss": 0.8317, + "step": 113190 + }, + { + "epoch": 1.8170436122570186, + "grad_norm": 1.2292776107788086, + "learning_rate": 1.0256336256780963e-06, + "loss": 0.7339, + "step": 113200 + }, + { + "epoch": 1.8172041284771825, + "grad_norm": 1.0437442064285278, + "learning_rate": 1.0238471225751223e-06, + "loss": 0.7439, + "step": 113210 + }, + { + "epoch": 1.8173646446973466, + "grad_norm": 1.0713140964508057, + "learning_rate": 1.022062144214747e-06, + "loss": 0.6244, + "step": 113220 + }, + { + "epoch": 1.8175251609175107, + "grad_norm": 1.8938385248184204, + "learning_rate": 1.0202786907104828e-06, + "loss": 0.6728, + "step": 113230 + }, + { + "epoch": 1.8176856771376748, + "grad_norm": 1.1366745233535767, + "learning_rate": 1.018496762175744e-06, + "loss": 0.7549, + "step": 113240 + }, + { + "epoch": 1.817846193357839, + "grad_norm": 1.1279644966125488, + "learning_rate": 1.016716358723857e-06, + "loss": 0.684, + "step": 113250 + }, + { + "epoch": 1.8180067095780028, + "grad_norm": 1.2443397045135498, + "learning_rate": 1.0149374804680395e-06, + "loss": 0.6782, + "step": 113260 + }, + { + "epoch": 1.818167225798167, + "grad_norm": 0.9981672167778015, + "learning_rate": 1.0131601275214203e-06, + "loss": 0.744, + "step": 113270 + }, + { + "epoch": 1.8183277420183308, + "grad_norm": 1.2344434261322021, + "learning_rate": 1.0113842999970313e-06, + "loss": 0.6615, + "step": 113280 + }, + { + "epoch": 1.818488258238495, + "grad_norm": 0.7092366814613342, + "learning_rate": 1.0096099980078017e-06, + "loss": 0.7189, + "step": 113290 + }, + { + "epoch": 1.818648774458659, + "grad_norm": 0.8256872892379761, + "learning_rate": 1.007837221666566e-06, + "loss": 0.7405, + "step": 113300 + }, + { + "epoch": 1.8188092906788231, + "grad_norm": 1.0640720129013062, + "learning_rate": 1.006065971086062e-06, + "loss": 0.628, + "step": 113310 + }, + { + "epoch": 1.8189698068989872, + "grad_norm": 1.1254383325576782, + "learning_rate": 1.004296246378933e-06, + "loss": 0.8576, + "step": 113320 + }, + { + "epoch": 1.8191303231191513, + "grad_norm": 0.9666174650192261, + "learning_rate": 1.002528047657722e-06, + "loss": 0.7371, + "step": 113330 + }, + { + "epoch": 1.8192908393393152, + "grad_norm": 1.0952636003494263, + "learning_rate": 1.0007613750348755e-06, + "loss": 0.7124, + "step": 113340 + }, + { + "epoch": 1.8194513555594791, + "grad_norm": 1.1669546365737915, + "learning_rate": 9.989962286227507e-07, + "loss": 0.6445, + "step": 113350 + }, + { + "epoch": 1.8196118717796432, + "grad_norm": 1.479259967803955, + "learning_rate": 9.97232608533588e-07, + "loss": 0.7733, + "step": 113360 + }, + { + "epoch": 1.8197723879998073, + "grad_norm": 1.1370642185211182, + "learning_rate": 9.954705148795484e-07, + "loss": 0.7932, + "step": 113370 + }, + { + "epoch": 1.8199329042199714, + "grad_norm": 0.8746928572654724, + "learning_rate": 9.937099477726947e-07, + "loss": 0.7507, + "step": 113380 + }, + { + "epoch": 1.8200934204401356, + "grad_norm": 1.5606521368026733, + "learning_rate": 9.919509073249844e-07, + "loss": 0.7392, + "step": 113390 + }, + { + "epoch": 1.8202539366602997, + "grad_norm": 1.3866554498672485, + "learning_rate": 9.901933936482838e-07, + "loss": 0.822, + "step": 113400 + }, + { + "epoch": 1.8204144528804636, + "grad_norm": 1.0085277557373047, + "learning_rate": 9.884374068543618e-07, + "loss": 0.7221, + "step": 113410 + }, + { + "epoch": 1.8205749691006277, + "grad_norm": 0.9994884729385376, + "learning_rate": 9.866829470548844e-07, + "loss": 0.7238, + "step": 113420 + }, + { + "epoch": 1.8207354853207915, + "grad_norm": 1.1367075443267822, + "learning_rate": 9.849300143614293e-07, + "loss": 0.6528, + "step": 113430 + }, + { + "epoch": 1.8208960015409557, + "grad_norm": 1.1328078508377075, + "learning_rate": 9.831786088854712e-07, + "loss": 0.6701, + "step": 113440 + }, + { + "epoch": 1.8210565177611198, + "grad_norm": 1.4753062725067139, + "learning_rate": 9.814287307383873e-07, + "loss": 0.5798, + "step": 113450 + }, + { + "epoch": 1.8212170339812839, + "grad_norm": 0.9732462763786316, + "learning_rate": 9.796803800314614e-07, + "loss": 0.7125, + "step": 113460 + }, + { + "epoch": 1.821377550201448, + "grad_norm": 3.148735284805298, + "learning_rate": 9.779335568758818e-07, + "loss": 0.8176, + "step": 113470 + }, + { + "epoch": 1.8215380664216119, + "grad_norm": 0.9879348278045654, + "learning_rate": 9.761882613827322e-07, + "loss": 0.7002, + "step": 113480 + }, + { + "epoch": 1.821698582641776, + "grad_norm": 1.3049191236495972, + "learning_rate": 9.744444936630016e-07, + "loss": 0.7833, + "step": 113490 + }, + { + "epoch": 1.8218590988619399, + "grad_norm": 1.6189364194869995, + "learning_rate": 9.727022538275843e-07, + "loss": 0.6169, + "step": 113500 + }, + { + "epoch": 1.822019615082104, + "grad_norm": 0.6810141205787659, + "learning_rate": 9.709615419872809e-07, + "loss": 0.7543, + "step": 113510 + }, + { + "epoch": 1.822180131302268, + "grad_norm": 0.6978349685668945, + "learning_rate": 9.692223582527832e-07, + "loss": 0.8061, + "step": 113520 + }, + { + "epoch": 1.8223406475224322, + "grad_norm": 0.8176087737083435, + "learning_rate": 9.674847027347001e-07, + "loss": 0.7853, + "step": 113530 + }, + { + "epoch": 1.8225011637425963, + "grad_norm": 0.7100682258605957, + "learning_rate": 9.657485755435375e-07, + "loss": 0.8209, + "step": 113540 + }, + { + "epoch": 1.8226616799627602, + "grad_norm": 0.9795094132423401, + "learning_rate": 9.640139767896934e-07, + "loss": 0.6071, + "step": 113550 + }, + { + "epoch": 1.8228221961829243, + "grad_norm": 1.1869184970855713, + "learning_rate": 9.62280906583482e-07, + "loss": 0.7193, + "step": 113560 + }, + { + "epoch": 1.8229827124030882, + "grad_norm": 0.9714131355285645, + "learning_rate": 9.605493650351211e-07, + "loss": 0.6977, + "step": 113570 + }, + { + "epoch": 1.8231432286232523, + "grad_norm": 1.0401601791381836, + "learning_rate": 9.588193522547222e-07, + "loss": 0.7556, + "step": 113580 + }, + { + "epoch": 1.8233037448434164, + "grad_norm": 0.8743678331375122, + "learning_rate": 9.57090868352306e-07, + "loss": 0.705, + "step": 113590 + }, + { + "epoch": 1.8234642610635805, + "grad_norm": 0.795114278793335, + "learning_rate": 9.553639134377928e-07, + "loss": 0.736, + "step": 113600 + }, + { + "epoch": 1.8236247772837446, + "grad_norm": 1.272497296333313, + "learning_rate": 9.536384876210086e-07, + "loss": 0.7029, + "step": 113610 + }, + { + "epoch": 1.8237852935039087, + "grad_norm": 0.9314149022102356, + "learning_rate": 9.51914591011674e-07, + "loss": 0.6493, + "step": 113620 + }, + { + "epoch": 1.8239458097240726, + "grad_norm": 1.0294694900512695, + "learning_rate": 9.50192223719426e-07, + "loss": 0.7316, + "step": 113630 + }, + { + "epoch": 1.8241063259442367, + "grad_norm": 1.2646210193634033, + "learning_rate": 9.484713858537941e-07, + "loss": 0.6986, + "step": 113640 + }, + { + "epoch": 1.8242668421644006, + "grad_norm": 1.004687786102295, + "learning_rate": 9.467520775242156e-07, + "loss": 0.6338, + "step": 113650 + }, + { + "epoch": 1.8244273583845647, + "grad_norm": 0.7117734551429749, + "learning_rate": 9.450342988400279e-07, + "loss": 0.6819, + "step": 113660 + }, + { + "epoch": 1.8245878746047288, + "grad_norm": 0.9821359515190125, + "learning_rate": 9.433180499104688e-07, + "loss": 0.6833, + "step": 113670 + }, + { + "epoch": 1.824748390824893, + "grad_norm": 1.0833152532577515, + "learning_rate": 9.416033308446814e-07, + "loss": 0.7299, + "step": 113680 + }, + { + "epoch": 1.824908907045057, + "grad_norm": 1.3904814720153809, + "learning_rate": 9.398901417517148e-07, + "loss": 0.7286, + "step": 113690 + }, + { + "epoch": 1.825069423265221, + "grad_norm": 1.1007936000823975, + "learning_rate": 9.381784827405177e-07, + "loss": 0.7137, + "step": 113700 + }, + { + "epoch": 1.825229939485385, + "grad_norm": 1.1235299110412598, + "learning_rate": 9.364683539199392e-07, + "loss": 0.6049, + "step": 113710 + }, + { + "epoch": 1.825390455705549, + "grad_norm": 1.0968706607818604, + "learning_rate": 9.347597553987341e-07, + "loss": 0.7302, + "step": 113720 + }, + { + "epoch": 1.825550971925713, + "grad_norm": 1.9164756536483765, + "learning_rate": 9.330526872855628e-07, + "loss": 0.7109, + "step": 113730 + }, + { + "epoch": 1.8257114881458771, + "grad_norm": 1.0486394166946411, + "learning_rate": 9.313471496889797e-07, + "loss": 0.8275, + "step": 113740 + }, + { + "epoch": 1.8258720043660412, + "grad_norm": 1.0401241779327393, + "learning_rate": 9.296431427174513e-07, + "loss": 0.7288, + "step": 113750 + }, + { + "epoch": 1.8260325205862054, + "grad_norm": 1.1635136604309082, + "learning_rate": 9.279406664793377e-07, + "loss": 0.756, + "step": 113760 + }, + { + "epoch": 1.8261930368063692, + "grad_norm": 0.8311468958854675, + "learning_rate": 9.26239721082911e-07, + "loss": 0.6537, + "step": 113770 + }, + { + "epoch": 1.8263535530265333, + "grad_norm": 1.0241998434066772, + "learning_rate": 9.245403066363401e-07, + "loss": 0.6352, + "step": 113780 + }, + { + "epoch": 1.8265140692466972, + "grad_norm": 1.137104868888855, + "learning_rate": 9.22842423247694e-07, + "loss": 0.7013, + "step": 113790 + }, + { + "epoch": 1.8266745854668613, + "grad_norm": 0.9055271744728088, + "learning_rate": 9.211460710249531e-07, + "loss": 0.7118, + "step": 113800 + }, + { + "epoch": 1.8268351016870255, + "grad_norm": 1.6512304544448853, + "learning_rate": 9.194512500759922e-07, + "loss": 0.5906, + "step": 113810 + }, + { + "epoch": 1.8269956179071896, + "grad_norm": 1.6129982471466064, + "learning_rate": 9.177579605085946e-07, + "loss": 0.628, + "step": 113820 + }, + { + "epoch": 1.8271561341273537, + "grad_norm": 1.0691900253295898, + "learning_rate": 9.160662024304434e-07, + "loss": 0.6954, + "step": 113830 + }, + { + "epoch": 1.8273166503475178, + "grad_norm": 1.6121569871902466, + "learning_rate": 9.143759759491193e-07, + "loss": 0.7107, + "step": 113840 + }, + { + "epoch": 1.8274771665676817, + "grad_norm": 0.9980257153511047, + "learning_rate": 9.126872811721227e-07, + "loss": 0.7433, + "step": 113850 + }, + { + "epoch": 1.8276376827878456, + "grad_norm": 1.4147535562515259, + "learning_rate": 9.110001182068312e-07, + "loss": 0.8388, + "step": 113860 + }, + { + "epoch": 1.8277981990080097, + "grad_norm": 0.783236026763916, + "learning_rate": 9.093144871605452e-07, + "loss": 0.7901, + "step": 113870 + }, + { + "epoch": 1.8279587152281738, + "grad_norm": 1.622573971748352, + "learning_rate": 9.076303881404568e-07, + "loss": 0.6207, + "step": 113880 + }, + { + "epoch": 1.8281192314483379, + "grad_norm": 1.076362133026123, + "learning_rate": 9.059478212536721e-07, + "loss": 0.8604, + "step": 113890 + }, + { + "epoch": 1.828279747668502, + "grad_norm": 1.0587923526763916, + "learning_rate": 9.042667866071858e-07, + "loss": 0.658, + "step": 113900 + }, + { + "epoch": 1.828440263888666, + "grad_norm": 0.8515668511390686, + "learning_rate": 9.025872843079069e-07, + "loss": 0.7018, + "step": 113910 + }, + { + "epoch": 1.82860078010883, + "grad_norm": 0.7347649931907654, + "learning_rate": 9.009093144626391e-07, + "loss": 0.7339, + "step": 113920 + }, + { + "epoch": 1.828761296328994, + "grad_norm": 1.4304512739181519, + "learning_rate": 8.992328771780883e-07, + "loss": 0.6306, + "step": 113930 + }, + { + "epoch": 1.828921812549158, + "grad_norm": 1.3900113105773926, + "learning_rate": 8.975579725608724e-07, + "loss": 0.6947, + "step": 113940 + }, + { + "epoch": 1.829082328769322, + "grad_norm": 1.0478979349136353, + "learning_rate": 8.958846007175003e-07, + "loss": 0.7228, + "step": 113950 + }, + { + "epoch": 1.8292428449894862, + "grad_norm": 1.118062138557434, + "learning_rate": 8.9421276175439e-07, + "loss": 0.7256, + "step": 113960 + }, + { + "epoch": 1.8294033612096503, + "grad_norm": 1.059156060218811, + "learning_rate": 8.925424557778672e-07, + "loss": 0.706, + "step": 113970 + }, + { + "epoch": 1.8295638774298144, + "grad_norm": 1.066405177116394, + "learning_rate": 8.908736828941417e-07, + "loss": 0.6611, + "step": 113980 + }, + { + "epoch": 1.8297243936499783, + "grad_norm": 0.5997633934020996, + "learning_rate": 8.89206443209345e-07, + "loss": 0.5456, + "step": 113990 + }, + { + "epoch": 1.8298849098701424, + "grad_norm": 1.5079048871994019, + "learning_rate": 8.875407368295035e-07, + "loss": 0.6658, + "step": 114000 + }, + { + "epoch": 1.8300454260903063, + "grad_norm": 0.7206734418869019, + "learning_rate": 8.858765638605465e-07, + "loss": 0.7144, + "step": 114010 + }, + { + "epoch": 1.8302059423104704, + "grad_norm": 0.988867998123169, + "learning_rate": 8.842139244083031e-07, + "loss": 0.7163, + "step": 114020 + }, + { + "epoch": 1.8303664585306345, + "grad_norm": 1.6236238479614258, + "learning_rate": 8.825528185785109e-07, + "loss": 0.6495, + "step": 114030 + }, + { + "epoch": 1.8305269747507986, + "grad_norm": 1.1238611936569214, + "learning_rate": 8.808932464768021e-07, + "loss": 0.7119, + "step": 114040 + }, + { + "epoch": 1.8306874909709627, + "grad_norm": 1.071851134300232, + "learning_rate": 8.792352082087174e-07, + "loss": 0.6471, + "step": 114050 + }, + { + "epoch": 1.8308480071911266, + "grad_norm": 3.64725923538208, + "learning_rate": 8.775787038797e-07, + "loss": 0.6589, + "step": 114060 + }, + { + "epoch": 1.8310085234112907, + "grad_norm": 1.2658199071884155, + "learning_rate": 8.759237335950909e-07, + "loss": 0.6814, + "step": 114070 + }, + { + "epoch": 1.8311690396314546, + "grad_norm": 0.9885071516036987, + "learning_rate": 8.742702974601391e-07, + "loss": 0.7879, + "step": 114080 + }, + { + "epoch": 1.8313295558516187, + "grad_norm": 0.571524441242218, + "learning_rate": 8.726183955799938e-07, + "loss": 0.7405, + "step": 114090 + }, + { + "epoch": 1.8314900720717828, + "grad_norm": 0.8156326413154602, + "learning_rate": 8.709680280597099e-07, + "loss": 0.5763, + "step": 114100 + }, + { + "epoch": 1.831650588291947, + "grad_norm": 1.2605507373809814, + "learning_rate": 8.693191950042312e-07, + "loss": 0.8278, + "step": 114110 + }, + { + "epoch": 1.831811104512111, + "grad_norm": 0.9635432958602905, + "learning_rate": 8.676718965184183e-07, + "loss": 0.7847, + "step": 114120 + }, + { + "epoch": 1.8319716207322752, + "grad_norm": 1.0135201215744019, + "learning_rate": 8.660261327070318e-07, + "loss": 0.8194, + "step": 114130 + }, + { + "epoch": 1.832132136952439, + "grad_norm": 0.794713020324707, + "learning_rate": 8.643819036747298e-07, + "loss": 0.7465, + "step": 114140 + }, + { + "epoch": 1.832292653172603, + "grad_norm": 1.0302209854125977, + "learning_rate": 8.627392095260783e-07, + "loss": 0.7342, + "step": 114150 + }, + { + "epoch": 1.832453169392767, + "grad_norm": 0.7390179634094238, + "learning_rate": 8.610980503655442e-07, + "loss": 0.5326, + "step": 114160 + }, + { + "epoch": 1.8326136856129311, + "grad_norm": 0.990038275718689, + "learning_rate": 8.594584262974881e-07, + "loss": 0.619, + "step": 114170 + }, + { + "epoch": 1.8327742018330953, + "grad_norm": 1.3635458946228027, + "learning_rate": 8.578203374261878e-07, + "loss": 0.7609, + "step": 114180 + }, + { + "epoch": 1.8329347180532594, + "grad_norm": 1.1691864728927612, + "learning_rate": 8.561837838558156e-07, + "loss": 0.7021, + "step": 114190 + }, + { + "epoch": 1.8330952342734235, + "grad_norm": 1.7453025579452515, + "learning_rate": 8.54548765690441e-07, + "loss": 0.8022, + "step": 114200 + }, + { + "epoch": 1.8332557504935874, + "grad_norm": 1.2462598085403442, + "learning_rate": 8.529152830340475e-07, + "loss": 0.5901, + "step": 114210 + }, + { + "epoch": 1.8334162667137515, + "grad_norm": 2.2551991939544678, + "learning_rate": 8.512833359905159e-07, + "loss": 0.7098, + "step": 114220 + }, + { + "epoch": 1.8335767829339154, + "grad_norm": 1.2193880081176758, + "learning_rate": 8.496529246636243e-07, + "loss": 0.5517, + "step": 114230 + }, + { + "epoch": 1.8337372991540795, + "grad_norm": 0.9887769222259521, + "learning_rate": 8.480240491570563e-07, + "loss": 0.7734, + "step": 114240 + }, + { + "epoch": 1.8338978153742436, + "grad_norm": 1.141430377960205, + "learning_rate": 8.463967095744013e-07, + "loss": 0.5743, + "step": 114250 + }, + { + "epoch": 1.8340583315944077, + "grad_norm": 0.7970943450927734, + "learning_rate": 8.447709060191488e-07, + "loss": 0.7359, + "step": 114260 + }, + { + "epoch": 1.8342188478145718, + "grad_norm": 0.923493504524231, + "learning_rate": 8.431466385946885e-07, + "loss": 0.62, + "step": 114270 + }, + { + "epoch": 1.8343793640347357, + "grad_norm": 1.1002745628356934, + "learning_rate": 8.415239074043207e-07, + "loss": 0.6883, + "step": 114280 + }, + { + "epoch": 1.8345398802548998, + "grad_norm": 0.9567010402679443, + "learning_rate": 8.399027125512326e-07, + "loss": 0.7173, + "step": 114290 + }, + { + "epoch": 1.8347003964750637, + "grad_norm": 1.1339987516403198, + "learning_rate": 8.382830541385251e-07, + "loss": 0.6471, + "step": 114300 + }, + { + "epoch": 1.8348609126952278, + "grad_norm": 1.3654865026474, + "learning_rate": 8.366649322692016e-07, + "loss": 0.7163, + "step": 114310 + }, + { + "epoch": 1.8350214289153919, + "grad_norm": 1.2535321712493896, + "learning_rate": 8.350483470461661e-07, + "loss": 0.6989, + "step": 114320 + }, + { + "epoch": 1.835181945135556, + "grad_norm": 0.948477029800415, + "learning_rate": 8.334332985722199e-07, + "loss": 0.7654, + "step": 114330 + }, + { + "epoch": 1.83534246135572, + "grad_norm": 1.0055440664291382, + "learning_rate": 8.318197869500721e-07, + "loss": 0.763, + "step": 114340 + }, + { + "epoch": 1.8355029775758842, + "grad_norm": 1.285212516784668, + "learning_rate": 8.302078122823353e-07, + "loss": 0.6328, + "step": 114350 + }, + { + "epoch": 1.835663493796048, + "grad_norm": 1.0931053161621094, + "learning_rate": 8.285973746715164e-07, + "loss": 0.6834, + "step": 114360 + }, + { + "epoch": 1.835824010016212, + "grad_norm": 1.1647121906280518, + "learning_rate": 8.269884742200334e-07, + "loss": 0.7632, + "step": 114370 + }, + { + "epoch": 1.835984526236376, + "grad_norm": 0.8710189461708069, + "learning_rate": 8.253811110302074e-07, + "loss": 0.8427, + "step": 114380 + }, + { + "epoch": 1.8361450424565402, + "grad_norm": 0.7870017886161804, + "learning_rate": 8.23775285204248e-07, + "loss": 0.6506, + "step": 114390 + }, + { + "epoch": 1.8363055586767043, + "grad_norm": 1.3468550443649292, + "learning_rate": 8.221709968442792e-07, + "loss": 0.7136, + "step": 114400 + }, + { + "epoch": 1.8364660748968684, + "grad_norm": 0.8389063477516174, + "learning_rate": 8.205682460523278e-07, + "loss": 0.6711, + "step": 114410 + }, + { + "epoch": 1.8366265911170325, + "grad_norm": 1.155714511871338, + "learning_rate": 8.189670329303151e-07, + "loss": 0.6261, + "step": 114420 + }, + { + "epoch": 1.8367871073371964, + "grad_norm": 1.4462895393371582, + "learning_rate": 8.173673575800706e-07, + "loss": 0.8663, + "step": 114430 + }, + { + "epoch": 1.8369476235573605, + "grad_norm": 1.3449773788452148, + "learning_rate": 8.157692201033268e-07, + "loss": 0.7821, + "step": 114440 + }, + { + "epoch": 1.8371081397775244, + "grad_norm": 0.8234454393386841, + "learning_rate": 8.141726206017109e-07, + "loss": 0.7291, + "step": 114450 + }, + { + "epoch": 1.8372686559976885, + "grad_norm": 1.591370940208435, + "learning_rate": 8.125775591767609e-07, + "loss": 0.7467, + "step": 114460 + }, + { + "epoch": 1.8374291722178526, + "grad_norm": 0.7752012014389038, + "learning_rate": 8.109840359299154e-07, + "loss": 0.7527, + "step": 114470 + }, + { + "epoch": 1.8375896884380167, + "grad_norm": 1.1565630435943604, + "learning_rate": 8.093920509625097e-07, + "loss": 0.7445, + "step": 114480 + }, + { + "epoch": 1.8377502046581808, + "grad_norm": 0.846734881401062, + "learning_rate": 8.078016043757825e-07, + "loss": 0.7576, + "step": 114490 + }, + { + "epoch": 1.8379107208783447, + "grad_norm": 0.9668983221054077, + "learning_rate": 8.062126962708805e-07, + "loss": 0.6523, + "step": 114500 + }, + { + "epoch": 1.8380712370985088, + "grad_norm": 1.1863068342208862, + "learning_rate": 8.046253267488452e-07, + "loss": 0.6794, + "step": 114510 + }, + { + "epoch": 1.8382317533186727, + "grad_norm": 1.5046590566635132, + "learning_rate": 8.030394959106291e-07, + "loss": 0.6711, + "step": 114520 + }, + { + "epoch": 1.8383922695388368, + "grad_norm": 0.8505411148071289, + "learning_rate": 8.014552038570821e-07, + "loss": 0.7218, + "step": 114530 + }, + { + "epoch": 1.838552785759001, + "grad_norm": 1.1349807977676392, + "learning_rate": 7.998724506889488e-07, + "loss": 0.7058, + "step": 114540 + }, + { + "epoch": 1.838713301979165, + "grad_norm": 0.778085470199585, + "learning_rate": 7.9829123650689e-07, + "loss": 0.6344, + "step": 114550 + }, + { + "epoch": 1.8388738181993292, + "grad_norm": 0.8733119964599609, + "learning_rate": 7.967115614114562e-07, + "loss": 0.7973, + "step": 114560 + }, + { + "epoch": 1.839034334419493, + "grad_norm": 1.0417383909225464, + "learning_rate": 7.951334255031112e-07, + "loss": 0.7227, + "step": 114570 + }, + { + "epoch": 1.8391948506396572, + "grad_norm": 1.2537164688110352, + "learning_rate": 7.935568288822082e-07, + "loss": 0.7309, + "step": 114580 + }, + { + "epoch": 1.839355366859821, + "grad_norm": 1.1186962127685547, + "learning_rate": 7.919817716490197e-07, + "loss": 0.6628, + "step": 114590 + }, + { + "epoch": 1.8395158830799851, + "grad_norm": 0.8568621277809143, + "learning_rate": 7.904082539036989e-07, + "loss": 0.565, + "step": 114600 + }, + { + "epoch": 1.8396763993001493, + "grad_norm": 1.4219447374343872, + "learning_rate": 7.888362757463186e-07, + "loss": 0.6914, + "step": 114610 + }, + { + "epoch": 1.8398369155203134, + "grad_norm": 2.221675157546997, + "learning_rate": 7.87265837276846e-07, + "loss": 0.6285, + "step": 114620 + }, + { + "epoch": 1.8399974317404775, + "grad_norm": 1.2408573627471924, + "learning_rate": 7.856969385951512e-07, + "loss": 0.7273, + "step": 114630 + }, + { + "epoch": 1.8401579479606416, + "grad_norm": 1.3107582330703735, + "learning_rate": 7.841295798010073e-07, + "loss": 0.5875, + "step": 114640 + }, + { + "epoch": 1.8403184641808055, + "grad_norm": 1.3013719320297241, + "learning_rate": 7.8256376099409e-07, + "loss": 0.6202, + "step": 114650 + }, + { + "epoch": 1.8404789804009694, + "grad_norm": 0.9736677408218384, + "learning_rate": 7.809994822739808e-07, + "loss": 0.7336, + "step": 114660 + }, + { + "epoch": 1.8406394966211335, + "grad_norm": 0.9379357099533081, + "learning_rate": 7.794367437401473e-07, + "loss": 0.7582, + "step": 114670 + }, + { + "epoch": 1.8408000128412976, + "grad_norm": 0.7915248274803162, + "learning_rate": 7.778755454919768e-07, + "loss": 0.769, + "step": 114680 + }, + { + "epoch": 1.8409605290614617, + "grad_norm": 2.6606478691101074, + "learning_rate": 7.763158876287535e-07, + "loss": 0.6063, + "step": 114690 + }, + { + "epoch": 1.8411210452816258, + "grad_norm": 1.076987385749817, + "learning_rate": 7.747577702496595e-07, + "loss": 0.6666, + "step": 114700 + }, + { + "epoch": 1.84128156150179, + "grad_norm": 1.141114592552185, + "learning_rate": 7.732011934537847e-07, + "loss": 0.708, + "step": 114710 + }, + { + "epoch": 1.8414420777219538, + "grad_norm": 1.5373477935791016, + "learning_rate": 7.716461573401196e-07, + "loss": 0.6589, + "step": 114720 + }, + { + "epoch": 1.841602593942118, + "grad_norm": 1.2503288984298706, + "learning_rate": 7.700926620075516e-07, + "loss": 0.7366, + "step": 114730 + }, + { + "epoch": 1.8417631101622818, + "grad_norm": 1.4965795278549194, + "learning_rate": 7.685407075548739e-07, + "loss": 0.6306, + "step": 114740 + }, + { + "epoch": 1.8419236263824459, + "grad_norm": 1.6258476972579956, + "learning_rate": 7.669902940807827e-07, + "loss": 0.6663, + "step": 114750 + }, + { + "epoch": 1.84208414260261, + "grad_norm": 1.2439799308776855, + "learning_rate": 7.654414216838767e-07, + "loss": 0.6871, + "step": 114760 + }, + { + "epoch": 1.842244658822774, + "grad_norm": 1.1626451015472412, + "learning_rate": 7.638940904626524e-07, + "loss": 0.6895, + "step": 114770 + }, + { + "epoch": 1.8424051750429382, + "grad_norm": 1.0407313108444214, + "learning_rate": 7.623483005155141e-07, + "loss": 0.7935, + "step": 114780 + }, + { + "epoch": 1.842565691263102, + "grad_norm": 1.142090082168579, + "learning_rate": 7.608040519407639e-07, + "loss": 0.8149, + "step": 114790 + }, + { + "epoch": 1.8427262074832662, + "grad_norm": 0.6772428154945374, + "learning_rate": 7.592613448366037e-07, + "loss": 0.7128, + "step": 114800 + }, + { + "epoch": 1.84288672370343, + "grad_norm": 0.8482848405838013, + "learning_rate": 7.57720179301144e-07, + "loss": 0.6136, + "step": 114810 + }, + { + "epoch": 1.8430472399235942, + "grad_norm": 1.4393808841705322, + "learning_rate": 7.561805554323926e-07, + "loss": 0.6881, + "step": 114820 + }, + { + "epoch": 1.8432077561437583, + "grad_norm": 0.937684178352356, + "learning_rate": 7.546424733282625e-07, + "loss": 0.6494, + "step": 114830 + }, + { + "epoch": 1.8433682723639224, + "grad_norm": 0.8210170865058899, + "learning_rate": 7.531059330865675e-07, + "loss": 0.7371, + "step": 114840 + }, + { + "epoch": 1.8435287885840865, + "grad_norm": 0.8585982322692871, + "learning_rate": 7.515709348050182e-07, + "loss": 0.8109, + "step": 114850 + }, + { + "epoch": 1.8436893048042504, + "grad_norm": 1.5197385549545288, + "learning_rate": 7.500374785812336e-07, + "loss": 0.697, + "step": 114860 + }, + { + "epoch": 1.8438498210244145, + "grad_norm": 1.013342261314392, + "learning_rate": 7.485055645127304e-07, + "loss": 0.7756, + "step": 114870 + }, + { + "epoch": 1.8440103372445784, + "grad_norm": 1.2032071352005005, + "learning_rate": 7.469751926969332e-07, + "loss": 0.7362, + "step": 114880 + }, + { + "epoch": 1.8441708534647425, + "grad_norm": 1.1685709953308105, + "learning_rate": 7.454463632311615e-07, + "loss": 0.7083, + "step": 114890 + }, + { + "epoch": 1.8443313696849066, + "grad_norm": 0.9747641086578369, + "learning_rate": 7.43919076212643e-07, + "loss": 0.7891, + "step": 114900 + }, + { + "epoch": 1.8444918859050707, + "grad_norm": 1.0040512084960938, + "learning_rate": 7.423933317385057e-07, + "loss": 0.7581, + "step": 114910 + }, + { + "epoch": 1.8446524021252348, + "grad_norm": 0.8195027709007263, + "learning_rate": 7.408691299057718e-07, + "loss": 0.6892, + "step": 114920 + }, + { + "epoch": 1.844812918345399, + "grad_norm": 1.1326080560684204, + "learning_rate": 7.393464708113723e-07, + "loss": 0.6785, + "step": 114930 + }, + { + "epoch": 1.8449734345655628, + "grad_norm": 1.8186630010604858, + "learning_rate": 7.378253545521435e-07, + "loss": 0.8043, + "step": 114940 + }, + { + "epoch": 1.845133950785727, + "grad_norm": 0.44452810287475586, + "learning_rate": 7.363057812248164e-07, + "loss": 0.7393, + "step": 114950 + }, + { + "epoch": 1.8452944670058908, + "grad_norm": 1.0243401527404785, + "learning_rate": 7.347877509260304e-07, + "loss": 0.6564, + "step": 114960 + }, + { + "epoch": 1.845454983226055, + "grad_norm": 1.0530343055725098, + "learning_rate": 7.332712637523193e-07, + "loss": 0.7814, + "step": 114970 + }, + { + "epoch": 1.845615499446219, + "grad_norm": 1.1491179466247559, + "learning_rate": 7.317563198001254e-07, + "loss": 0.6908, + "step": 114980 + }, + { + "epoch": 1.8457760156663832, + "grad_norm": 1.2398945093154907, + "learning_rate": 7.302429191657883e-07, + "loss": 0.6282, + "step": 114990 + }, + { + "epoch": 1.8459365318865473, + "grad_norm": 0.7414517998695374, + "learning_rate": 7.287310619455561e-07, + "loss": 0.7353, + "step": 115000 + }, + { + "epoch": 1.8460970481067112, + "grad_norm": 0.7672755122184753, + "learning_rate": 7.272207482355686e-07, + "loss": 0.7914, + "step": 115010 + }, + { + "epoch": 1.8462575643268753, + "grad_norm": 1.5379823446273804, + "learning_rate": 7.257119781318739e-07, + "loss": 0.7843, + "step": 115020 + }, + { + "epoch": 1.8464180805470392, + "grad_norm": 1.0491571426391602, + "learning_rate": 7.242047517304229e-07, + "loss": 0.7162, + "step": 115030 + }, + { + "epoch": 1.8465785967672033, + "grad_norm": 1.5456268787384033, + "learning_rate": 7.22699069127064e-07, + "loss": 0.612, + "step": 115040 + }, + { + "epoch": 1.8467391129873674, + "grad_norm": 1.1304750442504883, + "learning_rate": 7.211949304175514e-07, + "loss": 0.598, + "step": 115050 + }, + { + "epoch": 1.8468996292075315, + "grad_norm": 2.3105132579803467, + "learning_rate": 7.196923356975388e-07, + "loss": 0.8714, + "step": 115060 + }, + { + "epoch": 1.8470601454276956, + "grad_norm": 1.1167805194854736, + "learning_rate": 7.181912850625832e-07, + "loss": 0.7071, + "step": 115070 + }, + { + "epoch": 1.8472206616478595, + "grad_norm": 0.7709978818893433, + "learning_rate": 7.166917786081417e-07, + "loss": 0.6911, + "step": 115080 + }, + { + "epoch": 1.8473811778680236, + "grad_norm": 0.6991096138954163, + "learning_rate": 7.151938164295768e-07, + "loss": 0.7818, + "step": 115090 + }, + { + "epoch": 1.8475416940881875, + "grad_norm": 0.6687971353530884, + "learning_rate": 7.136973986221457e-07, + "loss": 0.5892, + "step": 115100 + }, + { + "epoch": 1.8477022103083516, + "grad_norm": 1.6760867834091187, + "learning_rate": 7.122025252810139e-07, + "loss": 0.6516, + "step": 115110 + }, + { + "epoch": 1.8478627265285157, + "grad_norm": 0.8251492381095886, + "learning_rate": 7.107091965012442e-07, + "loss": 0.6798, + "step": 115120 + }, + { + "epoch": 1.8480232427486798, + "grad_norm": 0.9701213240623474, + "learning_rate": 7.092174123778106e-07, + "loss": 0.6844, + "step": 115130 + }, + { + "epoch": 1.848183758968844, + "grad_norm": 1.2857681512832642, + "learning_rate": 7.077271730055735e-07, + "loss": 0.6456, + "step": 115140 + }, + { + "epoch": 1.848344275189008, + "grad_norm": 0.9197409749031067, + "learning_rate": 7.062384784793097e-07, + "loss": 0.6273, + "step": 115150 + }, + { + "epoch": 1.848504791409172, + "grad_norm": 1.4104852676391602, + "learning_rate": 7.04751328893688e-07, + "loss": 0.7403, + "step": 115160 + }, + { + "epoch": 1.8486653076293358, + "grad_norm": 1.0389034748077393, + "learning_rate": 7.032657243432827e-07, + "loss": 0.6862, + "step": 115170 + }, + { + "epoch": 1.8488258238495, + "grad_norm": 1.0217761993408203, + "learning_rate": 7.017816649225711e-07, + "loss": 0.7164, + "step": 115180 + }, + { + "epoch": 1.848986340069664, + "grad_norm": 1.212196946144104, + "learning_rate": 7.002991507259276e-07, + "loss": 0.7019, + "step": 115190 + }, + { + "epoch": 1.8491468562898281, + "grad_norm": 1.241572618484497, + "learning_rate": 6.988181818476325e-07, + "loss": 0.6782, + "step": 115200 + }, + { + "epoch": 1.8493073725099922, + "grad_norm": 0.9779046177864075, + "learning_rate": 6.973387583818713e-07, + "loss": 0.7156, + "step": 115210 + }, + { + "epoch": 1.8494678887301563, + "grad_norm": 0.8616869449615479, + "learning_rate": 6.958608804227218e-07, + "loss": 0.7124, + "step": 115220 + }, + { + "epoch": 1.8496284049503202, + "grad_norm": 1.028061032295227, + "learning_rate": 6.943845480641698e-07, + "loss": 0.8309, + "step": 115230 + }, + { + "epoch": 1.8497889211704843, + "grad_norm": 1.1219754219055176, + "learning_rate": 6.929097614000985e-07, + "loss": 0.6731, + "step": 115240 + }, + { + "epoch": 1.8499494373906482, + "grad_norm": 0.8361799120903015, + "learning_rate": 6.914365205242995e-07, + "loss": 0.7166, + "step": 115250 + }, + { + "epoch": 1.8501099536108123, + "grad_norm": 1.5736706256866455, + "learning_rate": 6.89964825530462e-07, + "loss": 0.7513, + "step": 115260 + }, + { + "epoch": 1.8502704698309764, + "grad_norm": 1.2148243188858032, + "learning_rate": 6.884946765121747e-07, + "loss": 0.7003, + "step": 115270 + }, + { + "epoch": 1.8504309860511405, + "grad_norm": 2.0273609161376953, + "learning_rate": 6.870260735629352e-07, + "loss": 0.7422, + "step": 115280 + }, + { + "epoch": 1.8505915022713046, + "grad_norm": 0.9265609979629517, + "learning_rate": 6.8555901677613e-07, + "loss": 0.7072, + "step": 115290 + }, + { + "epoch": 1.8507520184914685, + "grad_norm": 0.9067003130912781, + "learning_rate": 6.840935062450621e-07, + "loss": 0.8562, + "step": 115300 + }, + { + "epoch": 1.8509125347116326, + "grad_norm": 2.506283760070801, + "learning_rate": 6.826295420629264e-07, + "loss": 0.7548, + "step": 115310 + }, + { + "epoch": 1.8510730509317965, + "grad_norm": 1.20966637134552, + "learning_rate": 6.811671243228263e-07, + "loss": 0.6921, + "step": 115320 + }, + { + "epoch": 1.8512335671519606, + "grad_norm": 1.039176344871521, + "learning_rate": 6.797062531177567e-07, + "loss": 0.6865, + "step": 115330 + }, + { + "epoch": 1.8513940833721247, + "grad_norm": 0.7817941308021545, + "learning_rate": 6.782469285406267e-07, + "loss": 0.6747, + "step": 115340 + }, + { + "epoch": 1.8515545995922889, + "grad_norm": 1.3501816987991333, + "learning_rate": 6.767891506842372e-07, + "loss": 0.8153, + "step": 115350 + }, + { + "epoch": 1.851715115812453, + "grad_norm": 1.4302598237991333, + "learning_rate": 6.753329196412944e-07, + "loss": 0.6815, + "step": 115360 + }, + { + "epoch": 1.8518756320326168, + "grad_norm": 0.5071420073509216, + "learning_rate": 6.738782355044049e-07, + "loss": 0.6975, + "step": 115370 + }, + { + "epoch": 1.852036148252781, + "grad_norm": 0.9677152633666992, + "learning_rate": 6.724250983660807e-07, + "loss": 0.7709, + "step": 115380 + }, + { + "epoch": 1.8521966644729448, + "grad_norm": 1.1256612539291382, + "learning_rate": 6.709735083187312e-07, + "loss": 0.696, + "step": 115390 + }, + { + "epoch": 1.852357180693109, + "grad_norm": 1.8856029510498047, + "learning_rate": 6.695234654546745e-07, + "loss": 0.7145, + "step": 115400 + }, + { + "epoch": 1.852517696913273, + "grad_norm": 0.933427095413208, + "learning_rate": 6.680749698661143e-07, + "loss": 0.7892, + "step": 115410 + }, + { + "epoch": 1.8526782131334372, + "grad_norm": 0.9446956515312195, + "learning_rate": 6.666280216451742e-07, + "loss": 0.6308, + "step": 115420 + }, + { + "epoch": 1.8528387293536013, + "grad_norm": 1.3644676208496094, + "learning_rate": 6.651826208838696e-07, + "loss": 0.71, + "step": 115430 + }, + { + "epoch": 1.8529992455737654, + "grad_norm": 1.7107855081558228, + "learning_rate": 6.637387676741186e-07, + "loss": 0.7496, + "step": 115440 + }, + { + "epoch": 1.8531597617939293, + "grad_norm": 1.3022836446762085, + "learning_rate": 6.622964621077449e-07, + "loss": 0.6448, + "step": 115450 + }, + { + "epoch": 1.8533202780140932, + "grad_norm": 0.7718107104301453, + "learning_rate": 6.608557042764641e-07, + "loss": 0.6578, + "step": 115460 + }, + { + "epoch": 1.8534807942342573, + "grad_norm": 0.8717560768127441, + "learning_rate": 6.594164942719111e-07, + "loss": 0.7716, + "step": 115470 + }, + { + "epoch": 1.8536413104544214, + "grad_norm": 0.9629083275794983, + "learning_rate": 6.57978832185599e-07, + "loss": 0.6614, + "step": 115480 + }, + { + "epoch": 1.8538018266745855, + "grad_norm": 1.492597222328186, + "learning_rate": 6.5654271810896e-07, + "loss": 0.7979, + "step": 115490 + }, + { + "epoch": 1.8539623428947496, + "grad_norm": 1.0466707944869995, + "learning_rate": 6.551081521333241e-07, + "loss": 0.6743, + "step": 115500 + }, + { + "epoch": 1.8541228591149137, + "grad_norm": 1.1976354122161865, + "learning_rate": 6.536751343499181e-07, + "loss": 0.6733, + "step": 115510 + }, + { + "epoch": 1.8542833753350776, + "grad_norm": 0.9588160514831543, + "learning_rate": 6.522436648498776e-07, + "loss": 0.6355, + "step": 115520 + }, + { + "epoch": 1.8544438915552417, + "grad_norm": 1.046318531036377, + "learning_rate": 6.50813743724235e-07, + "loss": 0.7548, + "step": 115530 + }, + { + "epoch": 1.8546044077754056, + "grad_norm": 0.8377198576927185, + "learning_rate": 6.49385371063918e-07, + "loss": 0.7467, + "step": 115540 + }, + { + "epoch": 1.8547649239955697, + "grad_norm": 1.1493159532546997, + "learning_rate": 6.479585469597704e-07, + "loss": 0.7119, + "step": 115550 + }, + { + "epoch": 1.8549254402157338, + "grad_norm": 1.480057954788208, + "learning_rate": 6.465332715025252e-07, + "loss": 0.8821, + "step": 115560 + }, + { + "epoch": 1.855085956435898, + "grad_norm": 1.3836983442306519, + "learning_rate": 6.451095447828264e-07, + "loss": 0.7267, + "step": 115570 + }, + { + "epoch": 1.855246472656062, + "grad_norm": 2.485710620880127, + "learning_rate": 6.436873668912103e-07, + "loss": 0.6782, + "step": 115580 + }, + { + "epoch": 1.855406988876226, + "grad_norm": 1.0994266271591187, + "learning_rate": 6.422667379181207e-07, + "loss": 0.7622, + "step": 115590 + }, + { + "epoch": 1.85556750509639, + "grad_norm": 1.4297306537628174, + "learning_rate": 6.408476579538996e-07, + "loss": 0.687, + "step": 115600 + }, + { + "epoch": 1.855728021316554, + "grad_norm": 0.8097190856933594, + "learning_rate": 6.394301270887942e-07, + "loss": 0.7316, + "step": 115610 + }, + { + "epoch": 1.855888537536718, + "grad_norm": 1.7190415859222412, + "learning_rate": 6.380141454129518e-07, + "loss": 0.6897, + "step": 115620 + }, + { + "epoch": 1.8560490537568821, + "grad_norm": 1.7295829057693481, + "learning_rate": 6.365997130164225e-07, + "loss": 0.8037, + "step": 115630 + }, + { + "epoch": 1.8562095699770462, + "grad_norm": 1.0655443668365479, + "learning_rate": 6.351868299891511e-07, + "loss": 0.6304, + "step": 115640 + }, + { + "epoch": 1.8563700861972103, + "grad_norm": 1.1437393426895142, + "learning_rate": 6.33775496420988e-07, + "loss": 0.6923, + "step": 115650 + }, + { + "epoch": 1.8565306024173742, + "grad_norm": 1.0999191999435425, + "learning_rate": 6.323657124016891e-07, + "loss": 0.7248, + "step": 115660 + }, + { + "epoch": 1.8566911186375383, + "grad_norm": 1.3844462633132935, + "learning_rate": 6.309574780209076e-07, + "loss": 0.6268, + "step": 115670 + }, + { + "epoch": 1.8568516348577022, + "grad_norm": 1.2793365716934204, + "learning_rate": 6.295507933681972e-07, + "loss": 0.6918, + "step": 115680 + }, + { + "epoch": 1.8570121510778663, + "grad_norm": 0.733573853969574, + "learning_rate": 6.281456585330192e-07, + "loss": 0.586, + "step": 115690 + }, + { + "epoch": 1.8571726672980304, + "grad_norm": 0.7991589903831482, + "learning_rate": 6.267420736047274e-07, + "loss": 0.7045, + "step": 115700 + }, + { + "epoch": 1.8573331835181945, + "grad_norm": 1.0363149642944336, + "learning_rate": 6.25340038672581e-07, + "loss": 0.7474, + "step": 115710 + }, + { + "epoch": 1.8574936997383587, + "grad_norm": 1.1495110988616943, + "learning_rate": 6.239395538257503e-07, + "loss": 0.7047, + "step": 115720 + }, + { + "epoch": 1.8576542159585228, + "grad_norm": 0.9671218395233154, + "learning_rate": 6.225406191532862e-07, + "loss": 0.7216, + "step": 115730 + }, + { + "epoch": 1.8578147321786866, + "grad_norm": 1.1367061138153076, + "learning_rate": 6.211432347441564e-07, + "loss": 0.6603, + "step": 115740 + }, + { + "epoch": 1.8579752483988508, + "grad_norm": 1.0780845880508423, + "learning_rate": 6.197474006872289e-07, + "loss": 0.7226, + "step": 115750 + }, + { + "epoch": 1.8581357646190146, + "grad_norm": 0.9630643129348755, + "learning_rate": 6.183531170712687e-07, + "loss": 0.541, + "step": 115760 + }, + { + "epoch": 1.8582962808391787, + "grad_norm": 1.2321306467056274, + "learning_rate": 6.169603839849464e-07, + "loss": 0.641, + "step": 115770 + }, + { + "epoch": 1.8584567970593429, + "grad_norm": 1.1711103916168213, + "learning_rate": 6.155692015168301e-07, + "loss": 0.6014, + "step": 115780 + }, + { + "epoch": 1.858617313279507, + "grad_norm": 1.1229822635650635, + "learning_rate": 6.141795697553882e-07, + "loss": 0.6436, + "step": 115790 + }, + { + "epoch": 1.858777829499671, + "grad_norm": 1.4668550491333008, + "learning_rate": 6.127914887889968e-07, + "loss": 0.6502, + "step": 115800 + }, + { + "epoch": 1.858938345719835, + "grad_norm": 0.9164636731147766, + "learning_rate": 6.114049587059273e-07, + "loss": 0.7344, + "step": 115810 + }, + { + "epoch": 1.859098861939999, + "grad_norm": 0.7858783006668091, + "learning_rate": 6.100199795943562e-07, + "loss": 0.716, + "step": 115820 + }, + { + "epoch": 1.859259378160163, + "grad_norm": 0.7868449687957764, + "learning_rate": 6.086365515423575e-07, + "loss": 0.6997, + "step": 115830 + }, + { + "epoch": 1.859419894380327, + "grad_norm": 1.1673856973648071, + "learning_rate": 6.072546746379165e-07, + "loss": 0.6832, + "step": 115840 + }, + { + "epoch": 1.8595804106004912, + "grad_norm": 1.0811125040054321, + "learning_rate": 6.058743489689045e-07, + "loss": 0.5555, + "step": 115850 + }, + { + "epoch": 1.8597409268206553, + "grad_norm": 0.8115656971931458, + "learning_rate": 6.04495574623104e-07, + "loss": 0.782, + "step": 115860 + }, + { + "epoch": 1.8599014430408194, + "grad_norm": 1.6469171047210693, + "learning_rate": 6.031183516881977e-07, + "loss": 0.8307, + "step": 115870 + }, + { + "epoch": 1.8600619592609833, + "grad_norm": 1.2961084842681885, + "learning_rate": 6.017426802517711e-07, + "loss": 0.7721, + "step": 115880 + }, + { + "epoch": 1.8602224754811474, + "grad_norm": 0.9748512506484985, + "learning_rate": 6.003685604013043e-07, + "loss": 0.7829, + "step": 115890 + }, + { + "epoch": 1.8603829917013113, + "grad_norm": 0.8557847142219543, + "learning_rate": 5.989959922241912e-07, + "loss": 0.7224, + "step": 115900 + }, + { + "epoch": 1.8605435079214754, + "grad_norm": 0.9268510937690735, + "learning_rate": 5.976249758077091e-07, + "loss": 0.6358, + "step": 115910 + }, + { + "epoch": 1.8607040241416395, + "grad_norm": 0.8278096318244934, + "learning_rate": 5.962555112390522e-07, + "loss": 0.5987, + "step": 115920 + }, + { + "epoch": 1.8608645403618036, + "grad_norm": 1.673664927482605, + "learning_rate": 5.948875986053065e-07, + "loss": 0.7265, + "step": 115930 + }, + { + "epoch": 1.8610250565819677, + "grad_norm": 1.7809250354766846, + "learning_rate": 5.93521237993469e-07, + "loss": 0.7371, + "step": 115940 + }, + { + "epoch": 1.8611855728021318, + "grad_norm": 1.3060132265090942, + "learning_rate": 5.921564294904286e-07, + "loss": 0.7639, + "step": 115950 + }, + { + "epoch": 1.8613460890222957, + "grad_norm": 1.0963890552520752, + "learning_rate": 5.907931731829824e-07, + "loss": 0.655, + "step": 115960 + }, + { + "epoch": 1.8615066052424596, + "grad_norm": 1.2463130950927734, + "learning_rate": 5.894314691578195e-07, + "loss": 0.6856, + "step": 115970 + }, + { + "epoch": 1.8616671214626237, + "grad_norm": 1.1313191652297974, + "learning_rate": 5.880713175015401e-07, + "loss": 0.6664, + "step": 115980 + }, + { + "epoch": 1.8618276376827878, + "grad_norm": 1.3130706548690796, + "learning_rate": 5.867127183006415e-07, + "loss": 0.6272, + "step": 115990 + }, + { + "epoch": 1.861988153902952, + "grad_norm": 0.9881348013877869, + "learning_rate": 5.853556716415243e-07, + "loss": 0.7281, + "step": 116000 + }, + { + "epoch": 1.861988153902952, + "eval_loss": 0.7689798474311829, + "eval_runtime": 1834.2486, + "eval_samples_per_second": 14.301, + "eval_steps_per_second": 1.788, + "step": 116000 + }, + { + "epoch": 1.862148670123116, + "grad_norm": 1.7816461324691772, + "learning_rate": 5.840001776104859e-07, + "loss": 0.6366, + "step": 116010 + }, + { + "epoch": 1.8623091863432801, + "grad_norm": 2.2947113513946533, + "learning_rate": 5.826462362937268e-07, + "loss": 0.6936, + "step": 116020 + }, + { + "epoch": 1.862469702563444, + "grad_norm": 1.2215399742126465, + "learning_rate": 5.812938477773561e-07, + "loss": 0.6672, + "step": 116030 + }, + { + "epoch": 1.8626302187836081, + "grad_norm": 0.9447054862976074, + "learning_rate": 5.799430121473715e-07, + "loss": 0.5879, + "step": 116040 + }, + { + "epoch": 1.862790735003772, + "grad_norm": 0.9042874574661255, + "learning_rate": 5.785937294896792e-07, + "loss": 0.7042, + "step": 116050 + }, + { + "epoch": 1.8629512512239361, + "grad_norm": 0.8352022767066956, + "learning_rate": 5.772459998900859e-07, + "loss": 0.7158, + "step": 116060 + }, + { + "epoch": 1.8631117674441002, + "grad_norm": 1.1093230247497559, + "learning_rate": 5.758998234343005e-07, + "loss": 0.8205, + "step": 116070 + }, + { + "epoch": 1.8632722836642643, + "grad_norm": 1.445427656173706, + "learning_rate": 5.745552002079324e-07, + "loss": 0.6205, + "step": 116080 + }, + { + "epoch": 1.8634327998844284, + "grad_norm": 1.8770861625671387, + "learning_rate": 5.73212130296491e-07, + "loss": 0.6863, + "step": 116090 + }, + { + "epoch": 1.8635933161045923, + "grad_norm": 0.8317080140113831, + "learning_rate": 5.718706137853858e-07, + "loss": 0.6817, + "step": 116100 + }, + { + "epoch": 1.8637538323247564, + "grad_norm": 1.1418166160583496, + "learning_rate": 5.705306507599317e-07, + "loss": 0.7284, + "step": 116110 + }, + { + "epoch": 1.8639143485449203, + "grad_norm": 0.9026671648025513, + "learning_rate": 5.691922413053441e-07, + "loss": 0.643, + "step": 116120 + }, + { + "epoch": 1.8640748647650844, + "grad_norm": 1.311983585357666, + "learning_rate": 5.678553855067325e-07, + "loss": 0.7036, + "step": 116130 + }, + { + "epoch": 1.8642353809852485, + "grad_norm": 1.0236729383468628, + "learning_rate": 5.66520083449118e-07, + "loss": 0.6387, + "step": 116140 + }, + { + "epoch": 1.8643958972054127, + "grad_norm": 0.9756208658218384, + "learning_rate": 5.651863352174158e-07, + "loss": 0.711, + "step": 116150 + }, + { + "epoch": 1.8645564134255768, + "grad_norm": 1.0687092542648315, + "learning_rate": 5.638541408964443e-07, + "loss": 0.7644, + "step": 116160 + }, + { + "epoch": 1.8647169296457407, + "grad_norm": 1.031572937965393, + "learning_rate": 5.625235005709245e-07, + "loss": 0.6678, + "step": 116170 + }, + { + "epoch": 1.8648774458659048, + "grad_norm": 1.7588942050933838, + "learning_rate": 5.611944143254749e-07, + "loss": 0.713, + "step": 116180 + }, + { + "epoch": 1.8650379620860686, + "grad_norm": 0.73753821849823, + "learning_rate": 5.598668822446223e-07, + "loss": 0.7101, + "step": 116190 + }, + { + "epoch": 1.8651984783062328, + "grad_norm": 0.9102754592895508, + "learning_rate": 5.585409044127854e-07, + "loss": 0.5962, + "step": 116200 + }, + { + "epoch": 1.8653589945263969, + "grad_norm": 1.4713635444641113, + "learning_rate": 5.572164809142882e-07, + "loss": 0.7864, + "step": 116210 + }, + { + "epoch": 1.865519510746561, + "grad_norm": 1.1088709831237793, + "learning_rate": 5.558936118333607e-07, + "loss": 0.6246, + "step": 116220 + }, + { + "epoch": 1.865680026966725, + "grad_norm": 0.9179599285125732, + "learning_rate": 5.545722972541245e-07, + "loss": 0.8229, + "step": 116230 + }, + { + "epoch": 1.8658405431868892, + "grad_norm": 0.7828149795532227, + "learning_rate": 5.532525372606123e-07, + "loss": 0.645, + "step": 116240 + }, + { + "epoch": 1.866001059407053, + "grad_norm": 1.0478957891464233, + "learning_rate": 5.519343319367543e-07, + "loss": 0.5268, + "step": 116250 + }, + { + "epoch": 1.8661615756272172, + "grad_norm": 1.1178470849990845, + "learning_rate": 5.506176813663722e-07, + "loss": 0.6814, + "step": 116260 + }, + { + "epoch": 1.866322091847381, + "grad_norm": 0.9945940971374512, + "learning_rate": 5.493025856332046e-07, + "loss": 0.741, + "step": 116270 + }, + { + "epoch": 1.8664826080675452, + "grad_norm": 0.9591667652130127, + "learning_rate": 5.479890448208791e-07, + "loss": 0.6424, + "step": 116280 + }, + { + "epoch": 1.8666431242877093, + "grad_norm": 2.232599973678589, + "learning_rate": 5.466770590129344e-07, + "loss": 0.6472, + "step": 116290 + }, + { + "epoch": 1.8668036405078734, + "grad_norm": 1.5924274921417236, + "learning_rate": 5.45366628292801e-07, + "loss": 0.613, + "step": 116300 + }, + { + "epoch": 1.8669641567280375, + "grad_norm": 1.204736590385437, + "learning_rate": 5.440577527438151e-07, + "loss": 0.6046, + "step": 116310 + }, + { + "epoch": 1.8671246729482014, + "grad_norm": 1.0895761251449585, + "learning_rate": 5.427504324492156e-07, + "loss": 0.6467, + "step": 116320 + }, + { + "epoch": 1.8672851891683655, + "grad_norm": 1.423609733581543, + "learning_rate": 5.414446674921386e-07, + "loss": 0.6687, + "step": 116330 + }, + { + "epoch": 1.8674457053885294, + "grad_norm": 2.1226422786712646, + "learning_rate": 5.401404579556263e-07, + "loss": 0.6971, + "step": 116340 + }, + { + "epoch": 1.8676062216086935, + "grad_norm": 0.8223735690116882, + "learning_rate": 5.388378039226122e-07, + "loss": 0.7294, + "step": 116350 + }, + { + "epoch": 1.8677667378288576, + "grad_norm": 0.861606776714325, + "learning_rate": 5.375367054759411e-07, + "loss": 0.6964, + "step": 116360 + }, + { + "epoch": 1.8679272540490217, + "grad_norm": 0.8653781414031982, + "learning_rate": 5.362371626983581e-07, + "loss": 0.7943, + "step": 116370 + }, + { + "epoch": 1.8680877702691858, + "grad_norm": 1.8439452648162842, + "learning_rate": 5.349391756725025e-07, + "loss": 0.7548, + "step": 116380 + }, + { + "epoch": 1.8682482864893497, + "grad_norm": 1.2551555633544922, + "learning_rate": 5.336427444809195e-07, + "loss": 0.7148, + "step": 116390 + }, + { + "epoch": 1.8684088027095138, + "grad_norm": 1.5203309059143066, + "learning_rate": 5.323478692060568e-07, + "loss": 0.7464, + "step": 116400 + }, + { + "epoch": 1.8685693189296777, + "grad_norm": 1.2828047275543213, + "learning_rate": 5.310545499302599e-07, + "loss": 0.6939, + "step": 116410 + }, + { + "epoch": 1.8687298351498418, + "grad_norm": 0.8209085464477539, + "learning_rate": 5.297627867357712e-07, + "loss": 0.6872, + "step": 116420 + }, + { + "epoch": 1.868890351370006, + "grad_norm": 1.1904640197753906, + "learning_rate": 5.284725797047474e-07, + "loss": 0.7477, + "step": 116430 + }, + { + "epoch": 1.86905086759017, + "grad_norm": 1.1846532821655273, + "learning_rate": 5.27183928919231e-07, + "loss": 0.7628, + "step": 116440 + }, + { + "epoch": 1.8692113838103341, + "grad_norm": 0.8981404304504395, + "learning_rate": 5.258968344611787e-07, + "loss": 0.7329, + "step": 116450 + }, + { + "epoch": 1.8693719000304982, + "grad_norm": 1.8378829956054688, + "learning_rate": 5.246112964124389e-07, + "loss": 0.6782, + "step": 116460 + }, + { + "epoch": 1.8695324162506621, + "grad_norm": 1.1165969371795654, + "learning_rate": 5.233273148547657e-07, + "loss": 0.6307, + "step": 116470 + }, + { + "epoch": 1.869692932470826, + "grad_norm": 1.3223427534103394, + "learning_rate": 5.220448898698104e-07, + "loss": 0.735, + "step": 116480 + }, + { + "epoch": 1.8698534486909901, + "grad_norm": 1.292548418045044, + "learning_rate": 5.207640215391274e-07, + "loss": 0.6301, + "step": 116490 + }, + { + "epoch": 1.8700139649111542, + "grad_norm": 1.1151450872421265, + "learning_rate": 5.194847099441763e-07, + "loss": 0.7687, + "step": 116500 + }, + { + "epoch": 1.8701744811313183, + "grad_norm": 0.8443423509597778, + "learning_rate": 5.182069551663088e-07, + "loss": 0.7828, + "step": 116510 + }, + { + "epoch": 1.8703349973514825, + "grad_norm": 1.2552670240402222, + "learning_rate": 5.169307572867904e-07, + "loss": 0.7387, + "step": 116520 + }, + { + "epoch": 1.8704955135716466, + "grad_norm": 0.7335842847824097, + "learning_rate": 5.156561163867729e-07, + "loss": 0.7465, + "step": 116530 + }, + { + "epoch": 1.8706560297918104, + "grad_norm": 1.3643742799758911, + "learning_rate": 5.143830325473164e-07, + "loss": 0.7362, + "step": 116540 + }, + { + "epoch": 1.8708165460119746, + "grad_norm": 1.1239014863967896, + "learning_rate": 5.131115058493813e-07, + "loss": 0.6006, + "step": 116550 + }, + { + "epoch": 1.8709770622321384, + "grad_norm": 1.168269157409668, + "learning_rate": 5.118415363738332e-07, + "loss": 0.7129, + "step": 116560 + }, + { + "epoch": 1.8711375784523026, + "grad_norm": 1.4185349941253662, + "learning_rate": 5.105731242014327e-07, + "loss": 0.7299, + "step": 116570 + }, + { + "epoch": 1.8712980946724667, + "grad_norm": 1.2711548805236816, + "learning_rate": 5.093062694128431e-07, + "loss": 0.7314, + "step": 116580 + }, + { + "epoch": 1.8714586108926308, + "grad_norm": 1.1684192419052124, + "learning_rate": 5.080409720886304e-07, + "loss": 0.7324, + "step": 116590 + }, + { + "epoch": 1.8716191271127949, + "grad_norm": 2.198517322540283, + "learning_rate": 5.067772323092579e-07, + "loss": 0.6009, + "step": 116600 + }, + { + "epoch": 1.8717796433329588, + "grad_norm": 2.4911961555480957, + "learning_rate": 5.055150501550921e-07, + "loss": 0.6597, + "step": 116610 + }, + { + "epoch": 1.8719401595531229, + "grad_norm": 1.1203789710998535, + "learning_rate": 5.042544257063992e-07, + "loss": 0.5876, + "step": 116620 + }, + { + "epoch": 1.8721006757732868, + "grad_norm": 0.8995938897132874, + "learning_rate": 5.029953590433511e-07, + "loss": 0.6089, + "step": 116630 + }, + { + "epoch": 1.8722611919934509, + "grad_norm": 0.9798839092254639, + "learning_rate": 5.017378502460147e-07, + "loss": 0.7451, + "step": 116640 + }, + { + "epoch": 1.872421708213615, + "grad_norm": 1.1681619882583618, + "learning_rate": 5.004818993943645e-07, + "loss": 0.793, + "step": 116650 + }, + { + "epoch": 1.872582224433779, + "grad_norm": 1.7812095880508423, + "learning_rate": 4.992275065682644e-07, + "loss": 0.7568, + "step": 116660 + }, + { + "epoch": 1.8727427406539432, + "grad_norm": 0.8954894542694092, + "learning_rate": 4.979746718474898e-07, + "loss": 0.7298, + "step": 116670 + }, + { + "epoch": 1.872903256874107, + "grad_norm": 0.8286190629005432, + "learning_rate": 4.967233953117156e-07, + "loss": 0.5564, + "step": 116680 + }, + { + "epoch": 1.8730637730942712, + "grad_norm": 0.8274640440940857, + "learning_rate": 4.954736770405117e-07, + "loss": 0.7264, + "step": 116690 + }, + { + "epoch": 1.873224289314435, + "grad_norm": 1.2997409105300903, + "learning_rate": 4.942255171133559e-07, + "loss": 0.7915, + "step": 116700 + }, + { + "epoch": 1.8733848055345992, + "grad_norm": 0.859249472618103, + "learning_rate": 4.929789156096237e-07, + "loss": 0.8286, + "step": 116710 + }, + { + "epoch": 1.8735453217547633, + "grad_norm": 1.3747758865356445, + "learning_rate": 4.917338726085907e-07, + "loss": 0.7235, + "step": 116720 + }, + { + "epoch": 1.8737058379749274, + "grad_norm": 1.1680941581726074, + "learning_rate": 4.904903881894352e-07, + "loss": 0.7495, + "step": 116730 + }, + { + "epoch": 1.8738663541950915, + "grad_norm": 1.1384283304214478, + "learning_rate": 4.892484624312327e-07, + "loss": 0.7379, + "step": 116740 + }, + { + "epoch": 1.8740268704152556, + "grad_norm": 1.0407863855361938, + "learning_rate": 4.880080954129645e-07, + "loss": 0.7214, + "step": 116750 + }, + { + "epoch": 1.8741873866354195, + "grad_norm": 0.5577530860900879, + "learning_rate": 4.867692872135121e-07, + "loss": 0.6034, + "step": 116760 + }, + { + "epoch": 1.8743479028555834, + "grad_norm": 0.8820061683654785, + "learning_rate": 4.855320379116541e-07, + "loss": 0.7183, + "step": 116770 + }, + { + "epoch": 1.8745084190757475, + "grad_norm": 1.1229249238967896, + "learning_rate": 4.842963475860745e-07, + "loss": 0.6753, + "step": 116780 + }, + { + "epoch": 1.8746689352959116, + "grad_norm": 1.3583298921585083, + "learning_rate": 4.830622163153525e-07, + "loss": 0.6867, + "step": 116790 + }, + { + "epoch": 1.8748294515160757, + "grad_norm": 0.9788544178009033, + "learning_rate": 4.818296441779751e-07, + "loss": 0.628, + "step": 116800 + }, + { + "epoch": 1.8749899677362398, + "grad_norm": 1.0608758926391602, + "learning_rate": 4.805986312523241e-07, + "loss": 0.7542, + "step": 116810 + }, + { + "epoch": 1.875150483956404, + "grad_norm": 2.0685994625091553, + "learning_rate": 4.79369177616687e-07, + "loss": 0.7435, + "step": 116820 + }, + { + "epoch": 1.8753110001765678, + "grad_norm": 0.9660042524337769, + "learning_rate": 4.781412833492483e-07, + "loss": 0.6768, + "step": 116830 + }, + { + "epoch": 1.875471516396732, + "grad_norm": 1.218804955482483, + "learning_rate": 4.769149485280955e-07, + "loss": 0.7052, + "step": 116840 + }, + { + "epoch": 1.8756320326168958, + "grad_norm": 1.8525190353393555, + "learning_rate": 4.756901732312163e-07, + "loss": 0.7049, + "step": 116850 + }, + { + "epoch": 1.87579254883706, + "grad_norm": 1.0743156671524048, + "learning_rate": 4.744669575364985e-07, + "loss": 0.6511, + "step": 116860 + }, + { + "epoch": 1.875953065057224, + "grad_norm": 1.938153862953186, + "learning_rate": 4.7324530152172977e-07, + "loss": 0.6892, + "step": 116870 + }, + { + "epoch": 1.8761135812773881, + "grad_norm": 1.0056594610214233, + "learning_rate": 4.7202520526460626e-07, + "loss": 0.649, + "step": 116880 + }, + { + "epoch": 1.8762740974975523, + "grad_norm": 0.7724015116691589, + "learning_rate": 4.7080666884271594e-07, + "loss": 0.6152, + "step": 116890 + }, + { + "epoch": 1.8764346137177161, + "grad_norm": 1.241307020187378, + "learning_rate": 4.6958969233354686e-07, + "loss": 0.6357, + "step": 116900 + }, + { + "epoch": 1.8765951299378802, + "grad_norm": 1.0797040462493896, + "learning_rate": 4.683742758144927e-07, + "loss": 0.7352, + "step": 116910 + }, + { + "epoch": 1.8767556461580441, + "grad_norm": 0.5862993001937866, + "learning_rate": 4.671604193628526e-07, + "loss": 0.6756, + "step": 116920 + }, + { + "epoch": 1.8769161623782082, + "grad_norm": 0.9061014652252197, + "learning_rate": 4.659481230558149e-07, + "loss": 0.7322, + "step": 116930 + }, + { + "epoch": 1.8770766785983724, + "grad_norm": 0.693355917930603, + "learning_rate": 4.647373869704735e-07, + "loss": 0.748, + "step": 116940 + }, + { + "epoch": 1.8772371948185365, + "grad_norm": 2.0390822887420654, + "learning_rate": 4.635282111838307e-07, + "loss": 0.7152, + "step": 116950 + }, + { + "epoch": 1.8773977110387006, + "grad_norm": 1.981826663017273, + "learning_rate": 4.623205957727805e-07, + "loss": 0.679, + "step": 116960 + }, + { + "epoch": 1.8775582272588645, + "grad_norm": 0.7675228714942932, + "learning_rate": 4.6111454081411707e-07, + "loss": 0.5498, + "step": 116970 + }, + { + "epoch": 1.8777187434790286, + "grad_norm": 1.7426080703735352, + "learning_rate": 4.599100463845374e-07, + "loss": 0.6963, + "step": 116980 + }, + { + "epoch": 1.8778792596991924, + "grad_norm": 0.9880105257034302, + "learning_rate": 4.587071125606468e-07, + "loss": 0.7568, + "step": 116990 + }, + { + "epoch": 1.8780397759193566, + "grad_norm": 1.2809407711029053, + "learning_rate": 4.575057394189397e-07, + "loss": 0.6596, + "step": 117000 + }, + { + "epoch": 1.8782002921395207, + "grad_norm": 0.8559603095054626, + "learning_rate": 4.563059270358161e-07, + "loss": 0.7877, + "step": 117010 + }, + { + "epoch": 1.8783608083596848, + "grad_norm": 1.1742591857910156, + "learning_rate": 4.551076754875816e-07, + "loss": 0.7365, + "step": 117020 + }, + { + "epoch": 1.8785213245798489, + "grad_norm": 0.7988681793212891, + "learning_rate": 4.539109848504336e-07, + "loss": 0.6854, + "step": 117030 + }, + { + "epoch": 1.878681840800013, + "grad_norm": 1.5633546113967896, + "learning_rate": 4.5271585520047787e-07, + "loss": 0.6028, + "step": 117040 + }, + { + "epoch": 1.8788423570201769, + "grad_norm": 1.1113085746765137, + "learning_rate": 4.51522286613712e-07, + "loss": 0.6793, + "step": 117050 + }, + { + "epoch": 1.879002873240341, + "grad_norm": 1.0073106288909912, + "learning_rate": 4.503302791660474e-07, + "loss": 0.8105, + "step": 117060 + }, + { + "epoch": 1.8791633894605049, + "grad_norm": 1.469475507736206, + "learning_rate": 4.491398329332819e-07, + "loss": 0.6489, + "step": 117070 + }, + { + "epoch": 1.879323905680669, + "grad_norm": 1.0212652683258057, + "learning_rate": 4.4795094799112705e-07, + "loss": 0.8158, + "step": 117080 + }, + { + "epoch": 1.879484421900833, + "grad_norm": 1.011300802230835, + "learning_rate": 4.4676362441518625e-07, + "loss": 0.6362, + "step": 117090 + }, + { + "epoch": 1.8796449381209972, + "grad_norm": 2.14984393119812, + "learning_rate": 4.455778622809631e-07, + "loss": 0.7081, + "step": 117100 + }, + { + "epoch": 1.8798054543411613, + "grad_norm": 1.4186232089996338, + "learning_rate": 4.443936616638694e-07, + "loss": 0.6361, + "step": 117110 + }, + { + "epoch": 1.8799659705613252, + "grad_norm": 0.6801685690879822, + "learning_rate": 4.432110226392117e-07, + "loss": 0.7431, + "step": 117120 + }, + { + "epoch": 1.8801264867814893, + "grad_norm": 2.2254929542541504, + "learning_rate": 4.420299452822019e-07, + "loss": 0.7061, + "step": 117130 + }, + { + "epoch": 1.8802870030016532, + "grad_norm": 1.5846925973892212, + "learning_rate": 4.4085042966794387e-07, + "loss": 0.6559, + "step": 117140 + }, + { + "epoch": 1.8804475192218173, + "grad_norm": 1.1034703254699707, + "learning_rate": 4.3967247587145264e-07, + "loss": 0.7242, + "step": 117150 + }, + { + "epoch": 1.8806080354419814, + "grad_norm": 1.3977863788604736, + "learning_rate": 4.3849608396763773e-07, + "loss": 0.6592, + "step": 117160 + }, + { + "epoch": 1.8807685516621455, + "grad_norm": 1.0806862115859985, + "learning_rate": 4.3732125403130877e-07, + "loss": 0.6501, + "step": 117170 + }, + { + "epoch": 1.8809290678823096, + "grad_norm": 0.8713932037353516, + "learning_rate": 4.361479861371809e-07, + "loss": 0.7619, + "step": 117180 + }, + { + "epoch": 1.8810895841024735, + "grad_norm": 1.3564716577529907, + "learning_rate": 4.349762803598639e-07, + "loss": 0.5723, + "step": 117190 + }, + { + "epoch": 1.8812501003226376, + "grad_norm": 1.6861495971679688, + "learning_rate": 4.3380613677387604e-07, + "loss": 0.6532, + "step": 117200 + }, + { + "epoch": 1.8814106165428015, + "grad_norm": 1.2422150373458862, + "learning_rate": 4.3263755545362994e-07, + "loss": 0.6532, + "step": 117210 + }, + { + "epoch": 1.8815711327629656, + "grad_norm": 1.4596251249313354, + "learning_rate": 4.314705364734356e-07, + "loss": 0.7662, + "step": 117220 + }, + { + "epoch": 1.8817316489831297, + "grad_norm": 1.10184645652771, + "learning_rate": 4.303050799075142e-07, + "loss": 0.8017, + "step": 117230 + }, + { + "epoch": 1.8818921652032938, + "grad_norm": 1.0834788084030151, + "learning_rate": 4.2914118582998154e-07, + "loss": 0.7526, + "step": 117240 + }, + { + "epoch": 1.882052681423458, + "grad_norm": 0.689449667930603, + "learning_rate": 4.2797885431485053e-07, + "loss": 0.6161, + "step": 117250 + }, + { + "epoch": 1.882213197643622, + "grad_norm": 1.318129539489746, + "learning_rate": 4.268180854360426e-07, + "loss": 0.5574, + "step": 117260 + }, + { + "epoch": 1.882373713863786, + "grad_norm": 1.5233973264694214, + "learning_rate": 4.2565887926737656e-07, + "loss": 0.7241, + "step": 117270 + }, + { + "epoch": 1.8825342300839498, + "grad_norm": 1.0829849243164062, + "learning_rate": 4.245012358825684e-07, + "loss": 0.7084, + "step": 117280 + }, + { + "epoch": 1.882694746304114, + "grad_norm": 1.5245617628097534, + "learning_rate": 4.233451553552342e-07, + "loss": 0.7354, + "step": 117290 + }, + { + "epoch": 1.882855262524278, + "grad_norm": 1.0194401741027832, + "learning_rate": 4.221906377589013e-07, + "loss": 0.695, + "step": 117300 + }, + { + "epoch": 1.8830157787444421, + "grad_norm": 1.2211986780166626, + "learning_rate": 4.21037683166986e-07, + "loss": 0.6506, + "step": 117310 + }, + { + "epoch": 1.8831762949646063, + "grad_norm": 1.195770502090454, + "learning_rate": 4.198862916528101e-07, + "loss": 0.623, + "step": 117320 + }, + { + "epoch": 1.8833368111847704, + "grad_norm": 1.2462741136550903, + "learning_rate": 4.187364632895957e-07, + "loss": 0.6877, + "step": 117330 + }, + { + "epoch": 1.8834973274049343, + "grad_norm": 1.055683970451355, + "learning_rate": 4.175881981504648e-07, + "loss": 0.6576, + "step": 117340 + }, + { + "epoch": 1.8836578436250984, + "grad_norm": 1.097172498703003, + "learning_rate": 4.1644149630843955e-07, + "loss": 0.7612, + "step": 117350 + }, + { + "epoch": 1.8838183598452622, + "grad_norm": 1.0175373554229736, + "learning_rate": 4.152963578364449e-07, + "loss": 0.749, + "step": 117360 + }, + { + "epoch": 1.8839788760654264, + "grad_norm": 1.0332828760147095, + "learning_rate": 4.1415278280730606e-07, + "loss": 0.6562, + "step": 117370 + }, + { + "epoch": 1.8841393922855905, + "grad_norm": 0.9100266098976135, + "learning_rate": 4.1301077129374256e-07, + "loss": 0.572, + "step": 117380 + }, + { + "epoch": 1.8842999085057546, + "grad_norm": 1.3960613012313843, + "learning_rate": 4.1187032336838517e-07, + "loss": 0.6353, + "step": 117390 + }, + { + "epoch": 1.8844604247259187, + "grad_norm": 1.0144009590148926, + "learning_rate": 4.107314391037592e-07, + "loss": 0.6035, + "step": 117400 + }, + { + "epoch": 1.8846209409460826, + "grad_norm": 0.8508219718933105, + "learning_rate": 4.0959411857228734e-07, + "loss": 0.6588, + "step": 117410 + }, + { + "epoch": 1.8847814571662467, + "grad_norm": 1.1388089656829834, + "learning_rate": 4.084583618462978e-07, + "loss": 0.6376, + "step": 117420 + }, + { + "epoch": 1.8849419733864106, + "grad_norm": 1.4710830450057983, + "learning_rate": 4.0732416899801895e-07, + "loss": 0.6602, + "step": 117430 + }, + { + "epoch": 1.8851024896065747, + "grad_norm": 1.4206756353378296, + "learning_rate": 4.0619154009957914e-07, + "loss": 0.8236, + "step": 117440 + }, + { + "epoch": 1.8852630058267388, + "grad_norm": 1.519774079322815, + "learning_rate": 4.0506047522300417e-07, + "loss": 0.7109, + "step": 117450 + }, + { + "epoch": 1.8854235220469029, + "grad_norm": 0.744451642036438, + "learning_rate": 4.039309744402281e-07, + "loss": 0.6938, + "step": 117460 + }, + { + "epoch": 1.885584038267067, + "grad_norm": 1.2902559041976929, + "learning_rate": 4.0280303782307683e-07, + "loss": 0.7922, + "step": 117470 + }, + { + "epoch": 1.8857445544872309, + "grad_norm": 1.4804282188415527, + "learning_rate": 4.016766654432819e-07, + "loss": 0.6208, + "step": 117480 + }, + { + "epoch": 1.885905070707395, + "grad_norm": 0.8938883543014526, + "learning_rate": 4.0055185737247203e-07, + "loss": 0.6997, + "step": 117490 + }, + { + "epoch": 1.8860655869275589, + "grad_norm": 1.7958135604858398, + "learning_rate": 3.9942861368218453e-07, + "loss": 0.7941, + "step": 117500 + }, + { + "epoch": 1.886226103147723, + "grad_norm": 1.2085490226745605, + "learning_rate": 3.983069344438428e-07, + "loss": 0.7561, + "step": 117510 + }, + { + "epoch": 1.886386619367887, + "grad_norm": 0.8508670330047607, + "learning_rate": 3.9718681972878146e-07, + "loss": 0.6983, + "step": 117520 + }, + { + "epoch": 1.8865471355880512, + "grad_norm": 1.345054268836975, + "learning_rate": 3.9606826960823794e-07, + "loss": 0.6473, + "step": 117530 + }, + { + "epoch": 1.8867076518082153, + "grad_norm": 1.0494060516357422, + "learning_rate": 3.949512841533387e-07, + "loss": 0.7668, + "step": 117540 + }, + { + "epoch": 1.8868681680283794, + "grad_norm": 1.33912992477417, + "learning_rate": 3.938358634351241e-07, + "loss": 0.6863, + "step": 117550 + }, + { + "epoch": 1.8870286842485433, + "grad_norm": 1.0516489744186401, + "learning_rate": 3.927220075245208e-07, + "loss": 0.7009, + "step": 117560 + }, + { + "epoch": 1.8871892004687074, + "grad_norm": 0.8723039627075195, + "learning_rate": 3.916097164923721e-07, + "loss": 0.773, + "step": 117570 + }, + { + "epoch": 1.8873497166888713, + "grad_norm": 1.480180025100708, + "learning_rate": 3.904989904094075e-07, + "loss": 0.4954, + "step": 117580 + }, + { + "epoch": 1.8875102329090354, + "grad_norm": 0.9866092801094055, + "learning_rate": 3.89389829346265e-07, + "loss": 0.7657, + "step": 117590 + }, + { + "epoch": 1.8876707491291995, + "grad_norm": 1.118478536605835, + "learning_rate": 3.8828223337347703e-07, + "loss": 0.5846, + "step": 117600 + }, + { + "epoch": 1.8878312653493636, + "grad_norm": 1.3555022478103638, + "learning_rate": 3.8717620256148447e-07, + "loss": 0.7819, + "step": 117610 + }, + { + "epoch": 1.8879917815695277, + "grad_norm": 0.9851873517036438, + "learning_rate": 3.860717369806227e-07, + "loss": 0.7575, + "step": 117620 + }, + { + "epoch": 1.8881522977896916, + "grad_norm": 0.9960554242134094, + "learning_rate": 3.849688367011273e-07, + "loss": 0.7144, + "step": 117630 + }, + { + "epoch": 1.8883128140098557, + "grad_norm": 0.8722273707389832, + "learning_rate": 3.83867501793142e-07, + "loss": 0.6421, + "step": 117640 + }, + { + "epoch": 1.8884733302300196, + "grad_norm": 1.0496371984481812, + "learning_rate": 3.827677323266998e-07, + "loss": 0.621, + "step": 117650 + }, + { + "epoch": 1.8886338464501837, + "grad_norm": 0.9955131411552429, + "learning_rate": 3.81669528371742e-07, + "loss": 0.6931, + "step": 117660 + }, + { + "epoch": 1.8887943626703478, + "grad_norm": 1.1794829368591309, + "learning_rate": 3.80572889998107e-07, + "loss": 0.7186, + "step": 117670 + }, + { + "epoch": 1.888954878890512, + "grad_norm": 1.4057440757751465, + "learning_rate": 3.7947781727553365e-07, + "loss": 0.5678, + "step": 117680 + }, + { + "epoch": 1.889115395110676, + "grad_norm": 0.9905102252960205, + "learning_rate": 3.7838431027366337e-07, + "loss": 0.6636, + "step": 117690 + }, + { + "epoch": 1.88927591133084, + "grad_norm": 1.282344937324524, + "learning_rate": 3.772923690620378e-07, + "loss": 0.7659, + "step": 117700 + }, + { + "epoch": 1.889436427551004, + "grad_norm": 1.5953309535980225, + "learning_rate": 3.7620199371009855e-07, + "loss": 0.7377, + "step": 117710 + }, + { + "epoch": 1.889596943771168, + "grad_norm": 1.0365909337997437, + "learning_rate": 3.7511318428718465e-07, + "loss": 0.7497, + "step": 117720 + }, + { + "epoch": 1.889757459991332, + "grad_norm": 1.0557819604873657, + "learning_rate": 3.740259408625352e-07, + "loss": 0.7912, + "step": 117730 + }, + { + "epoch": 1.8899179762114962, + "grad_norm": 0.9920076131820679, + "learning_rate": 3.729402635053003e-07, + "loss": 0.6014, + "step": 117740 + }, + { + "epoch": 1.8900784924316603, + "grad_norm": 1.0314610004425049, + "learning_rate": 3.7185615228451653e-07, + "loss": 0.7031, + "step": 117750 + }, + { + "epoch": 1.8902390086518244, + "grad_norm": 1.251373291015625, + "learning_rate": 3.7077360726912866e-07, + "loss": 0.6672, + "step": 117760 + }, + { + "epoch": 1.8903995248719885, + "grad_norm": 1.2880816459655762, + "learning_rate": 3.696926285279817e-07, + "loss": 0.8425, + "step": 117770 + }, + { + "epoch": 1.8905600410921524, + "grad_norm": 0.9935459494590759, + "learning_rate": 3.686132161298178e-07, + "loss": 0.7527, + "step": 117780 + }, + { + "epoch": 1.8907205573123163, + "grad_norm": 1.7675936222076416, + "learning_rate": 3.675353701432821e-07, + "loss": 0.6628, + "step": 117790 + }, + { + "epoch": 1.8908810735324804, + "grad_norm": 1.170440435409546, + "learning_rate": 3.6645909063691694e-07, + "loss": 0.6879, + "step": 117800 + }, + { + "epoch": 1.8910415897526445, + "grad_norm": 1.0492035150527954, + "learning_rate": 3.653843776791732e-07, + "loss": 0.6886, + "step": 117810 + }, + { + "epoch": 1.8912021059728086, + "grad_norm": 0.8443636894226074, + "learning_rate": 3.6431123133838784e-07, + "loss": 0.7241, + "step": 117820 + }, + { + "epoch": 1.8913626221929727, + "grad_norm": 1.0676820278167725, + "learning_rate": 3.6323965168281737e-07, + "loss": 0.7043, + "step": 117830 + }, + { + "epoch": 1.8915231384131368, + "grad_norm": 1.7495944499969482, + "learning_rate": 3.6216963878059616e-07, + "loss": 0.737, + "step": 117840 + }, + { + "epoch": 1.8916836546333007, + "grad_norm": 0.7972350120544434, + "learning_rate": 3.6110119269978093e-07, + "loss": 0.6836, + "step": 117850 + }, + { + "epoch": 1.8918441708534648, + "grad_norm": 1.1239920854568481, + "learning_rate": 3.6003431350831173e-07, + "loss": 0.6809, + "step": 117860 + }, + { + "epoch": 1.8920046870736287, + "grad_norm": 1.0473990440368652, + "learning_rate": 3.589690012740371e-07, + "loss": 0.6598, + "step": 117870 + }, + { + "epoch": 1.8921652032937928, + "grad_norm": 1.0729554891586304, + "learning_rate": 3.579052560647084e-07, + "loss": 0.6456, + "step": 117880 + }, + { + "epoch": 1.892325719513957, + "grad_norm": 1.3526955842971802, + "learning_rate": 3.568430779479687e-07, + "loss": 0.6934, + "step": 117890 + }, + { + "epoch": 1.892486235734121, + "grad_norm": 1.0577468872070312, + "learning_rate": 3.557824669913723e-07, + "loss": 0.5756, + "step": 117900 + }, + { + "epoch": 1.8926467519542851, + "grad_norm": 1.243017554283142, + "learning_rate": 3.5472342326235974e-07, + "loss": 0.6701, + "step": 117910 + }, + { + "epoch": 1.892807268174449, + "grad_norm": 1.1678872108459473, + "learning_rate": 3.5366594682828824e-07, + "loss": 0.6262, + "step": 117920 + }, + { + "epoch": 1.892967784394613, + "grad_norm": 1.2964415550231934, + "learning_rate": 3.526100377564012e-07, + "loss": 0.7509, + "step": 117930 + }, + { + "epoch": 1.893128300614777, + "grad_norm": 1.2977055311203003, + "learning_rate": 3.5155569611384775e-07, + "loss": 0.7437, + "step": 117940 + }, + { + "epoch": 1.893288816834941, + "grad_norm": 1.4775490760803223, + "learning_rate": 3.5050292196768255e-07, + "loss": 0.7306, + "step": 117950 + }, + { + "epoch": 1.8934493330551052, + "grad_norm": 0.5679828524589539, + "learning_rate": 3.4945171538485754e-07, + "loss": 0.7185, + "step": 117960 + }, + { + "epoch": 1.8936098492752693, + "grad_norm": 1.4407374858856201, + "learning_rate": 3.484020764322138e-07, + "loss": 0.7417, + "step": 117970 + }, + { + "epoch": 1.8937703654954334, + "grad_norm": 1.2156322002410889, + "learning_rate": 3.4735400517651173e-07, + "loss": 0.699, + "step": 117980 + }, + { + "epoch": 1.8939308817155973, + "grad_norm": 1.0428762435913086, + "learning_rate": 3.463075016843953e-07, + "loss": 0.6899, + "step": 117990 + }, + { + "epoch": 1.8940913979357614, + "grad_norm": 1.0091376304626465, + "learning_rate": 3.452625660224196e-07, + "loss": 0.7037, + "step": 118000 + }, + { + "epoch": 1.8942519141559253, + "grad_norm": 1.9613523483276367, + "learning_rate": 3.4421919825703696e-07, + "loss": 0.7218, + "step": 118010 + }, + { + "epoch": 1.8944124303760894, + "grad_norm": 1.8558402061462402, + "learning_rate": 3.431773984546027e-07, + "loss": 0.6341, + "step": 118020 + }, + { + "epoch": 1.8945729465962535, + "grad_norm": 0.9561228156089783, + "learning_rate": 3.421371666813611e-07, + "loss": 0.687, + "step": 118030 + }, + { + "epoch": 1.8947334628164176, + "grad_norm": 1.7931880950927734, + "learning_rate": 3.4109850300346746e-07, + "loss": 0.7126, + "step": 118040 + }, + { + "epoch": 1.8948939790365817, + "grad_norm": 1.168800711631775, + "learning_rate": 3.4006140748697736e-07, + "loss": 0.5292, + "step": 118050 + }, + { + "epoch": 1.8950544952567459, + "grad_norm": 1.1484047174453735, + "learning_rate": 3.3902588019784077e-07, + "loss": 0.7525, + "step": 118060 + }, + { + "epoch": 1.8952150114769097, + "grad_norm": 1.2786470651626587, + "learning_rate": 3.3799192120191614e-07, + "loss": 0.651, + "step": 118070 + }, + { + "epoch": 1.8953755276970736, + "grad_norm": 1.3521925210952759, + "learning_rate": 3.3695953056495087e-07, + "loss": 0.6141, + "step": 118080 + }, + { + "epoch": 1.8955360439172377, + "grad_norm": 1.0677677392959595, + "learning_rate": 3.359287083526036e-07, + "loss": 0.6791, + "step": 118090 + }, + { + "epoch": 1.8956965601374018, + "grad_norm": 1.202277660369873, + "learning_rate": 3.348994546304274e-07, + "loss": 0.7754, + "step": 118100 + }, + { + "epoch": 1.895857076357566, + "grad_norm": 0.9454233646392822, + "learning_rate": 3.338717694638782e-07, + "loss": 0.7312, + "step": 118110 + }, + { + "epoch": 1.89601759257773, + "grad_norm": 1.246893286705017, + "learning_rate": 3.328456529183066e-07, + "loss": 0.7211, + "step": 118120 + }, + { + "epoch": 1.8961781087978942, + "grad_norm": 1.3413466215133667, + "learning_rate": 3.3182110505897144e-07, + "loss": 0.7046, + "step": 118130 + }, + { + "epoch": 1.896338625018058, + "grad_norm": 1.28057861328125, + "learning_rate": 3.307981259510262e-07, + "loss": 0.7432, + "step": 118140 + }, + { + "epoch": 1.8964991412382222, + "grad_norm": 1.1767587661743164, + "learning_rate": 3.297767156595244e-07, + "loss": 0.7095, + "step": 118150 + }, + { + "epoch": 1.896659657458386, + "grad_norm": 2.0659921169281006, + "learning_rate": 3.2875687424942513e-07, + "loss": 0.652, + "step": 118160 + }, + { + "epoch": 1.8968201736785502, + "grad_norm": 1.070446252822876, + "learning_rate": 3.277386017855849e-07, + "loss": 0.6339, + "step": 118170 + }, + { + "epoch": 1.8969806898987143, + "grad_norm": 0.8383901119232178, + "learning_rate": 3.267218983327547e-07, + "loss": 0.7679, + "step": 118180 + }, + { + "epoch": 1.8971412061188784, + "grad_norm": 1.2640750408172607, + "learning_rate": 3.257067639555966e-07, + "loss": 0.7, + "step": 118190 + }, + { + "epoch": 1.8973017223390425, + "grad_norm": 1.3156572580337524, + "learning_rate": 3.2469319871866467e-07, + "loss": 0.8967, + "step": 118200 + }, + { + "epoch": 1.8974622385592064, + "grad_norm": 1.4280312061309814, + "learning_rate": 3.236812026864183e-07, + "loss": 0.7282, + "step": 118210 + }, + { + "epoch": 1.8976227547793705, + "grad_norm": 0.6397854685783386, + "learning_rate": 3.226707759232089e-07, + "loss": 0.6237, + "step": 118220 + }, + { + "epoch": 1.8977832709995344, + "grad_norm": 0.8625026941299438, + "learning_rate": 3.2166191849329897e-07, + "loss": 0.6255, + "step": 118230 + }, + { + "epoch": 1.8979437872196985, + "grad_norm": 0.7746291756629944, + "learning_rate": 3.2065463046084264e-07, + "loss": 0.8086, + "step": 118240 + }, + { + "epoch": 1.8981043034398626, + "grad_norm": 1.0439881086349487, + "learning_rate": 3.1964891188989986e-07, + "loss": 0.7134, + "step": 118250 + }, + { + "epoch": 1.8982648196600267, + "grad_norm": 1.7234258651733398, + "learning_rate": 3.18644762844425e-07, + "loss": 0.6536, + "step": 118260 + }, + { + "epoch": 1.8984253358801908, + "grad_norm": 2.242539644241333, + "learning_rate": 3.1764218338828086e-07, + "loss": 0.5877, + "step": 118270 + }, + { + "epoch": 1.8985858521003547, + "grad_norm": 0.968544065952301, + "learning_rate": 3.1664117358522203e-07, + "loss": 0.6761, + "step": 118280 + }, + { + "epoch": 1.8987463683205188, + "grad_norm": 1.2107633352279663, + "learning_rate": 3.156417334989087e-07, + "loss": 0.702, + "step": 118290 + }, + { + "epoch": 1.8989068845406827, + "grad_norm": 0.8746790885925293, + "learning_rate": 3.146438631928983e-07, + "loss": 0.7093, + "step": 118300 + }, + { + "epoch": 1.8990674007608468, + "grad_norm": 0.9143620133399963, + "learning_rate": 3.1364756273064845e-07, + "loss": 0.6901, + "step": 118310 + }, + { + "epoch": 1.899227916981011, + "grad_norm": 1.3704345226287842, + "learning_rate": 3.1265283217551956e-07, + "loss": 0.586, + "step": 118320 + }, + { + "epoch": 1.899388433201175, + "grad_norm": 1.2279435396194458, + "learning_rate": 3.116596715907721e-07, + "loss": 0.7853, + "step": 118330 + }, + { + "epoch": 1.8995489494213391, + "grad_norm": 1.180389404296875, + "learning_rate": 3.10668081039564e-07, + "loss": 0.6893, + "step": 118340 + }, + { + "epoch": 1.8997094656415032, + "grad_norm": 1.004075288772583, + "learning_rate": 3.0967806058495307e-07, + "loss": 0.6172, + "step": 118350 + }, + { + "epoch": 1.8998699818616671, + "grad_norm": 1.107337474822998, + "learning_rate": 3.0868961028990005e-07, + "loss": 0.7898, + "step": 118360 + }, + { + "epoch": 1.9000304980818312, + "grad_norm": 0.8257560133934021, + "learning_rate": 3.077027302172658e-07, + "loss": 0.7236, + "step": 118370 + }, + { + "epoch": 1.900191014301995, + "grad_norm": 1.140894889831543, + "learning_rate": 3.067174204298057e-07, + "loss": 0.6979, + "step": 118380 + }, + { + "epoch": 1.9003515305221592, + "grad_norm": 0.7474260330200195, + "learning_rate": 3.0573368099018615e-07, + "loss": 0.8037, + "step": 118390 + }, + { + "epoch": 1.9005120467423233, + "grad_norm": 0.8884649872779846, + "learning_rate": 3.0475151196096285e-07, + "loss": 0.7735, + "step": 118400 + }, + { + "epoch": 1.9006725629624874, + "grad_norm": 1.3943132162094116, + "learning_rate": 3.0377091340459683e-07, + "loss": 0.7777, + "step": 118410 + }, + { + "epoch": 1.9008330791826515, + "grad_norm": 1.223876953125, + "learning_rate": 3.0279188538344936e-07, + "loss": 0.6947, + "step": 118420 + }, + { + "epoch": 1.9009935954028154, + "grad_norm": 1.0981886386871338, + "learning_rate": 3.01814427959779e-07, + "loss": 0.6523, + "step": 118430 + }, + { + "epoch": 1.9011541116229795, + "grad_norm": 1.372159481048584, + "learning_rate": 3.008385411957471e-07, + "loss": 0.8042, + "step": 118440 + }, + { + "epoch": 1.9013146278431434, + "grad_norm": 1.1509095430374146, + "learning_rate": 2.9986422515341504e-07, + "loss": 0.7491, + "step": 118450 + }, + { + "epoch": 1.9014751440633075, + "grad_norm": 0.8576065897941589, + "learning_rate": 2.9889147989474453e-07, + "loss": 0.7528, + "step": 118460 + }, + { + "epoch": 1.9016356602834716, + "grad_norm": 1.5341209173202515, + "learning_rate": 2.9792030548159155e-07, + "loss": 0.6988, + "step": 118470 + }, + { + "epoch": 1.9017961765036357, + "grad_norm": 0.8566706776618958, + "learning_rate": 2.969507019757234e-07, + "loss": 0.6598, + "step": 118480 + }, + { + "epoch": 1.9019566927237999, + "grad_norm": 1.1049504280090332, + "learning_rate": 2.9598266943879904e-07, + "loss": 0.7313, + "step": 118490 + }, + { + "epoch": 1.9021172089439637, + "grad_norm": 2.548694610595703, + "learning_rate": 2.950162079323776e-07, + "loss": 0.795, + "step": 118500 + }, + { + "epoch": 1.9022777251641279, + "grad_norm": 1.0830214023590088, + "learning_rate": 2.94051317517921e-07, + "loss": 0.6752, + "step": 118510 + }, + { + "epoch": 1.9024382413842917, + "grad_norm": 0.70945805311203, + "learning_rate": 2.930879982567941e-07, + "loss": 0.6261, + "step": 118520 + }, + { + "epoch": 1.9025987576044558, + "grad_norm": 0.6783342957496643, + "learning_rate": 2.9212625021025344e-07, + "loss": 0.7725, + "step": 118530 + }, + { + "epoch": 1.90275927382462, + "grad_norm": 0.8936478495597839, + "learning_rate": 2.911660734394639e-07, + "loss": 0.8232, + "step": 118540 + }, + { + "epoch": 1.902919790044784, + "grad_norm": 0.9563254117965698, + "learning_rate": 2.9020746800548504e-07, + "loss": 0.7043, + "step": 118550 + }, + { + "epoch": 1.9030803062649482, + "grad_norm": 0.9123371243476868, + "learning_rate": 2.892504339692792e-07, + "loss": 0.6756, + "step": 118560 + }, + { + "epoch": 1.9032408224851123, + "grad_norm": 1.1858142614364624, + "learning_rate": 2.8829497139170883e-07, + "loss": 0.6067, + "step": 118570 + }, + { + "epoch": 1.9034013387052762, + "grad_norm": 1.9523639678955078, + "learning_rate": 2.873410803335391e-07, + "loss": 0.6904, + "step": 118580 + }, + { + "epoch": 1.90356185492544, + "grad_norm": 1.0389060974121094, + "learning_rate": 2.863887608554244e-07, + "loss": 0.7016, + "step": 118590 + }, + { + "epoch": 1.9037223711456042, + "grad_norm": 0.7840349674224854, + "learning_rate": 2.8543801301793014e-07, + "loss": 0.7028, + "step": 118600 + }, + { + "epoch": 1.9038828873657683, + "grad_norm": 1.2166688442230225, + "learning_rate": 2.844888368815163e-07, + "loss": 0.7322, + "step": 118610 + }, + { + "epoch": 1.9040434035859324, + "grad_norm": 1.1452933549880981, + "learning_rate": 2.835412325065512e-07, + "loss": 0.6095, + "step": 118620 + }, + { + "epoch": 1.9042039198060965, + "grad_norm": 1.1696351766586304, + "learning_rate": 2.825951999532894e-07, + "loss": 0.7401, + "step": 118630 + }, + { + "epoch": 1.9043644360262606, + "grad_norm": 2.470459222793579, + "learning_rate": 2.8165073928189957e-07, + "loss": 0.7298, + "step": 118640 + }, + { + "epoch": 1.9045249522464245, + "grad_norm": 1.1761322021484375, + "learning_rate": 2.807078505524391e-07, + "loss": 0.7955, + "step": 118650 + }, + { + "epoch": 1.9046854684665886, + "grad_norm": 1.1377356052398682, + "learning_rate": 2.797665338248739e-07, + "loss": 0.7337, + "step": 118660 + }, + { + "epoch": 1.9048459846867525, + "grad_norm": 0.846274197101593, + "learning_rate": 2.788267891590618e-07, + "loss": 0.6491, + "step": 118670 + }, + { + "epoch": 1.9050065009069166, + "grad_norm": 1.1756064891815186, + "learning_rate": 2.7788861661476875e-07, + "loss": 0.5774, + "step": 118680 + }, + { + "epoch": 1.9051670171270807, + "grad_norm": 0.933623194694519, + "learning_rate": 2.7695201625165254e-07, + "loss": 0.5935, + "step": 118690 + }, + { + "epoch": 1.9053275333472448, + "grad_norm": 0.8052477836608887, + "learning_rate": 2.7601698812928236e-07, + "loss": 0.7505, + "step": 118700 + }, + { + "epoch": 1.905488049567409, + "grad_norm": 1.091244101524353, + "learning_rate": 2.7508353230711324e-07, + "loss": 0.6379, + "step": 118710 + }, + { + "epoch": 1.9056485657875728, + "grad_norm": 1.058791995048523, + "learning_rate": 2.741516488445145e-07, + "loss": 0.7217, + "step": 118720 + }, + { + "epoch": 1.905809082007737, + "grad_norm": 1.121230125427246, + "learning_rate": 2.732213378007414e-07, + "loss": 0.7268, + "step": 118730 + }, + { + "epoch": 1.9059695982279008, + "grad_norm": 1.3535315990447998, + "learning_rate": 2.7229259923496066e-07, + "loss": 0.8136, + "step": 118740 + }, + { + "epoch": 1.906130114448065, + "grad_norm": 0.8254081606864929, + "learning_rate": 2.71365433206236e-07, + "loss": 0.6658, + "step": 118750 + }, + { + "epoch": 1.906290630668229, + "grad_norm": 0.9463878273963928, + "learning_rate": 2.704398397735258e-07, + "loss": 0.7192, + "step": 118760 + }, + { + "epoch": 1.9064511468883931, + "grad_norm": 0.8984910249710083, + "learning_rate": 2.6951581899569424e-07, + "loss": 0.6565, + "step": 118770 + }, + { + "epoch": 1.9066116631085572, + "grad_norm": 1.333750605583191, + "learning_rate": 2.6859337093150526e-07, + "loss": 0.679, + "step": 118780 + }, + { + "epoch": 1.9067721793287211, + "grad_norm": 1.207838535308838, + "learning_rate": 2.676724956396148e-07, + "loss": 0.696, + "step": 118790 + }, + { + "epoch": 1.9069326955488852, + "grad_norm": 1.2175053358078003, + "learning_rate": 2.6675319317859273e-07, + "loss": 0.739, + "step": 118800 + }, + { + "epoch": 1.9070932117690491, + "grad_norm": 1.4631216526031494, + "learning_rate": 2.658354636068977e-07, + "loss": 0.5694, + "step": 118810 + }, + { + "epoch": 1.9072537279892132, + "grad_norm": 0.9928074479103088, + "learning_rate": 2.6491930698289146e-07, + "loss": 0.8037, + "step": 118820 + }, + { + "epoch": 1.9074142442093773, + "grad_norm": 0.9188700318336487, + "learning_rate": 2.640047233648413e-07, + "loss": 0.7707, + "step": 118830 + }, + { + "epoch": 1.9075747604295414, + "grad_norm": 1.2284997701644897, + "learning_rate": 2.630917128109006e-07, + "loss": 0.7072, + "step": 118840 + }, + { + "epoch": 1.9077352766497055, + "grad_norm": 1.2976644039154053, + "learning_rate": 2.621802753791369e-07, + "loss": 0.7051, + "step": 118850 + }, + { + "epoch": 1.9078957928698697, + "grad_norm": 0.8953333497047424, + "learning_rate": 2.6127041112751215e-07, + "loss": 0.6772, + "step": 118860 + }, + { + "epoch": 1.9080563090900335, + "grad_norm": 0.8392735123634338, + "learning_rate": 2.603621201138856e-07, + "loss": 0.5867, + "step": 118870 + }, + { + "epoch": 1.9082168253101977, + "grad_norm": 1.542044758796692, + "learning_rate": 2.594554023960249e-07, + "loss": 0.8317, + "step": 118880 + }, + { + "epoch": 1.9083773415303615, + "grad_norm": 1.7594363689422607, + "learning_rate": 2.5855025803158684e-07, + "loss": 0.6089, + "step": 118890 + }, + { + "epoch": 1.9085378577505256, + "grad_norm": 1.0165343284606934, + "learning_rate": 2.5764668707813353e-07, + "loss": 0.7335, + "step": 118900 + }, + { + "epoch": 1.9086983739706898, + "grad_norm": 0.8097573518753052, + "learning_rate": 2.5674468959313024e-07, + "loss": 0.7034, + "step": 118910 + }, + { + "epoch": 1.9088588901908539, + "grad_norm": 1.5778257846832275, + "learning_rate": 2.558442656339338e-07, + "loss": 0.7098, + "step": 118920 + }, + { + "epoch": 1.909019406411018, + "grad_norm": 1.0872176885604858, + "learning_rate": 2.5494541525780955e-07, + "loss": 0.7724, + "step": 118930 + }, + { + "epoch": 1.9091799226311819, + "grad_norm": 0.870863676071167, + "learning_rate": 2.540481385219201e-07, + "loss": 0.7101, + "step": 118940 + }, + { + "epoch": 1.909340438851346, + "grad_norm": 0.8031889200210571, + "learning_rate": 2.5315243548332256e-07, + "loss": 0.7162, + "step": 118950 + }, + { + "epoch": 1.9095009550715099, + "grad_norm": 1.5080598592758179, + "learning_rate": 2.522583061989825e-07, + "loss": 0.7273, + "step": 118960 + }, + { + "epoch": 1.909661471291674, + "grad_norm": 0.792672336101532, + "learning_rate": 2.5136575072575995e-07, + "loss": 0.8055, + "step": 118970 + }, + { + "epoch": 1.909821987511838, + "grad_norm": 1.4100102186203003, + "learning_rate": 2.5047476912041514e-07, + "loss": 0.6888, + "step": 118980 + }, + { + "epoch": 1.9099825037320022, + "grad_norm": 1.2410095930099487, + "learning_rate": 2.495853614396082e-07, + "loss": 0.6463, + "step": 118990 + }, + { + "epoch": 1.9101430199521663, + "grad_norm": 1.5586364269256592, + "learning_rate": 2.4869752773990506e-07, + "loss": 0.7165, + "step": 119000 + }, + { + "epoch": 1.9103035361723302, + "grad_norm": 0.945393979549408, + "learning_rate": 2.478112680777606e-07, + "loss": 0.761, + "step": 119010 + }, + { + "epoch": 1.9104640523924943, + "grad_norm": 1.5931092500686646, + "learning_rate": 2.4692658250954346e-07, + "loss": 0.7499, + "step": 119020 + }, + { + "epoch": 1.9106245686126582, + "grad_norm": 0.7859266400337219, + "learning_rate": 2.4604347109150607e-07, + "loss": 0.711, + "step": 119030 + }, + { + "epoch": 1.9107850848328223, + "grad_norm": 0.8630663752555847, + "learning_rate": 2.451619338798145e-07, + "loss": 0.7055, + "step": 119040 + }, + { + "epoch": 1.9109456010529864, + "grad_norm": 1.0355117321014404, + "learning_rate": 2.4428197093052664e-07, + "loss": 0.6333, + "step": 119050 + }, + { + "epoch": 1.9111061172731505, + "grad_norm": 1.0130788087844849, + "learning_rate": 2.4340358229960613e-07, + "loss": 0.6876, + "step": 119060 + }, + { + "epoch": 1.9112666334933146, + "grad_norm": 0.9859045147895813, + "learning_rate": 2.425267680429083e-07, + "loss": 0.6036, + "step": 119070 + }, + { + "epoch": 1.9114271497134787, + "grad_norm": 1.0785982608795166, + "learning_rate": 2.4165152821619963e-07, + "loss": 0.7358, + "step": 119080 + }, + { + "epoch": 1.9115876659336426, + "grad_norm": 1.0964374542236328, + "learning_rate": 2.407778628751356e-07, + "loss": 0.6564, + "step": 119090 + }, + { + "epoch": 1.9117481821538065, + "grad_norm": 0.7888166308403015, + "learning_rate": 2.399057720752773e-07, + "loss": 0.6611, + "step": 119100 + }, + { + "epoch": 1.9119086983739706, + "grad_norm": 1.1583738327026367, + "learning_rate": 2.3903525587208595e-07, + "loss": 0.6453, + "step": 119110 + }, + { + "epoch": 1.9120692145941347, + "grad_norm": 1.3180804252624512, + "learning_rate": 2.381663143209173e-07, + "loss": 0.7043, + "step": 119120 + }, + { + "epoch": 1.9122297308142988, + "grad_norm": 1.3558729887008667, + "learning_rate": 2.372989474770354e-07, + "loss": 0.6513, + "step": 119130 + }, + { + "epoch": 1.912390247034463, + "grad_norm": 1.4593493938446045, + "learning_rate": 2.3643315539560173e-07, + "loss": 0.7438, + "step": 119140 + }, + { + "epoch": 1.912550763254627, + "grad_norm": 0.953472375869751, + "learning_rate": 2.355689381316667e-07, + "loss": 0.7903, + "step": 119150 + }, + { + "epoch": 1.912711279474791, + "grad_norm": 0.9196639060974121, + "learning_rate": 2.3470629574019742e-07, + "loss": 0.7192, + "step": 119160 + }, + { + "epoch": 1.912871795694955, + "grad_norm": 0.9709891676902771, + "learning_rate": 2.3384522827605004e-07, + "loss": 0.6618, + "step": 119170 + }, + { + "epoch": 1.913032311915119, + "grad_norm": 1.1210588216781616, + "learning_rate": 2.3298573579398352e-07, + "loss": 0.6504, + "step": 119180 + }, + { + "epoch": 1.913192828135283, + "grad_norm": 0.7796626687049866, + "learning_rate": 2.3212781834865695e-07, + "loss": 0.6898, + "step": 119190 + }, + { + "epoch": 1.9133533443554471, + "grad_norm": 1.0397382974624634, + "learning_rate": 2.312714759946294e-07, + "loss": 0.5068, + "step": 119200 + }, + { + "epoch": 1.9135138605756112, + "grad_norm": 1.1373836994171143, + "learning_rate": 2.3041670878635457e-07, + "loss": 0.6962, + "step": 119210 + }, + { + "epoch": 1.9136743767957753, + "grad_norm": 0.7671271562576294, + "learning_rate": 2.2956351677819733e-07, + "loss": 0.6593, + "step": 119220 + }, + { + "epoch": 1.9138348930159392, + "grad_norm": 1.1760402917861938, + "learning_rate": 2.2871190002441146e-07, + "loss": 0.8132, + "step": 119230 + }, + { + "epoch": 1.9139954092361033, + "grad_norm": 0.7797781229019165, + "learning_rate": 2.2786185857915642e-07, + "loss": 0.693, + "step": 119240 + }, + { + "epoch": 1.9141559254562672, + "grad_norm": 1.0811142921447754, + "learning_rate": 2.27013392496489e-07, + "loss": 0.5458, + "step": 119250 + }, + { + "epoch": 1.9143164416764313, + "grad_norm": 1.6891392469406128, + "learning_rate": 2.261665018303688e-07, + "loss": 0.7282, + "step": 119260 + }, + { + "epoch": 1.9144769578965954, + "grad_norm": 1.1620733737945557, + "learning_rate": 2.2532118663464997e-07, + "loss": 0.7375, + "step": 119270 + }, + { + "epoch": 1.9146374741167596, + "grad_norm": 1.0122041702270508, + "learning_rate": 2.2447744696309225e-07, + "loss": 0.6685, + "step": 119280 + }, + { + "epoch": 1.9147979903369237, + "grad_norm": 0.48466554284095764, + "learning_rate": 2.2363528286934998e-07, + "loss": 0.6275, + "step": 119290 + }, + { + "epoch": 1.9149585065570875, + "grad_norm": 0.7414604425430298, + "learning_rate": 2.2279469440698308e-07, + "loss": 0.6197, + "step": 119300 + }, + { + "epoch": 1.9151190227772517, + "grad_norm": 0.9148067235946655, + "learning_rate": 2.2195568162944325e-07, + "loss": 0.6345, + "step": 119310 + }, + { + "epoch": 1.9152795389974155, + "grad_norm": 1.0885871648788452, + "learning_rate": 2.2111824459009055e-07, + "loss": 0.7442, + "step": 119320 + }, + { + "epoch": 1.9154400552175797, + "grad_norm": 1.1375017166137695, + "learning_rate": 2.2028238334218244e-07, + "loss": 0.7602, + "step": 119330 + }, + { + "epoch": 1.9156005714377438, + "grad_norm": 1.0698684453964233, + "learning_rate": 2.1944809793887078e-07, + "loss": 0.6323, + "step": 119340 + }, + { + "epoch": 1.9157610876579079, + "grad_norm": 1.340254306793213, + "learning_rate": 2.1861538843321316e-07, + "loss": 0.6307, + "step": 119350 + }, + { + "epoch": 1.915921603878072, + "grad_norm": 1.0869731903076172, + "learning_rate": 2.1778425487816446e-07, + "loss": 0.7173, + "step": 119360 + }, + { + "epoch": 1.916082120098236, + "grad_norm": 1.3204798698425293, + "learning_rate": 2.1695469732658235e-07, + "loss": 0.7261, + "step": 119370 + }, + { + "epoch": 1.9162426363184, + "grad_norm": 1.2826340198516846, + "learning_rate": 2.1612671583121912e-07, + "loss": 0.6929, + "step": 119380 + }, + { + "epoch": 1.9164031525385639, + "grad_norm": 1.2677642107009888, + "learning_rate": 2.1530031044472987e-07, + "loss": 0.7575, + "step": 119390 + }, + { + "epoch": 1.916563668758728, + "grad_norm": 1.2747650146484375, + "learning_rate": 2.1447548121966697e-07, + "loss": 0.6692, + "step": 119400 + }, + { + "epoch": 1.916724184978892, + "grad_norm": 1.4108268022537231, + "learning_rate": 2.1365222820849128e-07, + "loss": 0.6512, + "step": 119410 + }, + { + "epoch": 1.9168847011990562, + "grad_norm": 1.3548779487609863, + "learning_rate": 2.1283055146354979e-07, + "loss": 0.6995, + "step": 119420 + }, + { + "epoch": 1.9170452174192203, + "grad_norm": 1.1328656673431396, + "learning_rate": 2.120104510371007e-07, + "loss": 0.7699, + "step": 119430 + }, + { + "epoch": 1.9172057336393844, + "grad_norm": 0.9089491963386536, + "learning_rate": 2.1119192698129397e-07, + "loss": 0.6942, + "step": 119440 + }, + { + "epoch": 1.9173662498595483, + "grad_norm": 1.8684730529785156, + "learning_rate": 2.1037497934818796e-07, + "loss": 0.7609, + "step": 119450 + }, + { + "epoch": 1.9175267660797124, + "grad_norm": 1.0865752696990967, + "learning_rate": 2.0955960818973275e-07, + "loss": 0.8004, + "step": 119460 + }, + { + "epoch": 1.9176872822998763, + "grad_norm": 0.9321668744087219, + "learning_rate": 2.0874581355778133e-07, + "loss": 0.5771, + "step": 119470 + }, + { + "epoch": 1.9178477985200404, + "grad_norm": 2.147808074951172, + "learning_rate": 2.0793359550408397e-07, + "loss": 0.7407, + "step": 119480 + }, + { + "epoch": 1.9180083147402045, + "grad_norm": 1.417507290840149, + "learning_rate": 2.071229540802966e-07, + "loss": 0.8274, + "step": 119490 + }, + { + "epoch": 1.9181688309603686, + "grad_norm": 1.266314148902893, + "learning_rate": 2.0631388933796959e-07, + "loss": 0.8213, + "step": 119500 + }, + { + "epoch": 1.9183293471805327, + "grad_norm": 0.864354133605957, + "learning_rate": 2.0550640132855904e-07, + "loss": 0.6264, + "step": 119510 + }, + { + "epoch": 1.9184898634006966, + "grad_norm": 0.6149396896362305, + "learning_rate": 2.047004901034072e-07, + "loss": 0.6344, + "step": 119520 + }, + { + "epoch": 1.9186503796208607, + "grad_norm": 1.190651535987854, + "learning_rate": 2.0389615571377309e-07, + "loss": 0.6559, + "step": 119530 + }, + { + "epoch": 1.9188108958410246, + "grad_norm": 1.2118828296661377, + "learning_rate": 2.0309339821080463e-07, + "loss": 0.7654, + "step": 119540 + }, + { + "epoch": 1.9189714120611887, + "grad_norm": 1.0395115613937378, + "learning_rate": 2.0229221764555273e-07, + "loss": 0.5677, + "step": 119550 + }, + { + "epoch": 1.9191319282813528, + "grad_norm": 1.0601686239242554, + "learning_rate": 2.0149261406896825e-07, + "loss": 0.5755, + "step": 119560 + }, + { + "epoch": 1.919292444501517, + "grad_norm": 1.2550280094146729, + "learning_rate": 2.006945875318994e-07, + "loss": 0.7672, + "step": 119570 + }, + { + "epoch": 1.919452960721681, + "grad_norm": 1.213916540145874, + "learning_rate": 1.9989813808510006e-07, + "loss": 0.6021, + "step": 119580 + }, + { + "epoch": 1.919613476941845, + "grad_norm": 0.9033512473106384, + "learning_rate": 1.991032657792158e-07, + "loss": 0.8413, + "step": 119590 + }, + { + "epoch": 1.919773993162009, + "grad_norm": 0.9635958671569824, + "learning_rate": 1.9830997066479784e-07, + "loss": 0.7084, + "step": 119600 + }, + { + "epoch": 1.919934509382173, + "grad_norm": 0.9818968772888184, + "learning_rate": 1.9751825279229474e-07, + "loss": 0.6651, + "step": 119610 + }, + { + "epoch": 1.920095025602337, + "grad_norm": 0.8200908303260803, + "learning_rate": 1.9672811221205512e-07, + "loss": 0.7175, + "step": 119620 + }, + { + "epoch": 1.9202555418225011, + "grad_norm": 1.6745070219039917, + "learning_rate": 1.9593954897432765e-07, + "loss": 0.6663, + "step": 119630 + }, + { + "epoch": 1.9204160580426652, + "grad_norm": 1.4975297451019287, + "learning_rate": 1.9515256312925834e-07, + "loss": 0.7796, + "step": 119640 + }, + { + "epoch": 1.9205765742628294, + "grad_norm": 1.1302735805511475, + "learning_rate": 1.943671547268988e-07, + "loss": 0.7538, + "step": 119650 + }, + { + "epoch": 1.9207370904829935, + "grad_norm": 0.8965901136398315, + "learning_rate": 1.9358332381719245e-07, + "loss": 0.6831, + "step": 119660 + }, + { + "epoch": 1.9208976067031573, + "grad_norm": 1.1579023599624634, + "learning_rate": 1.928010704499855e-07, + "loss": 0.6663, + "step": 119670 + }, + { + "epoch": 1.9210581229233215, + "grad_norm": 1.544571042060852, + "learning_rate": 1.9202039467502985e-07, + "loss": 0.6339, + "step": 119680 + }, + { + "epoch": 1.9212186391434853, + "grad_norm": 0.9779437780380249, + "learning_rate": 1.912412965419691e-07, + "loss": 0.7369, + "step": 119690 + }, + { + "epoch": 1.9213791553636494, + "grad_norm": 0.9388981461524963, + "learning_rate": 1.904637761003497e-07, + "loss": 0.7421, + "step": 119700 + }, + { + "epoch": 1.9215396715838136, + "grad_norm": 1.40340256690979, + "learning_rate": 1.8968783339961548e-07, + "loss": 0.6677, + "step": 119710 + }, + { + "epoch": 1.9217001878039777, + "grad_norm": 0.8065948486328125, + "learning_rate": 1.8891346848911306e-07, + "loss": 0.7501, + "step": 119720 + }, + { + "epoch": 1.9218607040241418, + "grad_norm": 1.0859498977661133, + "learning_rate": 1.8814068141808638e-07, + "loss": 0.6652, + "step": 119730 + }, + { + "epoch": 1.9220212202443057, + "grad_norm": 1.4176748991012573, + "learning_rate": 1.8736947223568502e-07, + "loss": 0.7532, + "step": 119740 + }, + { + "epoch": 1.9221817364644698, + "grad_norm": 2.4212961196899414, + "learning_rate": 1.8659984099094752e-07, + "loss": 0.6737, + "step": 119750 + }, + { + "epoch": 1.9223422526846337, + "grad_norm": 1.1079676151275635, + "learning_rate": 1.8583178773282084e-07, + "loss": 0.6487, + "step": 119760 + }, + { + "epoch": 1.9225027689047978, + "grad_norm": 2.393653154373169, + "learning_rate": 1.8506531251014925e-07, + "loss": 0.7257, + "step": 119770 + }, + { + "epoch": 1.9226632851249619, + "grad_norm": 1.331305980682373, + "learning_rate": 1.8430041537167152e-07, + "loss": 0.7836, + "step": 119780 + }, + { + "epoch": 1.922823801345126, + "grad_norm": 0.9170563817024231, + "learning_rate": 1.8353709636603765e-07, + "loss": 0.6053, + "step": 119790 + }, + { + "epoch": 1.92298431756529, + "grad_norm": 0.956602156162262, + "learning_rate": 1.8277535554178383e-07, + "loss": 0.6525, + "step": 119800 + }, + { + "epoch": 1.923144833785454, + "grad_norm": 1.263824701309204, + "learning_rate": 1.8201519294735737e-07, + "loss": 0.6864, + "step": 119810 + }, + { + "epoch": 1.923305350005618, + "grad_norm": 1.2896569967269897, + "learning_rate": 1.8125660863109738e-07, + "loss": 0.6444, + "step": 119820 + }, + { + "epoch": 1.923465866225782, + "grad_norm": 0.9012478590011597, + "learning_rate": 1.804996026412459e-07, + "loss": 0.5927, + "step": 119830 + }, + { + "epoch": 1.923626382445946, + "grad_norm": 0.658074140548706, + "learning_rate": 1.797441750259421e-07, + "loss": 0.6844, + "step": 119840 + }, + { + "epoch": 1.9237868986661102, + "grad_norm": 1.5501770973205566, + "learning_rate": 1.7899032583323095e-07, + "loss": 0.7522, + "step": 119850 + }, + { + "epoch": 1.9239474148862743, + "grad_norm": 0.9384968280792236, + "learning_rate": 1.7823805511104906e-07, + "loss": 0.7387, + "step": 119860 + }, + { + "epoch": 1.9241079311064384, + "grad_norm": 1.0685036182403564, + "learning_rate": 1.7748736290723877e-07, + "loss": 0.7064, + "step": 119870 + }, + { + "epoch": 1.9242684473266025, + "grad_norm": 0.8266396522521973, + "learning_rate": 1.7673824926953963e-07, + "loss": 0.7079, + "step": 119880 + }, + { + "epoch": 1.9244289635467664, + "grad_norm": 0.9305410981178284, + "learning_rate": 1.7599071424558855e-07, + "loss": 0.713, + "step": 119890 + }, + { + "epoch": 1.9245894797669303, + "grad_norm": 0.9974923729896545, + "learning_rate": 1.7524475788292806e-07, + "loss": 0.673, + "step": 119900 + }, + { + "epoch": 1.9247499959870944, + "grad_norm": 1.3197813034057617, + "learning_rate": 1.745003802289924e-07, + "loss": 0.6087, + "step": 119910 + }, + { + "epoch": 1.9249105122072585, + "grad_norm": 1.8415356874465942, + "learning_rate": 1.737575813311215e-07, + "loss": 0.6488, + "step": 119920 + }, + { + "epoch": 1.9250710284274226, + "grad_norm": 1.0545247793197632, + "learning_rate": 1.7301636123655262e-07, + "loss": 0.593, + "step": 119930 + }, + { + "epoch": 1.9252315446475867, + "grad_norm": 1.1640665531158447, + "learning_rate": 1.722767199924258e-07, + "loss": 0.6533, + "step": 119940 + }, + { + "epoch": 1.9253920608677508, + "grad_norm": 1.1696062088012695, + "learning_rate": 1.7153865764577282e-07, + "loss": 0.7018, + "step": 119950 + }, + { + "epoch": 1.9255525770879147, + "grad_norm": 2.466172218322754, + "learning_rate": 1.7080217424353396e-07, + "loss": 0.7349, + "step": 119960 + }, + { + "epoch": 1.9257130933080788, + "grad_norm": 0.8941882252693176, + "learning_rate": 1.7006726983254673e-07, + "loss": 0.8598, + "step": 119970 + }, + { + "epoch": 1.9258736095282427, + "grad_norm": 0.6936658620834351, + "learning_rate": 1.6933394445954042e-07, + "loss": 0.6783, + "step": 119980 + }, + { + "epoch": 1.9260341257484068, + "grad_norm": 0.9235751032829285, + "learning_rate": 1.6860219817115552e-07, + "loss": 0.7626, + "step": 119990 + }, + { + "epoch": 1.926194641968571, + "grad_norm": 1.3109455108642578, + "learning_rate": 1.67872031013927e-07, + "loss": 0.711, + "step": 120000 + }, + { + "epoch": 1.926194641968571, + "eval_loss": 0.768983781337738, + "eval_runtime": 1834.0408, + "eval_samples_per_second": 14.302, + "eval_steps_per_second": 1.788, + "step": 120000 + }, + { + "epoch": 1.926355158188735, + "grad_norm": 1.0862035751342773, + "learning_rate": 1.671434430342872e-07, + "loss": 0.8248, + "step": 120010 + }, + { + "epoch": 1.9265156744088991, + "grad_norm": 0.7636085748672485, + "learning_rate": 1.6641643427856845e-07, + "loss": 0.6534, + "step": 120020 + }, + { + "epoch": 1.926676190629063, + "grad_norm": 0.8438390493392944, + "learning_rate": 1.656910047930088e-07, + "loss": 0.7392, + "step": 120030 + }, + { + "epoch": 1.9268367068492271, + "grad_norm": 1.2702875137329102, + "learning_rate": 1.6496715462373524e-07, + "loss": 0.7053, + "step": 120040 + }, + { + "epoch": 1.926997223069391, + "grad_norm": 1.0808581113815308, + "learning_rate": 1.6424488381678593e-07, + "loss": 0.7396, + "step": 120050 + }, + { + "epoch": 1.9271577392895551, + "grad_norm": 1.668398380279541, + "learning_rate": 1.6352419241809081e-07, + "loss": 0.743, + "step": 120060 + }, + { + "epoch": 1.9273182555097192, + "grad_norm": 1.7068824768066406, + "learning_rate": 1.6280508047347986e-07, + "loss": 0.744, + "step": 120070 + }, + { + "epoch": 1.9274787717298834, + "grad_norm": 1.325596809387207, + "learning_rate": 1.6208754802868876e-07, + "loss": 0.7677, + "step": 120080 + }, + { + "epoch": 1.9276392879500475, + "grad_norm": 0.8684011697769165, + "learning_rate": 1.6137159512934485e-07, + "loss": 0.656, + "step": 120090 + }, + { + "epoch": 1.9277998041702114, + "grad_norm": 1.418387532234192, + "learning_rate": 1.606572218209812e-07, + "loss": 0.7295, + "step": 120100 + }, + { + "epoch": 1.9279603203903755, + "grad_norm": 0.7767691016197205, + "learning_rate": 1.5994442814902254e-07, + "loss": 0.6503, + "step": 120110 + }, + { + "epoch": 1.9281208366105393, + "grad_norm": 1.1412923336029053, + "learning_rate": 1.5923321415880488e-07, + "loss": 0.7784, + "step": 120120 + }, + { + "epoch": 1.9282813528307035, + "grad_norm": 2.4072563648223877, + "learning_rate": 1.5852357989555312e-07, + "loss": 0.6818, + "step": 120130 + }, + { + "epoch": 1.9284418690508676, + "grad_norm": 1.0117356777191162, + "learning_rate": 1.5781552540439781e-07, + "loss": 0.7331, + "step": 120140 + }, + { + "epoch": 1.9286023852710317, + "grad_norm": 1.0518232583999634, + "learning_rate": 1.5710905073036687e-07, + "loss": 0.7303, + "step": 120150 + }, + { + "epoch": 1.9287629014911958, + "grad_norm": 1.0448724031448364, + "learning_rate": 1.5640415591838542e-07, + "loss": 0.7683, + "step": 120160 + }, + { + "epoch": 1.9289234177113599, + "grad_norm": 1.177223563194275, + "learning_rate": 1.5570084101328431e-07, + "loss": 0.6825, + "step": 120170 + }, + { + "epoch": 1.9290839339315238, + "grad_norm": 1.0738669633865356, + "learning_rate": 1.5499910605978606e-07, + "loss": 0.7225, + "step": 120180 + }, + { + "epoch": 1.9292444501516877, + "grad_norm": 0.9346128106117249, + "learning_rate": 1.5429895110252167e-07, + "loss": 0.649, + "step": 120190 + }, + { + "epoch": 1.9294049663718518, + "grad_norm": 1.5988034009933472, + "learning_rate": 1.5360037618601387e-07, + "loss": 0.6167, + "step": 120200 + }, + { + "epoch": 1.9295654825920159, + "grad_norm": 1.034642219543457, + "learning_rate": 1.5290338135468817e-07, + "loss": 0.6845, + "step": 120210 + }, + { + "epoch": 1.92972599881218, + "grad_norm": 1.2770907878875732, + "learning_rate": 1.522079666528703e-07, + "loss": 0.7545, + "step": 120220 + }, + { + "epoch": 1.929886515032344, + "grad_norm": 0.753699004650116, + "learning_rate": 1.5151413212478317e-07, + "loss": 0.6265, + "step": 120230 + }, + { + "epoch": 1.9300470312525082, + "grad_norm": 1.2031999826431274, + "learning_rate": 1.5082187781454982e-07, + "loss": 0.7688, + "step": 120240 + }, + { + "epoch": 1.930207547472672, + "grad_norm": 1.1747325658798218, + "learning_rate": 1.5013120376619893e-07, + "loss": 0.7832, + "step": 120250 + }, + { + "epoch": 1.9303680636928362, + "grad_norm": 3.609107494354248, + "learning_rate": 1.4944211002364815e-07, + "loss": 0.7442, + "step": 120260 + }, + { + "epoch": 1.930528579913, + "grad_norm": 1.551377534866333, + "learning_rate": 1.4875459663072077e-07, + "loss": 0.6601, + "step": 120270 + }, + { + "epoch": 1.9306890961331642, + "grad_norm": 1.0286248922348022, + "learning_rate": 1.4806866363114013e-07, + "loss": 0.6728, + "step": 120280 + }, + { + "epoch": 1.9308496123533283, + "grad_norm": 1.4758729934692383, + "learning_rate": 1.4738431106852413e-07, + "loss": 0.7407, + "step": 120290 + }, + { + "epoch": 1.9310101285734924, + "grad_norm": 1.1701178550720215, + "learning_rate": 1.4670153898639626e-07, + "loss": 0.6082, + "step": 120300 + }, + { + "epoch": 1.9311706447936565, + "grad_norm": 1.2306303977966309, + "learning_rate": 1.4602034742817737e-07, + "loss": 0.6938, + "step": 120310 + }, + { + "epoch": 1.9313311610138204, + "grad_norm": 0.8013895153999329, + "learning_rate": 1.4534073643718838e-07, + "loss": 0.612, + "step": 120320 + }, + { + "epoch": 1.9314916772339845, + "grad_norm": 0.9508262872695923, + "learning_rate": 1.4466270605664466e-07, + "loss": 0.6922, + "step": 120330 + }, + { + "epoch": 1.9316521934541484, + "grad_norm": 0.9083688259124756, + "learning_rate": 1.4398625632966457e-07, + "loss": 0.7978, + "step": 120340 + }, + { + "epoch": 1.9318127096743125, + "grad_norm": 1.303667426109314, + "learning_rate": 1.4331138729927197e-07, + "loss": 0.7744, + "step": 120350 + }, + { + "epoch": 1.9319732258944766, + "grad_norm": 0.9593427181243896, + "learning_rate": 1.4263809900837977e-07, + "loss": 0.6716, + "step": 120360 + }, + { + "epoch": 1.9321337421146407, + "grad_norm": 1.8121243715286255, + "learning_rate": 1.419663914998065e-07, + "loss": 0.6673, + "step": 120370 + }, + { + "epoch": 1.9322942583348048, + "grad_norm": 0.9492111802101135, + "learning_rate": 1.41296264816268e-07, + "loss": 0.6794, + "step": 120380 + }, + { + "epoch": 1.932454774554969, + "grad_norm": 1.1226648092269897, + "learning_rate": 1.4062771900038296e-07, + "loss": 0.7793, + "step": 120390 + }, + { + "epoch": 1.9326152907751328, + "grad_norm": 3.653974771499634, + "learning_rate": 1.399607540946646e-07, + "loss": 0.683, + "step": 120400 + }, + { + "epoch": 1.9327758069952967, + "grad_norm": 0.8787305355072021, + "learning_rate": 1.3929537014152893e-07, + "loss": 0.681, + "step": 120410 + }, + { + "epoch": 1.9329363232154608, + "grad_norm": 0.8967496752738953, + "learning_rate": 1.386315671832894e-07, + "loss": 0.7325, + "step": 120420 + }, + { + "epoch": 1.933096839435625, + "grad_norm": 0.9929953217506409, + "learning_rate": 1.3796934526216222e-07, + "loss": 0.6222, + "step": 120430 + }, + { + "epoch": 1.933257355655789, + "grad_norm": 0.9496781826019287, + "learning_rate": 1.373087044202581e-07, + "loss": 0.6579, + "step": 120440 + }, + { + "epoch": 1.9334178718759532, + "grad_norm": 1.246519923210144, + "learning_rate": 1.3664964469959352e-07, + "loss": 0.6632, + "step": 120450 + }, + { + "epoch": 1.9335783880961173, + "grad_norm": 1.0654369592666626, + "learning_rate": 1.3599216614207655e-07, + "loss": 0.7554, + "step": 120460 + }, + { + "epoch": 1.9337389043162811, + "grad_norm": 0.8056846857070923, + "learning_rate": 1.3533626878952098e-07, + "loss": 0.6677, + "step": 120470 + }, + { + "epoch": 1.9338994205364453, + "grad_norm": 1.1669632196426392, + "learning_rate": 1.3468195268363794e-07, + "loss": 0.7442, + "step": 120480 + }, + { + "epoch": 1.9340599367566091, + "grad_norm": 0.9375160336494446, + "learning_rate": 1.340292178660385e-07, + "loss": 0.6832, + "step": 120490 + }, + { + "epoch": 1.9342204529767733, + "grad_norm": 1.459387183189392, + "learning_rate": 1.333780643782312e-07, + "loss": 0.6395, + "step": 120500 + }, + { + "epoch": 1.9343809691969374, + "grad_norm": 1.165335774421692, + "learning_rate": 1.327284922616273e-07, + "loss": 0.646, + "step": 120510 + }, + { + "epoch": 1.9345414854171015, + "grad_norm": 1.0884612798690796, + "learning_rate": 1.3208050155753548e-07, + "loss": 0.8102, + "step": 120520 + }, + { + "epoch": 1.9347020016372656, + "grad_norm": 0.9643990993499756, + "learning_rate": 1.314340923071644e-07, + "loss": 0.6436, + "step": 120530 + }, + { + "epoch": 1.9348625178574295, + "grad_norm": 1.1547443866729736, + "learning_rate": 1.307892645516201e-07, + "loss": 0.7329, + "step": 120540 + }, + { + "epoch": 1.9350230340775936, + "grad_norm": 0.9697743058204651, + "learning_rate": 1.3014601833191141e-07, + "loss": 0.6535, + "step": 120550 + }, + { + "epoch": 1.9351835502977575, + "grad_norm": 1.3671436309814453, + "learning_rate": 1.2950435368894455e-07, + "loss": 0.7041, + "step": 120560 + }, + { + "epoch": 1.9353440665179216, + "grad_norm": 1.1416985988616943, + "learning_rate": 1.2886427066352846e-07, + "loss": 0.7525, + "step": 120570 + }, + { + "epoch": 1.9355045827380857, + "grad_norm": 1.1125541925430298, + "learning_rate": 1.2822576929636397e-07, + "loss": 0.7445, + "step": 120580 + }, + { + "epoch": 1.9356650989582498, + "grad_norm": 1.5269043445587158, + "learning_rate": 1.2758884962805751e-07, + "loss": 0.5963, + "step": 120590 + }, + { + "epoch": 1.935825615178414, + "grad_norm": 1.2119464874267578, + "learning_rate": 1.2695351169911274e-07, + "loss": 0.6811, + "step": 120600 + }, + { + "epoch": 1.9359861313985778, + "grad_norm": 1.1119564771652222, + "learning_rate": 1.2631975554993624e-07, + "loss": 0.7367, + "step": 120610 + }, + { + "epoch": 1.936146647618742, + "grad_norm": 0.9785775542259216, + "learning_rate": 1.2568758122083191e-07, + "loss": 0.6686, + "step": 120620 + }, + { + "epoch": 1.9363071638389058, + "grad_norm": 1.3190809488296509, + "learning_rate": 1.2505698875199812e-07, + "loss": 0.6678, + "step": 120630 + }, + { + "epoch": 1.9364676800590699, + "grad_norm": 0.8745836615562439, + "learning_rate": 1.2442797818353892e-07, + "loss": 0.6006, + "step": 120640 + }, + { + "epoch": 1.936628196279234, + "grad_norm": 1.3094980716705322, + "learning_rate": 1.2380054955545283e-07, + "loss": 0.6673, + "step": 120650 + }, + { + "epoch": 1.936788712499398, + "grad_norm": 2.1653707027435303, + "learning_rate": 1.2317470290764688e-07, + "loss": 0.7018, + "step": 120660 + }, + { + "epoch": 1.9369492287195622, + "grad_norm": 1.2403745651245117, + "learning_rate": 1.2255043827991696e-07, + "loss": 0.7769, + "step": 120670 + }, + { + "epoch": 1.9371097449397263, + "grad_norm": 1.7712693214416504, + "learning_rate": 1.2192775571196191e-07, + "loss": 0.7773, + "step": 120680 + }, + { + "epoch": 1.9372702611598902, + "grad_norm": 0.8477523922920227, + "learning_rate": 1.2130665524338336e-07, + "loss": 0.6528, + "step": 120690 + }, + { + "epoch": 1.937430777380054, + "grad_norm": 1.0109567642211914, + "learning_rate": 1.206871369136775e-07, + "loss": 0.7287, + "step": 120700 + }, + { + "epoch": 1.9375912936002182, + "grad_norm": 1.0286526679992676, + "learning_rate": 1.2006920076224336e-07, + "loss": 0.6136, + "step": 120710 + }, + { + "epoch": 1.9377518098203823, + "grad_norm": 1.0639806985855103, + "learning_rate": 1.194528468283801e-07, + "loss": 0.6543, + "step": 120720 + }, + { + "epoch": 1.9379123260405464, + "grad_norm": 0.8262003660202026, + "learning_rate": 1.1883807515127854e-07, + "loss": 0.7937, + "step": 120730 + }, + { + "epoch": 1.9380728422607105, + "grad_norm": 0.8972403407096863, + "learning_rate": 1.1822488577003799e-07, + "loss": 0.72, + "step": 120740 + }, + { + "epoch": 1.9382333584808746, + "grad_norm": 1.0649811029434204, + "learning_rate": 1.1761327872365501e-07, + "loss": 0.7093, + "step": 120750 + }, + { + "epoch": 1.9383938747010385, + "grad_norm": 1.2757656574249268, + "learning_rate": 1.1700325405102353e-07, + "loss": 0.6832, + "step": 120760 + }, + { + "epoch": 1.9385543909212026, + "grad_norm": 0.8996958136558533, + "learning_rate": 1.163948117909347e-07, + "loss": 0.5886, + "step": 120770 + }, + { + "epoch": 1.9387149071413665, + "grad_norm": 1.5653842687606812, + "learning_rate": 1.1578795198208536e-07, + "loss": 0.7027, + "step": 120780 + }, + { + "epoch": 1.9388754233615306, + "grad_norm": 1.3821933269500732, + "learning_rate": 1.1518267466306409e-07, + "loss": 0.8098, + "step": 120790 + }, + { + "epoch": 1.9390359395816947, + "grad_norm": 1.359155535697937, + "learning_rate": 1.1457897987236787e-07, + "loss": 0.766, + "step": 120800 + }, + { + "epoch": 1.9391964558018588, + "grad_norm": 1.1875603199005127, + "learning_rate": 1.1397686764838544e-07, + "loss": 0.8147, + "step": 120810 + }, + { + "epoch": 1.939356972022023, + "grad_norm": 0.9375169277191162, + "learning_rate": 1.1337633802940839e-07, + "loss": 0.7236, + "step": 120820 + }, + { + "epoch": 1.9395174882421868, + "grad_norm": 1.4089248180389404, + "learning_rate": 1.1277739105362839e-07, + "loss": 0.6721, + "step": 120830 + }, + { + "epoch": 1.939678004462351, + "grad_norm": 1.2585468292236328, + "learning_rate": 1.1218002675912886e-07, + "loss": 0.6712, + "step": 120840 + }, + { + "epoch": 1.9398385206825148, + "grad_norm": 1.5960265398025513, + "learning_rate": 1.115842451839072e-07, + "loss": 0.6372, + "step": 120850 + }, + { + "epoch": 1.939999036902679, + "grad_norm": 0.7710285186767578, + "learning_rate": 1.1099004636584421e-07, + "loss": 0.7121, + "step": 120860 + }, + { + "epoch": 1.940159553122843, + "grad_norm": 1.421939730644226, + "learning_rate": 1.1039743034273186e-07, + "loss": 0.6096, + "step": 120870 + }, + { + "epoch": 1.9403200693430072, + "grad_norm": 0.9910525679588318, + "learning_rate": 1.0980639715225671e-07, + "loss": 0.7439, + "step": 120880 + }, + { + "epoch": 1.9404805855631713, + "grad_norm": 1.1827492713928223, + "learning_rate": 1.0921694683200257e-07, + "loss": 0.5182, + "step": 120890 + }, + { + "epoch": 1.9406411017833352, + "grad_norm": 0.9195522665977478, + "learning_rate": 1.0862907941945888e-07, + "loss": 0.784, + "step": 120900 + }, + { + "epoch": 1.9408016180034993, + "grad_norm": 0.8755864500999451, + "learning_rate": 1.0804279495200409e-07, + "loss": 0.674, + "step": 120910 + }, + { + "epoch": 1.9409621342236631, + "grad_norm": 1.375059723854065, + "learning_rate": 1.0745809346693059e-07, + "loss": 0.7341, + "step": 120920 + }, + { + "epoch": 1.9411226504438273, + "grad_norm": 1.250290036201477, + "learning_rate": 1.0687497500141419e-07, + "loss": 0.6792, + "step": 120930 + }, + { + "epoch": 1.9412831666639914, + "grad_norm": 0.787898063659668, + "learning_rate": 1.0629343959254468e-07, + "loss": 0.6244, + "step": 120940 + }, + { + "epoch": 1.9414436828841555, + "grad_norm": 0.9687046408653259, + "learning_rate": 1.057134872773008e-07, + "loss": 0.6529, + "step": 120950 + }, + { + "epoch": 1.9416041991043196, + "grad_norm": 1.2579929828643799, + "learning_rate": 1.0513511809256416e-07, + "loss": 0.6864, + "step": 120960 + }, + { + "epoch": 1.9417647153244837, + "grad_norm": 1.0502783060073853, + "learning_rate": 1.0455833207511645e-07, + "loss": 0.7453, + "step": 120970 + }, + { + "epoch": 1.9419252315446476, + "grad_norm": 1.3711156845092773, + "learning_rate": 1.0398312926163668e-07, + "loss": 0.6309, + "step": 120980 + }, + { + "epoch": 1.9420857477648117, + "grad_norm": 1.1734752655029297, + "learning_rate": 1.0340950968870388e-07, + "loss": 0.5803, + "step": 120990 + }, + { + "epoch": 1.9422462639849756, + "grad_norm": 1.0425041913986206, + "learning_rate": 1.0283747339280003e-07, + "loss": 0.6723, + "step": 121000 + }, + { + "epoch": 1.9424067802051397, + "grad_norm": 1.2462266683578491, + "learning_rate": 1.0226702041030156e-07, + "loss": 0.6901, + "step": 121010 + }, + { + "epoch": 1.9425672964253038, + "grad_norm": 0.9358248114585876, + "learning_rate": 1.0169815077748501e-07, + "loss": 0.6362, + "step": 121020 + }, + { + "epoch": 1.942727812645468, + "grad_norm": 1.1762701272964478, + "learning_rate": 1.0113086453052701e-07, + "loss": 0.7171, + "step": 121030 + }, + { + "epoch": 1.942888328865632, + "grad_norm": 1.2536249160766602, + "learning_rate": 1.0056516170550423e-07, + "loss": 0.6703, + "step": 121040 + }, + { + "epoch": 1.943048845085796, + "grad_norm": 1.079546570777893, + "learning_rate": 1.0000104233839346e-07, + "loss": 0.6939, + "step": 121050 + }, + { + "epoch": 1.94320936130596, + "grad_norm": 0.929488480091095, + "learning_rate": 9.94385064650688e-08, + "loss": 0.6346, + "step": 121060 + }, + { + "epoch": 1.943369877526124, + "grad_norm": 1.1326857805252075, + "learning_rate": 9.88775541213044e-08, + "loss": 0.6414, + "step": 121070 + }, + { + "epoch": 1.943530393746288, + "grad_norm": 0.8064638376235962, + "learning_rate": 9.831818534277171e-08, + "loss": 0.6956, + "step": 121080 + }, + { + "epoch": 1.943690909966452, + "grad_norm": 0.771818995475769, + "learning_rate": 9.776040016504228e-08, + "loss": 0.7293, + "step": 121090 + }, + { + "epoch": 1.9438514261866162, + "grad_norm": 0.8349019885063171, + "learning_rate": 9.720419862359331e-08, + "loss": 0.553, + "step": 121100 + }, + { + "epoch": 1.9440119424067803, + "grad_norm": 1.3086040019989014, + "learning_rate": 9.664958075379093e-08, + "loss": 0.7544, + "step": 121110 + }, + { + "epoch": 1.9441724586269442, + "grad_norm": 1.10867440700531, + "learning_rate": 9.609654659090695e-08, + "loss": 0.8013, + "step": 121120 + }, + { + "epoch": 1.9443329748471083, + "grad_norm": 1.1061261892318726, + "learning_rate": 9.55450961701132e-08, + "loss": 0.7303, + "step": 121130 + }, + { + "epoch": 1.9444934910672722, + "grad_norm": 0.7252489924430847, + "learning_rate": 9.49952295264761e-08, + "loss": 0.8028, + "step": 121140 + }, + { + "epoch": 1.9446540072874363, + "grad_norm": 0.7540103793144226, + "learning_rate": 9.44469466949649e-08, + "loss": 0.5312, + "step": 121150 + }, + { + "epoch": 1.9448145235076004, + "grad_norm": 0.9018798470497131, + "learning_rate": 9.390024771044614e-08, + "loss": 0.7325, + "step": 121160 + }, + { + "epoch": 1.9449750397277645, + "grad_norm": 0.9200366139411926, + "learning_rate": 9.335513260768925e-08, + "loss": 0.5932, + "step": 121170 + }, + { + "epoch": 1.9451355559479286, + "grad_norm": 0.9527754783630371, + "learning_rate": 9.281160142135815e-08, + "loss": 0.6587, + "step": 121180 + }, + { + "epoch": 1.9452960721680927, + "grad_norm": 1.096612572669983, + "learning_rate": 9.226965418601963e-08, + "loss": 0.7007, + "step": 121190 + }, + { + "epoch": 1.9454565883882566, + "grad_norm": 3.8984858989715576, + "learning_rate": 9.172929093613502e-08, + "loss": 0.5723, + "step": 121200 + }, + { + "epoch": 1.9456171046084205, + "grad_norm": 0.997075080871582, + "learning_rate": 9.119051170607406e-08, + "loss": 0.6525, + "step": 121210 + }, + { + "epoch": 1.9457776208285846, + "grad_norm": 1.2752143144607544, + "learning_rate": 9.065331653009546e-08, + "loss": 0.8851, + "step": 121220 + }, + { + "epoch": 1.9459381370487487, + "grad_norm": 0.8397300839424133, + "learning_rate": 9.011770544236076e-08, + "loss": 0.6744, + "step": 121230 + }, + { + "epoch": 1.9460986532689128, + "grad_norm": 1.071371078491211, + "learning_rate": 8.958367847693716e-08, + "loss": 0.6836, + "step": 121240 + }, + { + "epoch": 1.946259169489077, + "grad_norm": 0.4616483151912689, + "learning_rate": 8.905123566778084e-08, + "loss": 0.6397, + "step": 121250 + }, + { + "epoch": 1.946419685709241, + "grad_norm": 1.313394546508789, + "learning_rate": 8.85203770487536e-08, + "loss": 0.737, + "step": 121260 + }, + { + "epoch": 1.946580201929405, + "grad_norm": 1.0177562236785889, + "learning_rate": 8.799110265361455e-08, + "loss": 0.7471, + "step": 121270 + }, + { + "epoch": 1.946740718149569, + "grad_norm": 1.1288752555847168, + "learning_rate": 8.746341251602285e-08, + "loss": 0.8006, + "step": 121280 + }, + { + "epoch": 1.946901234369733, + "grad_norm": 1.4129984378814697, + "learning_rate": 8.693730666953504e-08, + "loss": 0.7538, + "step": 121290 + }, + { + "epoch": 1.947061750589897, + "grad_norm": 1.36524498462677, + "learning_rate": 8.64127851476132e-08, + "loss": 0.6763, + "step": 121300 + }, + { + "epoch": 1.9472222668100612, + "grad_norm": 1.0940581560134888, + "learning_rate": 8.588984798360567e-08, + "loss": 0.7378, + "step": 121310 + }, + { + "epoch": 1.9473827830302253, + "grad_norm": 0.936551034450531, + "learning_rate": 8.53684952107775e-08, + "loss": 0.8408, + "step": 121320 + }, + { + "epoch": 1.9475432992503894, + "grad_norm": 1.1769384145736694, + "learning_rate": 8.484872686227718e-08, + "loss": 0.7211, + "step": 121330 + }, + { + "epoch": 1.9477038154705533, + "grad_norm": 1.4107606410980225, + "learning_rate": 8.433054297115883e-08, + "loss": 0.7653, + "step": 121340 + }, + { + "epoch": 1.9478643316907174, + "grad_norm": 0.8721921443939209, + "learning_rate": 8.381394357037942e-08, + "loss": 0.7001, + "step": 121350 + }, + { + "epoch": 1.9480248479108813, + "grad_norm": 0.8751373887062073, + "learning_rate": 8.329892869279043e-08, + "loss": 0.6265, + "step": 121360 + }, + { + "epoch": 1.9481853641310454, + "grad_norm": 2.073667287826538, + "learning_rate": 8.278549837114346e-08, + "loss": 0.747, + "step": 121370 + }, + { + "epoch": 1.9483458803512095, + "grad_norm": 1.4540828466415405, + "learning_rate": 8.227365263809018e-08, + "loss": 0.6278, + "step": 121380 + }, + { + "epoch": 1.9485063965713736, + "grad_norm": 1.4105502367019653, + "learning_rate": 8.17633915261795e-08, + "loss": 0.6497, + "step": 121390 + }, + { + "epoch": 1.9486669127915377, + "grad_norm": 1.580857276916504, + "learning_rate": 8.125471506786053e-08, + "loss": 0.5724, + "step": 121400 + }, + { + "epoch": 1.9488274290117016, + "grad_norm": 1.5386015176773071, + "learning_rate": 8.074762329548513e-08, + "loss": 0.5961, + "step": 121410 + }, + { + "epoch": 1.9489879452318657, + "grad_norm": 1.4290289878845215, + "learning_rate": 8.024211624129974e-08, + "loss": 0.7118, + "step": 121420 + }, + { + "epoch": 1.9491484614520296, + "grad_norm": 1.061155915260315, + "learning_rate": 7.973819393745085e-08, + "loss": 0.6564, + "step": 121430 + }, + { + "epoch": 1.9493089776721937, + "grad_norm": 1.051579236984253, + "learning_rate": 7.923585641598508e-08, + "loss": 0.6371, + "step": 121440 + }, + { + "epoch": 1.9494694938923578, + "grad_norm": 1.239102840423584, + "learning_rate": 7.873510370885184e-08, + "loss": 0.6247, + "step": 121450 + }, + { + "epoch": 1.949630010112522, + "grad_norm": 0.9349039196968079, + "learning_rate": 7.823593584788958e-08, + "loss": 0.6713, + "step": 121460 + }, + { + "epoch": 1.949790526332686, + "grad_norm": 0.8485777378082275, + "learning_rate": 7.773835286484787e-08, + "loss": 0.6735, + "step": 121470 + }, + { + "epoch": 1.9499510425528501, + "grad_norm": 0.9880431890487671, + "learning_rate": 7.72423547913681e-08, + "loss": 0.574, + "step": 121480 + }, + { + "epoch": 1.950111558773014, + "grad_norm": 1.0467791557312012, + "learning_rate": 7.674794165899169e-08, + "loss": 0.7752, + "step": 121490 + }, + { + "epoch": 1.950272074993178, + "grad_norm": 0.9809511303901672, + "learning_rate": 7.625511349916015e-08, + "loss": 0.7374, + "step": 121500 + }, + { + "epoch": 1.950432591213342, + "grad_norm": 0.8101708292961121, + "learning_rate": 7.576387034322063e-08, + "loss": 0.7242, + "step": 121510 + }, + { + "epoch": 1.9505931074335061, + "grad_norm": 1.476906657218933, + "learning_rate": 7.527421222240371e-08, + "loss": 0.8034, + "step": 121520 + }, + { + "epoch": 1.9507536236536702, + "grad_norm": 1.1708528995513916, + "learning_rate": 7.478613916785671e-08, + "loss": 0.6177, + "step": 121530 + }, + { + "epoch": 1.9509141398738343, + "grad_norm": 1.1725192070007324, + "learning_rate": 7.42996512106131e-08, + "loss": 0.7096, + "step": 121540 + }, + { + "epoch": 1.9510746560939984, + "grad_norm": 0.8443190455436707, + "learning_rate": 7.381474838161484e-08, + "loss": 0.6288, + "step": 121550 + }, + { + "epoch": 1.9512351723141623, + "grad_norm": 0.5403175354003906, + "learning_rate": 7.333143071169557e-08, + "loss": 0.7056, + "step": 121560 + }, + { + "epoch": 1.9513956885343264, + "grad_norm": 1.7069084644317627, + "learning_rate": 7.284969823159183e-08, + "loss": 0.6571, + "step": 121570 + }, + { + "epoch": 1.9515562047544903, + "grad_norm": 1.4859663248062134, + "learning_rate": 7.23695509719402e-08, + "loss": 0.7637, + "step": 121580 + }, + { + "epoch": 1.9517167209746544, + "grad_norm": 0.7937588691711426, + "learning_rate": 7.189098896327461e-08, + "loss": 0.8304, + "step": 121590 + }, + { + "epoch": 1.9518772371948185, + "grad_norm": 1.542056679725647, + "learning_rate": 7.141401223602906e-08, + "loss": 0.715, + "step": 121600 + }, + { + "epoch": 1.9520377534149826, + "grad_norm": 1.2220906019210815, + "learning_rate": 7.093862082053759e-08, + "loss": 0.7336, + "step": 121610 + }, + { + "epoch": 1.9521982696351468, + "grad_norm": 1.0557963848114014, + "learning_rate": 7.046481474702881e-08, + "loss": 0.6675, + "step": 121620 + }, + { + "epoch": 1.9523587858553106, + "grad_norm": 1.3879344463348389, + "learning_rate": 6.999259404563974e-08, + "loss": 0.7113, + "step": 121630 + }, + { + "epoch": 1.9525193020754747, + "grad_norm": 1.374497890472412, + "learning_rate": 6.952195874639356e-08, + "loss": 0.8432, + "step": 121640 + }, + { + "epoch": 1.9526798182956386, + "grad_norm": 0.873794674873352, + "learning_rate": 6.905290887922744e-08, + "loss": 0.6495, + "step": 121650 + }, + { + "epoch": 1.9528403345158027, + "grad_norm": 0.6914440393447876, + "learning_rate": 6.858544447396476e-08, + "loss": 0.6782, + "step": 121660 + }, + { + "epoch": 1.9530008507359669, + "grad_norm": 0.9127739071846008, + "learning_rate": 6.81195655603345e-08, + "loss": 0.762, + "step": 121670 + }, + { + "epoch": 1.953161366956131, + "grad_norm": 1.1423968076705933, + "learning_rate": 6.765527216796575e-08, + "loss": 0.741, + "step": 121680 + }, + { + "epoch": 1.953321883176295, + "grad_norm": 1.5985212326049805, + "learning_rate": 6.71925643263821e-08, + "loss": 0.7931, + "step": 121690 + }, + { + "epoch": 1.953482399396459, + "grad_norm": 0.9558928608894348, + "learning_rate": 6.67314420650128e-08, + "loss": 0.6877, + "step": 121700 + }, + { + "epoch": 1.953642915616623, + "grad_norm": 1.1818838119506836, + "learning_rate": 6.627190541317885e-08, + "loss": 0.6938, + "step": 121710 + }, + { + "epoch": 1.953803431836787, + "grad_norm": 0.9784784317016602, + "learning_rate": 6.581395440010407e-08, + "loss": 0.7482, + "step": 121720 + }, + { + "epoch": 1.953963948056951, + "grad_norm": 1.6717437505722046, + "learning_rate": 6.535758905491518e-08, + "loss": 0.7729, + "step": 121730 + }, + { + "epoch": 1.9541244642771152, + "grad_norm": 1.5412579774856567, + "learning_rate": 6.49028094066334e-08, + "loss": 0.7748, + "step": 121740 + }, + { + "epoch": 1.9542849804972793, + "grad_norm": 1.3341437578201294, + "learning_rate": 6.444961548417727e-08, + "loss": 0.7211, + "step": 121750 + }, + { + "epoch": 1.9544454967174434, + "grad_norm": 1.2715340852737427, + "learning_rate": 6.399800731636818e-08, + "loss": 0.7417, + "step": 121760 + }, + { + "epoch": 1.9546060129376075, + "grad_norm": 0.7400960326194763, + "learning_rate": 6.35479849319276e-08, + "loss": 0.6398, + "step": 121770 + }, + { + "epoch": 1.9547665291577714, + "grad_norm": 0.768482506275177, + "learning_rate": 6.309954835947152e-08, + "loss": 0.6851, + "step": 121780 + }, + { + "epoch": 1.9549270453779355, + "grad_norm": 0.8178081512451172, + "learning_rate": 6.26526976275188e-08, + "loss": 0.7108, + "step": 121790 + }, + { + "epoch": 1.9550875615980994, + "grad_norm": 1.6191288232803345, + "learning_rate": 6.220743276449115e-08, + "loss": 0.7328, + "step": 121800 + }, + { + "epoch": 1.9552480778182635, + "grad_norm": 1.2235851287841797, + "learning_rate": 6.176375379869648e-08, + "loss": 0.7275, + "step": 121810 + }, + { + "epoch": 1.9554085940384276, + "grad_norm": 1.076729655265808, + "learning_rate": 6.132166075835665e-08, + "loss": 0.6337, + "step": 121820 + }, + { + "epoch": 1.9555691102585917, + "grad_norm": 1.0490641593933105, + "learning_rate": 6.088115367158254e-08, + "loss": 0.6567, + "step": 121830 + }, + { + "epoch": 1.9557296264787558, + "grad_norm": 2.127504825592041, + "learning_rate": 6.044223256639059e-08, + "loss": 0.6827, + "step": 121840 + }, + { + "epoch": 1.9558901426989197, + "grad_norm": 0.8246901631355286, + "learning_rate": 6.000489747069183e-08, + "loss": 0.6208, + "step": 121850 + }, + { + "epoch": 1.9560506589190838, + "grad_norm": 1.1056246757507324, + "learning_rate": 5.95691484123001e-08, + "loss": 0.8346, + "step": 121860 + }, + { + "epoch": 1.9562111751392477, + "grad_norm": 1.0823460817337036, + "learning_rate": 5.913498541892381e-08, + "loss": 0.5875, + "step": 121870 + }, + { + "epoch": 1.9563716913594118, + "grad_norm": 0.9143173694610596, + "learning_rate": 5.8702408518174215e-08, + "loss": 0.6428, + "step": 121880 + }, + { + "epoch": 1.956532207579576, + "grad_norm": 0.903348445892334, + "learning_rate": 5.827141773755984e-08, + "loss": 0.7612, + "step": 121890 + }, + { + "epoch": 1.95669272379974, + "grad_norm": 1.4153441190719604, + "learning_rate": 5.784201310449211e-08, + "loss": 0.6586, + "step": 121900 + }, + { + "epoch": 1.9568532400199041, + "grad_norm": 0.855265200138092, + "learning_rate": 5.741419464627695e-08, + "loss": 0.689, + "step": 121910 + }, + { + "epoch": 1.957013756240068, + "grad_norm": 1.095076322555542, + "learning_rate": 5.698796239012039e-08, + "loss": 0.7041, + "step": 121920 + }, + { + "epoch": 1.9571742724602321, + "grad_norm": 1.393550157546997, + "learning_rate": 5.656331636312851e-08, + "loss": 0.6248, + "step": 121930 + }, + { + "epoch": 1.957334788680396, + "grad_norm": 1.3059712648391724, + "learning_rate": 5.614025659230471e-08, + "loss": 0.864, + "step": 121940 + }, + { + "epoch": 1.9574953049005601, + "grad_norm": 1.3495856523513794, + "learning_rate": 5.571878310455802e-08, + "loss": 0.7894, + "step": 121950 + }, + { + "epoch": 1.9576558211207242, + "grad_norm": 1.6258646249771118, + "learning_rate": 5.5298895926686464e-08, + "loss": 0.7799, + "step": 121960 + }, + { + "epoch": 1.9578163373408883, + "grad_norm": 1.219002604484558, + "learning_rate": 5.4880595085396446e-08, + "loss": 0.8203, + "step": 121970 + }, + { + "epoch": 1.9579768535610524, + "grad_norm": 1.5142837762832642, + "learning_rate": 5.446388060728613e-08, + "loss": 0.8633, + "step": 121980 + }, + { + "epoch": 1.9581373697812166, + "grad_norm": 0.6787332892417908, + "learning_rate": 5.4048752518859326e-08, + "loss": 0.7018, + "step": 121990 + }, + { + "epoch": 1.9582978860013804, + "grad_norm": 0.8519037961959839, + "learning_rate": 5.3635210846511576e-08, + "loss": 0.7101, + "step": 122000 + }, + { + "epoch": 1.9584584022215443, + "grad_norm": 1.6771259307861328, + "learning_rate": 5.322325561654684e-08, + "loss": 0.7454, + "step": 122010 + }, + { + "epoch": 1.9586189184417084, + "grad_norm": 0.6861966848373413, + "learning_rate": 5.281288685515806e-08, + "loss": 0.7126, + "step": 122020 + }, + { + "epoch": 1.9587794346618725, + "grad_norm": 1.068734049797058, + "learning_rate": 5.240410458844658e-08, + "loss": 0.7938, + "step": 122030 + }, + { + "epoch": 1.9589399508820367, + "grad_norm": 0.7626992464065552, + "learning_rate": 5.199690884240549e-08, + "loss": 0.7494, + "step": 122040 + }, + { + "epoch": 1.9591004671022008, + "grad_norm": 0.9346611499786377, + "learning_rate": 5.1591299642930745e-08, + "loss": 0.7008, + "step": 122050 + }, + { + "epoch": 1.9592609833223649, + "grad_norm": 0.8134592771530151, + "learning_rate": 5.1187277015815607e-08, + "loss": 0.7619, + "step": 122060 + }, + { + "epoch": 1.9594214995425288, + "grad_norm": 0.8110677599906921, + "learning_rate": 5.078484098675618e-08, + "loss": 0.6698, + "step": 122070 + }, + { + "epoch": 1.9595820157626929, + "grad_norm": 1.7031548023223877, + "learning_rate": 5.038399158134588e-08, + "loss": 0.7567, + "step": 122080 + }, + { + "epoch": 1.9597425319828568, + "grad_norm": 0.9693776369094849, + "learning_rate": 4.998472882507266e-08, + "loss": 0.6359, + "step": 122090 + }, + { + "epoch": 1.9599030482030209, + "grad_norm": 1.0908609628677368, + "learning_rate": 4.958705274332731e-08, + "loss": 0.8007, + "step": 122100 + }, + { + "epoch": 1.960063564423185, + "grad_norm": 0.6011602282524109, + "learning_rate": 4.91909633614035e-08, + "loss": 0.7751, + "step": 122110 + }, + { + "epoch": 1.960224080643349, + "grad_norm": 0.8667726516723633, + "learning_rate": 4.87964607044894e-08, + "loss": 0.7071, + "step": 122120 + }, + { + "epoch": 1.9603845968635132, + "grad_norm": 1.575619101524353, + "learning_rate": 4.840354479767051e-08, + "loss": 0.6283, + "step": 122130 + }, + { + "epoch": 1.960545113083677, + "grad_norm": 0.8985066413879395, + "learning_rate": 4.801221566593239e-08, + "loss": 0.5869, + "step": 122140 + }, + { + "epoch": 1.9607056293038412, + "grad_norm": 1.0647186040878296, + "learning_rate": 4.7622473334166253e-08, + "loss": 0.7339, + "step": 122150 + }, + { + "epoch": 1.960866145524005, + "grad_norm": 1.0988080501556396, + "learning_rate": 4.723431782715504e-08, + "loss": 0.6946, + "step": 122160 + }, + { + "epoch": 1.9610266617441692, + "grad_norm": 1.260762095451355, + "learning_rate": 4.6847749169584565e-08, + "loss": 0.7178, + "step": 122170 + }, + { + "epoch": 1.9611871779643333, + "grad_norm": 0.7835880517959595, + "learning_rate": 4.6462767386035166e-08, + "loss": 0.7374, + "step": 122180 + }, + { + "epoch": 1.9613476941844974, + "grad_norm": 0.8834766745567322, + "learning_rate": 4.6079372500992815e-08, + "loss": 0.7164, + "step": 122190 + }, + { + "epoch": 1.9615082104046615, + "grad_norm": 1.1799043416976929, + "learning_rate": 4.5697564538838e-08, + "loss": 0.6916, + "step": 122200 + }, + { + "epoch": 1.9616687266248254, + "grad_norm": 1.0109364986419678, + "learning_rate": 4.5317343523848534e-08, + "loss": 0.8341, + "step": 122210 + }, + { + "epoch": 1.9618292428449895, + "grad_norm": 0.9090092778205872, + "learning_rate": 4.493870948021062e-08, + "loss": 0.6175, + "step": 122220 + }, + { + "epoch": 1.9619897590651534, + "grad_norm": 1.3001917600631714, + "learning_rate": 4.456166243199667e-08, + "loss": 0.6853, + "step": 122230 + }, + { + "epoch": 1.9621502752853175, + "grad_norm": 0.9449962973594666, + "learning_rate": 4.418620240318749e-08, + "loss": 0.7583, + "step": 122240 + }, + { + "epoch": 1.9623107915054816, + "grad_norm": 1.0314977169036865, + "learning_rate": 4.3812329417661224e-08, + "loss": 0.631, + "step": 122250 + }, + { + "epoch": 1.9624713077256457, + "grad_norm": 1.4433753490447998, + "learning_rate": 4.3440043499193283e-08, + "loss": 0.7559, + "step": 122260 + }, + { + "epoch": 1.9626318239458098, + "grad_norm": 0.9203351736068726, + "learning_rate": 4.30693446714564e-08, + "loss": 0.7492, + "step": 122270 + }, + { + "epoch": 1.962792340165974, + "grad_norm": 1.112657904624939, + "learning_rate": 4.270023295802894e-08, + "loss": 0.7527, + "step": 122280 + }, + { + "epoch": 1.9629528563861378, + "grad_norm": 0.9075795412063599, + "learning_rate": 4.233270838238101e-08, + "loss": 0.5383, + "step": 122290 + }, + { + "epoch": 1.963113372606302, + "grad_norm": 1.2236086130142212, + "learning_rate": 4.196677096788559e-08, + "loss": 0.7478, + "step": 122300 + }, + { + "epoch": 1.9632738888264658, + "grad_norm": 0.6472920775413513, + "learning_rate": 4.160242073781573e-08, + "loss": 0.7023, + "step": 122310 + }, + { + "epoch": 1.96343440504663, + "grad_norm": 1.1668157577514648, + "learning_rate": 4.1239657715341774e-08, + "loss": 0.7338, + "step": 122320 + }, + { + "epoch": 1.963594921266794, + "grad_norm": 1.4762235879898071, + "learning_rate": 4.0878481923531386e-08, + "loss": 0.7132, + "step": 122330 + }, + { + "epoch": 1.9637554374869581, + "grad_norm": 1.6177616119384766, + "learning_rate": 4.051889338535508e-08, + "loss": 0.683, + "step": 122340 + }, + { + "epoch": 1.9639159537071222, + "grad_norm": 1.043927788734436, + "learning_rate": 4.0160892123680684e-08, + "loss": 0.7771, + "step": 122350 + }, + { + "epoch": 1.9640764699272861, + "grad_norm": 0.7115771174430847, + "learning_rate": 3.9804478161273303e-08, + "loss": 0.7427, + "step": 122360 + }, + { + "epoch": 1.9642369861474502, + "grad_norm": 1.1482837200164795, + "learning_rate": 3.944965152079816e-08, + "loss": 0.4899, + "step": 122370 + }, + { + "epoch": 1.9643975023676141, + "grad_norm": 0.7986605167388916, + "learning_rate": 3.90964122248233e-08, + "loss": 0.7242, + "step": 122380 + }, + { + "epoch": 1.9645580185877782, + "grad_norm": 1.0601003170013428, + "learning_rate": 3.874476029581131e-08, + "loss": 0.7709, + "step": 122390 + }, + { + "epoch": 1.9647185348079423, + "grad_norm": 0.8459908962249756, + "learning_rate": 3.839469575612209e-08, + "loss": 0.7321, + "step": 122400 + }, + { + "epoch": 1.9648790510281064, + "grad_norm": 0.9614055752754211, + "learning_rate": 3.8046218628023935e-08, + "loss": 0.7036, + "step": 122410 + }, + { + "epoch": 1.9650395672482706, + "grad_norm": 1.1296335458755493, + "learning_rate": 3.769932893367134e-08, + "loss": 0.7787, + "step": 122420 + }, + { + "epoch": 1.9652000834684344, + "grad_norm": 1.2680703401565552, + "learning_rate": 3.7354026695129996e-08, + "loss": 0.6765, + "step": 122430 + }, + { + "epoch": 1.9653605996885986, + "grad_norm": 0.9417257905006409, + "learning_rate": 3.7010311934354555e-08, + "loss": 0.7612, + "step": 122440 + }, + { + "epoch": 1.9655211159087624, + "grad_norm": 1.2712844610214233, + "learning_rate": 3.6668184673208095e-08, + "loss": 0.6691, + "step": 122450 + }, + { + "epoch": 1.9656816321289265, + "grad_norm": 1.4501703977584839, + "learning_rate": 3.632764493344265e-08, + "loss": 0.705, + "step": 122460 + }, + { + "epoch": 1.9658421483490907, + "grad_norm": 1.168477177619934, + "learning_rate": 3.598869273671868e-08, + "loss": 0.6582, + "step": 122470 + }, + { + "epoch": 1.9660026645692548, + "grad_norm": 2.18932843208313, + "learning_rate": 3.565132810459115e-08, + "loss": 0.6561, + "step": 122480 + }, + { + "epoch": 1.9661631807894189, + "grad_norm": 0.9927055239677429, + "learning_rate": 3.531555105850959e-08, + "loss": 0.705, + "step": 122490 + }, + { + "epoch": 1.966323697009583, + "grad_norm": 0.8189218640327454, + "learning_rate": 3.4981361619834676e-08, + "loss": 0.7717, + "step": 122500 + }, + { + "epoch": 1.9664842132297469, + "grad_norm": 1.0854305028915405, + "learning_rate": 3.46487598098133e-08, + "loss": 0.6673, + "step": 122510 + }, + { + "epoch": 1.9666447294499108, + "grad_norm": 0.9539245963096619, + "learning_rate": 3.4317745649600775e-08, + "loss": 0.7833, + "step": 122520 + }, + { + "epoch": 1.9668052456700749, + "grad_norm": 0.9010229706764221, + "learning_rate": 3.398831916024414e-08, + "loss": 0.5893, + "step": 122530 + }, + { + "epoch": 1.966965761890239, + "grad_norm": 0.7954834699630737, + "learning_rate": 3.366048036269609e-08, + "loss": 0.7061, + "step": 122540 + }, + { + "epoch": 1.967126278110403, + "grad_norm": 0.7084245681762695, + "learning_rate": 3.333422927780383e-08, + "loss": 0.6313, + "step": 122550 + }, + { + "epoch": 1.9672867943305672, + "grad_norm": 0.8124331831932068, + "learning_rate": 3.3009565926314655e-08, + "loss": 0.8986, + "step": 122560 + }, + { + "epoch": 1.9674473105507313, + "grad_norm": 0.8235214948654175, + "learning_rate": 3.2686490328875944e-08, + "loss": 0.6831, + "step": 122570 + }, + { + "epoch": 1.9676078267708952, + "grad_norm": 1.3935973644256592, + "learning_rate": 3.236500250603236e-08, + "loss": 0.7285, + "step": 122580 + }, + { + "epoch": 1.9677683429910593, + "grad_norm": 1.1235154867172241, + "learning_rate": 3.2045102478231446e-08, + "loss": 0.7655, + "step": 122590 + }, + { + "epoch": 1.9679288592112232, + "grad_norm": 0.8512223958969116, + "learning_rate": 3.172679026581249e-08, + "loss": 0.6579, + "step": 122600 + }, + { + "epoch": 1.9680893754313873, + "grad_norm": 1.7656270265579224, + "learning_rate": 3.1410065889023175e-08, + "loss": 0.5478, + "step": 122610 + }, + { + "epoch": 1.9682498916515514, + "grad_norm": 0.9789643883705139, + "learning_rate": 3.109492936800018e-08, + "loss": 0.7242, + "step": 122620 + }, + { + "epoch": 1.9684104078717155, + "grad_norm": 1.3254286050796509, + "learning_rate": 3.078138072278858e-08, + "loss": 0.6678, + "step": 122630 + }, + { + "epoch": 1.9685709240918796, + "grad_norm": 1.0651572942733765, + "learning_rate": 3.046941997332797e-08, + "loss": 0.6988, + "step": 122640 + }, + { + "epoch": 1.9687314403120435, + "grad_norm": 0.8565966486930847, + "learning_rate": 3.015904713945528e-08, + "loss": 0.7053, + "step": 122650 + }, + { + "epoch": 1.9688919565322076, + "grad_norm": 0.9816609621047974, + "learning_rate": 2.9850262240907476e-08, + "loss": 0.7265, + "step": 122660 + }, + { + "epoch": 1.9690524727523715, + "grad_norm": 1.0118058919906616, + "learning_rate": 2.9543065297324423e-08, + "loss": 0.664, + "step": 122670 + }, + { + "epoch": 1.9692129889725356, + "grad_norm": 1.4320837259292603, + "learning_rate": 2.9237456328240486e-08, + "loss": 0.7956, + "step": 122680 + }, + { + "epoch": 1.9693735051926997, + "grad_norm": 1.104966163635254, + "learning_rate": 2.89334353530929e-08, + "loss": 0.6375, + "step": 122690 + }, + { + "epoch": 1.9695340214128638, + "grad_norm": 1.455540418624878, + "learning_rate": 2.8631002391210638e-08, + "loss": 0.7527, + "step": 122700 + }, + { + "epoch": 1.969694537633028, + "grad_norm": 1.4407076835632324, + "learning_rate": 2.8330157461831096e-08, + "loss": 0.6713, + "step": 122710 + }, + { + "epoch": 1.9698550538531918, + "grad_norm": 1.2111713886260986, + "learning_rate": 2.803090058408342e-08, + "loss": 0.6441, + "step": 122720 + }, + { + "epoch": 1.970015570073356, + "grad_norm": 1.1800458431243896, + "learning_rate": 2.7733231776999603e-08, + "loss": 0.5893, + "step": 122730 + }, + { + "epoch": 1.9701760862935198, + "grad_norm": 0.9407630562782288, + "learning_rate": 2.7437151059511722e-08, + "loss": 0.6942, + "step": 122740 + }, + { + "epoch": 1.970336602513684, + "grad_norm": 0.835047721862793, + "learning_rate": 2.714265845044639e-08, + "loss": 0.6415, + "step": 122750 + }, + { + "epoch": 1.970497118733848, + "grad_norm": 0.9764014482498169, + "learning_rate": 2.6849753968533063e-08, + "loss": 0.7834, + "step": 122760 + }, + { + "epoch": 1.9706576349540121, + "grad_norm": 0.9895374178886414, + "learning_rate": 2.6558437632395737e-08, + "loss": 0.6849, + "step": 122770 + }, + { + "epoch": 1.9708181511741762, + "grad_norm": 1.4406731128692627, + "learning_rate": 2.6268709460564032e-08, + "loss": 0.6074, + "step": 122780 + }, + { + "epoch": 1.9709786673943404, + "grad_norm": 0.9876652956008911, + "learning_rate": 2.5980569471459327e-08, + "loss": 0.7498, + "step": 122790 + }, + { + "epoch": 1.9711391836145042, + "grad_norm": 1.7194918394088745, + "learning_rate": 2.5694017683411398e-08, + "loss": 0.8028, + "step": 122800 + }, + { + "epoch": 1.9712996998346681, + "grad_norm": 0.9212223291397095, + "learning_rate": 2.540905411463623e-08, + "loss": 0.5635, + "step": 122810 + }, + { + "epoch": 1.9714602160548322, + "grad_norm": 0.7585978507995605, + "learning_rate": 2.5125678783260997e-08, + "loss": 0.759, + "step": 122820 + }, + { + "epoch": 1.9716207322749963, + "grad_norm": 0.9788730144500732, + "learning_rate": 2.484389170730461e-08, + "loss": 0.6652, + "step": 122830 + }, + { + "epoch": 1.9717812484951605, + "grad_norm": 1.5172582864761353, + "learning_rate": 2.456369290468885e-08, + "loss": 0.6056, + "step": 122840 + }, + { + "epoch": 1.9719417647153246, + "grad_norm": 0.8226392269134521, + "learning_rate": 2.4285082393230018e-08, + "loss": 0.6681, + "step": 122850 + }, + { + "epoch": 1.9721022809354887, + "grad_norm": 0.9638616442680359, + "learning_rate": 2.4008060190647276e-08, + "loss": 0.7246, + "step": 122860 + }, + { + "epoch": 1.9722627971556526, + "grad_norm": 1.3672584295272827, + "learning_rate": 2.373262631455708e-08, + "loss": 0.7155, + "step": 122870 + }, + { + "epoch": 1.9724233133758167, + "grad_norm": 1.4744020700454712, + "learning_rate": 2.3458780782475985e-08, + "loss": 0.6862, + "step": 122880 + }, + { + "epoch": 1.9725838295959806, + "grad_norm": 1.1369332075119019, + "learning_rate": 2.318652361182061e-08, + "loss": 0.6925, + "step": 122890 + }, + { + "epoch": 1.9727443458161447, + "grad_norm": 1.5708973407745361, + "learning_rate": 2.2915854819902106e-08, + "loss": 0.6029, + "step": 122900 + }, + { + "epoch": 1.9729048620363088, + "grad_norm": 0.965470552444458, + "learning_rate": 2.264677442393448e-08, + "loss": 0.7035, + "step": 122910 + }, + { + "epoch": 1.9730653782564729, + "grad_norm": 1.3887450695037842, + "learning_rate": 2.2379282441031823e-08, + "loss": 0.6516, + "step": 122920 + }, + { + "epoch": 1.973225894476637, + "grad_norm": 1.1999164819717407, + "learning_rate": 2.2113378888199975e-08, + "loss": 0.6471, + "step": 122930 + }, + { + "epoch": 1.9733864106968009, + "grad_norm": 1.7217835187911987, + "learning_rate": 2.184906378235596e-08, + "loss": 0.8605, + "step": 122940 + }, + { + "epoch": 1.973546926916965, + "grad_norm": 0.9692628383636475, + "learning_rate": 2.1586337140300228e-08, + "loss": 0.6117, + "step": 122950 + }, + { + "epoch": 1.9737074431371289, + "grad_norm": 1.240110158920288, + "learning_rate": 2.1325198978747184e-08, + "loss": 0.6469, + "step": 122960 + }, + { + "epoch": 1.973867959357293, + "grad_norm": 0.7816643118858337, + "learning_rate": 2.1065649314302992e-08, + "loss": 0.7077, + "step": 122970 + }, + { + "epoch": 1.974028475577457, + "grad_norm": 1.936659574508667, + "learning_rate": 2.0807688163468342e-08, + "loss": 0.7091, + "step": 122980 + }, + { + "epoch": 1.9741889917976212, + "grad_norm": 1.0404560565948486, + "learning_rate": 2.0551315542655103e-08, + "loss": 0.6695, + "step": 122990 + }, + { + "epoch": 1.9743495080177853, + "grad_norm": 1.4032467603683472, + "learning_rate": 2.029653146816135e-08, + "loss": 0.8606, + "step": 123000 + }, + { + "epoch": 1.9745100242379492, + "grad_norm": 0.8561841249465942, + "learning_rate": 2.0043335956193564e-08, + "loss": 0.6798, + "step": 123010 + }, + { + "epoch": 1.9746705404581133, + "grad_norm": 1.0476784706115723, + "learning_rate": 1.9791729022852756e-08, + "loss": 0.7489, + "step": 123020 + }, + { + "epoch": 1.9748310566782772, + "grad_norm": 1.0943020582199097, + "learning_rate": 1.9541710684137236e-08, + "loss": 0.6346, + "step": 123030 + }, + { + "epoch": 1.9749915728984413, + "grad_norm": 1.2360713481903076, + "learning_rate": 1.9293280955950955e-08, + "loss": 0.7309, + "step": 123040 + }, + { + "epoch": 1.9751520891186054, + "grad_norm": 1.3524527549743652, + "learning_rate": 1.9046439854086827e-08, + "loss": 0.6491, + "step": 123050 + }, + { + "epoch": 1.9753126053387695, + "grad_norm": 0.8295291066169739, + "learning_rate": 1.8801187394248965e-08, + "loss": 0.7758, + "step": 123060 + }, + { + "epoch": 1.9754731215589336, + "grad_norm": 1.4088263511657715, + "learning_rate": 1.855752359202767e-08, + "loss": 0.694, + "step": 123070 + }, + { + "epoch": 1.9756336377790977, + "grad_norm": 0.935021162033081, + "learning_rate": 1.8315448462924433e-08, + "loss": 0.6845, + "step": 123080 + }, + { + "epoch": 1.9757941539992616, + "grad_norm": 0.7779065370559692, + "learning_rate": 1.8074962022329723e-08, + "loss": 0.6306, + "step": 123090 + }, + { + "epoch": 1.9759546702194257, + "grad_norm": 1.3484190702438354, + "learning_rate": 1.7836064285539633e-08, + "loss": 0.6825, + "step": 123100 + }, + { + "epoch": 1.9761151864395896, + "grad_norm": 1.1303837299346924, + "learning_rate": 1.7598755267744794e-08, + "loss": 0.7905, + "step": 123110 + }, + { + "epoch": 1.9762757026597537, + "grad_norm": 0.9524754285812378, + "learning_rate": 1.736303498403591e-08, + "loss": 0.7267, + "step": 123120 + }, + { + "epoch": 1.9764362188799178, + "grad_norm": 0.8980169892311096, + "learning_rate": 1.712890344940654e-08, + "loss": 0.7248, + "step": 123130 + }, + { + "epoch": 1.976596735100082, + "grad_norm": 0.8352994322776794, + "learning_rate": 1.6896360678744782e-08, + "loss": 0.6379, + "step": 123140 + }, + { + "epoch": 1.976757251320246, + "grad_norm": 0.9977482557296753, + "learning_rate": 1.6665406686836026e-08, + "loss": 0.776, + "step": 123150 + }, + { + "epoch": 1.97691776754041, + "grad_norm": 1.293986439704895, + "learning_rate": 1.64360414883713e-08, + "loss": 0.7111, + "step": 123160 + }, + { + "epoch": 1.977078283760574, + "grad_norm": 0.7850258350372314, + "learning_rate": 1.620826509793616e-08, + "loss": 0.6616, + "step": 123170 + }, + { + "epoch": 1.977238799980738, + "grad_norm": 1.4405428171157837, + "learning_rate": 1.5982077530016237e-08, + "loss": 0.6672, + "step": 123180 + }, + { + "epoch": 1.977399316200902, + "grad_norm": 1.0320807695388794, + "learning_rate": 1.5757478798994473e-08, + "loss": 0.7564, + "step": 123190 + }, + { + "epoch": 1.9775598324210661, + "grad_norm": 0.8279857039451599, + "learning_rate": 1.5534468919153887e-08, + "loss": 0.6868, + "step": 123200 + }, + { + "epoch": 1.9777203486412303, + "grad_norm": 0.7608804106712341, + "learning_rate": 1.531304790467758e-08, + "loss": 0.638, + "step": 123210 + }, + { + "epoch": 1.9778808648613944, + "grad_norm": 0.7649704217910767, + "learning_rate": 1.5093215769645953e-08, + "loss": 0.6215, + "step": 123220 + }, + { + "epoch": 1.9780413810815582, + "grad_norm": 1.2454780340194702, + "learning_rate": 1.4874972528039488e-08, + "loss": 0.7381, + "step": 123230 + }, + { + "epoch": 1.9782018973017224, + "grad_norm": 1.5926413536071777, + "learning_rate": 1.4658318193735976e-08, + "loss": 0.7159, + "step": 123240 + }, + { + "epoch": 1.9783624135218862, + "grad_norm": 1.1233676671981812, + "learning_rate": 1.4443252780516059e-08, + "loss": 0.6832, + "step": 123250 + }, + { + "epoch": 1.9785229297420504, + "grad_norm": 0.9559934139251709, + "learning_rate": 1.4229776302052133e-08, + "loss": 0.6737, + "step": 123260 + }, + { + "epoch": 1.9786834459622145, + "grad_norm": 0.8381972312927246, + "learning_rate": 1.4017888771925003e-08, + "loss": 0.6567, + "step": 123270 + }, + { + "epoch": 1.9788439621823786, + "grad_norm": 1.238882064819336, + "learning_rate": 1.3807590203607224e-08, + "loss": 0.7067, + "step": 123280 + }, + { + "epoch": 1.9790044784025427, + "grad_norm": 0.8123843669891357, + "learning_rate": 1.3598880610471431e-08, + "loss": 0.6936, + "step": 123290 + }, + { + "epoch": 1.9791649946227068, + "grad_norm": 0.5457252860069275, + "learning_rate": 1.339176000579312e-08, + "loss": 0.8286, + "step": 123300 + }, + { + "epoch": 1.9793255108428707, + "grad_norm": 0.9415172338485718, + "learning_rate": 1.3186228402742307e-08, + "loss": 0.7241, + "step": 123310 + }, + { + "epoch": 1.9794860270630346, + "grad_norm": 1.2180455923080444, + "learning_rate": 1.2982285814389095e-08, + "loss": 0.7492, + "step": 123320 + }, + { + "epoch": 1.9796465432831987, + "grad_norm": 1.0708106756210327, + "learning_rate": 1.2779932253703663e-08, + "loss": 0.7571, + "step": 123330 + }, + { + "epoch": 1.9798070595033628, + "grad_norm": 1.6639302968978882, + "learning_rate": 1.2579167733553498e-08, + "loss": 0.7855, + "step": 123340 + }, + { + "epoch": 1.9799675757235269, + "grad_norm": 1.3051503896713257, + "learning_rate": 1.237999226670894e-08, + "loss": 0.668, + "step": 123350 + }, + { + "epoch": 1.980128091943691, + "grad_norm": 0.9699289798736572, + "learning_rate": 1.218240586583208e-08, + "loss": 0.8986, + "step": 123360 + }, + { + "epoch": 1.980288608163855, + "grad_norm": 1.0018768310546875, + "learning_rate": 1.1986408543493421e-08, + "loss": 0.6465, + "step": 123370 + }, + { + "epoch": 1.980449124384019, + "grad_norm": 0.6863855719566345, + "learning_rate": 1.179200031215244e-08, + "loss": 0.7121, + "step": 123380 + }, + { + "epoch": 1.980609640604183, + "grad_norm": 1.5571469068527222, + "learning_rate": 1.1599181184174247e-08, + "loss": 0.7494, + "step": 123390 + }, + { + "epoch": 1.980770156824347, + "grad_norm": 2.525444746017456, + "learning_rate": 1.1407951171821252e-08, + "loss": 0.7227, + "step": 123400 + }, + { + "epoch": 1.980930673044511, + "grad_norm": 0.7371312379837036, + "learning_rate": 1.121831028725595e-08, + "loss": 0.7682, + "step": 123410 + }, + { + "epoch": 1.9810911892646752, + "grad_norm": 1.3356318473815918, + "learning_rate": 1.1030258542535366e-08, + "loss": 0.688, + "step": 123420 + }, + { + "epoch": 1.9812517054848393, + "grad_norm": 1.04508376121521, + "learning_rate": 1.0843795949619373e-08, + "loss": 0.7876, + "step": 123430 + }, + { + "epoch": 1.9814122217050034, + "grad_norm": 0.8550383448600769, + "learning_rate": 1.0658922520367931e-08, + "loss": 0.6613, + "step": 123440 + }, + { + "epoch": 1.9815727379251673, + "grad_norm": 1.2727051973342896, + "learning_rate": 1.0475638266535526e-08, + "loss": 0.7909, + "step": 123450 + }, + { + "epoch": 1.9817332541453314, + "grad_norm": 1.5021718740463257, + "learning_rate": 1.02939431997795e-08, + "loss": 0.6984, + "step": 123460 + }, + { + "epoch": 1.9818937703654953, + "grad_norm": 0.9354788661003113, + "learning_rate": 1.0113837331654496e-08, + "loss": 0.5964, + "step": 123470 + }, + { + "epoch": 1.9820542865856594, + "grad_norm": 1.3943530321121216, + "learning_rate": 9.93532067361247e-09, + "loss": 0.775, + "step": 123480 + }, + { + "epoch": 1.9822148028058235, + "grad_norm": 1.0902496576309204, + "learning_rate": 9.758393237008223e-09, + "loss": 0.7409, + "step": 123490 + }, + { + "epoch": 1.9823753190259876, + "grad_norm": 0.8719119429588318, + "learning_rate": 9.583055033091092e-09, + "loss": 0.6644, + "step": 123500 + }, + { + "epoch": 1.9825358352461517, + "grad_norm": 1.3908638954162598, + "learning_rate": 9.409306073013269e-09, + "loss": 0.7889, + "step": 123510 + }, + { + "epoch": 1.9826963514663156, + "grad_norm": 1.1280336380004883, + "learning_rate": 9.237146367824245e-09, + "loss": 0.7396, + "step": 123520 + }, + { + "epoch": 1.9828568676864797, + "grad_norm": 1.5023304224014282, + "learning_rate": 9.066575928473598e-09, + "loss": 0.8196, + "step": 123530 + }, + { + "epoch": 1.9830173839066436, + "grad_norm": 1.0767098665237427, + "learning_rate": 8.897594765802652e-09, + "loss": 0.6935, + "step": 123540 + }, + { + "epoch": 1.9831779001268077, + "grad_norm": 1.386794090270996, + "learning_rate": 8.730202890563922e-09, + "loss": 0.6444, + "step": 123550 + }, + { + "epoch": 1.9833384163469718, + "grad_norm": 1.089401125907898, + "learning_rate": 8.564400313401666e-09, + "loss": 0.7313, + "step": 123560 + }, + { + "epoch": 1.983498932567136, + "grad_norm": 0.8801611661911011, + "learning_rate": 8.400187044857455e-09, + "loss": 0.6332, + "step": 123570 + }, + { + "epoch": 1.9836594487873, + "grad_norm": 1.5234993696212769, + "learning_rate": 8.237563095375712e-09, + "loss": 0.6543, + "step": 123580 + }, + { + "epoch": 1.9838199650074642, + "grad_norm": 1.19259774684906, + "learning_rate": 8.076528475298162e-09, + "loss": 0.7938, + "step": 123590 + }, + { + "epoch": 1.983980481227628, + "grad_norm": 1.3543034791946411, + "learning_rate": 7.917083194866615e-09, + "loss": 0.657, + "step": 123600 + }, + { + "epoch": 1.9841409974477922, + "grad_norm": 1.0487624406814575, + "learning_rate": 7.759227264217406e-09, + "loss": 0.7484, + "step": 123610 + }, + { + "epoch": 1.984301513667956, + "grad_norm": 1.1310741901397705, + "learning_rate": 7.602960693395278e-09, + "loss": 0.7979, + "step": 123620 + }, + { + "epoch": 1.9844620298881201, + "grad_norm": 1.4454644918441772, + "learning_rate": 7.448283492331176e-09, + "loss": 0.7565, + "step": 123630 + }, + { + "epoch": 1.9846225461082843, + "grad_norm": 2.4904677867889404, + "learning_rate": 7.295195670867228e-09, + "loss": 0.628, + "step": 123640 + }, + { + "epoch": 1.9847830623284484, + "grad_norm": 0.8226103782653809, + "learning_rate": 7.143697238734537e-09, + "loss": 0.7081, + "step": 123650 + }, + { + "epoch": 1.9849435785486125, + "grad_norm": 1.0723118782043457, + "learning_rate": 6.9937882055698404e-09, + "loss": 0.7534, + "step": 123660 + }, + { + "epoch": 1.9851040947687764, + "grad_norm": 1.1291072368621826, + "learning_rate": 6.8454685809071775e-09, + "loss": 0.7553, + "step": 123670 + }, + { + "epoch": 1.9852646109889405, + "grad_norm": 1.1902506351470947, + "learning_rate": 6.698738374177893e-09, + "loss": 0.6442, + "step": 123680 + }, + { + "epoch": 1.9854251272091044, + "grad_norm": 1.0925747156143188, + "learning_rate": 6.553597594713412e-09, + "loss": 0.7415, + "step": 123690 + }, + { + "epoch": 1.9855856434292685, + "grad_norm": 0.8823854923248291, + "learning_rate": 6.410046251742463e-09, + "loss": 0.8036, + "step": 123700 + }, + { + "epoch": 1.9857461596494326, + "grad_norm": 0.8575567007064819, + "learning_rate": 6.268084354393855e-09, + "loss": 0.7063, + "step": 123710 + }, + { + "epoch": 1.9859066758695967, + "grad_norm": 0.872783362865448, + "learning_rate": 6.1277119116992525e-09, + "loss": 0.6357, + "step": 123720 + }, + { + "epoch": 1.9860671920897608, + "grad_norm": 1.1346999406814575, + "learning_rate": 5.9889289325820715e-09, + "loss": 0.6346, + "step": 123730 + }, + { + "epoch": 1.9862277083099247, + "grad_norm": 1.1823822259902954, + "learning_rate": 5.851735425868587e-09, + "loss": 0.7227, + "step": 123740 + }, + { + "epoch": 1.9863882245300888, + "grad_norm": 0.8645641207695007, + "learning_rate": 5.716131400285152e-09, + "loss": 0.6839, + "step": 123750 + }, + { + "epoch": 1.9865487407502527, + "grad_norm": 0.7821374535560608, + "learning_rate": 5.582116864452647e-09, + "loss": 0.6347, + "step": 123760 + }, + { + "epoch": 1.9867092569704168, + "grad_norm": 0.8892423510551453, + "learning_rate": 5.449691826897585e-09, + "loss": 0.6894, + "step": 123770 + }, + { + "epoch": 1.986869773190581, + "grad_norm": 1.176153540611267, + "learning_rate": 5.318856296035457e-09, + "loss": 0.7817, + "step": 123780 + }, + { + "epoch": 1.987030289410745, + "grad_norm": 0.8283487558364868, + "learning_rate": 5.1896102801929356e-09, + "loss": 0.7664, + "step": 123790 + }, + { + "epoch": 1.987190805630909, + "grad_norm": 1.4665050506591797, + "learning_rate": 5.061953787585671e-09, + "loss": 0.7536, + "step": 123800 + }, + { + "epoch": 1.9873513218510732, + "grad_norm": 0.8977430462837219, + "learning_rate": 4.93588682633217e-09, + "loss": 0.673, + "step": 123810 + }, + { + "epoch": 1.987511838071237, + "grad_norm": 1.0739389657974243, + "learning_rate": 4.811409404451017e-09, + "loss": 0.5561, + "step": 123820 + }, + { + "epoch": 1.987672354291401, + "grad_norm": 1.2182563543319702, + "learning_rate": 4.688521529858103e-09, + "loss": 0.7483, + "step": 123830 + }, + { + "epoch": 1.987832870511565, + "grad_norm": 1.3113608360290527, + "learning_rate": 4.5672232103666225e-09, + "loss": 0.8373, + "step": 123840 + }, + { + "epoch": 1.9879933867317292, + "grad_norm": 1.289129376411438, + "learning_rate": 4.4475144536898495e-09, + "loss": 0.6577, + "step": 123850 + }, + { + "epoch": 1.9881539029518933, + "grad_norm": 0.8504583239555359, + "learning_rate": 4.3293952674439145e-09, + "loss": 0.78, + "step": 123860 + }, + { + "epoch": 1.9883144191720574, + "grad_norm": 0.6889305710792542, + "learning_rate": 4.212865659139475e-09, + "loss": 0.7921, + "step": 123870 + }, + { + "epoch": 1.9884749353922215, + "grad_norm": 1.0516659021377563, + "learning_rate": 4.097925636184496e-09, + "loss": 0.6776, + "step": 123880 + }, + { + "epoch": 1.9886354516123854, + "grad_norm": 1.5663769245147705, + "learning_rate": 3.984575205892571e-09, + "loss": 0.6181, + "step": 123890 + }, + { + "epoch": 1.9887959678325495, + "grad_norm": 0.9984056353569031, + "learning_rate": 3.872814375469047e-09, + "loss": 0.7229, + "step": 123900 + }, + { + "epoch": 1.9889564840527134, + "grad_norm": 1.0234429836273193, + "learning_rate": 3.762643152022127e-09, + "loss": 0.6209, + "step": 123910 + }, + { + "epoch": 1.9891170002728775, + "grad_norm": 1.2957016229629517, + "learning_rate": 3.654061542557319e-09, + "loss": 0.7387, + "step": 123920 + }, + { + "epoch": 1.9892775164930416, + "grad_norm": 0.8998965620994568, + "learning_rate": 3.547069553980209e-09, + "loss": 0.6609, + "step": 123930 + }, + { + "epoch": 1.9894380327132057, + "grad_norm": 1.2480524778366089, + "learning_rate": 3.441667193096465e-09, + "loss": 0.5892, + "step": 123940 + }, + { + "epoch": 1.9895985489333698, + "grad_norm": 0.725440263748169, + "learning_rate": 3.337854466606283e-09, + "loss": 0.7213, + "step": 123950 + }, + { + "epoch": 1.9897590651535337, + "grad_norm": 1.021767020225525, + "learning_rate": 3.2356313811154893e-09, + "loss": 0.6593, + "step": 123960 + }, + { + "epoch": 1.9899195813736978, + "grad_norm": 0.6906185746192932, + "learning_rate": 3.1349979431216646e-09, + "loss": 0.6418, + "step": 123970 + }, + { + "epoch": 1.9900800975938617, + "grad_norm": 0.8486871719360352, + "learning_rate": 3.0359541590224694e-09, + "loss": 0.631, + "step": 123980 + }, + { + "epoch": 1.9902406138140258, + "grad_norm": 1.0125759840011597, + "learning_rate": 2.9385000351211944e-09, + "loss": 0.6344, + "step": 123990 + }, + { + "epoch": 1.99040113003419, + "grad_norm": 1.062587022781372, + "learning_rate": 2.84263557761566e-09, + "loss": 0.6707, + "step": 124000 + }, + { + "epoch": 1.99040113003419, + "eval_loss": 0.7689428925514221, + "eval_runtime": 1834.7305, + "eval_samples_per_second": 14.297, + "eval_steps_per_second": 1.787, + "step": 124000 + } + ], + "logging_steps": 10, + "max_steps": 124598, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 4000, + "total_flos": 6.294434425977858e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}