diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4282 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 60701, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016474193176389186, + "grad_norm": 35.35807800292969, + "learning_rate": 4.1186161449752885e-07, + "loss": 1.1167, + "step": 100 + }, + { + "epoch": 0.0032948386352778373, + "grad_norm": 2.662261724472046, + "learning_rate": 8.237232289950577e-07, + "loss": 0.8293, + "step": 200 + }, + { + "epoch": 0.004942257952916756, + "grad_norm": 3.5144577026367188, + "learning_rate": 1.2355848434925866e-06, + "loss": 0.8225, + "step": 300 + }, + { + "epoch": 0.006589677270555675, + "grad_norm": 2.4613759517669678, + "learning_rate": 1.6474464579901154e-06, + "loss": 0.767, + "step": 400 + }, + { + "epoch": 0.008237096588194593, + "grad_norm": 2.598069906234741, + "learning_rate": 2.0593080724876445e-06, + "loss": 0.687, + "step": 500 + }, + { + "epoch": 0.009884515905833512, + "grad_norm": 3.49182391166687, + "learning_rate": 2.471169686985173e-06, + "loss": 0.6412, + "step": 600 + }, + { + "epoch": 0.01153193522347243, + "grad_norm": 1.9722175598144531, + "learning_rate": 2.883031301482702e-06, + "loss": 0.6173, + "step": 700 + }, + { + "epoch": 0.01317935454111135, + "grad_norm": 2.084155321121216, + "learning_rate": 3.294892915980231e-06, + "loss": 0.622, + "step": 800 + }, + { + "epoch": 0.014826773858750269, + "grad_norm": 2.001030206680298, + "learning_rate": 3.70675453047776e-06, + "loss": 0.5975, + "step": 900 + }, + { + "epoch": 0.016474193176389186, + "grad_norm": 2.722954034805298, + "learning_rate": 4.118616144975289e-06, + "loss": 0.6171, + "step": 1000 + }, + { + "epoch": 0.018121612494028105, + "grad_norm": 2.851048469543457, + "learning_rate": 4.5304777594728176e-06, + "loss": 0.5398, + "step": 1100 + }, + { + "epoch": 0.019769031811667025, + "grad_norm": 2.0754776000976562, + "learning_rate": 4.942339373970346e-06, + "loss": 0.5444, + "step": 1200 + }, + { + "epoch": 0.021416451129305944, + "grad_norm": 1.9554790258407593, + "learning_rate": 4.999974215318018e-06, + "loss": 0.5688, + "step": 1300 + }, + { + "epoch": 0.02306387044694486, + "grad_norm": 2.532405376434326, + "learning_rate": 4.999879388694095e-06, + "loss": 0.5549, + "step": 1400 + }, + { + "epoch": 0.02471128976458378, + "grad_norm": 2.0328919887542725, + "learning_rate": 4.999714839456846e-06, + "loss": 0.5484, + "step": 1500 + }, + { + "epoch": 0.0263587090822227, + "grad_norm": 1.7955541610717773, + "learning_rate": 4.999480572195616e-06, + "loss": 0.5765, + "step": 1600 + }, + { + "epoch": 0.028006128399861618, + "grad_norm": 1.7495211362838745, + "learning_rate": 4.999176593444209e-06, + "loss": 0.5829, + "step": 1700 + }, + { + "epoch": 0.029653547717500537, + "grad_norm": 2.1942079067230225, + "learning_rate": 4.9988029116807125e-06, + "loss": 0.5331, + "step": 1800 + }, + { + "epoch": 0.03130096703513945, + "grad_norm": 2.9001498222351074, + "learning_rate": 4.998359537327255e-06, + "loss": 0.5108, + "step": 1900 + }, + { + "epoch": 0.03294838635277837, + "grad_norm": 2.320958375930786, + "learning_rate": 4.997846482749723e-06, + "loss": 0.5484, + "step": 2000 + }, + { + "epoch": 0.03459580567041729, + "grad_norm": 2.4439444541931152, + "learning_rate": 4.9972637622574074e-06, + "loss": 0.5448, + "step": 2100 + }, + { + 
"epoch": 0.03624322498805621, + "grad_norm": 2.403137445449829, + "learning_rate": 4.996611392102611e-06, + "loss": 0.519, + "step": 2200 + }, + { + "epoch": 0.03789064430569513, + "grad_norm": 1.4548203945159912, + "learning_rate": 4.995889390480193e-06, + "loss": 0.4869, + "step": 2300 + }, + { + "epoch": 0.03953806362333405, + "grad_norm": 2.335745334625244, + "learning_rate": 4.99509777752706e-06, + "loss": 0.5545, + "step": 2400 + }, + { + "epoch": 0.04118548294097297, + "grad_norm": 2.894595146179199, + "learning_rate": 4.994236575321607e-06, + "loss": 0.5364, + "step": 2500 + }, + { + "epoch": 0.04283290225861189, + "grad_norm": 3.079472064971924, + "learning_rate": 4.993305807883101e-06, + "loss": 0.5514, + "step": 2600 + }, + { + "epoch": 0.0444803215762508, + "grad_norm": 2.3833718299865723, + "learning_rate": 4.9923055011710075e-06, + "loss": 0.492, + "step": 2700 + }, + { + "epoch": 0.04612774089388972, + "grad_norm": 2.7838637828826904, + "learning_rate": 4.991235683084274e-06, + "loss": 0.5156, + "step": 2800 + }, + { + "epoch": 0.04777516021152864, + "grad_norm": 1.7487517595291138, + "learning_rate": 4.9900963834605445e-06, + "loss": 0.514, + "step": 2900 + }, + { + "epoch": 0.04942257952916756, + "grad_norm": 1.7354815006256104, + "learning_rate": 4.98888763407533e-06, + "loss": 0.5202, + "step": 3000 + }, + { + "epoch": 0.05106999884680648, + "grad_norm": 4.250129222869873, + "learning_rate": 4.987609468641125e-06, + "loss": 0.5069, + "step": 3100 + }, + { + "epoch": 0.0527174181644454, + "grad_norm": 2.1309328079223633, + "learning_rate": 4.986261922806461e-06, + "loss": 0.5372, + "step": 3200 + }, + { + "epoch": 0.054364837482084316, + "grad_norm": 2.0532209873199463, + "learning_rate": 4.9848450341549196e-06, + "loss": 0.5118, + "step": 3300 + }, + { + "epoch": 0.056012256799723235, + "grad_norm": 2.774035692214966, + "learning_rate": 4.983358842204078e-06, + "loss": 0.5082, + "step": 3400 + }, + { + "epoch": 0.057659676117362155, + "grad_norm": 4.331142425537109, + "learning_rate": 4.981803388404411e-06, + "loss": 0.5328, + "step": 3500 + }, + { + "epoch": 0.059307095435001074, + "grad_norm": 2.5397560596466064, + "learning_rate": 4.980178716138135e-06, + "loss": 0.5173, + "step": 3600 + }, + { + "epoch": 0.060954514752639986, + "grad_norm": 2.2354204654693604, + "learning_rate": 4.978484870717991e-06, + "loss": 0.4946, + "step": 3700 + }, + { + "epoch": 0.0626019340702789, + "grad_norm": 1.8501393795013428, + "learning_rate": 4.976721899385992e-06, + "loss": 0.5341, + "step": 3800 + }, + { + "epoch": 0.06424935338791783, + "grad_norm": 1.828378677368164, + "learning_rate": 4.974889851312098e-06, + "loss": 0.5097, + "step": 3900 + }, + { + "epoch": 0.06589677270555674, + "grad_norm": 2.1924521923065186, + "learning_rate": 4.972988777592845e-06, + "loss": 0.505, + "step": 4000 + }, + { + "epoch": 0.06754419202319567, + "grad_norm": 1.9084734916687012, + "learning_rate": 4.971018731249923e-06, + "loss": 0.5043, + "step": 4100 + }, + { + "epoch": 0.06919161134083458, + "grad_norm": 2.8705804347991943, + "learning_rate": 4.968979767228693e-06, + "loss": 0.5118, + "step": 4200 + }, + { + "epoch": 0.0708390306584735, + "grad_norm": 2.0432722568511963, + "learning_rate": 4.96687194239666e-06, + "loss": 0.5295, + "step": 4300 + }, + { + "epoch": 0.07248644997611242, + "grad_norm": 2.022822380065918, + "learning_rate": 4.964695315541883e-06, + "loss": 0.5649, + "step": 4400 + }, + { + "epoch": 0.07413386929375133, + "grad_norm": 2.284590721130371, + "learning_rate": 
4.962449947371334e-06, + "loss": 0.4841, + "step": 4500 + }, + { + "epoch": 0.07578128861139026, + "grad_norm": 3.217561721801758, + "learning_rate": 4.9601359005092095e-06, + "loss": 0.5401, + "step": 4600 + }, + { + "epoch": 0.07742870792902917, + "grad_norm": 1.9388020038604736, + "learning_rate": 4.957753239495181e-06, + "loss": 0.5251, + "step": 4700 + }, + { + "epoch": 0.0790761272466681, + "grad_norm": 1.3349353075027466, + "learning_rate": 4.955302030782596e-06, + "loss": 0.4962, + "step": 4800 + }, + { + "epoch": 0.08072354656430701, + "grad_norm": 2.4485511779785156, + "learning_rate": 4.952782342736625e-06, + "loss": 0.4939, + "step": 4900 + }, + { + "epoch": 0.08237096588194594, + "grad_norm": 3.657675266265869, + "learning_rate": 4.950194245632349e-06, + "loss": 0.5123, + "step": 5000 + }, + { + "epoch": 0.08401838519958485, + "grad_norm": 2.871431589126587, + "learning_rate": 4.9475378116528105e-06, + "loss": 0.5063, + "step": 5100 + }, + { + "epoch": 0.08566580451722378, + "grad_norm": 1.394823431968689, + "learning_rate": 4.944813114886991e-06, + "loss": 0.4939, + "step": 5200 + }, + { + "epoch": 0.08731322383486269, + "grad_norm": 1.6979378461837769, + "learning_rate": 4.942020231327749e-06, + "loss": 0.5102, + "step": 5300 + }, + { + "epoch": 0.0889606431525016, + "grad_norm": 1.941582202911377, + "learning_rate": 4.939159238869698e-06, + "loss": 0.5347, + "step": 5400 + }, + { + "epoch": 0.09060806247014053, + "grad_norm": 1.9074257612228394, + "learning_rate": 4.936230217307035e-06, + "loss": 0.4935, + "step": 5500 + }, + { + "epoch": 0.09225548178777944, + "grad_norm": 2.327624797821045, + "learning_rate": 4.933233248331317e-06, + "loss": 0.5218, + "step": 5600 + }, + { + "epoch": 0.09390290110541837, + "grad_norm": 2.605468988418579, + "learning_rate": 4.930168415529181e-06, + "loss": 0.4831, + "step": 5700 + }, + { + "epoch": 0.09555032042305728, + "grad_norm": 2.137749671936035, + "learning_rate": 4.927035804380012e-06, + "loss": 0.4983, + "step": 5800 + }, + { + "epoch": 0.0971977397406962, + "grad_norm": 1.9908422231674194, + "learning_rate": 4.923835502253558e-06, + "loss": 0.4991, + "step": 5900 + }, + { + "epoch": 0.09884515905833512, + "grad_norm": 1.8356066942214966, + "learning_rate": 4.920567598407498e-06, + "loss": 0.4907, + "step": 6000 + }, + { + "epoch": 0.10049257837597404, + "grad_norm": 2.3301796913146973, + "learning_rate": 4.917232183984946e-06, + "loss": 0.4833, + "step": 6100 + }, + { + "epoch": 0.10213999769361295, + "grad_norm": 2.835822582244873, + "learning_rate": 4.913829352011914e-06, + "loss": 0.554, + "step": 6200 + }, + { + "epoch": 0.10378741701125187, + "grad_norm": 1.825016736984253, + "learning_rate": 4.910359197394717e-06, + "loss": 0.5082, + "step": 6300 + }, + { + "epoch": 0.1054348363288908, + "grad_norm": 3.021340847015381, + "learning_rate": 4.9068218169173245e-06, + "loss": 0.4945, + "step": 6400 + }, + { + "epoch": 0.1070822556465297, + "grad_norm": 3.6816606521606445, + "learning_rate": 4.903217309238658e-06, + "loss": 0.54, + "step": 6500 + }, + { + "epoch": 0.10872967496416863, + "grad_norm": 2.1384148597717285, + "learning_rate": 4.899545774889848e-06, + "loss": 0.497, + "step": 6600 + }, + { + "epoch": 0.11037709428180754, + "grad_norm": 2.311786651611328, + "learning_rate": 4.895807316271421e-06, + "loss": 0.4927, + "step": 6700 + }, + { + "epoch": 0.11202451359944647, + "grad_norm": 1.765767216682434, + "learning_rate": 4.892002037650451e-06, + "loss": 0.4984, + "step": 6800 + }, + { + "epoch": 0.11367193291708538, + 
"grad_norm": 1.8108317852020264, + "learning_rate": 4.888130045157645e-06, + "loss": 0.4957, + "step": 6900 + }, + { + "epoch": 0.11531935223472431, + "grad_norm": 2.6695711612701416, + "learning_rate": 4.884191446784387e-06, + "loss": 0.4992, + "step": 7000 + }, + { + "epoch": 0.11696677155236322, + "grad_norm": 2.477202892303467, + "learning_rate": 4.880186352379726e-06, + "loss": 0.4947, + "step": 7100 + }, + { + "epoch": 0.11861419087000215, + "grad_norm": 3.69132399559021, + "learning_rate": 4.876114873647308e-06, + "loss": 0.5092, + "step": 7200 + }, + { + "epoch": 0.12026161018764106, + "grad_norm": 2.353121042251587, + "learning_rate": 4.871977124142271e-06, + "loss": 0.4752, + "step": 7300 + }, + { + "epoch": 0.12190902950527997, + "grad_norm": 2.3746302127838135, + "learning_rate": 4.867773219268062e-06, + "loss": 0.5186, + "step": 7400 + }, + { + "epoch": 0.1235564488229189, + "grad_norm": 2.437284469604492, + "learning_rate": 4.863503276273232e-06, + "loss": 0.4882, + "step": 7500 + }, + { + "epoch": 0.1252038681405578, + "grad_norm": 2.287785291671753, + "learning_rate": 4.859167414248163e-06, + "loss": 0.4755, + "step": 7600 + }, + { + "epoch": 0.12685128745819674, + "grad_norm": 4.1828413009643555, + "learning_rate": 4.854765754121738e-06, + "loss": 0.5062, + "step": 7700 + }, + { + "epoch": 0.12849870677583566, + "grad_norm": 2.3262546062469482, + "learning_rate": 4.85029841865798e-06, + "loss": 0.4756, + "step": 7800 + }, + { + "epoch": 0.13014612609347456, + "grad_norm": 1.2054634094238281, + "learning_rate": 4.8457655324526215e-06, + "loss": 0.4827, + "step": 7900 + }, + { + "epoch": 0.1317935454111135, + "grad_norm": 2.3276774883270264, + "learning_rate": 4.8411672219296304e-06, + "loss": 0.4833, + "step": 8000 + }, + { + "epoch": 0.13344096472875241, + "grad_norm": 1.9837372303009033, + "learning_rate": 4.836503615337684e-06, + "loss": 0.4681, + "step": 8100 + }, + { + "epoch": 0.13508838404639134, + "grad_norm": 1.6989622116088867, + "learning_rate": 4.831774842746595e-06, + "loss": 0.5375, + "step": 8200 + }, + { + "epoch": 0.13673580336403024, + "grad_norm": 2.29801869392395, + "learning_rate": 4.826981036043677e-06, + "loss": 0.5102, + "step": 8300 + }, + { + "epoch": 0.13838322268166917, + "grad_norm": 8.920065879821777, + "learning_rate": 4.822122328930076e-06, + "loss": 0.5145, + "step": 8400 + }, + { + "epoch": 0.1400306419993081, + "grad_norm": 2.425342321395874, + "learning_rate": 4.817198856917029e-06, + "loss": 0.4888, + "step": 8500 + }, + { + "epoch": 0.141678061316947, + "grad_norm": 2.2098586559295654, + "learning_rate": 4.812210757322096e-06, + "loss": 0.5088, + "step": 8600 + }, + { + "epoch": 0.14332548063458592, + "grad_norm": 2.6320948600769043, + "learning_rate": 4.807158169265326e-06, + "loss": 0.4868, + "step": 8700 + }, + { + "epoch": 0.14497289995222484, + "grad_norm": 2.660802125930786, + "learning_rate": 4.802041233665373e-06, + "loss": 0.4742, + "step": 8800 + }, + { + "epoch": 0.14662031926986377, + "grad_norm": 2.3442442417144775, + "learning_rate": 4.796860093235572e-06, + "loss": 0.4789, + "step": 8900 + }, + { + "epoch": 0.14826773858750267, + "grad_norm": 2.416050434112549, + "learning_rate": 4.791614892479956e-06, + "loss": 0.5149, + "step": 9000 + }, + { + "epoch": 0.1499151579051416, + "grad_norm": 2.576631784439087, + "learning_rate": 4.786305777689222e-06, + "loss": 0.5096, + "step": 9100 + }, + { + "epoch": 0.15156257722278052, + "grad_norm": 1.699407935142517, + "learning_rate": 4.7809328969366585e-06, + "loss": 0.5006, + 
"step": 9200 + }, + { + "epoch": 0.15320999654041945, + "grad_norm": 2.303194046020508, + "learning_rate": 4.7754964000740086e-06, + "loss": 0.5113, + "step": 9300 + }, + { + "epoch": 0.15485741585805834, + "grad_norm": 2.021639347076416, + "learning_rate": 4.7699964387272964e-06, + "loss": 0.4823, + "step": 9400 + }, + { + "epoch": 0.15650483517569727, + "grad_norm": 1.7534514665603638, + "learning_rate": 4.764433166292593e-06, + "loss": 0.4912, + "step": 9500 + }, + { + "epoch": 0.1581522544933362, + "grad_norm": 2.9182558059692383, + "learning_rate": 4.758806737931741e-06, + "loss": 0.4957, + "step": 9600 + }, + { + "epoch": 0.1597996738109751, + "grad_norm": 2.112656831741333, + "learning_rate": 4.753117310568026e-06, + "loss": 0.4733, + "step": 9700 + }, + { + "epoch": 0.16144709312861402, + "grad_norm": 2.052156686782837, + "learning_rate": 4.7473650428818025e-06, + "loss": 0.4794, + "step": 9800 + }, + { + "epoch": 0.16309451244625295, + "grad_norm": 2.4516518115997314, + "learning_rate": 4.741550095306065e-06, + "loss": 0.4807, + "step": 9900 + }, + { + "epoch": 0.16474193176389187, + "grad_norm": 1.8814926147460938, + "learning_rate": 4.7356726300219715e-06, + "loss": 0.4392, + "step": 10000 + }, + { + "epoch": 0.16638935108153077, + "grad_norm": 1.6867588758468628, + "learning_rate": 4.729732810954329e-06, + "loss": 0.489, + "step": 10100 + }, + { + "epoch": 0.1680367703991697, + "grad_norm": 1.996559739112854, + "learning_rate": 4.723730803767014e-06, + "loss": 0.45, + "step": 10200 + }, + { + "epoch": 0.16968418971680863, + "grad_norm": 2.4676289558410645, + "learning_rate": 4.71766677585835e-06, + "loss": 0.49, + "step": 10300 + }, + { + "epoch": 0.17133160903444755, + "grad_norm": 2.4000778198242188, + "learning_rate": 4.711540896356447e-06, + "loss": 0.5133, + "step": 10400 + }, + { + "epoch": 0.17297902835208645, + "grad_norm": 1.6576099395751953, + "learning_rate": 4.70535333611448e-06, + "loss": 0.4682, + "step": 10500 + }, + { + "epoch": 0.17462644766972538, + "grad_norm": 2.6019415855407715, + "learning_rate": 4.699104267705921e-06, + "loss": 0.5221, + "step": 10600 + }, + { + "epoch": 0.1762738669873643, + "grad_norm": 2.8221852779388428, + "learning_rate": 4.692793865419731e-06, + "loss": 0.5142, + "step": 10700 + }, + { + "epoch": 0.1779212863050032, + "grad_norm": 1.781231164932251, + "learning_rate": 4.686422305255498e-06, + "loss": 0.4908, + "step": 10800 + }, + { + "epoch": 0.17956870562264213, + "grad_norm": 2.3753836154937744, + "learning_rate": 4.679989764918524e-06, + "loss": 0.4894, + "step": 10900 + }, + { + "epoch": 0.18121612494028105, + "grad_norm": 1.7550493478775024, + "learning_rate": 4.673496423814874e-06, + "loss": 0.4707, + "step": 11000 + }, + { + "epoch": 0.18286354425791998, + "grad_norm": 1.6989047527313232, + "learning_rate": 4.666942463046369e-06, + "loss": 0.5209, + "step": 11100 + }, + { + "epoch": 0.18451096357555888, + "grad_norm": 2.0338029861450195, + "learning_rate": 4.660328065405537e-06, + "loss": 0.5168, + "step": 11200 + }, + { + "epoch": 0.1861583828931978, + "grad_norm": 2.14629864692688, + "learning_rate": 4.6536534153705135e-06, + "loss": 0.4802, + "step": 11300 + }, + { + "epoch": 0.18780580221083673, + "grad_norm": 1.9664320945739746, + "learning_rate": 4.646918699099898e-06, + "loss": 0.505, + "step": 11400 + }, + { + "epoch": 0.18945322152847563, + "grad_norm": 2.435833692550659, + "learning_rate": 4.640124104427558e-06, + "loss": 0.5205, + "step": 11500 + }, + { + "epoch": 0.19110064084611456, + "grad_norm": 
1.8850288391113281, + "learning_rate": 4.633269820857397e-06, + "loss": 0.4964, + "step": 11600 + }, + { + "epoch": 0.19274806016375348, + "grad_norm": 1.9810831546783447, + "learning_rate": 4.626356039558061e-06, + "loss": 0.5006, + "step": 11700 + }, + { + "epoch": 0.1943954794813924, + "grad_norm": 2.52791166305542, + "learning_rate": 4.619382953357615e-06, + "loss": 0.4809, + "step": 11800 + }, + { + "epoch": 0.1960428987990313, + "grad_norm": 2.0693445205688477, + "learning_rate": 4.612350756738157e-06, + "loss": 0.4591, + "step": 11900 + }, + { + "epoch": 0.19769031811667023, + "grad_norm": 2.312404155731201, + "learning_rate": 4.6052596458303996e-06, + "loss": 0.4695, + "step": 12000 + }, + { + "epoch": 0.19933773743430916, + "grad_norm": 2.2149617671966553, + "learning_rate": 4.5981098184081995e-06, + "loss": 0.4743, + "step": 12100 + }, + { + "epoch": 0.20098515675194809, + "grad_norm": 2.597283124923706, + "learning_rate": 4.590901473883037e-06, + "loss": 0.4893, + "step": 12200 + }, + { + "epoch": 0.20263257606958698, + "grad_norm": 1.9223053455352783, + "learning_rate": 4.5836348132984584e-06, + "loss": 0.4706, + "step": 12300 + }, + { + "epoch": 0.2042799953872259, + "grad_norm": 1.0610065460205078, + "learning_rate": 4.57631003932447e-06, + "loss": 0.4566, + "step": 12400 + }, + { + "epoch": 0.20592741470486484, + "grad_norm": 2.5029940605163574, + "learning_rate": 4.568927356251878e-06, + "loss": 0.451, + "step": 12500 + }, + { + "epoch": 0.20757483402250373, + "grad_norm": 1.3197004795074463, + "learning_rate": 4.5614869699866e-06, + "loss": 0.4583, + "step": 12600 + }, + { + "epoch": 0.20922225334014266, + "grad_norm": 1.5407695770263672, + "learning_rate": 4.553989088043919e-06, + "loss": 0.4673, + "step": 12700 + }, + { + "epoch": 0.2108696726577816, + "grad_norm": 1.6594492197036743, + "learning_rate": 4.546433919542691e-06, + "loss": 0.5023, + "step": 12800 + }, + { + "epoch": 0.2125170919754205, + "grad_norm": 1.9056370258331299, + "learning_rate": 4.538821675199521e-06, + "loss": 0.5202, + "step": 12900 + }, + { + "epoch": 0.2141645112930594, + "grad_norm": 3.2313265800476074, + "learning_rate": 4.531152567322877e-06, + "loss": 0.4649, + "step": 13000 + }, + { + "epoch": 0.21581193061069834, + "grad_norm": 2.2487971782684326, + "learning_rate": 4.5234268098071766e-06, + "loss": 0.4611, + "step": 13100 + }, + { + "epoch": 0.21745934992833726, + "grad_norm": 2.0419654846191406, + "learning_rate": 4.515644618126816e-06, + "loss": 0.4851, + "step": 13200 + }, + { + "epoch": 0.2191067692459762, + "grad_norm": 1.4483575820922852, + "learning_rate": 4.507806209330165e-06, + "loss": 0.4789, + "step": 13300 + }, + { + "epoch": 0.2207541885636151, + "grad_norm": 2.3362390995025635, + "learning_rate": 4.499911802033508e-06, + "loss": 0.4846, + "step": 13400 + }, + { + "epoch": 0.22240160788125402, + "grad_norm": 2.0402286052703857, + "learning_rate": 4.491961616414948e-06, + "loss": 0.5099, + "step": 13500 + }, + { + "epoch": 0.22404902719889294, + "grad_norm": 2.0675928592681885, + "learning_rate": 4.483955874208273e-06, + "loss": 0.4878, + "step": 13600 + }, + { + "epoch": 0.22569644651653184, + "grad_norm": 1.6327743530273438, + "learning_rate": 4.4758947986967614e-06, + "loss": 0.4765, + "step": 13700 + }, + { + "epoch": 0.22734386583417077, + "grad_norm": 2.0917341709136963, + "learning_rate": 4.4677786147069595e-06, + "loss": 0.4525, + "step": 13800 + }, + { + "epoch": 0.2289912851518097, + "grad_norm": 1.5012590885162354, + "learning_rate": 4.459607548602412e-06, + 
"loss": 0.4699, + "step": 13900 + }, + { + "epoch": 0.23063870446944862, + "grad_norm": 2.0980496406555176, + "learning_rate": 4.451381828277346e-06, + "loss": 0.5045, + "step": 14000 + }, + { + "epoch": 0.23228612378708752, + "grad_norm": 1.8820241689682007, + "learning_rate": 4.443101683150316e-06, + "loss": 0.4918, + "step": 14100 + }, + { + "epoch": 0.23393354310472644, + "grad_norm": 2.0610568523406982, + "learning_rate": 4.434767344157808e-06, + "loss": 0.4917, + "step": 14200 + }, + { + "epoch": 0.23558096242236537, + "grad_norm": 2.2509660720825195, + "learning_rate": 4.426379043747793e-06, + "loss": 0.4933, + "step": 14300 + }, + { + "epoch": 0.2372283817400043, + "grad_norm": 3.667386531829834, + "learning_rate": 4.417937015873249e-06, + "loss": 0.4784, + "step": 14400 + }, + { + "epoch": 0.2388758010576432, + "grad_norm": 2.4788925647735596, + "learning_rate": 4.409441495985632e-06, + "loss": 0.4901, + "step": 14500 + }, + { + "epoch": 0.24052322037528212, + "grad_norm": 1.6511657238006592, + "learning_rate": 4.4008927210283144e-06, + "loss": 0.4777, + "step": 14600 + }, + { + "epoch": 0.24217063969292105, + "grad_norm": 1.7784366607666016, + "learning_rate": 4.392290929429971e-06, + "loss": 0.4863, + "step": 14700 + }, + { + "epoch": 0.24381805901055995, + "grad_norm": 2.4235856533050537, + "learning_rate": 4.383636361097931e-06, + "loss": 0.4578, + "step": 14800 + }, + { + "epoch": 0.24546547832819887, + "grad_norm": 1.6377619504928589, + "learning_rate": 4.3749292574114886e-06, + "loss": 0.4846, + "step": 14900 + }, + { + "epoch": 0.2471128976458378, + "grad_norm": 1.5944766998291016, + "learning_rate": 4.366169861215168e-06, + "loss": 0.4744, + "step": 15000 + }, + { + "epoch": 0.24876031696347672, + "grad_norm": 2.405319929122925, + "learning_rate": 4.357358416811955e-06, + "loss": 0.4685, + "step": 15100 + }, + { + "epoch": 0.2504077362811156, + "grad_norm": 2.4015884399414062, + "learning_rate": 4.348495169956477e-06, + "loss": 0.4783, + "step": 15200 + }, + { + "epoch": 0.2520551555987546, + "grad_norm": 2.325193166732788, + "learning_rate": 4.339580367848153e-06, + "loss": 0.4579, + "step": 15300 + }, + { + "epoch": 0.2537025749163935, + "grad_norm": 1.8238539695739746, + "learning_rate": 4.3306142591243e-06, + "loss": 0.4697, + "step": 15400 + }, + { + "epoch": 0.2553499942340324, + "grad_norm": 1.4284635782241821, + "learning_rate": 4.321597093853194e-06, + "loss": 0.452, + "step": 15500 + }, + { + "epoch": 0.2569974135516713, + "grad_norm": 1.5146524906158447, + "learning_rate": 4.3125291235271e-06, + "loss": 0.4858, + "step": 15600 + }, + { + "epoch": 0.2586448328693102, + "grad_norm": 2.1129367351531982, + "learning_rate": 4.303410601055253e-06, + "loss": 0.4986, + "step": 15700 + }, + { + "epoch": 0.2602922521869491, + "grad_norm": 2.0981929302215576, + "learning_rate": 4.29424178075681e-06, + "loss": 0.4505, + "step": 15800 + }, + { + "epoch": 0.2619396715045881, + "grad_norm": 1.3321784734725952, + "learning_rate": 4.285022918353755e-06, + "loss": 0.4983, + "step": 15900 + }, + { + "epoch": 0.263587090822227, + "grad_norm": 2.7090718746185303, + "learning_rate": 4.275754270963763e-06, + "loss": 0.482, + "step": 16000 + }, + { + "epoch": 0.2652345101398659, + "grad_norm": 1.5834273099899292, + "learning_rate": 4.26643609709303e-06, + "loss": 0.5029, + "step": 16100 + }, + { + "epoch": 0.26688192945750483, + "grad_norm": 2.400024175643921, + "learning_rate": 4.257068656629071e-06, + "loss": 0.4579, + "step": 16200 + }, + { + "epoch": 0.26852934877514373, + 
"grad_norm": 1.9160480499267578, + "learning_rate": 4.24765221083346e-06, + "loss": 0.4892, + "step": 16300 + }, + { + "epoch": 0.2701767680927827, + "grad_norm": 2.4766881465911865, + "learning_rate": 4.238187022334553e-06, + "loss": 0.4633, + "step": 16400 + }, + { + "epoch": 0.2718241874104216, + "grad_norm": 2.2665488719940186, + "learning_rate": 4.228673355120156e-06, + "loss": 0.4682, + "step": 16500 + }, + { + "epoch": 0.2734716067280605, + "grad_norm": 2.582789897918701, + "learning_rate": 4.2191114745301654e-06, + "loss": 0.4761, + "step": 16600 + }, + { + "epoch": 0.27511902604569943, + "grad_norm": 2.240748882293701, + "learning_rate": 4.20950164724917e-06, + "loss": 0.4613, + "step": 16700 + }, + { + "epoch": 0.27676644536333833, + "grad_norm": 2.4156808853149414, + "learning_rate": 4.1998441412990085e-06, + "loss": 0.4907, + "step": 16800 + }, + { + "epoch": 0.27841386468097723, + "grad_norm": 2.348371744155884, + "learning_rate": 4.190139226031297e-06, + "loss": 0.4675, + "step": 16900 + }, + { + "epoch": 0.2800612839986162, + "grad_norm": 1.7973005771636963, + "learning_rate": 4.180387172119916e-06, + "loss": 0.4738, + "step": 17000 + }, + { + "epoch": 0.2817087033162551, + "grad_norm": 2.322040557861328, + "learning_rate": 4.17058825155346e-06, + "loss": 0.4644, + "step": 17100 + }, + { + "epoch": 0.283356122633894, + "grad_norm": 2.3491313457489014, + "learning_rate": 4.160742737627656e-06, + "loss": 0.5077, + "step": 17200 + }, + { + "epoch": 0.28500354195153293, + "grad_norm": 1.630631446838379, + "learning_rate": 4.150850904937733e-06, + "loss": 0.4797, + "step": 17300 + }, + { + "epoch": 0.28665096126917183, + "grad_norm": 2.0471599102020264, + "learning_rate": 4.140913029370774e-06, + "loss": 0.461, + "step": 17400 + }, + { + "epoch": 0.2882983805868108, + "grad_norm": 2.4391767978668213, + "learning_rate": 4.130929388098011e-06, + "loss": 0.4962, + "step": 17500 + }, + { + "epoch": 0.2899457999044497, + "grad_norm": 2.0148985385894775, + "learning_rate": 4.120900259567103e-06, + "loss": 0.4634, + "step": 17600 + }, + { + "epoch": 0.2915932192220886, + "grad_norm": 2.3383798599243164, + "learning_rate": 4.110825923494365e-06, + "loss": 0.4553, + "step": 17700 + }, + { + "epoch": 0.29324063853972754, + "grad_norm": 1.539428949356079, + "learning_rate": 4.100706660856968e-06, + "loss": 0.4864, + "step": 17800 + }, + { + "epoch": 0.29488805785736644, + "grad_norm": 1.8251954317092896, + "learning_rate": 4.090542753885101e-06, + "loss": 0.487, + "step": 17900 + }, + { + "epoch": 0.29653547717500534, + "grad_norm": 2.269007921218872, + "learning_rate": 4.080334486054104e-06, + "loss": 0.4423, + "step": 18000 + }, + { + "epoch": 0.2981828964926443, + "grad_norm": 2.4436540603637695, + "learning_rate": 4.0700821420765566e-06, + "loss": 0.4916, + "step": 18100 + }, + { + "epoch": 0.2998303158102832, + "grad_norm": 2.570488929748535, + "learning_rate": 4.05978600789434e-06, + "loss": 0.4536, + "step": 18200 + }, + { + "epoch": 0.3014777351279221, + "grad_norm": 2.247633934020996, + "learning_rate": 4.049446370670661e-06, + "loss": 0.4891, + "step": 18300 + }, + { + "epoch": 0.30312515444556104, + "grad_norm": 1.7023581266403198, + "learning_rate": 4.0390635187820435e-06, + "loss": 0.4594, + "step": 18400 + }, + { + "epoch": 0.30477257376319994, + "grad_norm": 2.0368921756744385, + "learning_rate": 4.028637741810285e-06, + "loss": 0.4191, + "step": 18500 + }, + { + "epoch": 0.3064199930808389, + "grad_norm": 2.0896544456481934, + "learning_rate": 4.018169330534381e-06, + 
"loss": 0.4691, + "step": 18600 + }, + { + "epoch": 0.3080674123984778, + "grad_norm": 2.5784189701080322, + "learning_rate": 4.007658576922413e-06, + "loss": 0.4442, + "step": 18700 + }, + { + "epoch": 0.3097148317161167, + "grad_norm": 2.169424057006836, + "learning_rate": 3.997105774123409e-06, + "loss": 0.4552, + "step": 18800 + }, + { + "epoch": 0.31136225103375564, + "grad_norm": 2.076741933822632, + "learning_rate": 3.986511216459163e-06, + "loss": 0.462, + "step": 18900 + }, + { + "epoch": 0.31300967035139454, + "grad_norm": 2.33245849609375, + "learning_rate": 3.97587519941603e-06, + "loss": 0.5015, + "step": 19000 + }, + { + "epoch": 0.31465708966903344, + "grad_norm": 2.465367555618286, + "learning_rate": 3.965198019636684e-06, + "loss": 0.4726, + "step": 19100 + }, + { + "epoch": 0.3163045089866724, + "grad_norm": 2.0327184200286865, + "learning_rate": 3.95447997491184e-06, + "loss": 0.4602, + "step": 19200 + }, + { + "epoch": 0.3179519283043113, + "grad_norm": 2.6782443523406982, + "learning_rate": 3.943721364171957e-06, + "loss": 0.4676, + "step": 19300 + }, + { + "epoch": 0.3195993476219502, + "grad_norm": 2.373873233795166, + "learning_rate": 3.932922487478894e-06, + "loss": 0.4466, + "step": 19400 + }, + { + "epoch": 0.32124676693958915, + "grad_norm": 2.5210931301116943, + "learning_rate": 3.9220836460175415e-06, + "loss": 0.4543, + "step": 19500 + }, + { + "epoch": 0.32289418625722804, + "grad_norm": 2.384608268737793, + "learning_rate": 3.911205142087425e-06, + "loss": 0.4758, + "step": 19600 + }, + { + "epoch": 0.324541605574867, + "grad_norm": 2.8322508335113525, + "learning_rate": 3.900287279094274e-06, + "loss": 0.4597, + "step": 19700 + }, + { + "epoch": 0.3261890248925059, + "grad_norm": 3.4156792163848877, + "learning_rate": 3.889330361541552e-06, + "loss": 0.4552, + "step": 19800 + }, + { + "epoch": 0.3278364442101448, + "grad_norm": 1.7643976211547852, + "learning_rate": 3.878334695021973e-06, + "loss": 0.4589, + "step": 19900 + }, + { + "epoch": 0.32948386352778375, + "grad_norm": 1.7313556671142578, + "learning_rate": 3.867300586208975e-06, + "loss": 0.4444, + "step": 20000 + }, + { + "epoch": 0.33113128284542265, + "grad_norm": 1.821792721748352, + "learning_rate": 3.856228342848167e-06, + "loss": 0.4945, + "step": 20100 + }, + { + "epoch": 0.33277870216306155, + "grad_norm": 2.735888719558716, + "learning_rate": 3.845118273748743e-06, + "loss": 0.4431, + "step": 20200 + }, + { + "epoch": 0.3344261214807005, + "grad_norm": 2.3234407901763916, + "learning_rate": 3.833970688774872e-06, + "loss": 0.4838, + "step": 20300 + }, + { + "epoch": 0.3360735407983394, + "grad_norm": 1.709910273551941, + "learning_rate": 3.822785898837058e-06, + "loss": 0.4754, + "step": 20400 + }, + { + "epoch": 0.3377209601159783, + "grad_norm": 2.435945987701416, + "learning_rate": 3.811564215883463e-06, + "loss": 0.4737, + "step": 20500 + }, + { + "epoch": 0.33936837943361725, + "grad_norm": 1.9514074325561523, + "learning_rate": 3.8003059528912123e-06, + "loss": 0.4861, + "step": 20600 + }, + { + "epoch": 0.34101579875125615, + "grad_norm": 2.4523439407348633, + "learning_rate": 3.7890114238576616e-06, + "loss": 0.4814, + "step": 20700 + }, + { + "epoch": 0.3426632180688951, + "grad_norm": 2.690749406814575, + "learning_rate": 3.777680943791639e-06, + "loss": 0.4837, + "step": 20800 + }, + { + "epoch": 0.344310637386534, + "grad_norm": 1.8186627626419067, + "learning_rate": 3.7663148287046635e-06, + "loss": 0.4384, + "step": 20900 + }, + { + "epoch": 0.3459580567041729, + 
"grad_norm": 2.5133306980133057, + "learning_rate": 3.754913395602129e-06, + "loss": 0.4612, + "step": 21000 + }, + { + "epoch": 0.34760547602181185, + "grad_norm": 1.9760069847106934, + "learning_rate": 3.7434769624744586e-06, + "loss": 0.4619, + "step": 21100 + }, + { + "epoch": 0.34925289533945075, + "grad_norm": 2.461090326309204, + "learning_rate": 3.732005848288245e-06, + "loss": 0.4762, + "step": 21200 + }, + { + "epoch": 0.35090031465708965, + "grad_norm": 1.82012939453125, + "learning_rate": 3.7205003729773454e-06, + "loss": 0.4309, + "step": 21300 + }, + { + "epoch": 0.3525477339747286, + "grad_norm": 1.5199309587478638, + "learning_rate": 3.708960857433964e-06, + "loss": 0.4632, + "step": 21400 + }, + { + "epoch": 0.3541951532923675, + "grad_norm": 1.8525145053863525, + "learning_rate": 3.6973876234997004e-06, + "loss": 0.4595, + "step": 21500 + }, + { + "epoch": 0.3558425726100064, + "grad_norm": 1.7146118879318237, + "learning_rate": 3.6857809939565724e-06, + "loss": 0.4414, + "step": 21600 + }, + { + "epoch": 0.35748999192764536, + "grad_norm": 2.75750994682312, + "learning_rate": 3.6741412925180153e-06, + "loss": 0.4624, + "step": 21700 + }, + { + "epoch": 0.35913741124528425, + "grad_norm": 2.6996710300445557, + "learning_rate": 3.6624688438198506e-06, + "loss": 0.4888, + "step": 21800 + }, + { + "epoch": 0.36078483056292315, + "grad_norm": 1.895980715751648, + "learning_rate": 3.650763973411238e-06, + "loss": 0.4395, + "step": 21900 + }, + { + "epoch": 0.3624322498805621, + "grad_norm": 2.5552258491516113, + "learning_rate": 3.639027007745585e-06, + "loss": 0.465, + "step": 22000 + }, + { + "epoch": 0.364079669198201, + "grad_norm": 1.6127821207046509, + "learning_rate": 3.6272582741714547e-06, + "loss": 0.4282, + "step": 22100 + }, + { + "epoch": 0.36572708851583996, + "grad_norm": 2.0909807682037354, + "learning_rate": 3.615458100923425e-06, + "loss": 0.4713, + "step": 22200 + }, + { + "epoch": 0.36737450783347886, + "grad_norm": 1.798374056816101, + "learning_rate": 3.603626817112941e-06, + "loss": 0.4784, + "step": 22300 + }, + { + "epoch": 0.36902192715111776, + "grad_norm": 2.0519778728485107, + "learning_rate": 3.5917647527191328e-06, + "loss": 0.4782, + "step": 22400 + }, + { + "epoch": 0.3706693464687567, + "grad_norm": 2.137410879135132, + "learning_rate": 3.5798722385796137e-06, + "loss": 0.4599, + "step": 22500 + }, + { + "epoch": 0.3723167657863956, + "grad_norm": 2.040231943130493, + "learning_rate": 3.5679496063812507e-06, + "loss": 0.434, + "step": 22600 + }, + { + "epoch": 0.3739641851040345, + "grad_norm": 2.0495615005493164, + "learning_rate": 3.5559971886509163e-06, + "loss": 0.473, + "step": 22700 + }, + { + "epoch": 0.37561160442167346, + "grad_norm": 2.5767838954925537, + "learning_rate": 3.5440153187462146e-06, + "loss": 0.4522, + "step": 22800 + }, + { + "epoch": 0.37725902373931236, + "grad_norm": 2.11317777633667, + "learning_rate": 3.5320043308461784e-06, + "loss": 0.4971, + "step": 22900 + }, + { + "epoch": 0.37890644305695126, + "grad_norm": 2.7997255325317383, + "learning_rate": 3.5199645599419574e-06, + "loss": 0.4562, + "step": 23000 + }, + { + "epoch": 0.3805538623745902, + "grad_norm": 2.3313941955566406, + "learning_rate": 3.5078963418274666e-06, + "loss": 0.4466, + "step": 23100 + }, + { + "epoch": 0.3822012816922291, + "grad_norm": 1.4548770189285278, + "learning_rate": 3.4958000130900273e-06, + "loss": 0.4628, + "step": 23200 + }, + { + "epoch": 0.38384870100986807, + "grad_norm": 1.5566315650939941, + "learning_rate": 
3.4836759111009767e-06, + "loss": 0.47, + "step": 23300 + }, + { + "epoch": 0.38549612032750696, + "grad_norm": 1.3899728059768677, + "learning_rate": 3.4715243740062577e-06, + "loss": 0.46, + "step": 23400 + }, + { + "epoch": 0.38714353964514586, + "grad_norm": 2.3716745376586914, + "learning_rate": 3.4593457407169896e-06, + "loss": 0.4389, + "step": 23500 + }, + { + "epoch": 0.3887909589627848, + "grad_norm": 2.0501861572265625, + "learning_rate": 3.4471403509000166e-06, + "loss": 0.4621, + "step": 23600 + }, + { + "epoch": 0.3904383782804237, + "grad_norm": 2.131397008895874, + "learning_rate": 3.4349085449684306e-06, + "loss": 0.4643, + "step": 23700 + }, + { + "epoch": 0.3920857975980626, + "grad_norm": 2.515228509902954, + "learning_rate": 3.4226506640720804e-06, + "loss": 0.4691, + "step": 23800 + }, + { + "epoch": 0.39373321691570157, + "grad_norm": 1.9131451845169067, + "learning_rate": 3.4103670500880564e-06, + "loss": 0.4583, + "step": 23900 + }, + { + "epoch": 0.39538063623334047, + "grad_norm": 2.1132075786590576, + "learning_rate": 3.3980580456111528e-06, + "loss": 0.4572, + "step": 24000 + }, + { + "epoch": 0.39702805555097936, + "grad_norm": 2.0267536640167236, + "learning_rate": 3.385723993944317e-06, + "loss": 0.4605, + "step": 24100 + }, + { + "epoch": 0.3986754748686183, + "grad_norm": 1.9140433073043823, + "learning_rate": 3.3733652390890714e-06, + "loss": 0.4634, + "step": 24200 + }, + { + "epoch": 0.4003228941862572, + "grad_norm": 1.319580078125, + "learning_rate": 3.3609821257359187e-06, + "loss": 0.4607, + "step": 24300 + }, + { + "epoch": 0.40197031350389617, + "grad_norm": 2.329153299331665, + "learning_rate": 3.3485749992547312e-06, + "loss": 0.4864, + "step": 24400 + }, + { + "epoch": 0.40361773282153507, + "grad_norm": 1.709675669670105, + "learning_rate": 3.336144205685117e-06, + "loss": 0.4772, + "step": 24500 + }, + { + "epoch": 0.40526515213917397, + "grad_norm": 1.869702696800232, + "learning_rate": 3.3236900917267663e-06, + "loss": 0.4691, + "step": 24600 + }, + { + "epoch": 0.4069125714568129, + "grad_norm": 2.017636775970459, + "learning_rate": 3.311213004729787e-06, + "loss": 0.4568, + "step": 24700 + }, + { + "epoch": 0.4085599907744518, + "grad_norm": 2.2239317893981934, + "learning_rate": 3.2987132926850123e-06, + "loss": 0.4976, + "step": 24800 + }, + { + "epoch": 0.4102074100920907, + "grad_norm": 2.3074443340301514, + "learning_rate": 3.286191304214296e-06, + "loss": 0.4669, + "step": 24900 + }, + { + "epoch": 0.4118548294097297, + "grad_norm": 1.9659165143966675, + "learning_rate": 3.2736473885607932e-06, + "loss": 0.4794, + "step": 25000 + }, + { + "epoch": 0.41350224872736857, + "grad_norm": 2.3997573852539062, + "learning_rate": 3.2610818955792135e-06, + "loss": 0.4847, + "step": 25100 + }, + { + "epoch": 0.41514966804500747, + "grad_norm": 2.5638508796691895, + "learning_rate": 3.248495175726068e-06, + "loss": 0.4452, + "step": 25200 + }, + { + "epoch": 0.4167970873626464, + "grad_norm": 1.7153327465057373, + "learning_rate": 3.235887580049893e-06, + "loss": 0.4598, + "step": 25300 + }, + { + "epoch": 0.4184445066802853, + "grad_norm": 2.540421485900879, + "learning_rate": 3.223259460181461e-06, + "loss": 0.4573, + "step": 25400 + }, + { + "epoch": 0.4200919259979243, + "grad_norm": 2.420246124267578, + "learning_rate": 3.2106111683239703e-06, + "loss": 0.4593, + "step": 25500 + }, + { + "epoch": 0.4217393453155632, + "grad_norm": 2.1598918437957764, + "learning_rate": 3.1979430572432256e-06, + "loss": 0.4343, + "step": 25600 + }, + { + 
"epoch": 0.4233867646332021, + "grad_norm": 2.091474771499634, + "learning_rate": 3.185255480257797e-06, + "loss": 0.4423, + "step": 25700 + }, + { + "epoch": 0.425034183950841, + "grad_norm": 3.1766490936279297, + "learning_rate": 3.1725487912291654e-06, + "loss": 0.4499, + "step": 25800 + }, + { + "epoch": 0.4266816032684799, + "grad_norm": 1.8975087404251099, + "learning_rate": 3.1598233445518544e-06, + "loss": 0.4833, + "step": 25900 + }, + { + "epoch": 0.4283290225861188, + "grad_norm": 2.459707498550415, + "learning_rate": 3.1470794951435473e-06, + "loss": 0.4563, + "step": 26000 + }, + { + "epoch": 0.4299764419037578, + "grad_norm": 1.9212175607681274, + "learning_rate": 3.1343175984351842e-06, + "loss": 0.4451, + "step": 26100 + }, + { + "epoch": 0.4316238612213967, + "grad_norm": 2.1869616508483887, + "learning_rate": 3.121538010361054e-06, + "loss": 0.4438, + "step": 26200 + }, + { + "epoch": 0.4332712805390356, + "grad_norm": 2.3515875339508057, + "learning_rate": 3.108741087348862e-06, + "loss": 0.4433, + "step": 26300 + }, + { + "epoch": 0.43491869985667453, + "grad_norm": 2.7230703830718994, + "learning_rate": 3.095927186309795e-06, + "loss": 0.452, + "step": 26400 + }, + { + "epoch": 0.4365661191743134, + "grad_norm": 1.987182855606079, + "learning_rate": 3.08309666462856e-06, + "loss": 0.4508, + "step": 26500 + }, + { + "epoch": 0.4382135384919524, + "grad_norm": 1.8598235845565796, + "learning_rate": 3.0702498801534234e-06, + "loss": 0.4502, + "step": 26600 + }, + { + "epoch": 0.4398609578095913, + "grad_norm": 1.3509740829467773, + "learning_rate": 3.0573871911862252e-06, + "loss": 0.4618, + "step": 26700 + }, + { + "epoch": 0.4415083771272302, + "grad_norm": 2.3464887142181396, + "learning_rate": 3.044508956472388e-06, + "loss": 0.4687, + "step": 26800 + }, + { + "epoch": 0.44315579644486913, + "grad_norm": 2.453792095184326, + "learning_rate": 3.0316155351909136e-06, + "loss": 0.4581, + "step": 26900 + }, + { + "epoch": 0.44480321576250803, + "grad_norm": 1.8684953451156616, + "learning_rate": 3.0187072869443595e-06, + "loss": 0.4775, + "step": 27000 + }, + { + "epoch": 0.44645063508014693, + "grad_norm": 2.501569986343384, + "learning_rate": 3.005784571748816e-06, + "loss": 0.4721, + "step": 27100 + }, + { + "epoch": 0.4480980543977859, + "grad_norm": 2.526435613632202, + "learning_rate": 2.992847750023861e-06, + "loss": 0.4327, + "step": 27200 + }, + { + "epoch": 0.4497454737154248, + "grad_norm": 2.1223368644714355, + "learning_rate": 2.9798971825825107e-06, + "loss": 0.4494, + "step": 27300 + }, + { + "epoch": 0.4513928930330637, + "grad_norm": 3.0751936435699463, + "learning_rate": 2.9669332306211513e-06, + "loss": 0.4513, + "step": 27400 + }, + { + "epoch": 0.45304031235070263, + "grad_norm": 1.7349650859832764, + "learning_rate": 2.95395625570947e-06, + "loss": 0.4516, + "step": 27500 + }, + { + "epoch": 0.45468773166834153, + "grad_norm": 1.474882960319519, + "learning_rate": 2.9409666197803715e-06, + "loss": 0.4269, + "step": 27600 + }, + { + "epoch": 0.4563351509859805, + "grad_norm": 1.845004916191101, + "learning_rate": 2.9279646851198796e-06, + "loss": 0.4598, + "step": 27700 + }, + { + "epoch": 0.4579825703036194, + "grad_norm": 1.4891762733459473, + "learning_rate": 2.9149508143570317e-06, + "loss": 0.4383, + "step": 27800 + }, + { + "epoch": 0.4596299896212583, + "grad_norm": 2.5375092029571533, + "learning_rate": 2.9019253704537725e-06, + "loss": 0.4903, + "step": 27900 + }, + { + "epoch": 0.46127740893889724, + "grad_norm": 2.7068655490875244, + 
"learning_rate": 2.888888716694824e-06, + "loss": 0.4673, + "step": 28000 + }, + { + "epoch": 0.46292482825653614, + "grad_norm": 1.9553802013397217, + "learning_rate": 2.8758412166775536e-06, + "loss": 0.4722, + "step": 28100 + }, + { + "epoch": 0.46457224757417503, + "grad_norm": 2.417858362197876, + "learning_rate": 2.8627832343018392e-06, + "loss": 0.4778, + "step": 28200 + }, + { + "epoch": 0.466219666891814, + "grad_norm": 2.021970748901367, + "learning_rate": 2.849715133759912e-06, + "loss": 0.438, + "step": 28300 + }, + { + "epoch": 0.4678670862094529, + "grad_norm": 1.203245997428894, + "learning_rate": 2.8366372795262043e-06, + "loss": 0.448, + "step": 28400 + }, + { + "epoch": 0.4695145055270918, + "grad_norm": 1.60651433467865, + "learning_rate": 2.8235500363471835e-06, + "loss": 0.4667, + "step": 28500 + }, + { + "epoch": 0.47116192484473074, + "grad_norm": 2.5438413619995117, + "learning_rate": 2.8104537692311772e-06, + "loss": 0.4411, + "step": 28600 + }, + { + "epoch": 0.47280934416236964, + "grad_norm": 1.9837552309036255, + "learning_rate": 2.7973488434381936e-06, + "loss": 0.4772, + "step": 28700 + }, + { + "epoch": 0.4744567634800086, + "grad_norm": 4.9808573722839355, + "learning_rate": 2.7842356244697365e-06, + "loss": 0.4585, + "step": 28800 + }, + { + "epoch": 0.4761041827976475, + "grad_norm": 2.3967010974884033, + "learning_rate": 2.771114478058609e-06, + "loss": 0.4434, + "step": 28900 + }, + { + "epoch": 0.4777516021152864, + "grad_norm": 2.0720436573028564, + "learning_rate": 2.757985770158712e-06, + "loss": 0.4553, + "step": 29000 + }, + { + "epoch": 0.47939902143292534, + "grad_norm": 2.0397377014160156, + "learning_rate": 2.744849866934843e-06, + "loss": 0.4335, + "step": 29100 + }, + { + "epoch": 0.48104644075056424, + "grad_norm": 1.8307183980941772, + "learning_rate": 2.7317071347524756e-06, + "loss": 0.4575, + "step": 29200 + }, + { + "epoch": 0.48269386006820314, + "grad_norm": 2.0401103496551514, + "learning_rate": 2.7185579401675478e-06, + "loss": 0.4536, + "step": 29300 + }, + { + "epoch": 0.4843412793858421, + "grad_norm": 1.5589044094085693, + "learning_rate": 2.705402649916238e-06, + "loss": 0.4464, + "step": 29400 + }, + { + "epoch": 0.485988698703481, + "grad_norm": 1.7465211153030396, + "learning_rate": 2.692241630904732e-06, + "loss": 0.443, + "step": 29500 + }, + { + "epoch": 0.4876361180211199, + "grad_norm": 1.9152140617370605, + "learning_rate": 2.679075250198995e-06, + "loss": 0.4453, + "step": 29600 + }, + { + "epoch": 0.48928353733875884, + "grad_norm": 1.9584287405014038, + "learning_rate": 2.665903875014531e-06, + "loss": 0.4412, + "step": 29700 + }, + { + "epoch": 0.49093095665639774, + "grad_norm": 2.4530208110809326, + "learning_rate": 2.6527278727061438e-06, + "loss": 0.455, + "step": 29800 + }, + { + "epoch": 0.4925783759740367, + "grad_norm": 2.28879451751709, + "learning_rate": 2.6395476107576866e-06, + "loss": 0.4545, + "step": 29900 + }, + { + "epoch": 0.4942257952916756, + "grad_norm": 2.3238701820373535, + "learning_rate": 2.626363456771818e-06, + "loss": 0.4659, + "step": 30000 + }, + { + "epoch": 0.4958732146093145, + "grad_norm": 2.5362935066223145, + "learning_rate": 2.613175778459746e-06, + "loss": 0.475, + "step": 30100 + }, + { + "epoch": 0.49752063392695345, + "grad_norm": 1.6304713487625122, + "learning_rate": 2.599984943630974e-06, + "loss": 0.4344, + "step": 30200 + }, + { + "epoch": 0.49916805324459235, + "grad_norm": 2.1046688556671143, + "learning_rate": 2.5867913201830415e-06, + "loss": 0.442, + "step": 30300 
+ }, + { + "epoch": 0.5008154725622312, + "grad_norm": 2.016679048538208, + "learning_rate": 2.5735952760912623e-06, + "loss": 0.4468, + "step": 30400 + }, + { + "epoch": 0.5024628918798701, + "grad_norm": 1.700775384902954, + "learning_rate": 2.560397179398467e-06, + "loss": 0.4755, + "step": 30500 + }, + { + "epoch": 0.5041103111975092, + "grad_norm": 2.6758084297180176, + "learning_rate": 2.5471973982047283e-06, + "loss": 0.4734, + "step": 30600 + }, + { + "epoch": 0.505757730515148, + "grad_norm": 2.0318357944488525, + "learning_rate": 2.533996300657105e-06, + "loss": 0.4257, + "step": 30700 + }, + { + "epoch": 0.507405149832787, + "grad_norm": 1.755279779434204, + "learning_rate": 2.5207942549393678e-06, + "loss": 0.4311, + "step": 30800 + }, + { + "epoch": 0.5090525691504258, + "grad_norm": 1.3220248222351074, + "learning_rate": 2.507591629261732e-06, + "loss": 0.4586, + "step": 30900 + }, + { + "epoch": 0.5106999884680647, + "grad_norm": 1.8418200016021729, + "learning_rate": 2.4943887918505887e-06, + "loss": 0.4856, + "step": 31000 + }, + { + "epoch": 0.5123474077857036, + "grad_norm": 2.0014216899871826, + "learning_rate": 2.4811861109382337e-06, + "loss": 0.4691, + "step": 31100 + }, + { + "epoch": 0.5139948271033427, + "grad_norm": 2.2227587699890137, + "learning_rate": 2.4679839547526e-06, + "loss": 0.4465, + "step": 31200 + }, + { + "epoch": 0.5156422464209816, + "grad_norm": 2.022191047668457, + "learning_rate": 2.4547826915069816e-06, + "loss": 0.4344, + "step": 31300 + }, + { + "epoch": 0.5172896657386205, + "grad_norm": 1.4360835552215576, + "learning_rate": 2.441582689389772e-06, + "loss": 0.446, + "step": 31400 + }, + { + "epoch": 0.5189370850562594, + "grad_norm": 2.100766658782959, + "learning_rate": 2.4283843165541914e-06, + "loss": 0.4457, + "step": 31500 + }, + { + "epoch": 0.5205845043738982, + "grad_norm": 1.6528244018554688, + "learning_rate": 2.4151879411080144e-06, + "loss": 0.4477, + "step": 31600 + }, + { + "epoch": 0.5222319236915373, + "grad_norm": 2.4091269969940186, + "learning_rate": 2.401993931103312e-06, + "loss": 0.4764, + "step": 31700 + }, + { + "epoch": 0.5238793430091762, + "grad_norm": 2.416269302368164, + "learning_rate": 2.388802654526182e-06, + "loss": 0.4572, + "step": 31800 + }, + { + "epoch": 0.5255267623268151, + "grad_norm": 1.747132420539856, + "learning_rate": 2.3756144792864812e-06, + "loss": 0.4439, + "step": 31900 + }, + { + "epoch": 0.527174181644454, + "grad_norm": 1.7760906219482422, + "learning_rate": 2.3624297732075747e-06, + "loss": 0.4589, + "step": 32000 + }, + { + "epoch": 0.5288216009620929, + "grad_norm": 1.9603146314620972, + "learning_rate": 2.349248904016069e-06, + "loss": 0.4464, + "step": 32100 + }, + { + "epoch": 0.5304690202797318, + "grad_norm": 2.7575228214263916, + "learning_rate": 2.336072239331555e-06, + "loss": 0.425, + "step": 32200 + }, + { + "epoch": 0.5321164395973708, + "grad_norm": 3.160569190979004, + "learning_rate": 2.3229001466563647e-06, + "loss": 0.4493, + "step": 32300 + }, + { + "epoch": 0.5337638589150097, + "grad_norm": 1.3065659999847412, + "learning_rate": 2.3097329933653116e-06, + "loss": 0.4134, + "step": 32400 + }, + { + "epoch": 0.5354112782326486, + "grad_norm": 1.933773159980774, + "learning_rate": 2.2965711466954444e-06, + "loss": 0.4465, + "step": 32500 + }, + { + "epoch": 0.5370586975502875, + "grad_norm": 1.7939263582229614, + "learning_rate": 2.283414973735816e-06, + "loss": 0.4577, + "step": 32600 + }, + { + "epoch": 0.5387061168679264, + "grad_norm": 2.202970027923584, + 
"learning_rate": 2.270264841417229e-06, + "loss": 0.4506, + "step": 32700 + }, + { + "epoch": 0.5403535361855654, + "grad_norm": 1.2232089042663574, + "learning_rate": 2.2571211165020164e-06, + "loss": 0.4412, + "step": 32800 + }, + { + "epoch": 0.5420009555032043, + "grad_norm": 2.2651045322418213, + "learning_rate": 2.243984165573804e-06, + "loss": 0.4838, + "step": 32900 + }, + { + "epoch": 0.5436483748208432, + "grad_norm": 1.1817712783813477, + "learning_rate": 2.2308543550272853e-06, + "loss": 0.4426, + "step": 33000 + }, + { + "epoch": 0.5452957941384821, + "grad_norm": 3.3513026237487793, + "learning_rate": 2.2177320510580115e-06, + "loss": 0.4432, + "step": 33100 + }, + { + "epoch": 0.546943213456121, + "grad_norm": 2.345806837081909, + "learning_rate": 2.2046176196521706e-06, + "loss": 0.4591, + "step": 33200 + }, + { + "epoch": 0.5485906327737599, + "grad_norm": 2.1807124614715576, + "learning_rate": 2.191511426576377e-06, + "loss": 0.4589, + "step": 33300 + }, + { + "epoch": 0.5502380520913989, + "grad_norm": 2.6100516319274902, + "learning_rate": 2.1784138373674817e-06, + "loss": 0.4644, + "step": 33400 + }, + { + "epoch": 0.5518854714090378, + "grad_norm": 1.3514959812164307, + "learning_rate": 2.165325217322367e-06, + "loss": 0.4123, + "step": 33500 + }, + { + "epoch": 0.5535328907266767, + "grad_norm": 2.2316343784332275, + "learning_rate": 2.1522459314877603e-06, + "loss": 0.4329, + "step": 33600 + }, + { + "epoch": 0.5551803100443156, + "grad_norm": 1.948644757270813, + "learning_rate": 2.1391763446500583e-06, + "loss": 0.4485, + "step": 33700 + }, + { + "epoch": 0.5568277293619545, + "grad_norm": 2.1561203002929688, + "learning_rate": 2.1261168213251465e-06, + "loss": 0.4557, + "step": 33800 + }, + { + "epoch": 0.5584751486795935, + "grad_norm": 2.097280263900757, + "learning_rate": 2.1130677257482328e-06, + "loss": 0.4535, + "step": 33900 + }, + { + "epoch": 0.5601225679972324, + "grad_norm": 2.417245388031006, + "learning_rate": 2.1000294218636963e-06, + "loss": 0.4758, + "step": 34000 + }, + { + "epoch": 0.5617699873148713, + "grad_norm": 1.9167017936706543, + "learning_rate": 2.0870022733149287e-06, + "loss": 0.4742, + "step": 34100 + }, + { + "epoch": 0.5634174066325102, + "grad_norm": 1.482334017753601, + "learning_rate": 2.073986643434193e-06, + "loss": 0.4287, + "step": 34200 + }, + { + "epoch": 0.5650648259501491, + "grad_norm": 1.6773154735565186, + "learning_rate": 2.0609828952324954e-06, + "loss": 0.4211, + "step": 34300 + }, + { + "epoch": 0.566712245267788, + "grad_norm": 1.883154273033142, + "learning_rate": 2.047991391389458e-06, + "loss": 0.453, + "step": 34400 + }, + { + "epoch": 0.568359664585427, + "grad_norm": 2.0675201416015625, + "learning_rate": 2.035012494243198e-06, + "loss": 0.4762, + "step": 34500 + }, + { + "epoch": 0.5700070839030659, + "grad_norm": 2.362501382827759, + "learning_rate": 2.0220465657802322e-06, + "loss": 0.4566, + "step": 34600 + }, + { + "epoch": 0.5716545032207048, + "grad_norm": 1.8373854160308838, + "learning_rate": 2.0090939676253744e-06, + "loss": 0.442, + "step": 34700 + }, + { + "epoch": 0.5733019225383437, + "grad_norm": 1.8830519914627075, + "learning_rate": 1.9961550610316477e-06, + "loss": 0.4521, + "step": 34800 + }, + { + "epoch": 0.5749493418559826, + "grad_norm": 1.484971523284912, + "learning_rate": 1.9832302068702162e-06, + "loss": 0.4795, + "step": 34900 + }, + { + "epoch": 0.5765967611736216, + "grad_norm": 1.619246482849121, + "learning_rate": 1.9703197656203153e-06, + "loss": 0.4525, + "step": 35000 + }, 
+ { + "epoch": 0.5782441804912605, + "grad_norm": 1.589003562927246, + "learning_rate": 1.9574240973591955e-06, + "loss": 0.4346, + "step": 35100 + }, + { + "epoch": 0.5798915998088994, + "grad_norm": 1.2750858068466187, + "learning_rate": 1.944543561752088e-06, + "loss": 0.4595, + "step": 35200 + }, + { + "epoch": 0.5815390191265383, + "grad_norm": 2.5024302005767822, + "learning_rate": 1.931678518042165e-06, + "loss": 0.4469, + "step": 35300 + }, + { + "epoch": 0.5831864384441772, + "grad_norm": 2.244246244430542, + "learning_rate": 1.918829325040523e-06, + "loss": 0.4475, + "step": 35400 + }, + { + "epoch": 0.5848338577618161, + "grad_norm": 1.7237255573272705, + "learning_rate": 1.9059963411161788e-06, + "loss": 0.4578, + "step": 35500 + }, + { + "epoch": 0.5864812770794551, + "grad_norm": 1.9429930448532104, + "learning_rate": 1.8931799241860704e-06, + "loss": 0.4776, + "step": 35600 + }, + { + "epoch": 0.588128696397094, + "grad_norm": 2.0698490142822266, + "learning_rate": 1.880380431705075e-06, + "loss": 0.4422, + "step": 35700 + }, + { + "epoch": 0.5897761157147329, + "grad_norm": 1.440127968788147, + "learning_rate": 1.8675982206560417e-06, + "loss": 0.4528, + "step": 35800 + }, + { + "epoch": 0.5914235350323718, + "grad_norm": 2.600696563720703, + "learning_rate": 1.854833647539833e-06, + "loss": 0.4167, + "step": 35900 + }, + { + "epoch": 0.5930709543500107, + "grad_norm": 2.2462635040283203, + "learning_rate": 1.8420870683653819e-06, + "loss": 0.4461, + "step": 36000 + }, + { + "epoch": 0.5947183736676497, + "grad_norm": 2.301934003829956, + "learning_rate": 1.8293588386397646e-06, + "loss": 0.4609, + "step": 36100 + }, + { + "epoch": 0.5963657929852886, + "grad_norm": 1.231947422027588, + "learning_rate": 1.816649313358284e-06, + "loss": 0.4617, + "step": 36200 + }, + { + "epoch": 0.5980132123029275, + "grad_norm": 1.6088837385177612, + "learning_rate": 1.8039588469945675e-06, + "loss": 0.4298, + "step": 36300 + }, + { + "epoch": 0.5996606316205664, + "grad_norm": 2.1999731063842773, + "learning_rate": 1.791287793490682e-06, + "loss": 0.4576, + "step": 36400 + }, + { + "epoch": 0.6013080509382053, + "grad_norm": 1.9624534845352173, + "learning_rate": 1.7786365062472645e-06, + "loss": 0.4416, + "step": 36500 + }, + { + "epoch": 0.6029554702558442, + "grad_norm": 2.441080093383789, + "learning_rate": 1.7660053381136593e-06, + "loss": 0.4613, + "step": 36600 + }, + { + "epoch": 0.6046028895734832, + "grad_norm": 1.7500004768371582, + "learning_rate": 1.7533946413780845e-06, + "loss": 0.4493, + "step": 36700 + }, + { + "epoch": 0.6062503088911221, + "grad_norm": 1.9511518478393555, + "learning_rate": 1.7408047677578016e-06, + "loss": 0.4487, + "step": 36800 + }, + { + "epoch": 0.607897728208761, + "grad_norm": 2.2485551834106445, + "learning_rate": 1.7282360683893057e-06, + "loss": 0.4515, + "step": 36900 + }, + { + "epoch": 0.6095451475263999, + "grad_norm": 2.1224875450134277, + "learning_rate": 1.7156888938185373e-06, + "loss": 0.4384, + "step": 37000 + }, + { + "epoch": 0.6111925668440388, + "grad_norm": 2.6325182914733887, + "learning_rate": 1.7031635939910968e-06, + "loss": 0.4625, + "step": 37100 + }, + { + "epoch": 0.6128399861616778, + "grad_norm": 1.8848086595535278, + "learning_rate": 1.6906605182424942e-06, + "loss": 0.4627, + "step": 37200 + }, + { + "epoch": 0.6144874054793167, + "grad_norm": 1.8694807291030884, + "learning_rate": 1.6781800152884004e-06, + "loss": 0.4572, + "step": 37300 + }, + { + "epoch": 0.6161348247969556, + "grad_norm": 1.9170241355895996, + 
"learning_rate": 1.6657224332149185e-06, + "loss": 0.4646, + "step": 37400 + }, + { + "epoch": 0.6177822441145945, + "grad_norm": 2.1769967079162598, + "learning_rate": 1.6532881194688843e-06, + "loss": 0.4584, + "step": 37500 + }, + { + "epoch": 0.6194296634322334, + "grad_norm": 2.1281752586364746, + "learning_rate": 1.640877420848169e-06, + "loss": 0.4588, + "step": 37600 + }, + { + "epoch": 0.6210770827498723, + "grad_norm": 3.2545199394226074, + "learning_rate": 1.6284906834920056e-06, + "loss": 0.4494, + "step": 37700 + }, + { + "epoch": 0.6227245020675113, + "grad_norm": 2.595705032348633, + "learning_rate": 1.6161282528713429e-06, + "loss": 0.4702, + "step": 37800 + }, + { + "epoch": 0.6243719213851502, + "grad_norm": 2.0563864707946777, + "learning_rate": 1.6037904737792037e-06, + "loss": 0.4374, + "step": 37900 + }, + { + "epoch": 0.6260193407027891, + "grad_norm": 2.5470025539398193, + "learning_rate": 1.5914776903210675e-06, + "loss": 0.4467, + "step": 38000 + }, + { + "epoch": 0.627666760020428, + "grad_norm": 2.6239607334136963, + "learning_rate": 1.5791902459052793e-06, + "loss": 0.4156, + "step": 38100 + }, + { + "epoch": 0.6293141793380669, + "grad_norm": 1.202338457107544, + "learning_rate": 1.5669284832334671e-06, + "loss": 0.4163, + "step": 38200 + }, + { + "epoch": 0.6309615986557059, + "grad_norm": 2.398700714111328, + "learning_rate": 1.554692744290984e-06, + "loss": 0.4515, + "step": 38300 + }, + { + "epoch": 0.6326090179733448, + "grad_norm": 2.2210938930511475, + "learning_rate": 1.542483370337372e-06, + "loss": 0.4704, + "step": 38400 + }, + { + "epoch": 0.6342564372909837, + "grad_norm": 1.1223909854888916, + "learning_rate": 1.530300701896844e-06, + "loss": 0.4231, + "step": 38500 + }, + { + "epoch": 0.6359038566086226, + "grad_norm": 2.2360265254974365, + "learning_rate": 1.5181450787487839e-06, + "loss": 0.4339, + "step": 38600 + }, + { + "epoch": 0.6375512759262615, + "grad_norm": 1.6431453227996826, + "learning_rate": 1.5060168399182731e-06, + "loss": 0.4341, + "step": 38700 + }, + { + "epoch": 0.6391986952439004, + "grad_norm": 1.9951646327972412, + "learning_rate": 1.4939163236666338e-06, + "loss": 0.4744, + "step": 38800 + }, + { + "epoch": 0.6408461145615394, + "grad_norm": 3.3914270401000977, + "learning_rate": 1.4818438674819934e-06, + "loss": 0.4595, + "step": 38900 + }, + { + "epoch": 0.6424935338791783, + "grad_norm": 2.1617212295532227, + "learning_rate": 1.4697998080698745e-06, + "loss": 0.4465, + "step": 39000 + }, + { + "epoch": 0.6441409531968172, + "grad_norm": 2.4593045711517334, + "learning_rate": 1.4577844813438022e-06, + "loss": 0.4695, + "step": 39100 + }, + { + "epoch": 0.6457883725144561, + "grad_norm": 2.2030935287475586, + "learning_rate": 1.4457982224159346e-06, + "loss": 0.4449, + "step": 39200 + }, + { + "epoch": 0.647435791832095, + "grad_norm": 1.3730400800704956, + "learning_rate": 1.433841365587719e-06, + "loss": 0.4382, + "step": 39300 + }, + { + "epoch": 0.649083211149734, + "grad_norm": 3.4730331897735596, + "learning_rate": 1.421914244340567e-06, + "loss": 0.4469, + "step": 39400 + }, + { + "epoch": 0.6507306304673729, + "grad_norm": 1.946877360343933, + "learning_rate": 1.410017191326551e-06, + "loss": 0.4685, + "step": 39500 + }, + { + "epoch": 0.6523780497850118, + "grad_norm": 1.6987239122390747, + "learning_rate": 1.39815053835913e-06, + "loss": 0.4469, + "step": 39600 + }, + { + "epoch": 0.6540254691026507, + "grad_norm": 1.93442964553833, + "learning_rate": 1.3863146164038946e-06, + "loss": 0.4523, + "step": 39700 + 
}, + { + "epoch": 0.6556728884202896, + "grad_norm": 2.016063690185547, + "learning_rate": 1.3745097555693343e-06, + "loss": 0.4079, + "step": 39800 + }, + { + "epoch": 0.6573203077379285, + "grad_norm": 1.9582340717315674, + "learning_rate": 1.3627362850976323e-06, + "loss": 0.4524, + "step": 39900 + }, + { + "epoch": 0.6589677270555675, + "grad_norm": 1.6741374731063843, + "learning_rate": 1.3509945333554828e-06, + "loss": 0.4346, + "step": 40000 + }, + { + "epoch": 0.6606151463732064, + "grad_norm": 2.514186382293701, + "learning_rate": 1.3392848278249298e-06, + "loss": 0.4761, + "step": 40100 + }, + { + "epoch": 0.6622625656908453, + "grad_norm": 2.4352760314941406, + "learning_rate": 1.3276074950942381e-06, + "loss": 0.4182, + "step": 40200 + }, + { + "epoch": 0.6639099850084842, + "grad_norm": 1.9086421728134155, + "learning_rate": 1.3159628608487848e-06, + "loss": 0.4431, + "step": 40300 + }, + { + "epoch": 0.6655574043261231, + "grad_norm": 1.9062386751174927, + "learning_rate": 1.3043512498619677e-06, + "loss": 0.4494, + "step": 40400 + }, + { + "epoch": 0.6672048236437621, + "grad_norm": 2.4138245582580566, + "learning_rate": 1.2927729859861571e-06, + "loss": 0.4493, + "step": 40500 + }, + { + "epoch": 0.668852242961401, + "grad_norm": 2.2896976470947266, + "learning_rate": 1.2812283921436597e-06, + "loss": 0.4383, + "step": 40600 + }, + { + "epoch": 0.6704996622790399, + "grad_norm": 2.136972427368164, + "learning_rate": 1.2697177903177077e-06, + "loss": 0.4233, + "step": 40700 + }, + { + "epoch": 0.6721470815966788, + "grad_norm": 1.7220128774642944, + "learning_rate": 1.2582415015434857e-06, + "loss": 0.4331, + "step": 40800 + }, + { + "epoch": 0.6737945009143177, + "grad_norm": 2.0941953659057617, + "learning_rate": 1.2467998458991768e-06, + "loss": 0.482, + "step": 40900 + }, + { + "epoch": 0.6754419202319566, + "grad_norm": 2.6354613304138184, + "learning_rate": 1.2353931424970258e-06, + "loss": 0.4487, + "step": 41000 + }, + { + "epoch": 0.6770893395495956, + "grad_norm": 2.2864413261413574, + "learning_rate": 1.224021709474451e-06, + "loss": 0.4668, + "step": 41100 + }, + { + "epoch": 0.6787367588672345, + "grad_norm": 1.8881123065948486, + "learning_rate": 1.2126858639851649e-06, + "loss": 0.4572, + "step": 41200 + }, + { + "epoch": 0.6803841781848734, + "grad_norm": 2.1788628101348877, + "learning_rate": 1.2013859221903273e-06, + "loss": 0.4589, + "step": 41300 + }, + { + "epoch": 0.6820315975025123, + "grad_norm": 2.4340453147888184, + "learning_rate": 1.190122199249733e-06, + "loss": 0.4363, + "step": 41400 + }, + { + "epoch": 0.6836790168201512, + "grad_norm": 2.3238346576690674, + "learning_rate": 1.1788950093130177e-06, + "loss": 0.4187, + "step": 41500 + }, + { + "epoch": 0.6853264361377902, + "grad_norm": 2.4663116931915283, + "learning_rate": 1.1677046655108974e-06, + "loss": 0.4542, + "step": 41600 + }, + { + "epoch": 0.6869738554554291, + "grad_norm": 1.5595173835754395, + "learning_rate": 1.1565514799464354e-06, + "loss": 0.4612, + "step": 41700 + }, + { + "epoch": 0.688621274773068, + "grad_norm": 2.0184364318847656, + "learning_rate": 1.145435763686335e-06, + "loss": 0.4535, + "step": 41800 + }, + { + "epoch": 0.6902686940907069, + "grad_norm": 3.0829389095306396, + "learning_rate": 1.134357826752269e-06, + "loss": 0.4307, + "step": 41900 + }, + { + "epoch": 0.6919161134083458, + "grad_norm": 2.8656702041625977, + "learning_rate": 1.1233179781122286e-06, + "loss": 0.4511, + "step": 42000 + }, + { + "epoch": 0.6935635327259847, + "grad_norm": 
2.2438855171203613, + "learning_rate": 1.1123165256719077e-06, + "loss": 0.4358, + "step": 42100 + }, + { + "epoch": 0.6952109520436237, + "grad_norm": 2.6837387084960938, + "learning_rate": 1.1013537762661147e-06, + "loss": 0.4702, + "step": 42200 + }, + { + "epoch": 0.6968583713612626, + "grad_norm": 2.0240025520324707, + "learning_rate": 1.0904300356502174e-06, + "loss": 0.4211, + "step": 42300 + }, + { + "epoch": 0.6985057906789015, + "grad_norm": 2.1769285202026367, + "learning_rate": 1.0795456084916095e-06, + "loss": 0.4635, + "step": 42400 + }, + { + "epoch": 0.7001532099965404, + "grad_norm": 1.203687310218811, + "learning_rate": 1.0687007983612189e-06, + "loss": 0.4241, + "step": 42500 + }, + { + "epoch": 0.7018006293141793, + "grad_norm": 2.5927300453186035, + "learning_rate": 1.0578959077250417e-06, + "loss": 0.4603, + "step": 42600 + }, + { + "epoch": 0.7034480486318182, + "grad_norm": 1.3485939502716064, + "learning_rate": 1.0471312379356991e-06, + "loss": 0.4563, + "step": 42700 + }, + { + "epoch": 0.7050954679494572, + "grad_norm": 1.8091089725494385, + "learning_rate": 1.03640708922404e-06, + "loss": 0.4303, + "step": 42800 + }, + { + "epoch": 0.7067428872670961, + "grad_norm": 2.243220090866089, + "learning_rate": 1.0257237606907647e-06, + "loss": 0.4484, + "step": 42900 + }, + { + "epoch": 0.708390306584735, + "grad_norm": 1.7703299522399902, + "learning_rate": 1.0150815502980804e-06, + "loss": 0.4459, + "step": 43000 + }, + { + "epoch": 0.7100377259023739, + "grad_norm": 1.7074419260025024, + "learning_rate": 1.0044807548613947e-06, + "loss": 0.3932, + "step": 43100 + }, + { + "epoch": 0.7116851452200128, + "grad_norm": 2.930617332458496, + "learning_rate": 9.939216700410387e-07, + "loss": 0.4411, + "step": 43200 + }, + { + "epoch": 0.7133325645376518, + "grad_norm": 1.8758985996246338, + "learning_rate": 9.834045903340127e-07, + "loss": 0.434, + "step": 43300 + }, + { + "epoch": 0.7149799838552907, + "grad_norm": 2.038867712020874, + "learning_rate": 9.729298090657821e-07, + "loss": 0.4666, + "step": 43400 + }, + { + "epoch": 0.7166274031729296, + "grad_norm": 2.4463798999786377, + "learning_rate": 9.624976183820914e-07, + "loss": 0.4492, + "step": 43500 + }, + { + "epoch": 0.7182748224905685, + "grad_norm": 0.9264168739318848, + "learning_rate": 9.521083092408148e-07, + "loss": 0.4308, + "step": 43600 + }, + { + "epoch": 0.7199222418082074, + "grad_norm": 1.8402535915374756, + "learning_rate": 9.417621714038455e-07, + "loss": 0.4375, + "step": 43700 + }, + { + "epoch": 0.7215696611258463, + "grad_norm": 2.28937029838562, + "learning_rate": 9.314594934290147e-07, + "loss": 0.4451, + "step": 43800 + }, + { + "epoch": 0.7232170804434853, + "grad_norm": 2.710644245147705, + "learning_rate": 9.212005626620354e-07, + "loss": 0.4923, + "step": 43900 + }, + { + "epoch": 0.7248644997611242, + "grad_norm": 1.6825114488601685, + "learning_rate": 9.109856652284979e-07, + "loss": 0.4281, + "step": 44000 + }, + { + "epoch": 0.7265119190787631, + "grad_norm": 1.5312185287475586, + "learning_rate": 9.008150860258852e-07, + "loss": 0.4252, + "step": 44100 + }, + { + "epoch": 0.728159338396402, + "grad_norm": 1.606581449508667, + "learning_rate": 8.90689108715625e-07, + "loss": 0.4449, + "step": 44200 + }, + { + "epoch": 0.7298067577140409, + "grad_norm": 2.8217248916625977, + "learning_rate": 8.806080157151828e-07, + "loss": 0.4399, + "step": 44300 + }, + { + "epoch": 0.7314541770316799, + "grad_norm": 2.25714373588562, + "learning_rate": 8.705720881901855e-07, + "loss": 0.435, + 
"step": 44400 + }, + { + "epoch": 0.7331015963493188, + "grad_norm": 2.2999300956726074, + "learning_rate": 8.605816060465725e-07, + "loss": 0.4481, + "step": 44500 + }, + { + "epoch": 0.7347490156669577, + "grad_norm": 2.1442625522613525, + "learning_rate": 8.506368479227958e-07, + "loss": 0.4396, + "step": 44600 + }, + { + "epoch": 0.7363964349845966, + "grad_norm": 2.097804307937622, + "learning_rate": 8.407380911820487e-07, + "loss": 0.4486, + "step": 44700 + }, + { + "epoch": 0.7380438543022355, + "grad_norm": 2.046945333480835, + "learning_rate": 8.308856119045239e-07, + "loss": 0.4639, + "step": 44800 + }, + { + "epoch": 0.7396912736198744, + "grad_norm": 1.8260259628295898, + "learning_rate": 8.210796848797193e-07, + "loss": 0.4433, + "step": 44900 + }, + { + "epoch": 0.7413386929375134, + "grad_norm": 2.123908281326294, + "learning_rate": 8.113205835987756e-07, + "loss": 0.4183, + "step": 45000 + }, + { + "epoch": 0.7429861122551523, + "grad_norm": 2.8095531463623047, + "learning_rate": 8.016085802468399e-07, + "loss": 0.4357, + "step": 45100 + }, + { + "epoch": 0.7446335315727912, + "grad_norm": 3.761507511138916, + "learning_rate": 7.919439456954822e-07, + "loss": 0.4282, + "step": 45200 + }, + { + "epoch": 0.7462809508904301, + "grad_norm": 1.9820051193237305, + "learning_rate": 7.823269494951394e-07, + "loss": 0.4714, + "step": 45300 + }, + { + "epoch": 0.747928370208069, + "grad_norm": 1.8739370107650757, + "learning_rate": 7.727578598675917e-07, + "loss": 0.4312, + "step": 45400 + }, + { + "epoch": 0.749575789525708, + "grad_norm": 2.4350790977478027, + "learning_rate": 7.632369436984921e-07, + "loss": 0.4308, + "step": 45500 + }, + { + "epoch": 0.7512232088433469, + "grad_norm": 2.3461410999298096, + "learning_rate": 7.53764466529914e-07, + "loss": 0.4495, + "step": 45600 + }, + { + "epoch": 0.7528706281609858, + "grad_norm": 2.332594633102417, + "learning_rate": 7.443406925529467e-07, + "loss": 0.4271, + "step": 45700 + }, + { + "epoch": 0.7545180474786247, + "grad_norm": 2.7010247707366943, + "learning_rate": 7.349658846003318e-07, + "loss": 0.4581, + "step": 45800 + }, + { + "epoch": 0.7561654667962636, + "grad_norm": 2.0763182640075684, + "learning_rate": 7.256403041391258e-07, + "loss": 0.4599, + "step": 45900 + }, + { + "epoch": 0.7578128861139025, + "grad_norm": 1.678594708442688, + "learning_rate": 7.163642112634134e-07, + "loss": 0.4614, + "step": 46000 + }, + { + "epoch": 0.7594603054315415, + "grad_norm": 1.6114099025726318, + "learning_rate": 7.071378646870525e-07, + "loss": 0.4352, + "step": 46100 + }, + { + "epoch": 0.7611077247491804, + "grad_norm": 2.531679391860962, + "learning_rate": 6.979615217364539e-07, + "loss": 0.452, + "step": 46200 + }, + { + "epoch": 0.7627551440668193, + "grad_norm": 1.2857202291488647, + "learning_rate": 6.888354383434098e-07, + "loss": 0.4425, + "step": 46300 + }, + { + "epoch": 0.7644025633844582, + "grad_norm": 1.769644021987915, + "learning_rate": 6.797598690379542e-07, + "loss": 0.4325, + "step": 46400 + }, + { + "epoch": 0.7660499827020971, + "grad_norm": 1.5384021997451782, + "learning_rate": 6.707350669412613e-07, + "loss": 0.4739, + "step": 46500 + }, + { + "epoch": 0.7676974020197361, + "grad_norm": 2.200972318649292, + "learning_rate": 6.617612837585887e-07, + "loss": 0.4702, + "step": 46600 + }, + { + "epoch": 0.769344821337375, + "grad_norm": 2.062885046005249, + "learning_rate": 6.528387697722599e-07, + "loss": 0.4703, + "step": 46700 + }, + { + "epoch": 0.7709922406550139, + "grad_norm": 1.4489109516143799, + 
"learning_rate": 6.439677738346752e-07, + "loss": 0.4403, + "step": 46800 + }, + { + "epoch": 0.7726396599726528, + "grad_norm": 3.070599317550659, + "learning_rate": 6.351485433613799e-07, + "loss": 0.4353, + "step": 46900 + }, + { + "epoch": 0.7742870792902917, + "grad_norm": 2.201493978500366, + "learning_rate": 6.263813243241593e-07, + "loss": 0.4201, + "step": 47000 + }, + { + "epoch": 0.7759344986079306, + "grad_norm": 2.203810930252075, + "learning_rate": 6.176663612441785e-07, + "loss": 0.4681, + "step": 47100 + }, + { + "epoch": 0.7775819179255696, + "grad_norm": 2.4481027126312256, + "learning_rate": 6.090038971851642e-07, + "loss": 0.4721, + "step": 47200 + }, + { + "epoch": 0.7792293372432085, + "grad_norm": 1.9644261598587036, + "learning_rate": 6.003941737466273e-07, + "loss": 0.4365, + "step": 47300 + }, + { + "epoch": 0.7808767565608474, + "grad_norm": 1.6432219743728638, + "learning_rate": 5.918374310571176e-07, + "loss": 0.4291, + "step": 47400 + }, + { + "epoch": 0.7825241758784863, + "grad_norm": 2.489579200744629, + "learning_rate": 5.833339077675343e-07, + "loss": 0.4396, + "step": 47500 + }, + { + "epoch": 0.7841715951961252, + "grad_norm": 1.5569617748260498, + "learning_rate": 5.748838410444665e-07, + "loss": 0.4491, + "step": 47600 + }, + { + "epoch": 0.7858190145137642, + "grad_norm": 2.200166702270508, + "learning_rate": 5.664874665635767e-07, + "loss": 0.4672, + "step": 47700 + }, + { + "epoch": 0.7874664338314031, + "grad_norm": 2.1616365909576416, + "learning_rate": 5.581450185030315e-07, + "loss": 0.4579, + "step": 47800 + }, + { + "epoch": 0.789113853149042, + "grad_norm": 1.2923545837402344, + "learning_rate": 5.4985672953697e-07, + "loss": 0.4424, + "step": 47900 + }, + { + "epoch": 0.7907612724666809, + "grad_norm": 2.338345527648926, + "learning_rate": 5.416228308290095e-07, + "loss": 0.4416, + "step": 48000 + }, + { + "epoch": 0.7924086917843198, + "grad_norm": 1.684395670890808, + "learning_rate": 5.334435520258039e-07, + "loss": 0.4136, + "step": 48100 + }, + { + "epoch": 0.7940561111019587, + "grad_norm": 1.9474413394927979, + "learning_rate": 5.25319121250637e-07, + "loss": 0.4252, + "step": 48200 + }, + { + "epoch": 0.7957035304195977, + "grad_norm": 2.8479621410369873, + "learning_rate": 5.172497650970567e-07, + "loss": 0.4375, + "step": 48300 + }, + { + "epoch": 0.7973509497372366, + "grad_norm": 1.9628188610076904, + "learning_rate": 5.092357086225627e-07, + "loss": 0.4455, + "step": 48400 + }, + { + "epoch": 0.7989983690548755, + "grad_norm": 1.8695141077041626, + "learning_rate": 5.012771753423223e-07, + "loss": 0.4819, + "step": 48500 + }, + { + "epoch": 0.8006457883725144, + "grad_norm": 1.873336672782898, + "learning_rate": 4.933743872229388e-07, + "loss": 0.4405, + "step": 48600 + }, + { + "epoch": 0.8022932076901533, + "grad_norm": 2.134643077850342, + "learning_rate": 4.85527564676262e-07, + "loss": 0.4381, + "step": 48700 + }, + { + "epoch": 0.8039406270077923, + "grad_norm": 2.1162221431732178, + "learning_rate": 4.777369265532408e-07, + "loss": 0.4577, + "step": 48800 + }, + { + "epoch": 0.8055880463254312, + "grad_norm": 2.036649227142334, + "learning_rate": 4.7000269013781604e-07, + "loss": 0.4238, + "step": 48900 + }, + { + "epoch": 0.8072354656430701, + "grad_norm": 1.4969152212142944, + "learning_rate": 4.6232507114086613e-07, + "loss": 0.45, + "step": 49000 + }, + { + "epoch": 0.808882884960709, + "grad_norm": 1.9845752716064453, + "learning_rate": 4.547042836941865e-07, + "loss": 0.4548, + "step": 49100 + }, + { + "epoch": 
0.8105303042783479, + "grad_norm": 1.967536449432373, + "learning_rate": 4.4714054034451585e-07, + "loss": 0.4057, + "step": 49200 + }, + { + "epoch": 0.8121777235959868, + "grad_norm": 1.79136323928833, + "learning_rate": 4.3963405204761416e-07, + "loss": 0.4456, + "step": 49300 + }, + { + "epoch": 0.8138251429136258, + "grad_norm": 2.0205838680267334, + "learning_rate": 4.3218502816237433e-07, + "loss": 0.398, + "step": 49400 + }, + { + "epoch": 0.8154725622312647, + "grad_norm": 1.4011536836624146, + "learning_rate": 4.247936764449828e-07, + "loss": 0.4542, + "step": 49500 + }, + { + "epoch": 0.8171199815489036, + "grad_norm": 1.8763850927352905, + "learning_rate": 4.174602030431299e-07, + "loss": 0.4464, + "step": 49600 + }, + { + "epoch": 0.8187674008665425, + "grad_norm": 1.8748266696929932, + "learning_rate": 4.1018481249025523e-07, + "loss": 0.4608, + "step": 49700 + }, + { + "epoch": 0.8204148201841814, + "grad_norm": 2.887885808944702, + "learning_rate": 4.0296770769984393e-07, + "loss": 0.468, + "step": 49800 + }, + { + "epoch": 0.8220622395018204, + "grad_norm": 3.4386472702026367, + "learning_rate": 3.958090899597705e-07, + "loss": 0.4487, + "step": 49900 + }, + { + "epoch": 0.8237096588194593, + "grad_norm": 2.4126787185668945, + "learning_rate": 3.8870915892668253e-07, + "loss": 0.452, + "step": 50000 + }, + { + "epoch": 0.8253570781370982, + "grad_norm": 1.8389333486557007, + "learning_rate": 3.816681126204297e-07, + "loss": 0.4666, + "step": 50100 + }, + { + "epoch": 0.8270044974547371, + "grad_norm": 2.392357349395752, + "learning_rate": 3.746861474185487e-07, + "loss": 0.4457, + "step": 50200 + }, + { + "epoch": 0.828651916772376, + "grad_norm": 2.450810194015503, + "learning_rate": 3.677634580507758e-07, + "loss": 0.4777, + "step": 50300 + }, + { + "epoch": 0.8302993360900149, + "grad_norm": 2.1401236057281494, + "learning_rate": 3.609002375936244e-07, + "loss": 0.4546, + "step": 50400 + }, + { + "epoch": 0.831946755407654, + "grad_norm": 2.275261163711548, + "learning_rate": 3.540966774649962e-07, + "loss": 0.4286, + "step": 50500 + }, + { + "epoch": 0.8335941747252928, + "grad_norm": 2.4037744998931885, + "learning_rate": 3.4735296741884113e-07, + "loss": 0.441, + "step": 50600 + }, + { + "epoch": 0.8352415940429317, + "grad_norm": 1.7885956764221191, + "learning_rate": 3.406692955398699e-07, + "loss": 0.4487, + "step": 50700 + }, + { + "epoch": 0.8368890133605706, + "grad_norm": 2.087801456451416, + "learning_rate": 3.340458482383038e-07, + "loss": 0.4414, + "step": 50800 + }, + { + "epoch": 0.8385364326782095, + "grad_norm": 1.9815489053726196, + "learning_rate": 3.2748281024467615e-07, + "loss": 0.4408, + "step": 50900 + }, + { + "epoch": 0.8401838519958486, + "grad_norm": 2.0206503868103027, + "learning_rate": 3.209803646046825e-07, + "loss": 0.4769, + "step": 51000 + }, + { + "epoch": 0.8418312713134875, + "grad_norm": 2.112884521484375, + "learning_rate": 3.14538692674074e-07, + "loss": 0.4392, + "step": 51100 + }, + { + "epoch": 0.8434786906311263, + "grad_norm": 1.9830784797668457, + "learning_rate": 3.0815797411359705e-07, + "loss": 0.4534, + "step": 51200 + }, + { + "epoch": 0.8451261099487652, + "grad_norm": 2.5792412757873535, + "learning_rate": 3.0183838688398834e-07, + "loss": 0.4141, + "step": 51300 + }, + { + "epoch": 0.8467735292664041, + "grad_norm": 1.4945428371429443, + "learning_rate": 2.9558010724100556e-07, + "loss": 0.4413, + "step": 51400 + }, + { + "epoch": 0.848420948584043, + "grad_norm": 1.6658538579940796, + "learning_rate": 
2.893833097305135e-07, + "loss": 0.4381, + "step": 51500 + }, + { + "epoch": 0.850068367901682, + "grad_norm": 1.9433872699737549, + "learning_rate": 2.832481671836174e-07, + "loss": 0.4916, + "step": 51600 + }, + { + "epoch": 0.851715787219321, + "grad_norm": 2.8448355197906494, + "learning_rate": 2.771748507118413e-07, + "loss": 0.4529, + "step": 51700 + }, + { + "epoch": 0.8533632065369599, + "grad_norm": 1.6692224740982056, + "learning_rate": 2.711635297023546e-07, + "loss": 0.4331, + "step": 51800 + }, + { + "epoch": 0.8550106258545987, + "grad_norm": 2.085247039794922, + "learning_rate": 2.6521437181325105e-07, + "loss": 0.4573, + "step": 51900 + }, + { + "epoch": 0.8566580451722376, + "grad_norm": 1.9214270114898682, + "learning_rate": 2.593275429688699e-07, + "loss": 0.443, + "step": 52000 + }, + { + "epoch": 0.8583054644898767, + "grad_norm": 1.856969952583313, + "learning_rate": 2.535032073551677e-07, + "loss": 0.4804, + "step": 52100 + }, + { + "epoch": 0.8599528838075156, + "grad_norm": 2.086461067199707, + "learning_rate": 2.4774152741514207e-07, + "loss": 0.4505, + "step": 52200 + }, + { + "epoch": 0.8616003031251545, + "grad_norm": 2.729485511779785, + "learning_rate": 2.4204266384429855e-07, + "loss": 0.4661, + "step": 52300 + }, + { + "epoch": 0.8632477224427934, + "grad_norm": 1.9726873636245728, + "learning_rate": 2.3640677558616875e-07, + "loss": 0.4561, + "step": 52400 + }, + { + "epoch": 0.8648951417604323, + "grad_norm": 1.9894851446151733, + "learning_rate": 2.308340198278808e-07, + "loss": 0.4564, + "step": 52500 + }, + { + "epoch": 0.8665425610780711, + "grad_norm": 1.4880281686782837, + "learning_rate": 2.2532455199577085e-07, + "loss": 0.43, + "step": 52600 + }, + { + "epoch": 0.8681899803957102, + "grad_norm": 1.956846833229065, + "learning_rate": 2.198785257510491e-07, + "loss": 0.4671, + "step": 52700 + }, + { + "epoch": 0.8698373997133491, + "grad_norm": 2.6969892978668213, + "learning_rate": 2.144960929855175e-07, + "loss": 0.4306, + "step": 52800 + }, + { + "epoch": 0.871484819030988, + "grad_norm": 2.5215413570404053, + "learning_rate": 2.091774038173297e-07, + "loss": 0.4458, + "step": 52900 + }, + { + "epoch": 0.8731322383486269, + "grad_norm": 1.9688514471054077, + "learning_rate": 2.039226065868044e-07, + "loss": 0.4283, + "step": 53000 + }, + { + "epoch": 0.8747796576662658, + "grad_norm": 2.583317995071411, + "learning_rate": 1.9873184785229205e-07, + "loss": 0.4429, + "step": 53100 + }, + { + "epoch": 0.8764270769839048, + "grad_norm": 1.426698088645935, + "learning_rate": 1.9360527238608206e-07, + "loss": 0.4559, + "step": 53200 + }, + { + "epoch": 0.8780744963015437, + "grad_norm": 1.861429214477539, + "learning_rate": 1.8854302317036805e-07, + "loss": 0.4513, + "step": 53300 + }, + { + "epoch": 0.8797219156191826, + "grad_norm": 1.8271915912628174, + "learning_rate": 1.8354524139325923e-07, + "loss": 0.4387, + "step": 53400 + }, + { + "epoch": 0.8813693349368215, + "grad_norm": 1.5195509195327759, + "learning_rate": 1.786120664448432e-07, + "loss": 0.4354, + "step": 53500 + }, + { + "epoch": 0.8830167542544604, + "grad_norm": 1.372504711151123, + "learning_rate": 1.7374363591329768e-07, + "loss": 0.4212, + "step": 53600 + }, + { + "epoch": 0.8846641735720993, + "grad_norm": 1.619235634803772, + "learning_rate": 1.6894008558105274e-07, + "loss": 0.427, + "step": 53700 + }, + { + "epoch": 0.8863115928897383, + "grad_norm": 2.1850979328155518, + "learning_rate": 1.6420154942100585e-07, + "loss": 0.4412, + "step": 53800 + }, + { + "epoch": 
0.8879590122073772, + "grad_norm": 2.942978858947754, + "learning_rate": 1.5952815959278168e-07, + "loss": 0.4453, + "step": 53900 + }, + { + "epoch": 0.8896064315250161, + "grad_norm": 2.521692991256714, + "learning_rate": 1.5492004643904962e-07, + "loss": 0.4242, + "step": 54000 + }, + { + "epoch": 0.891253850842655, + "grad_norm": 2.2875068187713623, + "learning_rate": 1.5037733848188658e-07, + "loss": 0.4234, + "step": 54100 + }, + { + "epoch": 0.8929012701602939, + "grad_norm": 2.937547445297241, + "learning_rate": 1.4590016241919357e-07, + "loss": 0.4557, + "step": 54200 + }, + { + "epoch": 0.8945486894779329, + "grad_norm": 2.359915256500244, + "learning_rate": 1.4148864312116124e-07, + "loss": 0.4355, + "step": 54300 + }, + { + "epoch": 0.8961961087955718, + "grad_norm": 1.8787094354629517, + "learning_rate": 1.3714290362678685e-07, + "loss": 0.4478, + "step": 54400 + }, + { + "epoch": 0.8978435281132107, + "grad_norm": 1.8454256057739258, + "learning_rate": 1.328630651404436e-07, + "loss": 0.4374, + "step": 54500 + }, + { + "epoch": 0.8994909474308496, + "grad_norm": 1.6232373714447021, + "learning_rate": 1.286492470285e-07, + "loss": 0.4501, + "step": 54600 + }, + { + "epoch": 0.9011383667484885, + "grad_norm": 2.0913541316986084, + "learning_rate": 1.2450156681598964e-07, + "loss": 0.4564, + "step": 54700 + }, + { + "epoch": 0.9027857860661274, + "grad_norm": 3.337273120880127, + "learning_rate": 1.2042014018333575e-07, + "loss": 0.444, + "step": 54800 + }, + { + "epoch": 0.9044332053837664, + "grad_norm": 1.986515760421753, + "learning_rate": 1.1640508096312259e-07, + "loss": 0.409, + "step": 54900 + }, + { + "epoch": 0.9060806247014053, + "grad_norm": 2.8050506114959717, + "learning_rate": 1.1245650113692052e-07, + "loss": 0.4345, + "step": 55000 + }, + { + "epoch": 0.9077280440190442, + "grad_norm": 1.7033820152282715, + "learning_rate": 1.085745108321648e-07, + "loss": 0.443, + "step": 55100 + }, + { + "epoch": 0.9093754633366831, + "grad_norm": 1.3102610111236572, + "learning_rate": 1.0475921831908265e-07, + "loss": 0.452, + "step": 55200 + }, + { + "epoch": 0.911022882654322, + "grad_norm": 1.4171772003173828, + "learning_rate": 1.0101073000767264e-07, + "loss": 0.4472, + "step": 55300 + }, + { + "epoch": 0.912670301971961, + "grad_norm": 2.2562355995178223, + "learning_rate": 9.732915044474017e-08, + "loss": 0.4424, + "step": 55400 + }, + { + "epoch": 0.9143177212895999, + "grad_norm": 1.537164330482483, + "learning_rate": 9.371458231097807e-08, + "loss": 0.4339, + "step": 55500 + }, + { + "epoch": 0.9159651406072388, + "grad_norm": 1.478975534439087, + "learning_rate": 9.016712641810393e-08, + "loss": 0.4746, + "step": 55600 + }, + { + "epoch": 0.9176125599248777, + "grad_norm": 2.3379318714141846, + "learning_rate": 8.668688170604955e-08, + "loss": 0.4573, + "step": 55700 + }, + { + "epoch": 0.9192599792425166, + "grad_norm": 2.287503242492676, + "learning_rate": 8.327394524020094e-08, + "loss": 0.459, + "step": 55800 + }, + { + "epoch": 0.9209073985601555, + "grad_norm": 2.074932098388672, + "learning_rate": 7.992841220868908e-08, + "loss": 0.4406, + "step": 55900 + }, + { + "epoch": 0.9225548178777945, + "grad_norm": 2.3185274600982666, + "learning_rate": 7.665037591973873e-08, + "loss": 0.4315, + "step": 56000 + }, + { + "epoch": 0.9242022371954334, + "grad_norm": 2.681718587875366, + "learning_rate": 7.343992779906328e-08, + "loss": 0.4496, + "step": 56100 + }, + { + "epoch": 0.9258496565130723, + "grad_norm": 2.437779188156128, + "learning_rate": 
7.029715738731541e-08, + "loss": 0.4363, + "step": 56200 + }, + { + "epoch": 0.9274970758307112, + "grad_norm": 2.111402988433838, + "learning_rate": 6.722215233759071e-08, + "loss": 0.446, + "step": 56300 + }, + { + "epoch": 0.9291444951483501, + "grad_norm": 1.8886587619781494, + "learning_rate": 6.421499841298195e-08, + "loss": 0.4414, + "step": 56400 + }, + { + "epoch": 0.9307919144659891, + "grad_norm": 1.649271011352539, + "learning_rate": 6.127577948418728e-08, + "loss": 0.4409, + "step": 56500 + }, + { + "epoch": 0.932439333783628, + "grad_norm": 2.6484766006469727, + "learning_rate": 5.84045775271716e-08, + "loss": 0.4325, + "step": 56600 + }, + { + "epoch": 0.9340867531012669, + "grad_norm": 1.9493142366409302, + "learning_rate": 5.560147262088034e-08, + "loss": 0.4165, + "step": 56700 + }, + { + "epoch": 0.9357341724189058, + "grad_norm": 1.875835657119751, + "learning_rate": 5.286654294500454e-08, + "loss": 0.433, + "step": 56800 + }, + { + "epoch": 0.9373815917365447, + "grad_norm": 1.9242185354232788, + "learning_rate": 5.019986477780181e-08, + "loss": 0.445, + "step": 56900 + }, + { + "epoch": 0.9390290110541836, + "grad_norm": 2.1051392555236816, + "learning_rate": 4.7601512493968824e-08, + "loss": 0.4469, + "step": 57000 + }, + { + "epoch": 0.9406764303718226, + "grad_norm": 1.5556972026824951, + "learning_rate": 4.507155856256634e-08, + "loss": 0.4746, + "step": 57100 + }, + { + "epoch": 0.9423238496894615, + "grad_norm": 1.9394145011901855, + "learning_rate": 4.2610073544998577e-08, + "loss": 0.4347, + "step": 57200 + }, + { + "epoch": 0.9439712690071004, + "grad_norm": 1.9497727155685425, + "learning_rate": 4.021712609304507e-08, + "loss": 0.4426, + "step": 57300 + }, + { + "epoch": 0.9456186883247393, + "grad_norm": 1.587270736694336, + "learning_rate": 3.789278294694498e-08, + "loss": 0.4277, + "step": 57400 + }, + { + "epoch": 0.9472661076423782, + "grad_norm": 1.201451301574707, + "learning_rate": 3.563710893353778e-08, + "loss": 0.4448, + "step": 57500 + }, + { + "epoch": 0.9489135269600172, + "grad_norm": 2.1374833583831787, + "learning_rate": 3.345016696445297e-08, + "loss": 0.4276, + "step": 57600 + }, + { + "epoch": 0.9505609462776561, + "grad_norm": 2.4307470321655273, + "learning_rate": 3.133201803435737e-08, + "loss": 0.4353, + "step": 57700 + }, + { + "epoch": 0.952208365595295, + "grad_norm": 1.3492801189422607, + "learning_rate": 2.928272121925202e-08, + "loss": 0.4129, + "step": 57800 + }, + { + "epoch": 0.9538557849129339, + "grad_norm": 1.4907076358795166, + "learning_rate": 2.7302333674827098e-08, + "loss": 0.4478, + "step": 57900 + }, + { + "epoch": 0.9555032042305728, + "grad_norm": 1.893916368484497, + "learning_rate": 2.539091063486432e-08, + "loss": 0.4465, + "step": 58000 + }, + { + "epoch": 0.9571506235482117, + "grad_norm": 2.277837038040161, + "learning_rate": 2.354850540969983e-08, + "loss": 0.4326, + "step": 58100 + }, + { + "epoch": 0.9587980428658507, + "grad_norm": 1.9928171634674072, + "learning_rate": 2.177516938473567e-08, + "loss": 0.418, + "step": 58200 + }, + { + "epoch": 0.9604454621834896, + "grad_norm": 2.096127986907959, + "learning_rate": 2.0070952019006496e-08, + "loss": 0.453, + "step": 58300 + }, + { + "epoch": 0.9620928815011285, + "grad_norm": 2.574500322341919, + "learning_rate": 1.8435900843800926e-08, + "loss": 0.4425, + "step": 58400 + }, + { + "epoch": 0.9637403008187674, + "grad_norm": 2.5897390842437744, + "learning_rate": 1.6870061461335685e-08, + "loss": 0.4273, + "step": 58500 + }, + { + "epoch": 
0.9653877201364063, + "grad_norm": 1.7342420816421509, + "learning_rate": 1.5373477543482453e-08, + "loss": 0.4365, + "step": 58600 + }, + { + "epoch": 0.9670351394540453, + "grad_norm": 3.1810550689697266, + "learning_rate": 1.3946190830552431e-08, + "loss": 0.4385, + "step": 58700 + }, + { + "epoch": 0.9686825587716842, + "grad_norm": 2.5934085845947266, + "learning_rate": 1.2588241130129242e-08, + "loss": 0.4453, + "step": 58800 + }, + { + "epoch": 0.9703299780893231, + "grad_norm": 3.0193750858306885, + "learning_rate": 1.1299666315961743e-08, + "loss": 0.4181, + "step": 58900 + }, + { + "epoch": 0.971977397406962, + "grad_norm": 2.132373809814453, + "learning_rate": 1.0080502326904329e-08, + "loss": 0.4217, + "step": 59000 + }, + { + "epoch": 0.9736248167246009, + "grad_norm": 2.04423189163208, + "learning_rate": 8.930783165917723e-09, + "loss": 0.4313, + "step": 59100 + }, + { + "epoch": 0.9752722360422398, + "grad_norm": 1.6803611516952515, + "learning_rate": 7.85054089911863e-09, + "loss": 0.4507, + "step": 59200 + }, + { + "epoch": 0.9769196553598788, + "grad_norm": 2.210566520690918, + "learning_rate": 6.8398056548860116e-09, + "loss": 0.4446, + "step": 59300 + }, + { + "epoch": 0.9785670746775177, + "grad_norm": 1.9046763181686401, + "learning_rate": 5.898605623021192e-09, + "loss": 0.4478, + "step": 59400 + }, + { + "epoch": 0.9802144939951566, + "grad_norm": 1.7694292068481445, + "learning_rate": 5.026967053960441e-09, + "loss": 0.4296, + "step": 59500 + }, + { + "epoch": 0.9818619133127955, + "grad_norm": 1.8257120847702026, + "learning_rate": 4.224914258044721e-09, + "loss": 0.4303, + "step": 59600 + }, + { + "epoch": 0.9835093326304344, + "grad_norm": 1.4642283916473389, + "learning_rate": 3.4924696048396765e-09, + "loss": 0.4322, + "step": 59700 + }, + { + "epoch": 0.9851567519480734, + "grad_norm": 2.159425973892212, + "learning_rate": 2.829653522513076e-09, + "loss": 0.4279, + "step": 59800 + }, + { + "epoch": 0.9868041712657123, + "grad_norm": 1.9656975269317627, + "learning_rate": 2.2364844972647125e-09, + "loss": 0.4386, + "step": 59900 + }, + { + "epoch": 0.9884515905833512, + "grad_norm": 2.556670665740967, + "learning_rate": 1.7129790728101503e-09, + "loss": 0.4393, + "step": 60000 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 2.0833001136779785, + "learning_rate": 1.2591518499208143e-09, + "loss": 0.4191, + "step": 60100 + }, + { + "epoch": 0.991746429218629, + "grad_norm": 2.159656524658203, + "learning_rate": 8.750154860151516e-10, + "loss": 0.4675, + "step": 60200 + }, + { + "epoch": 0.9933938485362679, + "grad_norm": 2.0303680896759033, + "learning_rate": 5.605806948061343e-10, + "loss": 0.447, + "step": 60300 + }, + { + "epoch": 0.9950412678539069, + "grad_norm": 1.8287807703018188, + "learning_rate": 3.1585624600372066e-10, + "loss": 0.4306, + "step": 60400 + }, + { + "epoch": 0.9966886871715458, + "grad_norm": 2.2728703022003174, + "learning_rate": 1.4084896506783018e-10, + "loss": 0.4284, + "step": 60500 + }, + { + "epoch": 0.9983361064891847, + "grad_norm": 2.0561728477478027, + "learning_rate": 3.556373302016081e-11, + "loss": 0.4195, + "step": 60600 + }, + { + "epoch": 0.9999835258068236, + "grad_norm": 2.020707130432129, + "learning_rate": 3.4863070763613284e-15, + "loss": 0.4415, + "step": 60700 + } + ], + "logging_steps": 100, + "max_steps": 60701, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, 
+ "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.004310214013092e+17, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}